Update the libclamav library to version 1.0.0

2023-01-14 18:28:39 +08:00
parent b879ee0b2e
commit 45fe15f472
8531 changed files with 1222046 additions and 177272 deletions


@@ -0,0 +1,801 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::cmp;
use crate::tables::grapheme::GraphemeCat;
/// External iterator for grapheme clusters and byte offsets.
///
/// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct GraphemeIndices<'a> {
start_offset: usize,
iter: Graphemes<'a>,
}
impl<'a> GraphemeIndices<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "abc".grapheme_indices(true);
/// assert_eq!(iter.as_str(), "abc");
/// iter.next();
/// assert_eq!(iter.as_str(), "bc");
/// iter.next();
/// iter.next();
/// assert_eq!(iter.as_str(), "");
/// ```
pub fn as_str(&self) -> &'a str {
self.iter.as_str()
}
}
impl<'a> Iterator for GraphemeIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next_back()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
}
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
/// documentation for more.
///
/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
string: &'a str,
cursor: GraphemeCursor,
cursor_back: GraphemeCursor,
}
impl<'a> Graphemes<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "abc".graphemes(true);
/// assert_eq!(iter.as_str(), "abc");
/// iter.next();
/// assert_eq!(iter.as_str(), "bc");
/// iter.next();
/// iter.next();
/// assert_eq!(iter.as_str(), "");
/// ```
pub fn as_str(&self) -> &'a str {
&self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
}
}
impl<'a> Iterator for Graphemes<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
(cmp::min(slen, 1), Some(slen))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
let start = self.cursor.cur_cursor();
if start == self.cursor_back.cur_cursor() {
return None;
}
let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
Some(&self.string[start..next])
}
}
impl<'a> DoubleEndedIterator for Graphemes<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
let end = self.cursor_back.cur_cursor();
if end == self.cursor.cur_cursor() {
return None;
}
let prev = self
.cursor_back
.prev_boundary(self.string, 0)
.unwrap()
.unwrap();
Some(&self.string[prev..end])
}
}
#[inline]
pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
let len = s.len();
Graphemes {
string: s,
cursor: GraphemeCursor::new(0, len, is_extended),
cursor_back: GraphemeCursor::new(len, len, is_extended),
}
}
#[inline]
pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
GraphemeIndices {
start_offset: s.as_ptr() as usize,
iter: new_graphemes(s, is_extended),
}
}
// maybe unify with PairResult?
// An enum describing information about a potential boundary.
#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
// No information is known.
Unknown,
// It is known to not be a boundary.
NotBreak,
// It is known to be a boundary.
Break,
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
Regional,
// The codepoint after is Extended_Pictographic,
// so whether it's a boundary depends on pre-context according to GB11.
Emoji,
}
/// Cursor-based segmenter for grapheme clusters.
///
/// This allows working with ropes and other data structures where the string is not contiguous or
/// fully known at initialization time.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
// Current cursor position.
offset: usize,
// Total length of the string.
len: usize,
// A config flag indicating whether this cursor computes legacy or extended
// grapheme cluster boundaries (enables GB9a and GB9b if set).
is_extended: bool,
// Information about the potential boundary at `offset`
state: GraphemeState,
// Category of codepoint immediately preceding cursor, if known.
cat_before: Option<GraphemeCat>,
// Category of codepoint immediately after cursor, if known.
cat_after: Option<GraphemeCat>,
// If set, at least one more codepoint immediately preceding this offset
// is needed to resolve whether there's a boundary at `offset`.
pre_context_offset: Option<usize>,
// The number of RIS codepoints preceding `offset`. If `pre_context_offset`
// is set, then counts the number of RIS between that and `offset`, otherwise
// is an accurate count relative to the string.
ris_count: Option<usize>,
// Set if a call to `prev_boundary` or `next_boundary` was suspended due
// to needing more input.
resuming: bool,
// Cached grapheme category and associated scalar value range.
grapheme_cat_cache: (u32, u32, GraphemeCat),
}
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
/// More pre-context is needed. The caller should call `provide_context`
/// with a chunk ending at the offset given, then retry the query. This
/// will only be returned if the `chunk_start` parameter is nonzero.
PreContext(usize),
/// When requesting `prev_boundary`, the cursor is moving past the beginning
/// of the current chunk, so the chunk before that is requested. This will
/// only be returned if the `chunk_start` parameter is nonzero.
PrevChunk,
/// When requesting `next_boundary`, the cursor is moving past the end of the
/// current chunk, so the chunk after that is requested. This will only be
/// returned if the chunk ends before the `len` parameter provided on
/// creation of the cursor.
NextChunk, // requesting chunk following the one given
/// An error returned when the chunk given does not contain the cursor position.
InvalidOffset,
}
// An enum describing the result from lookup of a pair of categories.
#[derive(PartialEq, Eq)]
enum PairResult {
NotBreak, // definitely not a break
Break, // definitely a break
Extended, // a break iff not in extended mode
Regional, // a break if preceded by an even number of RIS
Emoji,     // a break unless preceded by an emoji base and (Extend)*
}
#[inline]
fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
use self::PairResult::*;
use crate::tables::grapheme::GraphemeCat::*;
match (before, after) {
(GC_CR, GC_LF) => NotBreak, // GB3
(GC_Control, _) => Break, // GB4
(GC_CR, _) => Break, // GB4
(GC_LF, _) => Break, // GB4
(_, GC_Control) => Break, // GB5
(_, GC_CR) => Break, // GB5
(_, GC_LF) => Break, // GB5
(GC_L, GC_L) => NotBreak, // GB6
(GC_L, GC_V) => NotBreak, // GB6
(GC_L, GC_LV) => NotBreak, // GB6
(GC_L, GC_LVT) => NotBreak, // GB6
(GC_LV, GC_V) => NotBreak, // GB7
(GC_LV, GC_T) => NotBreak, // GB7
(GC_V, GC_V) => NotBreak, // GB7
(GC_V, GC_T) => NotBreak, // GB7
(GC_LVT, GC_T) => NotBreak, // GB8
(GC_T, GC_T) => NotBreak, // GB8
(_, GC_Extend) => NotBreak, // GB9
(_, GC_ZWJ) => NotBreak, // GB9
(_, GC_SpacingMark) => Extended, // GB9a
(GC_Prepend, _) => Extended, // GB9b
(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
(_, _) => Break, // GB999
}
}
impl GraphemeCursor {
/// Create a new cursor. The string and initial offset are given at creation
/// time, but the contents of the string are not. The `is_extended` parameter
/// controls whether extended grapheme clusters are selected.
///
/// The `offset` parameter must be on a codepoint boundary.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let s = "हिन्दी";
/// let mut legacy = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
/// let mut extended = GraphemeCursor::new(0, s.len(), true);
/// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
/// ```
pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
let state = if offset == 0 || offset == len {
GraphemeState::Break
} else {
GraphemeState::Unknown
};
GraphemeCursor {
offset,
len,
state,
is_extended,
cat_before: None,
cat_after: None,
pre_context_offset: None,
ris_count: None,
resuming: false,
grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
}
}
fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
use crate::tables::grapheme as gr;
use crate::tables::grapheme::GraphemeCat::*;
if ch <= '\u{7e}' {
// Special-case optimization for ascii, except U+007F. This
// improves performance even for many primarily non-ascii texts,
// due to use of punctuation and white space characters from the
// ascii range.
if ch >= '\u{20}' {
GC_Any
} else if ch == '\n' {
GC_LF
} else if ch == '\r' {
GC_CR
} else {
GC_Control
}
} else {
// If this char isn't within the cached range, update the cache to the
// range that includes it.
if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
self.grapheme_cat_cache = gr::grapheme_category(ch);
}
self.grapheme_cat_cache.2
}
}
// Not sure I'm gonna keep this, the advantage over new() seems thin.
/// Set the cursor to a new location in the same string.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(cursor.cur_cursor(), 0);
/// cursor.set_cursor(2);
/// assert_eq!(cursor.cur_cursor(), 2);
/// ```
pub fn set_cursor(&mut self, offset: usize) {
if offset != self.offset {
self.offset = offset;
self.state = if offset == 0 || offset == self.len {
GraphemeState::Break
} else {
GraphemeState::Unknown
};
// reset state derived from text around cursor
self.cat_before = None;
self.cat_after = None;
self.ris_count = None;
}
}
#[inline]
/// The current offset of the cursor. Equal to the last value provided to
/// `new()` or `set_cursor()`, or returned from `next_boundary()` or
/// `prev_boundary()`.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
/// assert_eq!(cursor.cur_cursor(), 4);
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.cur_cursor(), 8);
/// ```
pub fn cur_cursor(&self) -> usize {
self.offset
}
/// Provide additional pre-context when it is needed to decide a boundary.
/// The end of the chunk must coincide with the value given in the
/// `GraphemeIncomplete::PreContext` request.
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
/// // Not enough pre-context to decide if there's a boundary between the two flags.
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
/// // Provide one more Regional Indicator Symbol of pre-context
/// cursor.provide_context(&flags[4..8], 4);
/// // Still not enough context to decide.
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
/// // Provide additional requested context.
/// cursor.provide_context(&flags[0..4], 0);
/// // That's enough to decide (it always is when context goes to the start of the string)
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
/// ```
pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
use crate::tables::grapheme as gr;
assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
self.pre_context_offset = None;
if self.is_extended && chunk_start + chunk.len() == self.offset {
let ch = chunk.chars().rev().next().unwrap();
if self.grapheme_category(ch) == gr::GC_Prepend {
self.decide(false); // GB9b
return;
}
}
match self.state {
GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
_ => {
if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
let ch = chunk.chars().rev().next().unwrap();
self.cat_before = Some(self.grapheme_category(ch));
}
}
}
}
#[inline]
fn decide(&mut self, is_break: bool) {
self.state = if is_break {
GraphemeState::Break
} else {
GraphemeState::NotBreak
};
}
#[inline]
fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
self.decide(is_break);
Ok(is_break)
}
#[inline]
fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
if self.state == GraphemeState::Break {
Ok(true)
} else if self.state == GraphemeState::NotBreak {
Ok(false)
} else if let Some(pre_context_offset) = self.pre_context_offset {
Err(GraphemeIncomplete::PreContext(pre_context_offset))
} else {
unreachable!("inconsistent state");
}
}
#[inline]
fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
use crate::tables::grapheme as gr;
let mut ris_count = self.ris_count.unwrap_or(0);
for ch in chunk.chars().rev() {
if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
self.ris_count = Some(ris_count);
self.decide((ris_count % 2) == 0);
return;
}
ris_count += 1;
}
self.ris_count = Some(ris_count);
if chunk_start == 0 {
self.decide((ris_count % 2) == 0);
return;
}
self.pre_context_offset = Some(chunk_start);
self.state = GraphemeState::Regional;
}
#[inline]
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
use crate::tables::grapheme as gr;
let mut iter = chunk.chars().rev();
if let Some(ch) = iter.next() {
if self.grapheme_category(ch) != gr::GC_ZWJ {
self.decide(true);
return;
}
}
for ch in iter {
match self.grapheme_category(ch) {
gr::GC_Extend => (),
gr::GC_Extended_Pictographic => {
self.decide(false);
return;
}
_ => {
self.decide(true);
return;
}
}
}
if chunk_start == 0 {
self.decide(true);
return;
}
self.pre_context_offset = Some(chunk_start);
self.state = GraphemeState::Emoji;
}
#[inline]
/// Determine whether the current cursor location is a grapheme cluster boundary.
/// Only a part of the string need be supplied. If `chunk_start` is nonzero or
/// the length of `chunk` is not equal to `len` on creation, then this method
/// may return `GraphemeIncomplete::PreContext`. The caller should then
/// call `provide_context` with the requested chunk, then retry calling this
/// method.
///
/// For partial chunks, if the cursor is not at the beginning or end of the
/// string, the chunk should contain at least the codepoint following the cursor.
/// If the string is nonempty, the chunk must be nonempty.
///
/// All calls should have consistent chunk contents (i.e., if a chunk provides
/// content for a given slice, all further chunks covering that slice must have
/// the same content for it).
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
/// cursor.set_cursor(12);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
/// ```
pub fn is_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<bool, GraphemeIncomplete> {
use crate::tables::grapheme as gr;
if self.state == GraphemeState::Break {
return Ok(true);
}
if self.state == GraphemeState::NotBreak {
return Ok(false);
}
if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
return Err(GraphemeIncomplete::InvalidOffset);
}
}
if let Some(pre_context_offset) = self.pre_context_offset {
return Err(GraphemeIncomplete::PreContext(pre_context_offset));
}
let offset_in_chunk = self.offset - chunk_start;
if self.cat_after.is_none() {
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
self.cat_after = Some(self.grapheme_category(ch));
}
if self.offset == chunk_start {
let mut need_pre_context = true;
match self.cat_after.unwrap() {
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
_ => need_pre_context = self.cat_before.is_none(),
}
if need_pre_context {
self.pre_context_offset = Some(chunk_start);
return Err(GraphemeIncomplete::PreContext(chunk_start));
}
}
if self.cat_before.is_none() {
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
self.cat_before = Some(self.grapheme_category(ch));
}
match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
PairResult::NotBreak => return self.decision(false),
PairResult::Break => return self.decision(true),
PairResult::Extended => {
let is_extended = self.is_extended;
return self.decision(!is_extended);
}
PairResult::Regional => {
if let Some(ris_count) = self.ris_count {
return self.decision((ris_count % 2) == 0);
}
self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
self.is_boundary_result()
}
PairResult::Emoji => {
self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
self.is_boundary_result()
}
}
}
#[inline]
/// Find the next boundary after the current cursor position. Only a part of
/// the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk following the one
/// given, then retry.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings:
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
/// ```
pub fn next_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == self.len {
return Ok(None);
}
let mut iter = chunk[self.offset - chunk_start..].chars();
let mut ch = iter.next().unwrap();
loop {
if self.resuming {
if self.cat_after.is_none() {
self.cat_after = Some(self.grapheme_category(ch));
}
} else {
self.offset += ch.len_utf8();
self.state = GraphemeState::Unknown;
self.cat_before = self.cat_after.take();
if self.cat_before.is_none() {
self.cat_before = Some(self.grapheme_category(ch));
}
if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
self.ris_count = self.ris_count.map(|c| c + 1);
} else {
self.ris_count = Some(0);
}
if let Some(next_ch) = iter.next() {
ch = next_ch;
self.cat_after = Some(self.grapheme_category(ch));
} else if self.offset == self.len {
self.decide(true);
} else {
self.resuming = true;
return Err(GraphemeIncomplete::NextChunk);
}
}
self.resuming = true;
if self.is_boundary(chunk, chunk_start)? {
self.resuming = false;
return Ok(Some(self.offset));
}
self.resuming = false;
}
}
/// Find the previous boundary before the current cursor position. Only a part
/// of the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk preceding the one
/// given, then retry.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings (note the exact return is not
/// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(4, s.len(), false);
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
/// ```
pub fn prev_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == 0 {
return Ok(None);
}
if self.offset == chunk_start {
return Err(GraphemeIncomplete::PrevChunk);
}
let mut iter = chunk[..self.offset - chunk_start].chars().rev();
let mut ch = iter.next().unwrap();
loop {
if self.offset == chunk_start {
self.resuming = true;
return Err(GraphemeIncomplete::PrevChunk);
}
if self.resuming {
self.cat_before = Some(self.grapheme_category(ch));
} else {
self.offset -= ch.len_utf8();
self.cat_after = self.cat_before.take();
self.state = GraphemeState::Unknown;
if let Some(ris_count) = self.ris_count {
self.ris_count = if ris_count > 0 {
Some(ris_count - 1)
} else {
None
};
}
if let Some(prev_ch) = iter.next() {
ch = prev_ch;
self.cat_before = Some(self.grapheme_category(ch));
} else if self.offset == 0 {
self.decide(true);
} else {
self.resuming = true;
self.cat_after = Some(self.grapheme_category(ch));
return Err(GraphemeIncomplete::PrevChunk);
}
}
self.resuming = true;
if self.is_boundary(chunk, chunk_start)? {
self.resuming = false;
return Ok(Some(self.offset));
}
self.resuming = false;
}
}
}
#[test]
fn test_grapheme_cursor_ris_precontext() {
let s = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
let mut c = GraphemeCursor::new(8, s.len(), true);
assert_eq!(
c.is_boundary(&s[4..], 4),
Err(GraphemeIncomplete::PreContext(4))
);
c.provide_context(&s[..4], 0);
assert_eq!(c.is_boundary(&s[4..], 4), Ok(true));
}
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
let s = "\r\n";
let mut c = GraphemeCursor::new(1, s.len(), true);
assert_eq!(
c.is_boundary(&s[1..], 1),
Err(GraphemeIncomplete::PreContext(1))
);
c.provide_context(&s[..1], 0);
assert_eq!(c.is_boundary(&s[1..], 1), Ok(false));
}
#[test]
fn test_grapheme_cursor_prev_boundary() {
let s = "abcd";
let mut c = GraphemeCursor::new(3, s.len(), true);
assert_eq!(
c.prev_boundary(&s[2..], 2),
Err(GraphemeIncomplete::PrevChunk)
);
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(2)));
}
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
let s = "abcd";
let mut c = GraphemeCursor::new(2, s.len(), true);
assert_eq!(
c.prev_boundary(&s[2..], 2),
Err(GraphemeIncomplete::PrevChunk)
);
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1)));
}
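// A supplementary illustration, not part of the upstream test suite: one way a
// caller with chunked storage can drive `next_boundary`, answering `NextChunk`
// with the following chunk and `PreContext` with text ending at the requested offset.
#[test]
fn test_grapheme_cursor_next_boundary_chunked() {
let s = "a\r\nb";
// Pretend the text lives in two non-contiguous 2-byte pieces.
let chunks = [&s[..2], &s[2..]];
let mut chunk_idx = 0;
let mut chunk_start = 0;
let mut cursor = GraphemeCursor::new(0, s.len(), true);
let mut boundaries = vec![];
loop {
match cursor.next_boundary(chunks[chunk_idx], chunk_start) {
Ok(Some(boundary)) => boundaries.push(boundary),
Ok(None) => break,
Err(GraphemeIncomplete::NextChunk) => {
// The cursor ran off the end of this chunk: supply the next one.
chunk_start += chunks[chunk_idx].len();
chunk_idx += 1;
}
Err(GraphemeIncomplete::PreContext(offset)) => {
// Not exercised by this input, but this is the intended response:
// hand back a chunk that ends exactly at `offset`.
cursor.provide_context(&s[..offset], 0);
}
Err(err) => panic!("unexpected error: {:?}", err),
}
}
// Grapheme clusters are "a", "\r\n", "b".
assert_eq!(boundaries, [1, 3, 4]);
}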


@@ -0,0 +1,307 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
//!
//! ```rust
//! extern crate unicode_segmentation;
//!
//! use unicode_segmentation::UnicodeSegmentation;
//!
//! fn main() {
//! let s = "a̐éö̲\r\n";
//! let g = UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>();
//! let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
//! assert_eq!(g, b);
//!
//! let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
//! let w = s.unicode_words().collect::<Vec<&str>>();
//! let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
//! assert_eq!(w, b);
//!
//! let s = "The quick (\"brown\") fox";
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
//! assert_eq!(w, b);
//! }
//! ```
//!
//! # no_std
//!
//! unicode-segmentation does not depend on libstd, so it can be used in crates
//! with the `#![no_std]` attribute.
//!
//! # crates.io
//!
//! You can use this package in your project by adding the following
//! to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! unicode-segmentation = "1.9.0"
//! ```
#![deny(missing_docs, unsafe_code)]
#![doc(
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
)]
#![no_std]
#[cfg(test)]
#[macro_use]
extern crate std;
#[cfg(test)]
#[macro_use]
extern crate quickcheck;
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use grapheme::{GraphemeIndices, Graphemes};
pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
mod grapheme;
#[rustfmt::skip]
mod tables;
mod sentence;
mod word;
#[cfg(test)]
mod test;
#[cfg(test)]
mod testdata;
/// Methods for segmenting strings according to
/// [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/).
pub trait UnicodeSegmentation {
/// Returns an iterator over the [grapheme clusters][graphemes] of `self`.
///
/// [graphemes]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
///
/// If `is_extended` is true, the iterator is over the
/// *extended grapheme clusters*;
/// otherwise, the iterator is over the *legacy grapheme clusters*.
/// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
/// recommends extended grapheme cluster boundaries for general processing.
///
/// # Examples
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let gr1 = UnicodeSegmentation::graphemes("a\u{310}e\u{301}o\u{308}\u{332}", true)
/// .collect::<Vec<&str>>();
/// let b: &[_] = &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"];
///
/// assert_eq!(&gr1[..], b);
///
/// let gr2 = UnicodeSegmentation::graphemes("a\r\nb🇷🇺🇸🇹", true).collect::<Vec<&str>>();
/// let b: &[_] = &["a", "\r\n", "b", "🇷🇺", "🇸🇹"];
///
/// assert_eq!(&gr2[..], b);
/// ```
fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
/// Returns an iterator over the grapheme clusters of `self` and their
/// byte offsets. See `graphemes()` for more information.
///
/// # Examples
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let gr_inds = UnicodeSegmentation::grapheme_indices("a̐éö̲\r\n", true)
/// .collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
///
/// assert_eq!(&gr_inds[..], b);
/// ```
fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
/// Returns an iterator over the words of `self`, separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// Here, "words" are just those substrings which, after splitting on
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
/// let uw1 = uws.unicode_words().collect::<Vec<&str>>();
/// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
///
/// assert_eq!(&uw1[..], b);
/// ```
fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
/// Returns an iterator over the words of `self`, separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
/// offsets.
///
/// Here, "words" are just those substrings which, after splitting on
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
/// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
///
/// assert_eq!(&uwi1[..], b);
/// ```
fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;
/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// The concatenation of the substrings returned by this function is just the original string.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
///
/// assert_eq!(&swu1[..], b);
/// ```
fn split_word_bounds<'a>(&'a self) -> UWordBounds<'a>;
/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
/// and their offsets. See `split_word_bounds()` for more information.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let swi1 = "Brr, it's 29.3°F!".split_word_bound_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"),
/// (14, "°"), (16, "F"), (17, "!")];
///
/// assert_eq!(&swi1[..], b);
/// ```
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// Here, "sentences" are just those substrings which, after splitting on
/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let uss = "Mr. Fox jumped. [...] The dog was too lazy.";
/// let us1 = uss.unicode_sentences().collect::<Vec<&str>>();
/// let b: &[_] = &["Mr. ", "Fox jumped. ", "The dog was too lazy."];
///
/// assert_eq!(&us1[..], b);
/// ```
fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// The concatenation of the substrings returned by this function is just the original string.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let ssbs = "Mr. Fox jumped. [...] The dog was too lazy.";
/// let ssb1 = ssbs.split_sentence_bounds().collect::<Vec<&str>>();
/// let b: &[_] = &["Mr. ", "Fox jumped. ", "[...] ", "The dog was too lazy."];
///
/// assert_eq!(&ssb1[..], b);
/// ```
fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
/// and their offsets. See `split_sentence_bounds()` for more information.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let ssis = "Mr. Fox jumped. [...] The dog was too lazy.";
/// let ssi1 = ssis.split_sentence_bound_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "Mr. "), (4, "Fox jumped. "), (16, "[...] "),
/// (22, "The dog was too lazy.")];
///
/// assert_eq!(&ssi1[..], b);
/// ```
fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
}
impl UnicodeSegmentation for str {
#[inline]
fn graphemes(&self, is_extended: bool) -> Graphemes {
grapheme::new_graphemes(self, is_extended)
}
#[inline]
fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
grapheme::new_grapheme_indices(self, is_extended)
}
#[inline]
fn unicode_words(&self) -> UnicodeWords {
word::new_unicode_words(self)
}
#[inline]
fn unicode_word_indices(&self) -> UnicodeWordIndices {
word::new_unicode_word_indices(self)
}
#[inline]
fn split_word_bounds(&self) -> UWordBounds {
word::new_word_bounds(self)
}
#[inline]
fn split_word_bound_indices(&self) -> UWordBoundIndices {
word::new_word_bound_indices(self)
}
#[inline]
fn unicode_sentences(&self) -> UnicodeSentences {
sentence::new_unicode_sentences(self)
}
#[inline]
fn split_sentence_bounds(&self) -> USentenceBounds {
sentence::new_sentence_bounds(self)
}
#[inline]
fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
sentence::new_sentence_bound_indices(self)
}
}


@@ -0,0 +1,415 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::cmp;
use core::iter::Filter;
// All of the logic for forward iteration over sentences
mod fwd {
use crate::tables::sentence::SentenceCat;
use core::cmp;
// Describes a parsed part of the source string, per the table at:
// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
#[derive(Clone, Copy, PartialEq, Eq)]
enum StatePart {
Sot,
Eot,
Other,
CR,
LF,
Sep,
ATerm,
UpperLower,
ClosePlus,
SpPlus,
STerm,
}
#[derive(Clone, PartialEq, Eq)]
struct SentenceBreaksState(pub [StatePart; 4]);
const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
StatePart::Sot,
StatePart::Sot,
StatePart::Sot,
StatePart::Sot,
]);
#[derive(Clone)]
pub struct SentenceBreaks<'a> {
pub string: &'a str,
pos: usize,
state: SentenceBreaksState,
}
impl SentenceBreaksState {
// Attempt to advance the internal state by one part
// Whitespace and some punctuation will be collapsed
fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
let &SentenceBreaksState(parts) = self;
let parts = match (parts[3], cat) {
(StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
(StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
_ => [
parts[1],
parts[2],
parts[3],
match cat {
SentenceCat::SC_CR => StatePart::CR,
SentenceCat::SC_LF => StatePart::LF,
SentenceCat::SC_Sep => StatePart::Sep,
SentenceCat::SC_ATerm => StatePart::ATerm,
SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
SentenceCat::SC_Close => StatePart::ClosePlus,
SentenceCat::SC_Sp => StatePart::SpPlus,
SentenceCat::SC_STerm => StatePart::STerm,
_ => StatePart::Other,
},
],
};
SentenceBreaksState(parts)
}
fn end(&self) -> SentenceBreaksState {
let &SentenceBreaksState(parts) = self;
SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
}
// Helper function to check if state head matches a single `StatePart`
fn match1(&self, part: StatePart) -> bool {
let &SentenceBreaksState(parts) = self;
part == parts[3]
}
// Helper function to check if the two most recent `StatePart`s in the state
// match the given two
fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
let &SentenceBreaksState(parts) = self;
part1 == parts[2] && part2 == parts[3]
}
}
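// A supplementary illustration, not from the upstream crate: runs of Close and
// Sp collapse into single ClosePlus/SpPlus slots, so the four-slot window can
// still cover an entire `ATerm Close* Sp*` context.
#[test]
fn test_state_window_collapses_runs() {
let st = INITIAL_STATE
.next(SentenceCat::SC_ATerm)
.next(SentenceCat::SC_Close)
.next(SentenceCat::SC_Close) // collapsed into the existing ClosePlus
.next(SentenceCat::SC_Sp);
// SentenceBreaksState has no Debug impl, so compare with assert!.
assert!(
st == SentenceBreaksState([
StatePart::Sot,
StatePart::ATerm,
StatePart::ClosePlus,
StatePart::SpPlus
])
);
}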
// https://unicode.org/reports/tr29/#SB8
// TODO cache this, it is currently quadratic
fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
let &SentenceBreaksState(parts) = state;
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
if parts[idx] == StatePart::ClosePlus {
idx -= 1
}
if parts[idx] == StatePart::ATerm {
use crate::tables::sentence as se;
for next_char in ahead.chars() {
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
match se::sentence_category(next_char).2 {
se::SC_Lower => return true,
se::SC_OLetter
| se::SC_Upper
| se::SC_Sep
| se::SC_CR
| se::SC_LF
| se::SC_STerm
| se::SC_ATerm => return false,
_ => continue,
}
}
}
false
}
// https://unicode.org/reports/tr29/#SB8a
fn match_sb8a(state: &SentenceBreaksState) -> bool {
// SATerm Close* Sp*
let &SentenceBreaksState(parts) = state;
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
if parts[idx] == StatePart::ClosePlus {
idx -= 1
}
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
}
// https://unicode.org/reports/tr29/#SB9
fn match_sb9(state: &SentenceBreaksState) -> bool {
// SATerm Close*
let &SentenceBreaksState(parts) = state;
let idx = if parts[3] == StatePart::ClosePlus {
2
} else {
3
};
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
}
// https://unicode.org/reports/tr29/#SB11
fn match_sb11(state: &SentenceBreaksState) -> bool {
// SATerm Close* Sp* ParaSep?
let &SentenceBreaksState(parts) = state;
let mut idx = match parts[3] {
StatePart::Sep | StatePart::CR | StatePart::LF => 2,
_ => 3,
};
if parts[idx] == StatePart::SpPlus {
idx -= 1
}
if parts[idx] == StatePart::ClosePlus {
idx -= 1
}
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
}
impl<'a> Iterator for SentenceBreaks<'a> {
// Returns the index of the character which follows a break
type Item = usize;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.string.len();
// A sentence could be one character
(cmp::min(slen, 2), Some(slen + 1))
}
#[inline]
fn next(&mut self) -> Option<usize> {
use crate::tables::sentence as se;
for next_char in self.string[self.pos..].chars() {
let position_before = self.pos;
let state_before = self.state.clone();
let next_cat = se::sentence_category(next_char).2;
self.pos += next_char.len_utf8();
self.state = self.state.next(next_cat);
match next_cat {
// SB1 https://unicode.org/reports/tr29/#SB1
_ if state_before.match1(StatePart::Sot) => return Some(position_before),
// SB2 is handled when inner iterator (chars) is finished
// SB3 https://unicode.org/reports/tr29/#SB3
SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,
// SB4 https://unicode.org/reports/tr29/#SB4
_ if state_before.match1(StatePart::Sep)
|| state_before.match1(StatePart::CR)
|| state_before.match1(StatePart::LF) =>
{
return Some(position_before)
}
// SB5 https://unicode.org/reports/tr29/#SB5
SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,
// SB6 https://unicode.org/reports/tr29/#SB6
SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,
// SB7 https://unicode.org/reports/tr29/#SB7
SentenceCat::SC_Upper
if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
{
continue
}
// SB8 https://unicode.org/reports/tr29/#SB8
_ if match_sb8(&state_before, &self.string[position_before..]) => continue,
// SB8a https://unicode.org/reports/tr29/#SB8a
SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
if match_sb8a(&state_before) =>
{
continue
}
// SB9 https://unicode.org/reports/tr29/#SB9
SentenceCat::SC_Close
| SentenceCat::SC_Sp
| SentenceCat::SC_Sep
| SentenceCat::SC_CR
| SentenceCat::SC_LF
if match_sb9(&state_before) =>
{
continue
}
// SB10 https://unicode.org/reports/tr29/#SB10
SentenceCat::SC_Sp
| SentenceCat::SC_Sep
| SentenceCat::SC_CR
| SentenceCat::SC_LF
if match_sb8a(&state_before) =>
{
continue
}
// SB11 https://unicode.org/reports/tr29/#SB11
_ if match_sb11(&state_before) => return Some(position_before),
// SB998 https://unicode.org/reports/tr29/#SB998
_ => continue,
}
}
// SB2 https://unicode.org/reports/tr29/#SB2
if self.state.match1(StatePart::Sot) {
None
} else if self.state.match1(StatePart::Eot) {
None
} else {
self.state = self.state.end();
Some(self.pos)
}
}
}
pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
SentenceBreaks {
string: source,
pos: 0,
state: INITIAL_STATE,
}
}
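// A supplementary illustration, not from the upstream tests: the raw break
// positions `SentenceBreaks` yields (start of text, each boundary, end of text),
// which `USentenceBounds` below pairs up into substrings. The expected values
// are derived from the crate-level `split_sentence_bounds` doc example.
#[test]
fn test_sentence_breaks_positions() {
let s = "Mr. Fox jumped. [...] The dog was too lazy.";
assert!(new_sentence_breaks(s).eq([0usize, 4, 16, 22, 43].iter().cloned()));
}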
}
/// An iterator over the substrings of a string which, after splitting the string on
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UnicodeSentences<'a> {
inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
}
/// External iterator for a string's
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct USentenceBounds<'a> {
iter: fwd::SentenceBreaks<'a>,
sentence_start: Option<usize>,
}
/// External iterator for sentence boundaries and byte offsets.
///
/// This struct is created by the [`split_sentence_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct USentenceBoundIndices<'a> {
start_offset: usize,
iter: USentenceBounds<'a>,
}
#[inline]
pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
USentenceBounds {
iter: fwd::new_sentence_breaks(source),
sentence_start: None,
}
}
#[inline]
pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
USentenceBoundIndices {
start_offset: source.as_ptr() as usize,
iter: new_sentence_bounds(source),
}
}
#[inline]
pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
use super::UnicodeSegmentation;
use crate::tables::util::is_alphanumeric;
fn has_alphanumeric(s: &&str) -> bool {
s.chars().any(|c| is_alphanumeric(c))
}
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
UnicodeSentences {
inner: s.split_sentence_bounds().filter(has_alphanumeric),
}
}
impl<'a> Iterator for UnicodeSentences<'a> {
type Item = &'a str;
#[inline]
fn next(&mut self) -> Option<&'a str> {
self.inner.next()
}
}
impl<'a> Iterator for USentenceBounds<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let (lower, upper) = self.iter.size_hint();
// `- 1` on a zero usize would underflow, so clamp to at least 1 first.
(cmp::max(1, lower) - 1, upper.map(|u| cmp::max(1, u) - 1))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
if self.sentence_start.is_none() {
if let Some(start_pos) = self.iter.next() {
self.sentence_start = Some(start_pos)
} else {
return None;
}
}
if let Some(break_pos) = self.iter.next() {
let start_pos = self.sentence_start.unwrap();
let sentence = &self.iter.string[start_pos..break_pos];
self.sentence_start = Some(break_pos);
Some(sentence)
} else {
None
}
}
}
impl<'a> Iterator for USentenceBoundIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
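// A supplementary illustration, not from the upstream tests: `unicode_sentences`
// is `split_sentence_bounds` with the sentences containing no alphanumeric
// characters filtered out, as wired up in `new_unicode_sentences` above. The
// expected values mirror the crate-level doc examples.
#[test]
fn test_unicode_sentences_filters_non_alphanumeric() {
use super::UnicodeSegmentation;
let s = "Mr. Fox jumped. [...] The dog was too lazy.";
let bounds = ["Mr. ", "Fox jumped. ", "[...] ", "The dog was too lazy."];
assert!(s.split_sentence_bounds().eq(bounds.iter().cloned()));
let sentences = ["Mr. ", "Fox jumped. ", "The dog was too lazy."];
assert!(s.unicode_sentences().eq(sentences.iter().cloned()));
}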

File diff suppressed because it is too large


@@ -0,0 +1,247 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::UnicodeSegmentation;
use std::prelude::v1::*;
#[test]
fn test_graphemes() {
use crate::testdata::{TEST_DIFF, TEST_SAME};
pub const EXTRA_DIFF: &'static [(
&'static str,
&'static [&'static str],
&'static [&'static str],
)] = &[
// Official test suite doesn't include two Prepend chars between two other chars.
(
"\u{20}\u{600}\u{600}\u{20}",
&["\u{20}", "\u{600}\u{600}\u{20}"],
&["\u{20}", "\u{600}", "\u{600}", "\u{20}"],
),
// Test for Prepend followed by two Any chars
(
"\u{600}\u{20}\u{20}",
&["\u{600}\u{20}", "\u{20}"],
&["\u{600}", "\u{20}", "\u{20}"],
),
];
pub const EXTRA_SAME: &'static [(&'static str, &'static [&'static str])] = &[
// family emoji (more than two emoji joined by ZWJ)
(
"\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}",
&["\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}"],
),
// cartwheel emoji followed by two fitzpatrick skin tone modifiers
// (test case from issue #19)
(
"\u{1F938}\u{1F3FE}\u{1F3FE}",
&["\u{1F938}\u{1F3FE}\u{1F3FE}"],
),
];
for &(s, g) in TEST_SAME.iter().chain(EXTRA_SAME) {
// test forward iterator
assert!(UnicodeSegmentation::graphemes(s, true).eq(g.iter().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false).eq(g.iter().cloned()));
// test reverse iterator
assert!(UnicodeSegmentation::graphemes(s, true)
.rev()
.eq(g.iter().rev().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false)
.rev()
.eq(g.iter().rev().cloned()));
}
for &(s, gt, gf) in TEST_DIFF.iter().chain(EXTRA_DIFF) {
// test forward iterator
assert!(UnicodeSegmentation::graphemes(s, true).eq(gt.iter().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false).eq(gf.iter().cloned()));
// test reverse iterator
assert!(UnicodeSegmentation::graphemes(s, true)
.rev()
.eq(gt.iter().rev().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false)
.rev()
.eq(gf.iter().rev().cloned()));
}
// test the indices iterators
let s = "a̐éö̲\r\n";
let gr_inds = UnicodeSegmentation::grapheme_indices(s, true).collect::<Vec<(usize, &str)>>();
let b: &[_] = &[(0, "a\u{310}"), (3, "e\u{301}"), (6, "o\u{308}\u{332}"), (11, "\r\n")];
assert_eq!(gr_inds, b);
let gr_inds = UnicodeSegmentation::grapheme_indices(s, true)
.rev()
.collect::<Vec<(usize, &str)>>();
let b: &[_] = &[(11, "\r\n"), (6, "o\u{308}\u{332}"), (3, "e\u{301}"), (0, "a\u{310}")];
assert_eq!(gr_inds, b);
let mut gr_inds_iter = UnicodeSegmentation::grapheme_indices(s, true);
{
let gr_inds = gr_inds_iter.by_ref();
let e1 = gr_inds.size_hint();
assert_eq!(e1, (1, Some(13)));
let c = gr_inds.count();
assert_eq!(c, 4);
}
let e2 = gr_inds_iter.size_hint();
assert_eq!(e2, (0, Some(0)));
// make sure the reverse iterator does the right thing with "\n" at beginning of string
let s = "\n\r\n\r";
let gr = UnicodeSegmentation::graphemes(s, true)
.rev()
.collect::<Vec<&str>>();
let b: &[_] = &["\r", "\r\n", "\n"];
assert_eq!(gr, b);
}
#[test]
fn test_words() {
use crate::testdata::TEST_WORD;
// Unicode's official tests don't really test longer chains of flag emoji
// TODO This could be improved with more tests like flag emoji with interspersed Extend chars and ZWJ
const EXTRA_TESTS: &'static [(&'static str, &'static [&'static str])] = &[
(
"🇦🇫🇦🇽🇦🇱🇩🇿🇦🇸🇦🇩🇦🇴",
&["🇦🇫", "🇦🇽", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦🇴"],
),
("🇦🇫🇦🇽🇦🇱🇩🇿🇦🇸🇦🇩🇦", &["🇦🇫", "🇦🇽", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦"]),
(
"🇦a🇫🇦🇽a🇦🇱🇩🇿🇦🇸🇦🇩🇦",
&["🇦", "a", "🇫🇦", "🇽", "a", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦"],
),
(
"\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}",
&["\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}"],
),
("😌👎🏼", &["😌", "👎🏼"]),
// perhaps wrong, spaces should not be included?
("hello world", &["hello", " ", "world"]),
("🇨🇦🇨🇭🇿🇲🇿 hi", &["🇨🇦", "🇨🇭", "🇿🇲", "🇿", " ", "hi"]),
];
for &(s, w) in TEST_WORD.iter().chain(EXTRA_TESTS.iter()) {
macro_rules! assert_ {
($test:expr, $exp:expr, $name:expr) => {
// collect into vector for better diagnostics in failure case
let testing = $test.collect::<Vec<_>>();
let expected = $exp.collect::<Vec<_>>();
assert_eq!(
testing, expected,
"{} test for testcase ({:?}, {:?}) failed.",
$name, s, w
)
};
}
// test forward iterator
assert_!(
s.split_word_bounds(),
w.iter().cloned(),
"Forward word boundaries"
);
// test reverse iterator
assert_!(
s.split_word_bounds().rev(),
w.iter().rev().cloned(),
"Reverse word boundaries"
);
// generate offsets from word string lengths
let mut indices = vec![0];
for i in w.iter().cloned().map(|s| s.len()).scan(0, |t, n| {
*t += n;
Some(*t)
}) {
indices.push(i);
}
indices.pop();
let indices = indices;
// test forward indices iterator
assert_!(
s.split_word_bound_indices().map(|(l, _)| l),
indices.iter().cloned(),
"Forward word indices"
);
// test backward indices iterator
assert_!(
s.split_word_bound_indices().rev().map(|(l, _)| l),
indices.iter().rev().cloned(),
"Reverse word indices"
);
}
}
#[test]
fn test_sentences() {
use crate::testdata::TEST_SENTENCE;
for &(s, w) in TEST_SENTENCE.iter() {
macro_rules! assert_ {
($test:expr, $exp:expr, $name:expr) => {
// collect into vector for better diagnostics in failure case
let testing = $test.collect::<Vec<_>>();
let expected = $exp.collect::<Vec<_>>();
assert_eq!(
testing, expected,
"{} test for testcase ({:?}, {:?}) failed.",
$name, s, w
)
};
}
assert_!(
s.split_sentence_bounds(),
w.iter().cloned(),
"Forward sentence boundaries"
);
}
}
quickcheck! {
fn quickcheck_forward_reverse_graphemes_extended(s: String) -> bool {
let a = s.graphemes(true).collect::<Vec<_>>();
let mut b = s.graphemes(true).rev().collect::<Vec<_>>();
b.reverse();
a == b
}
fn quickcheck_forward_reverse_graphemes_legacy(s: String) -> bool {
let a = s.graphemes(false).collect::<Vec<_>>();
let mut b = s.graphemes(false).rev().collect::<Vec<_>>();
b.reverse();
a == b
}
fn quickcheck_join_graphemes(s: String) -> bool {
let a = s.graphemes(true).collect::<String>();
let b = s.graphemes(false).collect::<String>();
a == s && b == s
}
fn quickcheck_forward_reverse_words(s: String) -> bool {
let a = s.split_word_bounds().collect::<Vec<_>>();
let mut b = s.split_word_bounds().rev().collect::<Vec<_>>();
b.reverse();
a == b
}
fn quickcheck_join_words(s: String) -> bool {
let a = s.split_word_bounds().collect::<String>();
a == s
}
}

File diff suppressed because it is too large


@@ -0,0 +1,754 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::cmp;
use core::iter::Filter;
use crate::tables::word::WordCat;
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWords<'a> {
inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
}
impl<'a> Iterator for UnicodeWords<'a> {
type Item = &'a str;
#[inline]
fn next(&mut self) -> Option<&'a str> {
self.inner.next()
}
}
impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
self.inner.next_back()
}
}
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
/// This iterator also provides the byte offsets for each substring.
///
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWordIndices<'a> {
inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
}
impl<'a> Iterator for UnicodeWordIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.inner.next()
}
}
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.inner.next_back()
}
}
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBounds<'a> {
string: &'a str,
cat: Option<WordCat>,
catb: Option<WordCat>,
}
/// External iterator for word boundaries and byte offsets.
///
/// This struct is created by the [`split_word_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
start_offset: usize,
iter: UWordBounds<'a>,
}
impl<'a> UWordBoundIndices<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "Hello world".split_word_bound_indices();
/// assert_eq!(iter.as_str(), "Hello world");
/// iter.next();
/// assert_eq!(iter.as_str(), " world");
/// iter.next();
/// assert_eq!(iter.as_str(), "world");
/// ```
pub fn as_str(&self) -> &'a str {
self.iter.as_str()
}
}
impl<'a> Iterator for UWordBoundIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next_back()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
}
// state machine for word boundary rules
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
Start,
Letter,
HLetter,
Numeric,
Katakana,
ExtendNumLet,
Regional(RegionalState),
FormatExtend(FormatExtendType),
Zwj,
Emoji,
WSegSpace,
}
// subtypes for FormatExtend state in UWordBoundsState
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
AcceptAny,
AcceptNone,
RequireLetter,
RequireHLetter,
AcceptQLetter,
RequireNumeric,
}
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
Half,
Full,
Unknown,
}
fn is_emoji(ch: char) -> bool {
use crate::tables::emoji;
emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
}
impl<'a> Iterator for UWordBounds<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.string.len();
(cmp::min(slen, 1), Some(slen))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
use self::FormatExtendType::*;
use self::UWordBoundsState::*;
use crate::tables::word as wd;
        if self.string.is_empty() {
return None;
}
let mut take_curr = true;
let mut take_cat = true;
let mut idx = 0;
let mut saveidx = 0;
let mut state = Start;
let mut cat = wd::WC_Any;
let mut savecat = wd::WC_Any;
        // Whether any extend/format/ZWJ characters were skipped; handles precedence of WB3d over WB4
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices() {
idx = curr;
// Whether or not the previous category was ZWJ
// ZWJs get collapsed, so this handles precedence of WB3c over WB4
let prev_zwj = cat == wd::WC_ZWJ;
// if there's a category cached, grab it
cat = match self.cat {
None => wd::word_category(ch).2,
_ => self.cat.take().unwrap(),
};
take_cat = true;
// handle rule WB4
// just skip all format, extend, and zwj chars
// note that Start is a special case: if there's a bunch of Format | Extend
// characters at the beginning of a block of text, dump them out as one unit.
//
// (This is not obvious from the wording of UAX#29, but if you look at the
// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
// then the "correct" interpretation of WB4 becomes apparent.)
if state != Start {
match cat {
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
skipped_format_extend = true;
continue;
}
_ => {}
}
}
// rule WB3c
// WB4 makes all ZWJs collapse into the previous state
// but you can still be in a Zwj state if you started with Zwj
//
// This means that an EP + Zwj will collapse into EP, which is wrong,
// since EP+EP is not a boundary but EP+ZWJ+EP is
//
// Thus, we separately keep track of whether or not the last character
// was a ZWJ. This is an additional bit of state tracked outside of the
// state enum; the state enum represents the last non-zwj state encountered.
// When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
// however we are in the previous state for the purposes of all other rules.
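            // For example, an Extended_Pictographic + ZWJ + Extended_Pictographic sequence
            // (a joined emoji such as a family emoji) stays in one segment via WB3c, while
            // two pictographs with no ZWJ between them break apart under WB999.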
if prev_zwj {
if is_emoji(ch) {
state = Emoji;
continue;
}
}
// Don't use `continue` in this match without updating `cat`
state = match state {
Start if cat == wd::WC_CR => {
idx += match self.get_next_cat(idx) {
Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
_ => 0,
};
break; // rule WB3a
}
Start => match cat {
wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
wd::WC_Katakana => Katakana, // rule WB13, WB13a
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
wd::WC_ZWJ => Zwj, // rule WB3c
wd::WC_WSegSpace => WSegSpace, // rule WB3d
_ => {
if let Some(ncat) = self.get_next_cat(idx) {
// rule WB4
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
{
state = FormatExtend(AcceptNone);
self.cat = Some(ncat);
continue;
}
}
break; // rule WB999
}
},
WSegSpace => match cat {
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
_ => {
take_curr = false;
break;
}
},
Zwj => {
// We already handle WB3c above.
take_curr = false;
break;
}
Letter | HLetter => match cat {
wd::WC_ALetter => Letter, // rule WB5
wd::WC_Hebrew_Letter => HLetter, // rule WB5
wd::WC_Numeric => Numeric, // rule WB9
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_Double_Quote if state == HLetter => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireHLetter) // rule WB7b
}
wd::WC_Single_Quote if state == HLetter => {
FormatExtend(AcceptQLetter) // rule WB7a
}
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireLetter) // rule WB6
}
_ => {
take_curr = false;
break;
}
},
Numeric => match cat {
wd::WC_Numeric => Numeric, // rule WB8
wd::WC_ALetter => Letter, // rule WB10
wd::WC_Hebrew_Letter => HLetter, // rule WB10
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireNumeric) // rule WB12
}
_ => {
take_curr = false;
break;
}
},
Katakana => match cat {
wd::WC_Katakana => Katakana, // rule WB13
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
_ => {
take_curr = false;
break;
}
},
ExtendNumLet => match cat {
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_ALetter => Letter, // rule WB13b
wd::WC_Hebrew_Letter => HLetter, // rule WB13b
wd::WC_Numeric => Numeric, // rule WB13b
wd::WC_Katakana => Katakana, // rule WB13b
_ => {
take_curr = false;
break;
}
},
Regional(RegionalState::Full) => {
                    // If we reach here we've gone too far:
                    // a full flag can only be composed with ZWJ/Extend/Format
                    // preceding it.
take_curr = false;
break;
}
Regional(RegionalState::Half) => match cat {
wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
_ => {
take_curr = false;
break;
}
},
Regional(_) => {
unreachable!("RegionalState::Unknown should not occur on forward iteration")
}
Emoji => {
// We already handle WB3c above. If you've reached this point, the emoji sequence is over.
take_curr = false;
break;
}
FormatExtend(t) => match t {
                    // handle FormatExtend depending on its type
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
AcceptNone | AcceptQLetter => {
take_curr = false; // emit all the Format|Extend characters
take_cat = false;
break;
}
                    _ => break, // rewind (in the if statement below)
},
}
}
if let FormatExtend(t) = state {
// we were looking for something and didn't find it; we have to back up
if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
idx = saveidx;
cat = savecat;
take_curr = false;
}
}
self.cat = if take_curr {
            idx += self.string[idx..].chars().next().unwrap().len_utf8();
None
} else if take_cat {
Some(cat)
} else {
None
};
let retstr = &self.string[..idx];
self.string = &self.string[idx..];
Some(retstr)
}
}
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
use self::FormatExtendType::*;
use self::UWordBoundsState::*;
use crate::tables::word as wd;
        if self.string.is_empty() {
return None;
}
let mut take_curr = true;
let mut take_cat = true;
let mut idx = self.string.len();
idx -= self.string.chars().next_back().unwrap().len_utf8();
let mut previdx = idx;
let mut saveidx = idx;
let mut state = Start;
let mut savestate = Start;
let mut cat = wd::WC_Any;
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices().rev() {
previdx = idx;
idx = curr;
// if there's a category cached, grab it
cat = match self.catb {
None => wd::word_category(ch).2,
_ => self.catb.take().unwrap(),
};
take_cat = true;
// backward iterator over word boundaries. Mostly the same as the forward
// iterator, with two weirdnesses:
// (1) If we encounter a single quote in the Start state, we have to check for a
// Hebrew Letter immediately before it.
// (2) Format and Extend char handling takes some gymnastics.
if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
                // WB3c has higher priority, so we should not
                // fold in that case
if match state {
FormatExtend(_) | Start => false,
_ => true,
} {
saveidx = previdx;
savestate = state;
state = FormatExtend(AcceptNone);
}
if state != Start {
continue;
}
} else if state == FormatExtend(AcceptNone) {
// finished a scan of some Format|Extend chars, restore previous state
state = savestate;
previdx = saveidx;
take_cat = false;
skipped_format_extend = true;
}
// Don't use `continue` in this match without updating `catb`
state = match state {
Start | FormatExtend(AcceptAny) => match cat {
_ if is_emoji(ch) => Zwj,
wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
wd::WC_Katakana => Katakana, // rule WB13, WB13b
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
// rule WB4:
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
wd::WC_Single_Quote => {
saveidx = idx;
FormatExtend(AcceptQLetter) // rule WB7a
}
wd::WC_WSegSpace => WSegSpace,
wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
if state == Start {
if cat == wd::WC_LF {
idx -= match self.get_prev_cat(idx) {
Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
_ => 0,
};
}
} else {
take_curr = false;
}
break; // rule WB3a
}
_ => break, // rule WB999
},
Zwj => match cat {
// rule WB3c
wd::WC_ZWJ => FormatExtend(AcceptAny),
_ => {
take_curr = false;
break;
}
},
WSegSpace => match cat {
// rule WB3d
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
_ => {
take_curr = false;
break;
}
},
Letter | HLetter => match cat {
wd::WC_ALetter => Letter, // rule WB5
wd::WC_Hebrew_Letter => HLetter, // rule WB5
wd::WC_Numeric => Numeric, // rule WB10
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
wd::WC_Double_Quote if state == HLetter => {
saveidx = previdx;
FormatExtend(RequireHLetter) // rule WB7c
}
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
saveidx = previdx;
FormatExtend(RequireLetter) // rule WB7
}
_ => {
take_curr = false;
break;
}
},
Numeric => match cat {
wd::WC_Numeric => Numeric, // rule WB8
wd::WC_ALetter => Letter, // rule WB9
wd::WC_Hebrew_Letter => HLetter, // rule WB9
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
saveidx = previdx;
FormatExtend(RequireNumeric) // rule WB11
}
_ => {
take_curr = false;
break;
}
},
Katakana => match cat {
wd::WC_Katakana => Katakana, // rule WB13
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
_ => {
take_curr = false;
break;
}
},
ExtendNumLet => match cat {
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_ALetter => Letter, // rule WB13a
wd::WC_Hebrew_Letter => HLetter, // rule WB13a
wd::WC_Numeric => Numeric, // rule WB13a
wd::WC_Katakana => Katakana, // rule WB13a
_ => {
take_curr = false;
break;
}
},
Regional(mut regional_state) => match cat {
// rule WB13c
wd::WC_Regional_Indicator => {
if regional_state == RegionalState::Unknown {
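                            // Walking backwards we cannot yet tell whether the regional
                            // indicator we already consumed is the first or the second half
                            // of a flag. Count the run of regional indicators ending at the
                            // current character (skipping Extend/Format/ZWJ per WB4): an even
                            // count means a boundary falls between this character and the
                            // consumed one, an odd count means the two pair up into one flag.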
let count = self.string[..previdx]
.chars()
.rev()
.map(|c| wd::word_category(c).2)
.filter(|&c| {
!(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
})
.take_while(|&c| c == wd::WC_Regional_Indicator)
.count();
regional_state = if count % 2 == 0 {
RegionalState::Full
} else {
RegionalState::Half
};
}
if regional_state == RegionalState::Full {
take_curr = false;
break;
} else {
Regional(RegionalState::Full)
}
}
_ => {
take_curr = false;
break;
}
},
Emoji => {
if is_emoji(ch) {
// rule WB3c
Zwj
} else {
take_curr = false;
break;
}
}
FormatExtend(t) => match t {
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6
RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    _ => break, // backtracking happens in the if statement below
},
}
}
if let FormatExtend(t) = state {
// if we required something but didn't find it, backtrack
if t == RequireLetter
|| t == RequireHLetter
|| t == RequireNumeric
|| t == AcceptNone
|| t == AcceptQLetter
{
previdx = saveidx;
take_cat = false;
take_curr = false;
}
}
self.catb = if take_curr {
None
} else {
idx = previdx;
if take_cat {
Some(cat)
} else {
None
}
};
let retstr = &self.string[idx..];
self.string = &self.string[..idx];
Some(retstr)
}
}
impl<'a> UWordBounds<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "Hello world".split_word_bounds();
/// assert_eq!(iter.as_str(), "Hello world");
/// iter.next();
/// assert_eq!(iter.as_str(), " world");
/// iter.next();
/// assert_eq!(iter.as_str(), "world");
/// ```
pub fn as_str(&self) -> &'a str {
self.string
}
#[inline]
fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
use crate::tables::word as wd;
let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
if nidx < self.string.len() {
let nch = self.string[nidx..].chars().next().unwrap();
Some(wd::word_category(nch).2)
} else {
None
}
}
#[inline]
fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
use crate::tables::word as wd;
if idx > 0 {
let nch = self.string[..idx].chars().next_back().unwrap();
Some(wd::word_category(nch).2)
} else {
None
}
}
}
#[inline]
pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
UWordBounds {
string: s,
cat: None,
catb: None,
}
}
#[inline]
pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
UWordBoundIndices {
start_offset: s.as_ptr() as usize,
iter: new_word_bounds(s),
}
}
#[inline]
fn has_alphanumeric(s: &&str) -> bool {
use crate::tables::util::is_alphanumeric;
s.chars().any(|c| is_alphanumeric(c))
}
#[inline]
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
use super::UnicodeSegmentation;
UnicodeWords {
inner: s.split_word_bounds().filter(has_alphanumeric),
}
}
#[inline]
pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
use super::UnicodeSegmentation;
UnicodeWordIndices {
inner: s
.split_word_bound_indices()
.filter(|(_, c)| has_alphanumeric(c)),
}
}