更新libclamav库1.0.0版本
This commit is contained in:
801
clamav/libclamav_rust/.cargo/vendor/unicode-segmentation/src/grapheme.rs
vendored
Normal file
801
clamav/libclamav_rust/.cargo/vendor/unicode-segmentation/src/grapheme.rs
vendored
Normal file
@@ -0,0 +1,801 @@
|
||||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use core::cmp;
|
||||
|
||||
use crate::tables::grapheme::GraphemeCat;
|
||||
|
||||
/// External iterator for grapheme clusters and byte offsets.
///
/// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct GraphemeIndices<'a> {
    // Byte address of the start of the original string; used to turn the
    // pointer of each yielded sub-slice back into a byte offset.
    start_offset: usize,
    // Underlying grapheme iterator that yields the string slices.
    iter: Graphemes<'a>,
}
|
||||
|
||||
impl<'a> GraphemeIndices<'a> {
    #[inline]
    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
    ///
    /// ```rust
    /// # use unicode_segmentation::UnicodeSegmentation;
    /// let mut iter = "abc".grapheme_indices(true);
    /// assert_eq!(iter.as_str(), "abc");
    /// iter.next();
    /// assert_eq!(iter.as_str(), "bc");
    /// iter.next();
    /// iter.next();
    /// assert_eq!(iter.as_str(), "");
    /// ```
    pub fn as_str(&self) -> &'a str {
        // Delegates to the inner `Graphemes` iterator, which tracks the
        // unconsumed span via its two cursors.
        self.iter.as_str()
    }
}
|
||||
|
||||
impl<'a> Iterator for GraphemeIndices<'a> {
    type Item = (usize, &'a str);

    #[inline]
    fn next(&mut self) -> Option<(usize, &'a str)> {
        // The byte offset is recovered from the yielded slice's pointer
        // relative to the start of the original string, so no separate
        // position counter is needed.
        self.iter
            .next()
            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
|
||||
|
||||
impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
    #[inline]
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
        // Same pointer arithmetic as `next`: offset = slice address - base address.
        self.iter
            .next_back()
            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
    }
}
|
||||
|
||||
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
/// documentation for more.
///
/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
    string: &'a str,
    // Cursor for forward iteration; sits at the start of the next
    // not-yet-yielded cluster.
    cursor: GraphemeCursor,
    // Cursor for reverse iteration; sits just past the last not-yet-yielded
    // cluster. Iteration is exhausted when the two cursors meet.
    cursor_back: GraphemeCursor,
}
|
||||
|
||||
impl<'a> Graphemes<'a> {
    #[inline]
    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
    ///
    /// ```rust
    /// # use unicode_segmentation::UnicodeSegmentation;
    /// let mut iter = "abc".graphemes(true);
    /// assert_eq!(iter.as_str(), "abc");
    /// iter.next();
    /// assert_eq!(iter.as_str(), "bc");
    /// iter.next();
    /// iter.next();
    /// assert_eq!(iter.as_str(), "");
    /// ```
    pub fn as_str(&self) -> &'a str {
        // The unconsumed region is exactly the span between the two cursors.
        &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
    }
}
|
||||
|
||||
impl<'a> Iterator for Graphemes<'a> {
    type Item = &'a str;

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        // Every grapheme cluster is at least one byte, so the remaining byte
        // span is an upper bound on the item count; the lower bound is 1
        // whenever the span is nonempty.
        let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
        (cmp::min(slen, 1), Some(slen))
    }

    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        let start = self.cursor.cur_cursor();
        if start == self.cursor_back.cur_cursor() {
            return None;
        }
        // The cursor was created over the complete string (chunk_start 0,
        // full-length chunk), so `next_boundary` cannot return an
        // incomplete-chunk error, and a next boundary always exists because
        // the span above is nonempty — both unwraps are infallible here.
        let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
        Some(&self.string[start..next])
    }
}
|
||||
|
||||
impl<'a> DoubleEndedIterator for Graphemes<'a> {
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        let end = self.cursor_back.cur_cursor();
        if end == self.cursor.cur_cursor() {
            return None;
        }
        // As in `next`: the whole string is available, so `prev_boundary`
        // cannot fail, and a previous boundary exists since end > front.
        let prev = self
            .cursor_back
            .prev_boundary(self.string, 0)
            .unwrap()
            .unwrap();
        Some(&self.string[prev..end])
    }
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
|
||||
let len = s.len();
|
||||
Graphemes {
|
||||
string: s,
|
||||
cursor: GraphemeCursor::new(0, len, is_extended),
|
||||
cursor_back: GraphemeCursor::new(len, len, is_extended),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
|
||||
GraphemeIndices {
|
||||
start_offset: s.as_ptr() as usize,
|
||||
iter: new_graphemes(s, is_extended),
|
||||
}
|
||||
}
|
||||
|
||||
// maybe unify with PairResult?
// An enum describing information about a potential boundary.
#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
    // No information is known.
    Unknown,
    // It is known to not be a boundary.
    NotBreak,
    // It is known to be a boundary.
    Break,
    // The codepoint after is a Regional Indicator Symbol, so a boundary iff
    // it is preceded by an even number of RIS codepoints. (GB12, GB13)
    Regional,
    // The codepoint after is Extended_Pictographic,
    // so whether it's a boundary depends on pre-context according to GB11.
    Emoji,
}
|
||||
|
||||
/// Cursor-based segmenter for grapheme clusters.
///
/// This allows working with ropes and other datastructures where the string is not contiguous or
/// fully known at initialization time.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
    // Current cursor position.
    offset: usize,
    // Total length of the string.
    len: usize,
    // A config flag indicating whether this cursor computes legacy or extended
    // grapheme cluster boundaries (enables GB9a and GB9b if set).
    is_extended: bool,
    // Information about the potential boundary at `offset`
    state: GraphemeState,
    // Category of codepoint immediately preceding cursor, if known.
    cat_before: Option<GraphemeCat>,
    // Category of codepoint immediately after cursor, if known.
    cat_after: Option<GraphemeCat>,
    // If set, at least one more codepoint immediately preceding this offset
    // is needed to resolve whether there's a boundary at `offset`.
    pre_context_offset: Option<usize>,
    // The number of RIS codepoints preceding `offset`. If `pre_context_offset`
    // is set, then counts the number of RIS between that and `offset`, otherwise
    // is an accurate count relative to the string.
    ris_count: Option<usize>,
    // Set if a call to `prev_boundary` or `next_boundary` was suspended due
    // to needing more input.
    resuming: bool,
    // Cached grapheme category and associated scalar value range.
    // Layout: (range_start, range_end, category); see `grapheme_category`.
    grapheme_cat_cache: (u32, u32, GraphemeCat),
}
|
||||
|
||||
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed. The caller should call `provide_context`
    /// with a chunk ending at the offset given, then retry the query. This
    /// will only be returned if the `chunk_start` parameter is nonzero.
    PreContext(usize),

    /// When requesting `prev_boundary`, the cursor is moving past the beginning
    /// of the current chunk, so the chunk before that is requested. This will
    /// only be returned if the `chunk_start` parameter is nonzero.
    PrevChunk,

    /// When requesting `next_boundary`, the cursor is moving past the end of the
    /// current chunk, so the chunk after that is requested. This will only be
    /// returned if the chunk ends before the `len` parameter provided on
    /// creation of the cursor.
    NextChunk, // requesting chunk following the one given

    /// An error returned when the chunk given does not contain the cursor position.
    InvalidOffset,
}
|
||||
|
||||
// An enum describing the result from lookup of a pair of categories.
#[derive(PartialEq, Eq)]
enum PairResult {
    NotBreak,  // definitely not a break
    Break,     // definitely a break
    Extended,  // a break iff not in extended mode
    Regional,  // a break if preceded by an even number of RIS
    Emoji,     // a break if preceded by emoji base and (Extend)*
}
|
||||
|
||||
#[inline]
// Apply the UAX #29 grapheme-break pair table to the categories on either
// side of a potential boundary. Arm order is significant: the specific
// (GC_CR, GC_LF) arm for GB3 must precede the GB4/GB5 wildcard arms.
fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
    use self::PairResult::*;
    use crate::tables::grapheme::GraphemeCat::*;
    match (before, after) {
        (GC_CR, GC_LF) => NotBreak,                                 // GB3
        (GC_Control, _) => Break,                                   // GB4
        (GC_CR, _) => Break,                                        // GB4
        (GC_LF, _) => Break,                                        // GB4
        (_, GC_Control) => Break,                                   // GB5
        (_, GC_CR) => Break,                                        // GB5
        (_, GC_LF) => Break,                                        // GB5
        (GC_L, GC_L) => NotBreak,                                   // GB6
        (GC_L, GC_V) => NotBreak,                                   // GB6
        (GC_L, GC_LV) => NotBreak,                                  // GB6
        (GC_L, GC_LVT) => NotBreak,                                 // GB6
        (GC_LV, GC_V) => NotBreak,                                  // GB7
        (GC_LV, GC_T) => NotBreak,                                  // GB7
        (GC_V, GC_V) => NotBreak,                                   // GB7
        (GC_V, GC_T) => NotBreak,                                   // GB7
        (GC_LVT, GC_T) => NotBreak,                                 // GB8
        (GC_T, GC_T) => NotBreak,                                   // GB8
        (_, GC_Extend) => NotBreak,                                 // GB9
        (_, GC_ZWJ) => NotBreak,                                    // GB9
        (_, GC_SpacingMark) => Extended,                            // GB9a
        (GC_Prepend, _) => Extended,                                // GB9b
        (GC_ZWJ, GC_Extended_Pictographic) => Emoji,                // GB11
        (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
        (_, _) => Break,                                            // GB999
    }
}
|
||||
|
||||
impl GraphemeCursor {
|
||||
/// Create a new cursor. The string and initial offset are given at creation
/// time, but the contents of the string are not. The `is_extended` parameter
/// controls whether extended grapheme clusters are selected.
///
/// The `offset` parameter must be on a codepoint boundary.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let s = "हिन्दी";
/// let mut legacy = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
/// let mut extended = GraphemeCursor::new(0, s.len(), true);
/// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
/// ```
pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
    // The start and end of the string are always boundaries; anywhere else
    // is unknown until the surrounding text has been examined.
    let state = if offset == 0 || offset == len {
        GraphemeState::Break
    } else {
        GraphemeState::Unknown
    };
    GraphemeCursor {
        offset: offset,
        len: len,
        state: state,
        is_extended: is_extended,
        cat_before: None,
        cat_after: None,
        pre_context_offset: None,
        ris_count: None,
        resuming: false,
        // Empty cache range (0, 0) guarantees a miss on first lookup.
        grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
    }
}
|
||||
|
||||
// Look up the grapheme-break category of `ch`, with an ASCII fast path and
// a one-entry range cache for non-ASCII table lookups.
fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
    use crate::tables::grapheme as gr;
    use crate::tables::grapheme::GraphemeCat::*;

    if ch <= '\u{7e}' {
        // Special-case optimization for ascii, except U+007F. This
        // improves performance even for many primarily non-ascii texts,
        // due to use of punctuation and white space characters from the
        // ascii range.
        if ch >= '\u{20}' {
            GC_Any
        } else if ch == '\n' {
            GC_LF
        } else if ch == '\r' {
            GC_CR
        } else {
            GC_Control
        }
    } else {
        // If this char isn't within the cached range, update the cache to the
        // range that includes it.
        if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
            self.grapheme_cat_cache = gr::grapheme_category(ch);
        }
        self.grapheme_cat_cache.2
    }
}
|
||||
|
||||
// Not sure I'm gonna keep this, the advantage over new() seems thin.

/// Set the cursor to a new location in the same string.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(cursor.cur_cursor(), 0);
/// cursor.set_cursor(2);
/// assert_eq!(cursor.cur_cursor(), 2);
/// ```
pub fn set_cursor(&mut self, offset: usize) {
    if offset != self.offset {
        self.offset = offset;
        // Same rule as `new`: string start/end are known boundaries.
        self.state = if offset == 0 || offset == self.len {
            GraphemeState::Break
        } else {
            GraphemeState::Unknown
        };
        // reset state derived from text around cursor
        self.cat_before = None;
        self.cat_after = None;
        self.ris_count = None;
    }
}
|
||||
|
||||
#[inline]
/// The current offset of the cursor. Equal to the last value provided to
/// `new()` or `set_cursor()`, or returned from `next_boundary()` or
/// `prev_boundary()`.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
/// assert_eq!(cursor.cur_cursor(), 4);
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.cur_cursor(), 8);
/// ```
pub fn cur_cursor(&self) -> usize {
    self.offset
}
|
||||
|
||||
/// Provide additional pre-context when it is needed to decide a boundary.
/// The end of the chunk must coincide with the value given in the
/// `GraphemeIncomplete::PreContext` request.
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
/// // Not enough pre-context to decide if there's a boundary between the two flags.
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
/// // Provide one more Regional Indicator Symbol of pre-context
/// cursor.provide_context(&flags[4..8], 4);
/// // Still not enough context to decide.
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
/// // Provide additional requested context.
/// cursor.provide_context(&flags[0..4], 0);
/// // That's enough to decide (it always is when context goes to the start of the string)
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
/// ```
pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
    use crate::tables::grapheme as gr;
    // The chunk must end exactly at the previously requested offset.
    assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
    self.pre_context_offset = None;
    if self.is_extended && chunk_start + chunk.len() == self.offset {
        let ch = chunk.chars().rev().next().unwrap();
        if self.grapheme_category(ch) == gr::GC_Prepend {
            self.decide(false); // GB9b
            return;
        }
    }
    match self.state {
        // Resume whichever multi-codepoint scan was suspended for context.
        GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
        GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
        _ => {
            // Otherwise, all that was missing was the category of the
            // codepoint immediately before the cursor.
            if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
                let ch = chunk.chars().rev().next().unwrap();
                self.cat_before = Some(self.grapheme_category(ch));
            }
        }
    }
}
|
||||
|
||||
#[inline]
|
||||
fn decide(&mut self, is_break: bool) {
|
||||
self.state = if is_break {
|
||||
GraphemeState::Break
|
||||
} else {
|
||||
GraphemeState::NotBreak
|
||||
};
|
||||
}
|
||||
|
||||
#[inline]
// Record the verdict in `state` and also return it to the caller.
fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
    self.decide(is_break);
    Ok(is_break)
}
|
||||
|
||||
#[inline]
// Translate the current `state` into a query result: a settled verdict, or
// a `PreContext` request if a scan was suspended awaiting earlier text.
fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
    if self.state == GraphemeState::Break {
        Ok(true)
    } else if self.state == GraphemeState::NotBreak {
        Ok(false)
    } else if let Some(pre_context_offset) = self.pre_context_offset {
        Err(GraphemeIncomplete::PreContext(pre_context_offset))
    } else {
        // Callers only invoke this after a handler has either decided or
        // set `pre_context_offset`; any other state is a bug.
        unreachable!("inconsistent state");
    }
}
|
||||
|
||||
#[inline]
// Resolve GB12/GB13: scan backwards counting Regional Indicator codepoints.
// A boundary exists iff the count of RIS before the cursor is even. If the
// whole chunk is RIS and we aren't at the string start, suspend and request
// more pre-context.
fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
    use crate::tables::grapheme as gr;
    let mut ris_count = self.ris_count.unwrap_or(0);
    for ch in chunk.chars().rev() {
        if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
            // Hit a non-RIS codepoint: the count is now exact.
            self.ris_count = Some(ris_count);
            self.decide((ris_count % 2) == 0);
            return;
        }
        ris_count += 1;
    }
    self.ris_count = Some(ris_count);
    if chunk_start == 0 {
        // Reached the start of the string; the count is exact.
        self.decide((ris_count % 2) == 0);
        return;
    }
    // Ran out of chunk while still seeing RIS: need earlier text.
    self.pre_context_offset = Some(chunk_start);
    self.state = GraphemeState::Regional;
}
|
||||
|
||||
#[inline]
// Resolve GB11: scanning backwards from the cursor, there is no break iff
// the text before it matches Extended_Pictographic Extend* ZWJ. Suspends
// for pre-context if the chunk is exhausted before the pattern resolves.
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
    use crate::tables::grapheme as gr;
    let mut iter = chunk.chars().rev();
    if let Some(ch) = iter.next() {
        // The codepoint immediately before the cursor must be ZWJ for GB11.
        if self.grapheme_category(ch) != gr::GC_ZWJ {
            self.decide(true);
            return;
        }
    }
    for ch in iter {
        match self.grapheme_category(ch) {
            gr::GC_Extend => (), // skip any Extend run
            gr::GC_Extended_Pictographic => {
                // Full GB11 pattern matched: no break.
                self.decide(false);
                return;
            }
            _ => {
                self.decide(true);
                return;
            }
        }
    }
    if chunk_start == 0 {
        // Start of string reached without a pictographic base: break.
        self.decide(true);
        return;
    }
    // Pattern still unresolved at the chunk edge: need earlier text.
    self.pre_context_offset = Some(chunk_start);
    self.state = GraphemeState::Emoji;
}
|
||||
|
||||
#[inline]
/// Determine whether the current cursor location is a grapheme cluster boundary.
/// Only a part of the string need be supplied. If `chunk_start` is nonzero or
/// the length of `chunk` is not equal to `len` on creation, then this method
/// may return `GraphemeIncomplete::PreContext`. The caller should then
/// call `provide_context` with the requested chunk, then retry calling this
/// method.
///
/// For partial chunks, if the cursor is not at the beginning or end of the
/// string, the chunk should contain at least the codepoint following the cursor.
/// If the string is nonempty, the chunk must be nonempty.
///
/// All calls should have consistent chunk contents (ie, if a chunk provides
/// content for a given slice, all further chunks covering that slice must have
/// the same content for it).
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
/// cursor.set_cursor(12);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
/// ```
pub fn is_boundary(
    &mut self,
    chunk: &str,
    chunk_start: usize,
) -> Result<bool, GraphemeIncomplete> {
    use crate::tables::grapheme as gr;
    // Fast path: the verdict was already computed.
    if self.state == GraphemeState::Break {
        return Ok(true);
    }
    if self.state == GraphemeState::NotBreak {
        return Ok(false);
    }
    // The chunk must cover the cursor (or end exactly at it, when the
    // category after the cursor is already known from a previous call).
    if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
        if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
            return Err(GraphemeIncomplete::InvalidOffset);
        }
    }
    // A previously requested pre-context chunk has not been provided yet.
    if let Some(pre_context_offset) = self.pre_context_offset {
        return Err(GraphemeIncomplete::PreContext(pre_context_offset));
    }
    let offset_in_chunk = self.offset - chunk_start;
    if self.cat_after.is_none() {
        let ch = chunk[offset_in_chunk..].chars().next().unwrap();
        self.cat_after = Some(self.grapheme_category(ch));
    }
    if self.offset == chunk_start {
        // The codepoint before the cursor lies outside this chunk; decide
        // what kind of pre-context scan (if any) will be needed.
        let mut need_pre_context = true;
        match self.cat_after.unwrap() {
            gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
            gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
            _ => need_pre_context = self.cat_before.is_none(),
        }
        if need_pre_context {
            self.pre_context_offset = Some(chunk_start);
            return Err(GraphemeIncomplete::PreContext(chunk_start));
        }
    }
    if self.cat_before.is_none() {
        let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
        self.cat_before = Some(self.grapheme_category(ch));
    }
    // Apply the pair table; the Regional/Emoji cases may need a backward
    // scan over the preceding text to resolve.
    match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
        PairResult::NotBreak => return self.decision(false),
        PairResult::Break => return self.decision(true),
        PairResult::Extended => {
            let is_extended = self.is_extended;
            return self.decision(!is_extended);
        }
        PairResult::Regional => {
            if let Some(ris_count) = self.ris_count {
                return self.decision((ris_count % 2) == 0);
            }
            self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
            self.is_boundary_result()
        }
        PairResult::Emoji => {
            self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
            self.is_boundary_result()
        }
    }
}
|
||||
|
||||
#[inline]
/// Find the next boundary after the current cursor position. Only a part of
/// the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk following the one
/// given, then retry.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings:
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
/// ```
pub fn next_boundary(
    &mut self,
    chunk: &str,
    chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
    if self.offset == self.len {
        return Ok(None);
    }
    let mut iter = chunk[self.offset - chunk_start..].chars();
    let mut ch = iter.next().unwrap();
    // Advance one codepoint at a time until `is_boundary` reports a break.
    loop {
        if self.resuming {
            // A previous call was interrupted mid-query; don't re-advance,
            // just fill in the category that was missing and retry.
            if self.cat_after.is_none() {
                self.cat_after = Some(self.grapheme_category(ch));
            }
        } else {
            // Step the cursor past `ch` and shift cached categories along.
            self.offset += ch.len_utf8();
            self.state = GraphemeState::Unknown;
            self.cat_before = self.cat_after.take();
            if self.cat_before.is_none() {
                self.cat_before = Some(self.grapheme_category(ch));
            }
            // Maintain the running RIS count incrementally (GB12/GB13).
            if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
                self.ris_count = self.ris_count.map(|c| c + 1);
            } else {
                self.ris_count = Some(0);
            }
            if let Some(next_ch) = iter.next() {
                ch = next_ch;
                self.cat_after = Some(self.grapheme_category(ch));
            } else if self.offset == self.len {
                self.decide(true);
            } else {
                // Ran off the end of the chunk before the end of the string.
                self.resuming = true;
                return Err(GraphemeIncomplete::NextChunk);
            }
        }
        self.resuming = true;
        if self.is_boundary(chunk, chunk_start)? {
            self.resuming = false;
            return Ok(Some(self.offset));
        }
        self.resuming = false;
    }
}
|
||||
|
||||
/// Find the previous boundary after the current cursor position. Only a part
/// of the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk preceding the one
/// given, then retry.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings (note the exact return is not
/// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(4, s.len(), false);
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
/// ```
pub fn prev_boundary(
    &mut self,
    chunk: &str,
    chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
    if self.offset == 0 {
        return Ok(None);
    }
    if self.offset == chunk_start {
        return Err(GraphemeIncomplete::PrevChunk);
    }
    let mut iter = chunk[..self.offset - chunk_start].chars().rev();
    let mut ch = iter.next().unwrap();
    // Step backwards one codepoint at a time until a boundary is found.
    loop {
        if self.offset == chunk_start {
            self.resuming = true;
            return Err(GraphemeIncomplete::PrevChunk);
        }
        if self.resuming {
            // A previous call was interrupted; fill in the missing category
            // for the codepoint before the cursor and retry the query.
            self.cat_before = Some(self.grapheme_category(ch));
        } else {
            // Step the cursor back over `ch` and shift cached categories.
            self.offset -= ch.len_utf8();
            self.cat_after = self.cat_before.take();
            self.state = GraphemeState::Unknown;
            // Moving left past a codepoint invalidates or decrements the
            // RIS count; `None` forces a rescan when it is next needed.
            if let Some(ris_count) = self.ris_count {
                self.ris_count = if ris_count > 0 {
                    Some(ris_count - 1)
                } else {
                    None
                };
            }
            if let Some(prev_ch) = iter.next() {
                ch = prev_ch;
                self.cat_before = Some(self.grapheme_category(ch));
            } else if self.offset == 0 {
                self.decide(true);
            } else {
                // Ran off the start of the chunk before the string start.
                self.resuming = true;
                self.cat_after = Some(self.grapheme_category(ch));
                return Err(GraphemeIncomplete::PrevChunk);
            }
        }
        self.resuming = true;
        if self.is_boundary(chunk, chunk_start)? {
            self.resuming = false;
            return Ok(Some(self.offset));
        }
        self.resuming = false;
    }
}
|
||||
}
|
||||
|
||||
#[test]
// Deciding a boundary between flag emoji (RIS pairs) requires counting
// preceding Regional Indicators all the way back to the string start, so a
// mid-string chunk must trigger repeated PreContext requests.
fn test_grapheme_cursor_ris_precontext() {
    let s = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let mut c = GraphemeCursor::new(8, s.len(), true);
    assert_eq!(
        c.is_boundary(&s[4..], 4),
        Err(GraphemeIncomplete::PreContext(4))
    );
    c.provide_context(&s[..4], 0);
    assert_eq!(c.is_boundary(&s[4..], 4), Ok(true));
}
|
||||
|
||||
#[test]
// A cursor sitting exactly at a chunk's start cannot see the codepoint
// before it, so it must request pre-context; with "\r\n" the answer after
// providing it is "not a boundary" (GB3: no break between CR and LF).
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let s = "\r\n";
    let mut c = GraphemeCursor::new(1, s.len(), true);
    assert_eq!(
        c.is_boundary(&s[1..], 1),
        Err(GraphemeIncomplete::PreContext(1))
    );
    c.provide_context(&s[..1], 0);
    assert_eq!(c.is_boundary(&s[1..], 1), Ok(false));
}
|
||||
|
||||
#[test]
// Moving backwards past the start of the supplied chunk yields PrevChunk;
// retrying with the preceding chunk then succeeds.
fn test_grapheme_cursor_prev_boundary() {
    let s = "abcd";
    let mut c = GraphemeCursor::new(3, s.len(), true);
    assert_eq!(
        c.prev_boundary(&s[2..], 2),
        Err(GraphemeIncomplete::PrevChunk)
    );
    assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(2)));
}
|
||||
|
||||
#[test]
// Same as above, but with the cursor initially exactly at the chunk start:
// the very first call must already return PrevChunk.
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let s = "abcd";
    let mut c = GraphemeCursor::new(2, s.len(), true);
    assert_eq!(
        c.prev_boundary(&s[2..], 2),
        Err(GraphemeIncomplete::PrevChunk)
    );
    assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1)));
}
|
||||
307
clamav/libclamav_rust/.cargo/vendor/unicode-segmentation/src/lib.rs
vendored
Normal file
307
clamav/libclamav_rust/.cargo/vendor/unicode-segmentation/src/lib.rs
vendored
Normal file
@@ -0,0 +1,307 @@
|
||||
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
|
||||
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
|
||||
//!
|
||||
//! ```rust
|
||||
//! extern crate unicode_segmentation;
|
||||
//!
|
||||
//! use unicode_segmentation::UnicodeSegmentation;
|
||||
//!
|
||||
//! fn main() {
|
||||
//! let s = "a̐éö̲\r\n";
|
||||
//! let g = UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>();
|
||||
//! let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
|
||||
//! assert_eq!(g, b);
|
||||
//!
|
||||
//! let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
|
||||
//! let w = s.unicode_words().collect::<Vec<&str>>();
|
||||
//! let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
|
||||
//! assert_eq!(w, b);
|
||||
//!
|
||||
//! let s = "The quick (\"brown\") fox";
|
||||
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
|
||||
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
|
||||
//! assert_eq!(w, b);
|
||||
//! }
|
||||
//! ```
|
||||
//!
|
||||
//! # no_std
|
||||
//!
|
||||
//! unicode-segmentation does not depend on libstd, so it can be used in crates
|
||||
//! with the `#![no_std]` attribute.
|
||||
//!
|
||||
//! # crates.io
|
||||
//!
|
||||
//! You can use this package in your project by adding the following
|
||||
//! to your `Cargo.toml`:
|
||||
//!
|
||||
//! ```toml
|
||||
//! [dependencies]
|
||||
//! unicode-segmentation = "1.9.0"
|
||||
//! ```
|
||||
|
||||
#![deny(missing_docs, unsafe_code)]
|
||||
#![doc(
|
||||
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
|
||||
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
|
||||
)]
|
||||
#![no_std]
|
||||
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
extern crate std;
|
||||
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
extern crate quickcheck;
|
||||
|
||||
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
|
||||
pub use grapheme::{GraphemeIndices, Graphemes};
|
||||
pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
|
||||
pub use tables::UNICODE_VERSION;
|
||||
pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
|
||||
|
||||
mod grapheme;
|
||||
#[rustfmt::skip]
|
||||
mod tables;
|
||||
mod sentence;
|
||||
mod word;
|
||||
|
||||
#[cfg(test)]
|
||||
mod test;
|
||||
#[cfg(test)]
|
||||
mod testdata;
|
||||
|
||||
/// Methods for segmenting strings according to
|
||||
/// [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/).
|
||||
pub trait UnicodeSegmentation {
|
||||
/// Returns an iterator over the [grapheme clusters][graphemes] of `self`.
|
||||
///
|
||||
/// [graphemes]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
|
||||
///
|
||||
/// If `is_extended` is true, the iterator is over the
|
||||
/// *extended grapheme clusters*;
|
||||
/// otherwise, the iterator is over the *legacy grapheme clusters*.
|
||||
/// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
|
||||
/// recommends extended grapheme cluster boundaries for general processing.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let gr1 = UnicodeSegmentation::graphemes("a\u{310}e\u{301}o\u{308}\u{332}", true)
|
||||
/// .collect::<Vec<&str>>();
|
||||
/// let b: &[_] = &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"];
|
||||
///
|
||||
/// assert_eq!(&gr1[..], b);
|
||||
///
|
||||
/// let gr2 = UnicodeSegmentation::graphemes("a\r\nb🇷🇺🇸🇹", true).collect::<Vec<&str>>();
|
||||
/// let b: &[_] = &["a", "\r\n", "b", "🇷🇺", "🇸🇹"];
|
||||
///
|
||||
/// assert_eq!(&gr2[..], b);
|
||||
/// ```
|
||||
fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
|
||||
|
||||
/// Returns an iterator over the grapheme clusters of `self` and their
|
||||
/// byte offsets. See `graphemes()` for more information.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let gr_inds = UnicodeSegmentation::grapheme_indices("a̐éö̲\r\n", true)
|
||||
/// .collect::<Vec<(usize, &str)>>();
|
||||
/// let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
|
||||
///
|
||||
/// assert_eq!(&gr_inds[..], b);
|
||||
/// ```
|
||||
fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
|
||||
|
||||
/// Returns an iterator over the words of `self`, separated on
|
||||
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
|
||||
///
|
||||
/// Here, "words" are just those substrings which, after splitting on
|
||||
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
|
||||
/// substring must contain at least one character with the
|
||||
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
|
||||
/// property, or with
|
||||
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
|
||||
/// let uw1 = uws.unicode_words().collect::<Vec<&str>>();
|
||||
/// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
|
||||
///
|
||||
/// assert_eq!(&uw1[..], b);
|
||||
/// ```
|
||||
fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
|
||||
|
||||
/// Returns an iterator over the words of `self`, separated on
|
||||
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
|
||||
/// offsets.
|
||||
///
|
||||
/// Here, "words" are just those substrings which, after splitting on
|
||||
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
|
||||
/// substring must contain at least one character with the
|
||||
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
|
||||
/// property, or with
|
||||
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
|
||||
/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
|
||||
/// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
|
||||
/// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
|
||||
///
|
||||
/// assert_eq!(&uwi1[..], b);
|
||||
/// ```
|
||||
fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;
|
||||
|
||||
/// Returns an iterator over substrings of `self` separated on
|
||||
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
|
||||
///
|
||||
/// The concatenation of the substrings returned by this function is just the original string.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
|
||||
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
|
||||
///
|
||||
/// assert_eq!(&swu1[..], b);
|
||||
/// ```
|
||||
fn split_word_bounds<'a>(&'a self) -> UWordBounds<'a>;
|
||||
|
||||
/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
|
||||
/// and their offsets. See `split_word_bounds()` for more information.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let swi1 = "Brr, it's 29.3°F!".split_word_bound_indices().collect::<Vec<(usize, &str)>>();
|
||||
/// let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"),
|
||||
/// (14, "°"), (16, "F"), (17, "!")];
|
||||
///
|
||||
/// assert_eq!(&swi1[..], b);
|
||||
/// ```
|
||||
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
|
||||
|
||||
/// Returns an iterator over substrings of `self` separated on
|
||||
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
|
||||
///
|
||||
/// Here, "sentences" are just those substrings which, after splitting on
|
||||
/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
|
||||
/// substring must contain at least one character with the
|
||||
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
|
||||
/// property, or with
|
||||
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let uss = "Mr. Fox jumped. [...] The dog was too lazy.";
|
||||
/// let us1 = uss.unicode_sentences().collect::<Vec<&str>>();
|
||||
/// let b: &[_] = &["Mr. ", "Fox jumped. ", "The dog was too lazy."];
|
||||
///
|
||||
/// assert_eq!(&us1[..], b);
|
||||
/// ```
|
||||
fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
|
||||
|
||||
/// Returns an iterator over substrings of `self` separated on
|
||||
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
|
||||
///
|
||||
/// The concatenation of the substrings returned by this function is just the original string.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let ssbs = "Mr. Fox jumped. [...] The dog was too lazy.";
|
||||
/// let ssb1 = ssbs.split_sentence_bounds().collect::<Vec<&str>>();
|
||||
/// let b: &[_] = &["Mr. ", "Fox jumped. ", "[...] ", "The dog was too lazy."];
|
||||
///
|
||||
/// assert_eq!(&ssb1[..], b);
|
||||
/// ```
|
||||
fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
|
||||
|
||||
/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
|
||||
/// and their offsets. See `split_sentence_bounds()` for more information.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let ssis = "Mr. Fox jumped. [...] The dog was too lazy.";
|
||||
/// let ssi1 = ssis.split_sentence_bound_indices().collect::<Vec<(usize, &str)>>();
|
||||
/// let b: &[_] = &[(0, "Mr. "), (4, "Fox jumped. "), (16, "[...] "),
|
||||
/// (22, "The dog was too lazy.")];
|
||||
///
|
||||
/// assert_eq!(&ssi1[..], b);
|
||||
/// ```
|
||||
fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
|
||||
}
|
||||
|
||||
impl UnicodeSegmentation for str {
|
||||
#[inline]
|
||||
fn graphemes(&self, is_extended: bool) -> Graphemes {
|
||||
grapheme::new_graphemes(self, is_extended)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
|
||||
grapheme::new_grapheme_indices(self, is_extended)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn unicode_words(&self) -> UnicodeWords {
|
||||
word::new_unicode_words(self)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn unicode_word_indices(&self) -> UnicodeWordIndices {
|
||||
word::new_unicode_word_indices(self)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn split_word_bounds(&self) -> UWordBounds {
|
||||
word::new_word_bounds(self)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn split_word_bound_indices(&self) -> UWordBoundIndices {
|
||||
word::new_word_bound_indices(self)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn unicode_sentences(&self) -> UnicodeSentences {
|
||||
sentence::new_unicode_sentences(self)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn split_sentence_bounds(&self) -> USentenceBounds {
|
||||
sentence::new_sentence_bounds(self)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
|
||||
sentence::new_sentence_bound_indices(self)
|
||||
}
|
||||
}
|
||||
415
clamav/libclamav_rust/.cargo/vendor/unicode-segmentation/src/sentence.rs
vendored
Normal file
415
clamav/libclamav_rust/.cargo/vendor/unicode-segmentation/src/sentence.rs
vendored
Normal file
@@ -0,0 +1,415 @@
|
||||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use core::cmp;
|
||||
use core::iter::Filter;
|
||||
|
||||
// All of the logic for forward iteration over sentences
|
||||
mod fwd {
|
||||
use crate::tables::sentence::SentenceCat;
|
||||
use core::cmp;
|
||||
|
||||
// Describe a parsed part of source string as described in this table:
|
||||
// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
|
||||
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||
enum StatePart {
|
||||
Sot,
|
||||
Eot,
|
||||
Other,
|
||||
CR,
|
||||
LF,
|
||||
Sep,
|
||||
ATerm,
|
||||
UpperLower,
|
||||
ClosePlus,
|
||||
SpPlus,
|
||||
STerm,
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
struct SentenceBreaksState(pub [StatePart; 4]);
|
||||
|
||||
const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
|
||||
StatePart::Sot,
|
||||
StatePart::Sot,
|
||||
StatePart::Sot,
|
||||
StatePart::Sot,
|
||||
]);
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SentenceBreaks<'a> {
|
||||
pub string: &'a str,
|
||||
pos: usize,
|
||||
state: SentenceBreaksState,
|
||||
}
|
||||
|
||||
impl SentenceBreaksState {
|
||||
// Attempt to advance the internal state by one part
|
||||
// Whitespace and some punctutation will be collapsed
|
||||
fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
|
||||
let &SentenceBreaksState(parts) = self;
|
||||
let parts = match (parts[3], cat) {
|
||||
(StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
|
||||
(StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
|
||||
_ => [
|
||||
parts[1],
|
||||
parts[2],
|
||||
parts[3],
|
||||
match cat {
|
||||
SentenceCat::SC_CR => StatePart::CR,
|
||||
SentenceCat::SC_LF => StatePart::LF,
|
||||
SentenceCat::SC_Sep => StatePart::Sep,
|
||||
SentenceCat::SC_ATerm => StatePart::ATerm,
|
||||
SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
|
||||
SentenceCat::SC_Close => StatePart::ClosePlus,
|
||||
SentenceCat::SC_Sp => StatePart::SpPlus,
|
||||
SentenceCat::SC_STerm => StatePart::STerm,
|
||||
_ => StatePart::Other,
|
||||
},
|
||||
],
|
||||
};
|
||||
SentenceBreaksState(parts)
|
||||
}
|
||||
|
||||
fn end(&self) -> SentenceBreaksState {
|
||||
let &SentenceBreaksState(parts) = self;
|
||||
SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
|
||||
}
|
||||
|
||||
// Helper function to check if state head matches a single `StatePart`
|
||||
fn match1(&self, part: StatePart) -> bool {
|
||||
let &SentenceBreaksState(parts) = self;
|
||||
part == parts[3]
|
||||
}
|
||||
|
||||
// Helper function to check if first two `StateParts` in state match
|
||||
// the given two
|
||||
fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
|
||||
let &SentenceBreaksState(parts) = self;
|
||||
part1 == parts[2] && part2 == parts[3]
|
||||
}
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#SB8
|
||||
// TODO cache this, it is currently quadratic
|
||||
fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
|
||||
let &SentenceBreaksState(parts) = state;
|
||||
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
|
||||
if parts[idx] == StatePart::ClosePlus {
|
||||
idx -= 1
|
||||
}
|
||||
|
||||
if parts[idx] == StatePart::ATerm {
|
||||
use crate::tables::sentence as se;
|
||||
|
||||
for next_char in ahead.chars() {
|
||||
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
|
||||
match se::sentence_category(next_char).2 {
|
||||
se::SC_Lower => return true,
|
||||
se::SC_OLetter
|
||||
| se::SC_Upper
|
||||
| se::SC_Sep
|
||||
| se::SC_CR
|
||||
| se::SC_LF
|
||||
| se::SC_STerm
|
||||
| se::SC_ATerm => return false,
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#SB8a
|
||||
fn match_sb8a(state: &SentenceBreaksState) -> bool {
|
||||
// SATerm Close* Sp*
|
||||
let &SentenceBreaksState(parts) = state;
|
||||
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
|
||||
if parts[idx] == StatePart::ClosePlus {
|
||||
idx -= 1
|
||||
}
|
||||
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#SB9
|
||||
fn match_sb9(state: &SentenceBreaksState) -> bool {
|
||||
// SATerm Close*
|
||||
let &SentenceBreaksState(parts) = state;
|
||||
let idx = if parts[3] == StatePart::ClosePlus {
|
||||
2
|
||||
} else {
|
||||
3
|
||||
};
|
||||
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#SB11
|
||||
fn match_sb11(state: &SentenceBreaksState) -> bool {
|
||||
// SATerm Close* Sp* ParaSep?
|
||||
let &SentenceBreaksState(parts) = state;
|
||||
let mut idx = match parts[3] {
|
||||
StatePart::Sep | StatePart::CR | StatePart::LF => 2,
|
||||
_ => 3,
|
||||
};
|
||||
|
||||
if parts[idx] == StatePart::SpPlus {
|
||||
idx -= 1
|
||||
}
|
||||
if parts[idx] == StatePart::ClosePlus {
|
||||
idx -= 1
|
||||
}
|
||||
|
||||
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
|
||||
}
|
||||
|
||||
impl<'a> Iterator for SentenceBreaks<'a> {
|
||||
// Returns the index of the character which follows a break
|
||||
type Item = usize;
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let slen = self.string.len();
|
||||
// A sentence could be one character
|
||||
(cmp::min(slen, 2), Some(slen + 1))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<usize> {
|
||||
use crate::tables::sentence as se;
|
||||
|
||||
for next_char in self.string[self.pos..].chars() {
|
||||
let position_before = self.pos;
|
||||
let state_before = self.state.clone();
|
||||
|
||||
let next_cat = se::sentence_category(next_char).2;
|
||||
|
||||
self.pos += next_char.len_utf8();
|
||||
self.state = self.state.next(next_cat);
|
||||
|
||||
match next_cat {
|
||||
// SB1 https://unicode.org/reports/tr29/#SB1
|
||||
_ if state_before.match1(StatePart::Sot) => return Some(position_before),
|
||||
|
||||
// SB2 is handled when inner iterator (chars) is finished
|
||||
|
||||
// SB3 https://unicode.org/reports/tr29/#SB3
|
||||
SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,
|
||||
|
||||
// SB4 https://unicode.org/reports/tr29/#SB4
|
||||
_ if state_before.match1(StatePart::Sep)
|
||||
|| state_before.match1(StatePart::CR)
|
||||
|| state_before.match1(StatePart::LF) =>
|
||||
{
|
||||
return Some(position_before)
|
||||
}
|
||||
|
||||
// SB5 https://unicode.org/reports/tr29/#SB5
|
||||
SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,
|
||||
|
||||
// SB6 https://unicode.org/reports/tr29/#SB6
|
||||
SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,
|
||||
|
||||
// SB7 https://unicode.org/reports/tr29/#SB7
|
||||
SentenceCat::SC_Upper
|
||||
if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
|
||||
{
|
||||
continue
|
||||
}
|
||||
|
||||
// SB8 https://unicode.org/reports/tr29/#SB8
|
||||
_ if match_sb8(&state_before, &self.string[position_before..]) => continue,
|
||||
|
||||
// SB8a https://unicode.org/reports/tr29/#SB8a
|
||||
SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
|
||||
if match_sb8a(&state_before) =>
|
||||
{
|
||||
continue
|
||||
}
|
||||
|
||||
// SB9 https://unicode.org/reports/tr29/#SB9
|
||||
SentenceCat::SC_Close
|
||||
| SentenceCat::SC_Sp
|
||||
| SentenceCat::SC_Sep
|
||||
| SentenceCat::SC_CR
|
||||
| SentenceCat::SC_LF
|
||||
if match_sb9(&state_before) =>
|
||||
{
|
||||
continue
|
||||
}
|
||||
|
||||
// SB10 https://unicode.org/reports/tr29/#SB10
|
||||
SentenceCat::SC_Sp
|
||||
| SentenceCat::SC_Sep
|
||||
| SentenceCat::SC_CR
|
||||
| SentenceCat::SC_LF
|
||||
if match_sb8a(&state_before) =>
|
||||
{
|
||||
continue
|
||||
}
|
||||
|
||||
// SB11 https://unicode.org/reports/tr29/#SB11
|
||||
_ if match_sb11(&state_before) => return Some(position_before),
|
||||
|
||||
// SB998 https://unicode.org/reports/tr29/#SB998
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
|
||||
// SB2 https://unicode.org/reports/tr29/#SB2
|
||||
if self.state.match1(StatePart::Sot) {
|
||||
None
|
||||
} else if self.state.match1(StatePart::Eot) {
|
||||
None
|
||||
} else {
|
||||
self.state = self.state.end();
|
||||
Some(self.pos)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
|
||||
SentenceBreaks {
|
||||
string: source,
|
||||
pos: 0,
|
||||
state: INITIAL_STATE,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over the substrings of a string which, after splitting the string on
|
||||
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
|
||||
/// contain any characters with the
|
||||
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
|
||||
/// property, or with
|
||||
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
|
||||
///
|
||||
/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
|
||||
/// trait. See its documentation for more.
|
||||
///
|
||||
/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
|
||||
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
|
||||
#[derive(Clone)]
|
||||
pub struct UnicodeSentences<'a> {
|
||||
inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
|
||||
}
|
||||
|
||||
/// External iterator for a string's
|
||||
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
|
||||
///
|
||||
/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
|
||||
/// trait. See its documentation for more.
|
||||
///
|
||||
/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
|
||||
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
|
||||
#[derive(Clone)]
|
||||
pub struct USentenceBounds<'a> {
|
||||
iter: fwd::SentenceBreaks<'a>,
|
||||
sentence_start: Option<usize>,
|
||||
}
|
||||
|
||||
/// External iterator for sentence boundaries and byte offsets.
|
||||
///
|
||||
/// This struct is created by the [`split_sentence_bound_indices`] method on the
|
||||
/// [`UnicodeSegmentation`] trait. See its documentation for more.
|
||||
///
|
||||
/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
|
||||
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
|
||||
#[derive(Clone)]
|
||||
pub struct USentenceBoundIndices<'a> {
|
||||
start_offset: usize,
|
||||
iter: USentenceBounds<'a>,
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
|
||||
USentenceBounds {
|
||||
iter: fwd::new_sentence_breaks(source),
|
||||
sentence_start: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
|
||||
USentenceBoundIndices {
|
||||
start_offset: source.as_ptr() as usize,
|
||||
iter: new_sentence_bounds(source),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
|
||||
use super::UnicodeSegmentation;
|
||||
use crate::tables::util::is_alphanumeric;
|
||||
|
||||
fn has_alphanumeric(s: &&str) -> bool {
|
||||
s.chars().any(|c| is_alphanumeric(c))
|
||||
}
|
||||
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
|
||||
|
||||
UnicodeSentences {
|
||||
inner: s.split_sentence_bounds().filter(has_alphanumeric),
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for UnicodeSentences<'a> {
|
||||
type Item = &'a str;
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<&'a str> {
|
||||
self.inner.next()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for USentenceBounds<'a> {
|
||||
type Item = &'a str;
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let (lower, upper) = self.iter.size_hint();
|
||||
(cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1)))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<&'a str> {
|
||||
if self.sentence_start == None {
|
||||
if let Some(start_pos) = self.iter.next() {
|
||||
self.sentence_start = Some(start_pos)
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(break_pos) = self.iter.next() {
|
||||
let start_pos = self.sentence_start.unwrap();
|
||||
let sentence = &self.iter.string[start_pos..break_pos];
|
||||
self.sentence_start = Some(break_pos);
|
||||
Some(sentence)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for USentenceBoundIndices<'a> {
|
||||
type Item = (usize, &'a str);
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<(usize, &'a str)> {
|
||||
self.iter
|
||||
.next()
|
||||
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.iter.size_hint()
|
||||
}
|
||||
}
|
||||
2675
clamav/libclamav_rust/.cargo/vendor/unicode-segmentation/src/tables.rs
vendored
Normal file
2675
clamav/libclamav_rust/.cargo/vendor/unicode-segmentation/src/tables.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
247
clamav/libclamav_rust/.cargo/vendor/unicode-segmentation/src/test.rs
vendored
Normal file
247
clamav/libclamav_rust/.cargo/vendor/unicode-segmentation/src/test.rs
vendored
Normal file
@@ -0,0 +1,247 @@
|
||||
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::UnicodeSegmentation;
|
||||
|
||||
use std::prelude::v1::*;
|
||||
|
||||
#[test]
|
||||
fn test_graphemes() {
|
||||
use crate::testdata::{TEST_DIFF, TEST_SAME};
|
||||
|
||||
pub const EXTRA_DIFF: &'static [(
|
||||
&'static str,
|
||||
&'static [&'static str],
|
||||
&'static [&'static str],
|
||||
)] = &[
|
||||
// Official test suite doesn't include two Prepend chars between two other chars.
|
||||
(
|
||||
"\u{20}\u{600}\u{600}\u{20}",
|
||||
&["\u{20}", "\u{600}\u{600}\u{20}"],
|
||||
&["\u{20}", "\u{600}", "\u{600}", "\u{20}"],
|
||||
),
|
||||
// Test for Prepend followed by two Any chars
|
||||
(
|
||||
"\u{600}\u{20}\u{20}",
|
||||
&["\u{600}\u{20}", "\u{20}"],
|
||||
&["\u{600}", "\u{20}", "\u{20}"],
|
||||
),
|
||||
];
|
||||
|
||||
pub const EXTRA_SAME: &'static [(&'static str, &'static [&'static str])] = &[
|
||||
// family emoji (more than two emoji joined by ZWJ)
|
||||
(
|
||||
"\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}",
|
||||
&["\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}"],
|
||||
),
|
||||
// cartwheel emoji followed by two fitzpatrick skin tone modifiers
|
||||
// (test case from issue #19)
|
||||
(
|
||||
"\u{1F938}\u{1F3FE}\u{1F3FE}",
|
||||
&["\u{1F938}\u{1F3FE}\u{1F3FE}"],
|
||||
),
|
||||
];
|
||||
|
||||
for &(s, g) in TEST_SAME.iter().chain(EXTRA_SAME) {
|
||||
// test forward iterator
|
||||
assert!(UnicodeSegmentation::graphemes(s, true).eq(g.iter().cloned()));
|
||||
assert!(UnicodeSegmentation::graphemes(s, false).eq(g.iter().cloned()));
|
||||
|
||||
// test reverse iterator
|
||||
assert!(UnicodeSegmentation::graphemes(s, true)
|
||||
.rev()
|
||||
.eq(g.iter().rev().cloned()));
|
||||
assert!(UnicodeSegmentation::graphemes(s, false)
|
||||
.rev()
|
||||
.eq(g.iter().rev().cloned()));
|
||||
}
|
||||
|
||||
for &(s, gt, gf) in TEST_DIFF.iter().chain(EXTRA_DIFF) {
|
||||
// test forward iterator
|
||||
assert!(UnicodeSegmentation::graphemes(s, true).eq(gt.iter().cloned()));
|
||||
assert!(UnicodeSegmentation::graphemes(s, false).eq(gf.iter().cloned()));
|
||||
|
||||
// test reverse iterator
|
||||
assert!(UnicodeSegmentation::graphemes(s, true)
|
||||
.rev()
|
||||
.eq(gt.iter().rev().cloned()));
|
||||
assert!(UnicodeSegmentation::graphemes(s, false)
|
||||
.rev()
|
||||
.eq(gf.iter().rev().cloned()));
|
||||
}
|
||||
|
||||
// test the indices iterators
|
||||
let s = "a̐éö̲\r\n";
|
||||
let gr_inds = UnicodeSegmentation::grapheme_indices(s, true).collect::<Vec<(usize, &str)>>();
|
||||
let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
|
||||
assert_eq!(gr_inds, b);
|
||||
let gr_inds = UnicodeSegmentation::grapheme_indices(s, true)
|
||||
.rev()
|
||||
.collect::<Vec<(usize, &str)>>();
|
||||
let b: &[_] = &[(11, "\r\n"), (6, "ö̲"), (3, "é"), (0, "a̐")];
|
||||
assert_eq!(gr_inds, b);
|
||||
let mut gr_inds_iter = UnicodeSegmentation::grapheme_indices(s, true);
|
||||
{
|
||||
let gr_inds = gr_inds_iter.by_ref();
|
||||
let e1 = gr_inds.size_hint();
|
||||
assert_eq!(e1, (1, Some(13)));
|
||||
let c = gr_inds.count();
|
||||
assert_eq!(c, 4);
|
||||
}
|
||||
let e2 = gr_inds_iter.size_hint();
|
||||
assert_eq!(e2, (0, Some(0)));
|
||||
|
||||
// make sure the reverse iterator does the right thing with "\n" at beginning of string
|
||||
let s = "\n\r\n\r";
|
||||
let gr = UnicodeSegmentation::graphemes(s, true)
|
||||
.rev()
|
||||
.collect::<Vec<&str>>();
|
||||
let b: &[_] = &["\r", "\r\n", "\n"];
|
||||
assert_eq!(gr, b);
|
||||
}
|
||||
|
||||
#[test]
fn test_words() {
    // Exercises the word-boundary iterators (forward, reverse, and the
    // index-reporting variants) against the official Unicode test data,
    // plus extra cases focused on regional-indicator (flag emoji) chains.
    use crate::testdata::TEST_WORD;

    // Unicode's official tests don't really test longer chains of flag emoji
    // TODO This could be improved with more tests like flag emoji with interspersed Extend chars and ZWJ
    const EXTRA_TESTS: &'static [(&'static str, &'static [&'static str])] = &[
        (
            "🇦🇫🇦🇽🇦🇱🇩🇿🇦🇸🇦🇩🇦🇴",
            &["🇦🇫", "🇦🇽", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦🇴"],
        ),
        // odd number of regional indicators: the trailing one stands alone
        ("🇦🇫🇦🇽🇦🇱🇩🇿🇦🇸🇦🇩🇦", &["🇦🇫", "🇦🇽", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦"]),
        (
            "🇦a🇫🇦🇽a🇦🇱🇩🇿🇦🇸🇦🇩🇦",
            &["🇦", "a", "🇫🇦", "🇽", "a", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦"],
        ),
        // ZWJ-joined emoji sequence stays a single segment
        (
            "\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}",
            &["\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}"],
        ),
        ("😌👎🏼", &["😌", "👎🏼"]),
        // perhaps wrong, spaces should not be included?
        ("hello world", &["hello", " ", "world"]),
        ("🇨🇦🇨🇭🇿🇲🇿 hi", &["🇨🇦", "🇨🇭", "🇿🇲", "🇿", " ", "hi"]),
    ];
    for &(s, w) in TEST_WORD.iter().chain(EXTRA_TESTS.iter()) {
        // Compares an iterator against an expected sequence; on mismatch the
        // panic message names the check and the failing (input, expected) pair.
        macro_rules! assert_ {
            ($test:expr, $exp:expr, $name:expr) => {
                // collect into vector for better diagnostics in failure case
                let testing = $test.collect::<Vec<_>>();
                let expected = $exp.collect::<Vec<_>>();
                assert_eq!(
                    testing, expected,
                    "{} test for testcase ({:?}, {:?}) failed.",
                    $name, s, w
                )
            };
        }
        // test forward iterator
        assert_!(
            s.split_word_bounds(),
            w.iter().cloned(),
            "Forward word boundaries"
        );

        // test reverse iterator
        assert_!(
            s.split_word_bounds().rev(),
            w.iter().rev().cloned(),
            "Reverse word boundaries"
        );

        // generate offsets from word string lengths
        // (running sum of segment byte lengths; the final total is popped
        // because it is the end of the string, not a segment start)
        let mut indices = vec![0];
        for i in w.iter().cloned().map(|s| s.len()).scan(0, |t, n| {
            *t += n;
            Some(*t)
        }) {
            indices.push(i);
        }
        indices.pop();
        let indices = indices;

        // test forward indices iterator
        assert_!(
            s.split_word_bound_indices().map(|(l, _)| l),
            indices.iter().cloned(),
            "Forward word indices"
        );

        // test backward indices iterator
        assert_!(
            s.split_word_bound_indices().rev().map(|(l, _)| l),
            indices.iter().rev().cloned(),
            "Reverse word indices"
        );
    }
}
|
||||
|
||||
#[test]
fn test_sentences() {
    // Exercises the forward sentence-boundary iterator against the official
    // Unicode test data. (No reverse check here: only the forward iterator
    // is asserted.)
    use crate::testdata::TEST_SENTENCE;

    for &(s, w) in TEST_SENTENCE.iter() {
        // Compares an iterator against an expected sequence; on mismatch the
        // panic message names the check and the failing (input, expected) pair.
        macro_rules! assert_ {
            ($test:expr, $exp:expr, $name:expr) => {
                // collect into vector for better diagnostics in failure case
                let testing = $test.collect::<Vec<_>>();
                let expected = $exp.collect::<Vec<_>>();
                assert_eq!(
                    testing, expected,
                    "{} test for testcase ({:?}, {:?}) failed.",
                    $name, s, w
                )
            };
        }

        assert_!(
            s.split_sentence_bounds(),
            w.iter().cloned(),
            "Forward sentence boundaries"
        );
    }
}
|
||||
|
||||
// Property-based tests: for arbitrary strings, the forward and reverse
// iterators must agree, and concatenating all segments must reproduce the
// original string (segmentation is lossless).
quickcheck! {
    // Extended grapheme clusters: reverse iteration, re-reversed, equals forward.
    fn quickcheck_forward_reverse_graphemes_extended(s: String) -> bool {
        let a = s.graphemes(true).collect::<Vec<_>>();
        let mut b = s.graphemes(true).rev().collect::<Vec<_>>();
        b.reverse();
        a == b
    }

    // Legacy grapheme clusters: same forward/reverse agreement.
    fn quickcheck_forward_reverse_graphemes_legacy(s: String) -> bool {
        let a = s.graphemes(false).collect::<Vec<_>>();
        let mut b = s.graphemes(false).rev().collect::<Vec<_>>();
        b.reverse();
        a == b
    }

    // Joining all grapheme clusters (either mode) reproduces the input.
    fn quickcheck_join_graphemes(s: String) -> bool {
        let a = s.graphemes(true).collect::<String>();
        let b = s.graphemes(false).collect::<String>();
        a == s && b == s
    }

    // Word boundaries: forward/reverse agreement.
    fn quickcheck_forward_reverse_words(s: String) -> bool {
        let a = s.split_word_bounds().collect::<Vec<_>>();
        let mut b = s.split_word_bounds().rev().collect::<Vec<_>>();
        b.reverse();
        a == b
    }

    // Joining all word segments reproduces the input.
    fn quickcheck_join_words(s: String) -> bool {
        let a = s.split_word_bounds().collect::<String>();
        a == s
    }
}
|
||||
5250
clamav/libclamav_rust/.cargo/vendor/unicode-segmentation/src/testdata.rs
vendored
Normal file
5250
clamav/libclamav_rust/.cargo/vendor/unicode-segmentation/src/testdata.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
754
clamav/libclamav_rust/.cargo/vendor/unicode-segmentation/src/word.rs
vendored
Normal file
754
clamav/libclamav_rust/.cargo/vendor/unicode-segmentation/src/word.rs
vendored
Normal file
@@ -0,0 +1,754 @@
|
||||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use core::cmp;
|
||||
use core::iter::Filter;
|
||||
|
||||
use crate::tables::word::WordCat;
|
||||
|
||||
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWords<'a> {
    // Word-boundary iterator with non-alphanumeric segments filtered out
    // (see `has_alphanumeric` / `new_unicode_words`).
    inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
}
|
||||
|
||||
impl<'a> Iterator for UnicodeWords<'a> {
    type Item = &'a str;

    // Delegates directly to the filtered word-bounds iterator.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        self.inner.next()
    }
}
|
||||
impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
    // Delegates directly to the filtered word-bounds iterator.
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        self.inner.next_back()
    }
}
|
||||
|
||||
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
/// This iterator also provides the byte offsets for each substring.
///
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWordIndices<'a> {
    // (offset, segment) word-bound iterator with non-alphanumeric segments
    // filtered out (see `new_unicode_word_indices`).
    inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
}
|
||||
|
||||
impl<'a> Iterator for UnicodeWordIndices<'a> {
    type Item = (usize, &'a str);

    // Delegates directly to the filtered word-bound-indices iterator.
    #[inline]
    fn next(&mut self) -> Option<(usize, &'a str)> {
        self.inner.next()
    }
}
|
||||
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
    // Delegates directly to the filtered word-bound-indices iterator.
    #[inline]
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
        self.inner.next_back()
    }
}
|
||||
|
||||
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBounds<'a> {
    // Remaining, not-yet-iterated part of the original string.
    string: &'a str,
    // Category of the next char, cached by forward iteration (`next`) when it
    // looked one char past the boundary it emitted.
    cat: Option<WordCat>,
    // Same as `cat`, but for backward iteration (`next_back`).
    catb: Option<WordCat>,
}
|
||||
|
||||
/// External iterator for word boundaries and byte offsets.
///
/// This struct is created by the [`split_word_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
    // Address of the original string's first byte; offsets are computed by
    // pointer subtraction against each yielded subslice.
    start_offset: usize,
    iter: UWordBounds<'a>,
}
|
||||
|
||||
impl<'a> UWordBoundIndices<'a> {
    #[inline]
    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
    ///
    /// ```rust
    /// # use unicode_segmentation::UnicodeSegmentation;
    /// let mut iter = "Hello world".split_word_bound_indices();
    /// assert_eq!(iter.as_str(), "Hello world");
    /// iter.next();
    /// assert_eq!(iter.as_str(), " world");
    /// iter.next();
    /// assert_eq!(iter.as_str(), "world");
    /// ```
    pub fn as_str(&self) -> &'a str {
        self.iter.as_str()
    }
}
|
||||
|
||||
impl<'a> Iterator for UWordBoundIndices<'a> {
    type Item = (usize, &'a str);

    #[inline]
    fn next(&mut self) -> Option<(usize, &'a str)> {
        // The byte offset of each segment is recovered from its address
        // relative to the start of the original string.
        self.iter
            .next()
            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
|
||||
|
||||
impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
    #[inline]
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
        // Same pointer-subtraction trick as `next`, from the back.
        self.iter
            .next_back()
            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
    }
}
|
||||
|
||||
// state machine for word boundary rules
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
    Start,
    Letter,
    HLetter,   // Hebrew letter (participates in WB7a/WB7b quote rules)
    Numeric,
    Katakana,
    ExtendNumLet,
    Regional(RegionalState), // regional-indicator (flag) pairing progress
    FormatExtend(FormatExtendType), // scanning Format/Extend chars; see subtype
    Zwj,
    Emoji,
    WSegSpace, // whitespace segment (rule WB3d)
}
|
||||
|
||||
// subtypes for FormatExtend state in UWordBoundsState
// `Accept*` variants can end the segment as-is; `Require*` variants demand a
// specific category after the skipped chars, otherwise the iterator backtracks.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
    AcceptAny,
    AcceptNone,
    RequireLetter,
    RequireHLetter,
    AcceptQLetter,
    RequireNumeric,
}
|
||||
|
||||
// Pairing state for regional-indicator characters (rule WB13c): flags come in
// pairs, so a Half is waiting for its partner while a Full pair is complete.
// Unknown is used by backward iteration before the parity has been resolved.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
    Half,
    Full,
    Unknown,
}
|
||||
|
||||
// True when `ch` has the Extended_Pictographic property (used by rule WB3c:
// ZWJ x Extended_Pictographic does not break).
fn is_emoji(ch: char) -> bool {
    use crate::tables::emoji;
    emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
}
|
||||
|
||||
impl<'a> Iterator for UWordBounds<'a> {
    type Item = &'a str;

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        // At least one segment remains if the string is non-empty; at most one
        // segment per byte.
        let slen = self.string.len();
        (cmp::min(slen, 1), Some(slen))
    }

    // Forward word-boundary scan (UAX #29 rules WB1-WB999): walks chars until a
    // boundary is found, then splits `self.string` there and returns the prefix.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        use self::FormatExtendType::*;
        use self::UWordBoundsState::*;
        use crate::tables::word as wd;
        if self.string.len() == 0 {
            return None;
        }

        // take_curr: include the char that triggered the break in this segment.
        // take_cat: cache that char's category for the next call (it was
        //           computed but not consumed).
        // saveidx/savecat: rewind point for the Require* FormatExtend states.
        let mut take_curr = true;
        let mut take_cat = true;
        let mut idx = 0;
        let mut saveidx = 0;
        let mut state = Start;
        let mut cat = wd::WC_Any;
        let mut savecat = wd::WC_Any;

        // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
        let mut skipped_format_extend = false;
        for (curr, ch) in self.string.char_indices() {
            idx = curr;
            // Whether or not the previous category was ZWJ
            // ZWJs get collapsed, so this handles precedence of WB3c over WB4
            let prev_zwj = cat == wd::WC_ZWJ;
            // if there's a category cached, grab it
            cat = match self.cat {
                None => wd::word_category(ch).2,
                _ => self.cat.take().unwrap(),
            };
            take_cat = true;

            // handle rule WB4
            // just skip all format, extend, and zwj chars
            // note that Start is a special case: if there's a bunch of Format | Extend
            // characters at the beginning of a block of text, dump them out as one unit.
            //
            // (This is not obvious from the wording of UAX#29, but if you look at the
            // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
            // then the "correct" interpretation of WB4 becomes apparent.)
            if state != Start {
                match cat {
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
                        skipped_format_extend = true;
                        continue;
                    }
                    _ => {}
                }
            }

            // rule WB3c
            // WB4 makes all ZWJs collapse into the previous state
            // but you can still be in a Zwj state if you started with Zwj
            //
            // This means that an EP + Zwj will collapse into EP, which is wrong,
            // since EP+EP is not a boundary but EP+ZWJ+EP is
            //
            // Thus, we separately keep track of whether or not the last character
            // was a ZWJ. This is an additional bit of state tracked outside of the
            // state enum; the state enum represents the last non-zwj state encountered.
            // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
            // however we are in the previous state for the purposes of all other rules.
            if prev_zwj {
                if is_emoji(ch) {
                    state = Emoji;
                    continue;
                }
            }
            // Don't use `continue` in this match without updating `cat`
            state = match state {
                Start if cat == wd::WC_CR => {
                    idx += match self.get_next_cat(idx) {
                        Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
                        _ => 0,
                    };
                    break; // rule WB3a
                }
                Start => match cat {
                    wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
                    wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
                    wd::WC_Katakana => Katakana, // rule WB13, WB13a
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
                    wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
                    wd::WC_LF | wd::WC_Newline => break, // rule WB3a
                    wd::WC_ZWJ => Zwj,                   // rule WB3c
                    wd::WC_WSegSpace => WSegSpace,       // rule WB3d
                    _ => {
                        if let Some(ncat) = self.get_next_cat(idx) {
                            // rule WB4
                            if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
                            {
                                state = FormatExtend(AcceptNone);
                                self.cat = Some(ncat);
                                continue;
                            }
                        }
                        break; // rule WB999
                    }
                },
                WSegSpace => match cat {
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Zwj => {
                    // We already handle WB3c above.
                    take_curr = false;
                    break;
                }
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter, // rule WB5
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5
                    wd::WC_Numeric => Numeric, // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_Double_Quote if state == HLetter => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireHLetter) // rule WB7b
                    }
                    wd::WC_Single_Quote if state == HLetter => {
                        FormatExtend(AcceptQLetter) // rule WB7a
                    }
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireLetter) // rule WB6
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric, // rule WB8
                    wd::WC_ALetter => Letter,  // rule WB10
                    wd::WC_Hebrew_Letter => HLetter, // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireNumeric) // rule WB12
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana, // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_ALetter => Letter,            // rule WB13b
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13b
                    wd::WC_Numeric => Numeric,           // rule WB13b
                    wd::WC_Katakana => Katakana,         // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(RegionalState::Full) => {
                    // if it reaches here we've gone too far,
                    // a full flag can only compose with ZWJ/Extend/Format
                    // proceeding it.
                    take_curr = false;
                    break;
                }
                Regional(RegionalState::Half) => match cat {
                    wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(_) => {
                    unreachable!("RegionalState::Unknown should not occur on forward iteration")
                }
                Emoji => {
                    // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
                    take_curr = false;
                    break;
                }
                FormatExtend(t) => match t {
                    // handle FormatExtends depending on what type
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
                    RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
                    RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    AcceptNone | AcceptQLetter => {
                        take_curr = false; // emit all the Format|Extend characters
                        take_cat = false;
                        break;
                    }
                    _ => break, // rewind (in if statement below)
                },
            }
        }

        if let FormatExtend(t) = state {
            // we were looking for something and didn't find it; we have to back up
            if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
                idx = saveidx;
                cat = savecat;
                take_curr = false;
            }
        }

        // Advance past the breaking char if it belongs to this segment, and
        // cache its category otherwise so the next call doesn't recompute it.
        self.cat = if take_curr {
            idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
            None
        } else if take_cat {
            Some(cat)
        } else {
            None
        };

        // Split: return everything up to the boundary, keep the rest.
        let retstr = &self.string[..idx];
        self.string = &self.string[idx..];
        Some(retstr)
    }
}
|
||||
|
||||
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
    // Backward word-boundary scan: mirrors `next`, walking char_indices in
    // reverse and splitting off the suffix segment.
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        use self::FormatExtendType::*;
        use self::UWordBoundsState::*;
        use crate::tables::word as wd;
        if self.string.len() == 0 {
            return None;
        }

        // take_curr: include the char that triggered the break in this segment.
        // previdx: start offset of the char after the current one (candidate
        //          segment start); saveidx/savestate: rewind point while
        //          scanning Format/Extend runs or Require* lookbehind.
        let mut take_curr = true;
        let mut take_cat = true;
        let mut idx = self.string.len();
        idx -= self.string.chars().next_back().unwrap().len_utf8();
        let mut previdx = idx;
        let mut saveidx = idx;
        let mut state = Start;
        let mut savestate = Start;
        let mut cat = wd::WC_Any;

        let mut skipped_format_extend = false;

        for (curr, ch) in self.string.char_indices().rev() {
            previdx = idx;
            idx = curr;

            // if there's a category cached, grab it
            cat = match self.catb {
                None => wd::word_category(ch).2,
                _ => self.catb.take().unwrap(),
            };
            take_cat = true;

            // backward iterator over word boundaries. Mostly the same as the forward
            // iterator, with two weirdnesses:
            // (1) If we encounter a single quote in the Start state, we have to check for a
            //     Hebrew Letter immediately before it.
            // (2) Format and Extend char handling takes some gymnastics.

            if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
                // WB3c has more priority so we should not
                // fold in that case
                if match state {
                    FormatExtend(_) | Start => false,
                    _ => true,
                } {
                    saveidx = previdx;
                    savestate = state;
                    state = FormatExtend(AcceptNone);
                }

                if state != Start {
                    continue;
                }
            } else if state == FormatExtend(AcceptNone) {
                // finished a scan of some Format|Extend chars, restore previous state
                state = savestate;
                previdx = saveidx;
                take_cat = false;
                skipped_format_extend = true;
            }

            // Don't use `continue` in this match without updating `catb`
            state = match state {
                Start | FormatExtend(AcceptAny) => match cat {
                    _ if is_emoji(ch) => Zwj,
                    wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
                    wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
                    wd::WC_Katakana => Katakana, // rule WB13, WB13b
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
                    // rule WB4:
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
                    wd::WC_Single_Quote => {
                        saveidx = idx;
                        FormatExtend(AcceptQLetter) // rule WB7a
                    }
                    wd::WC_WSegSpace => WSegSpace,
                    wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
                        if state == Start {
                            if cat == wd::WC_LF {
                                idx -= match self.get_prev_cat(idx) {
                                    Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
                                    _ => 0,
                                };
                            }
                        } else {
                            take_curr = false;
                        }
                        break; // rule WB3a
                    }
                    _ => break, // rule WB999
                },
                Zwj => match cat {
                    // rule WB3c
                    wd::WC_ZWJ => FormatExtend(AcceptAny),
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                WSegSpace => match cat {
                    // rule WB3d
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter, // rule WB5
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5
                    wd::WC_Numeric => Numeric, // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    wd::WC_Double_Quote if state == HLetter => {
                        saveidx = previdx;
                        FormatExtend(RequireHLetter) // rule WB7c
                    }
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        saveidx = previdx;
                        FormatExtend(RequireLetter) // rule WB7
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric, // rule WB8
                    wd::WC_ALetter => Letter,  // rule WB9
                    wd::WC_Hebrew_Letter => HLetter, // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        saveidx = previdx;
                        FormatExtend(RequireNumeric) // rule WB11
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana, // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_ALetter => Letter,            // rule WB13a
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13a
                    wd::WC_Numeric => Numeric,           // rule WB13a
                    wd::WC_Katakana => Katakana,         // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(mut regional_state) => match cat {
                    // rule WB13c
                    wd::WC_Regional_Indicator => {
                        if regional_state == RegionalState::Unknown {
                            // Resolve flag pairing by counting the contiguous
                            // run of preceding regional indicators (Format/
                            // Extend/ZWJ transparent per WB4); parity decides
                            // whether the current one completes a pair.
                            let count = self.string[..previdx]
                                .chars()
                                .rev()
                                .map(|c| wd::word_category(c).2)
                                .filter(|&c| {
                                    !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
                                })
                                .take_while(|&c| c == wd::WC_Regional_Indicator)
                                .count();
                            regional_state = if count % 2 == 0 {
                                RegionalState::Full
                            } else {
                                RegionalState::Half
                            };
                        }
                        if regional_state == RegionalState::Full {
                            take_curr = false;
                            break;
                        } else {
                            Regional(RegionalState::Full)
                        }
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Emoji => {
                    if is_emoji(ch) {
                        // rule WB3c
                        Zwj
                    } else {
                        take_curr = false;
                        break;
                    }
                }
                FormatExtend(t) => match t {
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
                    RequireLetter if cat == wd::WC_ALetter => Letter,   // rule WB6
                    RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
                    AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    _ => break, // backtrack will happens
                },
            }
        }

        if let FormatExtend(t) = state {
            // if we required something but didn't find it, backtrack
            if t == RequireLetter
                || t == RequireHLetter
                || t == RequireNumeric
                || t == AcceptNone
                || t == AcceptQLetter
            {
                previdx = saveidx;
                take_cat = false;
                take_curr = false;
            }
        }

        // If the breaking char is excluded, the segment starts at previdx and
        // the char's category is cached for the next backward call.
        self.catb = if take_curr {
            None
        } else {
            idx = previdx;
            if take_cat {
                Some(cat)
            } else {
                None
            }
        };

        // Split: return the suffix segment, keep the prefix.
        let retstr = &self.string[idx..];
        self.string = &self.string[..idx];
        Some(retstr)
    }
}
|
||||
|
||||
impl<'a> UWordBounds<'a> {
    #[inline]
    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
    ///
    /// ```rust
    /// # use unicode_segmentation::UnicodeSegmentation;
    /// let mut iter = "Hello world".split_word_bounds();
    /// assert_eq!(iter.as_str(), "Hello world");
    /// iter.next();
    /// assert_eq!(iter.as_str(), " world");
    /// iter.next();
    /// assert_eq!(iter.as_str(), "world");
    /// ```
    pub fn as_str(&self) -> &'a str {
        self.string
    }

    // Word category of the char after the one starting at byte `idx`, or None
    // if that char is the last in the remaining string.
    // Precondition: `idx` must be a char boundary within `self.string`.
    #[inline]
    fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
        use crate::tables::word as wd;
        let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
        if nidx < self.string.len() {
            let nch = self.string[nidx..].chars().next().unwrap();
            Some(wd::word_category(nch).2)
        } else {
            None
        }
    }

    // Word category of the char ending at byte `idx`, or None if `idx` is the
    // start of the remaining string.
    // Precondition: `idx` must be a char boundary within `self.string`.
    #[inline]
    fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
        use crate::tables::word as wd;
        if idx > 0 {
            let nch = self.string[..idx].chars().next_back().unwrap();
            Some(wd::word_category(nch).2)
        } else {
            None
        }
    }
}
|
||||
|
||||
// Crate-internal constructor backing `UnicodeSegmentation::split_word_bounds`:
// starts with the full string and no cached categories.
#[inline]
pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
    UWordBounds {
        string: s,
        cat: None,
        catb: None,
    }
}
|
||||
|
||||
// Crate-internal constructor backing `split_word_bound_indices`: records the
// base address of `s` so yielded slices can be turned into byte offsets.
#[inline]
pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
    UWordBoundIndices {
        start_offset: s.as_ptr() as usize,
        iter: new_word_bounds(s),
    }
}
|
||||
|
||||
// Filter predicate for the "unicode words" iterators: keep a segment only if
// it contains at least one alphanumeric char (per the crate's Unicode tables).
// Takes `&&str` to match the signature `Filter` requires for `fn(&&str)`.
#[inline]
fn has_alphanumeric(s: &&str) -> bool {
    use crate::tables::util::is_alphanumeric;

    s.chars().any(|c| is_alphanumeric(c))
}
|
||||
|
||||
// Crate-internal constructor backing `UnicodeSegmentation::unicode_words`:
// word-bound segments filtered down to those containing alphanumerics.
#[inline]
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
    use super::UnicodeSegmentation;

    UnicodeWords {
        inner: s.split_word_bounds().filter(has_alphanumeric),
    }
}
|
||||
|
||||
// Crate-internal constructor backing `unicode_word_indices`: like
// `new_unicode_words`, but over (offset, segment) pairs.
#[inline]
pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
    use super::UnicodeSegmentation;

    UnicodeWordIndices {
        inner: s
            .split_word_bound_indices()
            .filter(|(_, c)| has_alphanumeric(c)),
    }
}
|
||||
Reference in New Issue
Block a user