Update the libclamav library to version 1.0.0

2023-01-14 18:28:39 +08:00
parent b879ee0b2e
commit 45fe15f472
8531 changed files with 1222046 additions and 177272 deletions


@@ -0,0 +1,801 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::cmp;
use crate::tables::grapheme::GraphemeCat;
/// External iterator for grapheme clusters and byte offsets.
///
/// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct GraphemeIndices<'a> {
start_offset: usize,
iter: Graphemes<'a>,
}
impl<'a> GraphemeIndices<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "abc".grapheme_indices(true);
/// assert_eq!(iter.as_str(), "abc");
/// iter.next();
/// assert_eq!(iter.as_str(), "bc");
/// iter.next();
/// iter.next();
/// assert_eq!(iter.as_str(), "");
/// ```
pub fn as_str(&self) -> &'a str {
self.iter.as_str()
}
}
impl<'a> Iterator for GraphemeIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next_back()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
}
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
/// documentation for more.
///
/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
string: &'a str,
cursor: GraphemeCursor,
cursor_back: GraphemeCursor,
}
impl<'a> Graphemes<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "abc".graphemes(true);
/// assert_eq!(iter.as_str(), "abc");
/// iter.next();
/// assert_eq!(iter.as_str(), "bc");
/// iter.next();
/// iter.next();
/// assert_eq!(iter.as_str(), "");
/// ```
pub fn as_str(&self) -> &'a str {
&self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
}
}
impl<'a> Iterator for Graphemes<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
(cmp::min(slen, 1), Some(slen))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
let start = self.cursor.cur_cursor();
if start == self.cursor_back.cur_cursor() {
return None;
}
let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
Some(&self.string[start..next])
}
}
impl<'a> DoubleEndedIterator for Graphemes<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
let end = self.cursor_back.cur_cursor();
if end == self.cursor.cur_cursor() {
return None;
}
let prev = self
.cursor_back
.prev_boundary(self.string, 0)
.unwrap()
.unwrap();
Some(&self.string[prev..end])
}
}
#[inline]
pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
let len = s.len();
Graphemes {
string: s,
cursor: GraphemeCursor::new(0, len, is_extended),
cursor_back: GraphemeCursor::new(len, len, is_extended),
}
}
#[inline]
pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
GraphemeIndices {
start_offset: s.as_ptr() as usize,
iter: new_graphemes(s, is_extended),
}
}
// maybe unify with PairResult?
// An enum describing information about a potential boundary.
#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
// No information is known.
Unknown,
// It is known to not be a boundary.
NotBreak,
// It is known to be a boundary.
Break,
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
Regional,
// The codepoint after is Extended_Pictographic,
// so whether it's a boundary depends on pre-context according to GB11.
Emoji,
}
/// Cursor-based segmenter for grapheme clusters.
///
/// This allows working with ropes and other data structures where the string is not contiguous or
/// fully known at initialization time.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
// Current cursor position.
offset: usize,
// Total length of the string.
len: usize,
// A config flag indicating whether this cursor computes legacy or extended
// grapheme cluster boundaries (enables GB9a and GB9b if set).
is_extended: bool,
// Information about the potential boundary at `offset`
state: GraphemeState,
// Category of codepoint immediately preceding cursor, if known.
cat_before: Option<GraphemeCat>,
// Category of codepoint immediately after cursor, if known.
cat_after: Option<GraphemeCat>,
// If set, at least one more codepoint immediately preceding this offset
// is needed to resolve whether there's a boundary at `offset`.
pre_context_offset: Option<usize>,
// The number of RIS codepoints preceding `offset`. If `pre_context_offset`
// is set, then counts the number of RIS between that and `offset`, otherwise
// is an accurate count relative to the string.
ris_count: Option<usize>,
// Set if a call to `prev_boundary` or `next_boundary` was suspended due
// to needing more input.
resuming: bool,
// Cached grapheme category and associated scalar value range.
grapheme_cat_cache: (u32, u32, GraphemeCat),
}
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
/// More pre-context is needed. The caller should call `provide_context`
/// with a chunk ending at the offset given, then retry the query. This
/// will only be returned if the `chunk_start` parameter is nonzero.
PreContext(usize),
/// When requesting `prev_boundary`, the cursor is moving past the beginning
/// of the current chunk, so the chunk before that is requested. This will
/// only be returned if the `chunk_start` parameter is nonzero.
PrevChunk,
/// When requesting `next_boundary`, the cursor is moving past the end of the
/// current chunk, so the chunk after that is requested. This will only be
/// returned if the chunk ends before the `len` parameter provided on
/// creation of the cursor.
NextChunk, // requesting chunk following the one given
/// An error returned when the chunk given does not contain the cursor position.
InvalidOffset,
}
// An enum describing the result from lookup of a pair of categories.
#[derive(PartialEq, Eq)]
enum PairResult {
NotBreak, // definitely not a break
Break, // definitely a break
Extended, // a break iff not in extended mode
Regional, // a break if preceded by an even number of RIS
Emoji,     // a break unless preceded by an emoji base and (Extend)*
}
#[inline]
fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
use self::PairResult::*;
use crate::tables::grapheme::GraphemeCat::*;
match (before, after) {
(GC_CR, GC_LF) => NotBreak, // GB3
(GC_Control, _) => Break, // GB4
(GC_CR, _) => Break, // GB4
(GC_LF, _) => Break, // GB4
(_, GC_Control) => Break, // GB5
(_, GC_CR) => Break, // GB5
(_, GC_LF) => Break, // GB5
(GC_L, GC_L) => NotBreak, // GB6
(GC_L, GC_V) => NotBreak, // GB6
(GC_L, GC_LV) => NotBreak, // GB6
(GC_L, GC_LVT) => NotBreak, // GB6
(GC_LV, GC_V) => NotBreak, // GB7
(GC_LV, GC_T) => NotBreak, // GB7
(GC_V, GC_V) => NotBreak, // GB7
(GC_V, GC_T) => NotBreak, // GB7
(GC_LVT, GC_T) => NotBreak, // GB8
(GC_T, GC_T) => NotBreak, // GB8
(_, GC_Extend) => NotBreak, // GB9
(_, GC_ZWJ) => NotBreak, // GB9
(_, GC_SpacingMark) => Extended, // GB9a
(GC_Prepend, _) => Extended, // GB9b
(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
(_, _) => Break, // GB999
}
}
impl GraphemeCursor {
/// Create a new cursor. The string and initial offset are given at creation
/// time, but the contents of the string are not. The `is_extended` parameter
/// controls whether extended grapheme clusters are selected.
///
/// The `offset` parameter must be on a codepoint boundary.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let s = "हिन्दी";
/// let mut legacy = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
/// let mut extended = GraphemeCursor::new(0, s.len(), true);
/// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
/// ```
pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
let state = if offset == 0 || offset == len {
GraphemeState::Break
} else {
GraphemeState::Unknown
};
GraphemeCursor {
offset,
len,
state,
is_extended,
cat_before: None,
cat_after: None,
pre_context_offset: None,
ris_count: None,
resuming: false,
grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
}
}
fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
use crate::tables::grapheme as gr;
use crate::tables::grapheme::GraphemeCat::*;
if ch <= '\u{7e}' {
// Special-case optimization for ascii, except U+007F. This
// improves performance even for many primarily non-ascii texts,
// due to use of punctuation and white space characters from the
// ascii range.
if ch >= '\u{20}' {
GC_Any
} else if ch == '\n' {
GC_LF
} else if ch == '\r' {
GC_CR
} else {
GC_Control
}
} else {
// If this char isn't within the cached range, update the cache to the
// range that includes it.
if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
self.grapheme_cat_cache = gr::grapheme_category(ch);
}
self.grapheme_cat_cache.2
}
}
// Not sure I'm gonna keep this, the advantage over new() seems thin.
/// Set the cursor to a new location in the same string.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(cursor.cur_cursor(), 0);
/// cursor.set_cursor(2);
/// assert_eq!(cursor.cur_cursor(), 2);
/// ```
pub fn set_cursor(&mut self, offset: usize) {
if offset != self.offset {
self.offset = offset;
self.state = if offset == 0 || offset == self.len {
GraphemeState::Break
} else {
GraphemeState::Unknown
};
// reset state derived from text around cursor
self.cat_before = None;
self.cat_after = None;
self.ris_count = None;
}
}
#[inline]
/// The current offset of the cursor. Equal to the last value provided to
/// `new()` or `set_cursor()`, or returned from `next_boundary()` or
/// `prev_boundary()`.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
/// assert_eq!(cursor.cur_cursor(), 4);
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.cur_cursor(), 8);
/// ```
pub fn cur_cursor(&self) -> usize {
self.offset
}
/// Provide additional pre-context when it is needed to decide a boundary.
/// The end of the chunk must coincide with the value given in the
/// `GraphemeIncomplete::PreContext` request.
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
/// // Not enough pre-context to decide if there's a boundary between the two flags.
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
/// // Provide one more Regional Indicator Symbol of pre-context
/// cursor.provide_context(&flags[4..8], 4);
/// // Still not enough context to decide.
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
/// // Provide additional requested context.
/// cursor.provide_context(&flags[0..4], 0);
/// // That's enough to decide (it always is when context goes to the start of the string)
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
/// ```
pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
use crate::tables::grapheme as gr;
assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
self.pre_context_offset = None;
if self.is_extended && chunk_start + chunk.len() == self.offset {
let ch = chunk.chars().rev().next().unwrap();
if self.grapheme_category(ch) == gr::GC_Prepend {
self.decide(false); // GB9b
return;
}
}
match self.state {
GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
_ => {
if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
let ch = chunk.chars().rev().next().unwrap();
self.cat_before = Some(self.grapheme_category(ch));
}
}
}
}
#[inline]
fn decide(&mut self, is_break: bool) {
self.state = if is_break {
GraphemeState::Break
} else {
GraphemeState::NotBreak
};
}
#[inline]
fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
self.decide(is_break);
Ok(is_break)
}
#[inline]
fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
if self.state == GraphemeState::Break {
Ok(true)
} else if self.state == GraphemeState::NotBreak {
Ok(false)
} else if let Some(pre_context_offset) = self.pre_context_offset {
Err(GraphemeIncomplete::PreContext(pre_context_offset))
} else {
unreachable!("inconsistent state");
}
}
#[inline]
fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
use crate::tables::grapheme as gr;
let mut ris_count = self.ris_count.unwrap_or(0);
for ch in chunk.chars().rev() {
if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
self.ris_count = Some(ris_count);
self.decide((ris_count % 2) == 0);
return;
}
ris_count += 1;
}
self.ris_count = Some(ris_count);
if chunk_start == 0 {
self.decide((ris_count % 2) == 0);
return;
}
self.pre_context_offset = Some(chunk_start);
self.state = GraphemeState::Regional;
}
#[inline]
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
use crate::tables::grapheme as gr;
let mut iter = chunk.chars().rev();
if let Some(ch) = iter.next() {
if self.grapheme_category(ch) != gr::GC_ZWJ {
self.decide(true);
return;
}
}
for ch in iter {
match self.grapheme_category(ch) {
gr::GC_Extend => (),
gr::GC_Extended_Pictographic => {
self.decide(false);
return;
}
_ => {
self.decide(true);
return;
}
}
}
if chunk_start == 0 {
self.decide(true);
return;
}
self.pre_context_offset = Some(chunk_start);
self.state = GraphemeState::Emoji;
}
#[inline]
/// Determine whether the current cursor location is a grapheme cluster boundary.
/// Only a part of the string need be supplied. If `chunk_start` is nonzero or
/// the length of `chunk` is not equal to `len` on creation, then this method
/// may return `GraphemeIncomplete::PreContext`. The caller should then
/// call `provide_context` with the requested chunk, then retry calling this
/// method.
///
/// For partial chunks, if the cursor is not at the beginning or end of the
/// string, the chunk should contain at least the codepoint following the cursor.
/// If the string is nonempty, the chunk must be nonempty.
///
/// All calls should have consistent chunk contents (i.e., if a chunk provides
/// content for a given slice, all further chunks covering that slice must have
/// the same content for it).
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
/// cursor.set_cursor(12);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
/// ```
pub fn is_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<bool, GraphemeIncomplete> {
use crate::tables::grapheme as gr;
if self.state == GraphemeState::Break {
return Ok(true);
}
if self.state == GraphemeState::NotBreak {
return Ok(false);
}
if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
return Err(GraphemeIncomplete::InvalidOffset);
}
}
if let Some(pre_context_offset) = self.pre_context_offset {
return Err(GraphemeIncomplete::PreContext(pre_context_offset));
}
let offset_in_chunk = self.offset - chunk_start;
if self.cat_after.is_none() {
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
self.cat_after = Some(self.grapheme_category(ch));
}
if self.offset == chunk_start {
let mut need_pre_context = true;
match self.cat_after.unwrap() {
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
_ => need_pre_context = self.cat_before.is_none(),
}
if need_pre_context {
self.pre_context_offset = Some(chunk_start);
return Err(GraphemeIncomplete::PreContext(chunk_start));
}
}
if self.cat_before.is_none() {
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
self.cat_before = Some(self.grapheme_category(ch));
}
match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
PairResult::NotBreak => return self.decision(false),
PairResult::Break => return self.decision(true),
PairResult::Extended => {
let is_extended = self.is_extended;
return self.decision(!is_extended);
}
PairResult::Regional => {
if let Some(ris_count) = self.ris_count {
return self.decision((ris_count % 2) == 0);
}
self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
self.is_boundary_result()
}
PairResult::Emoji => {
self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
self.is_boundary_result()
}
}
}
#[inline]
/// Find the next boundary after the current cursor position. Only a part of
/// the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk following the one
/// given, then retry.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings:
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
/// ```
pub fn next_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == self.len {
return Ok(None);
}
let mut iter = chunk[self.offset - chunk_start..].chars();
let mut ch = iter.next().unwrap();
loop {
if self.resuming {
if self.cat_after.is_none() {
self.cat_after = Some(self.grapheme_category(ch));
}
} else {
self.offset += ch.len_utf8();
self.state = GraphemeState::Unknown;
self.cat_before = self.cat_after.take();
if self.cat_before.is_none() {
self.cat_before = Some(self.grapheme_category(ch));
}
if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
self.ris_count = self.ris_count.map(|c| c + 1);
} else {
self.ris_count = Some(0);
}
if let Some(next_ch) = iter.next() {
ch = next_ch;
self.cat_after = Some(self.grapheme_category(ch));
} else if self.offset == self.len {
self.decide(true);
} else {
self.resuming = true;
return Err(GraphemeIncomplete::NextChunk);
}
}
self.resuming = true;
if self.is_boundary(chunk, chunk_start)? {
self.resuming = false;
return Ok(Some(self.offset));
}
self.resuming = false;
}
}
/// Find the previous boundary before the current cursor position. Only a part
/// of the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk preceding the one
/// given, then retry.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings (note the exact return is not
/// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(4, s.len(), false);
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
/// ```
pub fn prev_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == 0 {
return Ok(None);
}
if self.offset == chunk_start {
return Err(GraphemeIncomplete::PrevChunk);
}
let mut iter = chunk[..self.offset - chunk_start].chars().rev();
let mut ch = iter.next().unwrap();
loop {
if self.offset == chunk_start {
self.resuming = true;
return Err(GraphemeIncomplete::PrevChunk);
}
if self.resuming {
self.cat_before = Some(self.grapheme_category(ch));
} else {
self.offset -= ch.len_utf8();
self.cat_after = self.cat_before.take();
self.state = GraphemeState::Unknown;
if let Some(ris_count) = self.ris_count {
self.ris_count = if ris_count > 0 {
Some(ris_count - 1)
} else {
None
};
}
if let Some(prev_ch) = iter.next() {
ch = prev_ch;
self.cat_before = Some(self.grapheme_category(ch));
} else if self.offset == 0 {
self.decide(true);
} else {
self.resuming = true;
self.cat_after = Some(self.grapheme_category(ch));
return Err(GraphemeIncomplete::PrevChunk);
}
}
self.resuming = true;
if self.is_boundary(chunk, chunk_start)? {
self.resuming = false;
return Ok(Some(self.offset));
}
self.resuming = false;
}
}
}
#[test]
fn test_grapheme_cursor_ris_precontext() {
let s = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
let mut c = GraphemeCursor::new(8, s.len(), true);
assert_eq!(
c.is_boundary(&s[4..], 4),
Err(GraphemeIncomplete::PreContext(4))
);
c.provide_context(&s[..4], 0);
assert_eq!(c.is_boundary(&s[4..], 4), Ok(true));
}
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
let s = "\r\n";
let mut c = GraphemeCursor::new(1, s.len(), true);
assert_eq!(
c.is_boundary(&s[1..], 1),
Err(GraphemeIncomplete::PreContext(1))
);
c.provide_context(&s[..1], 0);
assert_eq!(c.is_boundary(&s[1..], 1), Ok(false));
}
#[test]
fn test_grapheme_cursor_prev_boundary() {
let s = "abcd";
let mut c = GraphemeCursor::new(3, s.len(), true);
assert_eq!(
c.prev_boundary(&s[2..], 2),
Err(GraphemeIncomplete::PrevChunk)
);
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(2)));
}
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
let s = "abcd";
let mut c = GraphemeCursor::new(2, s.len(), true);
assert_eq!(
c.prev_boundary(&s[2..], 2),
Err(GraphemeIncomplete::PrevChunk)
);
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1)));
}
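// A supplementary illustration, not part of the upstream test suite: one way a
// caller with chunked storage can drive `next_boundary`, answering `NextChunk`
// with the following chunk and `PreContext` with text ending at the requested offset.
#[test]
fn test_grapheme_cursor_next_boundary_chunked() {
let s = "a\r\nb";
// Pretend the text lives in two non-contiguous 2-byte pieces.
let chunks = [&s[..2], &s[2..]];
let mut chunk_idx = 0;
let mut chunk_start = 0;
let mut cursor = GraphemeCursor::new(0, s.len(), true);
let mut boundaries = vec![];
loop {
match cursor.next_boundary(chunks[chunk_idx], chunk_start) {
Ok(Some(boundary)) => boundaries.push(boundary),
Ok(None) => break,
Err(GraphemeIncomplete::NextChunk) => {
// The cursor ran off the end of this chunk: supply the next one.
chunk_start += chunks[chunk_idx].len();
chunk_idx += 1;
}
Err(GraphemeIncomplete::PreContext(offset)) => {
// Not exercised by this input, but this is the intended response:
// hand back a chunk that ends exactly at `offset`.
cursor.provide_context(&s[..offset], 0);
}
Err(err) => panic!("unexpected error: {:?}", err),
}
}
// Grapheme clusters are "a", "\r\n", "b".
assert_eq!(boundaries, [1, 3, 4]);
}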


@@ -0,0 +1,307 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
//!
//! ```rust
//! extern crate unicode_segmentation;
//!
//! use unicode_segmentation::UnicodeSegmentation;
//!
//! fn main() {
//! let s = "a̐éö̲\r\n";
//! let g = UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>();
//! let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
//! assert_eq!(g, b);
//!
//! let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
//! let w = s.unicode_words().collect::<Vec<&str>>();
//! let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
//! assert_eq!(w, b);
//!
//! let s = "The quick (\"brown\") fox";
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
//! assert_eq!(w, b);
//! }
//! ```
//!
//! # no_std
//!
//! unicode-segmentation does not depend on libstd, so it can be used in crates
//! with the `#![no_std]` attribute.
//!
//! # crates.io
//!
//! You can use this package in your project by adding the following
//! to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! unicode-segmentation = "1.9.0"
//! ```
#![deny(missing_docs, unsafe_code)]
#![doc(
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
)]
#![no_std]
#[cfg(test)]
#[macro_use]
extern crate std;
#[cfg(test)]
#[macro_use]
extern crate quickcheck;
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use grapheme::{GraphemeIndices, Graphemes};
pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
mod grapheme;
#[rustfmt::skip]
mod tables;
mod sentence;
mod word;
#[cfg(test)]
mod test;
#[cfg(test)]
mod testdata;
/// Methods for segmenting strings according to
/// [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/).
pub trait UnicodeSegmentation {
/// Returns an iterator over the [grapheme clusters][graphemes] of `self`.
///
/// [graphemes]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
///
/// If `is_extended` is true, the iterator is over the
/// *extended grapheme clusters*;
/// otherwise, the iterator is over the *legacy grapheme clusters*.
/// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
/// recommends extended grapheme cluster boundaries for general processing.
///
/// # Examples
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let gr1 = UnicodeSegmentation::graphemes("a\u{310}e\u{301}o\u{308}\u{332}", true)
/// .collect::<Vec<&str>>();
/// let b: &[_] = &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"];
///
/// assert_eq!(&gr1[..], b);
///
/// let gr2 = UnicodeSegmentation::graphemes("a\r\nb🇷🇺🇸🇹", true).collect::<Vec<&str>>();
/// let b: &[_] = &["a", "\r\n", "b", "🇷🇺", "🇸🇹"];
///
/// assert_eq!(&gr2[..], b);
/// ```
fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
/// Returns an iterator over the grapheme clusters of `self` and their
/// byte offsets. See `graphemes()` for more information.
///
/// # Examples
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let gr_inds = UnicodeSegmentation::grapheme_indices("a̐éö̲\r\n", true)
/// .collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
///
/// assert_eq!(&gr_inds[..], b);
/// ```
fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
/// Returns an iterator over the words of `self`, separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// Here, "words" are just those substrings which, after splitting on
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
/// let uw1 = uws.unicode_words().collect::<Vec<&str>>();
/// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
///
/// assert_eq!(&uw1[..], b);
/// ```
fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
/// Returns an iterator over the words of `self`, separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
/// offsets.
///
/// Here, "words" are just those substrings which, after splitting on
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
/// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
///
/// assert_eq!(&uwi1[..], b);
/// ```
fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;
/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// The concatenation of the substrings returned by this function is just the original string.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
///
/// assert_eq!(&swu1[..], b);
/// ```
fn split_word_bounds<'a>(&'a self) -> UWordBounds<'a>;
/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
/// and their offsets. See `split_word_bounds()` for more information.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let swi1 = "Brr, it's 29.3°F!".split_word_bound_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"),
/// (14, "°"), (16, "F"), (17, "!")];
///
/// assert_eq!(&swi1[..], b);
/// ```
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// Here, "sentences" are just those substrings which, after splitting on
/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let uss = "Mr. Fox jumped. [...] The dog was too lazy.";
/// let us1 = uss.unicode_sentences().collect::<Vec<&str>>();
/// let b: &[_] = &["Mr. ", "Fox jumped. ", "The dog was too lazy."];
///
/// assert_eq!(&us1[..], b);
/// ```
fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// The concatenation of the substrings returned by this function is just the original string.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let ssbs = "Mr. Fox jumped. [...] The dog was too lazy.";
/// let ssb1 = ssbs.split_sentence_bounds().collect::<Vec<&str>>();
/// let b: &[_] = &["Mr. ", "Fox jumped. ", "[...] ", "The dog was too lazy."];
///
/// assert_eq!(&ssb1[..], b);
/// ```
fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
/// and their offsets. See `split_sentence_bounds()` for more information.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let ssis = "Mr. Fox jumped. [...] The dog was too lazy.";
/// let ssi1 = ssis.split_sentence_bound_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "Mr. "), (4, "Fox jumped. "), (16, "[...] "),
/// (22, "The dog was too lazy.")];
///
/// assert_eq!(&ssi1[..], b);
/// ```
fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
}
impl UnicodeSegmentation for str {
#[inline]
fn graphemes(&self, is_extended: bool) -> Graphemes {
grapheme::new_graphemes(self, is_extended)
}
#[inline]
fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
grapheme::new_grapheme_indices(self, is_extended)
}
#[inline]
fn unicode_words(&self) -> UnicodeWords {
word::new_unicode_words(self)
}
#[inline]
fn unicode_word_indices(&self) -> UnicodeWordIndices {
word::new_unicode_word_indices(self)
}
#[inline]
fn split_word_bounds(&self) -> UWordBounds {
word::new_word_bounds(self)
}
#[inline]
fn split_word_bound_indices(&self) -> UWordBoundIndices {
word::new_word_bound_indices(self)
}
#[inline]
fn unicode_sentences(&self) -> UnicodeSentences {
sentence::new_unicode_sentences(self)
}
#[inline]
fn split_sentence_bounds(&self) -> USentenceBounds {
sentence::new_sentence_bounds(self)
}
#[inline]
fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
sentence::new_sentence_bound_indices(self)
}
}


@@ -0,0 +1,415 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::cmp;
use core::iter::Filter;
// All of the logic for forward iteration over sentences
mod fwd {
use crate::tables::sentence::SentenceCat;
use core::cmp;
// Describes a parsed part of the source string, per the table at:
// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
#[derive(Clone, Copy, PartialEq, Eq)]
enum StatePart {
Sot,
Eot,
Other,
CR,
LF,
Sep,
ATerm,
UpperLower,
ClosePlus,
SpPlus,
STerm,
}
#[derive(Clone, PartialEq, Eq)]
struct SentenceBreaksState(pub [StatePart; 4]);
const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
StatePart::Sot,
StatePart::Sot,
StatePart::Sot,
StatePart::Sot,
]);
#[derive(Clone)]
pub struct SentenceBreaks<'a> {
pub string: &'a str,
pos: usize,
state: SentenceBreaksState,
}
impl SentenceBreaksState {
// Attempt to advance the internal state by one part
// Whitespace and some punctuation will be collapsed
fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
let &SentenceBreaksState(parts) = self;
let parts = match (parts[3], cat) {
(StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
(StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
_ => [
parts[1],
parts[2],
parts[3],
match cat {
SentenceCat::SC_CR => StatePart::CR,
SentenceCat::SC_LF => StatePart::LF,
SentenceCat::SC_Sep => StatePart::Sep,
SentenceCat::SC_ATerm => StatePart::ATerm,
SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
SentenceCat::SC_Close => StatePart::ClosePlus,
SentenceCat::SC_Sp => StatePart::SpPlus,
SentenceCat::SC_STerm => StatePart::STerm,
_ => StatePart::Other,
},
],
};
SentenceBreaksState(parts)
}
fn end(&self) -> SentenceBreaksState {
let &SentenceBreaksState(parts) = self;
SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
}
// Helper function to check if state head matches a single `StatePart`
fn match1(&self, part: StatePart) -> bool {
let &SentenceBreaksState(parts) = self;
part == parts[3]
}
// Helper function to check if the two most recent `StatePart`s in the state
// match the given two
fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
let &SentenceBreaksState(parts) = self;
part1 == parts[2] && part2 == parts[3]
}
}
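// A supplementary illustration, not from the upstream crate: runs of Close and
// Sp collapse into single ClosePlus/SpPlus slots, so the four-slot window can
// still cover an entire `ATerm Close* Sp*` context.
#[test]
fn test_state_window_collapses_runs() {
let st = INITIAL_STATE
.next(SentenceCat::SC_ATerm)
.next(SentenceCat::SC_Close)
.next(SentenceCat::SC_Close) // collapsed into the existing ClosePlus
.next(SentenceCat::SC_Sp);
// SentenceBreaksState has no Debug impl, so compare with assert!.
assert!(
st == SentenceBreaksState([
StatePart::Sot,
StatePart::ATerm,
StatePart::ClosePlus,
StatePart::SpPlus
])
);
}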
// https://unicode.org/reports/tr29/#SB8
// TODO cache this, it is currently quadratic
fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
let &SentenceBreaksState(parts) = state;
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
if parts[idx] == StatePart::ClosePlus {
idx -= 1
}
if parts[idx] == StatePart::ATerm {
use crate::tables::sentence as se;
for next_char in ahead.chars() {
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
match se::sentence_category(next_char).2 {
se::SC_Lower => return true,
se::SC_OLetter
| se::SC_Upper
| se::SC_Sep
| se::SC_CR
| se::SC_LF
| se::SC_STerm
| se::SC_ATerm => return false,
_ => continue,
}
}
}
false
}
// https://unicode.org/reports/tr29/#SB8a
fn match_sb8a(state: &SentenceBreaksState) -> bool {
// SATerm Close* Sp*
let &SentenceBreaksState(parts) = state;
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
if parts[idx] == StatePart::ClosePlus {
idx -= 1
}
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
}
// https://unicode.org/reports/tr29/#SB9
fn match_sb9(state: &SentenceBreaksState) -> bool {
// SATerm Close*
let &SentenceBreaksState(parts) = state;
let idx = if parts[3] == StatePart::ClosePlus {
2
} else {
3
};
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
}
// https://unicode.org/reports/tr29/#SB11
fn match_sb11(state: &SentenceBreaksState) -> bool {
// SATerm Close* Sp* ParaSep?
let &SentenceBreaksState(parts) = state;
let mut idx = match parts[3] {
StatePart::Sep | StatePart::CR | StatePart::LF => 2,
_ => 3,
};
if parts[idx] == StatePart::SpPlus {
idx -= 1
}
if parts[idx] == StatePart::ClosePlus {
idx -= 1
}
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
}
impl<'a> Iterator for SentenceBreaks<'a> {
// Returns the index of the character which follows a break
type Item = usize;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.string.len();
// A sentence could be one character
(cmp::min(slen, 2), Some(slen + 1))
}
#[inline]
fn next(&mut self) -> Option<usize> {
use crate::tables::sentence as se;
for next_char in self.string[self.pos..].chars() {
let position_before = self.pos;
let state_before = self.state.clone();
let next_cat = se::sentence_category(next_char).2;
self.pos += next_char.len_utf8();
self.state = self.state.next(next_cat);
match next_cat {
// SB1 https://unicode.org/reports/tr29/#SB1
_ if state_before.match1(StatePart::Sot) => return Some(position_before),
// SB2 is handled when inner iterator (chars) is finished
// SB3 https://unicode.org/reports/tr29/#SB3
SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,
// SB4 https://unicode.org/reports/tr29/#SB4
_ if state_before.match1(StatePart::Sep)
|| state_before.match1(StatePart::CR)
|| state_before.match1(StatePart::LF) =>
{
return Some(position_before)
}
// SB5 https://unicode.org/reports/tr29/#SB5
SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,
// SB6 https://unicode.org/reports/tr29/#SB6
SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,
// SB7 https://unicode.org/reports/tr29/#SB7
SentenceCat::SC_Upper
if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
{
continue
}
// SB8 https://unicode.org/reports/tr29/#SB8
_ if match_sb8(&state_before, &self.string[position_before..]) => continue,
// SB8a https://unicode.org/reports/tr29/#SB8a
SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
if match_sb8a(&state_before) =>
{
continue
}
// SB9 https://unicode.org/reports/tr29/#SB9
SentenceCat::SC_Close
| SentenceCat::SC_Sp
| SentenceCat::SC_Sep
| SentenceCat::SC_CR
| SentenceCat::SC_LF
if match_sb9(&state_before) =>
{
continue
}
// SB10 https://unicode.org/reports/tr29/#SB10
SentenceCat::SC_Sp
| SentenceCat::SC_Sep
| SentenceCat::SC_CR
| SentenceCat::SC_LF
if match_sb8a(&state_before) =>
{
continue
}
// SB11 https://unicode.org/reports/tr29/#SB11
_ if match_sb11(&state_before) => return Some(position_before),
// SB998 https://unicode.org/reports/tr29/#SB998
_ => continue,
}
}
// SB2 https://unicode.org/reports/tr29/#SB2
if self.state.match1(StatePart::Sot) {
None
} else if self.state.match1(StatePart::Eot) {
None
} else {
self.state = self.state.end();
Some(self.pos)
}
}
}
pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
SentenceBreaks {
string: source,
pos: 0,
state: INITIAL_STATE,
}
}
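// A supplementary illustration, not from the upstream tests: the raw break
// positions `SentenceBreaks` yields (start of text, each boundary, end of text),
// which `USentenceBounds` below pairs up into substrings. The expected values
// are derived from the crate-level `split_sentence_bounds` doc example.
#[test]
fn test_sentence_breaks_positions() {
let s = "Mr. Fox jumped. [...] The dog was too lazy.";
assert!(new_sentence_breaks(s).eq([0usize, 4, 16, 22, 43].iter().cloned()));
}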
}
/// An iterator over the substrings of a string which, after splitting the string on
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UnicodeSentences<'a> {
inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
}
/// External iterator for a string's
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct USentenceBounds<'a> {
iter: fwd::SentenceBreaks<'a>,
sentence_start: Option<usize>,
}
/// External iterator for sentence boundaries and byte offsets.
///
/// This struct is created by the [`split_sentence_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct USentenceBoundIndices<'a> {
start_offset: usize,
iter: USentenceBounds<'a>,
}
#[inline]
pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
USentenceBounds {
iter: fwd::new_sentence_breaks(source),
sentence_start: None,
}
}
#[inline]
pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
USentenceBoundIndices {
start_offset: source.as_ptr() as usize,
iter: new_sentence_bounds(source),
}
}
#[inline]
pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
use super::UnicodeSegmentation;
use crate::tables::util::is_alphanumeric;
fn has_alphanumeric(s: &&str) -> bool {
s.chars().any(|c| is_alphanumeric(c))
}
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
UnicodeSentences {
inner: s.split_sentence_bounds().filter(has_alphanumeric),
}
}
impl<'a> Iterator for UnicodeSentences<'a> {
type Item = &'a str;
#[inline]
fn next(&mut self) -> Option<&'a str> {
self.inner.next()
}
}
impl<'a> Iterator for USentenceBounds<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let (lower, upper) = self.iter.size_hint();
// `- 1` on a zero usize would underflow, so clamp to at least 1 first.
(cmp::max(1, lower) - 1, upper.map(|u| cmp::max(1, u) - 1))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
if self.sentence_start.is_none() {
if let Some(start_pos) = self.iter.next() {
self.sentence_start = Some(start_pos)
} else {
return None;
}
}
if let Some(break_pos) = self.iter.next() {
let start_pos = self.sentence_start.unwrap();
let sentence = &self.iter.string[start_pos..break_pos];
self.sentence_start = Some(break_pos);
Some(sentence)
} else {
None
}
}
}
impl<'a> Iterator for USentenceBoundIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
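// A supplementary illustration, not from the upstream tests: `unicode_sentences`
// is `split_sentence_bounds` with the sentences containing no alphanumeric
// characters filtered out, as wired up in `new_unicode_sentences` above. The
// expected values mirror the crate-level doc examples.
#[test]
fn test_unicode_sentences_filters_non_alphanumeric() {
use super::UnicodeSegmentation;
let s = "Mr. Fox jumped. [...] The dog was too lazy.";
let bounds = ["Mr. ", "Fox jumped. ", "[...] ", "The dog was too lazy."];
assert!(s.split_sentence_bounds().eq(bounds.iter().cloned()));
let sentences = ["Mr. ", "Fox jumped. ", "The dog was too lazy."];
assert!(s.unicode_sentences().eq(sentences.iter().cloned()));
}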

File diff suppressed because it is too large


@@ -0,0 +1,247 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::UnicodeSegmentation;
use std::prelude::v1::*;
#[test]
fn test_graphemes() {
use crate::testdata::{TEST_DIFF, TEST_SAME};
pub const EXTRA_DIFF: &'static [(
&'static str,
&'static [&'static str],
&'static [&'static str],
)] = &[
// Official test suite doesn't include two Prepend chars between two other chars.
(
"\u{20}\u{600}\u{600}\u{20}",
&["\u{20}", "\u{600}\u{600}\u{20}"],
&["\u{20}", "\u{600}", "\u{600}", "\u{20}"],
),
// Test for Prepend followed by two Any chars
(
"\u{600}\u{20}\u{20}",
&["\u{600}\u{20}", "\u{20}"],
&["\u{600}", "\u{20}", "\u{20}"],
),
];
pub const EXTRA_SAME: &'static [(&'static str, &'static [&'static str])] = &[
// family emoji (more than two emoji joined by ZWJ)
(
"\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}",
&["\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}"],
),
// cartwheel emoji followed by two fitzpatrick skin tone modifiers
// (test case from issue #19)
(
"\u{1F938}\u{1F3FE}\u{1F3FE}",
&["\u{1F938}\u{1F3FE}\u{1F3FE}"],
),
];
for &(s, g) in TEST_SAME.iter().chain(EXTRA_SAME) {
// test forward iterator
assert!(UnicodeSegmentation::graphemes(s, true).eq(g.iter().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false).eq(g.iter().cloned()));
// test reverse iterator
assert!(UnicodeSegmentation::graphemes(s, true)
.rev()
.eq(g.iter().rev().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false)
.rev()
.eq(g.iter().rev().cloned()));
}
for &(s, gt, gf) in TEST_DIFF.iter().chain(EXTRA_DIFF) {
// test forward iterator
assert!(UnicodeSegmentation::graphemes(s, true).eq(gt.iter().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false).eq(gf.iter().cloned()));
// test reverse iterator
assert!(UnicodeSegmentation::graphemes(s, true)
.rev()
.eq(gt.iter().rev().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false)
.rev()
.eq(gf.iter().rev().cloned()));
}
// test the indices iterators
let s = "a̐éö̲\r\n";
let gr_inds = UnicodeSegmentation::grapheme_indices(s, true).collect::<Vec<(usize, &str)>>();
let b: &[_] = &[(0, "a\u{310}"), (3, "e\u{301}"), (6, "o\u{308}\u{332}"), (11, "\r\n")];
assert_eq!(gr_inds, b);
let gr_inds = UnicodeSegmentation::grapheme_indices(s, true)
.rev()
.collect::<Vec<(usize, &str)>>();
let b: &[_] = &[(11, "\r\n"), (6, "o\u{308}\u{332}"), (3, "e\u{301}"), (0, "a\u{310}")];
assert_eq!(gr_inds, b);
let mut gr_inds_iter = UnicodeSegmentation::grapheme_indices(s, true);
{
let gr_inds = gr_inds_iter.by_ref();
let e1 = gr_inds.size_hint();
assert_eq!(e1, (1, Some(13)));
let c = gr_inds.count();
assert_eq!(c, 4);
}
let e2 = gr_inds_iter.size_hint();
assert_eq!(e2, (0, Some(0)));
// make sure the reverse iterator does the right thing with "\n" at beginning of string
let s = "\n\r\n\r";
let gr = UnicodeSegmentation::graphemes(s, true)
.rev()
.collect::<Vec<&str>>();
let b: &[_] = &["\r", "\r\n", "\n"];
assert_eq!(gr, b);
}
#[test]
fn test_words() {
use crate::testdata::TEST_WORD;
// Unicode's official tests don't really test longer chains of flag emoji
// TODO This could be improved with more tests like flag emoji with interspersed Extend chars and ZWJ
const EXTRA_TESTS: &'static [(&'static str, &'static [&'static str])] = &[
(
"🇦🇫🇦🇽🇦🇱🇩🇿🇦🇸🇦🇩🇦🇴",
&["🇦🇫", "🇦🇽", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦🇴"],
),
("🇦🇫🇦🇽🇦🇱🇩🇿🇦🇸🇦🇩🇦", &["🇦🇫", "🇦🇽", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦"]),
(
"🇦a🇫🇦🇽a🇦🇱🇩🇿🇦🇸🇦🇩🇦",
&["🇦", "a", "🇫🇦", "🇽", "a", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦"],
),
(
"\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}",
&["\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}"],
),
("😌👎🏼", &["😌", "👎🏼"]),
// perhaps wrong, spaces should not be included?
("hello world", &["hello", " ", "world"]),
("🇨🇦🇨🇭🇿🇲🇿 hi", &["🇨🇦", "🇨🇭", "🇿🇲", "🇿", " ", "hi"]),
];
for &(s, w) in TEST_WORD.iter().chain(EXTRA_TESTS.iter()) {
macro_rules! assert_ {
($test:expr, $exp:expr, $name:expr) => {
// collect into vector for better diagnostics in failure case
let testing = $test.collect::<Vec<_>>();
let expected = $exp.collect::<Vec<_>>();
assert_eq!(
testing, expected,
"{} test for testcase ({:?}, {:?}) failed.",
$name, s, w
)
};
}
// test forward iterator
assert_!(
s.split_word_bounds(),
w.iter().cloned(),
"Forward word boundaries"
);
// test reverse iterator
assert_!(
s.split_word_bounds().rev(),
w.iter().rev().cloned(),
"Reverse word boundaries"
);
// generate offsets from word string lengths
let mut indices = vec![0];
for i in w.iter().cloned().map(|s| s.len()).scan(0, |t, n| {
*t += n;
Some(*t)
}) {
indices.push(i);
}
indices.pop();
let indices = indices;
// test forward indices iterator
assert_!(
s.split_word_bound_indices().map(|(l, _)| l),
indices.iter().cloned(),
"Forward word indices"
);
// test backward indices iterator
assert_!(
s.split_word_bound_indices().rev().map(|(l, _)| l),
indices.iter().rev().cloned(),
"Reverse word indices"
);
}
}
#[test]
fn test_sentences() {
use crate::testdata::TEST_SENTENCE;
for &(s, w) in TEST_SENTENCE.iter() {
macro_rules! assert_ {
($test:expr, $exp:expr, $name:expr) => {
// collect into vector for better diagnostics in failure case
let testing = $test.collect::<Vec<_>>();
let expected = $exp.collect::<Vec<_>>();
assert_eq!(
testing, expected,
"{} test for testcase ({:?}, {:?}) failed.",
$name, s, w
)
};
}
assert_!(
s.split_sentence_bounds(),
w.iter().cloned(),
"Forward sentence boundaries"
);
}
}
quickcheck! {
fn quickcheck_forward_reverse_graphemes_extended(s: String) -> bool {
let a = s.graphemes(true).collect::<Vec<_>>();
let mut b = s.graphemes(true).rev().collect::<Vec<_>>();
b.reverse();
a == b
}
fn quickcheck_forward_reverse_graphemes_legacy(s: String) -> bool {
let a = s.graphemes(false).collect::<Vec<_>>();
let mut b = s.graphemes(false).rev().collect::<Vec<_>>();
b.reverse();
a == b
}
fn quickcheck_join_graphemes(s: String) -> bool {
let a = s.graphemes(true).collect::<String>();
let b = s.graphemes(false).collect::<String>();
a == s && b == s
}
fn quickcheck_forward_reverse_words(s: String) -> bool {
let a = s.split_word_bounds().collect::<Vec<_>>();
let mut b = s.split_word_bounds().rev().collect::<Vec<_>>();
b.reverse();
a == b
}
fn quickcheck_join_words(s: String) -> bool {
let a = s.split_word_bounds().collect::<String>();
a == s
}
}

File diff suppressed because it is too large


@@ -0,0 +1,754 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::cmp;
use core::iter::Filter;
use crate::tables::word::WordCat;
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWords<'a> {
inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
}
impl<'a> Iterator for UnicodeWords<'a> {
type Item = &'a str;
#[inline]
fn next(&mut self) -> Option<&'a str> {
self.inner.next()
}
}
impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
self.inner.next_back()
}
}
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
/// This iterator also provides the byte offsets for each substring.
///
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWordIndices<'a> {
inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
}
impl<'a> Iterator for UnicodeWordIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.inner.next()
}
}
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.inner.next_back()
}
}
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBounds<'a> {
string: &'a str,
cat: Option<WordCat>,
catb: Option<WordCat>,
}
/// External iterator for word boundaries and byte offsets.
///
/// This struct is created by the [`split_word_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
start_offset: usize,
iter: UWordBounds<'a>,
}
impl<'a> UWordBoundIndices<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "Hello world".split_word_bound_indices();
/// assert_eq!(iter.as_str(), "Hello world");
/// iter.next();
/// assert_eq!(iter.as_str(), " world");
/// iter.next();
/// assert_eq!(iter.as_str(), "world");
/// ```
pub fn as_str(&self) -> &'a str {
self.iter.as_str()
}
}
impl<'a> Iterator for UWordBoundIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next_back()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
}
// state machine for word boundary rules
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
Start,
Letter,
HLetter,
Numeric,
Katakana,
ExtendNumLet,
Regional(RegionalState),
FormatExtend(FormatExtendType),
Zwj,
Emoji,
WSegSpace,
}
// subtypes for FormatExtend state in UWordBoundsState
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
AcceptAny,
AcceptNone,
RequireLetter,
RequireHLetter,
AcceptQLetter,
RequireNumeric,
}
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
Half,
Full,
Unknown,
}
fn is_emoji(ch: char) -> bool {
use crate::tables::emoji;
emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
}
impl<'a> Iterator for UWordBounds<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.string.len();
(cmp::min(slen, 1), Some(slen))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
use self::FormatExtendType::*;
use self::UWordBoundsState::*;
use crate::tables::word as wd;
        if self.string.is_empty() {
return None;
}
let mut take_curr = true;
let mut take_cat = true;
let mut idx = 0;
let mut saveidx = 0;
let mut state = Start;
let mut cat = wd::WC_Any;
let mut savecat = wd::WC_Any;
        // Whether any extend/format/ZWJ characters were skipped; handles precedence of WB3d over WB4
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices() {
idx = curr;
// Whether or not the previous category was ZWJ
// ZWJs get collapsed, so this handles precedence of WB3c over WB4
let prev_zwj = cat == wd::WC_ZWJ;
// if there's a category cached, grab it
cat = match self.cat {
None => wd::word_category(ch).2,
_ => self.cat.take().unwrap(),
};
take_cat = true;
// handle rule WB4
// just skip all format, extend, and zwj chars
// note that Start is a special case: if there's a bunch of Format | Extend
// characters at the beginning of a block of text, dump them out as one unit.
//
// (This is not obvious from the wording of UAX#29, but if you look at the
// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
// then the "correct" interpretation of WB4 becomes apparent.)
if state != Start {
match cat {
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
skipped_format_extend = true;
continue;
}
_ => {}
}
}
// rule WB3c
// WB4 makes all ZWJs collapse into the previous state
// but you can still be in a Zwj state if you started with Zwj
//
// This means that an EP + Zwj will collapse into EP, which is wrong,
// since EP+EP is not a boundary but EP+ZWJ+EP is
//
// Thus, we separately keep track of whether or not the last character
// was a ZWJ. This is an additional bit of state tracked outside of the
// state enum; the state enum represents the last non-zwj state encountered.
// When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
// however we are in the previous state for the purposes of all other rules.
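            // For example, an Extended_Pictographic + ZWJ + Extended_Pictographic sequence
            // (a joined emoji such as a family emoji) stays in one segment via WB3c, while
            // two pictographs with no ZWJ between them break apart under WB999.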
if prev_zwj {
if is_emoji(ch) {
state = Emoji;
continue;
}
}
// Don't use `continue` in this match without updating `cat`
state = match state {
Start if cat == wd::WC_CR => {
idx += match self.get_next_cat(idx) {
Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
_ => 0,
};
break; // rule WB3a
}
Start => match cat {
wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
wd::WC_Katakana => Katakana, // rule WB13, WB13a
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
wd::WC_ZWJ => Zwj, // rule WB3c
wd::WC_WSegSpace => WSegSpace, // rule WB3d
_ => {
if let Some(ncat) = self.get_next_cat(idx) {
// rule WB4
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
{
state = FormatExtend(AcceptNone);
self.cat = Some(ncat);
continue;
}
}
break; // rule WB999
}
},
WSegSpace => match cat {
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
_ => {
take_curr = false;
break;
}
},
Zwj => {
// We already handle WB3c above.
take_curr = false;
break;
}
Letter | HLetter => match cat {
wd::WC_ALetter => Letter, // rule WB5
wd::WC_Hebrew_Letter => HLetter, // rule WB5
wd::WC_Numeric => Numeric, // rule WB9
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_Double_Quote if state == HLetter => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireHLetter) // rule WB7b
}
wd::WC_Single_Quote if state == HLetter => {
FormatExtend(AcceptQLetter) // rule WB7a
}
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireLetter) // rule WB6
}
_ => {
take_curr = false;
break;
}
},
Numeric => match cat {
wd::WC_Numeric => Numeric, // rule WB8
wd::WC_ALetter => Letter, // rule WB10
wd::WC_Hebrew_Letter => HLetter, // rule WB10
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireNumeric) // rule WB12
}
_ => {
take_curr = false;
break;
}
},
Katakana => match cat {
wd::WC_Katakana => Katakana, // rule WB13
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
_ => {
take_curr = false;
break;
}
},
ExtendNumLet => match cat {
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_ALetter => Letter, // rule WB13b
wd::WC_Hebrew_Letter => HLetter, // rule WB13b
wd::WC_Numeric => Numeric, // rule WB13b
wd::WC_Katakana => Katakana, // rule WB13b
_ => {
take_curr = false;
break;
}
},
Regional(RegionalState::Full) => {
                    // If we reach here we've gone too far:
                    // a full flag can only be composed with ZWJ/Extend/Format
                    // preceding it.
take_curr = false;
break;
}
Regional(RegionalState::Half) => match cat {
wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
_ => {
take_curr = false;
break;
}
},
Regional(_) => {
unreachable!("RegionalState::Unknown should not occur on forward iteration")
}
Emoji => {
// We already handle WB3c above. If you've reached this point, the emoji sequence is over.
take_curr = false;
break;
}
FormatExtend(t) => match t {
                    // handle FormatExtend depending on its type
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
AcceptNone | AcceptQLetter => {
take_curr = false; // emit all the Format|Extend characters
take_cat = false;
break;
}
                    _ => break, // rewind (in the if statement below)
},
}
}
if let FormatExtend(t) = state {
// we were looking for something and didn't find it; we have to back up
if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
idx = saveidx;
cat = savecat;
take_curr = false;
}
}
self.cat = if take_curr {
            idx += self.string[idx..].chars().next().unwrap().len_utf8();
None
} else if take_cat {
Some(cat)
} else {
None
};
let retstr = &self.string[..idx];
self.string = &self.string[idx..];
Some(retstr)
}
}
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
use self::FormatExtendType::*;
use self::UWordBoundsState::*;
use crate::tables::word as wd;
        if self.string.is_empty() {
return None;
}
let mut take_curr = true;
let mut take_cat = true;
let mut idx = self.string.len();
idx -= self.string.chars().next_back().unwrap().len_utf8();
let mut previdx = idx;
let mut saveidx = idx;
let mut state = Start;
let mut savestate = Start;
let mut cat = wd::WC_Any;
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices().rev() {
previdx = idx;
idx = curr;
// if there's a category cached, grab it
cat = match self.catb {
None => wd::word_category(ch).2,
_ => self.catb.take().unwrap(),
};
take_cat = true;
// backward iterator over word boundaries. Mostly the same as the forward
// iterator, with two weirdnesses:
// (1) If we encounter a single quote in the Start state, we have to check for a
// Hebrew Letter immediately before it.
// (2) Format and Extend char handling takes some gymnastics.
if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
                // WB3c has higher priority, so we should not
                // fold in that case
if match state {
FormatExtend(_) | Start => false,
_ => true,
} {
saveidx = previdx;
savestate = state;
state = FormatExtend(AcceptNone);
}
if state != Start {
continue;
}
} else if state == FormatExtend(AcceptNone) {
// finished a scan of some Format|Extend chars, restore previous state
state = savestate;
previdx = saveidx;
take_cat = false;
skipped_format_extend = true;
}
// Don't use `continue` in this match without updating `catb`
state = match state {
Start | FormatExtend(AcceptAny) => match cat {
_ if is_emoji(ch) => Zwj,
wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
wd::WC_Katakana => Katakana, // rule WB13, WB13b
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
// rule WB4:
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
wd::WC_Single_Quote => {
saveidx = idx;
FormatExtend(AcceptQLetter) // rule WB7a
}
wd::WC_WSegSpace => WSegSpace,
wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
if state == Start {
if cat == wd::WC_LF {
idx -= match self.get_prev_cat(idx) {
Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
_ => 0,
};
}
} else {
take_curr = false;
}
break; // rule WB3a
}
_ => break, // rule WB999
},
Zwj => match cat {
// rule WB3c
wd::WC_ZWJ => FormatExtend(AcceptAny),
_ => {
take_curr = false;
break;
}
},
WSegSpace => match cat {
// rule WB3d
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
_ => {
take_curr = false;
break;
}
},
Letter | HLetter => match cat {
wd::WC_ALetter => Letter, // rule WB5
wd::WC_Hebrew_Letter => HLetter, // rule WB5
wd::WC_Numeric => Numeric, // rule WB10
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
wd::WC_Double_Quote if state == HLetter => {
saveidx = previdx;
FormatExtend(RequireHLetter) // rule WB7c
}
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
saveidx = previdx;
FormatExtend(RequireLetter) // rule WB7
}
_ => {
take_curr = false;
break;
}
},
Numeric => match cat {
wd::WC_Numeric => Numeric, // rule WB8
wd::WC_ALetter => Letter, // rule WB9
wd::WC_Hebrew_Letter => HLetter, // rule WB9
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
saveidx = previdx;
FormatExtend(RequireNumeric) // rule WB11
}
_ => {
take_curr = false;
break;
}
},
Katakana => match cat {
wd::WC_Katakana => Katakana, // rule WB13
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
_ => {
take_curr = false;
break;
}
},
ExtendNumLet => match cat {
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_ALetter => Letter, // rule WB13a
wd::WC_Hebrew_Letter => HLetter, // rule WB13a
wd::WC_Numeric => Numeric, // rule WB13a
wd::WC_Katakana => Katakana, // rule WB13a
_ => {
take_curr = false;
break;
}
},
Regional(mut regional_state) => match cat {
// rule WB13c
wd::WC_Regional_Indicator => {
if regional_state == RegionalState::Unknown {
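                            // Walking backwards we cannot yet tell whether the regional
                            // indicator we already consumed is the first or the second half
                            // of a flag. Count the run of regional indicators ending at the
                            // current character (skipping Extend/Format/ZWJ per WB4): an even
                            // count means a boundary falls between this character and the
                            // consumed one, an odd count means the two pair up into one flag.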
let count = self.string[..previdx]
.chars()
.rev()
.map(|c| wd::word_category(c).2)
.filter(|&c| {
!(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
})
.take_while(|&c| c == wd::WC_Regional_Indicator)
.count();
regional_state = if count % 2 == 0 {
RegionalState::Full
} else {
RegionalState::Half
};
}
if regional_state == RegionalState::Full {
take_curr = false;
break;
} else {
Regional(RegionalState::Full)
}
}
_ => {
take_curr = false;
break;
}
},
Emoji => {
if is_emoji(ch) {
// rule WB3c
Zwj
} else {
take_curr = false;
break;
}
}
FormatExtend(t) => match t {
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6
RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    _ => break, // backtracking happens in the if statement below
},
}
}
if let FormatExtend(t) = state {
// if we required something but didn't find it, backtrack
if t == RequireLetter
|| t == RequireHLetter
|| t == RequireNumeric
|| t == AcceptNone
|| t == AcceptQLetter
{
previdx = saveidx;
take_cat = false;
take_curr = false;
}
}
self.catb = if take_curr {
None
} else {
idx = previdx;
if take_cat {
Some(cat)
} else {
None
}
};
let retstr = &self.string[idx..];
self.string = &self.string[..idx];
Some(retstr)
}
}
impl<'a> UWordBounds<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "Hello world".split_word_bounds();
/// assert_eq!(iter.as_str(), "Hello world");
/// iter.next();
/// assert_eq!(iter.as_str(), " world");
/// iter.next();
/// assert_eq!(iter.as_str(), "world");
/// ```
pub fn as_str(&self) -> &'a str {
self.string
}
#[inline]
fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
use crate::tables::word as wd;
let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
if nidx < self.string.len() {
let nch = self.string[nidx..].chars().next().unwrap();
Some(wd::word_category(nch).2)
} else {
None
}
}
#[inline]
fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
use crate::tables::word as wd;
if idx > 0 {
let nch = self.string[..idx].chars().next_back().unwrap();
Some(wd::word_category(nch).2)
} else {
None
}
}
}
#[inline]
pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
UWordBounds {
string: s,
cat: None,
catb: None,
}
}
#[inline]
pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
UWordBoundIndices {
start_offset: s.as_ptr() as usize,
iter: new_word_bounds(s),
}
}
#[inline]
fn has_alphanumeric(s: &&str) -> bool {
use crate::tables::util::is_alphanumeric;
s.chars().any(|c| is_alphanumeric(c))
}
#[inline]
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
use super::UnicodeSegmentation;
UnicodeWords {
inner: s.split_word_bounds().filter(has_alphanumeric),
}
}
#[inline]
pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
use super::UnicodeSegmentation;
UnicodeWordIndices {
inner: s
.split_word_bound_indices()
.filter(|(_, c)| has_alphanumeric(c)),
}
}