Update the libclamav library to version 1.0.0

2023-01-14 18:28:39 +08:00
parent b879ee0b2e
commit 45fe15f472
8531 changed files with 1222046 additions and 177272 deletions


@@ -0,0 +1,258 @@
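/// A table of heuristic background frequency ranks, indexed by byte value.
/// Higher values correspond to bytes that are assumed to occur more often in
/// typical haystacks (e.g., b' ' and b'e' rank near 255, while most control
/// bytes rank low). The rare-byte detection code uses these ranks to pick the
/// needle bytes that are least likely to occur in a haystack.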
pub const BYTE_FREQUENCIES: [u8; 256] = [
55, // '\x00'
52, // '\x01'
51, // '\x02'
50, // '\x03'
49, // '\x04'
48, // '\x05'
47, // '\x06'
46, // '\x07'
45, // '\x08'
103, // '\t'
242, // '\n'
66, // '\x0b'
67, // '\x0c'
229, // '\r'
44, // '\x0e'
43, // '\x0f'
42, // '\x10'
41, // '\x11'
40, // '\x12'
39, // '\x13'
38, // '\x14'
37, // '\x15'
36, // '\x16'
35, // '\x17'
34, // '\x18'
33, // '\x19'
56, // '\x1a'
32, // '\x1b'
31, // '\x1c'
30, // '\x1d'
29, // '\x1e'
28, // '\x1f'
255, // ' '
148, // '!'
164, // '"'
149, // '#'
136, // '$'
160, // '%'
155, // '&'
173, // "'"
221, // '('
222, // ')'
134, // '*'
122, // '+'
232, // ','
202, // '-'
215, // '.'
224, // '/'
208, // '0'
220, // '1'
204, // '2'
187, // '3'
183, // '4'
179, // '5'
177, // '6'
168, // '7'
178, // '8'
200, // '9'
226, // ':'
195, // ';'
154, // '<'
184, // '='
174, // '>'
126, // '?'
120, // '@'
191, // 'A'
157, // 'B'
194, // 'C'
170, // 'D'
189, // 'E'
162, // 'F'
161, // 'G'
150, // 'H'
193, // 'I'
142, // 'J'
137, // 'K'
171, // 'L'
176, // 'M'
185, // 'N'
167, // 'O'
186, // 'P'
112, // 'Q'
175, // 'R'
192, // 'S'
188, // 'T'
156, // 'U'
140, // 'V'
143, // 'W'
123, // 'X'
133, // 'Y'
128, // 'Z'
147, // '['
138, // '\\'
146, // ']'
114, // '^'
223, // '_'
151, // '`'
249, // 'a'
216, // 'b'
238, // 'c'
236, // 'd'
253, // 'e'
227, // 'f'
218, // 'g'
230, // 'h'
247, // 'i'
135, // 'j'
180, // 'k'
241, // 'l'
233, // 'm'
246, // 'n'
244, // 'o'
231, // 'p'
139, // 'q'
245, // 'r'
243, // 's'
251, // 't'
235, // 'u'
201, // 'v'
196, // 'w'
240, // 'x'
214, // 'y'
152, // 'z'
182, // '{'
205, // '|'
181, // '}'
127, // '~'
27, // '\x7f'
212, // '\x80'
211, // '\x81'
210, // '\x82'
213, // '\x83'
228, // '\x84'
197, // '\x85'
169, // '\x86'
159, // '\x87'
131, // '\x88'
172, // '\x89'
105, // '\x8a'
80, // '\x8b'
98, // '\x8c'
96, // '\x8d'
97, // '\x8e'
81, // '\x8f'
207, // '\x90'
145, // '\x91'
116, // '\x92'
115, // '\x93'
144, // '\x94'
130, // '\x95'
153, // '\x96'
121, // '\x97'
107, // '\x98'
132, // '\x99'
109, // '\x9a'
110, // '\x9b'
124, // '\x9c'
111, // '\x9d'
82, // '\x9e'
108, // '\x9f'
118, // '\xa0'
141, // '¡'
113, // '¢'
129, // '£'
119, // '¤'
125, // '¥'
165, // '¦'
117, // '§'
92, // '¨'
106, // '©'
83, // 'ª'
72, // '«'
99, // '¬'
93, // '\xad'
65, // '®'
79, // '¯'
166, // '°'
237, // '±'
163, // '²'
199, // '³'
190, // '´'
225, // 'µ'
209, // '¶'
203, // '·'
198, // '¸'
217, // '¹'
219, // 'º'
206, // '»'
234, // '¼'
248, // '½'
158, // '¾'
239, // '¿'
255, // 'À'
255, // 'Á'
255, // 'Â'
255, // 'Ã'
255, // 'Ä'
255, // 'Å'
255, // 'Æ'
255, // 'Ç'
255, // 'È'
255, // 'É'
255, // 'Ê'
255, // 'Ë'
255, // 'Ì'
255, // 'Í'
255, // 'Î'
255, // 'Ï'
255, // 'Ð'
255, // 'Ñ'
255, // 'Ò'
255, // 'Ó'
255, // 'Ô'
255, // 'Õ'
255, // 'Ö'
255, // '×'
255, // 'Ø'
255, // 'Ù'
255, // 'Ú'
255, // 'Û'
255, // 'Ü'
255, // 'Ý'
255, // 'Þ'
255, // 'ß'
255, // 'à'
255, // 'á'
255, // 'â'
255, // 'ã'
255, // 'ä'
255, // 'å'
255, // 'æ'
255, // 'ç'
255, // 'è'
255, // 'é'
255, // 'ê'
255, // 'ë'
255, // 'ì'
255, // 'í'
255, // 'î'
255, // 'ï'
255, // 'ð'
255, // 'ñ'
255, // 'ò'
255, // 'ó'
255, // 'ô'
255, // 'õ'
255, // 'ö'
255, // '÷'
255, // 'ø'
255, // 'ù'
255, // 'ú'
255, // 'û'
255, // 'ü'
255, // 'ý'
255, // 'þ'
255, // 'ÿ'
];


@@ -0,0 +1,266 @@
use core::mem::size_of;
use crate::memmem::{util::memcmp, vector::Vector, NeedleInfo};
/// The minimum length of a needle required for this algorithm. The minimum
/// is 2 since a length of 1 should just use memchr and a length of 0 isn't
/// a case handled by this searcher.
pub(crate) const MIN_NEEDLE_LEN: usize = 2;
/// The maximum length of a needle required for this algorithm.
///
/// In reality, there is no hard max here. The code below can handle any
/// length needle. (Perhaps that suggests there are missing optimizations.)
/// Instead, this is a heuristic and a bound guaranteeing our linear time
/// complexity.
///
/// It is a heuristic because when a candidate match is found, memcmp is run.
/// For very large needles with lots of false positives, memcmp can make the
/// code run quite slow.
///
/// It is a bound because the worst case behavior with memcmp is multiplicative
/// in the size of the needle and haystack, and we want to keep that additive.
/// This bound ensures we still meet that bound theoretically, since it's just
/// a constant. We aren't acting in bad faith here, memcmp on tiny needles
/// is so fast that even in pathological cases (see pathological vector
/// benchmarks), this is still just as fast or faster in practice.
///
/// This specific number was chosen by tweaking a bit and running benchmarks.
/// The rare-medium-needle, for example, gets about 5% faster by using this
/// algorithm instead of a prefilter-accelerated Two-Way. There's also a
/// theoretical desire to keep this number reasonably low, to mitigate the
/// impact of pathological cases. I did try 64, and some benchmarks got a
/// little better, and others (particularly the pathological ones), got a lot
/// worse. So... 32 it is?
pub(crate) const MAX_NEEDLE_LEN: usize = 32;
/// The implementation of the forward vector accelerated substring search.
///
/// This is extremely similar to the prefilter vector module by the same name.
/// The key difference is that this is not a prefilter. Instead, it handles
/// confirming its own matches. The trade off is that this only works with
/// smaller needles. The speed up here is that an inlined memcmp on a tiny
/// needle is very quick, even on pathological inputs. This is much better than
/// combining a prefilter with Two-Way, where using Two-Way to confirm the
/// match has higher latency.
///
/// So why not use this for all needles? We could, and it would probably work
/// really well on most inputs. But its worst case is multiplicative and we
/// want to guarantee worst case additive time. Some of the benchmarks try to
/// justify this (see the pathological ones).
///
/// The prefilter variant of this has more comments. Also note that we only
/// implement this for forward searches for now. If you have a compelling use
/// case for accelerated reverse search, please file an issue.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Forward {
rare1i: u8,
rare2i: u8,
}
impl Forward {
/// Create a new "generic simd" forward searcher. If one could not be
/// created from the given inputs, then None is returned.
pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option<Forward> {
let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_u8();
// If the needle is too short or too long, give up. Also, give up
// if the rare bytes detected are at the same position. (It likely
// suggests a degenerate case, although it should technically not be
// possible.)
if needle.len() < MIN_NEEDLE_LEN
|| needle.len() > MAX_NEEDLE_LEN
|| rare1i == rare2i
{
return None;
}
Some(Forward { rare1i, rare2i })
}
/// Returns the minimum length of haystack that is needed for this searcher
/// to work for a particular vector. Passing a haystack with a length
/// smaller than this will cause `fwd_find` to panic.
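///
/// For example (hypothetical numbers): with a 32-byte vector and rare2i == 5,
/// the haystack must be at least 5 + 32 == 37 bytes long.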
#[inline(always)]
pub(crate) fn min_haystack_len<V: Vector>(&self) -> usize {
self.rare2i as usize + size_of::<V>()
}
}
/// Searches the given haystack for the given needle. The needle given should
/// be the same as the needle that this searcher was initialized with.
///
/// # Panics
///
/// When the given haystack has a length smaller than `min_haystack_len`.
///
/// # Safety
///
/// Since this is meant to be used with vector functions, callers need to
/// specialize this inside of a function with a `target_feature` attribute.
/// Therefore, callers must ensure that whatever target feature is being used
/// supports the vector functions that this function is specialized for. (For
/// the specific vector functions used, see the Vector trait implementations.)
#[inline(always)]
pub(crate) unsafe fn fwd_find<V: Vector>(
fwd: &Forward,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
// It would be nice if we didn't have this check here, since the meta
// searcher should handle it for us. But without this, I don't think we
// guarantee that end_ptr.sub(needle.len()) won't result in UB. We could
// put it as part of the safety contract, but it makes it more complicated
// than necessary.
if haystack.len() < needle.len() {
return None;
}
let min_haystack_len = fwd.min_haystack_len::<V>();
assert!(haystack.len() >= min_haystack_len, "haystack too small");
debug_assert!(needle.len() <= haystack.len());
debug_assert!(
needle.len() >= MIN_NEEDLE_LEN,
"needle must be at least {} bytes",
MIN_NEEDLE_LEN,
);
debug_assert!(
needle.len() <= MAX_NEEDLE_LEN,
"needle must be at most {} bytes",
MAX_NEEDLE_LEN,
);
let (rare1i, rare2i) = (fwd.rare1i as usize, fwd.rare2i as usize);
let rare1chunk = V::splat(needle[rare1i]);
let rare2chunk = V::splat(needle[rare2i]);
let start_ptr = haystack.as_ptr();
let end_ptr = start_ptr.add(haystack.len());
let max_ptr = end_ptr.sub(min_haystack_len);
let mut ptr = start_ptr;
// N.B. I did experiment with unrolling the loop to deal with size(V)
// bytes at a time and 2*size(V) bytes at a time. The double unroll was
// marginally faster while the quadruple unroll was unambiguously slower.
// In the end, I decided the complexity from unrolling wasn't worth it. I
// used the memmem/krate/prebuilt/huge-en/ benchmarks to compare.
while ptr <= max_ptr {
let m = fwd_find_in_chunk(
fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, !0,
);
if let Some(chunki) = m {
return Some(matched(start_ptr, ptr, chunki));
}
ptr = ptr.add(size_of::<V>());
}
if ptr < end_ptr {
let remaining = diff(end_ptr, ptr);
debug_assert!(
remaining < min_haystack_len,
"remaining bytes should be smaller than the minimum haystack \
length of {}, but there are {} bytes remaining",
min_haystack_len,
remaining,
);
if remaining < needle.len() {
return None;
}
debug_assert!(
max_ptr < ptr,
"after main loop, ptr should have exceeded max_ptr",
);
let overlap = diff(ptr, max_ptr);
debug_assert!(
overlap > 0,
"overlap ({}) must always be non-zero",
overlap,
);
debug_assert!(
overlap < size_of::<V>(),
"overlap ({}) cannot possibly be >= than a vector ({})",
overlap,
size_of::<V>(),
);
// The mask has all of its bits set except for the first N least
// significant bits, where N=overlap. This way, any matches that
// occur in find_in_chunk within the overlap are automatically
// ignored.
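// For example (hypothetical value): with overlap == 3, the mask is
// !0b111 == 0xffff_fff8, so candidates in the first 3 bytes of this final
// (overlapping) chunk are ignored.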
let mask = !((1 << overlap) - 1);
ptr = max_ptr;
let m = fwd_find_in_chunk(
fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, mask,
);
if let Some(chunki) = m {
return Some(matched(start_ptr, ptr, chunki));
}
}
None
}
/// Search for an occurrence of two rare bytes from the needle in the chunk
/// pointed to by ptr, with the end of the haystack pointed to by end_ptr. When
/// an occurrence is found, memcmp is run to check if a match occurs at the
/// corresponding position.
///
/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2
/// bytes repeated in each 8-bit lane, respectively.
///
/// mask should have bits set corresponding to the positions in the chunk in which
/// matches are considered. This is only used for the last vector load where
/// the beginning of the vector might have overlapped with the last load in
/// the main loop. The mask lets us avoid visiting positions that have already
/// been discarded as matches.
///
/// # Safety
///
/// It must be safe to do an unaligned read of size(V) bytes starting at both
/// (ptr + rare1i) and (ptr + rare2i). It must also be safe to do unaligned
/// loads on ptr up to (end_ptr - needle.len()).
#[inline(always)]
unsafe fn fwd_find_in_chunk<V: Vector>(
fwd: &Forward,
needle: &[u8],
ptr: *const u8,
end_ptr: *const u8,
rare1chunk: V,
rare2chunk: V,
mask: u32,
) -> Option<usize> {
let chunk0 = V::load_unaligned(ptr.add(fwd.rare1i as usize));
let chunk1 = V::load_unaligned(ptr.add(fwd.rare2i as usize));
let eq0 = chunk0.cmpeq(rare1chunk);
let eq1 = chunk1.cmpeq(rare2chunk);
let mut match_offsets = eq0.and(eq1).movemask() & mask;
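// Each set bit in match_offsets marks a candidate starting position within
// this chunk where both rare bytes match at their respective offsets; each
// candidate still needs to be confirmed with memcmp below.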
while match_offsets != 0 {
let offset = match_offsets.trailing_zeros() as usize;
let ptr = ptr.add(offset);
if end_ptr.sub(needle.len()) < ptr {
return None;
}
let chunk = core::slice::from_raw_parts(ptr, needle.len());
if memcmp(needle, chunk) {
return Some(offset);
}
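// Clear the least significant set bit so the next iteration moves on to
// the next candidate position in this chunk.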
match_offsets &= match_offsets - 1;
}
None
}
/// Accepts a chunk-relative offset and returns a haystack-relative offset.
///
/// See the same function with the same name in the prefilter variant of this
/// algorithm to learn why it's tagged with inline(never). Even here, where
/// the function is simpler, inlining it leads to poorer codegen. (Although
/// it does improve some benchmarks, like prebuiltiter/huge-en/common-you.)
#[cold]
#[inline(never)]
fn matched(start_ptr: *const u8, ptr: *const u8, chunki: usize) -> usize {
diff(ptr, start_ptr) + chunki
}
/// Subtract `b` from `a` and return the difference. `a` must be greater than
/// or equal to `b`.
fn diff(a: *const u8, b: *const u8) -> usize {
debug_assert!(a >= b);
(a as usize) - (b as usize)
}

File diff suppressed because it is too large


@@ -0,0 +1,122 @@
/*
This module implements a "fallback" prefilter that only relies on memchr to
function. While memchr works best when it's explicitly vectorized, its
fallback implementations are fast enough to make a prefilter like this
worthwhile.
The essence of this implementation is to identify two rare bytes in a needle
based on a background frequency distribution of bytes. We then run memchr on the
rarer byte. For each match, we use the second rare byte as a guard to quickly
check if a match is possible. If the position passes the guard test, then we do
a naive memcmp to confirm the match.
In practice, this formulation works amazingly well, primarily because of the
heuristic use of a background frequency distribution. However, it does have a
number of weaknesses where it can get quite slow when its background frequency
distribution doesn't line up with the haystack being searched. This is why we
have specialized vector routines that essentially take this idea and move the
guard check into vectorized code. (Those specialized vector routines do still
make use of the background frequency distribution of bytes though.)
This fallback implementation was originally formulated in regex many moons ago:
https://github.com/rust-lang/regex/blob/3db8722d0b204a85380fe2a65e13d7065d7dd968/src/literal/imp.rs#L370-L501
Prior to that, I'm not aware of anyone using this technique in any prominent
substring search implementation. Although, I'm sure folks have had this same
insight long before me.
Another version of this also appeared in bstr:
https://github.com/BurntSushi/bstr/blob/a444256ca7407fe180ee32534688549655b7a38e/src/search/prefilter.rs#L83-L340
*/
use crate::memmem::{
prefilter::{PrefilterFnTy, PrefilterState},
NeedleInfo,
};
// Check that the functions below satisfy the Prefilter function type.
const _: PrefilterFnTy = find;
/// Look for a possible occurrence of needle. The position returned
/// corresponds to the beginning of the occurrence, if one exists.
///
/// Callers may assume that this never returns false negatives (i.e., it
/// never misses an actual occurrence), but must check that the returned
/// position corresponds to a match. That is, it can return false
/// positives.
///
/// This should only be used when Freqy is constructed for forward
/// searching.
pub(crate) fn find(
prestate: &mut PrefilterState,
ninfo: &NeedleInfo,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
let mut i = 0;
let (rare1i, rare2i) = ninfo.rarebytes.as_rare_usize();
let (rare1, rare2) = ninfo.rarebytes.as_rare_bytes(needle);
while prestate.is_effective() {
// Use a fast vectorized implementation to skip to the next
// occurrence of the rarest byte (heuristically chosen) in the
// needle.
let found = crate::memchr(rare1, &haystack[i..])?;
prestate.update(found);
i += found;
// If we can't align our first match with the haystack, then a
// match is impossible.
if i < rare1i {
i += 1;
continue;
}
// Align our rare2 byte with the haystack. A mismatch means that
// a match is impossible.
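// For example (hypothetical offsets): with rare1i == 2 and rare2i == 5, a
// rare1 hit at haystack position i == 10 implies a candidate match starting
// at 10 - 2 == 8, so rare2 must appear at 8 + 5 == 13 == i - rare1i + rare2i.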
let aligned_rare2i = i - rare1i + rare2i;
if haystack.get(aligned_rare2i) != Some(&rare2) {
i += 1;
continue;
}
// We've done what we can. There might be a match here.
return Some(i - rare1i);
}
// The only way we get here is if we believe our skipping heuristic
// has become ineffective. We're allowed to return false positives,
// so return the position we have advanced to, aligned to the
// haystack.
Some(i.saturating_sub(rare1i))
}
#[cfg(all(test, feature = "std"))]
mod tests {
use super::*;
fn freqy_find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
let ninfo = NeedleInfo::new(needle);
let mut prestate = PrefilterState::new();
find(&mut prestate, &ninfo, haystack, needle)
}
#[test]
fn freqy_forward() {
assert_eq!(Some(0), freqy_find(b"BARFOO", b"BAR"));
assert_eq!(Some(3), freqy_find(b"FOOBAR", b"BAR"));
assert_eq!(Some(0), freqy_find(b"zyzz", b"zyzy"));
assert_eq!(Some(2), freqy_find(b"zzzy", b"zyzy"));
assert_eq!(None, freqy_find(b"zazb", b"zyzy"));
assert_eq!(Some(0), freqy_find(b"yzyy", b"yzyz"));
assert_eq!(Some(2), freqy_find(b"yyyz", b"yzyz"));
assert_eq!(None, freqy_find(b"yayb", b"yzyz"));
}
#[test]
#[cfg(not(miri))]
fn prefilter_permutations() {
use crate::memmem::prefilter::tests::PrefilterTest;
// SAFETY: super::find is safe to call for all inputs and on all
// platforms.
unsafe { PrefilterTest::run_all_tests(super::find) };
}
}


@@ -0,0 +1,207 @@
use core::mem::size_of;
use crate::memmem::{
prefilter::{PrefilterFnTy, PrefilterState},
vector::Vector,
NeedleInfo,
};
/// The implementation of the forward vector accelerated candidate finder.
///
/// This is inspired by the "generic SIMD" algorithm described here:
/// http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd
///
/// The main difference is that this is just a prefilter. That is, it reports
/// candidates once they are seen and doesn't attempt to confirm them. Also,
/// the bytes this routine uses to check for candidates are selected based on
/// an a priori background frequency distribution. This means that on most
/// haystacks, this will on average spend more time in vectorized code than you
/// would if you just selected the first and last bytes of the needle.
///
/// Note that a non-prefilter variant of this algorithm can be found in the
/// parent module, but it only works on smaller needles.
///
/// `prestate`, `ninfo`, `haystack` and `needle` are the four prefilter
/// function parameters. `fallback` is a prefilter that is used if the haystack
/// is too small to be handled with the given vector size.
///
/// This routine is not safe because it is intended for callers to specialize
/// this with a particular vector (e.g., __m256i) and then call it with the
/// relevant target feature (e.g., avx2) enabled.
///
/// # Panics
///
/// If `needle.len() <= 1`, then this panics.
///
/// # Safety
///
/// Since this is meant to be used with vector functions, callers need to
/// specialize this inside of a function with a `target_feature` attribute.
/// Therefore, callers must ensure that whatever target feature is being used
/// supports the vector functions that this function is specialized for. (For
/// the specific vector functions used, see the Vector trait implementations.)
#[inline(always)]
pub(crate) unsafe fn find<V: Vector>(
prestate: &mut PrefilterState,
ninfo: &NeedleInfo,
haystack: &[u8],
needle: &[u8],
fallback: PrefilterFnTy,
) -> Option<usize> {
assert!(needle.len() >= 2, "needle must be at least 2 bytes");
let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_usize();
let min_haystack_len = rare2i + size_of::<V>();
if haystack.len() < min_haystack_len {
return fallback(prestate, ninfo, haystack, needle);
}
let start_ptr = haystack.as_ptr();
let end_ptr = start_ptr.add(haystack.len());
let max_ptr = end_ptr.sub(min_haystack_len);
let mut ptr = start_ptr;
let rare1chunk = V::splat(needle[rare1i]);
let rare2chunk = V::splat(needle[rare2i]);
// N.B. I did experiment with unrolling the loop to deal with size(V)
// bytes at a time and 2*size(V) bytes at a time. The double unroll
// was marginally faster while the quadruple unroll was unambiguously
// slower. In the end, I decided the complexity from unrolling wasn't
// worth it. I used the memmem/krate/prebuilt/huge-en/ benchmarks to
// compare.
while ptr <= max_ptr {
let m = find_in_chunk2(ptr, rare1i, rare2i, rare1chunk, rare2chunk);
if let Some(chunki) = m {
return Some(matched(prestate, start_ptr, ptr, chunki));
}
ptr = ptr.add(size_of::<V>());
}
if ptr < end_ptr {
// This routine immediately quits if a candidate match is found.
// That means that if we're here, no candidate matches have been
// found at or before 'ptr'. Thus, we don't need to mask anything
// out even though we might technically search part of the haystack
// that we've already searched (because we know it can't match).
ptr = max_ptr;
let m = find_in_chunk2(ptr, rare1i, rare2i, rare1chunk, rare2chunk);
if let Some(chunki) = m {
return Some(matched(prestate, start_ptr, ptr, chunki));
}
}
prestate.update(haystack.len());
None
}
// Below are two different techniques for checking whether a candidate
// match exists in a given chunk or not. find_in_chunk2 checks two bytes
// whereas find_in_chunk3 checks three bytes. The idea behind checking
// three bytes is that while we do a bit more work per iteration, we
// decrease the chances of a false positive match being reported and thus
// make the search faster overall. This actually works out for the
// memmem/krate/prebuilt/huge-en/never-all-common-bytes benchmark, where
// using find_in_chunk3 is about 25% faster than find_in_chunk2. However,
// it turns out that find_in_chunk2 is faster for all other benchmarks, so
// perhaps the extra check isn't worth it in practice.
//
// For now, we go with find_in_chunk2, but we leave find_in_chunk3 around
// to make it easy to switch to and benchmark when possible.
/// Search for an occurrence of two rare bytes from the needle in the current
/// chunk pointed to by ptr.
///
/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2
/// bytes repeated in each 8-bit lane, respectively.
///
/// # Safety
///
/// It must be safe to do an unaligned read of size(V) bytes starting at both
/// (ptr + rare1i) and (ptr + rare2i).
#[inline(always)]
unsafe fn find_in_chunk2<V: Vector>(
ptr: *const u8,
rare1i: usize,
rare2i: usize,
rare1chunk: V,
rare2chunk: V,
) -> Option<usize> {
let chunk0 = V::load_unaligned(ptr.add(rare1i));
let chunk1 = V::load_unaligned(ptr.add(rare2i));
let eq0 = chunk0.cmpeq(rare1chunk);
let eq1 = chunk1.cmpeq(rare2chunk);
let match_offsets = eq0.and(eq1).movemask();
if match_offsets == 0 {
return None;
}
Some(match_offsets.trailing_zeros() as usize)
}
/// Search for an occurrence of two rare bytes and the first byte (even if one
/// of the rare bytes is equivalent to the first byte) from the needle in the
/// current chunk pointed to by ptr.
///
/// firstchunk, rare1chunk and rare2chunk correspond to vectors with the first,
/// rare1 and rare2 bytes repeated in each 8-bit lane, respectively.
///
/// # Safety
///
/// It must be safe to do an unaligned read of size(V) bytes starting at ptr,
/// (ptr + rare1i) and (ptr + rare2i).
#[allow(dead_code)]
#[inline(always)]
unsafe fn find_in_chunk3<V: Vector>(
ptr: *const u8,
rare1i: usize,
rare2i: usize,
firstchunk: V,
rare1chunk: V,
rare2chunk: V,
) -> Option<usize> {
let chunk0 = V::load_unaligned(ptr);
let chunk1 = V::load_unaligned(ptr.add(rare1i));
let chunk2 = V::load_unaligned(ptr.add(rare2i));
let eq0 = chunk0.cmpeq(firstchunk);
let eq1 = chunk1.cmpeq(rare1chunk);
let eq2 = chunk2.cmpeq(rare2chunk);
let match_offsets = eq0.and(eq1).and(eq2).movemask();
if match_offsets == 0 {
return None;
}
Some(match_offsets.trailing_zeros() as usize)
}
/// Accepts a chunk-relative offset and returns a haystack relative offset
/// after updating the prefilter state.
///
/// Why do we use this un-inlineable function when a search completes? Well,
/// I don't know. Really. Obviously this function was not here initially.
/// When doing profiling, the codegen for the inner loop here looked bad and
/// I didn't know why. There were a couple extra 'add' instructions and an
/// extra 'lea' instruction that I couldn't explain. I hypothesized that the
/// optimizer was having trouble untangling the hot code in the loop from the
/// code that deals with a candidate match. By putting the latter into an
/// un-inlineable function, it kind of forces the issue and it had the intended
/// effect: codegen improved measurably. It's good for a ~10% improvement
/// across the board on the memmem/krate/prebuilt/huge-en/ benchmarks.
#[cold]
#[inline(never)]
fn matched(
prestate: &mut PrefilterState,
start_ptr: *const u8,
ptr: *const u8,
chunki: usize,
) -> usize {
let found = diff(ptr, start_ptr) + chunki;
prestate.update(found);
found
}
/// Subtract `b` from `a` and return the difference. `a` must be greater than
/// or equal to `b`.
fn diff(a: *const u8, b: *const u8) -> usize {
debug_assert!(a >= b);
(a as usize) - (b as usize)
}


@@ -0,0 +1,570 @@
use crate::memmem::{rarebytes::RareNeedleBytes, NeedleInfo};
mod fallback;
#[cfg(memchr_runtime_simd)]
mod genericsimd;
#[cfg(all(not(miri), target_arch = "wasm32", memchr_runtime_simd))]
mod wasm;
#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
mod x86;
/// The maximum frequency rank permitted for the fallback prefilter. If the
/// rarest byte in the needle has a frequency rank above this value, then no
/// prefilter is used if the fallback prefilter would otherwise be selected.
const MAX_FALLBACK_RANK: usize = 250;
/// A combination of prefilter effectiveness state, the prefilter function and
/// the needle info required to run a prefilter.
///
/// For the most part, these are grouped into a single type for convenience,
/// instead of needing to pass around all three as distinct function
/// parameters.
pub(crate) struct Pre<'a> {
/// State that tracks the effectiveness of a prefilter.
pub(crate) state: &'a mut PrefilterState,
/// The actual prefilter function.
pub(crate) prefn: PrefilterFn,
/// Information about a needle, such as its RK hash and rare byte offsets.
pub(crate) ninfo: &'a NeedleInfo,
}
impl<'a> Pre<'a> {
/// Call this prefilter on the given haystack with the given needle.
#[inline(always)]
pub(crate) fn call(
&mut self,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
self.prefn.call(self.state, self.ninfo, haystack, needle)
}
/// Return true if and only if this prefilter should be used.
#[inline(always)]
pub(crate) fn should_call(&mut self) -> bool {
self.state.is_effective()
}
}
/// A prefilter function.
///
/// A prefilter function describes both forward and reverse searches.
/// (Although, we don't currently implement prefilters for reverse searching.)
/// In the case of a forward search, the position returned corresponds to
/// the starting offset of a match (confirmed or possible). Its minimum
/// value is `0`, and its maximum value is `haystack.len() - 1`. In the case
/// of a reverse search, the position returned corresponds to the position
/// immediately after a match (confirmed or possible). Its minimum value is `1`
/// and its maximum value is `haystack.len()`.
///
/// In both cases, the position returned is the starting (or ending) point of a
/// _possible_ match. That is, returning a false positive is okay. A prefilter,
/// however, must never return any false negatives. That is, if a match exists
/// at a particular position `i`, then a prefilter _must_ return that position.
/// It cannot skip past it.
///
/// # Safety
///
/// A prefilter function is not safe to create, since not all prefilters are
/// safe to call in all contexts. (e.g., A prefilter that uses AVX instructions
/// may only be called on x86_64 CPUs with the relevant AVX feature enabled.)
/// Thus, callers must ensure that when a prefilter function is created that it
/// is safe to call for the current environment.
#[derive(Clone, Copy)]
pub(crate) struct PrefilterFn(PrefilterFnTy);
/// The type of a prefilter function. All prefilters must satisfy this
/// signature.
///
/// Using a function pointer like this does inhibit inlining, but it does
/// eliminate branching and the extra costs associated with copying a larger
/// enum. Note also, that using Box<dyn SomePrefilterTrait> can't really work
/// here, since we want to work in contexts that don't have dynamic memory
/// allocation. Moreover, in the default configuration of this crate on x86_64
/// CPUs released in the past ~decade, we will use an AVX2-optimized prefilter,
/// which generally won't be inlineable into the surrounding code anyway.
/// (Unless AVX2 is enabled at compile time, but this is typically rare, since
/// it produces a non-portable binary.)
pub(crate) type PrefilterFnTy = unsafe fn(
prestate: &mut PrefilterState,
ninfo: &NeedleInfo,
haystack: &[u8],
needle: &[u8],
) -> Option<usize>;
// If the haystack is too small for SSE2, then just run memchr on the
// rarest byte and be done with it. (It is likely that this code path is
// rarely exercised, since a higher level routine will probably dispatch to
// Rabin-Karp for such a small haystack.)
#[cfg(memchr_runtime_simd)]
fn simple_memchr_fallback(
_prestate: &mut PrefilterState,
ninfo: &NeedleInfo,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
let (rare, _) = ninfo.rarebytes.as_rare_ordered_usize();
crate::memchr(needle[rare], haystack).map(|i| i.saturating_sub(rare))
}
impl PrefilterFn {
/// Create a new prefilter function from the function pointer given.
///
/// # Safety
///
/// Callers must ensure that the given prefilter function is safe to call
/// for all inputs in the current environment. For example, if the given
/// prefilter function uses AVX instructions, then the caller must ensure
/// that the appropriate AVX CPU features are enabled.
pub(crate) unsafe fn new(prefn: PrefilterFnTy) -> PrefilterFn {
PrefilterFn(prefn)
}
/// Call the underlying prefilter function with the given arguments.
pub fn call(
self,
prestate: &mut PrefilterState,
ninfo: &NeedleInfo,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
// SAFETY: Callers have the burden of ensuring that a prefilter
// function is safe to call for all inputs in the current environment.
unsafe { (self.0)(prestate, ninfo, haystack, needle) }
}
}
impl core::fmt::Debug for PrefilterFn {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
"<prefilter-fn(...)>".fmt(f)
}
}
/// Prefilter controls whether heuristics are used to accelerate searching.
///
/// A prefilter refers to the idea of detecting candidate matches very quickly,
/// and then confirming whether those candidates are full matches. This
/// idea can be quite effective since it's often the case that looking for
/// candidates can be a lot faster than running a complete substring search
/// over the entire input. Namely, looking for candidates can be done with
/// extremely fast vectorized code.
///
/// The downside of a prefilter is that it assumes false positives (which are
/// candidates generated by a prefilter that aren't matches) are somewhat rare
/// relative to the frequency of full matches. That is, if a lot of false
/// positives are generated, then it's possible for search time to be worse
/// than if the prefilter wasn't enabled in the first place.
///
/// Another downside of a prefilter is that it can result in highly variable
/// performance, where some cases are extraordinarily fast and others aren't.
/// Typically, variable performance isn't a problem, but it may be for your use
/// case.
///
/// The prefilters in this implementation use a heuristic to detect when a
/// prefilter might not be carrying its weight, and will dynamically disable
/// its use. Nevertheless, this configuration option gives callers the ability
/// to disable prefilters if they know that prefilters won't be useful.
#[derive(Clone, Copy, Debug)]
#[non_exhaustive]
pub enum Prefilter {
/// Never use a prefilter in substring search.
None,
/// Automatically detect whether a heuristic prefilter should be used. If
/// it is used, then heuristics will be used to dynamically disable the
/// prefilter if it is believed to not be carrying its weight.
Auto,
}
impl Default for Prefilter {
fn default() -> Prefilter {
Prefilter::Auto
}
}
impl Prefilter {
pub(crate) fn is_none(&self) -> bool {
match *self {
Prefilter::None => true,
_ => false,
}
}
}
/// PrefilterState tracks state associated with the effectiveness of a
/// prefilter. It is used to track how many bytes, on average, are skipped by
/// the prefilter. If this average dips below a certain threshold over time,
/// then the state renders the prefilter inert and stops using it.
///
/// A prefilter state should be created for each search. (Where creating an
/// iterator is treated as a single search.) A prefilter state should only be
/// created from a `Freqy`. e.g., An inert `Freqy` will produce an inert
/// `PrefilterState`.
#[derive(Clone, Debug)]
pub(crate) struct PrefilterState {
/// The number of skips that have been executed. This is always 1 greater
/// than the actual number of skips. The special sentinel value of 0
/// indicates that the prefilter is inert. This is useful to avoid
/// additional checks to determine whether the prefilter is still
/// "effective." Once a prefilter becomes inert, it should no longer be
/// used (according to our heuristics).
skips: u32,
/// The total number of bytes that have been skipped.
skipped: u32,
}
impl PrefilterState {
/// The minimum number of skip attempts to try before considering whether
/// a prefilter is effective or not.
const MIN_SKIPS: u32 = 50;
/// The minimum number of bytes that skipping must average.
///
/// This value was chosen based on varying it and checking
/// the microbenchmarks. In particular, this can impact the
/// pathological/repeated-{huge,small} benchmarks quite a bit if it's set
/// too low.
const MIN_SKIP_BYTES: u32 = 8;
/// Create a fresh prefilter state.
pub(crate) fn new() -> PrefilterState {
PrefilterState { skips: 1, skipped: 0 }
}
/// Create a fresh prefilter state that is always inert.
pub(crate) fn inert() -> PrefilterState {
PrefilterState { skips: 0, skipped: 0 }
}
/// Update this state with the number of bytes skipped on the last
/// invocation of the prefilter.
#[inline]
pub(crate) fn update(&mut self, skipped: usize) {
self.skips = self.skips.saturating_add(1);
// We need to do this dance since it's technically possible for
// `skipped` to overflow a `u32`. (And we use a `u32` to reduce the
// size of a prefilter state.)
if skipped > core::u32::MAX as usize {
self.skipped = core::u32::MAX;
} else {
self.skipped = self.skipped.saturating_add(skipped as u32);
}
}
/// Return true if and only if this state indicates that a prefilter is
/// still effective.
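///
/// For example: once at least MIN_SKIPS == 50 skips have been recorded, the
/// prefilter stays enabled only while it averages at least
/// MIN_SKIP_BYTES == 8 skipped bytes per skip (e.g., 8 * 50 == 400 bytes in
/// total after 50 skips).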
#[inline]
pub(crate) fn is_effective(&mut self) -> bool {
if self.is_inert() {
return false;
}
if self.skips() < PrefilterState::MIN_SKIPS {
return true;
}
if self.skipped >= PrefilterState::MIN_SKIP_BYTES * self.skips() {
return true;
}
// We're inert.
self.skips = 0;
false
}
#[inline]
fn is_inert(&self) -> bool {
self.skips == 0
}
#[inline]
fn skips(&self) -> u32 {
self.skips.saturating_sub(1)
}
}
/// Determine which prefilter function, if any, to use.
///
/// This only applies to x86_64 when runtime SIMD detection is enabled (which
/// is the default). In general, we try to use an AVX prefilter, followed by
/// SSE, and then a generic one based on memchr.
#[inline(always)]
pub(crate) fn forward(
config: &Prefilter,
rare: &RareNeedleBytes,
needle: &[u8],
) -> Option<PrefilterFn> {
if config.is_none() || needle.len() <= 1 {
return None;
}
#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
{
#[cfg(feature = "std")]
{
if cfg!(memchr_runtime_avx) {
if is_x86_feature_detected!("avx2") {
// SAFETY: x86::avx::find only requires the avx2 feature,
// which we've just checked above.
return unsafe { Some(PrefilterFn::new(x86::avx::find)) };
}
}
}
if cfg!(memchr_runtime_sse2) {
// SAFETY: x86::sse::find only requires the sse2 feature, which is
// guaranteed to be available on x86_64.
return unsafe { Some(PrefilterFn::new(x86::sse::find)) };
}
}
#[cfg(all(not(miri), target_arch = "wasm32", memchr_runtime_simd))]
{
// SAFETY: `wasm::find` is actually a safe function
//
// Also note that the `if true` is here to prevent rustc, on wasm with
// simd, from warning that the code below is dead code.
if true {
return unsafe { Some(PrefilterFn::new(wasm::find)) };
}
}
// Check that our rarest byte has a reasonably low rank. The main issue
// here is that the fallback prefilter can perform pretty poorly if it's
// given common bytes. So we try to avoid the worst cases here.
let (rare1_rank, _) = rare.as_ranks(needle);
if rare1_rank <= MAX_FALLBACK_RANK {
// SAFETY: fallback::find is safe to call in all environments.
return unsafe { Some(PrefilterFn::new(fallback::find)) };
}
None
}
/// Return the minimum length of the haystack for which a prefilter should be
/// used. If the haystack is below this length, then it's probably not worth
/// the overhead of running the prefilter.
///
/// We used to look at the length of a haystack here. That is, if it was too
/// small, then don't bother with the prefilter. But two things changed:
/// the prefilter falls back to memchr for small haystacks, and, at the
/// meta-searcher level, Rabin-Karp is employed for tiny haystacks anyway.
///
/// We keep it around for now in case we want to bring it back.
#[allow(dead_code)]
pub(crate) fn minimum_len(_haystack: &[u8], needle: &[u8]) -> usize {
// If the haystack length isn't greater than needle.len() * FACTOR, then
// no prefilter will be used. The presumption here is that since there
// are so few bytes to check, it's not worth running the prefilter since
// there will need to be a validation step anyway. Thus, the prefilter is
// largely redundant work.
//
// Increasing the factor noticeably hurts the
// memmem/krate/prebuilt/teeny-*/never-john-watson benchmarks.
const PREFILTER_LENGTH_FACTOR: usize = 2;
const VECTOR_MIN_LENGTH: usize = 16;
let min = core::cmp::max(
VECTOR_MIN_LENGTH,
PREFILTER_LENGTH_FACTOR * needle.len(),
);
// For haystacks with length==min, we still want to avoid the prefilter,
// so add 1.
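// For example (hypothetical needle of 10 bytes): max(16, 2 * 10) + 1 == 21,
// so a prefilter would only be considered for haystacks of at least 21 bytes.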
min + 1
}
#[cfg(all(test, feature = "std", not(miri)))]
pub(crate) mod tests {
use std::convert::{TryFrom, TryInto};
use super::*;
use crate::memmem::{
prefilter::PrefilterFnTy, rabinkarp, rarebytes::RareNeedleBytes,
};
// Below is a small jig that generates prefilter tests. The main purpose
// of this jig is to generate tests of varying needle/haystack lengths
// in order to try and exercise all code paths in our prefilters. And in
// particular, this is especially important for vectorized prefilters where
// certain code paths might only be exercised at certain lengths.
/// A test that represents the input and expected output to a prefilter
/// function. The test should be able to run with any prefilter function
/// and get the expected output.
pub(crate) struct PrefilterTest {
// These fields represent the inputs and expected output of a forwards
// prefilter function.
pub(crate) ninfo: NeedleInfo,
pub(crate) haystack: Vec<u8>,
pub(crate) needle: Vec<u8>,
pub(crate) output: Option<usize>,
}
impl PrefilterTest {
/// Run all generated forward prefilter tests on the given prefn.
///
/// # Safety
///
/// Callers must ensure that the given prefilter function pointer is
/// safe to call for all inputs in the current environment.
pub(crate) unsafe fn run_all_tests(prefn: PrefilterFnTy) {
PrefilterTest::run_all_tests_filter(prefn, |_| true)
}
/// Run all generated forward prefilter tests that pass the given
/// predicate on the given prefn.
///
/// # Safety
///
/// Callers must ensure that the given prefilter function pointer is
/// safe to call for all inputs in the current environment.
pub(crate) unsafe fn run_all_tests_filter(
prefn: PrefilterFnTy,
mut predicate: impl FnMut(&PrefilterTest) -> bool,
) {
for seed in PREFILTER_TEST_SEEDS {
for test in seed.generate() {
if predicate(&test) {
test.run(prefn);
}
}
}
}
/// Create a new prefilter test from a seed and some chosen offsets to
/// rare bytes in the seed's needle.
///
/// If a valid test could not be constructed, then None is returned.
/// (Currently, we take the approach of massaging tests to be valid
/// instead of rejecting them outright.)
fn new(
seed: PrefilterTestSeed,
rare1i: usize,
rare2i: usize,
haystack_len: usize,
needle_len: usize,
output: Option<usize>,
) -> Option<PrefilterTest> {
let mut rare1i: u8 = rare1i.try_into().unwrap();
let mut rare2i: u8 = rare2i.try_into().unwrap();
// The '#' byte is never used in a haystack (unless we're expecting
// a match), while the '@' byte is never used in a needle.
let mut haystack = vec![b'@'; haystack_len];
let mut needle = vec![b'#'; needle_len];
needle[0] = seed.first;
needle[rare1i as usize] = seed.rare1;
needle[rare2i as usize] = seed.rare2;
// If we're expecting a match, then make sure the needle occurs
// in the haystack at the expected position.
if let Some(i) = output {
haystack[i..i + needle.len()].copy_from_slice(&needle);
}
// If the operations above lead to rare offsets pointing to the
// non-first occurrence of a byte, then adjust it. This might lead
// to redundant tests, but it's simpler than trying to change the
// generation process I think.
if let Some(i) = crate::memchr(seed.rare1, &needle) {
rare1i = u8::try_from(i).unwrap();
}
if let Some(i) = crate::memchr(seed.rare2, &needle) {
rare2i = u8::try_from(i).unwrap();
}
let ninfo = NeedleInfo {
rarebytes: RareNeedleBytes::new(rare1i, rare2i),
nhash: rabinkarp::NeedleHash::forward(&needle),
};
Some(PrefilterTest { ninfo, haystack, needle, output })
}
/// Run this specific test on the given prefilter function. If the
/// outputs do not match, then this routine panics with a failure
/// message.
///
/// # Safety
///
/// Callers must ensure that the given prefilter function pointer is
/// safe to call for all inputs in the current environment.
unsafe fn run(&self, prefn: PrefilterFnTy) {
let mut prestate = PrefilterState::new();
assert_eq!(
self.output,
prefn(
&mut prestate,
&self.ninfo,
&self.haystack,
&self.needle
),
"ninfo: {:?}, haystack(len={}): {:?}, needle(len={}): {:?}",
self.ninfo,
self.haystack.len(),
std::str::from_utf8(&self.haystack).unwrap(),
self.needle.len(),
std::str::from_utf8(&self.needle).unwrap(),
);
}
}
/// A set of prefilter test seeds. Each seed serves as the base for the
/// generation of many other tests. In essence, the seed captures the
/// "rare" and first bytes among our needle. The tests generated from each
/// seed essentially vary the length of the needle and haystack, while
/// using the rare/first byte configuration from the seed.
///
/// The purpose of this is to test many different needle/haystack lengths.
/// In particular, some of the vector optimizations might only have bugs
/// in haystacks of a certain size.
const PREFILTER_TEST_SEEDS: &[PrefilterTestSeed] = &[
PrefilterTestSeed { first: b'x', rare1: b'y', rare2: b'z' },
PrefilterTestSeed { first: b'x', rare1: b'x', rare2: b'z' },
PrefilterTestSeed { first: b'x', rare1: b'y', rare2: b'x' },
PrefilterTestSeed { first: b'x', rare1: b'x', rare2: b'x' },
PrefilterTestSeed { first: b'x', rare1: b'y', rare2: b'y' },
];
/// Data that describes a single prefilter test seed.
#[derive(Clone, Copy)]
struct PrefilterTestSeed {
first: u8,
rare1: u8,
rare2: u8,
}
impl PrefilterTestSeed {
/// Generate a series of prefilter tests from this seed.
fn generate(self) -> impl Iterator<Item = PrefilterTest> {
let len_start = 2;
// The iterator below generates *a lot* of tests. The number of
// tests was chosen somewhat empirically to be "bearable" when
// running the test suite.
//
// We use an iterator here because the collective haystacks of all
// these test cases add up to enough memory to OOM a conservative
// sandbox or a small laptop.
(len_start..=40).flat_map(move |needle_len| {
let rare_start = len_start - 1;
(rare_start..needle_len).flat_map(move |rare1i| {
(rare1i..needle_len).flat_map(move |rare2i| {
(needle_len..=66).flat_map(move |haystack_len| {
PrefilterTest::new(
self,
rare1i,
rare2i,
haystack_len,
needle_len,
None,
)
.into_iter()
.chain(
(0..=(haystack_len - needle_len)).flat_map(
move |output| {
PrefilterTest::new(
self,
rare1i,
rare2i,
haystack_len,
needle_len,
Some(output),
)
},
),
)
})
})
})
})
}
}
}


@@ -0,0 +1,39 @@
use core::arch::wasm32::v128;
use crate::memmem::{
prefilter::{PrefilterFnTy, PrefilterState},
NeedleInfo,
};
// Check that the functions below satisfy the Prefilter function type.
const _: PrefilterFnTy = find;
/// A `v128`-accelerated candidate finder for single-substring search.
#[target_feature(enable = "simd128")]
pub(crate) fn find(
prestate: &mut PrefilterState,
ninfo: &NeedleInfo,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
unsafe {
super::genericsimd::find::<v128>(
prestate,
ninfo,
haystack,
needle,
super::simple_memchr_fallback,
)
}
}
#[cfg(all(test, feature = "std"))]
mod tests {
#[test]
#[cfg(not(miri))]
fn prefilter_permutations() {
use crate::memmem::prefilter::tests::PrefilterTest;
// SAFETY: super::find is safe to call for all inputs.
unsafe { PrefilterTest::run_all_tests(super::find) };
}
}


@@ -0,0 +1,46 @@
use core::arch::x86_64::__m256i;
use crate::memmem::{
prefilter::{PrefilterFnTy, PrefilterState},
NeedleInfo,
};
// Check that the functions below satisfy the Prefilter function type.
const _: PrefilterFnTy = find;
/// An AVX2 accelerated candidate finder for single-substring search.
///
/// # Safety
///
/// Callers must ensure that the avx2 CPU feature is enabled in the current
/// environment.
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn find(
prestate: &mut PrefilterState,
ninfo: &NeedleInfo,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
super::super::genericsimd::find::<__m256i>(
prestate,
ninfo,
haystack,
needle,
super::sse::find,
)
}
#[cfg(test)]
mod tests {
#[test]
#[cfg(not(miri))]
fn prefilter_permutations() {
use crate::memmem::prefilter::tests::PrefilterTest;
if !is_x86_feature_detected!("avx2") {
return;
}
// SAFETY: The safety of super::find only requires that the current
// CPU support AVX2, which we checked above.
unsafe { PrefilterTest::run_all_tests(super::find) };
}
}


@@ -0,0 +1,5 @@
// We only use AVX when we can detect at runtime whether it's available, which
// requires std.
#[cfg(feature = "std")]
pub(crate) mod avx;
pub(crate) mod sse;


@@ -0,0 +1,42 @@
use core::arch::x86_64::__m128i;
use crate::memmem::{
prefilter::{PrefilterFnTy, PrefilterState},
NeedleInfo,
};
// Check that the functions below satisfy the Prefilter function type.
const _: PrefilterFnTy = find;
/// An SSE2 accelerated candidate finder for single-substring search.
///
/// # Safety
///
/// Callers must ensure that the sse2 CPU feature is enabled in the current
/// environment. This feature should be enabled in all x86_64 targets.
#[target_feature(enable = "sse2")]
pub(crate) unsafe fn find(
prestate: &mut PrefilterState,
ninfo: &NeedleInfo,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
super::super::genericsimd::find::<__m128i>(
prestate,
ninfo,
haystack,
needle,
super::super::simple_memchr_fallback,
)
}
#[cfg(all(test, feature = "std"))]
mod tests {
#[test]
#[cfg(not(miri))]
fn prefilter_permutations() {
use crate::memmem::prefilter::tests::PrefilterTest;
// SAFETY: super::find is safe to call for all inputs on x86_64, where
// sse2 is always available.
unsafe { PrefilterTest::run_all_tests(super::find) };
}
}


@@ -0,0 +1,233 @@
/*
This module implements the classical Rabin-Karp substring search algorithm,
with no extra frills. While its use would seem to break our time complexity
guarantee of O(m+n) (RK's time complexity is O(mn)), we are careful to only
ever use RK on a constant subset of haystacks. The main point here is that
RK has good latency properties for small needles/haystacks. It's very quick
to compute a needle hash and zip through the haystack when compared to
initializing Two-Way, for example. And this is especially useful for cases
where the haystack is just too short for vector instructions to do much good.
The hashing function used here is the same one recommended by ESMAJ.
Another choice instead of Rabin-Karp would be Shift-Or. But its latency
isn't quite as good since its preprocessing time is a bit more expensive
(both in practice and in theory). However, perhaps Shift-Or has a place
somewhere else for short patterns. I think the main problem is that it
requires space proportional to the alphabet and the needle. If we, for
example, supported needles up to length 16, then the total table size would be
len(alphabet)*size_of::<u16>()==512 bytes. Which isn't exactly small, and it's
probably bad to put that on the stack. So ideally, we'd throw it on the heap,
but we'd really like to write as much code as possible without using alloc/std.
But maybe it's worth the special casing. It's a TODO to benchmark.
Wikipedia has a decent explanation, if a bit heavy on the theory:
https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
But ESMAJ provides something a bit more concrete:
http://www-igm.univ-mlv.fr/~lecroq/string/node5.html
Finally, aho-corasick uses Rabin-Karp for multiple pattern match in some cases:
https://github.com/BurntSushi/aho-corasick/blob/3852632f10587db0ff72ef29e88d58bf305a0946/src/packed/rabinkarp.rs
*/
/// Whether RK is believed to be very fast for the given needle/haystack.
pub(crate) fn is_fast(haystack: &[u8], _needle: &[u8]) -> bool {
haystack.len() < 16
}
/// Search for the first occurrence of needle in haystack using Rabin-Karp.
pub(crate) fn find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
find_with(&NeedleHash::forward(needle), haystack, needle)
}
/// Search for the first occurrence of needle in haystack using Rabin-Karp with
/// a pre-computed needle hash.
pub(crate) fn find_with(
nhash: &NeedleHash,
mut haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
if haystack.len() < needle.len() {
return None;
}
let start = haystack.as_ptr() as usize;
let mut hash = Hash::from_bytes_fwd(&haystack[..needle.len()]);
// N.B. I've experimented with unrolling this loop, but couldn't realize
// any obvious gains.
loop {
if nhash.eq(hash) && is_prefix(haystack, needle) {
return Some(haystack.as_ptr() as usize - start);
}
if needle.len() >= haystack.len() {
return None;
}
hash.roll(&nhash, haystack[0], haystack[needle.len()]);
haystack = &haystack[1..];
}
}
/// Search for the last occurrence of needle in haystack using Rabin-Karp.
pub(crate) fn rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> {
rfind_with(&NeedleHash::reverse(needle), haystack, needle)
}
/// Search for the last occurrence of needle in haystack using Rabin-Karp with
/// a pre-computed needle hash.
pub(crate) fn rfind_with(
nhash: &NeedleHash,
mut haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
if haystack.len() < needle.len() {
return None;
}
let mut hash =
Hash::from_bytes_rev(&haystack[haystack.len() - needle.len()..]);
loop {
if nhash.eq(hash) && is_suffix(haystack, needle) {
return Some(haystack.len() - needle.len());
}
if needle.len() >= haystack.len() {
return None;
}
hash.roll(
&nhash,
haystack[haystack.len() - 1],
haystack[haystack.len() - needle.len() - 1],
);
haystack = &haystack[..haystack.len() - 1];
}
}
/// A hash derived from a needle.
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct NeedleHash {
/// The actual hash.
hash: Hash,
/// The factor needed to multiply a byte by in order to subtract it from
/// the hash. It is defined to be 2^(n-1) (using wrapping exponentiation),
/// where n is the length of the needle. This is how we "remove" a byte
/// from the hash once the hash window rolls past it.
hash_2pow: u32,
}
impl NeedleHash {
/// Create a new Rabin-Karp hash for the given needle for use in forward
/// searching.
pub(crate) fn forward(needle: &[u8]) -> NeedleHash {
let mut nh = NeedleHash { hash: Hash::new(), hash_2pow: 1 };
if needle.is_empty() {
return nh;
}
nh.hash.add(needle[0]);
for &b in needle.iter().skip(1) {
nh.hash.add(b);
nh.hash_2pow = nh.hash_2pow.wrapping_shl(1);
}
nh
}
/// Create a new Rabin-Karp hash for the given needle for use in reverse
/// searching.
pub(crate) fn reverse(needle: &[u8]) -> NeedleHash {
let mut nh = NeedleHash { hash: Hash::new(), hash_2pow: 1 };
if needle.is_empty() {
return nh;
}
nh.hash.add(needle[needle.len() - 1]);
for &b in needle.iter().rev().skip(1) {
nh.hash.add(b);
nh.hash_2pow = nh.hash_2pow.wrapping_shl(1);
}
nh
}
/// Return true if the hashes are equivalent.
fn eq(&self, hash: Hash) -> bool {
self.hash == hash
}
}
/// A Rabin-Karp hash. This might represent the hash of a needle, or the hash
/// of a rolling window in the haystack.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub(crate) struct Hash(u32);
impl Hash {
/// Create a new hash that represents the empty string.
pub(crate) fn new() -> Hash {
Hash(0)
}
/// Create a new hash from the bytes given for use in forward searches.
pub(crate) fn from_bytes_fwd(bytes: &[u8]) -> Hash {
let mut hash = Hash::new();
for &b in bytes {
hash.add(b);
}
hash
}
/// Create a new hash from the bytes given for use in reverse searches.
fn from_bytes_rev(bytes: &[u8]) -> Hash {
let mut hash = Hash::new();
for &b in bytes.iter().rev() {
hash.add(b);
}
hash
}
/// Add 'new' and remove 'old' from this hash. The given needle hash should
/// correspond to the hash computed for the needle being searched for.
///
/// This is meant to be used when the rolling window of the haystack is
/// advanced.
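///
/// For example (hypothetical 3-byte window): the hash of bytes (b0, b1, b2)
/// is 4*b0 + 2*b1 + b2 (wrapping), with hash_2pow == 4. Rolling with
/// old == b0 and new == b3 first subtracts 4*b0 and then shifts left and adds
/// b3, yielding 4*b1 + 2*b2 + b3, the hash of the advanced window.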
fn roll(&mut self, nhash: &NeedleHash, old: u8, new: u8) {
self.del(nhash, old);
self.add(new);
}
/// Add a byte to this hash.
fn add(&mut self, byte: u8) {
self.0 = self.0.wrapping_shl(1).wrapping_add(byte as u32);
}
/// Remove a byte from this hash. The given needle hash should correspond
/// to the hash computed for the needle being searched for.
fn del(&mut self, nhash: &NeedleHash, byte: u8) {
let factor = nhash.hash_2pow;
self.0 = self.0.wrapping_sub((byte as u32).wrapping_mul(factor));
}
}
/// Returns true if the given needle is a prefix of the given haystack.
///
/// We forcefully don't inline the is_prefix call and hint to the compiler that
/// it is unlikely to be called. This causes the inner rabinkarp loop above
/// to be a bit tighter and leads to some performance improvement. See the
/// memmem/krate/prebuilt/sliceslice-words/words benchmark.
#[cold]
#[inline(never)]
fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {
crate::memmem::util::is_prefix(haystack, needle)
}
/// Returns true if the given needle is a suffix of the given haystack.
///
/// See is_prefix for why this is forcefully not inlined.
#[cold]
#[inline(never)]
fn is_suffix(haystack: &[u8], needle: &[u8]) -> bool {
crate::memmem::util::is_suffix(haystack, needle)
}
#[cfg(test)]
mod simpletests {
define_memmem_simple_tests!(super::find, super::rfind);
}
#[cfg(all(test, feature = "std", not(miri)))]
mod proptests {
define_memmem_quickcheck_tests!(super::find, super::rfind);
}


@@ -0,0 +1,136 @@
/// A heuristic frequency-based detection of rare bytes for substring search.
///
/// This detector attempts to pick out two bytes in a needle that are predicted
/// to occur least frequently. The purpose is to use these bytes to implement
/// fast candidate search using vectorized code.
///
/// A set of offsets is only computed for needles of length 2 or greater.
/// Smaller needles should be special cased by the substring search algorithm
/// in use. (e.g., Use memchr for single byte needles.)
///
/// Note that we use `u8` to represent the offsets of the rare bytes in a
/// needle to reduce space usage. This means that a rare byte occurring after
/// the first 255 bytes in a needle will never be used.
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct RareNeedleBytes {
/// The leftmost offset of the rarest byte in the needle, according to
/// pre-computed frequency analysis. The "leftmost offset" means that
/// rare1i <= i for all i where needle[i] == needle[rare1i].
rare1i: u8,
/// The leftmost offset of the second rarest byte in the needle, according
/// to pre-computed frequency analysis. The "leftmost offset" means that
/// rare2i <= i for all i where needle[i] == needle[rare2i].
///
/// The second rarest byte is used as a type of guard for quickly detecting
/// a mismatch if the first byte matches. This is a hedge against
/// pathological cases where the pre-computed frequency analysis may be
/// off. (But of course, does not prevent *all* pathological cases.)
///
/// In general, rare1i != rare2i by construction, although there is no hard
/// requirement that they be different. However, since the case of a single
/// byte needle is handled specially by memchr itself, rare2i should
/// generally be different from rare1i, since it would otherwise be
/// ineffective as a guard.
rare2i: u8,
}
impl RareNeedleBytes {
/// Create a new pair of rare needle bytes with the given offsets. This is
/// only used in tests for generating input data.
#[cfg(all(test, feature = "std"))]
pub(crate) fn new(rare1i: u8, rare2i: u8) -> RareNeedleBytes {
RareNeedleBytes { rare1i, rare2i }
}
/// Detect the leftmost offsets of the two rarest bytes in the given
/// needle.
pub(crate) fn forward(needle: &[u8]) -> RareNeedleBytes {
if needle.len() <= 1 || needle.len() > core::u8::MAX as usize {
// For needles bigger than u8::MAX, our offsets aren't big enough.
// (We make our offsets small to reduce stack copying.)
// If you have a use case for it, please file an issue. In that
// case, we should probably just adjust the routine below to pick
// some rare bytes from the first 255 bytes of the needle.
//
// Also note that for needles of size 0 or 1, they are special
// cased in Two-Way.
//
// TODO: Benchmark this.
return RareNeedleBytes { rare1i: 0, rare2i: 0 };
}
// Find the rarest two bytes. We make them distinct by construction.
let (mut rare1, mut rare1i) = (needle[0], 0);
let (mut rare2, mut rare2i) = (needle[1], 1);
if rank(rare2) < rank(rare1) {
core::mem::swap(&mut rare1, &mut rare2);
core::mem::swap(&mut rare1i, &mut rare2i);
}
for (i, &b) in needle.iter().enumerate().skip(2) {
if rank(b) < rank(rare1) {
rare2 = rare1;
rare2i = rare1i;
rare1 = b;
rare1i = i as u8;
} else if b != rare1 && rank(b) < rank(rare2) {
rare2 = b;
rare2i = i as u8;
}
}
// While not strictly required, we really don't want these to be
// equivalent. If they were, it would reduce the effectiveness of
// candidate searching using these rare bytes by increasing the rate of
// false positives.
assert_ne!(rare1i, rare2i);
RareNeedleBytes { rare1i, rare2i }
}
/// Return the rare bytes in the given needle in the forward direction.
/// The needle given must be the same one given to the RareNeedleBytes
/// constructor.
pub(crate) fn as_rare_bytes(&self, needle: &[u8]) -> (u8, u8) {
(needle[self.rare1i as usize], needle[self.rare2i as usize])
}
/// Return the rare offsets such that the first offset is always <= to the
/// second offset. This is useful when the caller doesn't care whether
/// rare1 is rarer than rare2, but just wants to ensure that they are
/// ordered with respect to one another.
#[cfg(memchr_runtime_simd)]
pub(crate) fn as_rare_ordered_usize(&self) -> (usize, usize) {
let (rare1i, rare2i) = self.as_rare_ordered_u8();
(rare1i as usize, rare2i as usize)
}
/// Like as_rare_ordered_usize, but returns the offsets as their native
/// u8 values.
#[cfg(memchr_runtime_simd)]
pub(crate) fn as_rare_ordered_u8(&self) -> (u8, u8) {
if self.rare1i <= self.rare2i {
(self.rare1i, self.rare2i)
} else {
(self.rare2i, self.rare1i)
}
}
/// Return the rare offsets as usize values in the order in which they were
/// constructed. rare1, for example, is constructed as the "rarer" byte,
/// and thus, callers may want to treat it differently from rare2.
pub(crate) fn as_rare_usize(&self) -> (usize, usize) {
(self.rare1i as usize, self.rare2i as usize)
}
/// Return the byte frequency rank of each byte. The higher the rank, the
/// more frequent the byte is predicted to be. The needle given must be
/// the same one given to the RareNeedleBytes constructor.
pub(crate) fn as_ranks(&self, needle: &[u8]) -> (usize, usize) {
let (b1, b2) = self.as_rare_bytes(needle);
(rank(b1), rank(b2))
}
}
/// Return the heuristic frequency rank of the given byte. A lower rank
/// means the byte is believed to occur less frequently.
fn rank(b: u8) -> usize {
crate::memmem::byte_frequencies::BYTE_FREQUENCIES[b as usize] as usize
}
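// A small illustrative test sketch (not part of the upstream sources): it
// spot-checks two invariants of the construction above for a few needles --
// the two chosen offsets are distinct, and the first rare byte is never
// ranked as more frequent than the second.
#[cfg(test)]
mod rare_bytes_sketch {
    use super::{rank, RareNeedleBytes};

    #[test]
    fn rare1_never_more_frequent_than_rare2() {
        let needles = [&b"foobar"[..], &b"zzabc"[..], &b"aa"[..]];
        for needle in needles.iter().copied() {
            let rare = RareNeedleBytes::forward(needle);
            let (i1, i2) = rare.as_rare_usize();
            assert_ne!(i1, i2);
            let (b1, b2) = rare.as_rare_bytes(needle);
            assert!(rank(b1) <= rank(b2));
        }
    }
}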

View File

@@ -0,0 +1,878 @@
use core::cmp;
use crate::memmem::{prefilter::Pre, util};
/// Two-Way search in the forward direction.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Forward(TwoWay);
/// Two-Way search in the reverse direction.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Reverse(TwoWay);
/// An implementation of the TwoWay substring search algorithm, with heuristics
/// for accelerating search based on frequency analysis.
///
/// This searcher supports forward and reverse search, although not
/// simultaneously. It runs in O(n + m) time and O(1) space, where
/// `n ~ len(needle)` and `m ~ len(haystack)`.
///
/// The implementation here roughly matches that which was developed by
/// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The
/// changes in this implementation are 1) the use of zero-based indices, 2) a
/// heuristic skip table based on the last byte (borrowed from Rust's standard
/// library) and 3) the addition of heuristics for a fast skip loop. That is,
/// (3) this will detect bytes that are believed to be rare in the needle and
/// use fast vectorized instructions to find their occurrences quickly. The
/// Two-Way algorithm is then used to confirm whether a match at that location
/// occurred.
///
/// The heuristic for fast skipping is automatically shut off if it's
/// detected to be ineffective at search time. Generally, this only occurs in
/// pathological cases. But this is generally necessary in order to preserve
/// a `O(n + m)` time bound.
///
/// The code below is fairly complex and not obviously correct at all. It's
/// likely necessary to read the Two-Way paper cited above in order to fully
/// grok this code. The essence of it is:
///
/// 1) Do something to detect a "critical" position in the needle.
/// 2) For the current position in the haystack, check whether needle[critical..]
/// matches at that position.
/// 3) If so, check whether needle[..critical] matches.
/// 4) If a mismatch occurs, shift the search by some amount based on the
/// critical position and a pre-computed shift.
///
/// This type is wrapped in Forward and Reverse types that expose consistent
/// forward or reverse APIs.
#[derive(Clone, Copy, Debug)]
struct TwoWay {
/// A small bitset used as a quick prefilter (in addition to the faster
/// SIMD based prefilter). Namely, a bit 'i' is set if and only if b%64==i
/// for any b in the needle.
///
/// When used as a prefilter, if the last byte at the current candidate
/// position is NOT in this set, then we can skip that entire candidate
/// position (the length of the needle). This is essentially the shift
/// trick found in Boyer-Moore, but only applied to bytes that don't appear
/// in the needle.
///
/// N.B. This trick was inspired by something similar in std's
/// implementation of Two-Way.
byteset: ApproximateByteSet,
/// A critical position in needle. Specifically, this position corresponds
/// to the beginning of either the minimal or maximal suffix in needle. (N.B.
/// See SuffixType below for why "minimal" isn't quite the correct word
/// here.)
///
/// This is the position at which every search begins. Namely, search
/// starts by scanning text to the right of this position, and only if
/// there's a match does the text to the left of this position get scanned.
critical_pos: usize,
/// The amount we shift by in the Two-Way search algorithm. This
/// corresponds to the "small period" and "large period" cases.
shift: Shift,
}
impl Forward {
/// Create a searcher that uses the Two-Way algorithm by searching forwards
/// through any haystack.
pub(crate) fn new(needle: &[u8]) -> Forward {
if needle.is_empty() {
return Forward(TwoWay::empty());
}
let byteset = ApproximateByteSet::new(needle);
let min_suffix = Suffix::forward(needle, SuffixKind::Minimal);
let max_suffix = Suffix::forward(needle, SuffixKind::Maximal);
let (period_lower_bound, critical_pos) =
if min_suffix.pos > max_suffix.pos {
(min_suffix.period, min_suffix.pos)
} else {
(max_suffix.period, max_suffix.pos)
};
let shift = Shift::forward(needle, period_lower_bound, critical_pos);
Forward(TwoWay { byteset, critical_pos, shift })
}
/// Find the position of the first occurrence of this searcher's needle in
/// the given haystack. If one does not exist, then return None.
///
/// This accepts prefilter state that is useful when using the same
/// searcher multiple times, such as in an iterator.
///
/// Callers must guarantee that the needle is non-empty and its length is
/// <= the haystack's length.
#[inline(always)]
pub(crate) fn find(
&self,
pre: Option<&mut Pre<'_>>,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
debug_assert!(!needle.is_empty(), "needle should not be empty");
debug_assert!(needle.len() <= haystack.len(), "haystack too short");
match self.0.shift {
Shift::Small { period } => {
self.find_small_imp(pre, haystack, needle, period)
}
Shift::Large { shift } => {
self.find_large_imp(pre, haystack, needle, shift)
}
}
}
/// Like find, but handles the degenerate substring test cases. This is
/// only useful for conveniently testing this substring implementation in
/// isolation.
#[cfg(test)]
fn find_general(
&self,
pre: Option<&mut Pre<'_>>,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
if needle.is_empty() {
Some(0)
} else if haystack.len() < needle.len() {
None
} else {
self.find(pre, haystack, needle)
}
}
// Each of the two search implementations below can be accelerated by a
// prefilter, but it is not always enabled. To avoid its overhead when
// it's disabled, we explicitly inline each search implementation based on
// whether a prefilter will be used or not. The decision on which to use
// is made in the parent meta searcher.
#[inline(always)]
fn find_small_imp(
&self,
mut pre: Option<&mut Pre<'_>>,
haystack: &[u8],
needle: &[u8],
period: usize,
) -> Option<usize> {
let last_byte = needle.len() - 1;
let mut pos = 0;
let mut shift = 0;
while pos + needle.len() <= haystack.len() {
let mut i = cmp::max(self.0.critical_pos, shift);
if let Some(pre) = pre.as_mut() {
if pre.should_call() {
pos += pre.call(&haystack[pos..], needle)?;
shift = 0;
i = self.0.critical_pos;
if pos + needle.len() > haystack.len() {
return None;
}
}
}
if !self.0.byteset.contains(haystack[pos + last_byte]) {
pos += needle.len();
shift = 0;
continue;
}
while i < needle.len() && needle[i] == haystack[pos + i] {
i += 1;
}
if i < needle.len() {
pos += i - self.0.critical_pos + 1;
shift = 0;
} else {
let mut j = self.0.critical_pos;
while j > shift && needle[j] == haystack[pos + j] {
j -= 1;
}
if j <= shift && needle[shift] == haystack[pos + shift] {
return Some(pos);
}
pos += period;
shift = needle.len() - period;
}
}
None
}
#[inline(always)]
fn find_large_imp(
&self,
mut pre: Option<&mut Pre<'_>>,
haystack: &[u8],
needle: &[u8],
shift: usize,
) -> Option<usize> {
let last_byte = needle.len() - 1;
let mut pos = 0;
'outer: while pos + needle.len() <= haystack.len() {
if let Some(pre) = pre.as_mut() {
if pre.should_call() {
pos += pre.call(&haystack[pos..], needle)?;
if pos + needle.len() > haystack.len() {
return None;
}
}
}
if !self.0.byteset.contains(haystack[pos + last_byte]) {
pos += needle.len();
continue;
}
let mut i = self.0.critical_pos;
while i < needle.len() && needle[i] == haystack[pos + i] {
i += 1;
}
if i < needle.len() {
pos += i - self.0.critical_pos + 1;
} else {
for j in (0..self.0.critical_pos).rev() {
if needle[j] != haystack[pos + j] {
pos += shift;
continue 'outer;
}
}
return Some(pos);
}
}
None
}
}
impl Reverse {
/// Create a searcher that uses the Two-Way algorithm by searching in
/// reverse through any haystack.
pub(crate) fn new(needle: &[u8]) -> Reverse {
if needle.is_empty() {
return Reverse(TwoWay::empty());
}
let byteset = ApproximateByteSet::new(needle);
let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal);
let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal);
let (period_lower_bound, critical_pos) =
if min_suffix.pos < max_suffix.pos {
(min_suffix.period, min_suffix.pos)
} else {
(max_suffix.period, max_suffix.pos)
};
// let critical_pos = needle.len() - critical_pos;
let shift = Shift::reverse(needle, period_lower_bound, critical_pos);
Reverse(TwoWay { byteset, critical_pos, shift })
}
/// Find the position of the last occurrence of this searcher's needle
/// in the given haystack. If one does not exist, then return None.
///
/// Unlike the forward case, this does not accept prefilter state; reverse
/// searches do not currently use a prefilter (see the comment below).
///
/// Callers must guarantee that the needle is non-empty and its length is
/// <= the haystack's length.
#[inline(always)]
pub(crate) fn rfind(
&self,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
debug_assert!(!needle.is_empty(), "needle should not be empty");
debug_assert!(needle.len() <= haystack.len(), "haystack too short");
// For the reverse case, we don't use a prefilter. It's plausible that
// perhaps we should, but it's a lot of additional code to do it, and
// it's not clear that it's actually worth it. If you have a really
// compelling use case for this, please file an issue.
match self.0.shift {
Shift::Small { period } => {
self.rfind_small_imp(haystack, needle, period)
}
Shift::Large { shift } => {
self.rfind_large_imp(haystack, needle, shift)
}
}
}
/// Like rfind, but handles the degenerate substring test cases. This is
/// only useful for conveniently testing this substring implementation in
/// isolation.
#[cfg(test)]
fn rfind_general(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
if needle.is_empty() {
Some(haystack.len())
} else if haystack.len() < needle.len() {
None
} else {
self.rfind(haystack, needle)
}
}
#[inline(always)]
fn rfind_small_imp(
&self,
haystack: &[u8],
needle: &[u8],
period: usize,
) -> Option<usize> {
let nlen = needle.len();
let mut pos = haystack.len();
let mut shift = nlen;
while pos >= nlen {
if !self.0.byteset.contains(haystack[pos - nlen]) {
pos -= nlen;
shift = nlen;
continue;
}
let mut i = cmp::min(self.0.critical_pos, shift);
while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
i -= 1;
}
if i > 0 || needle[0] != haystack[pos - nlen] {
pos -= self.0.critical_pos - i + 1;
shift = nlen;
} else {
let mut j = self.0.critical_pos;
while j < shift && needle[j] == haystack[pos - nlen + j] {
j += 1;
}
if j >= shift {
return Some(pos - nlen);
}
pos -= period;
shift = period;
}
}
None
}
#[inline(always)]
fn rfind_large_imp(
&self,
haystack: &[u8],
needle: &[u8],
shift: usize,
) -> Option<usize> {
let nlen = needle.len();
let mut pos = haystack.len();
while pos >= nlen {
if !self.0.byteset.contains(haystack[pos - nlen]) {
pos -= nlen;
continue;
}
let mut i = self.0.critical_pos;
while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
i -= 1;
}
if i > 0 || needle[0] != haystack[pos - nlen] {
pos -= self.0.critical_pos - i + 1;
} else {
let mut j = self.0.critical_pos;
while j < nlen && needle[j] == haystack[pos - nlen + j] {
j += 1;
}
if j == nlen {
return Some(pos - nlen);
}
pos -= shift;
}
}
None
}
}
impl TwoWay {
fn empty() -> TwoWay {
TwoWay {
byteset: ApproximateByteSet::new(b""),
critical_pos: 0,
shift: Shift::Large { shift: 0 },
}
}
}
/// A representation of the amount we're allowed to shift by during Two-Way
/// search.
///
/// When computing a critical factorization of the needle, we find the position
/// of the critical factorization by finding the needle's maximal (or minimal)
/// suffix, along with the period of that suffix. It turns out that the period
/// of that suffix is a lower bound on the period of the needle itself.
///
/// This lower bound is equivalent to the actual period of the needle in
/// some cases. To describe that case, we denote the needle as `x` where
/// `x = uv` and `v` is the lexicographic maximal suffix of `x`. The lower
/// bound given here is always the period of `v`, which is `<= period(x)`. The
/// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and
/// where `u` is a suffix of `v[0..period(v)]`.
///
/// This case is important because the search algorithm for when the
/// periods are equivalent is slightly different than the search algorithm
/// for when the periods are not equivalent. In particular, when they aren't
/// equivalent, we know that the period of the needle is no less than half its
/// length. In this case, we shift by an amount less than or equal to the
/// period of the needle (determined by the maximum length of the components
/// of the critical factorization of `x`, i.e., `max(len(u), len(v))`).
///
/// The above two cases are represented by the variants below. Each entails
/// a different instantiation of the Two-Way search algorithm.
///
/// N.B. If we could find a way to compute the exact period in all cases,
/// then we could collapse this case analysis and simplify the algorithm. The
/// Two-Way paper suggests this is possible, but more reading is required to
/// grok why the authors didn't pursue that path.
#[derive(Clone, Copy, Debug)]
enum Shift {
Small { period: usize },
Large { shift: usize },
}
impl Shift {
/// Compute the shift for a given needle in the forward direction.
///
/// This requires a lower bound on the period and a critical position.
/// These can be computed by extracting both the minimal and maximal
/// lexicographic suffixes, and choosing the right-most starting position.
/// The lower bound on the period is then the period of the chosen suffix.
fn forward(
needle: &[u8],
period_lower_bound: usize,
critical_pos: usize,
) -> Shift {
let large = cmp::max(critical_pos, needle.len() - critical_pos);
if critical_pos * 2 >= needle.len() {
return Shift::Large { shift: large };
}
let (u, v) = needle.split_at(critical_pos);
if !util::is_suffix(&v[..period_lower_bound], u) {
return Shift::Large { shift: large };
}
Shift::Small { period: period_lower_bound }
}
/// Compute the shift for a given needle in the reverse direction.
///
/// This requires a lower bound on the period and a critical position.
/// These can be computed by extracting both the minimal and maximal
/// lexicographic suffixes, and choosing the left-most starting position.
/// The lower bound on the period is then the period of the chosen suffix.
fn reverse(
needle: &[u8],
period_lower_bound: usize,
critical_pos: usize,
) -> Shift {
let large = cmp::max(critical_pos, needle.len() - critical_pos);
if (needle.len() - critical_pos) * 2 >= needle.len() {
return Shift::Large { shift: large };
}
let (v, u) = needle.split_at(critical_pos);
if !util::is_prefix(&v[v.len() - period_lower_bound..], u) {
return Shift::Large { shift: large };
}
Shift::Small { period: period_lower_bound }
}
}
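// A small illustrative test sketch (not part of the upstream sources): it
// walks through the shift computation for the needle "abcabc". Its maximal
// suffix "cabc" starts to the right of its minimal suffix, so the critical
// position 2 and period lower bound 3 are chosen, and because "ab" is a
// suffix of "cab" the small-period case applies with the needle's true
// period of 3. The selection step mirrors what Forward::new does above.
#[cfg(test)]
mod shift_sketch {
    use super::{Shift, Suffix, SuffixKind};

    #[test]
    fn forward_shift_small_period() {
        let needle = b"abcabc";
        let min = Suffix::forward(needle, SuffixKind::Minimal);
        let max = Suffix::forward(needle, SuffixKind::Maximal);
        // Pick whichever suffix starts later in the needle.
        let (period_lower_bound, critical_pos) = if min.pos > max.pos {
            (min.period, min.pos)
        } else {
            (max.period, max.pos)
        };
        assert_eq!((3, 2), (period_lower_bound, critical_pos));
        match Shift::forward(needle, period_lower_bound, critical_pos) {
            Shift::Small { period } => assert_eq!(3, period),
            Shift::Large { .. } => panic!("expected the small period case"),
        }
    }
}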
/// A suffix extracted from a needle along with its period.
#[derive(Debug)]
struct Suffix {
/// The starting position of this suffix.
///
/// If this is a forward suffix, then `&bytes[pos..]` can be used. If this
/// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for
/// forward suffixes, this is an inclusive starting position, whereas for
/// reverse suffixes, this is an exclusive ending position.
pos: usize,
/// The period of this suffix.
///
/// Note that this is NOT necessarily the period of the string from which
/// this suffix comes. (It is always less than or equal to the period
/// of the original string.)
period: usize,
}
impl Suffix {
fn forward(needle: &[u8], kind: SuffixKind) -> Suffix {
debug_assert!(!needle.is_empty());
// suffix represents our maximal (or minimal) suffix, along with
// its period.
let mut suffix = Suffix { pos: 0, period: 1 };
// The start of a suffix in `needle` that we are considering as a
// more maximal (or minimal) suffix than what's in `suffix`.
let mut candidate_start = 1;
// The current offset of our suffixes that we're comparing.
//
// When the characters at this offset are the same, then we mush on
// to the next position since no decision is possible. When the
// candidate's character is greater (or lesser) than the corresponding
// character of our current maximal (or minimal) suffix, then the
// current suffix is changed over to the candidate and we restart our
// search. Otherwise, the candidate suffix is no good and we restart
// our search on the next candidate.
//
// The three cases above correspond to the three cases in the loop
// below.
let mut offset = 0;
while candidate_start + offset < needle.len() {
let current = needle[suffix.pos + offset];
let candidate = needle[candidate_start + offset];
match kind.cmp(current, candidate) {
SuffixOrdering::Accept => {
suffix = Suffix { pos: candidate_start, period: 1 };
candidate_start += 1;
offset = 0;
}
SuffixOrdering::Skip => {
candidate_start += offset + 1;
offset = 0;
suffix.period = candidate_start - suffix.pos;
}
SuffixOrdering::Push => {
if offset + 1 == suffix.period {
candidate_start += suffix.period;
offset = 0;
} else {
offset += 1;
}
}
}
}
suffix
}
fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix {
debug_assert!(!needle.is_empty());
// See the comments in `forward` for how this works.
let mut suffix = Suffix { pos: needle.len(), period: 1 };
if needle.len() == 1 {
return suffix;
}
let mut candidate_start = needle.len() - 1;
let mut offset = 0;
while offset < candidate_start {
let current = needle[suffix.pos - offset - 1];
let candidate = needle[candidate_start - offset - 1];
match kind.cmp(current, candidate) {
SuffixOrdering::Accept => {
suffix = Suffix { pos: candidate_start, period: 1 };
candidate_start -= 1;
offset = 0;
}
SuffixOrdering::Skip => {
candidate_start -= offset + 1;
offset = 0;
suffix.period = suffix.pos - candidate_start;
}
SuffixOrdering::Push => {
if offset + 1 == suffix.period {
candidate_start -= suffix.period;
offset = 0;
} else {
offset += 1;
}
}
}
}
suffix
}
}
/// The kind of suffix to extract.
#[derive(Clone, Copy, Debug)]
enum SuffixKind {
/// Extract the smallest lexicographic suffix from a string.
///
/// Technically, this doesn't actually pick the smallest lexicographic
/// suffix. e.g., Given the choice between `a` and `aa`, this will choose
/// the latter over the former, even though `a < aa`. The reasoning for
/// this isn't clear from the paper, but it still smells like a minimal
/// suffix.
Minimal,
/// Extract the largest lexicographic suffix from a string.
///
/// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given
/// the choice between `z` and `zz`, this will choose the latter over the
/// former.
Maximal,
}
/// The result of comparing corresponding bytes between two suffixes.
#[derive(Clone, Copy, Debug)]
enum SuffixOrdering {
/// This occurs when the given candidate byte indicates that the candidate
/// suffix is better than the current maximal (or minimal) suffix. That is,
/// the current candidate suffix should supplant the current maximal (or
/// minimal) suffix.
Accept,
/// This occurs when the given candidate byte excludes the candidate suffix
/// from being better than the current maximal (or minimal) suffix. That
/// is, the current candidate suffix should be dropped and the next one
/// should be considered.
Skip,
/// This occurs when no decision to accept or skip the candidate suffix
/// can be made, e.g., when corresponding bytes are equivalent. In this
/// case, the next corresponding bytes should be compared.
Push,
}
impl SuffixKind {
/// Compare the given candidate byte against the corresponding byte of the
/// current maximal (or minimal) suffix, and return whether the candidate
/// suffix should be accepted, skipped, or the decision deferred.
fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering {
use self::SuffixOrdering::*;
match self {
SuffixKind::Minimal if candidate < current => Accept,
SuffixKind::Minimal if candidate > current => Skip,
SuffixKind::Minimal => Push,
SuffixKind::Maximal if candidate > current => Accept,
SuffixKind::Maximal if candidate < current => Skip,
SuffixKind::Maximal => Push,
}
}
}
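// A small illustrative test sketch (not part of the upstream sources): for
// the maximal-suffix scan, a larger candidate byte accepts the candidate
// suffix, a smaller one skips it, and an equal byte defers the decision to
// the next offset.
#[cfg(test)]
mod suffix_kind_sketch {
    use super::{SuffixKind, SuffixOrdering};

    #[test]
    fn maximal_ordering() {
        assert!(matches!(
            SuffixKind::Maximal.cmp(b'a', b'b'),
            SuffixOrdering::Accept
        ));
        assert!(matches!(
            SuffixKind::Maximal.cmp(b'b', b'a'),
            SuffixOrdering::Skip
        ));
        assert!(matches!(
            SuffixKind::Maximal.cmp(b'a', b'a'),
            SuffixOrdering::Push
        ));
    }
}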
/// A bitset used to track whether a particular byte exists in a needle or not.
///
/// Namely, bit 'i' is set if and only if byte%64==i for any byte in the
/// needle. If a particular byte in the haystack is NOT in this set, then one
/// can conclude that it is also not in the needle, and thus, one can advance
/// in the haystack by needle.len() bytes.
#[derive(Clone, Copy, Debug)]
struct ApproximateByteSet(u64);
impl ApproximateByteSet {
/// Create a new set from the given needle.
fn new(needle: &[u8]) -> ApproximateByteSet {
let mut bits = 0;
for &b in needle {
bits |= 1 << (b % 64);
}
ApproximateByteSet(bits)
}
/// Return true if and only if the given byte might be in this set. This
/// may return a false positive, but will never return a false negative.
#[inline(always)]
fn contains(&self, byte: u8) -> bool {
self.0 & (1 << (byte % 64)) != 0
}
}
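// A small illustrative test sketch (not part of the upstream sources): the
// byteset can yield false positives for bytes that collide modulo 64, but
// never false negatives for bytes actually present in the needle.
#[cfg(test)]
mod byteset_sketch {
    use super::ApproximateByteSet;

    #[test]
    fn false_positives_but_no_false_negatives() {
        let set = ApproximateByteSet::new(b"abc");
        // Every needle byte is (correctly) reported as possibly present.
        assert!(set.contains(b'a'));
        assert!(set.contains(b'b'));
        assert!(set.contains(b'c'));
        // 0x21 ('!') collides with b'a' (0x61) modulo 64, so this is a
        // false positive by construction.
        assert!(set.contains(0x21));
        // b'z' maps to bit 58, which no needle byte sets.
        assert!(!set.contains(b'z'));
    }
}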
#[cfg(all(test, feature = "std", not(miri)))]
mod tests {
use quickcheck::quickcheck;
use super::*;
define_memmem_quickcheck_tests!(
super::simpletests::twoway_find,
super::simpletests::twoway_rfind
);
/// Convenience wrapper for computing the suffix as a byte string.
fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
let s = Suffix::forward(needle, kind);
(&needle[s.pos..], s.period)
}
/// Convenience wrapper for computing the reverse suffix as a byte string.
fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
let s = Suffix::reverse(needle, kind);
(&needle[..s.pos], s.period)
}
/// Return all of the non-empty suffixes in the given byte string.
fn suffixes(bytes: &[u8]) -> Vec<&[u8]> {
(0..bytes.len()).map(|i| &bytes[i..]).collect()
}
/// Return the lexicographically maximal suffix of the given byte string.
fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] {
let mut sufs = suffixes(needle);
sufs.sort();
sufs.pop().unwrap()
}
/// Return the lexicographically maximal suffix of the reverse of the given
/// byte string.
fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> {
let mut reversed = needle.to_vec();
reversed.reverse();
let mut got = naive_maximal_suffix_forward(&reversed).to_vec();
got.reverse();
got
}
#[test]
fn suffix_forward() {
macro_rules! assert_suffix_min {
($given:expr, $expected:expr, $period:expr) => {
let (got_suffix, got_period) =
get_suffix_forward($given.as_bytes(), SuffixKind::Minimal);
let got_suffix = std::str::from_utf8(got_suffix).unwrap();
assert_eq!(($expected, $period), (got_suffix, got_period));
};
}
macro_rules! assert_suffix_max {
($given:expr, $expected:expr, $period:expr) => {
let (got_suffix, got_period) =
get_suffix_forward($given.as_bytes(), SuffixKind::Maximal);
let got_suffix = std::str::from_utf8(got_suffix).unwrap();
assert_eq!(($expected, $period), (got_suffix, got_period));
};
}
assert_suffix_min!("a", "a", 1);
assert_suffix_max!("a", "a", 1);
assert_suffix_min!("ab", "ab", 2);
assert_suffix_max!("ab", "b", 1);
assert_suffix_min!("ba", "a", 1);
assert_suffix_max!("ba", "ba", 2);
assert_suffix_min!("abc", "abc", 3);
assert_suffix_max!("abc", "c", 1);
assert_suffix_min!("acb", "acb", 3);
assert_suffix_max!("acb", "cb", 2);
assert_suffix_min!("cba", "a", 1);
assert_suffix_max!("cba", "cba", 3);
assert_suffix_min!("abcabc", "abcabc", 3);
assert_suffix_max!("abcabc", "cabc", 3);
assert_suffix_min!("abcabcabc", "abcabcabc", 3);
assert_suffix_max!("abcabcabc", "cabcabc", 3);
assert_suffix_min!("abczz", "abczz", 5);
assert_suffix_max!("abczz", "zz", 1);
assert_suffix_min!("zzabc", "abc", 3);
assert_suffix_max!("zzabc", "zzabc", 5);
assert_suffix_min!("aaa", "aaa", 1);
assert_suffix_max!("aaa", "aaa", 1);
assert_suffix_min!("foobar", "ar", 2);
assert_suffix_max!("foobar", "r", 1);
}
#[test]
fn suffix_reverse() {
macro_rules! assert_suffix_min {
($given:expr, $expected:expr, $period:expr) => {
let (got_suffix, got_period) =
get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal);
let got_suffix = std::str::from_utf8(got_suffix).unwrap();
assert_eq!(($expected, $period), (got_suffix, got_period));
};
}
macro_rules! assert_suffix_max {
($given:expr, $expected:expr, $period:expr) => {
let (got_suffix, got_period) =
get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal);
let got_suffix = std::str::from_utf8(got_suffix).unwrap();
assert_eq!(($expected, $period), (got_suffix, got_period));
};
}
assert_suffix_min!("a", "a", 1);
assert_suffix_max!("a", "a", 1);
assert_suffix_min!("ab", "a", 1);
assert_suffix_max!("ab", "ab", 2);
assert_suffix_min!("ba", "ba", 2);
assert_suffix_max!("ba", "b", 1);
assert_suffix_min!("abc", "a", 1);
assert_suffix_max!("abc", "abc", 3);
assert_suffix_min!("acb", "a", 1);
assert_suffix_max!("acb", "ac", 2);
assert_suffix_min!("cba", "cba", 3);
assert_suffix_max!("cba", "c", 1);
assert_suffix_min!("abcabc", "abca", 3);
assert_suffix_max!("abcabc", "abcabc", 3);
assert_suffix_min!("abcabcabc", "abcabca", 3);
assert_suffix_max!("abcabcabc", "abcabcabc", 3);
assert_suffix_min!("abczz", "a", 1);
assert_suffix_max!("abczz", "abczz", 5);
assert_suffix_min!("zzabc", "zza", 3);
assert_suffix_max!("zzabc", "zz", 1);
assert_suffix_min!("aaa", "aaa", 1);
assert_suffix_max!("aaa", "aaa", 1);
}
quickcheck! {
fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool {
if bytes.is_empty() {
return true;
}
let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal);
let expected = naive_maximal_suffix_forward(&bytes);
got == expected
}
fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool {
if bytes.is_empty() {
return true;
}
let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal);
let expected = naive_maximal_suffix_reverse(&bytes);
expected == got
}
}
}
#[cfg(test)]
mod simpletests {
use super::*;
pub(crate) fn twoway_find(
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
Forward::new(needle).find_general(None, haystack, needle)
}
pub(crate) fn twoway_rfind(
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
Reverse::new(needle).rfind_general(haystack, needle)
}
define_memmem_simple_tests!(twoway_find, twoway_rfind);
// This is a regression test caught by quickcheck that exercised a bug in
// the reverse small period handling. The bug was that we were using 'if j
// == shift' to determine if a match occurred, but the correct guard is 'if
// j >= shift', which matches the corresponding guard in the forward impl.
#[test]
fn regression_rev_small_period() {
let rfind = super::simpletests::twoway_rfind;
let haystack = "ababaz";
let needle = "abab";
assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes()));
}
}

View File

@@ -0,0 +1,88 @@
// These routines are meant to be optimized specifically for low latency as
// compared to the equivalent routines offered by std. (Which may invoke the
// dynamic linker and call out to libc, which introduces a bit more latency
// than we'd like.)
/// Returns true if and only if needle is a prefix of haystack.
#[inline(always)]
pub(crate) fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {
needle.len() <= haystack.len() && memcmp(&haystack[..needle.len()], needle)
}
/// Returns true if and only if needle is a suffix of haystack.
#[inline(always)]
pub(crate) fn is_suffix(haystack: &[u8], needle: &[u8]) -> bool {
needle.len() <= haystack.len()
&& memcmp(&haystack[haystack.len() - needle.len()..], needle)
}
/// Return true if and only if x.len() == y.len() && x[i] == y[i] for all
/// 0 <= i < x.len().
///
/// Why not just use actual memcmp for this? Well, memcmp requires calling out
/// to libc, and this routine is called in fairly hot code paths. Other than
/// just calling out to libc, it also seems to result in worse codegen. By
/// rolling our own memcmp in pure Rust, it seems to appear more friendly to
/// the optimizer.
///
/// We mark this as inline always, although some callers may not want it
/// inlined for better codegen (like Rabin-Karp). In that case, callers are
/// advised to create a non-inlineable wrapper routine that calls memcmp.
#[inline(always)]
pub(crate) fn memcmp(x: &[u8], y: &[u8]) -> bool {
if x.len() != y.len() {
return false;
}
// If we don't have enough bytes to do 4-byte at a time loads, then
// fall back to the naive slow version.
//
// TODO: We could do a copy_nonoverlapping combined with a mask instead
// of a loop. Benchmark it.
if x.len() < 4 {
for (&b1, &b2) in x.iter().zip(y) {
if b1 != b2 {
return false;
}
}
return true;
}
// When we have 4 or more bytes to compare, then proceed in chunks of 4 at
// a time using unaligned loads.
//
// Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is
// that this particular version of memcmp is likely to be called with tiny
// needles. That means that if we do 8 byte loads, then a higher proportion
// of memcmp calls will use the slower variant above. With that said, this
// is a hypothesis and is only loosely supported by benchmarks. There's
// likely some improvement that could be made here. The main thing here
// though is to optimize for latency, not throughput.
// SAFETY: Via the conditional above, we know that both `x` and `y`
// have the same length, so `px < pxend` implies that `py < pyend`.
// Thus, dereferencing both `px` and `py` in the loop below is safe.
//
// Moreover, we set `pxend` and `pyend` to be 4 bytes before the actual
// ends of `x` and `y`. Thus, the final dereference outside of the
// loop is guaranteed to be valid. (The final comparison will overlap with
// the last comparison done in the loop for lengths that aren't multiples
// of four.)
//
// Finally, we needn't worry about alignment here, since we do unaligned
// loads.
unsafe {
let (mut px, mut py) = (x.as_ptr(), y.as_ptr());
let (pxend, pyend) = (px.add(x.len() - 4), py.add(y.len() - 4));
while px < pxend {
let vx = (px as *const u32).read_unaligned();
let vy = (py as *const u32).read_unaligned();
if vx != vy {
return false;
}
px = px.add(4);
py = py.add(4);
}
let vx = (pxend as *const u32).read_unaligned();
let vy = (pyend as *const u32).read_unaligned();
vx == vy
}
}
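// A small illustrative test sketch (not part of the upstream sources): it
// exercises each path of the memcmp above -- mismatched lengths, the
// byte-at-a-time path for inputs shorter than 4 bytes, and the 4-byte-chunk
// path, including the overlapping final load for lengths that are not a
// multiple of 4.
#[cfg(test)]
mod memcmp_sketch {
    use super::memcmp;

    #[test]
    fn covers_every_path() {
        // Different lengths are never equal.
        assert!(!memcmp(b"abc", b"abcd"));
        // Short inputs take the byte-at-a-time path.
        assert!(memcmp(b"abc", b"abc"));
        assert!(!memcmp(b"abc", b"abd"));
        // Seven bytes: one full 4-byte chunk plus an overlapping tail load.
        assert!(memcmp(b"abcdefg", b"abcdefg"));
        assert!(!memcmp(b"abcdefg", b"abcdefz"));
    }
}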

View File

@@ -0,0 +1,131 @@
/// A trait for describing vector operations used by vectorized searchers.
///
/// The trait is highly constrained to the low-level vector operations needed. In
/// general, it was invented mostly to be generic over x86's __m128i and
/// __m256i types. It's likely that once std::simd becomes a thing, we can
/// migrate to that since the operations required are quite simple.
///
/// TODO: Consider moving this trait up a level and using it to implement
/// memchr as well. The trait might need to grow one or two methods, but
/// otherwise should be close to sufficient already.
///
/// # Safety
///
/// All methods are unsafe since they are intended to be implemented using
/// vendor intrinsics, which are also unsafe. Callers must ensure that the
/// appropriate target features are enabled in the calling function, and that
/// the current CPU supports them. All implementations should avoid marking the
/// routines with #[target_feature] and instead mark them as #[inline(always)]
/// to ensure they get appropriately inlined. (inline(always) cannot be used
/// with target_feature.)
pub(crate) trait Vector: Copy + core::fmt::Debug {
/// _mm_set1_epi8 or _mm256_set1_epi8
unsafe fn splat(byte: u8) -> Self;
/// _mm_loadu_si128 or _mm256_loadu_si256
unsafe fn load_unaligned(data: *const u8) -> Self;
/// _mm_movemask_epi8 or _mm256_movemask_epi8
unsafe fn movemask(self) -> u32;
/// _mm_cmpeq_epi8 or _mm256_cmpeq_epi8
unsafe fn cmpeq(self, vector2: Self) -> Self;
/// _mm_and_si128 or _mm256_and_si256
unsafe fn and(self, vector2: Self) -> Self;
}
#[cfg(target_arch = "x86_64")]
mod x86sse {
use super::Vector;
use core::arch::x86_64::*;
impl Vector for __m128i {
#[inline(always)]
unsafe fn splat(byte: u8) -> __m128i {
_mm_set1_epi8(byte as i8)
}
#[inline(always)]
unsafe fn load_unaligned(data: *const u8) -> __m128i {
_mm_loadu_si128(data as *const __m128i)
}
#[inline(always)]
unsafe fn movemask(self) -> u32 {
_mm_movemask_epi8(self) as u32
}
#[inline(always)]
unsafe fn cmpeq(self, vector2: Self) -> __m128i {
_mm_cmpeq_epi8(self, vector2)
}
#[inline(always)]
unsafe fn and(self, vector2: Self) -> __m128i {
_mm_and_si128(self, vector2)
}
}
}
#[cfg(all(feature = "std", target_arch = "x86_64"))]
mod x86avx {
use super::Vector;
use core::arch::x86_64::*;
impl Vector for __m256i {
#[inline(always)]
unsafe fn splat(byte: u8) -> __m256i {
_mm256_set1_epi8(byte as i8)
}
#[inline(always)]
unsafe fn load_unaligned(data: *const u8) -> __m256i {
_mm256_loadu_si256(data as *const __m256i)
}
#[inline(always)]
unsafe fn movemask(self) -> u32 {
_mm256_movemask_epi8(self) as u32
}
#[inline(always)]
unsafe fn cmpeq(self, vector2: Self) -> __m256i {
_mm256_cmpeq_epi8(self, vector2)
}
#[inline(always)]
unsafe fn and(self, vector2: Self) -> __m256i {
_mm256_and_si256(self, vector2)
}
}
}
#[cfg(target_arch = "wasm32")]
mod wasm_simd128 {
use super::Vector;
use core::arch::wasm32::*;
impl Vector for v128 {
#[inline(always)]
unsafe fn splat(byte: u8) -> v128 {
u8x16_splat(byte)
}
#[inline(always)]
unsafe fn load_unaligned(data: *const u8) -> v128 {
v128_load(data.cast())
}
#[inline(always)]
unsafe fn movemask(self) -> u32 {
u8x16_bitmask(self).into()
}
#[inline(always)]
unsafe fn cmpeq(self, vector2: Self) -> v128 {
u8x16_eq(self, vector2)
}
#[inline(always)]
unsafe fn and(self, vector2: Self) -> v128 {
v128_and(self, vector2)
}
}
}
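// A small illustrative test sketch (not part of the upstream sources): on
// x86_64, sse2 is part of the baseline, so the __m128i implementation above
// can be exercised directly. This shows the usual splat/cmpeq/movemask
// pattern for locating a byte within a 16-byte chunk.
#[cfg(all(test, target_arch = "x86_64"))]
mod vector_sketch {
    use super::Vector;
    use core::arch::x86_64::__m128i;

    #[test]
    fn find_byte_in_chunk() {
        let chunk = *b"quick brown fox!";
        // SAFETY: sse2 is always available on x86_64.
        let offset = unsafe {
            let haystack = __m128i::load_unaligned(chunk.as_ptr());
            let needle = __m128i::splat(b'b');
            // Each matching lane sets one bit in the mask; the lowest set
            // bit gives the offset of the first match.
            let mask = haystack.cmpeq(needle).movemask();
            mask.trailing_zeros() as usize
        };
        assert_eq!(6, offset);
    }
}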

View File

@@ -0,0 +1,75 @@
use core::arch::wasm32::v128;
use crate::memmem::{genericsimd, NeedleInfo};
/// A `v128` accelerated vectorized substring search routine that only works on
/// small needles.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Forward(genericsimd::Forward);
impl Forward {
/// Create a new "generic simd" forward searcher. If one could not be
/// created from the given inputs, then None is returned.
pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option<Forward> {
if !cfg!(memchr_runtime_simd) {
return None;
}
genericsimd::Forward::new(ninfo, needle).map(Forward)
}
/// Returns the minimum length of haystack that is needed for this searcher
/// to work. Passing a haystack with a length smaller than this will cause
/// `find` to panic.
#[inline(always)]
pub(crate) fn min_haystack_len(&self) -> usize {
self.0.min_haystack_len::<v128>()
}
#[inline(always)]
pub(crate) fn find(
&self,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
self.find_impl(haystack, needle)
}
/// The implementation of find marked with the appropriate target feature.
#[target_feature(enable = "simd128")]
fn find_impl(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
unsafe { genericsimd::fwd_find::<v128>(&self.0, haystack, needle) }
}
}
#[cfg(all(test, feature = "std", not(miri)))]
mod tests {
use crate::memmem::{prefilter::PrefilterState, NeedleInfo};
fn find(
_: &mut PrefilterState,
ninfo: &NeedleInfo,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
super::Forward::new(ninfo, needle).unwrap().find(haystack, needle)
}
#[test]
fn prefilter_permutations() {
use crate::memmem::prefilter::tests::PrefilterTest;
unsafe {
PrefilterTest::run_all_tests_filter(find, |t| {
// This substring searcher only works on certain configs, so
// filter our tests such that Forward::new will be guaranteed
// to succeed. (And also remove tests with a haystack that is
// too small.)
let fwd = match super::Forward::new(&t.ninfo, &t.needle) {
None => return false,
Some(fwd) => fwd,
};
t.haystack.len() >= fwd.min_haystack_len()
})
}
}
}

View File

@@ -0,0 +1,139 @@
#[cfg(not(feature = "std"))]
pub(crate) use self::nostd::Forward;
#[cfg(feature = "std")]
pub(crate) use self::std::Forward;
#[cfg(feature = "std")]
mod std {
use core::arch::x86_64::{__m128i, __m256i};
use crate::memmem::{genericsimd, NeedleInfo};
/// An AVX accelerated vectorized substring search routine that only works
/// on small needles.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Forward(genericsimd::Forward);
impl Forward {
/// Create a new "generic simd" forward searcher. If one could not be
/// created from the given inputs, then None is returned.
pub(crate) fn new(
ninfo: &NeedleInfo,
needle: &[u8],
) -> Option<Forward> {
if !cfg!(memchr_runtime_avx) || !is_x86_feature_detected!("avx2") {
return None;
}
genericsimd::Forward::new(ninfo, needle).map(Forward)
}
/// Returns the minimum length of haystack that is needed for this
/// searcher to work. Passing a haystack with a length smaller than
/// this will cause `find` to panic.
#[inline(always)]
pub(crate) fn min_haystack_len(&self) -> usize {
self.0.min_haystack_len::<__m128i>()
}
#[inline(always)]
pub(crate) fn find(
&self,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
// SAFETY: The only way a Forward value can exist is if the avx2
// target feature is enabled. This is the only safety requirement
// for calling the genericsimd searcher.
unsafe { self.find_impl(haystack, needle) }
}
/// The implementation of find marked with the appropriate target
/// feature.
///
/// # Safety
///
/// Callers must ensure that the avx2 CPU feature is enabled in the
/// current environment.
#[target_feature(enable = "avx2")]
unsafe fn find_impl(
&self,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
if haystack.len() < self.0.min_haystack_len::<__m256i>() {
genericsimd::fwd_find::<__m128i>(&self.0, haystack, needle)
} else {
genericsimd::fwd_find::<__m256i>(&self.0, haystack, needle)
}
}
}
}
// We still define the avx "forward" type on nostd to make caller code a bit
// simpler. This avoids needing a lot more conditional compilation.
#[cfg(not(feature = "std"))]
mod nostd {
use crate::memmem::NeedleInfo;
#[derive(Clone, Copy, Debug)]
pub(crate) struct Forward(());
impl Forward {
pub(crate) fn new(
ninfo: &NeedleInfo,
needle: &[u8],
) -> Option<Forward> {
None
}
pub(crate) fn min_haystack_len(&self) -> usize {
unreachable!()
}
pub(crate) fn find(
&self,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
unreachable!()
}
}
}
#[cfg(all(test, feature = "std", not(miri)))]
mod tests {
use crate::memmem::{prefilter::PrefilterState, NeedleInfo};
fn find(
_: &mut PrefilterState,
ninfo: &NeedleInfo,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
super::Forward::new(ninfo, needle).unwrap().find(haystack, needle)
}
#[test]
fn prefilter_permutations() {
use crate::memmem::prefilter::tests::PrefilterTest;
if !is_x86_feature_detected!("avx2") {
return;
}
// SAFETY: The safety of find only requires that the current CPU
// support AVX2, which we checked above.
unsafe {
PrefilterTest::run_all_tests_filter(find, |t| {
// This substring searcher only works on certain configs, so
// filter our tests such that Forward::new will be guaranteed
// to succeed. (And also remove tests with a haystack that is
// too small.)
let fwd = match super::Forward::new(&t.ninfo, &t.needle) {
None => return false,
Some(fwd) => fwd,
};
t.haystack.len() >= fwd.min_haystack_len()
})
}
}
}

View File

@@ -0,0 +1,2 @@
pub(crate) mod avx;
pub(crate) mod sse;

View File

@@ -0,0 +1,89 @@
use core::arch::x86_64::__m128i;
use crate::memmem::{genericsimd, NeedleInfo};
/// An SSE accelerated vectorized substring search routine that only works on
/// small needles.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Forward(genericsimd::Forward);
impl Forward {
/// Create a new "generic simd" forward searcher. If one could not be
/// created from the given inputs, then None is returned.
pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option<Forward> {
if !cfg!(memchr_runtime_sse2) {
return None;
}
genericsimd::Forward::new(ninfo, needle).map(Forward)
}
/// Returns the minimum length of haystack that is needed for this searcher
/// to work. Passing a haystack with a length smaller than this will cause
/// `find` to panic.
#[inline(always)]
pub(crate) fn min_haystack_len(&self) -> usize {
self.0.min_haystack_len::<__m128i>()
}
#[inline(always)]
pub(crate) fn find(
&self,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
// SAFETY: sse2 is enabled on all x86_64 targets, so this is always
// safe to call.
unsafe { self.find_impl(haystack, needle) }
}
/// The implementation of find marked with the appropriate target feature.
///
/// # Safety
///
/// This is safe to call in all cases since sse2 is guaranteed to be part
/// of x86_64. It is marked as unsafe because of the target feature
/// attribute.
#[target_feature(enable = "sse2")]
unsafe fn find_impl(
&self,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
genericsimd::fwd_find::<__m128i>(&self.0, haystack, needle)
}
}
#[cfg(all(test, feature = "std", not(miri)))]
mod tests {
use crate::memmem::{prefilter::PrefilterState, NeedleInfo};
fn find(
_: &mut PrefilterState,
ninfo: &NeedleInfo,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
super::Forward::new(ninfo, needle).unwrap().find(haystack, needle)
}
#[test]
fn prefilter_permutations() {
use crate::memmem::prefilter::tests::PrefilterTest;
// SAFETY: sse2 is enabled on all x86_64 targets, so this is always
// safe to call.
unsafe {
PrefilterTest::run_all_tests_filter(find, |t| {
// This substring searcher only works on certain configs, so
// filter our tests such that Forward::new will be guaranteed
// to succeed. (And also remove tests with a haystack that is
// too small.)
let fwd = match super::Forward::new(&t.ninfo, &t.needle) {
None => return false,
Some(fwd) => fwd,
};
t.haystack.len() >= fwd.min_haystack_len()
})
}
}
}