Update libclamav library to version 1.0.0

2023-01-14 18:28:39 +08:00
parent b879ee0b2e
commit 45fe15f472
8531 changed files with 1222046 additions and 177272 deletions


@@ -0,0 +1,234 @@
#[test]
fn empty_regex_empty_match() {
let re = regex!("");
assert_eq!(vec![(0, 0)], findall!(re, ""));
}
#[test]
fn empty_regex_nonempty_match() {
let re = regex!("");
assert_eq!(vec![(0, 0), (1, 1), (2, 2), (3, 3)], findall!(re, "abc"));
}
#[test]
fn one_zero_length_match() {
let re = regex!(r"[0-9]*");
assert_eq!(vec![(0, 0), (1, 2), (3, 4)], findall!(re, "a1b2"));
}
#[test]
fn many_zero_length_match() {
let re = regex!(r"[0-9]*");
assert_eq!(
vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)],
findall!(re, "a1bbb2")
);
}
#[test]
fn many_sequential_zero_length_match() {
let re = regex!(r"[0-9]?");
assert_eq!(
vec![(0, 0), (1, 2), (2, 3), (4, 5), (6, 6)],
findall!(re, "a12b3c")
);
}
#[test]
fn quoted_bracket_set() {
let re = regex!(r"([\x{5b}\x{5d}])");
assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]"));
let re = regex!(r"([\[\]])");
assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]"));
}
#[test]
fn first_range_starts_with_left_bracket() {
let re = regex!(r"([\[-z])");
assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]"));
}
#[test]
fn range_ends_with_escape() {
let re = regex!(r"([\[-\x{5d}])");
assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]"));
}
#[test]
fn empty_match_find_iter() {
let re = regex!(r".*?");
assert_eq!(vec![(0, 0), (1, 1), (2, 2), (3, 3)], findall!(re, "abc"));
}
#[test]
fn empty_match_captures_iter() {
let re = regex!(r".*?");
let ms: Vec<_> = re
.captures_iter(text!("abc"))
.map(|c| c.get(0).unwrap())
.map(|m| (m.start(), m.end()))
.collect();
assert_eq!(ms, vec![(0, 0), (1, 1), (2, 2), (3, 3)]);
}
#[test]
fn capture_names() {
let re = regex!(r"(.)(?P<a>.)");
assert_eq!(3, re.captures_len());
assert_eq!((3, Some(3)), re.capture_names().size_hint());
assert_eq!(
vec![None, None, Some("a")],
re.capture_names().collect::<Vec<_>>()
);
}
#[test]
fn regex_string() {
assert_eq!(r"[a-zA-Z0-9]+", regex!(r"[a-zA-Z0-9]+").as_str());
assert_eq!(r"[a-zA-Z0-9]+", &format!("{}", regex!(r"[a-zA-Z0-9]+")));
assert_eq!(r"[a-zA-Z0-9]+", &format!("{:?}", regex!(r"[a-zA-Z0-9]+")));
}
#[test]
fn capture_index() {
let re = regex!(r"^(?P<name>.+)$");
let cap = re.captures(t!("abc")).unwrap();
assert_eq!(&cap[0], t!("abc"));
assert_eq!(&cap[1], t!("abc"));
assert_eq!(&cap["name"], t!("abc"));
}
#[test]
#[should_panic]
#[cfg_attr(all(target_env = "msvc", target_pointer_width = "32"), ignore)]
fn capture_index_panic_usize() {
let re = regex!(r"^(?P<name>.+)$");
let cap = re.captures(t!("abc")).unwrap();
let _ = cap[2];
}
#[test]
#[should_panic]
#[cfg_attr(all(target_env = "msvc", target_pointer_width = "32"), ignore)]
fn capture_index_panic_name() {
let re = regex!(r"^(?P<name>.+)$");
let cap = re.captures(t!("abc")).unwrap();
let _ = cap["bad name"];
}
#[test]
fn capture_index_lifetime() {
// This is a test of whether the types on `caps["..."]` are general
// enough. If not, this will fail to typecheck.
fn inner(s: &str) -> usize {
let re = regex!(r"(?P<number>[0-9]+)");
let caps = re.captures(t!(s)).unwrap();
caps["number"].len()
}
assert_eq!(3, inner("123"));
}
#[test]
fn capture_misc() {
let re = regex!(r"(.)(?P<a>a)?(.)(?P<b>.)");
let cap = re.captures(t!("abc")).unwrap();
assert_eq!(5, cap.len());
assert_eq!((0, 3), {
let m = cap.get(0).unwrap();
(m.start(), m.end())
});
assert_eq!(None, cap.get(2));
assert_eq!((2, 3), {
let m = cap.get(4).unwrap();
(m.start(), m.end())
});
assert_eq!(t!("abc"), match_text!(cap.get(0).unwrap()));
assert_eq!(None, cap.get(2));
assert_eq!(t!("c"), match_text!(cap.get(4).unwrap()));
assert_eq!(None, cap.name("a"));
assert_eq!(t!("c"), match_text!(cap.name("b").unwrap()));
}
#[test]
fn sub_capture_matches() {
let re = regex!(r"([a-z])(([a-z])|([0-9]))");
let cap = re.captures(t!("a5")).unwrap();
let subs: Vec<_> = cap.iter().collect();
assert_eq!(5, subs.len());
assert!(subs[0].is_some());
assert!(subs[1].is_some());
assert!(subs[2].is_some());
assert!(subs[3].is_none());
assert!(subs[4].is_some());
assert_eq!(t!("a5"), match_text!(subs[0].unwrap()));
assert_eq!(t!("a"), match_text!(subs[1].unwrap()));
assert_eq!(t!("5"), match_text!(subs[2].unwrap()));
assert_eq!(t!("5"), match_text!(subs[4].unwrap()));
}
expand!(expand1, r"(?-u)(?P<foo>\w+)", "abc", "$foo", "abc");
expand!(expand2, r"(?-u)(?P<foo>\w+)", "abc", "$0", "abc");
expand!(expand3, r"(?-u)(?P<foo>\w+)", "abc", "$1", "abc");
expand!(expand4, r"(?-u)(?P<foo>\w+)", "abc", "$$1", "$1");
expand!(expand5, r"(?-u)(?P<foo>\w+)", "abc", "$$foo", "$foo");
expand!(expand6, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$b$a", "123abc");
expand!(expand7, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "z$bz$az", "z");
expand!(
expand8,
r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)",
"abc 123",
".$b.$a.",
".123.abc."
);
expand!(
expand9,
r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)",
"abc 123",
" $b $a ",
" 123 abc "
);
expand!(expand10, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$bz$az", "");
expand!(expand_name1, r"%(?P<Z>[a-z]+)", "%abc", "$Z%", "abc%");
expand!(expand_name2, r"\[(?P<Z>[a-z]+)", "[abc", "$Z[", "abc[");
expand!(expand_name3, r"\{(?P<Z>[a-z]+)", "{abc", "$Z{", "abc{");
expand!(expand_name4, r"\}(?P<Z>[a-z]+)", "}abc", "$Z}", "abc}");
expand!(expand_name5, r"%([a-z]+)", "%abc", "$1a%", "%");
expand!(expand_name6, r"%([a-z]+)", "%abc", "${1}a%", "abca%");
expand!(expand_name7, r"\[(?P<Z[>[a-z]+)", "[abc", "${Z[}[", "abc[");
expand!(expand_name8, r"\[(?P<Z[>[a-z]+)", "[abc", "${foo}[", "[");
expand!(expand_name9, r"\[(?P<Z[>[a-z]+)", "[abc", "${1a}[", "[");
expand!(expand_name10, r"\[(?P<Z[>[a-z]+)", "[abc", "${#}[", "[");
expand!(expand_name11, r"\[(?P<Z[>[a-z]+)", "[abc", "${$$}[", "[");
split!(
split1,
r"(?-u)\s+",
"a b\nc\td\n\t e",
&[t!("a"), t!("b"), t!("c"), t!("d"), t!("e")]
);
split!(
split2,
r"(?-u)\b",
"a b c",
&[t!(""), t!("a"), t!(" "), t!("b"), t!(" "), t!("c"), t!("")]
);
split!(split3, r"a$", "a", &[t!(""), t!("")]);
split!(split_none, r"-", r"a", &[t!("a")]);
split!(split_trailing_blank, r"-", r"a-", &[t!("a"), t!("")]);
split!(split_trailing_blanks, r"-", r"a--", &[t!("a"), t!(""), t!("")]);
split!(split_empty, r"-", r"", &[t!("")]);
splitn!(splitn_below_limit, r"-", r"a", 2, &[t!("a")]);
splitn!(splitn_at_limit, r"-", r"a-b", 2, &[t!("a"), t!("b")]);
splitn!(splitn_above_limit, r"-", r"a-b-c", 2, &[t!("a"), t!("b-c")]);
splitn!(splitn_zero_limit, r"-", r"a-b", 0, empty_vec!());
splitn!(splitn_trailing_blank, r"-", r"a-", 2, &[t!("a"), t!("")]);
splitn!(splitn_trailing_separator, r"-", r"a--", 2, &[t!("a"), t!("-")]);
splitn!(splitn_empty, r"-", r"", 1, &[t!("")]);


@@ -0,0 +1,34 @@
// These tests don't really make sense with the bytes API, so we only test them
// on the Unicode API.
#[test]
fn empty_match_unicode_find_iter() {
// Tests that we still yield byte ranges at valid UTF-8 sequence boundaries
// even when we're susceptible to empty width matches.
let re = regex!(r".*?");
assert_eq!(
vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)],
findall!(re, "1Ⅱ2")
);
}
#[test]
fn empty_match_unicode_captures_iter() {
// Same as empty_match_unicode_find_iter, but tests capture iteration.
let re = regex!(r".*?");
let ms: Vec<_> = re
.captures_iter(text!("1Ⅱ2"))
.map(|c| c.get(0).unwrap())
.map(|m| (m.start(), m.end()))
.collect();
assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], ms);
}
#[test]
fn match_as_str() {
let re = regex!(r"fo+");
let caps = re.captures("barfoobar").unwrap();
assert_eq!(caps.get(0).map(|m| m.as_str()), Some("foo"));
assert_eq!(caps.get(0).map(From::from), Some("foo"));
assert_eq!(caps.get(0).map(Into::into), Some("foo"));
}


@@ -0,0 +1,107 @@
// These are tests specifically crafted for regexes that can match arbitrary
// bytes.
// A silly wrapper to make it possible to write and match raw bytes.
struct R<'a>(&'a [u8]);
impl<'a> R<'a> {
fn as_bytes(&self) -> &'a [u8] {
self.0
}
}
mat!(word_boundary, r"(?-u) \b", " δ", None);
#[cfg(feature = "unicode-perl")]
mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1)));
mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1)));
#[cfg(feature = "unicode-perl")]
mat!(word_not_boundary_unicode, r" \B", " δ", None);
mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1)));
#[cfg(feature = "unicode-perl")]
mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3)));
mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1)));
#[cfg(feature = "unicode-perl")]
mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8)));
mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1)));
#[cfg(feature = "unicode-perl")]
mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4)));
// The first `(.+)` matches two Unicode codepoints, but can't match the 5th
// byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
// matches.
mat!(
mixed1,
r"(.+)(?-u)(.+)",
R(b"\xCE\x93\xCE\x94\xFF"),
Some((0, 5)),
Some((0, 4)),
Some((4, 5))
);
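// Editor's sketch (not part of the committed test file): the same behavior
// checked directly against ::regex::bytes::Regex, assuming the public 1.x API.
// The Unicode-aware `(.+)` stops at the last valid UTF-8 boundary, and the
// byte-oriented `(?-u)(.+)` picks up the trailing invalid byte.
#[test]
fn mixed1_sketch() {
    let re = ::regex::bytes::Regex::new(r"(.+)(?-u)(.+)").unwrap();
    let caps = re.captures(b"\xCE\x93\xCE\x94\xFF").unwrap();
    let span = |i: usize| caps.get(i).map(|m| (m.start(), m.end()));
    assert_eq!(span(0), Some((0, 5)));
    assert_eq!(span(1), Some((0, 4)));
    assert_eq!(span(2), Some((4, 5)));
}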
mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1)));
mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5)));
#[cfg(feature = "unicode-case")]
mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
mat!(negate_unicode, r"[^a]", "δ", Some((0, 2)));
mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1)));
// This doesn't match in a normal Unicode regex because the implicit preceding
// `.*?` is Unicode aware.
mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2)));
mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2)));
// Have fun with null bytes.
mat!(
null_bytes,
r"(?-u)(?P<cstr>[^\x00]+)\x00",
R(b"foo\x00"),
Some((0, 4)),
Some((0, 3))
);
// Test that lookahead operators work properly in the face of invalid UTF-8.
// See: https://github.com/rust-lang/regex/issues/277
matiter!(
invalidutf8_anchor1,
r"(?-u)\xcc?^",
R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
(0, 0)
);
matiter!(
invalidutf8_anchor2,
r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$",
R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
(22, 22)
);
matiter!(
invalidutf8_anchor3,
r"(?-u)^|ddp\xff\xffdddddlQd@\x80",
R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
(0, 0)
);
// See https://github.com/rust-lang/regex/issues/303
#[test]
fn negated_full_byte_range() {
assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err());
}
matiter!(word_boundary_ascii1, r"(?-u:\B)x(?-u:\B)", "áxβ");
matiter!(
word_boundary_ascii2,
r"(?-u:\B)",
"0\u{7EF5E}",
(2, 2),
(3, 3),
(4, 4),
(5, 5)
);
// See: https://github.com/rust-lang/regex/issues/264
mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0)));
mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0)));
// See: https://github.com/rust-lang/regex/issues/271
mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8)));


@@ -0,0 +1,238 @@
use regex::internal::ExecBuilder;
/// Given a regex, check if all of the backends produce the same
/// results on a number of different inputs.
///
/// For now this just throws quickcheck at the problem, which
/// is not very good because it only really tests half of the
/// problem space. It is pretty unlikely that a random string
/// will match any given regex, so this will probably just
/// be checking that the different backends fail in the same
/// way. This is still worthwhile to test, but is definitely not
/// the whole story.
///
/// TODO(ethan): In order to cover the other half of the problem
/// space, we should generate a random matching string by inspecting
/// the AST of the input regex. The right way to do this probably
/// involves adding a custom Arbitrary instance around a couple
/// of newtypes. That way we can respect the quickcheck size hinting
/// and shrinking and whatnot.
pub fn backends_are_consistent(re: &str) -> Result<u64, String> {
let standard_backends = vec![
(
"bounded_backtracking_re",
ExecBuilder::new(re)
.bounded_backtracking()
.build()
.map(|exec| exec.into_regex())
.map_err(|err| format!("{}", err))?,
),
(
"pikevm_re",
ExecBuilder::new(re)
.nfa()
.build()
.map(|exec| exec.into_regex())
.map_err(|err| format!("{}", err))?,
),
(
"default_re",
ExecBuilder::new(re)
.build()
.map(|exec| exec.into_regex())
.map_err(|err| format!("{}", err))?,
),
];
let utf8bytes_backends = vec![
(
"bounded_backtracking_utf8bytes_re",
ExecBuilder::new(re)
.bounded_backtracking()
.bytes(true)
.build()
.map(|exec| exec.into_regex())
.map_err(|err| format!("{}", err))?,
),
(
"pikevm_utf8bytes_re",
ExecBuilder::new(re)
.nfa()
.bytes(true)
.build()
.map(|exec| exec.into_regex())
.map_err(|err| format!("{}", err))?,
),
(
"default_utf8bytes_re",
ExecBuilder::new(re)
.bytes(true)
.build()
.map(|exec| exec.into_regex())
.map_err(|err| format!("{}", err))?,
),
];
let bytes_backends = vec![
(
"bounded_backtracking_bytes_re",
ExecBuilder::new(re)
.bounded_backtracking()
.only_utf8(false)
.build()
.map(|exec| exec.into_byte_regex())
.map_err(|err| format!("{}", err))?,
),
(
"pikevm_bytes_re",
ExecBuilder::new(re)
.nfa()
.only_utf8(false)
.build()
.map(|exec| exec.into_byte_regex())
.map_err(|err| format!("{}", err))?,
),
(
"default_bytes_re",
ExecBuilder::new(re)
.only_utf8(false)
.build()
.map(|exec| exec.into_byte_regex())
.map_err(|err| format!("{}", err))?,
),
];
Ok(string_checker::check_backends(&standard_backends)?
+ string_checker::check_backends(&utf8bytes_backends)?
+ bytes_checker::check_backends(&bytes_backends)?)
}
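// Editor's sketch (hypothetical, not part of the commit): driving the helper
// above for one pattern, behind the same RUST_REGEX_RANDOM_TEST gate used by
// the consistent! macro elsewhere in this commit. On success it reports the
// number of quickcheck cases that passed across all backend pairings.
#[test]
fn backends_are_consistent_sketch() {
    if option_env!("RUST_REGEX_RANDOM_TEST").is_some() {
        match backends_are_consistent(r"(foo|ba[rz])+") {
            Ok(passed) => println!("{} quickcheck cases passed", passed),
            Err(err) => panic!("{}", err),
        }
    }
}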
//
// A consistency checker parameterized by the input type (&str or &[u8]).
//
macro_rules! checker {
($module_name:ident, $regex_type:path, $mk_input:expr) => {
mod $module_name {
use quickcheck;
use quickcheck::{Arbitrary, TestResult};
pub fn check_backends(
backends: &[(&str, $regex_type)],
) -> Result<u64, String> {
let mut total_passed = 0;
for regex in backends[1..].iter() {
total_passed += quickcheck_regex_eq(&backends[0], regex)?;
}
Ok(total_passed)
}
fn quickcheck_regex_eq(
&(name1, ref re1): &(&str, $regex_type),
&(name2, ref re2): &(&str, $regex_type),
) -> Result<u64, String> {
quickcheck::QuickCheck::new()
.quicktest(RegexEqualityTest::new(
re1.clone(),
re2.clone(),
))
.map_err(|err| {
format!(
"{}(/{}/) and {}(/{}/) are inconsistent.\
QuickCheck Err: {:?}",
name1, re1, name2, re2, err
)
})
}
struct RegexEqualityTest {
re1: $regex_type,
re2: $regex_type,
}
impl RegexEqualityTest {
fn new(re1: $regex_type, re2: $regex_type) -> Self {
RegexEqualityTest { re1: re1, re2: re2 }
}
}
impl quickcheck::Testable for RegexEqualityTest {
fn result(&self, gen: &mut quickcheck::Gen) -> TestResult {
let input = $mk_input(gen);
let input = &input;
if self.re1.find(&input) != self.re2.find(input) {
return TestResult::error(format!(
"find mismatch input={:?}",
input
));
}
let cap1 = self.re1.captures(input);
let cap2 = self.re2.captures(input);
match (cap1, cap2) {
(None, None) => {}
(Some(cap1), Some(cap2)) => {
for (c1, c2) in cap1.iter().zip(cap2.iter()) {
if c1 != c2 {
return TestResult::error(format!(
"captures mismatch input={:?}",
input
));
}
}
}
_ => {
return TestResult::error(format!(
"captures mismatch input={:?}",
input
))
}
}
let fi1 = self.re1.find_iter(input);
let fi2 = self.re2.find_iter(input);
for (m1, m2) in fi1.zip(fi2) {
if m1 != m2 {
return TestResult::error(format!(
"find_iter mismatch input={:?}",
input
));
}
}
let ci1 = self.re1.captures_iter(input);
let ci2 = self.re2.captures_iter(input);
for (cap1, cap2) in ci1.zip(ci2) {
for (c1, c2) in cap1.iter().zip(cap2.iter()) {
if c1 != c2 {
return TestResult::error(format!(
"captures_iter mismatch input={:?}",
input
));
}
}
}
let s1 = self.re1.split(input);
let s2 = self.re2.split(input);
for (chunk1, chunk2) in s1.zip(s2) {
if chunk1 != chunk2 {
return TestResult::error(format!(
"split mismatch input={:?}",
input
));
}
}
TestResult::from_bool(true)
}
}
} // mod
}; // rule case
} // macro_rules!
checker!(string_checker, ::regex::Regex, |gen| String::arbitrary(gen));
checker!(bytes_checker, ::regex::bytes::Regex, |gen| Vec::<u8>::arbitrary(
gen
));

File diff suppressed because it is too large.


@@ -0,0 +1,459 @@
mat!(ascii_literal, r"a", "a", Some((0, 1)));
// Some crazy expressions from regular-expressions.info.
mat!(
match_ranges,
r"(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b",
"num: 255",
Some((5, 8))
);
mat!(
match_ranges_not,
r"(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b",
"num: 256",
None
);
mat!(match_float1, r"[-+]?[0-9]*\.?[0-9]+", "0.1", Some((0, 3)));
mat!(match_float2, r"[-+]?[0-9]*\.?[0-9]+", "0.1.2", Some((0, 3)));
mat!(match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4)));
mat!(match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None);
mat!(
match_email,
r"(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b",
"mine is jam.slam@gmail.com ",
Some((8, 26))
);
mat!(
match_email_not,
r"(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b",
"mine is jam.slam@gmail ",
None
);
mat!(
match_email_big,
r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?",
"mine is jam.slam@gmail.com ",
Some((8, 26))
);
mat!(
match_date1,
r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
"1900-01-01",
Some((0, 10))
);
mat!(
match_date2,
r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
"1900-00-01",
None
);
mat!(
match_date3,
r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
"1900-13-01",
None
);
// Do some crazy dancing with the start/end assertions.
matiter!(match_start_end_empty, r"^$", "", (0, 0));
matiter!(match_start_end_empty_many_1, r"^$^$^$", "", (0, 0));
matiter!(match_start_end_empty_many_2, r"^^^$$$", "", (0, 0));
matiter!(match_start_end_empty_rev, r"$^", "", (0, 0));
matiter!(
match_start_end_empty_rep,
r"(?:^$)*",
"a\nb\nc",
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5)
);
matiter!(
match_start_end_empty_rep_rev,
r"(?:$^)*",
"a\nb\nc",
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5)
);
// Test negated character classes.
mat!(negclass_letters, r"[^ac]", "acx", Some((2, 3)));
mat!(negclass_letter_comma, r"[^a,]", "a,x", Some((2, 3)));
mat!(negclass_letter_space, r"[^a[:space:]]", "a x", Some((2, 3)));
mat!(negclass_comma, r"[^,]", ",,x", Some((2, 3)));
mat!(negclass_space, r"[^[:space:]]", " a", Some((1, 2)));
mat!(negclass_space_comma, r"[^,[:space:]]", ", a", Some((2, 3)));
mat!(negclass_comma_space, r"[^[:space:],]", " ,a", Some((2, 3)));
mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2)));
// Test that repeated empty expressions don't loop forever.
mat!(lazy_many_many, r"((?:.*)*?)=", "a=b", Some((0, 2)));
mat!(lazy_many_optional, r"((?:.?)*?)=", "a=b", Some((0, 2)));
mat!(lazy_one_many_many, r"((?:.*)+?)=", "a=b", Some((0, 2)));
mat!(lazy_one_many_optional, r"((?:.?)+?)=", "a=b", Some((0, 2)));
mat!(lazy_range_min_many, r"((?:.*){1,}?)=", "a=b", Some((0, 2)));
mat!(lazy_range_many, r"((?:.*){1,2}?)=", "a=b", Some((0, 2)));
mat!(greedy_many_many, r"((?:.*)*)=", "a=b", Some((0, 2)));
mat!(greedy_many_optional, r"((?:.?)*)=", "a=b", Some((0, 2)));
mat!(greedy_one_many_many, r"((?:.*)+)=", "a=b", Some((0, 2)));
mat!(greedy_one_many_optional, r"((?:.?)+)=", "a=b", Some((0, 2)));
mat!(greedy_range_min_many, r"((?:.*){1,})=", "a=b", Some((0, 2)));
mat!(greedy_range_many, r"((?:.*){1,2})=", "a=b", Some((0, 2)));
// Test that we handle various flavors of empty expressions.
matiter!(match_empty1, r"", "", (0, 0));
matiter!(match_empty2, r"", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty3, r"()", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty4, r"()*", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty5, r"()+", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty6, r"()?", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty7, r"()()", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty8, r"()+|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty9, r"z|()+", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty10, r"()+|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty11, r"b|()+", "abc", (0, 0), (1, 2), (3, 3));
matiter!(match_empty12, r"|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty13, r"b|", "abc", (0, 0), (1, 2), (3, 3));
matiter!(match_empty14, r"|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty15, r"z|", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty16, r"|", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty17, r"||", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty18, r"||z", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty19, r"(?:)|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty20, r"b|(?:)", "abc", (0, 0), (1, 2), (3, 3));
matiter!(match_empty21, r"(?:|)", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty22, r"(?:|)|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
matiter!(match_empty23, r"a(?:)|b", "abc", (0, 1), (1, 2));
// Test that the DFA can handle pathological cases.
// (This should result in the DFA's cache being flushed too frequently, which
// should cause it to quit and fall back to the NFA algorithm.)
#[test]
fn dfa_handles_pathological_case() {
fn ones_and_zeroes(count: usize) -> String {
use rand::rngs::SmallRng;
use rand::{Rng, SeedableRng};
let mut rng = SmallRng::from_entropy();
let mut s = String::new();
for _ in 0..count {
if rng.gen() {
s.push('1');
} else {
s.push('0');
}
}
s
}
let re = regex!(r"[01]*1[01]{20}$");
let text = {
let mut pieces = ones_and_zeroes(100_000);
pieces.push('1');
pieces.push_str(&ones_and_zeroes(20));
pieces
};
assert!(re.is_match(text!(&*text)));
}
#[test]
fn nest_limit_makes_it_parse() {
use regex::RegexBuilder;
RegexBuilder::new(
r#"(?-u)
2(?:
[45]\d{3}|
7(?:
1[0-267]|
2[0-289]|
3[0-29]|
4[01]|
5[1-3]|
6[013]|
7[0178]|
91
)|
8(?:
0[125]|
[139][1-6]|
2[0157-9]|
41|
6[1-35]|
7[1-5]|
8[1-8]|
90
)|
9(?:
0[0-2]|
1[0-4]|
2[568]|
3[3-6]|
5[5-7]|
6[0167]|
7[15]|
8[0146-9]
)
)\d{4}|
3(?:
12?[5-7]\d{2}|
0(?:
2(?:
[025-79]\d|
[348]\d{1,2}
)|
3(?:
[2-4]\d|
[56]\d?
)
)|
2(?:
1\d{2}|
2(?:
[12]\d|
[35]\d{1,2}|
4\d?
)
)|
3(?:
1\d{2}|
2(?:
[2356]\d|
4\d{1,2}
)
)|
4(?:
1\d{2}|
2(?:
2\d{1,2}|
[47]|
5\d{2}
)
)|
5(?:
1\d{2}|
29
)|
[67]1\d{2}|
8(?:
1\d{2}|
2(?:
2\d{2}|
3|
4\d
)
)
)\d{3}|
4(?:
0(?:
2(?:
[09]\d|
7
)|
33\d{2}
)|
1\d{3}|
2(?:
1\d{2}|
2(?:
[25]\d?|
[348]\d|
[67]\d{1,2}
)
)|
3(?:
1\d{2}(?:
\d{2}
)?|
2(?:
[045]\d|
[236-9]\d{1,2}
)|
32\d{2}
)|
4(?:
[18]\d{2}|
2(?:
[2-46]\d{2}|
3
)|
5[25]\d{2}
)|
5(?:
1\d{2}|
2(?:
3\d|
5
)
)|
6(?:
[18]\d{2}|
2(?:
3(?:
\d{2}
)?|
[46]\d{1,2}|
5\d{2}|
7\d
)|
5(?:
3\d?|
4\d|
[57]\d{1,2}|
6\d{2}|
8
)
)|
71\d{2}|
8(?:
[18]\d{2}|
23\d{2}|
54\d{2}
)|
9(?:
[18]\d{2}|
2[2-5]\d{2}|
53\d{1,2}
)
)\d{3}|
5(?:
02[03489]\d{2}|
1\d{2}|
2(?:
1\d{2}|
2(?:
2(?:
\d{2}
)?|
[457]\d{2}
)
)|
3(?:
1\d{2}|
2(?:
[37](?:
\d{2}
)?|
[569]\d{2}
)
)|
4(?:
1\d{2}|
2[46]\d{2}
)|
5(?:
1\d{2}|
26\d{1,2}
)|
6(?:
[18]\d{2}|
2|
53\d{2}
)|
7(?:
1|
24
)\d{2}|
8(?:
1|
26
)\d{2}|
91\d{2}
)\d{3}|
6(?:
0(?:
1\d{2}|
2(?:
3\d{2}|
4\d{1,2}
)
)|
2(?:
2[2-5]\d{2}|
5(?:
[3-5]\d{2}|
7
)|
8\d{2}
)|
3(?:
1|
2[3478]
)\d{2}|
4(?:
1|
2[34]
)\d{2}|
5(?:
1|
2[47]
)\d{2}|
6(?:
[18]\d{2}|
6(?:
2(?:
2\d|
[34]\d{2}
)|
5(?:
[24]\d{2}|
3\d|
5\d{1,2}
)
)
)|
72[2-5]\d{2}|
8(?:
1\d{2}|
2[2-5]\d{2}
)|
9(?:
1\d{2}|
2[2-6]\d{2}
)
)\d{3}|
7(?:
(?:
02|
[3-589]1|
6[12]|
72[24]
)\d{2}|
21\d{3}|
32
)\d{3}|
8(?:
(?:
4[12]|
[5-7]2|
1\d?
)|
(?:
0|
3[12]|
[5-7]1|
217
)\d
)\d{4}|
9(?:
[35]1|
(?:
[024]2|
81
)\d|
(?:
1|
[24]1
)\d{2}
)\d{3}
"#,
)
.build()
.unwrap();
}
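// Editor's sketch (not part of the commit): the converse of the test above,
// assuming the published RegexBuilder::nest_limit API. A deliberately low
// nest limit rejects even a modestly nested pattern, while the default limit
// accepts it.
#[test]
fn nest_limit_sketch() {
    use regex::RegexBuilder;
    assert!(RegexBuilder::new(r"(((a)))").nest_limit(1).build().is_err());
    assert!(RegexBuilder::new(r"(((a)))").build().is_ok());
}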


@@ -0,0 +1,31 @@
mat!(match_flag_case, "(?-u)(?i)abc", "ABC", Some((0, 3)));
mat!(match_flag_weird_case, "(?-u)(?i)a(?-i)bc", "Abc", Some((0, 3)));
mat!(match_flag_weird_case_not, "(?-u)(?i)a(?-i)bc", "ABC", None);
mat!(match_flag_case_dotnl, "(?-u)(?is)a(?u:.)", "A\n", Some((0, 2)));
mat!(
match_flag_case_dotnl_toggle,
"(?-u)(?is)a(?u:.)(?-is)a(?u:.)",
"A\nab",
Some((0, 4))
);
mat!(
match_flag_case_dotnl_toggle_not,
"(?-u)(?is)a(?u:.)(?-is)a(?u:.)",
"A\na\n",
None
);
mat!(
match_flag_case_dotnl_toggle_ok,
"(?-u)(?is)a(?u:.)(?-is:a(?u:.))?",
"A\na\n",
Some((0, 2))
);
mat!(
match_flag_multi,
r"(?-u)(?m)(?:^\d+$\n?)+",
"123\n456\n789",
Some((0, 11))
);
mat!(match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1)));
mat!(match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2)));
mat!(match_flag_ungreedy_noop, "(?U)(?-U)a+", "aa", Some((0, 2)));

File diff suppressed because it is too large.


@@ -0,0 +1,160 @@
// Convenience macros.
macro_rules! findall {
($re:expr, $text:expr) => {{
$re.find_iter(text!($text))
.map(|m| (m.start(), m.end())).collect::<Vec<_>>()
}}
}
// Macros for automatically producing tests.
macro_rules! ismatch {
($name:ident, $re:expr, $text:expr, $ismatch:expr) => {
#[test]
fn $name() {
let re = regex!($re);
assert_eq!($ismatch, re.is_match(text!($text)));
}
};
}
macro_rules! mat(
($name:ident, $re:expr, $text:expr, $($loc:tt)+) => (
#[test]
fn $name() {
let text = text!($text);
let expected: Vec<Option<_>> = vec![$($loc)+];
let r = regex!($re);
let got: Vec<Option<_>> = match r.captures(text) {
Some(c) => {
assert!(r.is_match(text));
assert!(r.shortest_match(text).is_some());
r.capture_names()
.enumerate()
.map(|(i, _)| c.get(i).map(|m| (m.start(), m.end())))
.collect()
}
None => vec![None],
};
// The test set sometimes leaves out capture groups, so truncate the
// actual capture groups to match the test set.
let mut sgot = &got[..];
if sgot.len() > expected.len() {
sgot = &sgot[0..expected.len()]
}
if expected != sgot {
panic!("For RE '{}' against '{:?}', \
expected '{:?}' but got '{:?}'",
$re, text, expected, sgot);
}
}
);
);
macro_rules! matiter(
($name:ident, $re:expr, $text:expr) => (
#[test]
fn $name() {
let text = text!($text);
let expected: Vec<(usize, usize)> = vec![];
let r = regex!($re);
let got: Vec<_> =
r.find_iter(text).map(|m| (m.start(), m.end())).collect();
if expected != got {
panic!("For RE '{}' against '{:?}', \
expected '{:?}' but got '{:?}'",
$re, text, expected, got);
}
let captures_got: Vec<_> =
r.captures_iter(text)
.map(|c| c.get(0).unwrap())
.map(|m| (m.start(), m.end()))
.collect();
if captures_got != got {
panic!("For RE '{}' against '{:?}', \
got '{:?}' using find_iter but got '{:?}' \
using captures_iter",
$re, text, got, captures_got);
}
}
);
($name:ident, $re:expr, $text:expr, $($loc:tt)+) => (
#[test]
fn $name() {
let text = text!($text);
let expected: Vec<_> = vec![$($loc)+];
let r = regex!($re);
let got: Vec<_> =
r.find_iter(text).map(|m| (m.start(), m.end())).collect();
if expected != got {
panic!("For RE '{}' against '{:?}', \
expected '{:?}' but got '{:?}'",
$re, text, expected, got);
}
let captures_got: Vec<_> =
r.captures_iter(text)
.map(|c| c.get(0).unwrap())
.map(|m| (m.start(), m.end()))
.collect();
if captures_got != got {
panic!("For RE '{}' against '{:?}', \
got '{:?}' using find_iter but got '{:?}' \
using captures_iter",
$re, text, got, captures_got);
}
}
);
);
macro_rules! matset {
($name:ident, $res:expr, $text:expr, $($match_index:expr),*) => {
#[test]
fn $name() {
let text = text!($text);
let set = regex_set!($res);
assert!(set.is_match(text));
let expected = vec![$($match_index),*];
let matches = set.matches(text);
assert!(matches.matched_any());
let got: Vec<_> = matches.into_iter().collect();
assert_eq!(expected, got);
}
}
}
macro_rules! nomatset {
($name:ident, $res:expr, $text:expr) => {
#[test]
fn $name() {
let text = text!($text);
let set = regex_set!($res);
assert!(!set.is_match(text));
let matches = set.matches(text);
assert!(!matches.matched_any());
assert_eq!(0, matches.into_iter().count());
}
}
}
macro_rules! split {
($name:ident, $re:expr, $text:expr, $expected:expr) => {
#[test]
fn $name() {
let re = regex!($re);
let splitted: Vec<_> = re.split(t!($text)).collect();
assert_eq!($expected, &*splitted);
}
}
}
macro_rules! splitn {
($name:ident, $re:expr, $text:expr, $limit:expr, $expected:expr) => {
#[test]
fn $name() {
let re = regex!($re);
let splitted: Vec<_> = re.splitn(t!($text), $limit).collect();
assert_eq!($expected, &*splitted);
}
}
}


@@ -0,0 +1,39 @@
// Macros for use in writing tests generic over &str/&[u8].
macro_rules! text { ($text:expr) => { $text.as_bytes() } }
macro_rules! t { ($re:expr) => { text!($re) } }
macro_rules! match_text { ($text:expr) => { $text.as_bytes() } }
macro_rules! use_ { ($($path: tt)*) => { use regex::bytes::$($path)*; } }
macro_rules! empty_vec { () => { <Vec<&[u8]>>::new() } }
macro_rules! bytes { ($text:expr) => { $text } }
macro_rules! no_expand {
($text:expr) => {{
use regex::bytes::NoExpand;
NoExpand(text!($text))
}}
}
macro_rules! show {
($text:expr) => {{
use std::ascii::escape_default;
let mut s = vec![];
for &b in bytes!($text) {
s.extend(escape_default(b));
}
String::from_utf8(s).unwrap()
}}
}
macro_rules! expand {
($name:ident, $re:expr, $text:expr, $expand:expr, $expected:expr) => {
#[test]
fn $name() {
let re = regex!($re);
let cap = re.captures(t!($text)).unwrap();
let mut got = vec![];
cap.expand(t!($expand), &mut got);
assert_eq!(show!(t!($expected)), show!(&*got));
}
}
}


@@ -0,0 +1,38 @@
// Macros for use in writing tests generic over &str/&[u8].
macro_rules! text { ($text:expr) => { $text } }
macro_rules! t { ($text:expr) => { text!($text) } }
macro_rules! match_text { ($text:expr) => { $text.as_str() } }
macro_rules! use_ { ($($path: tt)*) => { use regex::$($path)*; } }
macro_rules! empty_vec { () => { <Vec<&str>>::new() } }
macro_rules! bytes { ($text:expr) => { std::str::from_utf8($text.as_ref()).unwrap() } }
macro_rules! no_expand {
($text:expr) => {{
use regex::NoExpand;
NoExpand(text!($text))
}}
}
macro_rules! show { ($text:expr) => { $text } }
// N.B. The expansion API for &str and &[u8] APIs differs slightly for now,
// but they should be unified in 1.0. Then we can move this macro back into
// tests/api.rs where it is used. ---AG
macro_rules! expand {
($name:ident, $re:expr, $text:expr, $expand:expr, $expected:expr) => {
#[test]
fn $name() {
let re = regex!($re);
let cap = re.captures(t!($text)).unwrap();
let mut got = String::new();
cap.expand(t!($expand), &mut got);
assert_eq!(show!(t!($expected)), show!(&*got));
}
}
}
#[cfg(feature = "pattern")]
macro_rules! searcher_expr { ($e:expr) => ($e) }
#[cfg(not(feature = "pattern"))]
macro_rules! searcher_expr { ($e:expr) => ({}) }


@@ -0,0 +1,4 @@
mat!(prefix_literal_match, r"^abc", r"abc", Some((0, 3)));
mat!(prefix_literal_nomatch, r"^abc", r"zabc", None);
mat!(one_literal_edge, r"abc", r"xxxxxab", None);
matiter!(terminates, r"a$", r"a", (0, 1));


@@ -0,0 +1,144 @@
matiter!(
match_multi_1,
r"(?m)^[a-z]+$",
"abc\ndef\nxyz",
(0, 3),
(4, 7),
(8, 11)
);
matiter!(match_multi_2, r"(?m)^$", "abc\ndef\nxyz");
matiter!(match_multi_3, r"(?m)^", "abc\ndef\nxyz", (0, 0), (4, 4), (8, 8));
matiter!(match_multi_4, r"(?m)$", "abc\ndef\nxyz", (3, 3), (7, 7), (11, 11));
matiter!(
match_multi_5,
r"(?m)^[a-z]",
"abc\ndef\nxyz",
(0, 1),
(4, 5),
(8, 9)
);
matiter!(match_multi_6, r"(?m)[a-z]^", "abc\ndef\nxyz");
matiter!(
match_multi_7,
r"(?m)[a-z]$",
"abc\ndef\nxyz",
(2, 3),
(6, 7),
(10, 11)
);
matiter!(match_multi_8, r"(?m)$[a-z]", "abc\ndef\nxyz");
matiter!(match_multi_9, r"(?m)^$", "", (0, 0));
matiter!(
match_multi_rep_1,
r"(?m)(?:^$)*",
"a\nb\nc",
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5)
);
matiter!(
match_multi_rep_2,
r"(?m)(?:^|a)+",
"a\naaa\n",
(0, 0),
(2, 2),
(3, 5),
(6, 6)
);
matiter!(
match_multi_rep_3,
r"(?m)(?:^|a)*",
"a\naaa\n",
(0, 1),
(2, 5),
(6, 6)
);
matiter!(
match_multi_rep_4,
r"(?m)(?:^[a-z])+",
"abc\ndef\nxyz",
(0, 1),
(4, 5),
(8, 9)
);
matiter!(
match_multi_rep_5,
r"(?m)(?:^[a-z]{3}\n?)+",
"abc\ndef\nxyz",
(0, 11)
);
matiter!(
match_multi_rep_6,
r"(?m)(?:^[a-z]{3}\n?)*",
"abc\ndef\nxyz",
(0, 11)
);
matiter!(
match_multi_rep_7,
r"(?m)(?:\n?[a-z]{3}$)+",
"abc\ndef\nxyz",
(0, 11)
);
matiter!(
match_multi_rep_8,
r"(?m)(?:\n?[a-z]{3}$)*",
"abc\ndef\nxyz",
(0, 11)
);
matiter!(
match_multi_rep_9,
r"(?m)^*",
"\naa\n",
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4)
);
matiter!(match_multi_rep_10, r"(?m)^+", "\naa\n", (0, 0), (1, 1), (4, 4));
matiter!(
match_multi_rep_11,
r"(?m)$*",
"\naa\n",
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4)
);
matiter!(match_multi_rep_12, r"(?m)$+", "\naa\n", (0, 0), (3, 3), (4, 4));
matiter!(match_multi_rep_13, r"(?m)(?:$\n)+", "\n\naaa\n\n", (0, 2), (5, 7));
matiter!(
match_multi_rep_14,
r"(?m)(?:$\n)*",
"\n\naaa\n\n",
(0, 2),
(3, 3),
(4, 4),
(5, 7)
);
matiter!(match_multi_rep_15, r"(?m)(?:$\n^)+", "\n\naaa\n\n", (0, 2), (5, 7));
matiter!(
match_multi_rep_16,
r"(?m)(?:^|$)+",
"\n\naaa\n\n",
(0, 0),
(1, 1),
(2, 2),
(5, 5),
(6, 6),
(7, 7)
);
matiter!(
match_multi_rep_17,
r"(?m)(?:$\n)*",
"\n\naaa\n\n",
(0, 2),
(3, 3),
(4, 4),
(5, 7)
);


@@ -0,0 +1,45 @@
macro_rules! noparse(
($name:ident, $re:expr) => (
#[test]
fn $name() {
let re = $re;
match regex_new!(re) {
Err(_) => {},
Ok(_) => panic!("Regex '{}' should cause a parse error.", re),
}
}
);
);
noparse!(fail_no_repeat_arg, "*");
noparse!(fail_incomplete_escape, "\\");
noparse!(fail_class_incomplete, "[A-");
noparse!(fail_class_not_closed, "[A");
noparse!(fail_class_no_begin, r"[\A]");
noparse!(fail_class_no_end, r"[\z]");
noparse!(fail_class_no_boundary, r"[\b]");
noparse!(fail_open_paren, "(");
noparse!(fail_close_paren, ")");
noparse!(fail_invalid_range, "[a-Z]");
noparse!(fail_empty_capture_name, "(?P<>a)");
noparse!(fail_bad_capture_name, "(?P<na-me>)");
noparse!(fail_bad_flag, "(?a)a");
noparse!(fail_too_big, "a{10000000}");
noparse!(fail_counted_no_close, "a{1001");
noparse!(fail_counted_decreasing, "a{2,1}");
noparse!(fail_counted_nonnegative, "a{-1,1}");
noparse!(fail_unfinished_cap, "(?");
noparse!(fail_unfinished_escape, "\\");
noparse!(fail_octal_digit, r"\8");
noparse!(fail_hex_digit, r"\xG0");
noparse!(fail_hex_short, r"\xF");
noparse!(fail_hex_long_digits, r"\x{fffg}");
noparse!(fail_flag_bad, "(?a)");
noparse!(fail_flag_empty, "(?)");
noparse!(fail_double_neg, "(?-i-i)");
noparse!(fail_neg_empty, "(?i-)");
noparse!(fail_dupe_named, "(?P<a>.)(?P<a>.)");
noparse!(fail_range_end_no_class, "[a-[:lower:]]");
noparse!(fail_range_end_no_begin, r"[a-\A]");
noparse!(fail_range_end_no_end, r"[a-\z]");
noparse!(fail_range_end_no_boundary, r"[a-\b]");


@@ -0,0 +1,222 @@
// See: https://github.com/rust-lang/regex/issues/48
#[test]
fn invalid_regexes_no_crash() {
assert!(regex_new!("(*)").is_err());
assert!(regex_new!("(?:?)").is_err());
assert!(regex_new!("(?)").is_err());
assert!(regex_new!("*").is_err());
}
// See: https://github.com/rust-lang/regex/issues/98
#[test]
fn regression_many_repeat_stack_overflow() {
let re = regex!("^.{1,2500}");
assert_eq!(vec![(0, 1)], findall!(re, "a"));
}
// See: https://github.com/rust-lang/regex/issues/555
#[test]
fn regression_invalid_repetition_expr() {
assert!(regex_new!("(?m){1,1}").is_err());
}
// See: https://github.com/rust-lang/regex/issues/527
#[test]
fn regression_invalid_flags_expression() {
assert!(regex_new!("(((?x)))").is_ok());
}
// See: https://github.com/rust-lang/regex/issues/75
mat!(regression_unsorted_binary_search_1, r"(?i-u)[a_]+", "A_", Some((0, 2)));
mat!(regression_unsorted_binary_search_2, r"(?i-u)[A_]+", "a_", Some((0, 2)));
// See: https://github.com/rust-lang/regex/issues/99
#[cfg(feature = "unicode-case")]
mat!(regression_negated_char_class_1, r"(?i)[^x]", "x", None);
#[cfg(feature = "unicode-case")]
mat!(regression_negated_char_class_2, r"(?i)[^x]", "X", None);
// See: https://github.com/rust-lang/regex/issues/101
mat!(regression_ascii_word_underscore, r"[[:word:]]", "_", Some((0, 1)));
// See: https://github.com/rust-lang/regex/issues/129
#[test]
fn regression_captures_rep() {
let re = regex!(r"([a-f]){2}(?P<foo>[x-z])");
let caps = re.captures(text!("abx")).unwrap();
assert_eq!(match_text!(caps.name("foo").unwrap()), text!("x"));
}
// See: https://github.com/rust-lang/regex/issues/153
mat!(regression_alt_in_alt1, r"ab?|$", "az", Some((0, 1)));
mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3)));
// See: https://github.com/rust-lang/regex/issues/169
mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3)));
// See: https://github.com/rust-lang/regex/issues/76
#[cfg(all(feature = "unicode-case", feature = "unicode-gencat"))]
mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10)));
// See: https://github.com/rust-lang/regex/issues/191
mat!(many_alternates, r"1|2|3|4|5|6|7|8|9|10|int", "int", Some((0, 3)));
// burntsushi was bad and didn't create an issue for this bug.
mat!(anchored_prefix1, r"^a[[:^space:]]", "a ", None);
mat!(anchored_prefix2, r"^a[[:^space:]]", "foo boo a ", None);
mat!(anchored_prefix3, r"^-[a-z]", "r-f", None);
// See: https://github.com/rust-lang/regex/issues/204
#[cfg(feature = "unicode-perl")]
split!(
split_on_word_boundary,
r"\b",
r"Should this (work?)",
&[
t!(""),
t!("Should"),
t!(" "),
t!("this"),
t!(" ("),
t!("work"),
t!("?)")
]
);
#[cfg(feature = "unicode-perl")]
matiter!(
word_boundary_dfa,
r"\b",
"a b c",
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5)
);
// See: https://github.com/rust-lang/regex/issues/268
matiter!(partial_anchor, r"^a|b", "ba", (0, 1));
// See: https://github.com/rust-lang/regex/issues/280
ismatch!(partial_anchor_alternate_begin, r"^a|z", "yyyyya", false);
ismatch!(partial_anchor_alternate_end, r"a$|z", "ayyyyy", false);
// See: https://github.com/rust-lang/regex/issues/289
mat!(lits_unambiguous1, r"(ABC|CDA|BC)X", "CDAX", Some((0, 4)));
// See: https://github.com/rust-lang/regex/issues/291
mat!(
lits_unambiguous2,
r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$",
"CIMG2341",
Some((0, 8)),
Some((0, 4)),
None,
Some((0, 4)),
Some((4, 8))
);
// See: https://github.com/rust-lang/regex/issues/271
mat!(endl_or_wb, r"(?m:$)|(?-u:\b)", "\u{6084e}", Some((4, 4)));
mat!(zero_or_end, r"(?i-u:\x00)|$", "\u{e682f}", Some((4, 4)));
mat!(y_or_endl, r"(?i-u:y)|(?m:$)", "\u{b4331}", Some((4, 4)));
#[cfg(feature = "unicode-perl")]
mat!(wb_start_x, r"(?u:\b)^(?-u:X)", "X", Some((0, 1)));
// See: https://github.com/rust-lang/regex/issues/321
ismatch!(strange_anchor_non_complete_prefix, r"a^{2}", "", false);
ismatch!(strange_anchor_non_complete_suffix, r"${2}a", "", false);
// See: https://github.com/BurntSushi/ripgrep/issues/1203
ismatch!(reverse_suffix1, r"[0-4][0-4][0-4]000", "153.230000", true);
ismatch!(reverse_suffix2, r"[0-9][0-9][0-9]000", "153.230000\n", true);
matiter!(reverse_suffix3, r"[0-9][0-9][0-9]000", "153.230000\n", (4, 10));
// See: https://github.com/rust-lang/regex/issues/334
// See: https://github.com/rust-lang/regex/issues/557
mat!(
captures_after_dfa_premature_end1,
r"a(b*(X|$))?",
"abcbX",
Some((0, 1)),
None,
None
);
mat!(
captures_after_dfa_premature_end2,
r"a(bc*(X|$))?",
"abcbX",
Some((0, 1)),
None,
None
);
mat!(captures_after_dfa_premature_end3, r"(aa$)?", "aaz", Some((0, 0)));
// See: https://github.com/rust-lang/regex/issues/437
ismatch!(
literal_panic,
r"typename type\-parameter\-[0-9]+\-[0-9]+::.+",
"test",
false
);
// See: https://github.com/rust-lang/regex/issues/533
ismatch!(
blank_matches_nothing_between_space_and_tab,
r"[[:blank:]]",
"\u{a}\u{b}\u{c}\u{d}\u{e}\u{f}\
\u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\
\u{18}\u{19}\u{1a}\u{1b}\u{1c}\u{1d}\u{1e}\u{1f}",
false
);
ismatch!(
inverted_blank_matches_everything_between_space_and_tab,
r"^[[:^blank:]]+$",
"\u{a}\u{b}\u{c}\u{d}\u{e}\u{f}\
\u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\
\u{18}\u{19}\u{1a}\u{1b}\u{1c}\u{1d}\u{1e}\u{1f}",
true
);
// Tests that our Aho-Corasick optimization works correctly. It only
// kicks in when we have >32 literals. By "works correctly," we mean that
// leftmost-first match semantics are properly respected. That is, samwise
// should match, not sam.
mat!(
ahocorasick1,
"samwise|sam|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|\
A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z",
"samwise",
Some((0, 7))
);
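// Editor's sketch (not part of the commit): leftmost-first semantics in
// isolation, without the >32-literal optimizer. At the same starting position
// the earlier alternative wins, so listing "samwise" before "sam" yields the
// longer match; ahocorasick1 above checks that the optimized path agrees.
#[test]
fn leftmost_first_sketch() {
    let re = ::regex::Regex::new("samwise|sam").unwrap();
    let m = re.find("samwise").unwrap();
    assert_eq!((m.start(), m.end()), (0, 7));
}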
// See: https://github.com/BurntSushi/ripgrep/issues/1247
#[test]
#[cfg(feature = "unicode-perl")]
fn regression_nfa_stops1() {
let re = ::regex::bytes::Regex::new(r"\bs(?:[ab])").unwrap();
assert_eq!(0, re.find_iter(b"s\xE4").count());
}
// See: https://github.com/rust-lang/regex/issues/640
#[cfg(feature = "unicode-case")]
matiter!(
flags_are_unset,
r"((?i)foo)|Bar",
"foo Foo bar Bar",
(0, 3),
(4, 7),
(12, 15)
);
// See: https://github.com/rust-lang/regex/issues/659
//
// Note that 'Ј' is not 'j', but cyrillic Je
// https://en.wikipedia.org/wiki/Je_(Cyrillic)
ismatch!(empty_group_match, r"()Ј01", "zЈ01", true);
matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5));
// See: https://github.com/rust-lang/regex/issues/862
mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1)));


@@ -0,0 +1,31 @@
// These tests are only run for the "default" test target because some of them
// can take quite a long time. Some of them take long enough that it's not
// practical to run them in debug mode. :-/
// See: https://oss-fuzz.com/testcase-detail/5673225499181056
//
// Ignored by default since it takes too long in debug mode (almost a minute).
#[test]
#[ignore]
fn fuzz1() {
regex!(r"1}{55}{0}*{1}{55}{55}{5}*{1}{55}+{56}|;**");
}
// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=26505
// See: https://github.com/rust-lang/regex/issues/722
#[test]
fn empty_any_errors_no_panic() {
assert!(regex_new!(r"\P{any}").is_err());
}
// This tests that a very large regex errors during compilation instead of
// using gratuitous amounts of memory. The specific problem is that the
// compiler wasn't accounting for the memory used by Unicode character classes
// correctly.
//
// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=33579
#[test]
fn big_regex_fails_to_compile() {
let pat = "[\u{0}\u{e}\u{2}\\w~~>[l\t\u{0}]p?<]{971158}";
assert!(regex_new!(pat).is_err());
}


@@ -0,0 +1,230 @@
macro_rules! replace(
($name:ident, $which:ident, $re:expr,
$search:expr, $replace:expr, $result:expr) => (
#[test]
fn $name() {
let re = regex!($re);
assert_eq!(re.$which(text!($search), $replace), text!($result));
}
);
);
replace!(first, replace, r"[0-9]", "age: 26", t!("Z"), "age: Z6");
replace!(plus, replace, r"[0-9]+", "age: 26", t!("Z"), "age: Z");
replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ");
replace!(
groups,
replace,
r"(?-u)(\S+)\s+(\S+)",
"w1 w2",
t!("$2 $1"),
"w2 w1"
);
replace!(
double_dollar,
replace,
r"(?-u)(\S+)\s+(\S+)",
"w1 w2",
t!("$2 $$1"),
"w2 $1"
);
// replace!(adjacent_index, replace,
// r"([^aeiouy])ies$", "skies", t!("$1y"), "sky");
replace!(
named,
replace_all,
r"(?-u)(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
"w1 w2 w3 w4",
t!("$last $first$space"),
"w2 w1 w4 w3"
);
replace!(
trim,
replace_all,
"^[ \t]+|[ \t]+$",
" \t trim me\t \t",
t!(""),
"trim me"
);
replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b");
// replace!(number_underscore, replace, r"(.)(.)", "ab", t!("$1_$2"), "a_b");
replace!(
simple_expand,
replace_all,
r"(?-u)(\w) (\w)",
"a b",
t!("$2 $1"),
"b a"
);
replace!(
literal_dollar1,
replace_all,
r"(?-u)(\w+) (\w+)",
"a b",
t!("$$1"),
"$1"
);
replace!(
literal_dollar2,
replace_all,
r"(?-u)(\w+) (\w+)",
"a b",
t!("$2 $$c $1"),
"b $c a"
);
replace!(
no_expand1,
replace,
r"(?-u)(\S+)\s+(\S+)",
"w1 w2",
no_expand!("$2 $1"),
"$2 $1"
);
replace!(
no_expand2,
replace,
r"(?-u)(\S+)\s+(\S+)",
"w1 w2",
no_expand!("$$1"),
"$$1"
);
use_!(Captures);
replace!(
closure_returning_reference,
replace,
r"([0-9]+)",
"age: 26",
|captures: &Captures<'_>| {
match_text!(captures.get(1).unwrap())[0..1].to_owned()
},
"age: 2"
);
replace!(
closure_returning_value,
replace,
r"[0-9]+",
"age: 26",
|_captures: &Captures<'_>| t!("Z").to_owned(),
"age: Z"
);
// See https://github.com/rust-lang/regex/issues/314
replace!(
match_at_start_replace_with_empty,
replace_all,
r"foo",
"foobar",
t!(""),
"bar"
);
// See https://github.com/rust-lang/regex/issues/393
replace!(single_empty_match, replace, r"^", "bar", t!("foo"), "foobar");
// See https://github.com/rust-lang/regex/issues/399
replace!(
capture_longest_possible_name,
replace_all,
r"(.)",
"b",
t!("${1}a $1a"),
"ba "
);
replace!(
impl_string,
replace,
r"[0-9]",
"age: 26",
t!("Z".to_string()),
"age: Z6"
);
replace!(
impl_string_ref,
replace,
r"[0-9]",
"age: 26",
t!(&"Z".to_string()),
"age: Z6"
);
replace!(
impl_cow_str_borrowed,
replace,
r"[0-9]",
"age: 26",
t!(std::borrow::Cow::<'_, str>::Borrowed("Z")),
"age: Z6"
);
replace!(
impl_cow_str_borrowed_ref,
replace,
r"[0-9]",
"age: 26",
t!(&std::borrow::Cow::<'_, str>::Borrowed("Z")),
"age: Z6"
);
replace!(
impl_cow_str_owned,
replace,
r"[0-9]",
"age: 26",
t!(std::borrow::Cow::<'_, str>::Owned("Z".to_string())),
"age: Z6"
);
replace!(
impl_cow_str_owned_ref,
replace,
r"[0-9]",
"age: 26",
t!(&std::borrow::Cow::<'_, str>::Owned("Z".to_string())),
"age: Z6"
);
replace!(
impl_vec_u8,
replace,
r"[0-9]",
"age: 26",
bytes!(vec![b'Z']),
"age: Z6"
);
replace!(
impl_vec_u8_ref,
replace,
r"[0-9]",
"age: 26",
bytes!(&vec![b'Z']),
"age: Z6"
);
replace!(
impl_cow_slice_borrowed,
replace,
r"[0-9]",
"age: 26",
bytes!(std::borrow::Cow::<'_, [u8]>::Borrowed(&[b'Z'])),
"age: Z6"
);
replace!(
impl_cow_slice_borrowed_ref,
replace,
r"[0-9]",
"age: 26",
bytes!(&std::borrow::Cow::<'_, [u8]>::Borrowed(&[b'Z'])),
"age: Z6"
);
replace!(
impl_cow_slice_owned,
replace,
r"[0-9]",
"age: 26",
bytes!(std::borrow::Cow::<'_, [u8]>::Owned(vec![b'Z'])),
"age: Z6"
);
replace!(
impl_cow_slice_owned_ref,
replace,
r"[0-9]",
"age: 26",
bytes!(&std::borrow::Cow::<'_, [u8]>::Owned(vec![b'Z'])),
"age: Z6"
);


@@ -0,0 +1,95 @@
macro_rules! searcher {
($name:ident, $re:expr, $haystack:expr) => (
searcher!($name, $re, $haystack, vec vec![]);
);
($name:ident, $re:expr, $haystack:expr, $($steps:expr,)*) => (
searcher!($name, $re, $haystack, vec vec![$($steps),*]);
);
($name:ident, $re:expr, $haystack:expr, $($steps:expr),*) => (
searcher!($name, $re, $haystack, vec vec![$($steps),*]);
);
($name:ident, $re:expr, $haystack:expr, vec $expect_steps:expr) => (
#[test]
#[allow(unused_imports)]
fn $name() {
searcher_expr! {{
use std::str::pattern::{Pattern, Searcher};
use std::str::pattern::SearchStep::{Match, Reject, Done};
let re = regex!($re);
let mut se = re.into_searcher($haystack);
let mut got_steps = vec![];
loop {
match se.next() {
Done => break,
step => { got_steps.push(step); }
}
}
assert_eq!(got_steps, $expect_steps);
}}
}
);
}
searcher!(searcher_empty_regex_empty_haystack, r"", "", Match(0, 0));
searcher!(
searcher_empty_regex,
r"",
"ab",
Match(0, 0),
Reject(0, 1),
Match(1, 1),
Reject(1, 2),
Match(2, 2)
);
searcher!(searcher_empty_haystack, r"\d", "");
searcher!(searcher_one_match, r"\d", "5", Match(0, 1));
searcher!(searcher_no_match, r"\d", "a", Reject(0, 1));
searcher!(
searcher_two_adjacent_matches,
r"\d",
"56",
Match(0, 1),
Match(1, 2)
);
searcher!(
searcher_two_non_adjacent_matches,
r"\d",
"5a6",
Match(0, 1),
Reject(1, 2),
Match(2, 3)
);
searcher!(searcher_reject_first, r"\d", "a6", Reject(0, 1), Match(1, 2));
searcher!(
searcher_one_zero_length_matches,
r"\d*",
"a1b2",
Match(0, 0), // ^
Reject(0, 1), // a
Match(1, 2), // a1
Reject(2, 3), // a1b
Match(3, 4), // a1b2
);
searcher!(
searcher_many_zero_length_matches,
r"\d*",
"a1bbb2",
Match(0, 0), // ^
Reject(0, 1), // a
Match(1, 2), // a1
Reject(2, 3), // a1b
Match(3, 3), // a1bb
Reject(3, 4), // a1bb
Match(4, 4), // a1bbb
Reject(4, 5), // a1bbb
Match(5, 6), // a1bbba
);
searcher!(
searcher_unicode,
r".+?",
"1Ⅱ2",
Match(0, 3),
Match(3, 4),
Match(4, 7),
Match(7, 8)
);


@@ -0,0 +1,67 @@
matset!(set1, &["a", "a"], "a", 0, 1);
matset!(set2, &["a", "a"], "ba", 0, 1);
matset!(set3, &["a", "b"], "a", 0);
matset!(set4, &["a", "b"], "b", 1);
matset!(set5, &["a|b", "b|a"], "b", 0, 1);
matset!(set6, &["foo", "oo"], "foo", 0, 1);
matset!(set7, &["^foo", "bar$"], "foo", 0);
matset!(set8, &["^foo", "bar$"], "foo bar", 0, 1);
matset!(set9, &["^foo", "bar$"], "bar", 1);
matset!(set10, &[r"[a-z]+$", "foo"], "01234 foo", 0, 1);
matset!(set11, &[r"[a-z]+$", "foo"], "foo 01234", 1);
matset!(set12, &[r".*?", "a"], "zzzzzza", 0, 1);
matset!(set13, &[r".*", "a"], "zzzzzza", 0, 1);
matset!(set14, &[r".*", "a"], "zzzzzz", 0);
matset!(set15, &[r"(?-u)\ba\b"], "hello a bye", 0);
matset!(set16, &["a"], "a", 0);
matset!(set17, &[".*a"], "a", 0);
matset!(set18, &["a", "β"], "β", 1);
// regexes that match the empty string
matset!(setempty1, &["", "a"], "abc", 0, 1);
matset!(setempty2, &["", "b"], "abc", 0, 1);
matset!(setempty3, &["", "z"], "abc", 0);
matset!(setempty4, &["a", ""], "abc", 0, 1);
matset!(setempty5, &["b", ""], "abc", 0, 1);
matset!(setempty6, &["z", ""], "abc", 1);
matset!(setempty7, &["b", "(?:)"], "abc", 0, 1);
matset!(setempty8, &["(?:)", "b"], "abc", 0, 1);
matset!(setempty9, &["c(?:)", "b"], "abc", 0, 1);
nomatset!(nset1, &["a", "a"], "b");
nomatset!(nset2, &["^foo", "bar$"], "bar foo");
nomatset!(
nset3,
{
let xs: &[&str] = &[];
xs
},
"a"
);
nomatset!(nset4, &[r"^rooted$", r"\.log$"], "notrooted");
// See: https://github.com/rust-lang/regex/issues/187
#[test]
fn regression_subsequent_matches() {
let set = regex_set!(&["ab", "b"]);
let text = text!("ba");
assert!(set.matches(text).matched(1));
assert!(set.matches(text).matched(1));
}
#[test]
fn get_set_patterns() {
let set = regex_set!(&["a", "b"]);
assert_eq!(vec!["a", "b"], set.patterns());
}
#[test]
fn len_and_empty() {
let empty = regex_set!(&[""; 0]);
assert_eq!(empty.len(), 0);
assert!(empty.is_empty());
let not_empty = regex_set!(&["ab", "b"]);
assert_eq!(not_empty.len(), 2);
assert!(!not_empty.is_empty());
}


@@ -0,0 +1,14 @@
macro_rules! shortmat {
($name:ident, $re:expr, $text:expr, $shortest_match:expr) => {
#[test]
fn $name() {
let text = text!($text);
let re = regex!($re);
assert_eq!($shortest_match, re.shortest_match(text));
}
};
}
shortmat!(t01, r"a+", r"aa", Some(1));
// Test that the reverse suffix optimization gets it right.
shortmat!(t02, r".*(?:abcd)+", r"abcdabcd", Some(4));


@@ -0,0 +1,6 @@
mat!(t01, r".*abcd", r"abcd", Some((0, 4)));
mat!(t02, r".*(?:abcd)+", r"abcd", Some((0, 4)));
mat!(t03, r".*(?:abcd)+", r"abcdabcd", Some((0, 8)));
mat!(t04, r".*(?:abcd)+", r"abcdxabcd", Some((0, 9)));
mat!(t05, r".*x(?:abcd)+", r"abcdxabcd", Some((0, 9)));
mat!(t06, r"[^abcd]*x(?:abcd)+", r"abcdxabcd", Some((4, 9)));


@@ -0,0 +1,56 @@
#![cfg_attr(feature = "pattern", feature(pattern))]
macro_rules! regex_new {
($re:expr) => {{
use regex::internal::ExecBuilder;
ExecBuilder::new($re)
.bounded_backtracking()
.build()
.map(|e| e.into_regex())
}};
}
macro_rules! regex {
($re:expr) => {
regex_new!($re).unwrap()
};
}
macro_rules! regex_set_new {
($re:expr) => {{
use regex::internal::ExecBuilder;
ExecBuilder::new_many($re)
.bounded_backtracking()
.build()
.map(|e| e.into_regex_set())
}};
}
macro_rules! regex_set {
($res:expr) => {
regex_set_new!($res).unwrap()
};
}
// Must come before other module definitions.
include!("macros_str.rs");
include!("macros.rs");
mod api;
mod api_str;
mod crazy;
mod flags;
mod fowler;
mod multiline;
mod noparse;
mod regression;
mod replace;
mod searcher;
mod set;
mod suffix_reverse;
#[cfg(feature = "unicode")]
mod unicode;
#[cfg(feature = "unicode-perl")]
mod word_boundary;
#[cfg(feature = "unicode-perl")]
mod word_boundary_unicode;


@@ -0,0 +1,55 @@
macro_rules! regex_new {
($re:expr) => {{
use regex::internal::ExecBuilder;
ExecBuilder::new($re)
.bounded_backtracking()
.only_utf8(false)
.build()
.map(|e| e.into_byte_regex())
}};
}
macro_rules! regex {
($re:expr) => {
regex_new!($re).unwrap()
};
}
macro_rules! regex_set_new {
($re:expr) => {{
use regex::internal::ExecBuilder;
ExecBuilder::new_many($re)
.bounded_backtracking()
.only_utf8(false)
.build()
.map(|e| e.into_byte_regex_set())
}};
}
macro_rules! regex_set {
($res:expr) => {
regex_set_new!($res).unwrap()
};
}
// Must come before other module definitions.
include!("macros_bytes.rs");
include!("macros.rs");
mod api;
mod bytes;
mod crazy;
mod flags;
mod fowler;
mod multiline;
mod noparse;
mod regression;
mod replace;
mod set;
mod suffix_reverse;
#[cfg(feature = "unicode")]
mod unicode;
#[cfg(feature = "unicode-perl")]
mod word_boundary;
#[cfg(feature = "unicode-perl")]
mod word_boundary_ascii;


@@ -0,0 +1,58 @@
#![cfg_attr(feature = "pattern", feature(pattern))]
macro_rules! regex_new {
($re:expr) => {{
use regex::internal::ExecBuilder;
ExecBuilder::new($re)
.bounded_backtracking()
.bytes(true)
.build()
.map(|e| e.into_regex())
}};
}
macro_rules! regex {
($re:expr) => {
regex_new!($re).unwrap()
};
}
macro_rules! regex_set_new {
($re:expr) => {{
use regex::internal::ExecBuilder;
ExecBuilder::new_many($re)
.bounded_backtracking()
.bytes(true)
.build()
.map(|e| e.into_regex_set())
}};
}
macro_rules! regex_set {
($res:expr) => {
regex_set_new!($res).unwrap()
};
}
// Must come before other module definitions.
include!("macros_str.rs");
include!("macros.rs");
mod api;
mod api_str;
mod crazy;
mod flags;
mod fowler;
mod multiline;
mod noparse;
mod regression;
mod replace;
mod searcher;
mod set;
mod suffix_reverse;
#[cfg(feature = "unicode")]
mod unicode;
#[cfg(feature = "unicode-perl")]
mod word_boundary;
#[cfg(feature = "unicode-perl")]
mod word_boundary_unicode;


@@ -0,0 +1,54 @@
/*
* This test is a minimal version of <rofl_0> and <subdiff_0>
*
* Once this bug gets fixed, uncomment rofl_0 and subdiff_0
* (in `tests/crates_regex.rs`).
#[test]
fn word_boundary_backtracking_default_mismatch() {
use regex::internal::ExecBuilder;
let backtrack_re = ExecBuilder::new(r"\b")
.bounded_backtracking()
.build()
.map(|exec| exec.into_regex())
.map_err(|err| format!("{}", err))
.unwrap();
let default_re = ExecBuilder::new(r"\b")
.build()
.map(|exec| exec.into_regex())
.map_err(|err| format!("{}", err))
.unwrap();
let input = "䅅\\u{a0}";
let fi1 = backtrack_re.find_iter(input);
let fi2 = default_re.find_iter(input);
for (m1, m2) in fi1.zip(fi2) {
assert_eq!(m1, m2);
}
}
*/
mod consistent;
mod crates_regex {
macro_rules! consistent {
($test_name:ident, $regex_src:expr) => {
#[test]
fn $test_name() {
use super::consistent::backends_are_consistent;
if option_env!("RUST_REGEX_RANDOM_TEST").is_some() {
match backends_are_consistent($regex_src) {
Ok(_) => {}
Err(err) => panic!("{}", err),
}
}
}
};
}
include!("crates_regex.rs");
}


@@ -0,0 +1,222 @@
#![cfg_attr(feature = "pattern", feature(pattern))]
use regex;
// Due to macro scoping rules, this definition only applies for the modules
// defined below. Effectively, it allows us to use the same tests for both
// native and dynamic regexes.
//
// This is also used to test the various matching engines. This one exercises
// the normal code path which automatically chooses the engine based on the
// regex and the input. Other dynamic tests explicitly set the engine to use.
macro_rules! regex_new {
($re:expr) => {{
use regex::Regex;
Regex::new($re)
}};
}
macro_rules! regex {
($re:expr) => {
regex_new!($re).unwrap()
};
}
macro_rules! regex_set_new {
($re:expr) => {{
use regex::RegexSet;
RegexSet::new($re)
}};
}
macro_rules! regex_set {
($res:expr) => {
regex_set_new!($res).unwrap()
};
}
// Must come before other module definitions.
include!("macros_str.rs");
include!("macros.rs");
mod api;
mod api_str;
mod crazy;
mod flags;
mod fowler;
mod misc;
mod multiline;
mod noparse;
mod regression;
mod regression_fuzz;
mod replace;
mod searcher;
mod set;
mod shortest_match;
mod suffix_reverse;
#[cfg(feature = "unicode")]
mod unicode;
#[cfg(feature = "unicode-perl")]
mod word_boundary;
#[cfg(feature = "unicode-perl")]
mod word_boundary_unicode;
#[test]
fn disallow_non_utf8() {
assert!(regex::Regex::new(r"(?-u)\xFF").is_err());
assert!(regex::Regex::new(r"(?-u).").is_err());
assert!(regex::Regex::new(r"(?-u)[\xFF]").is_err());
assert!(regex::Regex::new(r"(?-u)☃").is_err());
}
#[test]
fn disallow_octal() {
assert!(regex::Regex::new(r"\0").is_err());
}
#[test]
fn allow_octal() {
assert!(regex::RegexBuilder::new(r"\0").octal(true).build().is_ok());
}
#[test]
fn oibits() {
use regex::bytes;
use regex::{Regex, RegexBuilder, RegexSet, RegexSetBuilder};
use std::panic::{RefUnwindSafe, UnwindSafe};
fn assert_send<T: Send>() {}
fn assert_sync<T: Sync>() {}
fn assert_unwind_safe<T: UnwindSafe>() {}
fn assert_ref_unwind_safe<T: RefUnwindSafe>() {}
assert_send::<Regex>();
assert_sync::<Regex>();
assert_unwind_safe::<Regex>();
assert_ref_unwind_safe::<Regex>();
assert_send::<RegexBuilder>();
assert_sync::<RegexBuilder>();
assert_unwind_safe::<RegexBuilder>();
assert_ref_unwind_safe::<RegexBuilder>();
assert_send::<bytes::Regex>();
assert_sync::<bytes::Regex>();
assert_unwind_safe::<bytes::Regex>();
assert_ref_unwind_safe::<bytes::Regex>();
assert_send::<bytes::RegexBuilder>();
assert_sync::<bytes::RegexBuilder>();
assert_unwind_safe::<bytes::RegexBuilder>();
assert_ref_unwind_safe::<bytes::RegexBuilder>();
assert_send::<RegexSet>();
assert_sync::<RegexSet>();
assert_unwind_safe::<RegexSet>();
assert_ref_unwind_safe::<RegexSet>();
assert_send::<RegexSetBuilder>();
assert_sync::<RegexSetBuilder>();
assert_unwind_safe::<RegexSetBuilder>();
assert_ref_unwind_safe::<RegexSetBuilder>();
assert_send::<bytes::RegexSet>();
assert_sync::<bytes::RegexSet>();
assert_unwind_safe::<bytes::RegexSet>();
assert_ref_unwind_safe::<bytes::RegexSet>();
assert_send::<bytes::RegexSetBuilder>();
assert_sync::<bytes::RegexSetBuilder>();
assert_unwind_safe::<bytes::RegexSetBuilder>();
assert_ref_unwind_safe::<bytes::RegexSetBuilder>();
}
// See: https://github.com/rust-lang/regex/issues/568
#[test]
fn oibits_regression() {
use regex::Regex;
use std::panic;
let _ = panic::catch_unwind(|| Regex::new("a").unwrap());
}
// See: https://github.com/rust-lang/regex/issues/750
#[test]
#[cfg(target_pointer_width = "64")]
fn regex_is_reasonably_small() {
use std::mem::size_of;
use regex::bytes;
use regex::{Regex, RegexSet};
assert_eq!(16, size_of::<Regex>());
assert_eq!(16, size_of::<RegexSet>());
assert_eq!(16, size_of::<bytes::Regex>());
assert_eq!(16, size_of::<bytes::RegexSet>());
}
// See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
// See: CVE-2022-24713
//
// We test that our regex compiler will correctly return a "too big" error when
// we try to use a very large repetition on an *empty* sub-expression.
//
// At the time this test was written, the regex compiler does not represent
// empty sub-expressions with any bytecode instructions. In effect, it's an
// "optimization" to leave them out, since they would otherwise correspond
// to an unconditional JUMP in the regex bytecode (i.e., an unconditional
// epsilon transition in the NFA graph). Therefore, an empty sub-expression
// represents an interesting case for the compiler's size limits. Since it
// doesn't actually contribute any additional memory to the compiled regex
// instructions, the size limit machinery never detects it. Instead, it just
// dumbly tries to compile the empty sub-expression N times, where N is the
// repetition size.
//
// When N is very large, this will cause the compiler to essentially spin and
// do nothing for a decently large amount of time. It causes the regex to take
// quite a bit of time to compile, despite the concrete syntax of the regex
// being quite small.
//
// The degree to which this is actually a problem is somewhat of a judgment
// call. Some regexes simply take a long time to compile. But in general, you
// should be able to reasonably control this by setting lower or higher size
// limits on the compiled object size. But this mitigation doesn't work at all
// for this case.
//
// This particular test is somewhat narrow. It merely checks that regex
// compilation will, at some point, return a "too big" error. Before the
// fix landed, this test would eventually fail because the regex would be
// successfully compiled (after enough time elapsed). So while this test
// doesn't check that we exit in a reasonable amount of time, it does at least
// check that we are properly returning an error at some point.
#[test]
fn big_empty_regex_fails() {
use regex::Regex;
let result = Regex::new("(?:){4294967295}");
assert!(result.is_err());
}
// Below is a "billion laughs" variant of the previous test case.
#[test]
fn big_empty_reps_chain_regex_fails() {
use regex::Regex;
let result = Regex::new("(?:){64}{64}{64}{64}{64}{64}");
assert!(result.is_err());
}
// Below is another situation where a zero-length sub-expression can be
// introduced.
#[test]
fn big_zero_reps_regex_fails() {
use regex::Regex;
let result = Regex::new(r"x{0}{4294967295}");
assert!(result.is_err());
}
// Testing another case for completeness.
#[test]
fn empty_alt_regex_fails() {
use regex::Regex;
let result = Regex::new(r"(?:|){4294967295}");
assert!(result.is_err());
}
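// Companion sketch (not from the upstream suite): the size-limit mitigation
// mentioned in the CVE-2022-24713 comment above is exposed as
// `RegexBuilder::size_limit`. The pattern and the 100-byte limit here are
// arbitrary illustrative values.
#[test]
fn size_limit_rejects_oversized_compilation() {
    use regex::RegexBuilder;
    // Compiles fine under the default, generous size limit...
    assert!(RegexBuilder::new(r"\w{100}").build().is_ok());
    // ...but is rejected once the limit is made very small.
    assert!(RegexBuilder::new(r"\w{100}").size_limit(100).build().is_err());
}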

View File

@@ -0,0 +1,75 @@
macro_rules! regex_new {
($re:expr) => {{
use regex::bytes::Regex;
Regex::new($re)
}};
}
macro_rules! regex_set_new {
($res:expr) => {{
use regex::bytes::RegexSet;
RegexSet::new($res)
}};
}
macro_rules! regex {
($re:expr) => {
regex_new!($re).unwrap()
};
}
macro_rules! regex_set {
($res:expr) => {
regex_set_new!($res).unwrap()
};
}
// Must come before other module definitions.
include!("macros_bytes.rs");
include!("macros.rs");
// A silly wrapper to make it possible to write and match raw bytes.
struct R<'a>(&'a [u8]);
impl<'a> R<'a> {
fn as_bytes(&self) -> &'a [u8] {
self.0
}
}
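// The shared byte-oriented test macros are assumed to call `.as_bytes()` on
// their haystack argument, so wrapping a raw `&[u8]` in `R` lets the
// non-UTF-8 byte strings below be passed where a haystack is expected.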
// See: https://github.com/rust-lang/regex/issues/321
//
// These tests are here because they do not have the same behavior in every
// regex engine.
mat!(invalid_utf8_nfa1, r".", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), Some((2, 3)));
mat!(invalid_utf8_nfa2, r"${2}ä", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), None);
mat!(
invalid_utf8_nfa3,
r".",
R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
Some((1, 3))
);
mat!(
invalid_utf8_nfa4,
r"${2}ä",
R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
None
);
mod api;
mod bytes;
mod crazy;
mod flags;
mod fowler;
mod multiline;
mod noparse;
mod regression;
mod replace;
mod set;
mod shortest_match;
mod suffix_reverse;
#[cfg(feature = "unicode")]
mod unicode;
#[cfg(feature = "unicode-perl")]
mod word_boundary;
#[cfg(feature = "unicode-perl")]
mod word_boundary_unicode;

View File

@@ -0,0 +1,50 @@
#![cfg_attr(feature = "pattern", feature(pattern))]
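// This harness appears to differ from the default one only in how regexes are
// built: everything goes through the internal `ExecBuilder` with `.nfa()`,
// which forces the NFA (PikeVM) engine instead of letting the crate choose an
// engine automatically.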
macro_rules! regex_new {
($re:expr) => {{
use regex::internal::ExecBuilder;
ExecBuilder::new($re).nfa().build().map(|e| e.into_regex())
}};
}
macro_rules! regex {
($re:expr) => {
regex_new!($re).unwrap()
};
}
macro_rules! regex_set_new {
($re:expr) => {{
use regex::internal::ExecBuilder;
ExecBuilder::new_many($re).nfa().build().map(|e| e.into_regex_set())
}};
}
macro_rules! regex_set {
($res:expr) => {
regex_set_new!($res).unwrap()
};
}
// Must come before other module definitions.
include!("macros_str.rs");
include!("macros.rs");
mod api;
mod api_str;
mod crazy;
mod flags;
mod fowler;
mod multiline;
mod noparse;
mod regression;
mod replace;
mod searcher;
mod set;
mod suffix_reverse;
#[cfg(feature = "unicode")]
mod unicode;
#[cfg(feature = "unicode-perl")]
mod word_boundary;
#[cfg(feature = "unicode-perl")]
mod word_boundary_unicode;

View File

@@ -0,0 +1,55 @@
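// This harness builds byte regexes through the internal `ExecBuilder`:
// `.nfa()` forces the NFA engine, and `.only_utf8(false)` is understood to
// permit haystacks and matches that are not valid UTF-8, which is why the
// byte-oriented macros and the `bytes` module are included below.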
macro_rules! regex_new {
($re:expr) => {{
use regex::internal::ExecBuilder;
ExecBuilder::new($re)
.nfa()
.only_utf8(false)
.build()
.map(|e| e.into_byte_regex())
}};
}
macro_rules! regex {
($re:expr) => {
regex_new!($re).unwrap()
};
}
macro_rules! regex_set_new {
($re:expr) => {{
use regex::internal::ExecBuilder;
ExecBuilder::new_many($re)
.nfa()
.only_utf8(false)
.build()
.map(|e| e.into_byte_regex_set())
}};
}
macro_rules! regex_set {
($res:expr) => {
regex_set_new!($res).unwrap()
};
}
// Must come before other module definitions.
include!("macros_bytes.rs");
include!("macros.rs");
mod api;
mod bytes;
mod crazy;
mod flags;
mod fowler;
mod multiline;
mod noparse;
mod regression;
mod replace;
mod set;
mod suffix_reverse;
#[cfg(feature = "unicode")]
mod unicode;
#[cfg(feature = "unicode-perl")]
mod word_boundary;
#[cfg(feature = "unicode-perl")]
mod word_boundary_unicode;

View File

@@ -0,0 +1,54 @@
#![cfg_attr(feature = "pattern", feature(pattern))]
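// This harness also forces the NFA engine, but with `.bytes(true)`, which is
// understood to compile the program over raw bytes rather than Unicode
// codepoints while still exposing the `&str` API (note `into_regex`, not
// `into_byte_regex`).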
macro_rules! regex_new {
($re:expr) => {{
use regex::internal::ExecBuilder;
ExecBuilder::new($re).nfa().bytes(true).build().map(|e| e.into_regex())
}};
}
macro_rules! regex {
($re:expr) => {
regex_new!($re).unwrap()
};
}
macro_rules! regex_set_new {
($re:expr) => {{
use regex::internal::ExecBuilder;
ExecBuilder::new_many($re)
.nfa()
.bytes(true)
.build()
.map(|e| e.into_regex_set())
}};
}
macro_rules! regex_set {
($res:expr) => {
regex_set_new!($res).unwrap()
};
}
// Must come before other module definitions.
include!("macros_str.rs");
include!("macros.rs");
mod api;
mod api_str;
mod crazy;
mod flags;
mod fowler;
mod multiline;
mod noparse;
mod regression;
mod replace;
mod searcher;
mod set;
mod suffix_reverse;
#[cfg(feature = "unicode")]
mod unicode;
#[cfg(feature = "unicode-perl")]
mod word_boundary;
#[cfg(feature = "unicode-perl")]
mod word_boundary_unicode;

View File

@@ -0,0 +1,251 @@
mat!(uni_literal, r"☃", "", Some((0, 3)));
mat!(uni_literal_plus, r"☃+", "", Some((0, 3)));
mat!(uni_literal_casei_plus, r"(?i)☃+", "", Some((0, 3)));
mat!(uni_class_plus, r"[☃Ⅰ]+", "", Some((0, 3)));
mat!(uni_one, r"\pN", "", Some((0, 3)));
mat!(uni_mixed, r"\pN+", "1Ⅱ2", Some((0, 8)));
mat!(uni_not, r"\PN+", "ab", Some((0, 2)));
mat!(uni_not_class, r"[\PN]+", "ab", Some((0, 2)));
mat!(uni_not_class_neg, r"[^\PN]+", "ab", Some((2, 5)));
mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2)));
mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8)));
mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)));
mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)));
mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)));
// Test the Unicode friendliness of Perl character classes.
mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)));
mat!(uni_perl_w_not, r"\w+", "", None);
mat!(uni_perl_w_neg, r"\W+", "", Some((0, 3)));
mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8)));
mat!(uni_perl_d_not, r"\d+", "", None);
mat!(uni_perl_d_neg, r"\D+", "", Some((0, 3)));
mat!(uni_perl_s, r"\s+", "", Some((0, 3)));
mat!(uni_perl_s_not, r"\s+", "", None);
mat!(uni_perl_s_neg, r"\S+", "", Some((0, 3)));
// And do the same for word boundaries.
mat!(uni_boundary_none, r"\d\b", "", None);
mat!(uni_boundary_ogham, r"\d\b", "6\u{1680}", Some((0, 1)));
mat!(uni_not_boundary_none, r"\d\B", "", Some((0, 1)));
mat!(uni_not_boundary_ogham, r"\d\B", "6\u{1680}", None);
// Test general categories.
//
// We should test more, but there's a lot. Write a script to generate more of
// these tests.
mat!(uni_class_gencat_cased_letter, r"\p{Cased_Letter}", "", Some((0, 3)));
mat!(
uni_class_gencat_close_punctuation,
r"\p{Close_Punctuation}",
"",
Some((0, 3))
);
mat!(
uni_class_gencat_connector_punctuation,
r"\p{Connector_Punctuation}",
"",
Some((0, 3))
);
mat!(uni_class_gencat_control, r"\p{Control}", "\u{9f}", Some((0, 2)));
mat!(
uni_class_gencat_currency_symbol,
r"\p{Currency_Symbol}",
"",
Some((0, 3))
);
mat!(
uni_class_gencat_dash_punctuation,
r"\p{Dash_Punctuation}",
"",
Some((0, 3))
);
mat!(uni_class_gencat_decimal_numer, r"\p{Decimal_Number}", "𑓙", Some((0, 4)));
mat!(
uni_class_gencat_enclosing_mark,
r"\p{Enclosing_Mark}",
"\u{A672}",
Some((0, 3))
);
mat!(
uni_class_gencat_final_punctuation,
r"\p{Final_Punctuation}",
"",
Some((0, 3))
);
mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4)));
// See: https://github.com/rust-lang/regex/issues/719
mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4)));
mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4)));
mat!(
uni_class_gencat_initial_punctuation,
r"\p{Initial_Punctuation}",
"",
Some((0, 3))
);
mat!(uni_class_gencat_letter, r"\p{Letter}", "Έ", Some((0, 2)));
mat!(uni_class_gencat_letter_number, r"\p{Letter_Number}", "", Some((0, 3)));
mat!(
uni_class_gencat_line_separator,
r"\p{Line_Separator}",
"\u{2028}",
Some((0, 3))
);
mat!(
uni_class_gencat_lowercase_letter,
r"\p{Lowercase_Letter}",
"ϛ",
Some((0, 2))
);
mat!(uni_class_gencat_mark, r"\p{Mark}", "\u{E01EF}", Some((0, 4)));
mat!(uni_class_gencat_math, r"\p{Math}", "", Some((0, 3)));
mat!(
uni_class_gencat_modifier_letter,
r"\p{Modifier_Letter}",
"𖭃",
Some((0, 4))
);
mat!(
uni_class_gencat_modifier_symbol,
r"\p{Modifier_Symbol}",
"🏿",
Some((0, 4))
);
mat!(
uni_class_gencat_nonspacing_mark,
r"\p{Nonspacing_Mark}",
"\u{1E94A}",
Some((0, 4))
);
mat!(uni_class_gencat_number, r"\p{Number}", "", Some((0, 3)));
mat!(
uni_class_gencat_open_punctuation,
r"\p{Open_Punctuation}",
"",
Some((0, 3))
);
mat!(uni_class_gencat_other, r"\p{Other}", "\u{bc9}", Some((0, 3)));
mat!(uni_class_gencat_other_letter, r"\p{Other_Letter}", "", Some((0, 3)));
mat!(uni_class_gencat_other_number, r"\p{Other_Number}", "", Some((0, 3)));
mat!(
uni_class_gencat_other_punctuation,
r"\p{Other_Punctuation}",
"𞥞",
Some((0, 4))
);
mat!(uni_class_gencat_other_symbol, r"\p{Other_Symbol}", "", Some((0, 3)));
mat!(
uni_class_gencat_paragraph_separator,
r"\p{Paragraph_Separator}",
"\u{2029}",
Some((0, 3))
);
mat!(
uni_class_gencat_private_use,
r"\p{Private_Use}",
"\u{10FFFD}",
Some((0, 4))
);
mat!(uni_class_gencat_punctuation, r"\p{Punctuation}", "𑁍", Some((0, 4)));
mat!(uni_class_gencat_separator, r"\p{Separator}", "\u{3000}", Some((0, 3)));
mat!(
uni_class_gencat_space_separator,
r"\p{Space_Separator}",
"\u{205F}",
Some((0, 3))
);
mat!(
uni_class_gencat_spacing_mark,
r"\p{Spacing_Mark}",
"\u{16F7E}",
Some((0, 4))
);
mat!(uni_class_gencat_symbol, r"\p{Symbol}", "", Some((0, 3)));
mat!(
uni_class_gencat_titlecase_letter,
r"\p{Titlecase_Letter}",
"",
Some((0, 3))
);
mat!(
uni_class_gencat_unassigned,
r"\p{Unassigned}",
"\u{10FFFF}",
Some((0, 4))
);
mat!(
uni_class_gencat_uppercase_letter,
r"\p{Uppercase_Letter}",
"",
Some((0, 3))
);
// Test a smattering of properties.
mat!(uni_class_prop_emoji1, r"\p{Emoji}", "\u{23E9}", Some((0, 3)));
mat!(uni_class_prop_emoji2, r"\p{emoji}", "\u{1F21A}", Some((0, 4)));
mat!(
uni_class_prop_picto1,
r"\p{extendedpictographic}",
"\u{1FA6E}",
Some((0, 4))
);
mat!(
uni_class_prop_picto2,
r"\p{extendedpictographic}",
"\u{1FFFD}",
Some((0, 4))
);
// grapheme_cluster_break
mat!(
uni_class_gcb_prepend,
r"\p{grapheme_cluster_break=prepend}",
"\u{11D46}",
Some((0, 4))
);
mat!(
uni_class_gcb_ri1,
r"\p{gcb=regional_indicator}",
"\u{1F1E6}",
Some((0, 4))
);
mat!(uni_class_gcb_ri2, r"\p{gcb=ri}", "\u{1F1E7}", Some((0, 4)));
mat!(
uni_class_gcb_ri3,
r"\p{gcb=regionalindicator}",
"\u{1F1FF}",
Some((0, 4))
);
mat!(uni_class_gcb_lvt, r"\p{gcb=lvt}", "\u{C989}", Some((0, 3)));
mat!(uni_class_gcb_zwj, r"\p{gcb=zwj}", "\u{200D}", Some((0, 3)));
// word_break
mat!(uni_class_wb1, r"\p{word_break=Hebrew_Letter}", "\u{FB46}", Some((0, 3)));
mat!(uni_class_wb2, r"\p{wb=hebrewletter}", "\u{FB46}", Some((0, 3)));
mat!(uni_class_wb3, r"\p{wb=ExtendNumLet}", "\u{FF3F}", Some((0, 3)));
mat!(uni_class_wb4, r"\p{wb=WSegSpace}", "\u{3000}", Some((0, 3)));
mat!(uni_class_wb5, r"\p{wb=numeric}", "\u{1E950}", Some((0, 4)));
// sentence_break
mat!(uni_class_sb1, r"\p{sentence_break=Lower}", "\u{0469}", Some((0, 2)));
mat!(uni_class_sb2, r"\p{sb=lower}", "\u{0469}", Some((0, 2)));
mat!(uni_class_sb3, r"\p{sb=Close}", "\u{FF60}", Some((0, 3)));
mat!(uni_class_sb4, r"\p{sb=Close}", "\u{1F677}", Some((0, 4)));
mat!(uni_class_sb5, r"\p{sb=SContinue}", "\u{FF64}", Some((0, 3)));
// Test 'Vithkuqi' support, which was added in Unicode 14.
// See: https://github.com/rust-lang/regex/issues/877
mat!(
uni_vithkuqi_literal_upper,
r"(?i)^\u{10570}$",
"\u{10570}",
Some((0, 4))
);
mat!(
uni_vithkuqi_literal_lower,
r"(?i)^\u{10570}$",
"\u{10597}",
Some((0, 4))
);
mat!(uni_vithkuqi_word_upper, r"^\w$", "\u{10570}", Some((0, 4)));
mat!(uni_vithkuqi_word_lower, r"^\w$", "\u{10597}", Some((0, 4)));

View File

@@ -0,0 +1,89 @@
// Many of these are cribbed from RE2's test suite.
matiter!(wb1, r"\b", "");
matiter!(wb2, r"\b", "a", (0, 0), (1, 1));
matiter!(wb3, r"\b", "ab", (0, 0), (2, 2));
matiter!(wb4, r"^\b", "ab", (0, 0));
matiter!(wb5, r"\b$", "ab", (2, 2));
matiter!(wb6, r"^\b$", "ab");
matiter!(wb7, r"\bbar\b", "nobar bar foo bar", (6, 9), (14, 17));
matiter!(wb8, r"a\b", "faoa x", (3, 4));
matiter!(wb9, r"\bbar", "bar x", (0, 3));
matiter!(wb10, r"\bbar", "foo\nbar x", (4, 7));
matiter!(wb11, r"bar\b", "foobar", (3, 6));
matiter!(wb12, r"bar\b", "foobar\nxxx", (3, 6));
matiter!(wb13, r"(foo|bar|[A-Z])\b", "foo", (0, 3));
matiter!(wb14, r"(foo|bar|[A-Z])\b", "foo\n", (0, 3));
matiter!(wb15, r"\b(foo|bar|[A-Z])", "foo", (0, 3));
matiter!(wb16, r"\b(foo|bar|[A-Z])\b", "X", (0, 1));
matiter!(wb17, r"\b(foo|bar|[A-Z])\b", "XY");
matiter!(wb18, r"\b(foo|bar|[A-Z])\b", "bar", (0, 3));
matiter!(wb19, r"\b(foo|bar|[A-Z])\b", "foo", (0, 3));
matiter!(wb20, r"\b(foo|bar|[A-Z])\b", "foo\n", (0, 3));
matiter!(wb21, r"\b(foo|bar|[A-Z])\b", "ffoo bbar N x", (10, 11));
matiter!(wb22, r"\b(fo|foo)\b", "fo", (0, 2));
matiter!(wb23, r"\b(fo|foo)\b", "foo", (0, 3));
matiter!(wb24, r"\b\b", "");
matiter!(wb25, r"\b\b", "a", (0, 0), (1, 1));
matiter!(wb26, r"\b$", "");
matiter!(wb27, r"\b$", "x", (1, 1));
matiter!(wb28, r"\b$", "y x", (3, 3));
matiter!(wb29, r"\b.$", "x", (0, 1));
matiter!(wb30, r"^\b(fo|foo)\b", "fo", (0, 2));
matiter!(wb31, r"^\b(fo|foo)\b", "foo", (0, 3));
matiter!(wb32, r"^\b$", "");
matiter!(wb33, r"^\b$", "x");
matiter!(wb34, r"^\b.$", "x", (0, 1));
matiter!(wb35, r"^\b.\b$", "x", (0, 1));
matiter!(wb36, r"^^^^^\b$$$$$", "");
matiter!(wb37, r"^^^^^\b.$$$$$", "x", (0, 1));
matiter!(wb38, r"^^^^^\b$$$$$", "x");
matiter!(wb39, r"^^^^^\b\b\b.\b\b\b$$$$$", "x", (0, 1));
matiter!(wb40, r"\b.+\b", "$$abc$$", (2, 5));
matiter!(wb41, r"\b", "a b c", (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
matiter!(nb1, r"\Bfoo\B", "n foo xfoox that", (7, 10));
matiter!(nb2, r"a\B", "faoa x", (1, 2));
matiter!(nb3, r"\Bbar", "bar x");
matiter!(nb4, r"\Bbar", "foo\nbar x");
matiter!(nb5, r"bar\B", "foobar");
matiter!(nb6, r"bar\B", "foobar\nxxx");
matiter!(nb7, r"(foo|bar|[A-Z])\B", "foox", (0, 3));
matiter!(nb8, r"(foo|bar|[A-Z])\B", "foo\n");
matiter!(nb9, r"\B", "", (0, 0));
matiter!(nb10, r"\B", "x");
matiter!(nb11, r"\B(foo|bar|[A-Z])", "foo");
matiter!(nb12, r"\B(foo|bar|[A-Z])\B", "xXy", (1, 2));
matiter!(nb13, r"\B(foo|bar|[A-Z])\B", "XY");
matiter!(nb14, r"\B(foo|bar|[A-Z])\B", "XYZ", (1, 2));
matiter!(nb15, r"\B(foo|bar|[A-Z])\B", "abara", (1, 4));
matiter!(nb16, r"\B(foo|bar|[A-Z])\B", "xfoo_", (1, 4));
matiter!(nb17, r"\B(foo|bar|[A-Z])\B", "xfoo\n");
matiter!(nb18, r"\B(foo|bar|[A-Z])\B", "foo bar vNX", (9, 10));
matiter!(nb19, r"\B(fo|foo)\B", "xfoo", (1, 3));
matiter!(nb20, r"\B(foo|fo)\B", "xfooo", (1, 4));
matiter!(nb21, r"\B\B", "", (0, 0));
matiter!(nb22, r"\B\B", "x");
matiter!(nb23, r"\B$", "", (0, 0));
matiter!(nb24, r"\B$", "x");
matiter!(nb25, r"\B$", "y x");
matiter!(nb26, r"\B.$", "x");
matiter!(nb27, r"^\B(fo|foo)\B", "fo");
matiter!(nb28, r"^\B(fo|foo)\B", "foo");
matiter!(nb29, r"^\B", "", (0, 0));
matiter!(nb30, r"^\B", "x");
matiter!(nb31, r"^\B\B", "", (0, 0));
matiter!(nb32, r"^\B\B", "x");
matiter!(nb33, r"^\B$", "", (0, 0));
matiter!(nb34, r"^\B$", "x");
matiter!(nb35, r"^\B.$", "x");
matiter!(nb36, r"^\B.\B$", "x");
matiter!(nb37, r"^^^^^\B$$$$$", "", (0, 0));
matiter!(nb38, r"^^^^^\B.$$$$$", "x");
matiter!(nb39, r"^^^^^\B$$$$$", "x");
// These work for both Unicode and ASCII because all matches are reported as
// byte offsets, and « and » do not correspond to word boundaries at either
// the character or byte level.
matiter!(unicode1, r"\bx\b", "«x", (2, 3));
matiter!(unicode2, r"\bx\b", "x»", (0, 1));

View File

@@ -0,0 +1,9 @@
// ASCII word boundaries are completely oblivious to Unicode characters.
// For Unicode word boundaries, the tests are precisely inverted.
matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));
matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ");
matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));
// We still get Unicode word boundaries by default in byte regexes.
matiter!(unicode1, r"\bx\b", "áxβ");
matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));

View File

@@ -0,0 +1,6 @@
// Unicode word boundaries know about Unicode characters.
// For ASCII word boundaries, the tests are precisely inverted.
matiter!(unicode1, r"\bx\b", "áxβ");
matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));
matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));