283 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Rust
		
	
	
	
	
	
			
		
		
	
	
			283 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Rust
		
	
	
	
	
	
| // This is the backtracking matching engine. It has the same exact capability
 | |
| // as the full NFA simulation, except it is artificially restricted to small
 | |
| // regexes on small inputs because of its memory requirements.
 | |
| //
 | |
| // In particular, this is a *bounded* backtracking engine. It retains worst
 | |
| // case linear time by keeping track of the states that it has visited (using a
 | |
| // bitmap). Namely, once a state is visited, it is never visited again. Since a
 | |
| // state is keyed by `(instruction index, input index)`, we have that its time
 | |
| // complexity is `O(mn)` (i.e., linear in the size of the search text).
 | |
| //
 | |
| // The backtracking engine can beat out the NFA simulation on small
 | |
| // regexes/inputs because it doesn't have to keep track of multiple copies of
 | |
| // the capture groups. In benchmarks, the backtracking engine is roughly twice
 | |
| // as fast as the full NFA simulation. Note though that its performance doesn't
 | |
| // scale, even if you're willing to live with the memory requirements. Namely,
 | |
| // the bitset has to be zeroed on each execution, which becomes quite expensive
 | |
| // on large bitsets.
 | |
| 
 | |
| use crate::exec::ProgramCache;
 | |
| use crate::input::{Input, InputAt};
 | |
| use crate::prog::{InstPtr, Program};
 | |
| use crate::re_trait::Slot;
 | |
| 
 | |
| type Bits = u32;
 | |
| 
 | |
| const BIT_SIZE: usize = 32;
 | |
| const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB
 | |
| 
 | |
| /// Returns true iff the given regex and input should be executed by this
 | |
| /// engine with reasonable memory usage.
 | |
| pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
 | |
|     // Total memory usage in bytes is determined by:
 | |
|     //
 | |
|     //   ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32))
 | |
|     //
 | |
|     // The actual limit picked is pretty much a heuristic.
 | |
|     // See: https://github.com/rust-lang/regex/issues/215
 | |
|     let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4;
 | |
|     size <= MAX_SIZE_BYTES
 | |
| }
 | |
| 
 | |
| /// A backtracking matching engine.
 | |
| #[derive(Debug)]
 | |
| pub struct Bounded<'a, 'm, 'r, 's, I> {
 | |
|     prog: &'r Program,
 | |
|     input: I,
 | |
|     matches: &'m mut [bool],
 | |
|     slots: &'s mut [Slot],
 | |
|     m: &'a mut Cache,
 | |
| }
 | |
| 
 | |
| /// Shared cached state between multiple invocations of a backtracking engine
 | |
| /// in the same thread.
 | |
| #[derive(Clone, Debug)]
 | |
| pub struct Cache {
 | |
|     jobs: Vec<Job>,
 | |
|     visited: Vec<Bits>,
 | |
| }
 | |
| 
 | |
| impl Cache {
 | |
|     /// Create new empty cache for the backtracking engine.
 | |
|     pub fn new(_prog: &Program) -> Self {
 | |
|         Cache { jobs: vec![], visited: vec![] }
 | |
|     }
 | |
| }
 | |
| 
 | |
| /// A job is an explicit unit of stack space in the backtracking engine.
 | |
| ///
 | |
| /// The "normal" representation is a single state transition, which corresponds
 | |
| /// to an NFA state and a character in the input. However, the backtracking
 | |
| /// engine must keep track of old capture group values. We use the explicit
 | |
| /// stack to do it.
 | |
| #[derive(Clone, Copy, Debug)]
 | |
| enum Job {
 | |
|     Inst { ip: InstPtr, at: InputAt },
 | |
|     SaveRestore { slot: usize, old_pos: Option<usize> },
 | |
| }
 | |
| 
 | |
| impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
 | |
|     /// Execute the backtracking matching engine.
 | |
|     ///
 | |
|     /// If there's a match, `exec` returns `true` and populates the given
 | |
|     /// captures accordingly.
 | |
|     pub fn exec(
 | |
|         prog: &'r Program,
 | |
|         cache: &ProgramCache,
 | |
|         matches: &'m mut [bool],
 | |
|         slots: &'s mut [Slot],
 | |
|         input: I,
 | |
|         start: usize,
 | |
|         end: usize,
 | |
|     ) -> bool {
 | |
|         let mut cache = cache.borrow_mut();
 | |
|         let cache = &mut cache.backtrack;
 | |
|         let start = input.at(start);
 | |
|         let mut b = Bounded { prog, input, matches, slots, m: cache };
 | |
|         b.exec_(start, end)
 | |
|     }
 | |
| 
 | |
|     /// Clears the cache such that the backtracking engine can be executed
 | |
|     /// on some input of fixed length.
 | |
|     fn clear(&mut self) {
 | |
|         // Reset the job memory so that we start fresh.
 | |
|         self.m.jobs.clear();
 | |
| 
 | |
|         // Now we need to clear the bit state set.
 | |
|         // We do this by figuring out how much space we need to keep track
 | |
|         // of the states we've visited.
 | |
|         // Then we reset all existing allocated space to 0.
 | |
|         // Finally, we request more space if we need it.
 | |
|         //
 | |
|         // This is all a little circuitous, but doing this using unchecked
 | |
|         // operations doesn't seem to have a measurable impact on performance.
 | |
|         // (Probably because backtracking is limited to such small
 | |
|         // inputs/regexes in the first place.)
 | |
|         let visited_len =
 | |
|             (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
 | |
|                 / BIT_SIZE;
 | |
|         self.m.visited.truncate(visited_len);
 | |
|         for v in &mut self.m.visited {
 | |
|             *v = 0;
 | |
|         }
 | |
|         if visited_len > self.m.visited.len() {
 | |
|             let len = self.m.visited.len();
 | |
|             self.m.visited.reserve_exact(visited_len - len);
 | |
|             for _ in 0..(visited_len - len) {
 | |
|                 self.m.visited.push(0);
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     /// Start backtracking at the given position in the input, but also look
 | |
|     /// for literal prefixes.
 | |
|     fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
 | |
|         self.clear();
 | |
|         // If this is an anchored regex at the beginning of the input, then
 | |
|         // we're either already done or we only need to try backtracking once.
 | |
|         if self.prog.is_anchored_start {
 | |
|             return if !at.is_start() { false } else { self.backtrack(at) };
 | |
|         }
 | |
|         let mut matched = false;
 | |
|         loop {
 | |
|             if !self.prog.prefixes.is_empty() {
 | |
|                 at = match self.input.prefix_at(&self.prog.prefixes, at) {
 | |
|                     None => break,
 | |
|                     Some(at) => at,
 | |
|                 };
 | |
|             }
 | |
|             matched = self.backtrack(at) || matched;
 | |
|             if matched && self.prog.matches.len() == 1 {
 | |
|                 return true;
 | |
|             }
 | |
|             if at.pos() >= end {
 | |
|                 break;
 | |
|             }
 | |
|             at = self.input.at(at.next_pos());
 | |
|         }
 | |
|         matched
 | |
|     }
 | |
| 
 | |
|     /// The main backtracking loop starting at the given input position.
 | |
|     fn backtrack(&mut self, start: InputAt) -> bool {
 | |
|         // N.B. We use an explicit stack to avoid recursion.
 | |
|         // To avoid excessive pushing and popping, most transitions are handled
 | |
|         // in the `step` helper function, which only pushes to the stack when
 | |
|         // there's a capture or a branch.
 | |
|         let mut matched = false;
 | |
|         self.m.jobs.push(Job::Inst { ip: 0, at: start });
 | |
|         while let Some(job) = self.m.jobs.pop() {
 | |
|             match job {
 | |
|                 Job::Inst { ip, at } => {
 | |
|                     if self.step(ip, at) {
 | |
|                         // Only quit if we're matching one regex.
 | |
|                         // If we're matching a regex set, then mush on and
 | |
|                         // try to find other matches (if we want them).
 | |
|                         if self.prog.matches.len() == 1 {
 | |
|                             return true;
 | |
|                         }
 | |
|                         matched = true;
 | |
|                     }
 | |
|                 }
 | |
|                 Job::SaveRestore { slot, old_pos } => {
 | |
|                     if slot < self.slots.len() {
 | |
|                         self.slots[slot] = old_pos;
 | |
|                     }
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|         matched
 | |
|     }
 | |
| 
 | |
|     fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
 | |
|         use crate::prog::Inst::*;
 | |
|         loop {
 | |
|             // This loop is an optimization to avoid constantly pushing/popping
 | |
|             // from the stack. Namely, if we're pushing a job only to run it
 | |
|             // next, avoid the push and just mutate `ip` (and possibly `at`)
 | |
|             // in place.
 | |
|             if self.has_visited(ip, at) {
 | |
|                 return false;
 | |
|             }
 | |
|             match self.prog[ip] {
 | |
|                 Match(slot) => {
 | |
|                     if slot < self.matches.len() {
 | |
|                         self.matches[slot] = true;
 | |
|                     }
 | |
|                     return true;
 | |
|                 }
 | |
|                 Save(ref inst) => {
 | |
|                     if let Some(&old_pos) = self.slots.get(inst.slot) {
 | |
|                         // If this path doesn't work out, then we save the old
 | |
|                         // capture index (if one exists) in an alternate
 | |
|                         // job. If the next path fails, then the alternate
 | |
|                         // job is popped and the old capture index is restored.
 | |
|                         self.m.jobs.push(Job::SaveRestore {
 | |
|                             slot: inst.slot,
 | |
|                             old_pos,
 | |
|                         });
 | |
|                         self.slots[inst.slot] = Some(at.pos());
 | |
|                     }
 | |
|                     ip = inst.goto;
 | |
|                 }
 | |
|                 Split(ref inst) => {
 | |
|                     self.m.jobs.push(Job::Inst { ip: inst.goto2, at });
 | |
|                     ip = inst.goto1;
 | |
|                 }
 | |
|                 EmptyLook(ref inst) => {
 | |
|                     if self.input.is_empty_match(at, inst) {
 | |
|                         ip = inst.goto;
 | |
|                     } else {
 | |
|                         return false;
 | |
|                     }
 | |
|                 }
 | |
|                 Char(ref inst) => {
 | |
|                     if inst.c == at.char() {
 | |
|                         ip = inst.goto;
 | |
|                         at = self.input.at(at.next_pos());
 | |
|                     } else {
 | |
|                         return false;
 | |
|                     }
 | |
|                 }
 | |
|                 Ranges(ref inst) => {
 | |
|                     if inst.matches(at.char()) {
 | |
|                         ip = inst.goto;
 | |
|                         at = self.input.at(at.next_pos());
 | |
|                     } else {
 | |
|                         return false;
 | |
|                     }
 | |
|                 }
 | |
|                 Bytes(ref inst) => {
 | |
|                     if let Some(b) = at.byte() {
 | |
|                         if inst.matches(b) {
 | |
|                             ip = inst.goto;
 | |
|                             at = self.input.at(at.next_pos());
 | |
|                             continue;
 | |
|                         }
 | |
|                     }
 | |
|                     return false;
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
 | |
|         let k = ip * (self.input.len() + 1) + at.pos();
 | |
|         let k1 = k / BIT_SIZE;
 | |
|         let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1)));
 | |
|         if self.m.visited[k1] & k2 == 0 {
 | |
|             self.m.visited[k1] |= k2;
 | |
|             false
 | |
|         } else {
 | |
|             true
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| fn usize_to_u32(n: usize) -> u32 {
 | |
|     if (n as u64) > (::std::u32::MAX as u64) {
 | |
|         panic!("BUG: {} is too big to fit into u32", n)
 | |
|     }
 | |
|     n as u32
 | |
| }
 |