use alloc::{vec, vec::Vec};
use crate::{
int::{NonMaxUsize, U32},
nfa::{State, StateID, NFA},
pool::CachePoolGuard,
utf8,
};
/// A PikeVM searcher.
///
/// A PikeVM uses the standard Thompson NFA linear time search algorithm, but
/// augmented to support tracking the offsets of matching capture groups.
#[derive(Clone, Debug)]
pub(crate) struct PikeVM {
nfa: NFA,
}
impl PikeVM {
/// Create a new PikeVM searcher that uses the given NFA.
pub(crate) fn new(nfa: NFA) -> PikeVM {
PikeVM { nfa }
}
/// Return the underlying NFA used by this PikeVM.
pub(crate) fn nfa(&self) -> &NFA {
&self.nfa
}
/// Returns an iterator of non-overlapping matches in the given haystack.
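    ///
    /// A sketch of expected iterator behavior (the pattern and haystack here
    /// are hypothetical, shown for illustration only):
    ///
    /// ```ignore
    /// // Assuming `vm` was built for the pattern `a+` and `cache` came from
    /// // this PikeVM's cache pool:
    /// let matches: Vec<(usize, usize)> =
    ///     vm.find_iter(cache, b"xaayaaa").collect();
    /// // Leftmost-first semantics yield the two runs of 'a':
    /// assert_eq!(matches, vec![(1, 3), (4, 7)]);
    /// ```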
pub(crate) fn find_iter<'r, 'h>(
&'r self,
cache: CachePoolGuard<'r>,
haystack: &'h [u8],
) -> FindMatches<'r, 'h> {
FindMatches {
pikevm: self,
cache,
haystack,
at: 0,
slots: vec![None, None],
last_match_end: None,
}
}
/// Returns an iterator of non-overlapping capture matches in the given
/// haystack.
pub(crate) fn captures_iter<'r, 'h>(
&'r self,
cache: CachePoolGuard<'r>,
haystack: &'h [u8],
) -> CapturesMatches<'r, 'h> {
// OK because the NFA wouldn't have compiled if this could overflow.
let len = self.nfa().group_len().checked_mul(2).unwrap();
CapturesMatches {
it: FindMatches {
pikevm: self,
cache,
haystack,
at: 0,
slots: vec![None; len],
last_match_end: None,
},
}
}
/// The implementation of standard leftmost search.
///
/// Capturing group spans are written to `slots`, but only if requested.
/// `slots` can be any length. Any slot in the NFA that is activated but
/// which is out of bounds for the given `slots` is ignored.
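    ///
    /// A minimal sketch of a call (`nfa` and `haystack` are hypothetical; the
    /// construction of the `NFA` itself happens outside this module):
    ///
    /// ```ignore
    /// let vm = PikeVM::new(nfa);
    /// let mut cache = Cache::new(&vm);
    /// // Two slots are enough to track the overall match span.
    /// let mut slots = vec![None, None];
    /// if vm.search(&mut cache, haystack, 0, haystack.len(), false, &mut slots)
    /// {
    ///     let (start, end) =
    ///         (slots[0].unwrap().get(), slots[1].unwrap().get());
    /// }
    /// ```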
pub(crate) fn search(
&self,
cache: &mut Cache,
haystack: &[u8],
start: usize,
end: usize,
earliest: bool,
slots: &mut [Option<NonMaxUsize>],
) -> bool {
cache.setup_search(slots.len());
if start > end {
return false;
}
        // Why do we even care about this? Well, in our `slots` representation,
        // we use usize::MAX as a sentinel to indicate "no match." This isn't
        // problematic so long as our haystack's length is strictly less than
        // usize::MAX. Byte slices are guaranteed by Rust to have a length that
        // fits into isize, and so this assert should always pass. But we put
        // it here to make our assumption explicit.
assert!(
haystack.len() < core::usize::MAX,
"byte slice lengths must be less than usize MAX",
);
let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
let start_id = self.nfa().start();
let anchored = self.nfa().is_start_anchored();
let mut matched = false;
// Yes, our search doesn't end at `end`, but includes it. This is
// necessary because matches are delayed by one byte. The delay is used
// to handle look-behind assertions. In the case of the PikeVM, the
// delay is implemented by not considering a match to exist until it
// is visited in `nexts`. Technically, we know a match exists in the
// previous iteration via `epsilon_closure`.
let mut at = start;
while at <= end {
// If we have no states left to visit, then there are some cases
// where we know we can quit early or even skip ahead.
if curr.set.is_empty() {
// We have a match so we can quit.
if matched {
break;
}
// If we're running an anchored search and we've advanced
// beyond the start position with no other states to try, then
// we will never observe a match and thus can stop.
if anchored && at > start {
break;
}
}
// Instead of using a hypothetical unanchored start state in the
// NFA (which doesn't exist, but we could add it), we actually
// always use its anchored starting state. As a result, when doing
// an unanchored search, we need to simulate our own '(?s:.)*?'
// prefix, to permit a match to appear anywhere.
//
// Now, we don't *have* to do things this way. We could create
// a proper unanchored start state in the NFA and do one
// `epsilon_closure` call from that starting state before the main
// loop here. And that is just as correct. However, it turns out to
// be slower than our approach here because it slightly increases
// the cost of processing each byte by requiring us to visit
// more NFA states to deal with the additional NFA states in the
// unanchored prefix. By simulating it explicitly here, we lower
// those costs substantially. The cost is itself small, but it adds
// up for large haystacks.
//
// In order to simulate the '(?s:.)*?' prefix---which is not
// greedy---we are careful not to perform an epsilon closure on
// the start state if we already have a match. Namely, if we
// did otherwise, we would never reach a terminating condition
// because there would always be additional states to process.
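            //
            // For example (an illustrative trace): searching for 'b' in "ab",
            // the start state's closure is seeded into 'curr' at offset 0 and
            // again at offset 1. That second seeding is what allows the match
            // at [1, 2) to be found without an unanchored start state in the
            // NFA.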
if !matched {
// Since we are adding to the 'curr' active states and since
// this is for the start ID, we use a slots slice that is
// guaranteed to have the right length but where every element
// is absent. This is exactly what we want, because this
// epsilon closure is responsible for simulating an unanchored
// '(?s:.)*?' prefix. It is specifically outside of any
// capturing groups, and thus, using slots that are always
// absent is correct.
//
                // Note though that we can't just use `&mut []` here, since
                // this epsilon closure may traverse through `Capture` state
                // transitions, and thus must be able to write offsets to the
                // slots it is given, which are later copied to slot values in
                // `curr`.
let slots = next.slot_table.all_absent();
self.epsilon_closure(
stack, slots, curr, haystack, at, start_id,
);
}
let (ch, len) = utf8::decode_lossy(&haystack[at..]);
if self.nexts(stack, curr, next, haystack, at, ch, len, slots) {
matched = true;
}
// Unless the caller asked us to return early, we need to mush
// on to see if we can extend our match. (But note that 'nexts'
// will quit right after seeing a match, as is consistent with
// leftmost-first match priority.)
if (earliest && matched) || len == 0 {
break;
}
core::mem::swap(curr, next);
next.set.clear();
at += len;
}
matched
}
/// Process the active states in 'curr' to find the states (written to
/// 'next') we should process for the next byte in the haystack.
///
/// 'stack' is used to perform a depth first traversal of the NFA when
/// computing an epsilon closure.
///
    /// When a match is found, the slots for that match state (in 'curr') are
    /// copied to `slots`. Moreover, once a match is seen, processing for
    /// 'curr' stops, which is consistent with leftmost-first match priority.
    ///
    /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at`
    /// in `haystack`.
    ///
    /// `at_len` is the number of bytes consumed by `at_ch`. This is usually
    /// equal to `at_ch.len_utf8()`, but not always. For example, when `at_ch`
    /// is the replacement codepoint that results from decoding invalid UTF-8,
    /// `at_len` can be 1, 2 or 3.
fn nexts(
&self,
stack: &mut Vec<FollowEpsilon>,
curr: &mut ActiveStates,
next: &mut ActiveStates,
haystack: &[u8],
at: usize,
at_ch: char,
at_len: usize,
slots: &mut [Option<NonMaxUsize>],
) -> bool {
let ActiveStates { ref set, ref mut slot_table } = *curr;
for sid in set.iter() {
if self.next(
stack, slot_table, next, haystack, at, at_ch, at_len, sid,
) {
slots.copy_from_slice(slot_table.for_state(sid));
return true;
}
}
false
}
    /// Starting from `sid`, if there is a transition defined out of `sid` for
    /// the position `at` in `haystack`, then add the state transitioned to
    /// and its epsilon closure to the `next` set of states to explore.
    ///
    /// Returns true if and only if `sid` is a match state.
///
/// `stack` is used by the epsilon closure computation to perform a depth
/// first traversal of the NFA.
///
/// `curr_slot_table` should be the table of slots for the current set of
/// states being explored. If there is a transition out of `sid`, then
/// sid's row in the slot table is used to perform the epsilon closure.
///
/// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at`
/// in `haystack`. The caller provides it so that this routine doesn't
/// need to re-decode it. (Since it's expected that this routine is called
/// multiple times for each position.)
///
    /// `at_len` is the number of bytes consumed by `at_ch`. This is usually
    /// equal to `at_ch.len_utf8()`, but not always. For example, when `at_ch`
    /// is the replacement codepoint that results from decoding invalid UTF-8,
    /// `at_len` can be 1, 2 or 3.
fn next(
&self,
stack: &mut Vec<FollowEpsilon>,
curr_slot_table: &mut SlotTable,
next: &mut ActiveStates,
haystack: &[u8],
at: usize,
at_ch: char,
at_len: usize,
sid: StateID,
) -> bool {
match *self.nfa.state(sid) {
State::Fail
| State::Goto { .. }
| State::Splits { .. }
| State::Capture { .. } => false,
State::Char { target, ch } => {
if at_ch == ch && at_len > 0 {
let slots = curr_slot_table.for_state(sid);
// OK because `at_len` is always derived from the number
// of bytes read from `at` that make up `at_ch`. So this
// will never wrap.
let at = at.wrapping_add(at_len);
self.epsilon_closure(
stack, slots, next, haystack, at, target,
);
}
false
}
State::Ranges { target, ref ranges } => {
for (start, end) in ranges.iter().copied() {
if start > at_ch {
break;
} else if start <= at_ch && at_ch <= end {
if at_len == 0 {
return false;
}
let slots = curr_slot_table.for_state(sid);
// OK because `at_len` is always derived from the
// number of bytes read from `at` that make up `at_ch`.
// So this will never wrap.
let at = at.wrapping_add(at_len);
self.epsilon_closure(
stack, slots, next, haystack, at, target,
);
}
}
false
}
State::Match => true,
}
}
/// Compute the epsilon closure of `sid`, writing the closure into `next`
/// while copying slot values from `curr_slots` into corresponding states
/// in `next`. `curr_slots` should be the slot values corresponding to
/// `sid`.
///
/// The given `stack` is used to perform a depth first traversal of the
/// NFA by recursively following all epsilon transitions out of `sid`.
    /// Conditional epsilon transitions are followed if and only if they are
    /// satisfied for the position `at` in `haystack`.
///
/// While this routine may write to `curr_slots`, once it returns, any
/// writes are undone and the original values (even if absent) are
/// restored.
fn epsilon_closure(
&self,
stack: &mut Vec<FollowEpsilon>,
curr_slots: &mut [Option<NonMaxUsize>],
next: &mut ActiveStates,
haystack: &[u8],
at: usize,
sid: StateID,
) {
stack.push(FollowEpsilon::Explore(sid));
while let Some(frame) = stack.pop() {
match frame {
FollowEpsilon::RestoreCapture { slot, offset } => {
curr_slots[slot.as_usize()] = offset;
}
FollowEpsilon::Explore(sid) => {
self.epsilon_closure_explore(
stack, curr_slots, next, haystack, at, sid,
);
}
}
}
}
/// Explore all of the epsilon transitions out of `sid`. This is mostly
/// split out from `epsilon_closure` in order to clearly delineate
/// the actual work of computing an epsilon closure from the stack
/// book-keeping.
///
/// This will push any additional explorations needed on to `stack`.
///
    /// `curr_slots` should refer to the slots for the currently active NFA
    /// state. That is, the current state we are stepping through. These
    /// slots are mutated in place as new `Capture` states are traversed
    /// during epsilon closure, but the slots are restored to their original
    /// values once the full epsilon closure is completed. The ultimate use of
    /// `curr_slots` is to copy them into the corresponding row of `next`'s
    /// slot table, so that the capturing group spans are forwarded from the
    /// currently active state to the next.
///
/// `next` refers to the next set of active states. Computing an epsilon
/// closure may increase the next set of active states.
///
    /// `haystack` refers to what we're searching and `at` refers to the
    /// current position in the haystack. These are used to check whether
    /// conditional epsilon transitions (like look-around) are satisfied at
    /// the current position. If they aren't, then the epsilon closure won't
    /// include them.
fn epsilon_closure_explore(
&self,
stack: &mut Vec<FollowEpsilon>,
curr_slots: &mut [Option<NonMaxUsize>],
next: &mut ActiveStates,
haystack: &[u8],
at: usize,
mut sid: StateID,
) {
// We can avoid pushing some state IDs on to our stack in precisely
// the cases where a 'push(x)' would be immediately followed by a 'x
// = pop()'. This is achieved by this outer-loop. We simply set 'sid'
// to be the next state ID we want to explore once we're done with
// our initial exploration. In practice, this avoids a lot of stack
// thrashing.
loop {
// Record this state as part of our next set of active states. If
// we've already explored it, then no need to do it again.
if !next.set.insert(sid) {
return;
}
match *self.nfa.state(sid) {
State::Fail
| State::Match { .. }
| State::Char { .. }
| State::Ranges { .. } => {
next.slot_table.for_state(sid).copy_from_slice(curr_slots);
return;
}
State::Goto { target, look: None } => {
sid = target;
}
State::Goto { target, look: Some(look) } => {
if !look.is_match(haystack, at) {
return;
}
sid = target;
}
State::Splits { ref targets, reverse: false } => {
sid = match targets.get(0) {
None => return,
Some(&sid) => sid,
};
stack.extend(
targets[1..]
.iter()
.copied()
.rev()
.map(FollowEpsilon::Explore),
);
}
State::Splits { ref targets, reverse: true } => {
sid = match targets.last() {
None => return,
Some(&sid) => sid,
};
stack.extend(
targets[..targets.len() - 1]
.iter()
.copied()
.map(FollowEpsilon::Explore),
);
}
State::Capture { target, slot } => {
// There's no need to do anything with slots that
// ultimately won't be copied into the caller-provided
// 'Captures' value. So we just skip dealing with them at
// all.
if slot.as_usize() < curr_slots.len() {
stack.push(FollowEpsilon::RestoreCapture {
slot,
offset: curr_slots[slot.as_usize()],
});
// OK because length of a slice must fit into an isize.
curr_slots[slot.as_usize()] =
Some(NonMaxUsize::new(at).unwrap());
}
sid = target;
}
}
}
}
}
/// An iterator over all successive non-overlapping matches in a particular
/// haystack. `'r` represents the lifetime of the regex (which also bounds the
/// cache guard) and `'h` represents the lifetime of the haystack.
#[derive(Debug)]
pub(crate) struct FindMatches<'r, 'h> {
pikevm: &'r PikeVM,
cache: CachePoolGuard<'r>,
haystack: &'h [u8],
at: usize,
slots: Vec<Option<NonMaxUsize>>,
last_match_end: Option<usize>,
}
impl<'r, 'h> Iterator for FindMatches<'r, 'h> {
type Item = (usize, usize);
fn next(&mut self) -> Option<(usize, usize)> {
if !self.pikevm.search(
&mut self.cache,
self.haystack,
self.at,
self.haystack.len(),
false,
&mut self.slots,
) {
return None;
}
let mut m =
(self.slots[0].unwrap().get(), self.slots[1].unwrap().get());
if m.0 >= m.1 {
m = self.handle_overlapping_empty_match(m)?;
}
self.at = m.1;
self.last_match_end = Some(m.1);
Some(m)
}
}
impl<'r, 'h> FindMatches<'r, 'h> {
/// Handles the special case of an empty match by ensuring that 1) the
/// iterator always advances and 2) empty matches never overlap with other
/// matches.
///
/// Note that we mark this cold and forcefully prevent inlining because
/// handling empty matches like this is extremely rare and does require a
/// bit of code, comparatively. Keeping this code out of the main iterator
/// function keeps it smaller and more amenable to inlining itself.
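    ///
    /// For example (an illustrative trace): for the pattern `a*` against
    /// `ab`, the first match is `(0, 1)`. The next search starts at offset 1
    /// and finds the empty match `(1, 1)`, which abuts the previous match's
    /// end, so this routine skips ahead one codepoint and yields `(2, 2)`
    /// instead.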
#[cold]
#[inline(never)]
fn handle_overlapping_empty_match(
&mut self,
mut m: (usize, usize),
) -> Option<(usize, usize)> {
assert!(m.0 >= m.1);
if Some(m.1) == self.last_match_end {
let len =
core::cmp::max(1, utf8::decode(&self.haystack[self.at..]).1);
self.at = self.at.checked_add(len).unwrap();
if !self.pikevm.search(
&mut self.cache,
self.haystack,
self.at,
self.haystack.len(),
false,
&mut self.slots,
) {
return None;
}
m = (self.slots[0].unwrap().get(), self.slots[1].unwrap().get());
}
Some(m)
}
}
/// An iterator over all successive non-overlapping capture matches in a
/// particular haystack. `'r` represents the lifetime of the regex (which also
/// bounds the cache guard) and `'h` represents the lifetime of the haystack.
#[derive(Debug)]
pub(crate) struct CapturesMatches<'r, 'h> {
it: FindMatches<'r, 'h>,
}
impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> {
type Item = Vec<Option<NonMaxUsize>>;
fn next(&mut self) -> Option<Vec<Option<NonMaxUsize>>> {
self.it.next()?;
Some(self.it.slots.clone())
}
}
/// A cache represents mutable state that a `PikeVM` requires during a search.
///
/// For a given `PikeVM`, its corresponding cache may be created either via
/// `PikeVM::create_cache`, or via `Cache::new`. They are equivalent in every
/// way, except the former does not require explicitly importing `Cache`.
///
/// A particular `Cache` is coupled with the `PikeVM` from which it was
/// created. It may only be used with that `PikeVM`. A cache and its
/// allocations may be re-purposed via `Cache::reset`, in which case, it can
/// only be used with the new `PikeVM` (and not the old one).
#[derive(Clone, Debug)]
pub(crate) struct Cache {
/// Stack used while computing epsilon closure. This effectively lets us
/// move what is more naturally expressed through recursion to a stack
/// on the heap.
stack: Vec<FollowEpsilon>,
/// The current active states being explored for the current byte in the
/// haystack.
curr: ActiveStates,
/// The next set of states we're building that will be explored for the
/// next byte in the haystack.
next: ActiveStates,
}
impl Cache {
/// Create a new `PikeVM` cache.
///
/// A potentially more convenient routine to create a cache is
/// `PikeVM::create_cache`, as it does not require also importing the
/// `Cache` type.
///
/// If you want to reuse the returned `Cache` with some other `PikeVM`,
/// then you must call `Cache::reset` with the desired `PikeVM`.
pub(crate) fn new(re: &PikeVM) -> Cache {
Cache {
stack: vec![],
curr: ActiveStates::new(re),
next: ActiveStates::new(re),
}
}
/// Clears this cache. This should be called at the start of every search
/// to ensure we start with a clean slate.
///
    /// This also sets the number of capturing group slots used in the current
    /// search. This permits an optimization whereby `SlotTable::for_state`
    /// only returns as many slots as the caller provided. This may be less
    /// than the total number of possible slots, e.g., when one only wants to
    /// track overall match offsets. This in turn permits less copying of
    /// capturing group spans in the PikeVM.
fn setup_search(&mut self, captures_slot_len: usize) {
self.stack.clear();
self.curr.setup_search(captures_slot_len);
self.next.setup_search(captures_slot_len);
}
}
/// A set of active states used to "simulate" the execution of an NFA via the
/// PikeVM.
///
/// There are two sets of these used during NFA simulation. One set corresponds
/// to the "current" set of states being traversed for the current position
/// in a haystack. The other set corresponds to the "next" set of states being
/// built, which will become the new "current" set for the next position in the
/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
/// original paper on regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
///
/// In addition to representing a set of NFA states, this also maintains slot
/// values for each state. These slot values are what turn the NFA simulation
/// into the "Pike VM." Namely, they track capturing group values for each
/// state. During the computation of epsilon closure, we copy slot values from
/// states in the "current" set to the "next" set. Eventually, once a match
/// is found, the slot values for that match state are what we write to the
/// caller provided slots.
#[derive(Clone, Debug)]
struct ActiveStates {
/// The set of active NFA states. This set preserves insertion order, which
/// is critical for simulating the match semantics of backtracking regex
/// engines.
set: SparseSet,
/// The slots for every NFA state, where each slot stores a (possibly
/// absent) offset. Every capturing group has two slots. One for a start
/// offset and one for an end offset.
slot_table: SlotTable,
}
impl ActiveStates {
/// Create a new set of active states for the given PikeVM. The active
/// states returned may only be used with the given PikeVM. (Use 'reset'
/// to re-purpose the allocation for a different PikeVM.)
fn new(re: &PikeVM) -> ActiveStates {
let mut active = ActiveStates {
set: SparseSet::new(0),
slot_table: SlotTable::new(),
};
active.reset(re);
active
}
/// Reset this set of active states such that it can be used with the given
/// PikeVM (and only that PikeVM).
fn reset(&mut self, re: &PikeVM) {
self.set.resize(re.nfa().len());
self.slot_table.reset(re);
}
/// Setup this set of active states for a new search. The given slot
/// length should be the number of slots in a caller provided 'Captures'
/// (and may be zero).
fn setup_search(&mut self, captures_slot_len: usize) {
self.set.clear();
self.slot_table.setup_search(captures_slot_len);
}
}
/// A table of slots, where each row represents a state in an NFA. Thus, the
/// table has room for storing slots for every single state in an NFA.
///
/// This table is represented with a single contiguous allocation. In general,
/// the notion of "capturing group" doesn't really exist at this level of
/// abstraction, hence the name "slot" instead. (Indeed, every capturing group
/// maps to a pair of slots, one for the start offset and one for the end
/// offset.) Slots are indexed by the slot index stored in each `Capture` NFA
/// state.
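///
/// For example (hypothetical numbers, not taken from a real NFA): with 2
/// capturing groups there are 4 slots per state, so when all slots are
/// requested, the row for state 3 occupies `table[12..16]`.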
#[derive(Clone, Debug)]
struct SlotTable {
/// The actual table of offsets.
table: Vec<Option<NonMaxUsize>>,
/// The number of slots per state, i.e., the table's stride or the length
/// of each row.
slots_per_state: usize,
/// The number of slots in the caller-provided `Captures` value for the
/// current search. Setting this to `slots_per_state` is always correct,
/// but may be wasteful.
slots_for_captures: usize,
}
impl SlotTable {
/// Create a new slot table.
///
/// One should call 'reset' with the corresponding PikeVM before use.
fn new() -> SlotTable {
SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 }
}
/// Reset this slot table such that it can be used with the given PikeVM
/// (and only that PikeVM).
fn reset(&mut self, re: &PikeVM) {
let nfa = re.nfa();
// OK because NFA construction would have failed if this overflowed.
self.slots_per_state = nfa.group_len().checked_mul(2).unwrap();
// This is always correct, but may be reduced for a particular search
// if fewer slots were given by the caller, e.g., none at all or only
// slots for tracking the overall match instead of all slots for every
// group.
self.slots_for_captures = self.slots_per_state;
let len = nfa
.len()
// We add 1 so that our last row is always empty. We use it as
// "scratch" space for computing the epsilon closure off of the
// starting state.
.checked_add(1)
.and_then(|x| x.checked_mul(self.slots_per_state))
// It seems like this could actually panic on legitimate inputs
// on 32-bit targets. Should we somehow convert this to an error?
// What about something similar for the lazy DFA cache? If you're
// tripping this assert, please file a bug.
.expect("slot table length doesn't overflow");
self.table.resize(len, None);
}
/// Perform any per-search setup for this slot table.
///
/// In particular, this sets the length of the number of slots used in the
/// slots given by the caller (if any at all). This number may be smaller
/// than the total number of slots available, e.g., when the caller is only
/// interested in tracking the overall match and not the spans of every
/// matching capturing group. Only tracking the overall match can save a
/// substantial amount of time copying capturing spans during a search.
fn setup_search(&mut self, captures_slot_len: usize) {
self.slots_for_captures = captures_slot_len;
}
/// Return a mutable slice of the slots for the given state.
///
/// Note that the length of the slice returned may be less than the total
/// number of slots available for this state. In particular, the length
/// always matches the number of slots indicated via `setup_search`.
fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] {
let i = sid.as_usize() * self.slots_per_state;
&mut self.table[i..i + self.slots_for_captures]
}
    /// Return a slice of slots of appropriate length where every slot offset
    /// is guaranteed to be absent. This is useful in cases where you need to
    /// compute an epsilon closure outside of the user-supplied regex (e.g.,
    /// for the implicit unanchored prefix), and thus never want it to have
    /// any capturing slots set.
fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
let i = self.table.len() - self.slots_per_state;
&mut self.table[i..i + self.slots_for_captures]
}
}
/// Represents a stack frame for use while computing an epsilon closure.
///
/// (An "epsilon closure" refers to the set of reachable NFA states from a
/// single state without consuming any input. That is, the set of all epsilon
/// transitions not only from that single state, but from every other state
/// reachable by an epsilon transition as well. This is why it's called a
/// "closure.")
///
/// Computing the epsilon closure in a Thompson NFA proceeds via a depth
/// first traversal over all epsilon transitions from a particular state.
/// (A depth first traversal is important because it emulates the same priority
/// of matches that is typically found in backtracking regex engines.) This
/// depth first traversal is naturally expressed using recursion, but to avoid
/// a call stack size proportional to the size of a regex, we put our stack on
/// the heap instead.
///
/// This stack thus consists of call frames. The typical call frame is
/// `Explore`, which instructs epsilon closure to explore the epsilon
/// transitions from that state. (Subsequent epsilon transitions are then
/// pushed on to the stack as more `Explore` frames.) If the state ID being
/// explored has no epsilon transitions, then the capturing group slots are
/// copied from the original state that sparked the epsilon closure (in the
/// `next` routine) to the state ID being explored. This way, capturing group
/// slots are forwarded from the previous state to the next.
///
/// The other stack frame, `RestoreCapture`, instructs the epsilon closure to
/// set the position for a particular slot back to some particular offset. This
/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will
/// set the offset of the slot indicated in `Capture` to the current offset,
/// and then push the old offset on to the stack as a `RestoreCapture` frame.
/// Thus, the new offset is only used until the epsilon closure reverts back to
/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon
/// transition its "scope" to only states that come "after" it during depth
/// first traversal.
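///
/// For example (an illustrative trace): suppose `Explore` reaches a `Capture`
/// state for slot 0 at haystack offset 5 while slot 0 is absent. The closure
/// pushes `RestoreCapture { slot: 0, offset: None }`, writes `Some(5)` into
/// slot 0 and keeps exploring. Every state explored before that frame is
/// popped sees `Some(5)`; popping the frame then restores `None`, scoping the
/// write to the subtree explored beneath the `Capture` state.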
#[derive(Clone, Debug)]
enum FollowEpsilon {
/// Explore the epsilon transitions from a state ID.
Explore(StateID),
/// Reset the given `slot` to the given `offset` (which might be `None`).
RestoreCapture { slot: u32, offset: Option<NonMaxUsize> },
}
/// A sparse set used for representing ordered NFA states.
///
/// This supports constant time addition and membership testing. Clearing an
/// entire set can also be done in constant time. Iteration yields elements
/// in the order in which they were inserted.
///
/// The data structure is based on: https://research.swtch.com/sparse
/// Note though that we don't actually use uninitialized memory. We generally
/// reuse sparse sets, so the initial allocation cost is bearable. However, its
/// other properties listed above are extremely useful.
#[derive(Clone)]
struct SparseSet {
/// The number of elements currently in this set.
len: usize,
/// Dense contains the ids in the order in which they were inserted.
dense: Vec<StateID>,
/// Sparse maps ids to their location in dense.
///
/// A state ID is in the set if and only if
/// sparse[id] < len && id == dense[sparse[id]].
///
    /// Note that these are indices into 'dense'. It's a little weird to use
    /// StateID here, but we know our length can never exceed the bounds of
    /// StateID (enforced by 'resize') and StateID will be at most 4 bytes,
    /// whereas a usize is likely double that in most cases.
sparse: Vec<StateID>,
}
impl SparseSet {
/// Create a new sparse set with the given capacity.
///
/// Sparse sets have a fixed size and they cannot grow. Attempting to
/// insert more distinct elements than the total capacity of the set will
/// result in a panic.
///
/// This panics if the capacity given is bigger than `StateID::LIMIT`.
fn new(capacity: usize) -> SparseSet {
let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] };
set.resize(capacity);
set
}
/// Resizes this sparse set to have the new capacity given.
///
/// This set is automatically cleared.
///
/// This panics if the capacity given is bigger than `StateID::LIMIT`.
fn resize(&mut self, new_capacity: usize) {
        assert!(
            new_capacity <= u32::MAX.as_usize(),
            "sparse set capacity cannot exceed {:?}",
            u32::MAX,
        );
self.clear();
self.dense.resize(new_capacity, 0);
self.sparse.resize(new_capacity, 0);
}
/// Returns the capacity of this set.
///
/// The capacity represents a fixed limit on the number of distinct
/// elements that are allowed in this set. The capacity cannot be changed.
fn capacity(&self) -> usize {
self.dense.len()
}
/// Returns the number of elements in this set.
fn len(&self) -> usize {
self.len
}
/// Returns true if and only if this set is empty.
fn is_empty(&self) -> bool {
self.len() == 0
}
/// Insert the state ID value into this set and return true if the given
/// state ID was not previously in this set.
///
/// This operation is idempotent. If the given value is already in this
/// set, then this is a no-op.
///
/// If more than `capacity` ids are inserted, then this panics.
fn insert(&mut self, id: StateID) -> bool {
if self.contains(id) {
return false;
}
let index = self.len();
assert!(
index < self.capacity(),
"{:?} exceeds capacity of {:?} when inserting {:?}",
index,
self.capacity(),
id,
);
self.dense[index] = id;
// OK because we don't permit the capacity to be set higher than
// u32::MAX.
self.sparse[id.as_usize()] = u32::try_from(index).unwrap();
self.len += 1;
true
}
/// Returns true if and only if this set contains the given value.
fn contains(&self, id: StateID) -> bool {
let index = self.sparse[id.as_usize()];
index.as_usize() < self.len() && self.dense[index.as_usize()] == id
}
/// Clear this set such that it has no members.
fn clear(&mut self) {
self.len = 0;
}
/// Returns an iterator over all the state IDs in this set in the order in
/// which they were inserted.
fn iter(&self) -> SparseSetIter<'_> {
SparseSetIter(self.dense[..self.len()].iter())
}
}
impl core::fmt::Debug for SparseSet {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
let elements: Vec<StateID> = self.iter().collect();
f.debug_tuple("SparseSet").field(&elements).finish()
}
}
/// An iterator over all elements in a sparse set.
///
/// The lifetime `'a` refers to the lifetime of the set being iterated over.
#[derive(Debug)]
struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);
impl<'a> Iterator for SparseSetIter<'a> {
type Item = StateID;
fn next(&mut self) -> Option<StateID> {
self.0.next().map(|&id| id)
}
}
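
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sanity check of `SparseSet`, added here as an illustrative
    // sketch. It assumes `StateID` is a plain `u32` identifier, as implied by
    // the `u32` conversion in `SparseSet::insert`.
    #[test]
    fn sparse_set_basics() {
        let mut set = SparseSet::new(10);
        assert!(set.is_empty());
        assert!(set.insert(3));
        assert!(set.insert(7));
        // Inserting an existing member is a no-op.
        assert!(!set.insert(3));
        assert!(set.contains(3) && set.contains(7));
        // Iteration yields elements in insertion order.
        let ids: Vec<StateID> = set.iter().collect();
        assert_eq!(ids, [3, 7]);
        // Clearing empties the set in constant time.
        set.clear();
        assert!(set.is_empty() && !set.contains(3));
    }
}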