regex_automata/util/prefilter/teddy.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
use crate::util::{
prefilter::PrefilterI,
search::{MatchKind, Span},
};
#[derive(Clone, Debug)]
pub(crate) struct Teddy {
#[cfg(not(feature = "perf-literal-multisubstring"))]
_unused: (),
/// The actual Teddy searcher.
///
/// Technically, it's possible that Teddy doesn't actually get used, since
/// Teddy does require its haystack to at least be of a certain size
/// (usually around the size of whatever vector is being used, so ~16
/// or ~32 bytes). For haystacks shorter than that, the implementation
/// currently uses Rabin-Karp.
#[cfg(feature = "perf-literal-multisubstring")]
searcher: aho_corasick::packed::Searcher,
/// When running an anchored search, the packed searcher can't handle it so
/// we defer to Aho-Corasick itself. Kind of sad, but changing the packed
/// searchers to support anchored search would be difficult at worst and
/// annoying at best. Since packed searchers only apply to small numbers of
/// literals, we content ourselves that this is not much of an added cost.
/// (That packed searchers only work with a small number of literals is
/// also why we use a DFA here. Otherwise, the memory usage of a DFA would
/// likely be unacceptable.)
#[cfg(feature = "perf-literal-multisubstring")]
anchored_ac: aho_corasick::dfa::DFA,
/// The length of the smallest literal we look for.
///
/// We use this as a heuristic to figure out whether this will be "fast" or
/// not. Generally, the longer the better, because longer needles are more
/// discriminating and thus reduce false positive rate.
#[cfg(feature = "perf-literal-multisubstring")]
minimum_len: usize,
}
impl Teddy {
pub(crate) fn new<B: AsRef<[u8]>>(
kind: MatchKind,
needles: &[B],
) -> Option<Teddy> {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
None
}
#[cfg(feature = "perf-literal-multisubstring")]
{
// We only really support leftmost-first semantics. In
// theory we could at least support leftmost-longest, as the
// aho-corasick crate does, but regex-automata doesn't know about
// leftmost-longest currently.
//
// And like the aho-corasick prefilter, if we're using `All`
// semantics, then we can still use leftmost semantics for a
// prefilter. (This might be a suspicious choice for the literal
// engine, which uses a prefilter as a regex engine directly, but
// that only happens when using leftmost-first semantics.)
let (packed_match_kind, ac_match_kind) = match kind {
MatchKind::LeftmostFirst | MatchKind::All => (
aho_corasick::packed::MatchKind::LeftmostFirst,
aho_corasick::MatchKind::LeftmostFirst,
),
};
let minimum_len =
needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0);
let packed = aho_corasick::packed::Config::new()
.match_kind(packed_match_kind)
.builder()
.extend(needles)
.build()?;
let anchored_ac = aho_corasick::dfa::DFA::builder()
.match_kind(ac_match_kind)
.start_kind(aho_corasick::StartKind::Anchored)
.prefilter(false)
.build(needles)
.ok()?;
Some(Teddy { searcher: packed, anchored_ac, minimum_len })
}
}
}
impl PrefilterI for Teddy {
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
let ac_span =
aho_corasick::Span { start: span.start, end: span.end };
self.searcher
.find_in(haystack, ac_span)
.map(|m| Span { start: m.start(), end: m.end() })
}
}
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
use aho_corasick::automaton::Automaton;
let input = aho_corasick::Input::new(haystack)
.anchored(aho_corasick::Anchored::Yes)
.span(span.start..span.end);
self.anchored_ac
.try_find(&input)
// OK because we build the DFA with anchored support.
.expect("aho-corasick DFA should never fail")
.map(|m| Span { start: m.start(), end: m.end() })
}
}
fn memory_usage(&self) -> usize {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
use aho_corasick::automaton::Automaton;
self.searcher.memory_usage() + self.anchored_ac.memory_usage()
}
}
fn is_fast(&self) -> bool {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
// Teddy is usually quite fast, but I have seen some cases where
// a large number of literals can overwhelm it and make it not so
// fast. We make an educated but conservative guess at a limit, at
// which point, we're not so comfortable thinking Teddy is "fast."
//
// Well... this used to incorporate a "limit" on the *number*
// of literals, but I have since changed it to a minimum on the
// *smallest* literal. Namely, when there is a very small literal
// (1 or 2 bytes), it is far more likely that it leads to a higher
// false positive rate. (Although, of course, not always. For
// example, 'zq' is likely to have a very low false positive rate.)
// But when we have 3 bytes, we have a really good chance of being
// quite discriminatory and thus fast.
//
// We may still want to add some kind of limit on the number of
// literals here, but keep in mind that Teddy already has its own
// somewhat small limit (64 at time of writing). The main issue
// here is that if 'is_fast' is false, it opens the door for the
// reverse inner optimization to kick in. We really only want to
// resort to the reverse inner optimization if we absolutely must.
self.minimum_len >= 3
}
}
}