aho_corasick/util/
buffer.rs

1use alloc::{vec, vec::Vec};
2
3/// The default buffer capacity that we use for the stream buffer.
4const DEFAULT_BUFFER_CAPACITY: usize = 64 * (1 << 10); // 64 KB
5
6/// A fairly simple roll buffer for supporting stream searches.
7///
8/// This buffer acts as a temporary place to store a fixed amount of data when
9/// reading from a stream. Its central purpose is to allow "rolling" some
10/// suffix of the data to the beginning of the buffer before refilling it with
11/// more data from the stream. For example, let's say we are trying to match
12/// "foobar" on a stream. When we report the match, we'd like to not only
13/// report the correct offsets at which the match occurs, but also the matching
14/// bytes themselves. So let's say our stream is a file with the following
15/// contents: `test test foobar test test`. Now assume that we happen to read
16/// the aforementioned file in two chunks: `test test foo` and `bar test test`.
17/// Naively, it would not be possible to report a single contiguous `foobar`
18/// match, but this roll buffer allows us to do that. Namely, after the second
19/// read, the contents of the buffer should be `st foobar test test`, where the
20/// search should ultimately resume immediately after `foo`. (The prefix `st `
21/// is included because the roll buffer saves N bytes at the end of the buffer,
22/// where N is the maximum possible length of a match.)
23///
24/// A lot of the logic for dealing with this is unfortunately split out between
25/// this roll buffer and the `StreamChunkIter`.
26///
27/// Note also that this buffer is not actually required to just report matches.
28/// Because a `Match` is just some offsets. But it *is* required for supporting
29/// things like `try_stream_replace_all` because that needs some mechanism for
30/// knowing which bytes in the stream correspond to a match and which don't. So
31/// when a match occurs across two `read` calls, *something* needs to retain
32/// the bytes from the previous `read` call because you don't know before the
33/// second read call whether a match exists or not.
34#[derive(Debug)]
35pub(crate) struct Buffer {
36    /// The raw buffer contents. This has a fixed size and never increases.
37    buf: Vec<u8>,
38    /// The minimum size of the buffer, which is equivalent to the maximum
39    /// possible length of a match. This corresponds to the amount that we
40    /// roll
41    min: usize,
42    /// The end of the contents of this buffer.
43    end: usize,
44}
45
46impl Buffer {
47    /// Create a new buffer for stream searching. The minimum buffer length
48    /// given should be the size of the maximum possible match length.
49    pub(crate) fn new(min_buffer_len: usize) -> Buffer {
50        let min = core::cmp::max(1, min_buffer_len);
51        // The minimum buffer amount is also the amount that we roll our
52        // buffer in order to support incremental searching. To this end,
53        // our actual capacity needs to be at least 1 byte bigger than our
54        // minimum amount, otherwise we won't have any overlap. In actuality,
55        // we want our buffer to be a bit bigger than that for performance
56        // reasons, so we set a lower bound of `8 * min`.
57        //
58        // TODO: It would be good to find a way to test the streaming
59        // implementation with the minimal buffer size. For now, we just
60        // uncomment out the next line and comment out the subsequent line.
61        // let capacity = 1 + min;
62        let capacity = core::cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
63        Buffer { buf: vec![0; capacity], min, end: 0 }
64    }
65
66    /// Return the contents of this buffer.
67    #[inline]
68    pub(crate) fn buffer(&self) -> &[u8] {
69        &self.buf[..self.end]
70    }
71
72    /// Return the minimum size of the buffer. The only way a buffer may be
73    /// smaller than this is if the stream itself contains less than the
74    /// minimum buffer amount.
75    #[inline]
76    pub(crate) fn min_buffer_len(&self) -> usize {
77        self.min
78    }
79
80    /// Return all free capacity in this buffer.
81    fn free_buffer(&mut self) -> &mut [u8] {
82        &mut self.buf[self.end..]
83    }
84
85    /// Refill the contents of this buffer by reading as much as possible into
86    /// this buffer's free capacity. If no more bytes could be read, then this
87    /// returns false. Otherwise, this reads until it has filled the buffer
88    /// past the minimum amount.
89    pub(crate) fn fill<R: std::io::Read>(
90        &mut self,
91        mut rdr: R,
92    ) -> std::io::Result<bool> {
93        let mut readany = false;
94        loop {
95            let readlen = rdr.read(self.free_buffer())?;
96            if readlen == 0 {
97                return Ok(readany);
98            }
99            readany = true;
100            self.end += readlen;
101            if self.buffer().len() >= self.min {
102                return Ok(true);
103            }
104        }
105    }
106
107    /// Roll the contents of the buffer so that the suffix of this buffer is
108    /// moved to the front and all other contents are dropped. The size of the
109    /// suffix corresponds precisely to the minimum buffer length.
110    ///
111    /// This should only be called when the entire contents of this buffer have
112    /// been searched.
113    pub(crate) fn roll(&mut self) {
114        let roll_start = self
115            .end
116            .checked_sub(self.min)
117            .expect("buffer capacity should be bigger than minimum amount");
118        let roll_end = roll_start + self.min;
119
120        assert!(roll_end <= self.end);
121        self.buf.copy_within(roll_start..roll_end, 0);
122        self.end = self.min;
123    }
124}
aho_corasick/util/buffer.rs

aho_corasick/util/
buffer.rs