regex/regex/
bytes.rs

1use alloc::{borrow::Cow, string::String, sync::Arc, vec::Vec};
2
3use regex_automata::{meta, util::captures, Input, PatternID};
4
5use crate::{bytes::RegexBuilder, error::Error};
6
7/// A compiled regular expression for searching `&[u8]` haystacks.
8///
9/// A `Regex` can be used to search haystacks, split haystacks into substrings
10/// or replace substrings in a haystack with a different substring. All
11/// searching is done with an implicit `(?s:.)*?` at the beginning and end of
12/// a pattern. To force an expression to match the whole string (or a prefix
13/// or a suffix), you must use an anchor like `^` or `$` (or `\A` and `\z`).
14///
15/// Like the `Regex` type in the parent module, matches with this regex return
16/// byte offsets into the haystack. **Unlike** the parent `Regex` type, these
17/// byte offsets may not correspond to UTF-8 sequence boundaries since the
18/// regexes in this module can match arbitrary bytes.
19///
20/// The only methods that allocate new byte strings are the string replacement
21/// methods. All other methods (searching and splitting) return borrowed
22/// references into the haystack given.
23///
24/// # Example
25///
26/// Find the offsets of a US phone number:
27///
28/// ```
29/// use regex::bytes::Regex;
30///
31/// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap();
32/// let m = re.find(b"phone: 111-222-3333").unwrap();
33/// assert_eq!(7..19, m.range());
34/// ```
35///
36/// # Example: extracting capture groups
37///
38/// A common way to use regexes is with capture groups. That is, instead of
39/// just looking for matches of an entire regex, parentheses are used to create
40/// groups that represent part of the match.
41///
42/// For example, consider a haystack with multiple lines, and each line has
43/// three whitespace delimited fields where the second field is expected to be
44/// a number and the third field a boolean. To make this convenient, we use
45/// the [`Captures::extract`] API to put the strings that match each group
46/// into a fixed size array:
47///
48/// ```
49/// use regex::bytes::Regex;
50///
51/// let hay = b"
52/// rabbit         54 true
53/// groundhog 2 true
54/// does not match
55/// fox   109    false
56/// ";
57/// let re = Regex::new(r"(?m)^\s*(\S+)\s+([0-9]+)\s+(true|false)\s*$").unwrap();
58/// let mut fields: Vec<(&[u8], i64, bool)> = vec![];
59/// for (_, [f1, f2, f3]) in re.captures_iter(hay).map(|caps| caps.extract()) {
60///     // These unwraps are OK because our pattern is written in a way where
61///     // all matches for f2 and f3 will be valid UTF-8.
62///     let f2 = std::str::from_utf8(f2).unwrap();
63///     let f3 = std::str::from_utf8(f3).unwrap();
64///     fields.push((f1, f2.parse()?, f3.parse()?));
65/// }
66/// assert_eq!(fields, vec![
67///     (&b"rabbit"[..], 54, true),
68///     (&b"groundhog"[..], 2, true),
69///     (&b"fox"[..], 109, false),
70/// ]);
71///
72/// # Ok::<(), Box<dyn std::error::Error>>(())
73/// ```
74///
75/// # Example: matching invalid UTF-8
76///
77/// One of the reasons for searching `&[u8]` haystacks is that the `&[u8]`
78/// might not be valid UTF-8. Indeed, with a `bytes::Regex`, patterns that
79/// match invalid UTF-8 are explicitly allowed. Here's one example that looks
80/// for valid UTF-8 fields that might be separated by invalid UTF-8. In this
81/// case, we use `(?s-u:.)`, which matches any byte. Attempting to use it in a
82/// top-level `Regex` will result in the regex failing to compile. Notice also
83/// that we use `.` with Unicode mode enabled, in which case, only valid UTF-8
84/// is matched. In this way, we can build one pattern where some parts only
85/// match valid UTF-8 while other parts are more permissive.
86///
87/// ```
88/// use regex::bytes::Regex;
89///
90/// // F0 9F 92 A9 is the UTF-8 encoding for a Pile of Poo.
91/// let hay = b"\xFF\xFFfoo\xFF\xFF\xFF\xF0\x9F\x92\xA9\xFF";
92/// // An equivalent to '(?s-u:.)' is '(?-u:[\x00-\xFF])'.
93/// let re = Regex::new(r"(?s)(?-u:.)*?(?<f1>.+)(?-u:.)*?(?<f2>.+)").unwrap();
94/// let caps = re.captures(hay).unwrap();
95/// assert_eq!(&caps["f1"], &b"foo"[..]);
96/// assert_eq!(&caps["f2"], "💩".as_bytes());
97/// ```
98#[derive(Clone)]
99pub struct Regex {
    /// The `regex_automata` meta regex that `find_iter`, `captures_iter`,
    /// `split` and `splitn` on this type call into.
100    pub(crate) meta: meta::Regex,
    /// The pattern, held in an `Arc<str>`.
    /// NOTE(review): presumably this is the original pattern text returned by
    /// `as_str`; that method is not visible in this chunk, so confirm.
101    pub(crate) pattern: Arc<str>,
102}
103
104impl core::fmt::Display for Regex {
105    /// Shows the original regular expression.
106    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
107        write!(f, "{}", self.as_str())
108    }
109}
110
111impl core::fmt::Debug for Regex {
112    /// Shows the original regular expression.
113    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
114        f.debug_tuple("Regex").field(&self.as_str()).finish()
115    }
116}
117
118impl core::str::FromStr for Regex {
119    type Err = Error;
120
121    /// Attempts to parse a string into a regular expression
122    fn from_str(s: &str) -> Result<Regex, Error> {
123        Regex::new(s)
124    }
125}
126
127impl TryFrom<&str> for Regex {
128    type Error = Error;
129
130    /// Attempts to parse a string into a regular expression
131    fn try_from(s: &str) -> Result<Regex, Error> {
132        Regex::new(s)
133    }
134}
135
136impl TryFrom<String> for Regex {
137    type Error = Error;
138
139    /// Attempts to parse a string into a regular expression
140    fn try_from(s: String) -> Result<Regex, Error> {
141        Regex::new(&s)
142    }
143}
144
145/// Core regular expression methods.
146impl Regex {
147    /// Compiles a regular expression. Once compiled, it can be used repeatedly
148    /// to search, split or replace substrings in a haystack.
149    ///
150    /// Note that regex compilation tends to be a somewhat expensive process,
151    /// and unlike higher level environments, compilation is not automatically
152    /// cached for you. One should endeavor to compile a regex once and then
153    /// reuse it. For example, it's a bad idea to compile the same regex
154    /// repeatedly in a loop.
155    ///
156    /// # Errors
157    ///
158    /// If an invalid pattern is given, then an error is returned.
159    /// An error is also returned if the pattern is valid, but would
160    /// produce a regex that is bigger than the configured size limit via
161    /// [`RegexBuilder::size_limit`]. (A reasonable size limit is enabled by
162    /// default.)
163    ///
164    /// # Example
165    ///
166    /// ```
167    /// use regex::bytes::Regex;
168    ///
169    /// // An invalid pattern because of an unclosed parenthesis
170    /// assert!(Regex::new(r"foo(bar").is_err());
171    /// // An invalid pattern because the regex would be too big
172    /// // because Unicode tends to inflate things.
173    /// assert!(Regex::new(r"\w{1000}").is_err());
174    /// // Disabling Unicode can make the regex much smaller,
175    /// // potentially by up to or more than an order of magnitude.
176    /// assert!(Regex::new(r"(?-u:\w){1000}").is_ok());
177    /// ```
178    pub fn new(re: &str) -> Result<Regex, Error> {
179        RegexBuilder::new(re).build()
180    }
181
182    /// Returns true if and only if there is a match for the regex anywhere
183    /// in the haystack given.
184    ///
185    /// It is recommended to use this method if all you need to do is test
186    /// whether a match exists, since the underlying matching engine may be
187    /// able to do less work.
188    ///
189    /// # Example
190    ///
191    /// Test if some haystack contains at least one word with exactly 13
192    /// Unicode word characters:
193    ///
194    /// ```
195    /// use regex::bytes::Regex;
196    ///
197    /// let re = Regex::new(r"\b\w{13}\b").unwrap();
198    /// let hay = b"I categorically deny having triskaidekaphobia.";
199    /// assert!(re.is_match(hay));
200    /// ```
201    #[inline]
202    pub fn is_match(&self, haystack: &[u8]) -> bool {
203        self.is_match_at(haystack, 0)
204    }
205
206    /// This routine searches for the first match of this regex in the
207    /// haystack given, and if found, returns a [`Match`]. The `Match`
208    /// provides access to both the byte offsets of the match and the actual
209    /// substring that matched.
210    ///
211    /// Note that this should only be used if you want to find the entire
212    /// match. If instead you just want to test the existence of a match,
213    /// it's potentially faster to use `Regex::is_match(hay)` instead of
214    /// `Regex::find(hay).is_some()`.
215    ///
216    /// # Example
217    ///
218    /// Find the first word with exactly 13 Unicode word characters:
219    ///
220    /// ```
221    /// use regex::bytes::Regex;
222    ///
223    /// let re = Regex::new(r"\b\w{13}\b").unwrap();
224    /// let hay = b"I categorically deny having triskaidekaphobia.";
225    /// let mat = re.find(hay).unwrap();
226    /// assert_eq!(2..15, mat.range());
227    /// assert_eq!(b"categorically", mat.as_bytes());
228    /// ```
229    #[inline]
230    pub fn find<'h>(&self, haystack: &'h [u8]) -> Option<Match<'h>> {
231        self.find_at(haystack, 0)
232    }
233
234    /// Returns an iterator that yields successive non-overlapping matches in
235    /// the given haystack. The iterator yields values of type [`Match`].
236    ///
237    /// # Time complexity
238    ///
239    /// Note that since `find_iter` runs potentially many searches on the
240    /// haystack and since each search has worst case `O(m * n)` time
241    /// complexity, the overall worst case time complexity for iteration is
242    /// `O(m * n^2)`.
243    ///
244    /// # Example
245    ///
246    /// Find every word with exactly 13 Unicode word characters:
247    ///
248    /// ```
249    /// use regex::bytes::Regex;
250    ///
251    /// let re = Regex::new(r"\b\w{13}\b").unwrap();
252    /// let hay = b"Retroactively relinquishing remunerations is reprehensible.";
253    /// let matches: Vec<_> = re.find_iter(hay).map(|m| m.as_bytes()).collect();
254    /// assert_eq!(matches, vec![
255    ///     &b"Retroactively"[..],
256    ///     &b"relinquishing"[..],
257    ///     &b"remunerations"[..],
258    ///     &b"reprehensible"[..],
259    /// ]);
260    /// ```
261    #[inline]
262    pub fn find_iter<'r, 'h>(&'r self, haystack: &'h [u8]) -> Matches<'r, 'h> {
263        Matches { haystack, it: self.meta.find_iter(haystack) }
264    }
265
266    /// This routine searches for the first match of this regex in the haystack
267    /// given, and if found, returns not only the overall match but also the
268    /// matches of each capture group in the regex. If no match is found, then
269    /// `None` is returned.
270    ///
271    /// Capture group `0` always corresponds to an implicit unnamed group that
272    /// includes the entire match. If a match is found, this group is always
273    /// present. Subsequent groups may be named and are numbered, starting
274    /// at 1, by the order in which the opening parenthesis appears in the
275    /// pattern. For example, in the pattern `(?<a>.(?<b>.))(?<c>.)`, `a`,
276    /// `b` and `c` correspond to capture group indices `1`, `2` and `3`,
277    /// respectively.
278    ///
279    /// You should only use `captures` if you need access to the capture group
280    /// matches. Otherwise, [`Regex::find`] is generally faster for discovering
281    /// just the overall match.
282    ///
283    /// # Example
284    ///
285    /// Say you have some haystack with movie names and their release years,
286    /// like "'Citizen Kane' (1941)". It'd be nice if we could search for
287    /// strings looking like that, while also extracting the movie name and its
288    /// release year separately. The example below shows how to do that.
289    ///
290    /// ```
291    /// use regex::bytes::Regex;
292    ///
293    /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
294    /// let hay = b"Not my favorite movie: 'Citizen Kane' (1941).";
295    /// let caps = re.captures(hay).unwrap();
296    /// assert_eq!(caps.get(0).unwrap().as_bytes(), b"'Citizen Kane' (1941)");
297    /// assert_eq!(caps.get(1).unwrap().as_bytes(), b"Citizen Kane");
298    /// assert_eq!(caps.get(2).unwrap().as_bytes(), b"1941");
299    /// // You can also access the groups by index using the Index notation.
300    /// // Note that this will panic on an invalid index. In this case, these
301    /// // accesses are always correct because the overall regex will only
302    /// // match when these capture groups match.
303    /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
304    /// assert_eq!(&caps[1], b"Citizen Kane");
305    /// assert_eq!(&caps[2], b"1941");
306    /// ```
307    ///
308    /// Note that the full match is at capture group `0`. Each subsequent
309    /// capture group is indexed by the order of its opening `(`.
310    ///
311    /// We can make this example a bit clearer by using *named* capture groups:
312    ///
313    /// ```
314    /// use regex::bytes::Regex;
315    ///
316    /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>\d{4})\)").unwrap();
317    /// let hay = b"Not my favorite movie: 'Citizen Kane' (1941).";
318    /// let caps = re.captures(hay).unwrap();
319    /// assert_eq!(caps.get(0).unwrap().as_bytes(), b"'Citizen Kane' (1941)");
320    /// assert_eq!(caps.name("title").unwrap().as_bytes(), b"Citizen Kane");
321    /// assert_eq!(caps.name("year").unwrap().as_bytes(), b"1941");
322    /// // You can also access the groups by name using the Index notation.
323    /// // Note that this will panic on an invalid group name. In this case,
324    /// // these accesses are always correct because the overall regex will
325    /// // only match when these capture groups match.
326    /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
327    /// assert_eq!(&caps["title"], b"Citizen Kane");
328    /// assert_eq!(&caps["year"], b"1941");
329    /// ```
330    ///
331    /// Here we name the capture groups, which we can access with the `name`
332    /// method or the `Index` notation with a `&str`. Note that the named
333    /// capture groups are still accessible with `get` or the `Index` notation
334    /// with a `usize`.
335    ///
336    /// The `0`th capture group is always unnamed, so it must always be
337    /// accessed with `get(0)` or `[0]`.
338    ///
339    /// Finally, one other way to get the matched substrings is with the
340    /// [`Captures::extract`] API:
341    ///
342    /// ```
343    /// use regex::bytes::Regex;
344    ///
345    /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
346    /// let hay = b"Not my favorite movie: 'Citizen Kane' (1941).";
347    /// let (full, [title, year]) = re.captures(hay).unwrap().extract();
348    /// assert_eq!(full, b"'Citizen Kane' (1941)");
349    /// assert_eq!(title, b"Citizen Kane");
350    /// assert_eq!(year, b"1941");
351    /// ```
352    #[inline]
353    pub fn captures<'h>(&self, haystack: &'h [u8]) -> Option<Captures<'h>> {
354        self.captures_at(haystack, 0)
355    }
356
357    /// Returns an iterator that yields successive non-overlapping matches in
358    /// the given haystack. The iterator yields values of type [`Captures`].
359    ///
360    /// This is the same as [`Regex::find_iter`], but instead of only providing
361    /// access to the overall match, each value yielded includes access to the
362    /// matches of all capture groups in the regex. Reporting this extra match
363    /// data is potentially costly, so callers should only use `captures_iter`
364    /// over `find_iter` when they actually need access to the capture group
365    /// matches.
366    ///
367    /// # Time complexity
368    ///
369    /// Note that since `captures_iter` runs potentially many searches on the
370    /// haystack and since each search has worst case `O(m * n)` time
371    /// complexity, the overall worst case time complexity for iteration is
372    /// `O(m * n^2)`.
373    ///
374    /// # Example
375    ///
376    /// We can use this to find all movie titles and their release years in
377    /// some haystack, where the movie is formatted like "'Title' (xxxx)":
378    ///
379    /// ```
380    /// use regex::bytes::Regex;
381    ///
382    /// let re = Regex::new(r"'([^']+)'\s+\(([0-9]{4})\)").unwrap();
383    /// let hay = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
384    /// let mut movies = vec![];
385    /// for (_, [title, year]) in re.captures_iter(hay).map(|c| c.extract()) {
386    ///     // OK because [0-9]{4} can only match valid UTF-8.
387    ///     let year = std::str::from_utf8(year).unwrap();
388    ///     movies.push((title, year.parse::<i64>()?));
389    /// }
390    /// assert_eq!(movies, vec![
391    ///     (&b"Citizen Kane"[..], 1941),
392    ///     (&b"The Wizard of Oz"[..], 1939),
393    ///     (&b"M"[..], 1931),
394    /// ]);
395    /// # Ok::<(), Box<dyn std::error::Error>>(())
396    /// ```
397    ///
398    /// Or with named groups:
399    ///
400    /// ```
401    /// use regex::bytes::Regex;
402    ///
403    /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>[0-9]{4})\)").unwrap();
404    /// let hay = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
405    /// let mut it = re.captures_iter(hay);
406    ///
407    /// let caps = it.next().unwrap();
408    /// assert_eq!(&caps["title"], b"Citizen Kane");
409    /// assert_eq!(&caps["year"], b"1941");
410    ///
411    /// let caps = it.next().unwrap();
412    /// assert_eq!(&caps["title"], b"The Wizard of Oz");
413    /// assert_eq!(&caps["year"], b"1939");
414    ///
415    /// let caps = it.next().unwrap();
416    /// assert_eq!(&caps["title"], b"M");
417    /// assert_eq!(&caps["year"], b"1931");
418    /// ```
419    #[inline]
420    pub fn captures_iter<'r, 'h>(
421        &'r self,
422        haystack: &'h [u8],
423    ) -> CaptureMatches<'r, 'h> {
424        CaptureMatches { haystack, it: self.meta.captures_iter(haystack) }
425    }
426
427    /// Returns an iterator of substrings of the haystack given, delimited by a
428    /// match of the regex. Namely, each element of the iterator corresponds to
429    /// a part of the haystack that *isn't* matched by the regular expression.
430    ///
431    /// # Time complexity
432    ///
433    /// Since iterating over all matches requires running potentially many
434    /// searches on the haystack, and since each search has worst case
435    /// `O(m * n)` time complexity, the overall worst case time complexity for
436    /// this routine is `O(m * n^2)`.
437    ///
438    /// # Example
439    ///
440    /// To split a string delimited by arbitrary amounts of spaces or tabs:
441    ///
442    /// ```
443    /// use regex::bytes::Regex;
444    ///
445    /// let re = Regex::new(r"[ \t]+").unwrap();
446    /// let hay = b"a b \t  c\td    e";
447    /// let fields: Vec<&[u8]> = re.split(hay).collect();
448    /// assert_eq!(fields, vec![
449    ///     &b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..],
450    /// ]);
451    /// ```
452    ///
453    /// # Example: more cases
454    ///
455    /// Basic usage:
456    ///
457    /// ```
458    /// use regex::bytes::Regex;
459    ///
460    /// let re = Regex::new(r" ").unwrap();
461    /// let hay = b"Mary had a little lamb";
462    /// let got: Vec<&[u8]> = re.split(hay).collect();
463    /// assert_eq!(got, vec![
464    ///     &b"Mary"[..], &b"had"[..], &b"a"[..], &b"little"[..], &b"lamb"[..],
465    /// ]);
466    ///
467    /// let re = Regex::new(r"X").unwrap();
468    /// let hay = b"";
469    /// let got: Vec<&[u8]> = re.split(hay).collect();
470    /// assert_eq!(got, vec![&b""[..]]);
471    ///
472    /// let re = Regex::new(r"X").unwrap();
473    /// let hay = b"lionXXtigerXleopard";
474    /// let got: Vec<&[u8]> = re.split(hay).collect();
475    /// assert_eq!(got, vec![
476    ///     &b"lion"[..], &b""[..], &b"tiger"[..], &b"leopard"[..],
477    /// ]);
478    ///
479    /// let re = Regex::new(r"::").unwrap();
480    /// let hay = b"lion::tiger::leopard";
481    /// let got: Vec<&[u8]> = re.split(hay).collect();
482    /// assert_eq!(got, vec![&b"lion"[..], &b"tiger"[..], &b"leopard"[..]]);
483    /// ```
484    ///
485    /// If a haystack contains multiple contiguous matches, you will end up
486    /// with empty spans yielded by the iterator:
487    ///
488    /// ```
489    /// use regex::bytes::Regex;
490    ///
491    /// let re = Regex::new(r"X").unwrap();
492    /// let hay = b"XXXXaXXbXc";
493    /// let got: Vec<&[u8]> = re.split(hay).collect();
494    /// assert_eq!(got, vec![
495    ///     &b""[..], &b""[..], &b""[..], &b""[..],
496    ///     &b"a"[..], &b""[..], &b"b"[..], &b"c"[..],
497    /// ]);
498    ///
499    /// let re = Regex::new(r"/").unwrap();
500    /// let hay = b"(///)";
501    /// let got: Vec<&[u8]> = re.split(hay).collect();
502    /// assert_eq!(got, vec![&b"("[..], &b""[..], &b""[..], &b")"[..]]);
503    /// ```
504    ///
505    /// Separators at the start or end of a haystack are neighbored by empty
506    /// substrings.
507    ///
508    /// ```
509    /// use regex::bytes::Regex;
510    ///
511    /// let re = Regex::new(r"0").unwrap();
512    /// let hay = b"010";
513    /// let got: Vec<&[u8]> = re.split(hay).collect();
514    /// assert_eq!(got, vec![&b""[..], &b"1"[..], &b""[..]]);
515    /// ```
516    ///
517    /// When the regex can match the empty string, it splits at every byte
518    /// position in the haystack. This includes between all UTF-8 code units.
519    /// (The top-level [`Regex::split`](crate::Regex::split) will only split
520    /// at valid UTF-8 boundaries.)
521    ///
522    /// ```
523    /// use regex::bytes::Regex;
524    ///
525    /// let re = Regex::new(r"").unwrap();
526    /// let hay = "☃".as_bytes();
527    /// let got: Vec<&[u8]> = re.split(hay).collect();
528    /// assert_eq!(got, vec![
529    ///     &[][..], &[b'\xE2'][..], &[b'\x98'][..], &[b'\x83'][..], &[][..],
530    /// ]);
531    /// ```
532    ///
533    /// Contiguous separators (which commonly show up with whitespace) can lead to
534    /// possibly surprising behavior. For example, this code is correct:
535    ///
536    /// ```
537    /// use regex::bytes::Regex;
538    ///
539    /// let re = Regex::new(r" ").unwrap();
540    /// let hay = b"    a  b c";
541    /// let got: Vec<&[u8]> = re.split(hay).collect();
542    /// assert_eq!(got, vec![
543    ///     &b""[..], &b""[..], &b""[..], &b""[..],
544    ///     &b"a"[..], &b""[..], &b"b"[..], &b"c"[..],
545    /// ]);
546    /// ```
547    ///
548    /// It does *not* give you `["a", "b", "c"]`. For that behavior, you'd want
549    /// to match contiguous space characters:
550    ///
551    /// ```
552    /// use regex::bytes::Regex;
553    ///
554    /// let re = Regex::new(r" +").unwrap();
555    /// let hay = b"    a  b c";
556    /// let got: Vec<&[u8]> = re.split(hay).collect();
557    /// // N.B. This does still include a leading empty span because ' +'
558    /// // matches at the beginning of the haystack.
559    /// assert_eq!(got, vec![&b""[..], &b"a"[..], &b"b"[..], &b"c"[..]]);
560    /// ```
561    #[inline]
562    pub fn split<'r, 'h>(&'r self, haystack: &'h [u8]) -> Split<'r, 'h> {
563        Split { haystack, it: self.meta.split(haystack) }
564    }
565
566    /// Returns an iterator of at most `limit` substrings of the haystack
567    /// given, delimited by a match of the regex. (A `limit` of `0` will return
568    /// no substrings.) Namely, each element of the iterator corresponds to a
569    /// part of the haystack that *isn't* matched by the regular expression.
570    /// The remainder of the haystack that is not split will be the last
571    /// element in the iterator.
572    ///
573    /// # Time complexity
574    ///
575    /// Since iterating over all matches requires running potentially many
576    /// searches on the haystack, and since each search has worst case
577    /// `O(m * n)` time complexity, the overall worst case time complexity for
578    /// this routine is `O(m * n^2)`.
579    ///
580    /// Although note that the worst case time here has an upper bound given
581    /// by the `limit` parameter.
582    ///
583    /// # Example
584    ///
585    /// Get the first two words in some haystack:
586    ///
587    /// ```
588    /// use regex::bytes::Regex;
589    ///
590    /// let re = Regex::new(r"\W+").unwrap();
591    /// let hay = b"Hey! How are you?";
592    /// let fields: Vec<&[u8]> = re.splitn(hay, 3).collect();
593    /// assert_eq!(fields, vec![&b"Hey"[..], &b"How"[..], &b"are you?"[..]]);
594    /// ```
595    ///
596    /// # Examples: more cases
597    ///
598    /// ```
599    /// use regex::bytes::Regex;
600    ///
601    /// let re = Regex::new(r" ").unwrap();
602    /// let hay = b"Mary had a little lamb";
603    /// let got: Vec<&[u8]> = re.splitn(hay, 3).collect();
604    /// assert_eq!(got, vec![&b"Mary"[..], &b"had"[..], &b"a little lamb"[..]]);
605    ///
606    /// let re = Regex::new(r"X").unwrap();
607    /// let hay = b"";
608    /// let got: Vec<&[u8]> = re.splitn(hay, 3).collect();
609    /// assert_eq!(got, vec![&b""[..]]);
610    ///
611    /// let re = Regex::new(r"X").unwrap();
612    /// let hay = b"lionXXtigerXleopard";
613    /// let got: Vec<&[u8]> = re.splitn(hay, 3).collect();
614    /// assert_eq!(got, vec![&b"lion"[..], &b""[..], &b"tigerXleopard"[..]]);
615    ///
616    /// let re = Regex::new(r"::").unwrap();
617    /// let hay = b"lion::tiger::leopard";
618    /// let got: Vec<&[u8]> = re.splitn(hay, 2).collect();
619    /// assert_eq!(got, vec![&b"lion"[..], &b"tiger::leopard"[..]]);
620    ///
621    /// let re = Regex::new(r"X").unwrap();
622    /// let hay = b"abcXdef";
623    /// let got: Vec<&[u8]> = re.splitn(hay, 1).collect();
624    /// assert_eq!(got, vec![&b"abcXdef"[..]]);
625    ///
626    /// let re = Regex::new(r"X").unwrap();
627    /// let hay = b"abcdef";
628    /// let got: Vec<&[u8]> = re.splitn(hay, 2).collect();
629    /// assert_eq!(got, vec![&b"abcdef"[..]]);
630    ///
631    /// let re = Regex::new(r"X").unwrap();
632    /// let hay = b"abcXdef";
633    /// let got: Vec<&[u8]> = re.splitn(hay, 0).collect();
634    /// assert!(got.is_empty());
635    /// ```
636    #[inline]
637    pub fn splitn<'r, 'h>(
638        &'r self,
639        haystack: &'h [u8],
640        limit: usize,
641    ) -> SplitN<'r, 'h> {
642        SplitN { haystack, it: self.meta.splitn(haystack, limit) }
643    }
644
645    /// Replaces the leftmost-first match in the given haystack with the
646    /// replacement provided. The replacement can be a regular string (where
647    /// `$N` and `$name` are expanded to match capture groups) or a function
648    /// that takes a [`Captures`] and returns the replaced string.
649    ///
650    /// If no match is found, then the haystack is returned unchanged. In that
651    /// case, this implementation will likely return a `Cow::Borrowed` value
652    /// such that no allocation is performed.
653    ///
654    /// When a `Cow::Borrowed` is returned, the value returned is guaranteed
655    /// to be equivalent to the `haystack` given.
656    ///
657    /// # Replacement string syntax
658    ///
659    /// All instances of `$ref` in the replacement string are replaced with
660    /// the substring corresponding to the capture group identified by `ref`.
661    ///
662    /// `ref` may be an integer corresponding to the index of the capture group
663    /// (counted by order of opening parenthesis where `0` is the entire match)
664    /// or it can be a name (consisting of letters, digits or underscores)
665    /// corresponding to a named capture group.
666    ///
667    /// If `ref` isn't a valid capture group (whether the name doesn't exist or
668    /// isn't a valid index), then it is replaced with the empty string.
669    ///
670    /// The longest possible name is used. For example, `$1a` looks up the
671    /// capture group named `1a` and not the capture group at index `1`. To
672    /// exert more precise control over the name, use braces, e.g., `${1}a`.
673    ///
674    /// To write a literal `$` use `$$`.
675    ///
676    /// # Example
677    ///
678    /// Note that this function is polymorphic with respect to the replacement.
679    /// In typical usage, this can just be a normal string:
680    ///
681    /// ```
682    /// use regex::bytes::Regex;
683    ///
684    /// let re = Regex::new(r"[^01]+").unwrap();
685    /// assert_eq!(re.replace(b"1078910", b""), &b"1010"[..]);
686    /// ```
687    ///
688    /// But anything satisfying the [`Replacer`] trait will work. For example,
689    /// a closure of type `|&Captures| -> String` provides direct access to the
690    /// captures corresponding to a match. This allows one to access capturing
691    /// group matches easily:
692    ///
693    /// ```
694    /// use regex::bytes::{Captures, Regex};
695    ///
696    /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
697    /// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| {
698    ///     let mut buf = vec![];
699    ///     buf.extend_from_slice(&caps[2]);
700    ///     buf.push(b' ');
701    ///     buf.extend_from_slice(&caps[1]);
702    ///     buf
703    /// });
704    /// assert_eq!(result, &b"Bruce Springsteen"[..]);
705    /// ```
706    ///
707    /// But this is a bit cumbersome to use all the time. Instead, a simple
708    /// syntax is supported (as described above) that expands `$name` into the
709    /// corresponding capture group. Here's the last example, but using this
710    /// expansion technique with named capture groups:
711    ///
712    /// ```
713    /// use regex::bytes::Regex;
714    ///
715    /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap();
716    /// let result = re.replace(b"Springsteen, Bruce", b"$first $last");
717    /// assert_eq!(result, &b"Bruce Springsteen"[..]);
718    /// ```
719    ///
720    /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
721    /// would produce the same result. To write a literal `$` use `$$`.
722    ///
723    /// Sometimes the replacement string requires use of curly braces to
724    /// delineate a capture group replacement when it is adjacent to some other
725    /// literal text. For example, if we wanted to join two words together with
726    /// an underscore:
727    ///
728    /// ```
729    /// use regex::bytes::Regex;
730    ///
731    /// let re = Regex::new(r"(?<first>\w+)\s+(?<second>\w+)").unwrap();
732    /// let result = re.replace(b"deep fried", b"${first}_$second");
733    /// assert_eq!(result, &b"deep_fried"[..]);
734    /// ```
735    ///
736    /// Without the curly braces, the capture group name `first_` would be
737    /// used, and since it doesn't exist, it would be replaced with the empty
738    /// string.
739    ///
740    /// Finally, sometimes you just want to replace a literal string with no
741    /// regard for capturing group expansion. This can be done by wrapping a
742    /// string with [`NoExpand`]:
743    ///
744    /// ```
745    /// use regex::bytes::{NoExpand, Regex};
746    ///
747    /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap();
748    /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last"));
749    /// assert_eq!(result, &b"$2 $last"[..]);
750    /// ```
751    ///
752    /// Using `NoExpand` may also be faster, since the replacement string won't
753    /// need to be parsed for the `$` syntax.
754    #[inline]
755    pub fn replace<'h, R: Replacer>(
756        &self,
757        haystack: &'h [u8],
758        rep: R,
759    ) -> Cow<'h, [u8]> {
760        self.replacen(haystack, 1, rep)
761    }
762
763    /// Replaces all non-overlapping matches in the haystack with the
764    /// replacement provided. This is the same as calling `replacen` with
765    /// `limit` set to `0`.
766    ///
767    /// If no match is found, then the haystack is returned unchanged. In that
768    /// case, this implementation will likely return a `Cow::Borrowed` value
769    /// such that no allocation is performed.
770    ///
771    /// When a `Cow::Borrowed` is returned, the value returned is guaranteed
772    /// to be equivalent to the `haystack` given.
773    ///
774    /// The documentation for [`Regex::replace`] goes into more detail about
775    /// what kinds of replacement strings are supported.
776    ///
777    /// # Time complexity
778    ///
    /// Since iterating over all matches requires running potentially many
780    /// searches on the haystack, and since each search has worst case
781    /// `O(m * n)` time complexity, the overall worst case time complexity for
782    /// this routine is `O(m * n^2)`.
783    ///
784    /// # Fallibility
785    ///
786    /// If you need to write a replacement routine where any individual
787    /// replacement might "fail," doing so with this API isn't really feasible
788    /// because there's no way to stop the search process if a replacement
789    /// fails. Instead, if you need this functionality, you should consider
790    /// implementing your own replacement routine:
791    ///
792    /// ```
793    /// use regex::bytes::{Captures, Regex};
794    ///
795    /// fn replace_all<E>(
796    ///     re: &Regex,
797    ///     haystack: &[u8],
798    ///     replacement: impl Fn(&Captures) -> Result<Vec<u8>, E>,
799    /// ) -> Result<Vec<u8>, E> {
800    ///     let mut new = Vec::with_capacity(haystack.len());
801    ///     let mut last_match = 0;
802    ///     for caps in re.captures_iter(haystack) {
803    ///         let m = caps.get(0).unwrap();
804    ///         new.extend_from_slice(&haystack[last_match..m.start()]);
805    ///         new.extend_from_slice(&replacement(&caps)?);
806    ///         last_match = m.end();
807    ///     }
808    ///     new.extend_from_slice(&haystack[last_match..]);
809    ///     Ok(new)
810    /// }
811    ///
812    /// // Let's replace each word with the number of bytes in that word.
813    /// // But if we see a word that is "too long," we'll give up.
814    /// let re = Regex::new(r"\w+").unwrap();
815    /// let replacement = |caps: &Captures| -> Result<Vec<u8>, &'static str> {
816    ///     if caps[0].len() >= 5 {
817    ///         return Err("word too long");
818    ///     }
819    ///     Ok(caps[0].len().to_string().into_bytes())
820    /// };
821    /// assert_eq!(
822    ///     Ok(b"2 3 3 3?".to_vec()),
823    ///     replace_all(&re, b"hi how are you?", &replacement),
824    /// );
825    /// assert!(replace_all(&re, b"hi there", &replacement).is_err());
826    /// ```
827    ///
828    /// # Example
829    ///
830    /// This example shows how to flip the order of whitespace (excluding line
831    /// terminators) delimited fields, and normalizes the whitespace that
832    /// delimits the fields:
833    ///
834    /// ```
835    /// use regex::bytes::Regex;
836    ///
837    /// let re = Regex::new(r"(?m)^(\S+)[\s--\r\n]+(\S+)$").unwrap();
838    /// let hay = b"
839    /// Greetings  1973
840    /// Wild\t1973
841    /// BornToRun\t\t\t\t1975
842    /// Darkness                    1978
843    /// TheRiver 1980
844    /// ";
845    /// let new = re.replace_all(hay, b"$2 $1");
846    /// assert_eq!(new, &b"
847    /// 1973 Greetings
848    /// 1973 Wild
849    /// 1975 BornToRun
850    /// 1978 Darkness
851    /// 1980 TheRiver
852    /// "[..]);
853    /// ```
854    #[inline]
855    pub fn replace_all<'h, R: Replacer>(
856        &self,
857        haystack: &'h [u8],
858        rep: R,
859    ) -> Cow<'h, [u8]> {
860        self.replacen(haystack, 0, rep)
861    }
862
863    /// Replaces at most `limit` non-overlapping matches in the haystack with
864    /// the replacement provided. If `limit` is `0`, then all non-overlapping
865    /// matches are replaced. That is, `Regex::replace_all(hay, rep)` is
866    /// equivalent to `Regex::replacen(hay, 0, rep)`.
867    ///
868    /// If no match is found, then the haystack is returned unchanged. In that
869    /// case, this implementation will likely return a `Cow::Borrowed` value
870    /// such that no allocation is performed.
871    ///
872    /// When a `Cow::Borrowed` is returned, the value returned is guaranteed
873    /// to be equivalent to the `haystack` given.
874    ///
875    /// The documentation for [`Regex::replace`] goes into more detail about
876    /// what kinds of replacement strings are supported.
877    ///
878    /// # Time complexity
879    ///
    /// Since iterating over all matches requires running potentially many
881    /// searches on the haystack, and since each search has worst case
882    /// `O(m * n)` time complexity, the overall worst case time complexity for
883    /// this routine is `O(m * n^2)`.
884    ///
885    /// Although note that the worst case time here has an upper bound given
886    /// by the `limit` parameter.
887    ///
888    /// # Fallibility
889    ///
890    /// See the corresponding section in the docs for [`Regex::replace_all`]
891    /// for tips on how to deal with a replacement routine that can fail.
892    ///
893    /// # Example
894    ///
895    /// This example shows how to flip the order of whitespace (excluding line
896    /// terminators) delimited fields, and normalizes the whitespace that
897    /// delimits the fields. But we only do it for the first two matches.
898    ///
899    /// ```
900    /// use regex::bytes::Regex;
901    ///
902    /// let re = Regex::new(r"(?m)^(\S+)[\s--\r\n]+(\S+)$").unwrap();
903    /// let hay = b"
904    /// Greetings  1973
905    /// Wild\t1973
906    /// BornToRun\t\t\t\t1975
907    /// Darkness                    1978
908    /// TheRiver 1980
909    /// ";
910    /// let new = re.replacen(hay, 2, b"$2 $1");
911    /// assert_eq!(new, &b"
912    /// 1973 Greetings
913    /// 1973 Wild
914    /// BornToRun\t\t\t\t1975
915    /// Darkness                    1978
916    /// TheRiver 1980
917    /// "[..]);
918    /// ```
919    #[inline]
920    pub fn replacen<'h, R: Replacer>(
921        &self,
922        haystack: &'h [u8],
923        limit: usize,
924        mut rep: R,
925    ) -> Cow<'h, [u8]> {
926        // If we know that the replacement doesn't have any capture expansions,
927        // then we can use the fast path. The fast path can make a tremendous
928        // difference:
929        //
930        //   1) We use `find_iter` instead of `captures_iter`. Not asking for
931        //      captures generally makes the regex engines faster.
932        //   2) We don't need to look up all of the capture groups and do
933        //      replacements inside the replacement string. We just push it
934        //      at each match and be done with it.
935        if let Some(rep) = rep.no_expansion() {
936            let mut it = self.find_iter(haystack).enumerate().peekable();
937            if it.peek().is_none() {
938                return Cow::Borrowed(haystack);
939            }
940            let mut new = Vec::with_capacity(haystack.len());
941            let mut last_match = 0;
942            for (i, m) in it {
943                new.extend_from_slice(&haystack[last_match..m.start()]);
944                new.extend_from_slice(&rep);
945                last_match = m.end();
946                if limit > 0 && i >= limit - 1 {
947                    break;
948                }
949            }
950            new.extend_from_slice(&haystack[last_match..]);
951            return Cow::Owned(new);
952        }
953
954        // The slower path, which we use if the replacement needs access to
955        // capture groups.
956        let mut it = self.captures_iter(haystack).enumerate().peekable();
957        if it.peek().is_none() {
958            return Cow::Borrowed(haystack);
959        }
960        let mut new = Vec::with_capacity(haystack.len());
961        let mut last_match = 0;
962        for (i, cap) in it {
963            // unwrap on 0 is OK because captures only reports matches
964            let m = cap.get(0).unwrap();
965            new.extend_from_slice(&haystack[last_match..m.start()]);
966            rep.replace_append(&cap, &mut new);
967            last_match = m.end();
968            if limit > 0 && i >= limit - 1 {
969                break;
970            }
971        }
972        new.extend_from_slice(&haystack[last_match..]);
973        Cow::Owned(new)
974    }
975}
976
977/// A group of advanced or "lower level" search methods. Some methods permit
978/// starting the search at a position greater than `0` in the haystack. Other
979/// methods permit reusing allocations, for example, when extracting the
980/// matches for capture groups.
981impl Regex {
982    /// Returns the end byte offset of the first match in the haystack given.
983    ///
984    /// This method may have the same performance characteristics as
    /// `is_match`. Behaviorally, it doesn't just report whether a match
986    /// occurs, but also the end offset for a match. In particular, the offset
987    /// returned *may be shorter* than the proper end of the leftmost-first
988    /// match that you would find via [`Regex::find`].
989    ///
990    /// Note that it is not guaranteed that this routine finds the shortest or
991    /// "earliest" possible match. Instead, the main idea of this API is that
992    /// it returns the offset at the point at which the internal regex engine
993    /// has determined that a match has occurred. This may vary depending on
994    /// which internal regex engine is used, and thus, the offset itself may
995    /// change based on internal heuristics.
996    ///
997    /// # Example
998    ///
999    /// Typically, `a+` would match the entire first sequence of `a` in some
1000    /// haystack, but `shortest_match` *may* give up as soon as it sees the
1001    /// first `a`.
1002    ///
1003    /// ```
1004    /// use regex::bytes::Regex;
1005    ///
1006    /// let re = Regex::new(r"a+").unwrap();
1007    /// let offset = re.shortest_match(b"aaaaa").unwrap();
1008    /// assert_eq!(offset, 1);
1009    /// ```
1010    #[inline]
1011    pub fn shortest_match(&self, haystack: &[u8]) -> Option<usize> {
1012        self.shortest_match_at(haystack, 0)
1013    }
1014
1015    /// Returns the same as `shortest_match`, but starts the search at the
1016    /// given offset.
1017    ///
1018    /// The significance of the starting point is that it takes the surrounding
1019    /// context into consideration. For example, the `\A` anchor can only match
1020    /// when `start == 0`.
1021    ///
1022    /// If a match is found, the offset returned is relative to the beginning
1023    /// of the haystack, not the beginning of the search.
1024    ///
1025    /// # Panics
1026    ///
1027    /// This panics when `start >= haystack.len() + 1`.
1028    ///
1029    /// # Example
1030    ///
1031    /// This example shows the significance of `start` by demonstrating how it
1032    /// can be used to permit look-around assertions in a regex to take the
1033    /// surrounding context into account.
1034    ///
1035    /// ```
1036    /// use regex::bytes::Regex;
1037    ///
1038    /// let re = Regex::new(r"\bchew\b").unwrap();
1039    /// let hay = b"eschew";
1040    /// // We get a match here, but it's probably not intended.
1041    /// assert_eq!(re.shortest_match(&hay[2..]), Some(4));
    /// // No match because the assertions take the context into account.
1043    /// assert_eq!(re.shortest_match_at(hay, 2), None);
1044    /// ```
1045    #[inline]
1046    pub fn shortest_match_at(
1047        &self,
1048        haystack: &[u8],
1049        start: usize,
1050    ) -> Option<usize> {
1051        let input =
1052            Input::new(haystack).earliest(true).span(start..haystack.len());
1053        self.meta.search_half(&input).map(|hm| hm.offset())
1054    }
1055
1056    /// Returns the same as [`Regex::is_match`], but starts the search at the
1057    /// given offset.
1058    ///
1059    /// The significance of the starting point is that it takes the surrounding
1060    /// context into consideration. For example, the `\A` anchor can only
1061    /// match when `start == 0`.
1062    ///
1063    /// # Panics
1064    ///
1065    /// This panics when `start >= haystack.len() + 1`.
1066    ///
1067    /// # Example
1068    ///
1069    /// This example shows the significance of `start` by demonstrating how it
1070    /// can be used to permit look-around assertions in a regex to take the
1071    /// surrounding context into account.
1072    ///
1073    /// ```
1074    /// use regex::bytes::Regex;
1075    ///
1076    /// let re = Regex::new(r"\bchew\b").unwrap();
1077    /// let hay = b"eschew";
1078    /// // We get a match here, but it's probably not intended.
1079    /// assert!(re.is_match(&hay[2..]));
    /// // No match because the assertions take the context into account.
1081    /// assert!(!re.is_match_at(hay, 2));
1082    /// ```
1083    #[inline]
1084    pub fn is_match_at(&self, haystack: &[u8], start: usize) -> bool {
1085        self.meta.is_match(Input::new(haystack).span(start..haystack.len()))
1086    }
1087
1088    /// Returns the same as [`Regex::find`], but starts the search at the given
1089    /// offset.
1090    ///
1091    /// The significance of the starting point is that it takes the surrounding
1092    /// context into consideration. For example, the `\A` anchor can only
1093    /// match when `start == 0`.
1094    ///
1095    /// # Panics
1096    ///
1097    /// This panics when `start >= haystack.len() + 1`.
1098    ///
1099    /// # Example
1100    ///
1101    /// This example shows the significance of `start` by demonstrating how it
1102    /// can be used to permit look-around assertions in a regex to take the
1103    /// surrounding context into account.
1104    ///
1105    /// ```
1106    /// use regex::bytes::Regex;
1107    ///
1108    /// let re = Regex::new(r"\bchew\b").unwrap();
1109    /// let hay = b"eschew";
1110    /// // We get a match here, but it's probably not intended.
1111    /// assert_eq!(re.find(&hay[2..]).map(|m| m.range()), Some(0..4));
    /// // No match because the assertions take the context into account.
1113    /// assert_eq!(re.find_at(hay, 2), None);
1114    /// ```
1115    #[inline]
1116    pub fn find_at<'h>(
1117        &self,
1118        haystack: &'h [u8],
1119        start: usize,
1120    ) -> Option<Match<'h>> {
1121        let input = Input::new(haystack).span(start..haystack.len());
1122        self.meta.find(input).map(|m| Match::new(haystack, m.start(), m.end()))
1123    }
1124
1125    /// Returns the same as [`Regex::captures`], but starts the search at the
1126    /// given offset.
1127    ///
1128    /// The significance of the starting point is that it takes the surrounding
1129    /// context into consideration. For example, the `\A` anchor can only
1130    /// match when `start == 0`.
1131    ///
1132    /// # Panics
1133    ///
1134    /// This panics when `start >= haystack.len() + 1`.
1135    ///
1136    /// # Example
1137    ///
1138    /// This example shows the significance of `start` by demonstrating how it
1139    /// can be used to permit look-around assertions in a regex to take the
1140    /// surrounding context into account.
1141    ///
1142    /// ```
1143    /// use regex::bytes::Regex;
1144    ///
1145    /// let re = Regex::new(r"\bchew\b").unwrap();
1146    /// let hay = b"eschew";
1147    /// // We get a match here, but it's probably not intended.
1148    /// assert_eq!(&re.captures(&hay[2..]).unwrap()[0], b"chew");
    /// // No match because the assertions take the context into account.
1150    /// assert!(re.captures_at(hay, 2).is_none());
1151    /// ```
1152    #[inline]
1153    pub fn captures_at<'h>(
1154        &self,
1155        haystack: &'h [u8],
1156        start: usize,
1157    ) -> Option<Captures<'h>> {
1158        let input = Input::new(haystack).span(start..haystack.len());
1159        let mut caps = self.meta.create_captures();
1160        self.meta.captures(input, &mut caps);
1161        if caps.is_match() {
1162            let static_captures_len = self.static_captures_len();
1163            Some(Captures { haystack, caps, static_captures_len })
1164        } else {
1165            None
1166        }
1167    }
1168
1169    /// This is like [`Regex::captures`], but writes the byte offsets of each
1170    /// capture group match into the locations given.
1171    ///
1172    /// A [`CaptureLocations`] stores the same byte offsets as a [`Captures`],
1173    /// but does *not* store a reference to the haystack. This makes its API
1174    /// a bit lower level and less convenient. But in exchange, callers
1175    /// may allocate their own `CaptureLocations` and reuse it for multiple
1176    /// searches. This may be helpful if allocating a `Captures` shows up in a
1177    /// profile as too costly.
1178    ///
1179    /// To create a `CaptureLocations` value, use the
1180    /// [`Regex::capture_locations`] method.
1181    ///
1182    /// This also returns the overall match if one was found. When a match is
1183    /// found, its offsets are also always stored in `locs` at index `0`.
1184    ///
1185    /// # Example
1186    ///
1187    /// ```
1188    /// use regex::bytes::Regex;
1189    ///
1190    /// let re = Regex::new(r"^([a-z]+)=(\S*)$").unwrap();
1191    /// let mut locs = re.capture_locations();
1192    /// assert!(re.captures_read(&mut locs, b"id=foo123").is_some());
1193    /// assert_eq!(Some((0, 9)), locs.get(0));
1194    /// assert_eq!(Some((0, 2)), locs.get(1));
1195    /// assert_eq!(Some((3, 9)), locs.get(2));
1196    /// ```
1197    #[inline]
1198    pub fn captures_read<'h>(
1199        &self,
1200        locs: &mut CaptureLocations,
1201        haystack: &'h [u8],
1202    ) -> Option<Match<'h>> {
1203        self.captures_read_at(locs, haystack, 0)
1204    }
1205
1206    /// Returns the same as [`Regex::captures_read`], but starts the search at
1207    /// the given offset.
1208    ///
1209    /// The significance of the starting point is that it takes the surrounding
1210    /// context into consideration. For example, the `\A` anchor can only
1211    /// match when `start == 0`.
1212    ///
1213    /// # Panics
1214    ///
1215    /// This panics when `start >= haystack.len() + 1`.
1216    ///
1217    /// # Example
1218    ///
1219    /// This example shows the significance of `start` by demonstrating how it
1220    /// can be used to permit look-around assertions in a regex to take the
1221    /// surrounding context into account.
1222    ///
1223    /// ```
1224    /// use regex::bytes::Regex;
1225    ///
1226    /// let re = Regex::new(r"\bchew\b").unwrap();
1227    /// let hay = b"eschew";
1228    /// let mut locs = re.capture_locations();
1229    /// // We get a match here, but it's probably not intended.
1230    /// assert!(re.captures_read(&mut locs, &hay[2..]).is_some());
    /// // No match because the assertions take the context into account.
1232    /// assert!(re.captures_read_at(&mut locs, hay, 2).is_none());
1233    /// ```
1234    #[inline]
1235    pub fn captures_read_at<'h>(
1236        &self,
1237        locs: &mut CaptureLocations,
1238        haystack: &'h [u8],
1239        start: usize,
1240    ) -> Option<Match<'h>> {
1241        let input = Input::new(haystack).span(start..haystack.len());
1242        self.meta.search_captures(&input, &mut locs.0);
1243        locs.0.get_match().map(|m| Match::new(haystack, m.start(), m.end()))
1244    }
1245
1246    /// An undocumented alias for `captures_read_at`.
1247    ///
1248    /// The `regex-capi` crate previously used this routine, so to avoid
1249    /// breaking that crate, we continue to provide the name as an undocumented
1250    /// alias.
1251    #[doc(hidden)]
1252    #[inline]
1253    pub fn read_captures_at<'h>(
1254        &self,
1255        locs: &mut CaptureLocations,
1256        haystack: &'h [u8],
1257        start: usize,
1258    ) -> Option<Match<'h>> {
1259        self.captures_read_at(locs, haystack, start)
1260    }
1261}
1262
1263/// Auxiliary methods.
1264impl Regex {
1265    /// Returns the original string of this regex.
1266    ///
1267    /// # Example
1268    ///
1269    /// ```
1270    /// use regex::bytes::Regex;
1271    ///
1272    /// let re = Regex::new(r"foo\w+bar").unwrap();
1273    /// assert_eq!(re.as_str(), r"foo\w+bar");
1274    /// ```
1275    #[inline]
1276    pub fn as_str(&self) -> &str {
1277        &self.pattern
1278    }
1279
1280    /// Returns an iterator over the capture names in this regex.
1281    ///
1282    /// The iterator returned yields elements of type `Option<&str>`. That is,
1283    /// the iterator yields values for all capture groups, even ones that are
1284    /// unnamed. The order of the groups corresponds to the order of the group's
1285    /// corresponding opening parenthesis.
1286    ///
1287    /// The first element of the iterator always yields the group corresponding
1288    /// to the overall match, and this group is always unnamed. Therefore, the
1289    /// iterator always yields at least one group.
1290    ///
1291    /// # Example
1292    ///
1293    /// This shows basic usage with a mix of named and unnamed capture groups:
1294    ///
1295    /// ```
1296    /// use regex::bytes::Regex;
1297    ///
1298    /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap();
1299    /// let mut names = re.capture_names();
1300    /// assert_eq!(names.next(), Some(None));
1301    /// assert_eq!(names.next(), Some(Some("a")));
1302    /// assert_eq!(names.next(), Some(Some("b")));
1303    /// assert_eq!(names.next(), Some(None));
1304    /// // the '(?:.)' group is non-capturing and so doesn't appear here!
1305    /// assert_eq!(names.next(), Some(Some("c")));
1306    /// assert_eq!(names.next(), None);
1307    /// ```
1308    ///
1309    /// The iterator always yields at least one element, even for regexes with
1310    /// no capture groups and even for regexes that can never match:
1311    ///
1312    /// ```
1313    /// use regex::bytes::Regex;
1314    ///
1315    /// let re = Regex::new(r"").unwrap();
1316    /// let mut names = re.capture_names();
1317    /// assert_eq!(names.next(), Some(None));
1318    /// assert_eq!(names.next(), None);
1319    ///
1320    /// let re = Regex::new(r"[a&&b]").unwrap();
1321    /// let mut names = re.capture_names();
1322    /// assert_eq!(names.next(), Some(None));
1323    /// assert_eq!(names.next(), None);
1324    /// ```
1325    #[inline]
1326    pub fn capture_names(&self) -> CaptureNames<'_> {
1327        CaptureNames(self.meta.group_info().pattern_names(PatternID::ZERO))
1328    }
1329
    /// Returns the number of capture groups in this regex.
1331    ///
1332    /// This includes all named and unnamed groups, including the implicit
1333    /// unnamed group that is always present and corresponds to the entire
1334    /// match.
1335    ///
1336    /// Since the implicit unnamed group is always included in this length, the
1337    /// length returned is guaranteed to be greater than zero.
1338    ///
1339    /// # Example
1340    ///
1341    /// ```
1342    /// use regex::bytes::Regex;
1343    ///
1344    /// let re = Regex::new(r"foo").unwrap();
1345    /// assert_eq!(1, re.captures_len());
1346    ///
1347    /// let re = Regex::new(r"(foo)").unwrap();
1348    /// assert_eq!(2, re.captures_len());
1349    ///
1350    /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap();
1351    /// assert_eq!(5, re.captures_len());
1352    ///
1353    /// let re = Regex::new(r"[a&&b]").unwrap();
1354    /// assert_eq!(1, re.captures_len());
1355    /// ```
1356    #[inline]
1357    pub fn captures_len(&self) -> usize {
1358        self.meta.group_info().group_len(PatternID::ZERO)
1359    }
1360
1361    /// Returns the total number of capturing groups that appear in every
1362    /// possible match.
1363    ///
1364    /// If the number of capture groups can vary depending on the match, then
1365    /// this returns `None`. That is, a value is only returned when the number
1366    /// of matching groups is invariant or "static."
1367    ///
1368    /// Note that like [`Regex::captures_len`], this **does** include the
1369    /// implicit capturing group corresponding to the entire match. Therefore,
1370    /// when a non-None value is returned, it is guaranteed to be at least `1`.
1371    /// Stated differently, a return value of `Some(0)` is impossible.
1372    ///
1373    /// # Example
1374    ///
1375    /// This shows a few cases where a static number of capture groups is
1376    /// available and a few cases where it is not.
1377    ///
1378    /// ```
1379    /// use regex::bytes::Regex;
1380    ///
1381    /// let len = |pattern| {
1382    ///     Regex::new(pattern).map(|re| re.static_captures_len())
1383    /// };
1384    ///
1385    /// assert_eq!(Some(1), len("a")?);
1386    /// assert_eq!(Some(2), len("(a)")?);
1387    /// assert_eq!(Some(2), len("(a)|(b)")?);
1388    /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
1389    /// assert_eq!(None, len("(a)|b")?);
1390    /// assert_eq!(None, len("a|(b)")?);
1391    /// assert_eq!(None, len("(b)*")?);
1392    /// assert_eq!(Some(2), len("(b)+")?);
1393    ///
1394    /// # Ok::<(), Box<dyn std::error::Error>>(())
1395    /// ```
1396    #[inline]
1397    pub fn static_captures_len(&self) -> Option<usize> {
1398        self.meta.static_captures_len()
1399    }
1400
    /// Returns a freshly allocated set of capture locations that can
1402    /// be reused in multiple calls to [`Regex::captures_read`] or
1403    /// [`Regex::captures_read_at`].
1404    ///
1405    /// # Example
1406    ///
1407    /// ```
1408    /// use regex::bytes::Regex;
1409    ///
1410    /// let re = Regex::new(r"(.)(.)(\w+)").unwrap();
1411    /// let mut locs = re.capture_locations();
1412    /// assert!(re.captures_read(&mut locs, b"Padron").is_some());
1413    /// assert_eq!(locs.get(0), Some((0, 6)));
1414    /// assert_eq!(locs.get(1), Some((0, 1)));
1415    /// assert_eq!(locs.get(2), Some((1, 2)));
1416    /// assert_eq!(locs.get(3), Some((2, 6)));
1417    /// ```
1418    #[inline]
1419    pub fn capture_locations(&self) -> CaptureLocations {
1420        CaptureLocations(self.meta.create_captures())
1421    }
1422
1423    /// An alias for `capture_locations` to preserve backward compatibility.
1424    ///
1425    /// The `regex-capi` crate uses this method, so to avoid breaking that
1426    /// crate, we continue to export it as an undocumented API.
1427    #[doc(hidden)]
1428    #[inline]
1429    pub fn locations(&self) -> CaptureLocations {
1430        self.capture_locations()
1431    }
1432}
1433
1434/// Represents a single match of a regex in a haystack.
1435///
1436/// A `Match` contains both the start and end byte offsets of the match and the
1437/// actual substring corresponding to the range of those byte offsets. It is
1438/// guaranteed that `start <= end`. When `start == end`, the match is empty.
1439///
1440/// Unlike the top-level `Match` type, this `Match` type is produced by APIs
1441/// that search `&[u8]` haystacks. This means that the offsets in a `Match` can
1442/// point to anywhere in the haystack, including in a place that splits the
1443/// UTF-8 encoding of a Unicode scalar value.
1444///
/// The lifetime parameter `'h` refers to the lifetime of the haystack that
/// this match was produced from.
1447///
1448/// # Numbering
1449///
1450/// The byte offsets in a `Match` form a half-open interval. That is, the
1451/// start of the range is inclusive and the end of the range is exclusive.
1452/// For example, given a haystack `abcFOOxyz` and a match of `FOO`, its byte
1453/// offset range starts at `3` and ends at `6`. `3` corresponds to `F` and
1454/// `6` corresponds to `x`, which is one past the end of the match. This
1455/// corresponds to the same kind of slicing that Rust uses.
1456///
1457/// For more on why this was chosen over other schemes (aside from being
1458/// consistent with how Rust the language works), see [this discussion] and
1459/// [Dijkstra's note on a related topic][note].
1460///
1461/// [this discussion]: https://github.com/rust-lang/regex/discussions/866
1462/// [note]: https://www.cs.utexas.edu/users/EWD/transcriptions/EWD08xx/EWD831.html
1463///
1464/// # Example
1465///
1466/// This example shows the value of each of the methods on `Match` for a
1467/// particular search.
1468///
1469/// ```
1470/// use regex::bytes::Regex;
1471///
1472/// let re = Regex::new(r"\p{Greek}+").unwrap();
1473/// let hay = "Greek: αβγδ".as_bytes();
1474/// let m = re.find(hay).unwrap();
1475/// assert_eq!(7, m.start());
1476/// assert_eq!(15, m.end());
1477/// assert!(!m.is_empty());
1478/// assert_eq!(8, m.len());
1479/// assert_eq!(7..15, m.range());
1480/// assert_eq!("αβγδ".as_bytes(), m.as_bytes());
1481/// ```
#[derive(Copy, Clone, Eq, PartialEq)]
pub struct Match<'h> {
    /// The haystack that `start` and `end` index into.
    haystack: &'h [u8],
    /// Inclusive byte offset at which the match begins.
    start: usize,
    /// Exclusive byte offset at which the match stops.
    end: usize,
}

impl<'h> Match<'h> {
    /// Returns the byte offset in the haystack at which this match begins.
    /// The byte at this offset is the first byte of the match.
    ///
    /// It is guaranteed that `Match::start() <= Match::end()`.
    ///
    /// Unlike the top-level `Match` type, the start offset may appear anywhere
    /// in the haystack. This includes between the code units of a UTF-8
    /// encoded Unicode scalar value.
    #[inline]
    pub fn start(&self) -> usize {
        self.start
    }

    /// Returns the byte offset in the haystack at which this match stops.
    /// This is the offset of the byte immediately following the last byte of
    /// the match, so `&slice[start..end]` yields exactly the matched bytes.
    ///
    /// It is guaranteed that `Match::start() <= Match::end()`.
    ///
    /// Unlike the top-level `Match` type, the end offset may appear anywhere
    /// in the haystack. This includes between the code units of a UTF-8
    /// encoded Unicode scalar value.
    #[inline]
    pub fn end(&self) -> usize {
        self.end
    }

    /// Returns true if and only if this match spans zero bytes.
    ///
    /// An empty match can only be produced by a regex that is itself able to
    /// match the empty string. Some examples of such regexes: `^`, `^$`,
    /// `\b`, `a?`, `a*`, `a{0}`, `(foo|\d+|quux)?`.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.end == self.start
    }

    /// Returns the number of bytes covered by this match.
    #[inline]
    pub fn len(&self) -> usize {
        let core::ops::Range { start, end } = self.range();
        end - start
    }

    /// Returns the half-open range of byte offsets in the haystack that this
    /// match covers.
    #[inline]
    pub fn range(&self) -> core::ops::Range<usize> {
        core::ops::Range { start: self.start, end: self.end }
    }

    /// Returns the portion of the haystack that matched.
    #[inline]
    pub fn as_bytes(&self) -> &'h [u8] {
        &self.haystack[self.start..self.end]
    }

    /// Builds a match over `haystack` from the given byte offsets.
    ///
    /// The offsets are stored as given; no validation is performed here.
    #[inline]
    fn new(haystack: &'h [u8], start: usize, end: usize) -> Match<'h> {
        Self { haystack, start, end }
    }
}
1555
1556impl<'h> core::fmt::Debug for Match<'h> {
1557    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1558        use regex_automata::util::escape::DebugHaystack;
1559
1560        let mut fmt = f.debug_struct("Match");
1561        fmt.field("start", &self.start)
1562            .field("end", &self.end)
1563            .field("bytes", &DebugHaystack(&self.as_bytes()));
1564
1565        fmt.finish()
1566    }
1567}
1568
1569impl<'h> From<Match<'h>> for &'h [u8] {
1570    fn from(m: Match<'h>) -> &'h [u8] {
1571        m.as_bytes()
1572    }
1573}
1574
1575impl<'h> From<Match<'h>> for core::ops::Range<usize> {
1576    fn from(m: Match<'h>) -> core::ops::Range<usize> {
1577        m.range()
1578    }
1579}
1580
1581/// Represents the capture groups for a single match.
1582///
1583/// Capture groups refer to parts of a regex enclosed in parentheses. They
1584/// can be optionally named. The purpose of capture groups is to be able to
1585/// reference different parts of a match based on the original pattern. In
1586/// essence, a `Captures` is a container of [`Match`] values for each group
1587/// that participated in a regex match. Each `Match` can be looked up by either
1588/// its capture group index or name (if it has one).
1589///
1590/// For example, say you want to match the individual letters in a 5-letter
1591/// word:
1592///
1593/// ```text
1594/// (?<first>\w)(\w)(?:\w)\w(?<last>\w)
1595/// ```
1596///
1597/// This regex has 4 capture groups:
1598///
1599/// * The group at index `0` corresponds to the overall match. It is always
1600/// present in every match and never has a name.
1601/// * The group at index `1` with name `first` corresponding to the first
1602/// letter.
1603/// * The group at index `2` with no name corresponding to the second letter.
1604/// * The group at index `3` with name `last` corresponding to the fifth and
1605/// last letter.
1606///
1607/// Notice that `(?:\w)` was not listed above as a capture group despite it
1608/// being enclosed in parentheses. That's because `(?:pattern)` is a special
1609/// syntax that permits grouping but *without* capturing. The reason for not
1610/// treating it as a capture is that tracking and reporting capture groups
1611/// requires additional state that may lead to slower searches. So using as few
1612/// capture groups as possible can help performance. (Although the difference
1613/// in performance of a couple of capture groups is likely immaterial.)
1614///
1615/// Values with this type are created by [`Regex::captures`] or
1616/// [`Regex::captures_iter`].
1617///
1618/// `'h` is the lifetime of the haystack that these captures were matched from.
1619///
1620/// # Example
1621///
1622/// ```
1623/// use regex::bytes::Regex;
1624///
1625/// let re = Regex::new(r"(?<first>\w)(\w)(?:\w)\w(?<last>\w)").unwrap();
1626/// let caps = re.captures(b"toady").unwrap();
1627/// assert_eq!(b"toady", &caps[0]);
1628/// assert_eq!(b"t", &caps["first"]);
1629/// assert_eq!(b"o", &caps[2]);
1630/// assert_eq!(b"y", &caps["last"]);
1631/// ```
pub struct Captures<'h> {
    // The haystack that was searched; group spans index into it.
    haystack: &'h [u8],
    // The underlying regex-automata capture state (group spans and group
    // info) that the accessors on this type delegate to.
    caps: captures::Captures,
    // The static capture length recorded when this value was built.
    // `extract` requires it to be `Some` and to equal `N + 1`.
    static_captures_len: Option<usize>,
}
1637
1638impl<'h> Captures<'h> {
1639    /// Returns the `Match` associated with the capture group at index `i`. If
1640    /// `i` does not correspond to a capture group, or if the capture group did
1641    /// not participate in the match, then `None` is returned.
1642    ///
1643    /// When `i == 0`, this is guaranteed to return a non-`None` value.
1644    ///
1645    /// # Examples
1646    ///
1647    /// Get the substring that matched with a default of an empty string if the
1648    /// group didn't participate in the match:
1649    ///
1650    /// ```
1651    /// use regex::bytes::Regex;
1652    ///
1653    /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
1654    /// let caps = re.captures(b"abc123").unwrap();
1655    ///
1656    /// let substr1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes());
1657    /// let substr2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes());
1658    /// assert_eq!(substr1, b"123");
1659    /// assert_eq!(substr2, b"");
1660    /// ```
1661    #[inline]
1662    pub fn get(&self, i: usize) -> Option<Match<'h>> {
1663        self.caps
1664            .get_group(i)
1665            .map(|sp| Match::new(self.haystack, sp.start, sp.end))
1666    }
1667
1668    /// Return the overall match for the capture.
1669    ///
    /// This returns the match for index `0`. That is, it is equivalent to
    /// `caps.get(0).unwrap()`.
1672    ///
1673    /// # Example
1674    ///
1675    /// ```
1676    /// use regex::bytes::Regex;
1677    ///
1678    /// let re = Regex::new(r"[a-z]+([0-9]+)").unwrap();
1679    /// let caps = re.captures(b"   abc123-def").unwrap();
1680    ///
1681    /// assert_eq!(caps.get_match().as_bytes(), b"abc123");
1682    /// ```
1683    #[inline]
1684    pub fn get_match(&self) -> Match<'h> {
1685        self.get(0).unwrap()
1686    }
1687
1688    /// Returns the `Match` associated with the capture group named `name`. If
1689    /// `name` isn't a valid capture group or it refers to a group that didn't
1690    /// match, then `None` is returned.
1691    ///
1692    /// Note that unlike `caps["name"]`, this returns a `Match` whose lifetime
1693    /// matches the lifetime of the haystack in this `Captures` value.
1694    /// Conversely, the substring returned by `caps["name"]` has a lifetime
1695    /// of the `Captures` value, which is likely shorter than the lifetime of
1696    /// the haystack. In some cases, it may be necessary to use this method to
1697    /// access the matching substring instead of the `caps["name"]` notation.
1698    ///
1699    /// # Examples
1700    ///
1701    /// Get the substring that matched with a default of an empty string if the
1702    /// group didn't participate in the match:
1703    ///
1704    /// ```
1705    /// use regex::bytes::Regex;
1706    ///
1707    /// let re = Regex::new(
1708    ///     r"[a-z]+(?:(?<numbers>[0-9]+)|(?<letters>[A-Z]+))",
1709    /// ).unwrap();
1710    /// let caps = re.captures(b"abc123").unwrap();
1711    ///
1712    /// let numbers = caps.name("numbers").map_or(&b""[..], |m| m.as_bytes());
1713    /// let letters = caps.name("letters").map_or(&b""[..], |m| m.as_bytes());
1714    /// assert_eq!(numbers, b"123");
1715    /// assert_eq!(letters, b"");
1716    /// ```
1717    #[inline]
1718    pub fn name(&self, name: &str) -> Option<Match<'h>> {
1719        self.caps
1720            .get_group_by_name(name)
1721            .map(|sp| Match::new(self.haystack, sp.start, sp.end))
1722    }
1723
1724    /// This is a convenience routine for extracting the substrings
1725    /// corresponding to matching capture groups.
1726    ///
1727    /// This returns a tuple where the first element corresponds to the full
1728    /// substring of the haystack that matched the regex. The second element is
1729    /// an array of substrings, with each corresponding to the substring that
1730    /// matched for a particular capture group.
1731    ///
1732    /// # Panics
1733    ///
1734    /// This panics if the number of possible matching groups in this
1735    /// `Captures` value is not fixed to `N` in all circumstances.
1736    /// More precisely, this routine only works when `N` is equivalent to
1737    /// [`Regex::static_captures_len`].
1738    ///
1739    /// Stated more plainly, if the number of matching capture groups in a
1740    /// regex can vary from match to match, then this function always panics.
1741    ///
1742    /// For example, `(a)(b)|(c)` could produce two matching capture groups
1743    /// or one matching capture group for any given match. Therefore, one
1744    /// cannot use `extract` with such a pattern.
1745    ///
1746    /// But a pattern like `(a)(b)|(c)(d)` can be used with `extract` because
1747    /// the number of capture groups in every match is always equivalent,
1748    /// even if the capture _indices_ in each match are not.
1749    ///
1750    /// # Example
1751    ///
1752    /// ```
1753    /// use regex::bytes::Regex;
1754    ///
1755    /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap();
1756    /// let hay = b"On 2010-03-14, I became a Tennessee lamb.";
1757    /// let Some((full, [year, month, day])) =
1758    ///     re.captures(hay).map(|caps| caps.extract()) else { return };
1759    /// assert_eq!(b"2010-03-14", full);
1760    /// assert_eq!(b"2010", year);
1761    /// assert_eq!(b"03", month);
1762    /// assert_eq!(b"14", day);
1763    /// ```
1764    ///
1765    /// # Example: iteration
1766    ///
1767    /// This example shows how to use this method when iterating over all
1768    /// `Captures` matches in a haystack.
1769    ///
1770    /// ```
1771    /// use regex::bytes::Regex;
1772    ///
1773    /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap();
1774    /// let hay = b"1973-01-05, 1975-08-25 and 1980-10-18";
1775    ///
1776    /// let mut dates: Vec<(&[u8], &[u8], &[u8])> = vec![];
1777    /// for (_, [y, m, d]) in re.captures_iter(hay).map(|c| c.extract()) {
1778    ///     dates.push((y, m, d));
1779    /// }
1780    /// assert_eq!(dates, vec![
1781    ///     (&b"1973"[..], &b"01"[..], &b"05"[..]),
1782    ///     (&b"1975"[..], &b"08"[..], &b"25"[..]),
1783    ///     (&b"1980"[..], &b"10"[..], &b"18"[..]),
1784    /// ]);
1785    /// ```
1786    ///
1787    /// # Example: parsing different formats
1788    ///
1789    /// This API is particularly useful when you need to extract a particular
1790    /// value that might occur in a different format. Consider, for example,
1791    /// an identifier that might be in double quotes or single quotes:
1792    ///
1793    /// ```
1794    /// use regex::bytes::Regex;
1795    ///
1796    /// let re = Regex::new(r#"id:(?:"([^"]+)"|'([^']+)')"#).unwrap();
1797    /// let hay = br#"The first is id:"foo" and the second is id:'bar'."#;
1798    /// let mut ids = vec![];
1799    /// for (_, [id]) in re.captures_iter(hay).map(|c| c.extract()) {
1800    ///     ids.push(id);
1801    /// }
1802    /// assert_eq!(ids, vec![b"foo", b"bar"]);
1803    /// ```
1804    pub fn extract<const N: usize>(&self) -> (&'h [u8], [&'h [u8]; N]) {
1805        let len = self
1806            .static_captures_len
1807            .expect("number of capture groups can vary in a match")
1808            .checked_sub(1)
1809            .expect("number of groups is always greater than zero");
1810        assert_eq!(N, len, "asked for {N} groups, but must ask for {len}");
1811        // The regex-automata variant of extract is a bit more permissive.
1812        // It doesn't require the number of matching capturing groups to be
1813        // static, and you can even request fewer groups than what's there. So
1814        // this is guaranteed to never panic because we've asserted above that
1815        // the user has requested precisely the number of groups that must be
1816        // present in any match for this regex.
1817        self.caps.extract_bytes(self.haystack)
1818    }
1819
1820    /// Expands all instances of `$ref` in `replacement` to the corresponding
1821    /// capture group, and writes them to the `dst` buffer given. A `ref` can
1822    /// be a capture group index or a name. If `ref` doesn't refer to a capture
1823    /// group that participated in the match, then it is replaced with the
1824    /// empty string.
1825    ///
1826    /// # Format
1827    ///
1828    /// The format of the replacement string supports two different kinds of
1829    /// capture references: unbraced and braced.
1830    ///
    /// For the unbraced format, the format supported is `$ref` where `ref`
    /// can be any character in the class `[0-9A-Za-z_]`. `ref` is always
1833    /// the longest possible parse. So for example, `$1a` corresponds to the
1834    /// capture group named `1a` and not the capture group at index `1`. If
1835    /// `ref` matches `^[0-9]+$`, then it is treated as a capture group index
1836    /// itself and not a name.
1837    ///
1838    /// For the braced format, the format supported is `${ref}` where `ref` can
1839    /// be any sequence of bytes except for `}`. If no closing brace occurs,
1840    /// then it is not considered a capture reference. As with the unbraced
1841    /// format, if `ref` matches `^[0-9]+$`, then it is treated as a capture
1842    /// group index and not a name.
1843    ///
1844    /// The braced format is useful for exerting precise control over the name
1845    /// of the capture reference. For example, `${1}a` corresponds to the
    /// capture group reference `1` followed by the letter `a`, whereas `$1a`
1847    /// (as mentioned above) corresponds to the capture group reference `1a`.
1848    /// The braced format is also useful for expressing capture group names
1849    /// that use characters not supported by the unbraced format. For example,
1850    /// `${foo[bar].baz}` refers to the capture group named `foo[bar].baz`.
1851    ///
1852    /// If a capture group reference is found and it does not refer to a valid
1853    /// capture group, then it will be replaced with the empty string.
1854    ///
1855    /// To write a literal `$`, use `$$`.
1856    ///
1857    /// # Example
1858    ///
1859    /// ```
1860    /// use regex::bytes::Regex;
1861    ///
1862    /// let re = Regex::new(
1863    ///     r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
1864    /// ).unwrap();
1865    /// let hay = b"On 14-03-2010, I became a Tennessee lamb.";
1866    /// let caps = re.captures(hay).unwrap();
1867    ///
1868    /// let mut dst = vec![];
1869    /// caps.expand(b"year=$year, month=$month, day=$day", &mut dst);
1870    /// assert_eq!(dst, b"year=2010, month=03, day=14");
1871    /// ```
1872    #[inline]
1873    pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
1874        self.caps.interpolate_bytes_into(self.haystack, replacement, dst);
1875    }
1876
1877    /// Returns an iterator over all capture groups. This includes both
1878    /// matching and non-matching groups.
1879    ///
1880    /// The iterator always yields at least one matching group: the first group
1881    /// (at index `0`) with no name. Subsequent groups are returned in the order
1882    /// of their opening parenthesis in the regex.
1883    ///
1884    /// The elements yielded have type `Option<Match<'h>>`, where a non-`None`
1885    /// value is present if the capture group matches.
1886    ///
1887    /// # Example
1888    ///
1889    /// ```
1890    /// use regex::bytes::Regex;
1891    ///
1892    /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap();
1893    /// let caps = re.captures(b"AZ").unwrap();
1894    ///
1895    /// let mut it = caps.iter();
1896    /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), Some(&b"AZ"[..]));
1897    /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), Some(&b"A"[..]));
1898    /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), None);
1899    /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), Some(&b"Z"[..]));
1900    /// assert_eq!(it.next(), None);
1901    /// ```
1902    #[inline]
1903    pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 'h> {
1904        SubCaptureMatches { haystack: self.haystack, it: self.caps.iter() }
1905    }
1906
1907    /// Returns the total number of capture groups. This includes both
1908    /// matching and non-matching groups.
1909    ///
1910    /// The length returned is always equivalent to the number of elements
1911    /// yielded by [`Captures::iter`]. Consequently, the length is always
1912    /// greater than zero since every `Captures` value always includes the
1913    /// match for the entire regex.
1914    ///
1915    /// # Example
1916    ///
1917    /// ```
1918    /// use regex::bytes::Regex;
1919    ///
1920    /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap();
1921    /// let caps = re.captures(b"AZ").unwrap();
1922    /// assert_eq!(caps.len(), 4);
1923    /// ```
1924    #[inline]
1925    pub fn len(&self) -> usize {
1926        self.caps.group_len()
1927    }
1928}
1929
1930impl<'h> core::fmt::Debug for Captures<'h> {
1931    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1932        /// A little helper type to provide a nice map-like debug
1933        /// representation for our capturing group spans.
1934        ///
1935        /// regex-automata has something similar, but it includes the pattern
1936        /// ID in its debug output, which is confusing. It also doesn't include
1937        /// that strings that match because a regex-automata `Captures` doesn't
1938        /// borrow the haystack.
1939        struct CapturesDebugMap<'a> {
1940            caps: &'a Captures<'a>,
1941        }
1942
1943        impl<'a> core::fmt::Debug for CapturesDebugMap<'a> {
1944            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1945                let mut map = f.debug_map();
1946                let names =
1947                    self.caps.caps.group_info().pattern_names(PatternID::ZERO);
1948                for (group_index, maybe_name) in names.enumerate() {
1949                    let key = Key(group_index, maybe_name);
1950                    match self.caps.get(group_index) {
1951                        None => map.entry(&key, &None::<()>),
1952                        Some(mat) => map.entry(&key, &Value(mat)),
1953                    };
1954                }
1955                map.finish()
1956            }
1957        }
1958
1959        struct Key<'a>(usize, Option<&'a str>);
1960
1961        impl<'a> core::fmt::Debug for Key<'a> {
1962            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1963                write!(f, "{}", self.0)?;
1964                if let Some(name) = self.1 {
1965                    write!(f, "/{name:?}")?;
1966                }
1967                Ok(())
1968            }
1969        }
1970
1971        struct Value<'a>(Match<'a>);
1972
1973        impl<'a> core::fmt::Debug for Value<'a> {
1974            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1975                use regex_automata::util::escape::DebugHaystack;
1976
1977                write!(
1978                    f,
1979                    "{}..{}/{:?}",
1980                    self.0.start(),
1981                    self.0.end(),
1982                    DebugHaystack(self.0.as_bytes())
1983                )
1984            }
1985        }
1986
1987        f.debug_tuple("Captures")
1988            .field(&CapturesDebugMap { caps: self })
1989            .finish()
1990    }
1991}
1992
1993/// Get a matching capture group's haystack substring by index.
1994///
1995/// The haystack substring returned can't outlive the `Captures` object if this
1996/// method is used, because of how `Index` is defined (normally `a[i]` is part
1997/// of `a` and can't outlive it). To work around this limitation, do that, use
1998/// [`Captures::get`] instead.
1999///
2000/// `'h` is the lifetime of the matched haystack, but the lifetime of the
2001/// `&str` returned by this implementation is the lifetime of the `Captures`
2002/// value itself.
2003///
2004/// # Panics
2005///
2006/// If there is no matching group at the given index.
2007impl<'h> core::ops::Index<usize> for Captures<'h> {
2008    type Output = [u8];
2009
2010    // The lifetime is written out to make it clear that the &str returned
2011    // does NOT have a lifetime equivalent to 'h.
2012    fn index<'a>(&'a self, i: usize) -> &'a [u8] {
2013        self.get(i)
2014            .map(|m| m.as_bytes())
2015            .unwrap_or_else(|| panic!("no group at index '{i}'"))
2016    }
2017}
2018
2019/// Get a matching capture group's haystack substring by name.
2020///
2021/// The haystack substring returned can't outlive the `Captures` object if this
2022/// method is used, because of how `Index` is defined (normally `a[i]` is part
2023/// of `a` and can't outlive it). To work around this limitation, do that, use
2024/// [`Captures::name`] instead.
2025///
2026/// `'h` is the lifetime of the matched haystack, but the lifetime of the
2027/// `&str` returned by this implementation is the lifetime of the `Captures`
2028/// value itself.
2029///
2030/// `'n` is the lifetime of the group name used to index the `Captures` value.
2031///
2032/// # Panics
2033///
2034/// If there is no matching group at the given name.
2035impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> {
2036    type Output = [u8];
2037
2038    fn index<'a>(&'a self, name: &'n str) -> &'a [u8] {
2039        self.name(name)
2040            .map(|m| m.as_bytes())
2041            .unwrap_or_else(|| panic!("no group named '{name}'"))
2042    }
2043}
2044
2045/// A low level representation of the byte offsets of each capture group.
2046///
2047/// You can think of this as a lower level [`Captures`], where this type does
2048/// not support named capturing groups directly and it does not borrow the
2049/// haystack that these offsets were matched on.
2050///
2051/// Primarily, this type is useful when using the lower level `Regex` APIs such
2052/// as [`Regex::captures_read`], which permits amortizing the allocation in
2053/// which capture match offsets are stored.
2054///
2055/// In order to build a value of this type, you'll need to call the
2056/// [`Regex::capture_locations`] method. The value returned can then be reused
2057/// in subsequent searches for that regex. Using it for other regexes may
2058/// result in a panic or otherwise incorrect results.
2059///
2060/// # Example
2061///
2062/// This example shows how to create and use `CaptureLocations` in a search.
2063///
2064/// ```
2065/// use regex::bytes::Regex;
2066///
2067/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
2068/// let mut locs = re.capture_locations();
2069/// let m = re.captures_read(&mut locs, b"Bruce Springsteen").unwrap();
2070/// assert_eq!(0..17, m.range());
2071/// assert_eq!(Some((0, 17)), locs.get(0));
2072/// assert_eq!(Some((0, 5)), locs.get(1));
2073/// assert_eq!(Some((6, 17)), locs.get(2));
2074///
2075/// // Asking for an invalid capture group always returns None.
2076/// assert_eq!(None, locs.get(3));
2077/// # // literals are too big for 32-bit usize: #1041
2078/// # #[cfg(target_pointer_width = "64")]
2079/// assert_eq!(None, locs.get(34973498648));
2080/// # #[cfg(target_pointer_width = "64")]
2081/// assert_eq!(None, locs.get(9944060567225171988));
2082/// ```
// Newtype over the regex-automata capture state; the methods on this type
// read group spans and group info from the wrapped value.
#[derive(Clone, Debug)]
pub struct CaptureLocations(captures::Captures);

/// A type alias for `CaptureLocations` for backwards compatibility.
///
/// Previously, we exported `CaptureLocations` as `Locations` in an
/// undocumented API. To prevent breaking code that relies on it (e.g., in
/// `regex-capi`), we continue re-exporting the same undocumented API.
#[doc(hidden)]
pub type Locations = CaptureLocations;
2093
2094impl CaptureLocations {
2095    /// Returns the start and end byte offsets of the capture group at index
2096    /// `i`. This returns `None` if `i` is not a valid capture group or if the
2097    /// capture group did not match.
2098    ///
2099    /// # Example
2100    ///
2101    /// ```
2102    /// use regex::bytes::Regex;
2103    ///
2104    /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
2105    /// let mut locs = re.capture_locations();
2106    /// re.captures_read(&mut locs, b"Bruce Springsteen").unwrap();
2107    /// assert_eq!(Some((0, 17)), locs.get(0));
2108    /// assert_eq!(Some((0, 5)), locs.get(1));
2109    /// assert_eq!(Some((6, 17)), locs.get(2));
2110    /// ```
2111    #[inline]
2112    pub fn get(&self, i: usize) -> Option<(usize, usize)> {
2113        self.0.get_group(i).map(|sp| (sp.start, sp.end))
2114    }
2115
2116    /// Returns the total number of capture groups (even if they didn't match).
2117    /// That is, the length returned is unaffected by the result of a search.
2118    ///
2119    /// This is always at least `1` since every regex has at least `1`
2120    /// capturing group that corresponds to the entire match.
2121    ///
2122    /// # Example
2123    ///
2124    /// ```
2125    /// use regex::bytes::Regex;
2126    ///
2127    /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
2128    /// let mut locs = re.capture_locations();
2129    /// assert_eq!(3, locs.len());
2130    /// re.captures_read(&mut locs, b"Bruce Springsteen").unwrap();
2131    /// assert_eq!(3, locs.len());
2132    /// ```
2133    ///
2134    /// Notice that the length is always at least `1`, regardless of the regex:
2135    ///
2136    /// ```
2137    /// use regex::bytes::Regex;
2138    ///
2139    /// let re = Regex::new(r"").unwrap();
2140    /// let locs = re.capture_locations();
2141    /// assert_eq!(1, locs.len());
2142    ///
2143    /// // [a&&b] is a regex that never matches anything.
2144    /// let re = Regex::new(r"[a&&b]").unwrap();
2145    /// let locs = re.capture_locations();
2146    /// assert_eq!(1, locs.len());
2147    /// ```
2148    #[inline]
2149    pub fn len(&self) -> usize {
2150        // self.0.group_len() returns 0 if the underlying captures doesn't
2151        // represent a match, but the behavior guaranteed for this method is
2152        // that the length doesn't change based on a match or not.
2153        self.0.group_info().group_len(PatternID::ZERO)
2154    }
2155
2156    /// An alias for the `get` method for backwards compatibility.
2157    ///
2158    /// Previously, we exported `get` as `pos` in an undocumented API. To
2159    /// prevent breaking that code (e.g., in `regex-capi`), we continue
2160    /// re-exporting the same undocumented API.
2161    #[doc(hidden)]
2162    #[inline]
2163    pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
2164        self.get(i)
2165    }
2166}
2167
2168/// An iterator over all non-overlapping matches in a haystack.
2169///
2170/// This iterator yields [`Match`] values. The iterator stops when no more
2171/// matches can be found.
2172///
2173/// `'r` is the lifetime of the compiled regular expression and `'h` is the
2174/// lifetime of the haystack.
2175///
2176/// This iterator is created by [`Regex::find_iter`].
2177///
2178/// # Time complexity
2179///
2180/// Note that since an iterator runs potentially many searches on the haystack
2181/// and since each search has worst case `O(m * n)` time complexity, the
2182/// overall worst case time complexity for iteration is `O(m * n^2)`.
2183#[derive(Debug)]
2184pub struct Matches<'r, 'h> {
2185    haystack: &'h [u8],
2186    it: meta::FindMatches<'r, 'h>,
2187}
2188
2189impl<'r, 'h> Iterator for Matches<'r, 'h> {
2190    type Item = Match<'h>;
2191
2192    #[inline]
2193    fn next(&mut self) -> Option<Match<'h>> {
2194        self.it
2195            .next()
2196            .map(|sp| Match::new(self.haystack, sp.start(), sp.end()))
2197    }
2198
2199    #[inline]
2200    fn count(self) -> usize {
2201        // This can actually be up to 2x faster than calling `next()` until
2202        // completion, because counting matches when using a DFA only requires
2203        // finding the end of each match. But returning a `Match` via `next()`
2204        // requires the start of each match which, with a DFA, requires a
2205        // reverse forward scan to find it.
2206        self.it.count()
2207    }
2208}
2209
2210impl<'r, 'h> core::iter::FusedIterator for Matches<'r, 'h> {}
2211
2212/// An iterator over all non-overlapping capture matches in a haystack.
2213///
2214/// This iterator yields [`Captures`] values. The iterator stops when no more
2215/// matches can be found.
2216///
2217/// `'r` is the lifetime of the compiled regular expression and `'h` is the
2218/// lifetime of the matched string.
2219///
2220/// This iterator is created by [`Regex::captures_iter`].
2221///
2222/// # Time complexity
2223///
2224/// Note that since an iterator runs potentially many searches on the haystack
2225/// and since each search has worst case `O(m * n)` time complexity, the
2226/// overall worst case time complexity for iteration is `O(m * n^2)`.
2227#[derive(Debug)]
2228pub struct CaptureMatches<'r, 'h> {
2229    haystack: &'h [u8],
2230    it: meta::CapturesMatches<'r, 'h>,
2231}
2232
2233impl<'r, 'h> Iterator for CaptureMatches<'r, 'h> {
2234    type Item = Captures<'h>;
2235
2236    #[inline]
2237    fn next(&mut self) -> Option<Captures<'h>> {
2238        let static_captures_len = self.it.regex().static_captures_len();
2239        self.it.next().map(|caps| Captures {
2240            haystack: self.haystack,
2241            caps,
2242            static_captures_len,
2243        })
2244    }
2245
2246    #[inline]
2247    fn count(self) -> usize {
2248        // This can actually be up to 2x faster than calling `next()` until
2249        // completion, because counting matches when using a DFA only requires
2250        // finding the end of each match. But returning a `Match` via `next()`
2251        // requires the start of each match which, with a DFA, requires a
2252        // reverse forward scan to find it.
2253        self.it.count()
2254    }
2255}
2256
2257impl<'r, 'h> core::iter::FusedIterator for CaptureMatches<'r, 'h> {}
2258
2259/// An iterator over all substrings delimited by a regex match.
2260///
2261/// `'r` is the lifetime of the compiled regular expression and `'h` is the
2262/// lifetime of the byte string being split.
2263///
2264/// This iterator is created by [`Regex::split`].
2265///
2266/// # Time complexity
2267///
2268/// Note that since an iterator runs potentially many searches on the haystack
2269/// and since each search has worst case `O(m * n)` time complexity, the
2270/// overall worst case time complexity for iteration is `O(m * n^2)`.
2271#[derive(Debug)]
2272pub struct Split<'r, 'h> {
2273    haystack: &'h [u8],
2274    it: meta::Split<'r, 'h>,
2275}
2276
2277impl<'r, 'h> Iterator for Split<'r, 'h> {
2278    type Item = &'h [u8];
2279
2280    #[inline]
2281    fn next(&mut self) -> Option<&'h [u8]> {
2282        self.it.next().map(|span| &self.haystack[span])
2283    }
2284}
2285
2286impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {}
2287
2288/// An iterator over at most `N` substrings delimited by a regex match.
2289///
2290/// The last substring yielded by this iterator will be whatever remains after
2291/// `N-1` splits.
2292///
2293/// `'r` is the lifetime of the compiled regular expression and `'h` is the
2294/// lifetime of the byte string being split.
2295///
2296/// This iterator is created by [`Regex::splitn`].
2297///
2298/// # Time complexity
2299///
2300/// Note that since an iterator runs potentially many searches on the haystack
2301/// and since each search has worst case `O(m * n)` time complexity, the
2302/// overall worst case time complexity for iteration is `O(m * n^2)`.
2303///
2304/// Although note that the worst case time here has an upper bound given
2305/// by the `limit` parameter to [`Regex::splitn`].
2306#[derive(Debug)]
2307pub struct SplitN<'r, 'h> {
2308    haystack: &'h [u8],
2309    it: meta::SplitN<'r, 'h>,
2310}
2311
2312impl<'r, 'h> Iterator for SplitN<'r, 'h> {
2313    type Item = &'h [u8];
2314
2315    #[inline]
2316    fn next(&mut self) -> Option<&'h [u8]> {
2317        self.it.next().map(|span| &self.haystack[span])
2318    }
2319
2320    #[inline]
2321    fn size_hint(&self) -> (usize, Option<usize>) {
2322        self.it.size_hint()
2323    }
2324}
2325
2326impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
2327
2328/// An iterator over the names of all capture groups in a regex.
2329///
2330/// This iterator yields values of type `Option<&str>` in order of the opening
2331/// capture group parenthesis in the regex pattern. `None` is yielded for
2332/// groups with no name. The first element always corresponds to the implicit
2333/// and unnamed group for the overall match.
2334///
2335/// `'r` is the lifetime of the compiled regular expression.
2336///
2337/// This iterator is created by [`Regex::capture_names`].
2338#[derive(Clone, Debug)]
2339pub struct CaptureNames<'r>(captures::GroupInfoPatternNames<'r>);
2340
2341impl<'r> Iterator for CaptureNames<'r> {
2342    type Item = Option<&'r str>;
2343
2344    #[inline]
2345    fn next(&mut self) -> Option<Option<&'r str>> {
2346        self.0.next()
2347    }
2348
2349    #[inline]
2350    fn size_hint(&self) -> (usize, Option<usize>) {
2351        self.0.size_hint()
2352    }
2353
2354    #[inline]
2355    fn count(self) -> usize {
2356        self.0.count()
2357    }
2358}
2359
2360impl<'r> ExactSizeIterator for CaptureNames<'r> {}
2361
2362impl<'r> core::iter::FusedIterator for CaptureNames<'r> {}
2363
2364/// An iterator over all group matches in a [`Captures`] value.
2365///
2366/// This iterator yields values of type `Option<Match<'h>>`, where `'h` is the
2367/// lifetime of the haystack that the matches are for. The order of elements
2368/// yielded corresponds to the order of the opening parenthesis for the group
2369/// in the regex pattern. `None` is yielded for groups that did not participate
2370/// in the match.
2371///
2372/// The first element always corresponds to the implicit group for the overall
2373/// match. Since this iterator is created by a [`Captures`] value, and a
2374/// `Captures` value is only created when a match occurs, it follows that the
2375/// first element yielded by this iterator is guaranteed to be non-`None`.
2376///
2377/// The lifetime `'c` corresponds to the lifetime of the `Captures` value that
2378/// created this iterator, and the lifetime `'h` corresponds to the originally
2379/// matched haystack.
2380#[derive(Clone, Debug)]
2381pub struct SubCaptureMatches<'c, 'h> {
2382    haystack: &'h [u8],
2383    it: captures::CapturesPatternIter<'c>,
2384}
2385
2386impl<'c, 'h> Iterator for SubCaptureMatches<'c, 'h> {
2387    type Item = Option<Match<'h>>;
2388
2389    #[inline]
2390    fn next(&mut self) -> Option<Option<Match<'h>>> {
2391        self.it.next().map(|group| {
2392            group.map(|sp| Match::new(self.haystack, sp.start, sp.end))
2393        })
2394    }
2395
2396    #[inline]
2397    fn size_hint(&self) -> (usize, Option<usize>) {
2398        self.it.size_hint()
2399    }
2400
2401    #[inline]
2402    fn count(self) -> usize {
2403        self.it.count()
2404    }
2405}
2406
2407impl<'c, 'h> ExactSizeIterator for SubCaptureMatches<'c, 'h> {}
2408
2409impl<'c, 'h> core::iter::FusedIterator for SubCaptureMatches<'c, 'h> {}
2410
/// A trait for types that can be used to replace matches in a haystack.
///
/// In general, users of this crate shouldn't need to implement this trait,
/// since implementations are already provided for `&[u8]` along with other
/// variants of byte string types, as well as `FnMut(&Captures) -> Vec<u8>` (or
/// any `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`). Those cover most use
/// cases, but callers can implement this trait directly if necessary.
///
/// Only [`Replacer::replace_append`] is required. The other methods have
/// default implementations.
///
/// # Example
///
/// This example shows a basic implementation of the `Replacer` trait. This can
/// be done much more simply using the replacement byte string interpolation
/// support (e.g., `$first $last`), but this approach avoids needing to parse
/// the replacement byte string at all.
///
/// ```
/// use regex::bytes::{Captures, Regex, Replacer};
///
/// struct NameSwapper;
///
/// impl Replacer for NameSwapper {
///     fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
///         dst.extend_from_slice(&caps["first"]);
///         dst.extend_from_slice(b" ");
///         dst.extend_from_slice(&caps["last"]);
///     }
/// }
///
/// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap();
/// let result = re.replace(b"Springsteen, Bruce", NameSwapper);
/// assert_eq!(result, &b"Bruce Springsteen"[..]);
/// ```
pub trait Replacer {
    /// Appends possibly empty data to `dst` to replace the current match.
    ///
    /// The current match is represented by `caps`, which is guaranteed to have
    /// a match at capture group `0`.
    ///
    /// For example, a no-op replacement would be
    /// `dst.extend_from_slice(&caps[0])`.
    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>);

    /// Return a fixed unchanging replacement byte string.
    ///
    /// When doing replacements, if access to [`Captures`] is not needed (e.g.,
    /// the replacement byte string does not need `$` expansion), then it can
    /// be beneficial to avoid finding sub-captures.
    ///
    /// In general, this is called once for every call to a replacement routine
    /// such as [`Regex::replace_all`].
    ///
    /// The default implementation returns `None`, i.e., it never reports a
    /// fixed replacement.
    fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
        None
    }

    /// Returns a type that implements `Replacer`, but that borrows and wraps
    /// this `Replacer`.
    ///
    /// This is useful when you want to take a generic `Replacer` (which might
    /// not be cloneable) and use it without consuming it, so it can be used
    /// more than once.
    ///
    /// # Example
    ///
    /// ```
    /// use regex::bytes::{Regex, Replacer};
    ///
    /// fn replace_all_twice<R: Replacer>(
    ///     re: Regex,
    ///     src: &[u8],
    ///     mut rep: R,
    /// ) -> Vec<u8> {
    ///     let dst = re.replace_all(src, rep.by_ref());
    ///     let dst = re.replace_all(&dst, rep.by_ref());
    ///     dst.into_owned()
    /// }
    /// ```
    fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> {
        ReplacerRef(self)
    }
}
2491
2492impl<'a, const N: usize> Replacer for &'a [u8; N] {
2493    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2494        caps.expand(&**self, dst);
2495    }
2496
2497    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2498        no_expansion(self)
2499    }
2500}
2501
2502impl<const N: usize> Replacer for [u8; N] {
2503    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2504        caps.expand(&*self, dst);
2505    }
2506
2507    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2508        no_expansion(self)
2509    }
2510}
2511
2512impl<'a> Replacer for &'a [u8] {
2513    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2514        caps.expand(*self, dst);
2515    }
2516
2517    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2518        no_expansion(self)
2519    }
2520}
2521
2522impl<'a> Replacer for &'a Vec<u8> {
2523    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2524        caps.expand(*self, dst);
2525    }
2526
2527    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2528        no_expansion(self)
2529    }
2530}
2531
2532impl Replacer for Vec<u8> {
2533    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2534        caps.expand(self, dst);
2535    }
2536
2537    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2538        no_expansion(self)
2539    }
2540}
2541
2542impl<'a> Replacer for Cow<'a, [u8]> {
2543    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2544        caps.expand(self.as_ref(), dst);
2545    }
2546
2547    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2548        no_expansion(self)
2549    }
2550}
2551
2552impl<'a> Replacer for &'a Cow<'a, [u8]> {
2553    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2554        caps.expand(self.as_ref(), dst);
2555    }
2556
2557    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2558        no_expansion(self)
2559    }
2560}
2561
2562impl<F, T> Replacer for F
2563where
2564    F: FnMut(&Captures<'_>) -> T,
2565    T: AsRef<[u8]>,
2566{
2567    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2568        dst.extend_from_slice((*self)(caps).as_ref());
2569    }
2570}
2571
2572/// A by-reference adaptor for a [`Replacer`].
2573///
2574/// This permits reusing the same `Replacer` value in multiple calls to a
2575/// replacement routine like [`Regex::replace_all`].
2576///
2577/// This type is created by [`Replacer::by_ref`].
2578#[derive(Debug)]
2579pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R);
2580
2581impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
2582    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2583        self.0.replace_append(caps, dst)
2584    }
2585
2586    fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
2587        self.0.no_expansion()
2588    }
2589}
2590
2591/// A helper type for forcing literal string replacement.
2592///
2593/// It can be used with routines like [`Regex::replace`] and
2594/// [`Regex::replace_all`] to do a literal string replacement without expanding
2595/// `$name` to their corresponding capture groups. This can be both convenient
2596/// (to avoid escaping `$`, for example) and faster (since capture groups
2597/// don't need to be found).
2598///
2599/// `'s` is the lifetime of the literal string to use.
2600///
2601/// # Example
2602///
2603/// ```
2604/// use regex::bytes::{NoExpand, Regex};
2605///
2606/// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap();
2607/// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last"));
2608/// assert_eq!(result, &b"$2 $last"[..]);
2609/// ```
2610#[derive(Clone, Debug)]
2611pub struct NoExpand<'s>(pub &'s [u8]);
2612
2613impl<'s> Replacer for NoExpand<'s> {
2614    fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec<u8>) {
2615        dst.extend_from_slice(self.0);
2616    }
2617
2618    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2619        Some(Cow::Borrowed(self.0))
2620    }
2621}
2622
2623/// Quickly checks the given replacement string for whether interpolation
2624/// should be done on it. It returns `None` if a `$` was found anywhere in the
2625/// given string, which suggests interpolation needs to be done. But if there's
2626/// no `$` anywhere, then interpolation definitely does not need to be done. In
2627/// that case, the given string is returned as a borrowed `Cow`.
2628///
2629/// This is meant to be used to implement the `Replacer::no_expansion` method
2630/// in its various trait impls.
2631fn no_expansion<T: AsRef<[u8]>>(replacement: &T) -> Option<Cow<'_, [u8]>> {
2632    let replacement = replacement.as_ref();
2633    match crate::find_byte::find_byte(b'$', replacement) {
2634        Some(_) => None,
2635        None => Some(Cow::Borrowed(replacement)),
2636    }
2637}
2638
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::format;

    // The accessors of a non-empty match report its offsets, length and bytes.
    #[test]
    fn test_match_properties() {
        let haystack = b"Hello, world!";
        let m = Match::new(haystack, 7, 12);

        assert_eq!(m.start(), 7);
        assert_eq!(m.end(), 12);
        assert!(!m.is_empty());
        assert_eq!(m.len(), 5);
        assert_eq!(m.as_bytes(), b"world");
    }

    // A zero-width match on an empty haystack is empty and has length zero.
    #[test]
    fn test_empty_match() {
        let haystack = b"";
        let m = Match::new(haystack, 0, 0);

        assert!(m.is_empty());
        assert_eq!(m.len(), 0);
    }

    // Matched bytes that are valid UTF-8 are shown as text in `Debug` output.
    #[test]
    fn test_debug_output_valid_utf8() {
        let haystack = b"Hello, world!";
        let m = Match::new(haystack, 7, 12);
        let debug_str = format!("{m:?}");

        assert_eq!(
            debug_str,
            r#"Match { start: 7, end: 12, bytes: "world" }"#
        );
    }

    // Bytes that are not valid UTF-8 are shown as `\x` escapes.
    #[test]
    fn test_debug_output_invalid_utf8() {
        let haystack = b"Hello, \xFFworld!";
        let m = Match::new(haystack, 7, 13);
        let debug_str = format!("{m:?}");

        assert_eq!(
            debug_str,
            r#"Match { start: 7, end: 13, bytes: "\xffworld" }"#
        );
    }

    // Multi-byte characters from several scripts are shown unescaped.
    #[test]
    fn test_debug_output_various_unicode() {
        let haystack =
            "Hello, 😊 world! 안녕하세요? مرحبا بالعالم!".as_bytes();
        let m = Match::new(haystack, 0, haystack.len());
        let debug_str = format!("{m:?}");

        assert_eq!(
            debug_str,
            r#"Match { start: 0, end: 62, bytes: "Hello, 😊 world! 안녕하세요? مرحبا بالعالم!" }"#
        );
    }

    // ASCII control characters are shown in escaped form.
    #[test]
    fn test_debug_output_ascii_escape() {
        let haystack = b"Hello,\tworld!\nThis is a \x1b[31mtest\x1b[0m.";
        let m = Match::new(haystack, 0, haystack.len());
        let debug_str = format!("{m:?}");

        assert_eq!(
            debug_str,
            r#"Match { start: 0, end: 38, bytes: "Hello,\tworld!\nThis is a \u{1b}[31mtest\u{1b}[0m." }"#
        );
    }

    // Only the matched sub-slice is shown, not the surrounding haystack.
    #[test]
    fn test_debug_output_match_in_middle() {
        let haystack = b"The quick brown fox jumps over the lazy dog.";
        let m = Match::new(haystack, 16, 19);
        let debug_str = format!("{m:?}");

        assert_eq!(debug_str, r#"Match { start: 16, end: 19, bytes: "fox" }"#);
    }
}
regex/regex/bytes.rs

regex/regex/
bytes.rs