iri_string/parser/str/
maybe_pct_encoded.rs

1//! Processor for possibly- or invalidly-percent-encoded strings.
2
3use core::fmt::{self, Write as _};
4use core::marker::PhantomData;
5use core::num::NonZeroU8;
6use core::ops::ControlFlow;
7
8use crate::parser::str::find_split;
9use crate::parser::trusted::hexdigits_to_byte;
10
/// Fragment in a possibly percent-encoded (and possibly broken) string.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum PctEncodedFragments<'a> {
    /// String fragment without percent-encoded triplets.
    ///
    /// This is also used for a hexdigit that follows a stray `%` but does not
    /// complete a percent-encoded triplet.
    NoPctStr(&'a str),
    /// Stray `%` (percent) character, i.e. a `%` that does not start a
    /// complete percent-encoded triplet.
    StrayPercent,
    /// Valid percent-encoded triplets for a character.
    // `.0`: The percent-encoded triplets as they appear in the input.
    // `.1`: The character the triplets are decoded to.
    Char(&'a str, char),
    /// Percent-encoded triplets that do not consist of a valid UTF-8 sequence.
    ///
    /// The string can be empty when only an incomplete triplet (a stray `%`
    /// optionally followed by a hexdigit) is reported.
    InvalidUtf8PctTriplets(&'a str),
}
23
24/// Processes characters in a string which may contain (possibly invalid) percent-encoded triplets.
25pub(crate) fn process_percent_encoded_best_effort<T, F, B>(
26    v: T,
27    mut f: F,
28) -> Result<ControlFlow<B>, fmt::Error>
29where
30    T: fmt::Display,
31    F: FnMut(PctEncodedFragments<'_>) -> ControlFlow<B>,
32{
33    let mut buf = [0_u8; 12];
34    let mut writer = DecomposeWriter {
35        f: &mut f,
36        decoder: Default::default(),
37        buf: &mut buf,
38        result: ControlFlow::Continue(()),
39        _r: PhantomData,
40    };
41
42    if write!(writer, "{v}").is_err() {
43        match writer.result {
44            ControlFlow::Continue(_) => return Err(fmt::Error),
45            ControlFlow::Break(v) => return Ok(ControlFlow::Break(v)),
46        }
47    }
48
49    // Flush the internal buffer of the decoder.
50    if let Some(len) = writer.decoder.flush(&mut buf).map(|v| usize::from(v.get())) {
51        let len_suffix = len % 3;
52        let triplets_end = len - len_suffix;
53        let triplets = core::str::from_utf8(&buf[..triplets_end])
54            .expect("percent-encoded triplets consist of ASCII characters");
55        if let ControlFlow::Break(v) = f(PctEncodedFragments::InvalidUtf8PctTriplets(triplets)) {
56            return Ok(ControlFlow::Break(v));
57        }
58
59        if len_suffix > 0 {
60            if let ControlFlow::Break(v) = f(PctEncodedFragments::StrayPercent) {
61                return Ok(ControlFlow::Break(v));
62            }
63        }
64        if len_suffix > 1 {
65            let after_percent =
66                core::str::from_utf8(&buf[(triplets_end + 1)..(triplets_end + len_suffix)])
67                    .expect("percent-encoded triplets contains only ASCII characters");
68            if let ControlFlow::Break(v) = f(PctEncodedFragments::NoPctStr(after_percent)) {
69                return Ok(ControlFlow::Break(v));
70            }
71        }
72    }
73
74    Ok(ControlFlow::Continue(()))
75}
76
/// Writer to decompose the input into fragments.
///
/// Text written to this writer is split into [`PctEncodedFragments`], and each
/// fragment is passed to the output function `f`.
struct DecomposeWriter<'a, F, B> {
    /// Output function.
    f: &'a mut F,
    /// Decoder that keeps the percent-encoded triplets which are not resolved yet.
    decoder: DecoderBuffer,
    /// Buffer the decoder writes a fragment to before the fragment is passed to `f`.
    buf: &'a mut [u8],
    /// Result of the last output function call.
    result: ControlFlow<B>,
    /// Dummy field for the type parameter of the return type of the function `f`.
    _r: PhantomData<fn() -> B>,
}
90impl<F, B> DecomposeWriter<'_, F, B>
91where
92    F: FnMut(PctEncodedFragments<'_>) -> ControlFlow<B>,
93{
94    /// Returns `Ok(_)` if the stored result is `Continue`, and `Err(_)` otherwise.
95    #[inline(always)]
96    fn result_continue_or_err(&self) -> fmt::Result {
97        if self.result.is_break() {
98            return Err(fmt::Error);
99        }
100        Ok(())
101    }
102
103    /// Calls the output functions with the undecodable fragments.
104    fn output_as_undecodable(&mut self, len_undecodable: u8) -> fmt::Result {
105        let len_written = usize::from(len_undecodable);
106        let frag = core::str::from_utf8(&self.buf[..len_written])
107            .expect("`DecoderBuffer` writes a valid ASCII string");
108        let len_incomplete = len_written % 3;
109        let len_complete = len_written - len_incomplete;
110        self.result = (self.f)(PctEncodedFragments::InvalidUtf8PctTriplets(
111            &frag[..len_complete],
112        ));
113        self.result_continue_or_err()?;
114        if len_incomplete > 0 {
115            // At least the first `%` exists.
116            self.result = (self.f)(PctEncodedFragments::StrayPercent);
117            if self.result.is_break() {
118                return Err(fmt::Error);
119            }
120            if len_incomplete > 1 {
121                // A following hexdigit is available.
122                debug_assert_eq!(
123                    len_incomplete, 2,
124                    "the length of incomplete percent-encoded triplet must be less than 2 bytes"
125                );
126                self.result = (self.f)(PctEncodedFragments::NoPctStr(
127                    &frag[(len_complete + 1)..len_written],
128                ));
129                self.result_continue_or_err()?;
130            }
131        }
132        Ok(())
133    }
134}
135
136impl<F, B> fmt::Write for DecomposeWriter<'_, F, B>
137where
138    F: FnMut(PctEncodedFragments<'_>) -> ControlFlow<B>,
139{
140    fn write_str(&mut self, s: &str) -> fmt::Result {
141        self.result_continue_or_err()?;
142        let mut rest = s;
143        while !rest.is_empty() {
144            let (len_consumed, result) = self.decoder.push_encoded(self.buf, rest);
145            if len_consumed == 0 {
146                // `rest` does not start with the percent-encoded triplets.
147                // Flush the decoder before attempting to decode more data.
148                if let Some(len_written) = self.decoder.flush(self.buf).map(NonZeroU8::get) {
149                    self.output_as_undecodable(len_written)?;
150                    rest = &rest[usize::from(len_written)..];
151                }
152
153                // Write plain string prefix (if found).
154                let (plain_prefix, suffix) = find_split(rest, b'%').unwrap_or((rest, ""));
155                debug_assert!(
156                    !plain_prefix.is_empty(),
157                    "`len_consumed == 0` indicates non-empty `rest` not starting with `%`"
158                );
159                self.result = (self.f)(PctEncodedFragments::NoPctStr(plain_prefix));
160                self.result_continue_or_err()?;
161                rest = suffix;
162                continue;
163            }
164
165            // Process decoding result.
166            match result {
167                PushResult::Decoded(len_written, c) => {
168                    let len_written = usize::from(len_written.get());
169                    let frag = core::str::from_utf8(&self.buf[..len_written])
170                        .expect("`DecoderBuffer` writes a valid ASCII string");
171                    self.result = (self.f)(PctEncodedFragments::Char(frag, c));
172                    self.result_continue_or_err()?;
173                }
174                PushResult::Undecodable(len_written) => {
175                    self.output_as_undecodable(len_written)?;
176                }
177                PushResult::NeedMoreBytes => {
178                    // Nothing to write at this time.
179                }
180            }
181            rest = &rest[len_consumed..];
182        }
183        Ok(())
184    }
185}
186
187/// A type for result of feeding data to [`DecoderBuffer`].
188#[derive(Debug, Clone, Copy)]
189enum PushResult {
190    /// Input is still incomplete, needs more bytes to get the decoding result.
191    NeedMoreBytes,
192    /// Bytes decodable to valid UTF-8 sequence.
193    // `.0`: Length of decodable fragment.
194    // `.1`: Decoded character.
195    Decoded(NonZeroU8, char),
196    /// Valid percent-encoded triplets but not decodable to valid UTF-8 sequence.
197    // `.0`: Length of undecodable fragment.
198    Undecodable(u8),
199}
200
201/// Buffer to contain (and to decode) incomplete percent-encoded triplets.
202#[derive(Default, Debug, Clone, Copy)]
203struct DecoderBuffer {
204    /// Percent-encoded triplets that possibly consists a valid UTF-8 sequence after decoded.
205    //
206    // `3 * 4`: 3 ASCII characters for single percent-encoded triplet, and
207    // 4 triplets at most for single Unicode codepoint in UTF-8.
208    encoded: [u8; 12],
209    /// Decoded bytes.
210    decoded: [u8; 4],
211    /// Number of bytes available in `buf_encoded` buffer.
212    ///
213    /// `buf_encoded_len / 3` also indicates the length of data in `decoded`.
214    len_encoded: u8,
215}
216
217impl DecoderBuffer {
218    /// Writes the data of the given length to the destination, and remove that part from buffer.
219    fn write_and_pop(&mut self, dest: &mut [u8], remove_len: u8) {
220        let new_len = self.len_encoded - remove_len;
221        let remove_len = usize::from(remove_len);
222        let src_range = remove_len..usize::from(self.len_encoded);
223        dest[..remove_len].copy_from_slice(&self.encoded[..remove_len]);
224
225        if new_len == 0 {
226            *self = Self::default();
227            return;
228        }
229        self.encoded.copy_within(src_range, 0);
230        self.decoded
231            .copy_within((remove_len / 3)..usize::from(self.len_encoded / 3), 0);
232        self.len_encoded = new_len;
233    }
234
235    /// Pushes a byte of a (possible) percent-encoded tripet to the buffer.
236    fn push_single_encoded_byte(&mut self, byte: u8) {
237        debug_assert!(
238            self.len_encoded < 12,
239            "four percent-encoded triplets are enough for a unicode code point"
240        );
241        let pos_enc = usize::from(self.len_encoded);
242        self.len_encoded += 1;
243        self.encoded[pos_enc] = byte;
244        if self.len_encoded % 3 == 0 {
245            // A new percent-encoded triplet is read. Decode and remember.
246            let pos_dec = usize::from(self.len_encoded / 3 - 1);
247            let upper = self.encoded[pos_enc - 1];
248            let lower = byte;
249            debug_assert!(
250                upper.is_ascii_hexdigit() && lower.is_ascii_hexdigit(),
251                "the `encoded` buffer should contain valid percent-encoded triplets"
252            );
253            self.decoded[pos_dec] = hexdigits_to_byte([upper, lower]);
254        }
255    }
256
257    /// Pushes the (possibly) encoded string to the buffer.
258    ///
259    /// When the push result is not `PctTripletPushResult::NeedMoreBytes`, the
260    /// caller should call `Self::clear()` before pushing more bytes.
261    ///
262    /// # Preconditions
263    ///
264    /// * `buf` should be more than 12 bytes. If not, this method may panic.
265    #[must_use]
266    pub(crate) fn push_encoded(&mut self, buf: &mut [u8], s: &str) -> (usize, PushResult) {
267        debug_assert!(
268            buf.len() >= 12,
269            "[precondition] destination buffer should be at least 12 bytes"
270        );
271        let mut chars = s.chars();
272        let mut len_triplet_incomplete = self.len_encoded % 3;
273        for c in &mut chars {
274            if len_triplet_incomplete == 0 {
275                // Expect `%`.
276                if c != '%' {
277                    // Undecodable.
278                    // `-1`: the last byte is peeked but not consumed.
279                    let len_consumed = s.len() - chars.as_str().len() - 1;
280                    let len_result = self.len_encoded;
281                    self.write_and_pop(buf, len_result);
282                    return (len_consumed, PushResult::Undecodable(len_result));
283                }
284                self.push_single_encoded_byte(b'%');
285                len_triplet_incomplete = 1;
286                continue;
287            }
288
289            // Expect a nibble.
290            if !c.is_ascii_hexdigit() {
291                // Undecodable.
292                // `-1`: the last byte is peeked but not consumed.
293                let len_consumed = s.len() - chars.as_str().len() - 1;
294                let len_result = self.len_encoded;
295                self.write_and_pop(buf, len_result);
296                return (len_consumed, PushResult::Undecodable(len_result));
297            }
298            self.push_single_encoded_byte(c as u8);
299            if len_triplet_incomplete == 1 {
300                len_triplet_incomplete = 2;
301                continue;
302            } else {
303                // Now a new percent-encoded triplet is read!
304                debug_assert_eq!(len_triplet_incomplete, 2);
305                len_triplet_incomplete = 0;
306            }
307
308            // Now a new percent-encoded triplet is read.
309            // Check if the buffer contains a valid decodable content.
310            let len_decoded = usize::from(self.len_encoded) / 3;
311            match core::str::from_utf8(&self.decoded[..len_decoded]) {
312                Ok(decoded_str) => {
313                    // Successfully decoded.
314                    let len_consumed = s.len() - chars.as_str().len();
315                    let c = decoded_str
316                        .chars()
317                        .next()
318                        .expect("`decoded` buffer is nonempty");
319                    let len_result = NonZeroU8::new(self.len_encoded).expect(
320                        "`encoded` buffer is nonempty since \
321                         `push_single_encoded_byte()` was called",
322                    );
323                    self.write_and_pop(buf, len_result.get());
324                    return (len_consumed, PushResult::Decoded(len_result, c));
325                }
326                Err(e) => {
327                    // Undecodable.
328                    assert_eq!(
329                        e.valid_up_to(),
330                        0,
331                        "`decoded` buffer contains at most one character"
332                    );
333                    let skip_len_decoded = match e.error_len() {
334                        // Unexpected EOF. Wait for remaining input.
335                        None => continue,
336                        // Skip invalid bytes.
337                        Some(v) => v,
338                    };
339                    let len_consumed = s.len() - chars.as_str().len();
340                    let len_result = skip_len_decoded as u8 * 3;
341                    assert_ne!(skip_len_decoded, 0, "empty bytes cannot be invalid");
342                    self.write_and_pop(buf, len_result);
343                    return (len_consumed, PushResult::Undecodable(len_result));
344                }
345            };
346        }
347        let len_consumed = s.len() - chars.as_str().len();
348        (len_consumed, PushResult::NeedMoreBytes)
349    }
350
351    /// Writes the incomplete data completely to the destination, and clears the internal buffer.
352    #[must_use]
353    pub(crate) fn flush(&mut self, buf: &mut [u8]) -> Option<NonZeroU8> {
354        let len_result = NonZeroU8::new(self.len_encoded)?;
355        // Emit the current (undecodable) buffer as is.
356        self.write_and_pop(buf, len_result.get());
357        debug_assert_eq!(
358            self.len_encoded, 0,
359            "the buffer should be cleared after flushed"
360        );
361        Some(len_result)
362    }
363}