iri_string/normalize/pct_case.rs
1//! Percent-encoding normalization and case normalization.
2
3use core::fmt::{self, Write as _};
4use core::marker::PhantomData;
5
6use crate::format::eq_str_display;
7use crate::parser::char::{is_ascii_unreserved, is_unreserved, is_utf8_byte_continue};
8use crate::parser::str::{find_split_hole, take_first_char};
9use crate::parser::trusted::take_xdigits2;
10use crate::spec::Spec;
11
12/// Returns true if the given string is percent-encoding normalized and case
13/// normalized.
14///
15/// Note that normalization of ASCII-only host requires additional case
16/// normalization, so checking by this function is not sufficient for that case.
17pub(crate) fn is_pct_case_normalized<S: Spec>(s: &str) -> bool {
18 eq_str_display(s, &PctCaseNormalized::<S>::new(s))
19}
20
/// Decodes a 2-, 3-, or 4-byte UTF-8 sequence into the character it encodes.
///
/// This function trusts the shape of the input: `bytes[0]` is treated as a
/// leading byte for a sequence of `bytes.len()` bytes and every following byte
/// is treated as a continue byte. Only the payload bits are extracted; the
/// marker bits are not verified.
///
/// # Errors
///
/// Returns `Err(())` when the sequence is a redundant (overlong) encoding, or
/// when the decoded value is not a valid Unicode code point for `char`.
///
/// # Panics
///
/// Panics if `bytes.len()` is not 2, 3, or 4.
fn into_char_trusted(bytes: &[u8]) -> Result<char, ()> {
    /// The bit mask to get the content part in a continue byte.
    const CONTINUE_BYTE_MASK: u8 = 0b_0011_1111;

    // For each supported length: the payload mask of the leading byte, and the
    // smallest code point that legitimately needs that many bytes.
    let (lead_mask, min_code_point): (u8, u32) = match bytes.len() {
        2 => (0b_0001_1111, 0x80),
        3 => (0b_0000_1111, 0x800),
        4 => (0b_0000_0111, 0x1_0000),
        len => {
            unreachable!("expected 2, 3, or 4 bytes for a character, but got {len} as the length")
        }
    };

    // Start from the leading byte's payload and append six bits per continue byte.
    let code_point = bytes[1..]
        .iter()
        .fold(u32::from(bytes[0] & lead_mask), |acc, &byte| {
            (acc << 6) | u32::from(byte & CONTINUE_BYTE_MASK)
        });

    if code_point < min_code_point {
        // Redundant UTF-8 encoding.
        Err(())
    } else {
        // Can still be an invalid Unicode code point.
        char::from_u32(code_point).ok_or(())
    }
}
57
/// A display wrapper that writes a path with percent-encoding normalization
/// applied.
///
/// When formatted, the wrapper:
///
/// * decodes percent-encoded triplets that did not need to be encoded, and
/// * writes the hexadecimal digits of the remaining triplets in uppercase.
///
/// Raw (not percent-encoded) characters are never newly encoded.
///
/// # Preconditions
///
/// The wrapped string should be a valid path.
#[derive(Debug, Clone, Copy)]
pub(crate) struct PctCaseNormalized<'a, S> {
    /// The valid path that is normalized on formatting.
    path: &'a str,
    /// Marker for the spec type `S`; no value of `S` is stored.
    _spec: PhantomData<fn() -> S>,
}
77
78impl<'a, S: Spec> PctCaseNormalized<'a, S> {
79 /// Creates a new `PctCaseNormalized` value.
80 #[inline]
81 #[must_use]
82 pub(crate) fn new(source: &'a str) -> Self {
83 Self {
84 path: source,
85 _spec: PhantomData,
86 }
87 }
88}
89
90impl<S: Spec> fmt::Display for PctCaseNormalized<'_, S> {
91 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
92 let mut rest = self.path;
93
94 'outer_loop: while !rest.is_empty() {
95 // Scan the next percent-encoded triplet.
96 let (prefix, after_percent) = match find_split_hole(rest, b'%') {
97 Some(v) => v,
98 None => return f.write_str(rest),
99 };
100 // Write the string before the percent-encoded triplet.
101 f.write_str(prefix)?;
102 // Decode the percent-encoded triplet.
103 let (first_decoded, after_first_triplet) = take_xdigits2(after_percent);
104 rest = after_first_triplet;
105
106 let expected_char_len = match first_decoded {
107 0x00..=0x7F => {
108 // An ASCII character.
109 debug_assert!(first_decoded.is_ascii());
110 if is_ascii_unreserved(first_decoded) {
111 // Unreserved. Print the decoded.
112 f.write_char(char::from(first_decoded))?;
113 } else {
114 write!(f, "%{:02X}", first_decoded)?;
115 }
116 continue 'outer_loop;
117 }
118 0xC2..=0xDF => 2,
119 0xE0..=0xEF => 3,
120 0xF0..=0xF4 => 4,
121 0x80..=0xC1 | 0xF5..=0xFF => {
122 // Cannot appear as a first byte.
123 //
124 // * 0x80..=0xBF: continue byte.
125 // * 0xC0..=0xC1: redundant encoding.
126 // * 0xF5..=0xFF: above the maximum value for U+10FFFF.
127 write!(f, "%{:02X}", first_decoded)?;
128 continue 'outer_loop;
129 }
130 };
131
132 // Get continue bytes.
133 let c_buf = &mut [first_decoded, 0, 0, 0][..expected_char_len];
134 for (i, buf_dest) in c_buf[1..].iter_mut().enumerate() {
135 match take_first_char(rest) {
136 Some(('%', after_percent)) => {
137 let (byte, after_triplet) = take_xdigits2(after_percent);
138 if !is_utf8_byte_continue(byte) {
139 // Note that `byte` can start the new string.
140 // Leave the byte in the `rest` for next try (i.e.
141 // don't update `rest` in this case).
142 c_buf[..=i]
143 .iter()
144 .try_for_each(|b| write!(f, "%{:02X}", b))?;
145 continue 'outer_loop;
146 }
147 *buf_dest = byte;
148 rest = after_triplet;
149 }
150 // If the next character is not `%`, decoded bytes so far
151 // won't be valid UTF-8 byte sequence.
152 // Write the read percent-encoded triplets without decoding.
153 // Note that all characters in `&c_buf[1..]` (if available)
154 // will be decoded to "continue byte" of UTF-8, so they
155 // cannot be the start of a valid UTF-8 byte sequence if
156 // decoded.
157 Some((c, after_percent)) => {
158 c_buf[..=i]
159 .iter()
160 .try_for_each(|b| write!(f, "%{:02X}", b))?;
161 f.write_char(c)?;
162 rest = after_percent;
163 continue 'outer_loop;
164 }
165 None => {
166 c_buf[..=i]
167 .iter()
168 .try_for_each(|b| write!(f, "%{:02X}", b))?;
169 // Reached the end of the string.
170 break 'outer_loop;
171 }
172 }
173 }
174
175 // Decode the bytes into a character.
176 match into_char_trusted(&c_buf[..expected_char_len]) {
177 Ok(decoded_c) => {
178 if is_unreserved::<S>(decoded_c) {
179 // Unreserved. Print the decoded.
180 f.write_char(decoded_c)?;
181 } else {
182 c_buf[0..expected_char_len]
183 .iter()
184 .try_for_each(|b| write!(f, "%{:02X}", b))?;
185 }
186 }
187 Err(_) => {
188 // Skip decoding of the entire sequence of pct-encoded triplets loaded
189 // in `c_buf`. This is valid from the reasons below.
190 //
191 // * The first byte in `c_buf` is valid as the first byte, and it tells the
192 // expected number of bytes for a code unit. The cases the bytes being too
193 // short and the sequence being incomplete have already been handled, and
194 // the execution does not reach here then.
195 // * All of the non-first bytes are checked if they are valid as UTF8 continue
196 // bytes by `is_utf8_byte_continue()`. If they're not, the decoding of
197 // that codepoint is aborted and the bytes in the buffer are immediately
198 // emitted as pct-encoded, and the execution does not reach here. This
199 // means that the bytes in the current `c_buf` have passed these tests.
200 // * Since all of the the non-first bytes are UTF8 continue bytes, any of
201 // them cannot start the new valid UTF-8 byte sequence. This means that
202 // if the bytes in the buffer does not consitute a valid UTF-8 bytes
203 // sequence, the whole buffer can immediately be emmitted as pct-encoded.
204
205 debug_assert!(
206 c_buf[1..expected_char_len]
207 .iter()
208 .copied()
209 .all(is_utf8_byte_continue),
210 "all non-first bytes have been confirmed to be UTF-8 continue bytes"
211 );
212 // Note that the first pct-encoded triplet is stripped from
213 // `after_first_triplet`.
214 rest = &after_first_triplet[((expected_char_len - 1) * 3)..];
215 c_buf[0..expected_char_len]
216 .iter()
217 .try_for_each(|b| write!(f, "%{:02X}", b))?;
218 }
219 }
220 }
221
222 Ok(())
223 }
224}
225
/// A display wrapper that writes an ASCII-only `host` (optionally followed by
/// a `port`) in normalized form.
#[derive(Debug, Clone, Copy)]
pub(crate) struct NormalizedAsciiOnlyHost<'a> {
    /// Valid host (and additionally port) to normalize.
    host_port: &'a str,
}
232
233impl<'a> NormalizedAsciiOnlyHost<'a> {
234 /// Creates a new `NormalizedAsciiOnlyHost` value.
235 ///
236 /// # Preconditions
237 ///
238 /// The given string should be the valid ASCII-only `host` or
239 /// `host ":" port` after percent-encoding normalization.
240 /// In other words, [`parser::trusted::is_ascii_only_host`] should return
241 /// true for the given value.
242 ///
243 /// [`parser::trusted::is_ascii_only_host`]: `crate::parser::trusted::is_ascii_only_host`
244 #[inline]
245 #[must_use]
246 pub(crate) fn new(host_port: &'a str) -> Self {
247 Self { host_port }
248 }
249}
250
251impl fmt::Display for NormalizedAsciiOnlyHost<'_> {
252 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
253 let mut rest = self.host_port;
254
255 while !rest.is_empty() {
256 // Scan the next percent-encoded triplet.
257 let (prefix, after_percent) = match find_split_hole(rest, b'%') {
258 Some(v) => v,
259 None => {
260 return rest
261 .chars()
262 .try_for_each(|c| f.write_char(c.to_ascii_lowercase()));
263 }
264 };
265 // Write the string before the percent-encoded triplet.
266 prefix
267 .chars()
268 .try_for_each(|c| f.write_char(c.to_ascii_lowercase()))?;
269 // Decode the percent-encoded triplet.
270 let (first_decoded, after_triplet) = take_xdigits2(after_percent);
271 rest = after_triplet;
272
273 assert!(
274 first_decoded.is_ascii(),
275 "this function requires ASCII-only host as an argument"
276 );
277
278 if is_ascii_unreserved(first_decoded) {
279 // Unreserved. Convert to lowercase and print.
280 f.write_char(char::from(first_decoded.to_ascii_lowercase()))?;
281 } else {
282 write!(f, "%{:02X}", first_decoded)?;
283 }
284 }
285
286 Ok(())
287 }
288}
289
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::*;

    #[cfg(all(feature = "alloc", not(feature = "std")))]
    use alloc::string::ToString;

    use crate::spec::{IriSpec, UriSpec};

    /// Asserts that formatting `input` through `PctCaseNormalized::<S>`
    /// produces exactly `expected`.
    #[track_caller]
    fn assert_normalized<S: Spec>(input: &str, expected: &str) {
        assert_eq!(PctCaseNormalized::<S>::new(input).to_string(), expected);
    }

    // Bytes that cannot form valid UTF-8 stay encoded, with uppercase digits.
    #[test]
    fn invalid_utf8() {
        assert_normalized::<UriSpec>("%80%cc%cc%cc", "%80%CC%CC%CC");
        assert_normalized::<IriSpec>("%80%cc%cc%cc", "%80%CC%CC%CC");
    }

    // U+03B1 is decoded for IRIs but stays encoded for URIs.
    #[test]
    fn iri_unreserved() {
        assert_normalized::<UriSpec>("%ce%b1", "%CE%B1");
        assert_normalized::<IriSpec>("%ce%b1", "\u{03B1}");
    }

    // A decodable character surrounded by stray bytes is decoded on its own.
    #[test]
    fn iri_middle_decode() {
        assert_normalized::<UriSpec>("%ce%ce%b1%b1", "%CE%CE%B1%B1");
        assert_normalized::<IriSpec>("%ce%ce%b1%b1", "%CE\u{03B1}%B1");
    }

    #[test]
    fn ascii_reserved() {
        assert_normalized::<UriSpec>("%3f", "%3F");
        assert_normalized::<IriSpec>("%3f", "%3F");
    }

    #[test]
    fn ascii_forbidden() {
        assert_normalized::<UriSpec>("%3c%3e", "%3C%3E");
        assert_normalized::<IriSpec>("%3c%3e", "%3C%3E");
    }

    #[test]
    fn ascii_unreserved() {
        assert_normalized::<UriSpec>("%7ea", "~a");
        assert_normalized::<IriSpec>("%7ea", "~a");
    }
}