chrono/format/
scan.rs

// This is a part of Chrono.
// See README.md and LICENSE.txt for details.

/*!
 * Various scanning routines for the parser.
 */
7
8use super::{INVALID, OUT_OF_RANGE, ParseResult, TOO_SHORT};
9use crate::Weekday;
10
11/// Tries to parse the non-negative number from `min` to `max` digits.
12///
13/// The absence of digits at all is an unconditional error.
14/// More than `max` digits are consumed up to the first `max` digits.
15/// Any number that does not fit in `i64` is an error.
16#[inline]
17pub(super) fn number(s: &str, min: usize, max: usize) -> ParseResult<(&str, i64)> {
18    assert!(min <= max);
19
20    // We are only interested in ascii numbers, so we can work with the `str` as bytes. We stop on
21    // the first non-numeric byte, which may be another ascii character or beginning of multi-byte
22    // UTF-8 character.
23    let bytes = s.as_bytes();
24    if bytes.len() < min {
25        return Err(TOO_SHORT);
26    }
27
28    let mut n = 0i64;
29    for (i, c) in bytes.iter().take(max).cloned().enumerate() {
30        // cloned() = copied()
31        if !c.is_ascii_digit() {
32            if i < min {
33                return Err(INVALID);
34            } else {
35                return Ok((&s[i..], n));
36            }
37        }
38
39        n = match n.checked_mul(10).and_then(|n| n.checked_add((c - b'0') as i64)) {
40            Some(n) => n,
41            None => return Err(OUT_OF_RANGE),
42        };
43    }
44
45    Ok((&s[core::cmp::min(max, bytes.len())..], n))
46}
47
48/// Tries to consume at least one digits as a fractional second.
49/// Returns the number of whole nanoseconds (0--999,999,999).
50pub(super) fn nanosecond(s: &str) -> ParseResult<(&str, u32)> {
51    // record the number of digits consumed for later scaling.
52    let origlen = s.len();
53    let (s, v) = number(s, 1, 9)?;
54    let v = u32::try_from(v).expect("999,999,999 should fit u32");
55    let consumed = origlen - s.len();
56
57    // scale the number accordingly.
58    const SCALE: [u32; 10] =
59        [0, 100_000_000, 10_000_000, 1_000_000, 100_000, 10_000, 1_000, 100, 10, 1];
60    let v = v.checked_mul(SCALE[consumed]).ok_or(OUT_OF_RANGE)?;
61
62    // if there are more than 9 digits, skip next digits.
63    let s = s.trim_start_matches(|c: char| c.is_ascii_digit());
64
65    Ok((s, v))
66}
67
68/// Tries to consume a fixed number of digits as a fractional second.
69/// Returns the number of whole nanoseconds (0--999,999,999).
70pub(super) fn nanosecond_fixed(s: &str, digits: usize) -> ParseResult<(&str, i64)> {
71    // record the number of digits consumed for later scaling.
72    let (s, v) = number(s, digits, digits)?;
73
74    // scale the number accordingly.
75    static SCALE: [i64; 10] =
76        [0, 100_000_000, 10_000_000, 1_000_000, 100_000, 10_000, 1_000, 100, 10, 1];
77    let v = v.checked_mul(SCALE[digits]).ok_or(OUT_OF_RANGE)?;
78
79    Ok((s, v))
80}
81
82/// Tries to parse the month index (0 through 11) with the first three ASCII letters.
83pub(super) fn short_month0(s: &str) -> ParseResult<(&str, u8)> {
84    if s.len() < 3 {
85        return Err(TOO_SHORT);
86    }
87    let buf = s.as_bytes();
88    let month0 = match (buf[0] | 32, buf[1] | 32, buf[2] | 32) {
89        (b'j', b'a', b'n') => 0,
90        (b'f', b'e', b'b') => 1,
91        (b'm', b'a', b'r') => 2,
92        (b'a', b'p', b'r') => 3,
93        (b'm', b'a', b'y') => 4,
94        (b'j', b'u', b'n') => 5,
95        (b'j', b'u', b'l') => 6,
96        (b'a', b'u', b'g') => 7,
97        (b's', b'e', b'p') => 8,
98        (b'o', b'c', b't') => 9,
99        (b'n', b'o', b'v') => 10,
100        (b'd', b'e', b'c') => 11,
101        _ => return Err(INVALID),
102    };
103    Ok((&s[3..], month0))
104}
105
106/// Tries to parse the weekday with the first three ASCII letters.
107pub(super) fn short_weekday(s: &str) -> ParseResult<(&str, Weekday)> {
108    if s.len() < 3 {
109        return Err(TOO_SHORT);
110    }
111    let buf = s.as_bytes();
112    let weekday = match (buf[0] | 32, buf[1] | 32, buf[2] | 32) {
113        (b'm', b'o', b'n') => Weekday::Mon,
114        (b't', b'u', b'e') => Weekday::Tue,
115        (b'w', b'e', b'd') => Weekday::Wed,
116        (b't', b'h', b'u') => Weekday::Thu,
117        (b'f', b'r', b'i') => Weekday::Fri,
118        (b's', b'a', b't') => Weekday::Sat,
119        (b's', b'u', b'n') => Weekday::Sun,
120        _ => return Err(INVALID),
121    };
122    Ok((&s[3..], weekday))
123}
124
125/// Tries to parse the month index (0 through 11) with short or long month names.
126/// It prefers long month names to short month names when both are possible.
127pub(super) fn short_or_long_month0(s: &str) -> ParseResult<(&str, u8)> {
128    // lowercased month names, minus first three chars
129    static LONG_MONTH_SUFFIXES: [&[u8]; 12] = [
130        b"uary", b"ruary", b"ch", b"il", b"", b"e", b"y", b"ust", b"tember", b"ober", b"ember",
131        b"ember",
132    ];
133
134    let (mut s, month0) = short_month0(s)?;
135
136    // tries to consume the suffix if possible
137    let suffix = LONG_MONTH_SUFFIXES[month0 as usize];
138    if s.len() >= suffix.len() && s.as_bytes()[..suffix.len()].eq_ignore_ascii_case(suffix) {
139        s = &s[suffix.len()..];
140    }
141
142    Ok((s, month0))
143}
144
145/// Tries to parse the weekday with short or long weekday names.
146/// It prefers long weekday names to short weekday names when both are possible.
147pub(super) fn short_or_long_weekday(s: &str) -> ParseResult<(&str, Weekday)> {
148    // lowercased weekday names, minus first three chars
149    static LONG_WEEKDAY_SUFFIXES: [&[u8]; 7] =
150        [b"day", b"sday", b"nesday", b"rsday", b"day", b"urday", b"day"];
151
152    let (mut s, weekday) = short_weekday(s)?;
153
154    // tries to consume the suffix if possible
155    let suffix = LONG_WEEKDAY_SUFFIXES[weekday.num_days_from_monday() as usize];
156    if s.len() >= suffix.len() && s.as_bytes()[..suffix.len()].eq_ignore_ascii_case(suffix) {
157        s = &s[suffix.len()..];
158    }
159
160    Ok((s, weekday))
161}
162
163/// Tries to consume exactly one given character.
164pub(super) fn char(s: &str, c1: u8) -> ParseResult<&str> {
165    match s.as_bytes().first() {
166        Some(&c) if c == c1 => Ok(&s[1..]),
167        Some(_) => Err(INVALID),
168        None => Err(TOO_SHORT),
169    }
170}
171
172/// Tries to consume one or more whitespace.
173pub(super) fn space(s: &str) -> ParseResult<&str> {
174    let s_ = s.trim_start();
175    if s_.len() < s.len() {
176        Ok(s_)
177    } else if s.is_empty() {
178        Err(TOO_SHORT)
179    } else {
180        Err(INVALID)
181    }
182}
183
184/// Consumes any number (including zero) of colon or spaces.
185pub(crate) fn colon_or_space(s: &str) -> ParseResult<&str> {
186    Ok(s.trim_start_matches(|c: char| c == ':' || c.is_whitespace()))
187}
188
189/// Parse a timezone from `s` and return the offset in seconds.
190///
191/// The `consume_colon` function is used to parse a mandatory or optional `:`
192/// separator between hours offset and minutes offset.
193///
194/// The `allow_missing_minutes` flag allows the timezone minutes offset to be
195/// missing from `s`.
196///
197/// The `allow_tz_minus_sign` flag allows the timezone offset negative character
198/// to also be `−` MINUS SIGN (U+2212) in addition to the typical
199/// ASCII-compatible `-` HYPHEN-MINUS (U+2D).
200/// This is part of [RFC 3339 & ISO 8601].
201///
202/// [RFC 3339 & ISO 8601]: https://en.wikipedia.org/w/index.php?title=ISO_8601&oldid=1114309368#Time_offsets_from_UTC
203pub(crate) fn timezone_offset<F>(
204    mut s: &str,
205    mut consume_colon: F,
206    allow_zulu: bool,
207    allow_missing_minutes: bool,
208    allow_tz_minus_sign: bool,
209) -> ParseResult<(&str, i32)>
210where
211    F: FnMut(&str) -> ParseResult<&str>,
212{
213    if allow_zulu {
214        if let Some(&b'Z' | &b'z') = s.as_bytes().first() {
215            return Ok((&s[1..], 0));
216        }
217    }
218
219    const fn digits(s: &str) -> ParseResult<(u8, u8)> {
220        let b = s.as_bytes();
221        if b.len() < 2 { Err(TOO_SHORT) } else { Ok((b[0], b[1])) }
222    }
223    let negative = match s.chars().next() {
224        Some('+') => {
225            // PLUS SIGN (U+2B)
226            s = &s['+'.len_utf8()..];
227
228            false
229        }
230        Some('-') => {
231            // HYPHEN-MINUS (U+2D)
232            s = &s['-'.len_utf8()..];
233
234            true
235        }
236        Some('−') => {
237            // MINUS SIGN (U+2212)
238            if !allow_tz_minus_sign {
239                return Err(INVALID);
240            }
241            s = &s['−'.len_utf8()..];
242
243            true
244        }
245        Some(_) => return Err(INVALID),
246        None => return Err(TOO_SHORT),
247    };
248
249    // hours (00--99)
250    let hours = match digits(s)? {
251        (h1 @ b'0'..=b'9', h2 @ b'0'..=b'9') => i32::from((h1 - b'0') * 10 + (h2 - b'0')),
252        _ => return Err(INVALID),
253    };
254    s = &s[2..];
255
256    // colons (and possibly other separators)
257    s = consume_colon(s)?;
258
259    // minutes (00--59)
260    // if the next two items are digits then we have to add minutes
261    let minutes = if let Ok(ds) = digits(s) {
262        match ds {
263            (m1 @ b'0'..=b'5', m2 @ b'0'..=b'9') => i32::from((m1 - b'0') * 10 + (m2 - b'0')),
264            (b'6'..=b'9', b'0'..=b'9') => return Err(OUT_OF_RANGE),
265            _ => return Err(INVALID),
266        }
267    } else if allow_missing_minutes {
268        0
269    } else {
270        return Err(TOO_SHORT);
271    };
272    s = match s.len() {
273        len if len >= 2 => &s[2..],
274        0 => s,
275        _ => return Err(TOO_SHORT),
276    };
277
278    let seconds = hours * 3600 + minutes * 60;
279    Ok((s, if negative { -seconds } else { seconds }))
280}
281
282/// Same as `timezone_offset` but also allows for RFC 2822 legacy timezones.
283/// May return `None` which indicates an insufficient offset data (i.e. `-0000`).
284/// See [RFC 2822 Section 4.3].
285///
286/// [RFC 2822 Section 4.3]: https://tools.ietf.org/html/rfc2822#section-4.3
287pub(super) fn timezone_offset_2822(s: &str) -> ParseResult<(&str, i32)> {
288    // tries to parse legacy time zone names
289    let upto = s.as_bytes().iter().position(|&c| !c.is_ascii_alphabetic()).unwrap_or(s.len());
290    if upto > 0 {
291        let name = &s.as_bytes()[..upto];
292        let s = &s[upto..];
293        let offset_hours = |o| Ok((s, o * 3600));
294        // RFC 2822 requires support for some named North America timezones, a small subset of all
295        // named timezones.
296        if name.eq_ignore_ascii_case(b"gmt")
297            || name.eq_ignore_ascii_case(b"ut")
298            || name.eq_ignore_ascii_case(b"z")
299        {
300            return offset_hours(0);
301        } else if name.eq_ignore_ascii_case(b"edt") {
302            return offset_hours(-4);
303        } else if name.eq_ignore_ascii_case(b"est") || name.eq_ignore_ascii_case(b"cdt") {
304            return offset_hours(-5);
305        } else if name.eq_ignore_ascii_case(b"cst") || name.eq_ignore_ascii_case(b"mdt") {
306            return offset_hours(-6);
307        } else if name.eq_ignore_ascii_case(b"mst") || name.eq_ignore_ascii_case(b"pdt") {
308            return offset_hours(-7);
309        } else if name.eq_ignore_ascii_case(b"pst") {
310            return offset_hours(-8);
311        } else if name.len() == 1 {
312            if let b'a'..=b'i' | b'k'..=b'y' | b'A'..=b'I' | b'K'..=b'Y' = name[0] {
313                // recommended by RFC 2822: consume but treat it as -0000
314                return Ok((s, 0));
315            }
316        }
317        Err(INVALID)
318    } else {
319        timezone_offset(s, |s| Ok(s), false, false, false)
320    }
321}
322
323/// Tries to consume an RFC2822 comment including preceding ` `.
324///
325/// Returns the remaining string after the closing parenthesis.
326pub(super) fn comment_2822(s: &str) -> ParseResult<(&str, ())> {
327    use CommentState::*;
328
329    let s = s.trim_start();
330
331    let mut state = Start;
332    for (i, c) in s.bytes().enumerate() {
333        state = match (state, c) {
334            (Start, b'(') => Next(1),
335            (Next(1), b')') => return Ok((&s[i + 1..], ())),
336            (Next(depth), b'\\') => Escape(depth),
337            (Next(depth), b'(') => Next(depth + 1),
338            (Next(depth), b')') => Next(depth - 1),
339            (Next(depth), _) | (Escape(depth), _) => Next(depth),
340            _ => return Err(INVALID),
341        };
342    }
343
344    Err(TOO_SHORT)
345}
346
347enum CommentState {
348    Start,
349    Next(usize),
350    Escape(usize),
351}
352
#[cfg(test)]
mod tests {
    use super::{
        comment_2822, nanosecond, nanosecond_fixed, short_or_long_month0, short_or_long_weekday,
        timezone_offset_2822,
    };
    use crate::Weekday;
    use crate::format::{INVALID, TOO_SHORT};

    /// Covers empty input, nesting, escapes and trailing data after the comment.
    #[test]
    fn test_rfc2822_comments() {
        let testdata = [
            ("", Err(TOO_SHORT)),
            (" ", Err(TOO_SHORT)),
            ("x", Err(INVALID)),
            ("(", Err(TOO_SHORT)),
            ("()", Ok("")),
            (" \r\n\t()", Ok("")),
            ("() ", Ok(" ")),
            ("()z", Ok("z")),
            ("(x)", Ok("")),
            ("(())", Ok("")),
            ("((()))", Ok("")),
            ("(x(x(x)x)x)", Ok("")),
            ("( x ( x ( x ) x ) x )", Ok("")),
            (r"(\)", Err(TOO_SHORT)),
            (r"(\()", Ok("")),
            (r"(\))", Ok("")),
            (r"(\\)", Ok("")),
            ("(()())", Ok("")),
            ("( x ( x ) x ( x ) x )", Ok("")),
        ];

        for (test_in, expected) in &testdata {
            let actual = comment_2822(test_in).map(|(rest, ())| rest);
            assert_eq!(
                *expected, actual,
                "{test_in:?} expected to produce {expected:?}, but produced {actual:?}."
            );
        }
    }

    /// Legacy names are matched case-insensitively; numeric offsets still work.
    #[test]
    fn test_timezone_offset_2822() {
        let accepted = [("cSt", -21600), ("pSt", -28800), ("mSt", -25200), ("-1551", -57060)];
        for (input, seconds) in accepted {
            assert_eq!(timezone_offset_2822(input), Ok(("", seconds)));
        }
        assert_eq!(timezone_offset_2822("Gp"), Err(INVALID));
    }

    /// Mixed-case abbreviations parse, and a non-ASCII tail is left untouched.
    #[test]
    fn test_short_or_long_month0() {
        let accepted = [
            ("JUn", "", 5),
            ("mAy", "", 4),
            ("AuG", "", 7),
            ("Aprâ", "â", 3),
            ("JUl", "", 6),
            ("mAr", "", 2),
            ("Jan", "", 0),
        ];
        for (input, rest, month0) in accepted {
            assert_eq!(short_or_long_month0(input), Ok((rest, month0)));
        }
    }

    /// An incomplete long-name tail is not consumed.
    #[test]
    fn test_short_or_long_weekday() {
        assert_eq!(short_or_long_weekday("sAtu"), Ok(("u", Weekday::Sat)));
        assert_eq!(short_or_long_weekday("thu"), Ok(("", Weekday::Thu)));
    }

    /// Zero requested digits succeed on empty input; one requested digit does not.
    #[test]
    fn test_nanosecond_fixed() {
        assert_eq!(nanosecond_fixed("", 0usize), Ok(("", 0)));
        assert!(nanosecond_fixed("", 1usize).is_err());
    }

    /// A single digit is scaled to tenths of a second; a non-ASCII tail is left untouched.
    #[test]
    fn test_nanosecond() {
        assert_eq!(nanosecond("2Ù"), Ok(("Ù", 200000000)));
        assert_eq!(nanosecond("8"), Ok(("", 800000000)));
    }
}
chrono/format/scan.rs

chrono/format/
scan.rs