stringprep/
tables.rs

1//! Character Tables
2use std::cmp::Ordering;
3use std::str::Chars;
4use unicode_bidi::{bidi_class, BidiClass};
5use unicode_properties::{GeneralCategoryGroup, UnicodeGeneralCategory};
6
7use super::rfc3454;
8
9/// A.1 Unassigned code points in Unicode 3.2
10pub fn unassigned_code_point(c: char) -> bool {
11    rfc3454::A_1
12        .binary_search_by(|&(start, end)| {
13            if start > c {
14                Ordering::Greater
15            } else if end < c {
16                Ordering::Less
17            } else {
18                Ordering::Equal
19            }
20        })
21        .is_ok()
22}
23
/// B.1 Commonly mapped to nothing
///
/// Returns `true` when `c` is one of the code points listed in the pattern
/// below; runs of adjacent code points are written as inclusive ranges.
pub fn commonly_mapped_to_nothing(c: char) -> bool {
    matches!(
        c,
        '\u{00AD}'
            | '\u{034F}'
            | '\u{1806}'
            | '\u{180B}'..='\u{180D}'
            | '\u{200B}'..='\u{200D}'
            | '\u{2060}'
            | '\u{FE00}'..='\u{FE0F}'
            | '\u{FEFF}'
    )
}
57
58/// B.2 Mapping for case-folding used with NFKC.
59pub fn case_fold_for_nfkc(c: char) -> CaseFoldForNfkc {
60    let inner = match rfc3454::B_2.binary_search_by_key(&c, |e| e.0) {
61        Ok(idx) => FoldInner::Chars(rfc3454::B_2[idx].1.chars()),
62        Err(_) => FoldInner::Char(Some(c)),
63    };
64    CaseFoldForNfkc(inner)
65}
66
/// Private state wrapped by `CaseFoldForNfkc`.
enum FoldInner {
    /// Walks the characters of a `'static` replacement string.
    Chars(Chars<'static>),
    /// Holds at most one pending character; `None` once it has been taken.
    Char(Option<char>),
}
71
/// The iterator returned by `case_fold_for_nfkc`.
///
/// Yields `char` items. The wrapped `FoldInner` state is private, so values
/// of this type can only be obtained from code inside this module.
pub struct CaseFoldForNfkc(FoldInner);
74
75impl Iterator for CaseFoldForNfkc {
76    type Item = char;
77
78    fn next(&mut self) -> Option<char> {
79        match self.0 {
80            FoldInner::Chars(ref mut it) => it.next(),
81            FoldInner::Char(ref mut ch) => ch.take(),
82        }
83    }
84}
85
/// C.1.1 ASCII space characters
///
/// Returns `true` only for U+0020 SPACE.
pub fn ascii_space_character(c: char) -> bool {
    matches!(c, '\u{0020}')
}
90
/// C.1.2 Non-ASCII space characters
///
/// Returns `true` when `c` is one of the code points listed in the pattern
/// below; U+2000 through U+200B are contiguous and written as one range.
pub fn non_ascii_space_character(c: char) -> bool {
    matches!(
        c,
        '\u{00A0}'
            | '\u{1680}'
            | '\u{2000}'..='\u{200B}'
            | '\u{202F}'
            | '\u{205F}'
            | '\u{3000}'
    )
}
114
/// C.2.1 ASCII control characters
///
/// Returns `true` for U+0000..=U+001F and U+007F, which is exactly the set
/// recognised by the standard library's `char::is_ascii_control`.
pub fn ascii_control_character(c: char) -> bool {
    c.is_ascii_control()
}
119
/// C.2.2 Non-ASCII control characters
///
/// Returns `true` when the code point of `c` falls in one of the values or
/// inclusive ranges listed below.
pub fn non_ascii_control_character(c: char) -> bool {
    matches!(
        u32::from(c),
        0x0080..=0x009F
            | 0x06DD
            | 0x070F
            | 0x180E
            | 0x200C..=0x200D
            | 0x2028..=0x2029
            | 0x2060..=0x2063
            | 0x206A..=0x206F
            | 0xFEFF
            | 0xFFF9..=0xFFFC
            | 0x1D173..=0x1D17A
    )
}
139
/// C.3 Private use
///
/// Returns `true` when `c` lies in U+E000..=U+F8FF, U+F0000..=U+FFFFD or
/// U+100000..=U+10FFFD.
pub fn private_use(c: char) -> bool {
    let cp = u32::from(c);
    (0xE000..=0xF8FF).contains(&cp)
        || (0xF_0000..=0xF_FFFD).contains(&cp)
        || (0x10_0000..=0x10_FFFD).contains(&cp)
}
144
/// C.4 Non-character code points
///
/// Returns `true` for U+FDD0..=U+FDEF and for the last two code points
/// (`xFFFE` and `xFFFF`) of every plane from 0 through 16.
pub fn non_character_code_point(c: char) -> bool {
    let cp = u32::from(c);
    // Masking with 0xFFFE ignores both the plane number and the lowest bit,
    // so the comparison holds exactly when the low 16 bits are FFFE or FFFF.
    // A `char` never exceeds U+10FFFF, so only planes 0..=16 can occur.
    let plane_tail = cp & 0xFFFE == 0xFFFE;
    plane_tail || (0xFDD0..=0xFDEF).contains(&cp)
}
166
/// C.5 Surrogate codes
///
/// Always returns `false`. The table covers U+D800..=U+DFFF, but Rust does
/// not allow a `char` to hold a surrogate code point, so no value of `c` can
/// ever fall inside it.
pub fn surrogate_code(c: char) -> bool {
    // The argument exists only to keep the signature in line with the other
    // table predicates; it cannot influence the result.
    let _ = c;
    false
}
176
/// C.6 Inappropriate for plain text
///
/// Returns `true` for the five consecutive code points U+FFF9..=U+FFFD.
pub fn inappropriate_for_plain_text(c: char) -> bool {
    matches!(c, '\u{FFF9}'..='\u{FFFD}')
}
184
/// C.7 Inappropriate for canonical representation
///
/// Returns `true` when `c` lies in U+2FF0..=U+2FFB.
pub fn inappropriate_for_canonical_representation(c: char) -> bool {
    ('\u{2FF0}'..='\u{2FFB}').contains(&c)
}
189
/// C.8 Change display properties or are deprecated
///
/// Returns `true` when `c` falls in one of the four inclusive ranges below,
/// which together cover the individually listed code points of the table.
pub fn change_display_properties_or_deprecated(c: char) -> bool {
    matches!(
        c,
        '\u{0340}'..='\u{0341}'
            | '\u{200E}'..='\u{200F}'
            | '\u{202A}'..='\u{202E}'
            | '\u{206A}'..='\u{206F}'
    )
}
211
/// C.9 Tagging characters
///
/// Returns `true` for U+E0001 and for every code point in
/// U+E0020..=U+E007F.
pub fn tagging_character(c: char) -> bool {
    c == '\u{E0001}' || ('\u{E0020}'..='\u{E007F}').contains(&c)
}
216
217/// D.1 Characters with bidirectional property "R" or "AL"
218pub fn bidi_r_or_al(c: char) -> bool {
219    matches!(bidi_class(c), BidiClass::R | BidiClass::AL)
220}
221
222/// D.2 Characters with bidirectional property "L"
223pub fn bidi_l(c: char) -> bool {
224    matches!(bidi_class(c), BidiClass::L)
225}
226
/// Determines if `c` is to be removed according to section 7.2 of
/// [ITU-T Recommendation X.520 (2019)](https://www.itu.int/rec/T-REC-X.520-201910-I/en).
///
/// The decision is made in three steps: code points on the explicit removal
/// list return `true`; the control characters that are mapped to whitespace
/// instead return `false`; anything else is removed only if
/// `char::is_control` says it is a control character.
pub fn x520_mapped_to_nothing(c: char) -> bool {
    let explicitly_removed = matches!(
        c,
        '\u{00AD}'
            | '\u{034F}'
            | '\u{1806}'
            | '\u{180B}'..='\u{180D}'
            | '\u{200B}'
            | '\u{FE00}'..='\u{FE0F}'
            | '\u{FFFC}'
    );
    if explicitly_removed {
        return true;
    }

    // Technically control characters, but mapped to whitespace in X.520.
    let becomes_whitespace = matches!(c, '\u{09}'..='\u{0D}' | '\u{85}');
    !becomes_whitespace && c.is_control()
}
243
244/// Determines if `c` is to be replaced by SPACE (0x20) according to section 7.2 of
245/// [ITU-T Recommendation X.520 (2019)](https://www.itu.int/rec/T-REC-X.520-201910-I/en).
246pub fn x520_mapped_to_space(c: char) -> bool {
247    match c {
248        '\u{09}' | '\u{0A}'..='\u{0D}' | '\u{85}' => true,
249        _ => c.general_category_group() == GeneralCategoryGroup::Separator,
250    }
251}