encoding_rs/
mem.rs

1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! Functions for converting between different in-RAM representations of text
11//! and for quickly checking if the Unicode Bidirectional Algorithm can be
12//! avoided.
13//!
14//! By using slices for output, the functions here seek to enable by-register
15//! (ALU register or SIMD register as available) operations in order to
16//! outperform iterator-based conversions available in the Rust standard
17//! library.
18//!
19//! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
20//! U+00FF, inclusive, and does not refer to the windows-1252 range. This
21//! in-memory encoding is sometimes used as a storage optimization of text
22//! when UTF-16 indexing and length semantics are exposed.
23//!
//! The FFI bindings for this module are in the
25//! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).
26
27#[cfg(feature = "alloc")]
28use alloc::borrow::Cow;
29#[cfg(feature = "alloc")]
30use alloc::string::String;
31#[cfg(feature = "alloc")]
32use alloc::vec::Vec;
33
34use super::in_inclusive_range16;
35use super::in_inclusive_range32;
36use super::in_inclusive_range8;
37use super::in_range16;
38use super::in_range32;
39use super::DecoderResult;
40use crate::ascii::*;
41use crate::utf_8::*;
42
// Forwards its arguments to `debug_assert!`, except that the assertion is
// skipped entirely when the crate is compiled with `--cfg fuzzing`.
macro_rules! non_fuzz_debug_assert {
    ($($assertion:tt)*) => {
        if !cfg!(fuzzing) {
            debug_assert!($($assertion)*);
        }
    };
}
46
47cfg_if! {
48    if #[cfg(feature = "simd-accel")] {
49        use ::core::intrinsics::likely;
50        use ::core::intrinsics::unlikely;
51    } else {
52        #[inline(always)]
53        fn likely(b: bool) -> bool {
54            b
55        }
56        #[inline(always)]
57        fn unlikely(b: bool) -> bool {
58            b
59        }
60    }
61}
62
/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
///
/// The enum is `#[repr(C)]` with explicit discriminants (`0`, `1`, `2`), and
/// it is `Copy`, so a classification result can be passed around and reused
/// by value.
#[must_use]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}
78
// Mask with the high byte of every 16-bit lane of a `usize` set. It is built
// by replicating 0xFF00 across the word (`!0 / 0xFFFF` is 0x0001 in every
// lane), so the value adapts to the pointer width without a truncating cast.
#[allow(dead_code)]
const LATIN1_MASK: usize = (!0usize / 0xFFFF) * 0xFF00;
82
// Generates `fn $name(buffer: &[$unit]) -> bool`, a word-at-a-time (`usize`)
// check over a slice of code units.
//
// Parameters:
// * `$name`  - name of the generated function.
// * `$unit`  - code unit type of the slice (instantiated with `u8` and `u16`
//              in this file).
// * `$bound` - units compared against this value; a unit `>= $bound` makes
//              the function return `false` on the scalar paths.
// * `$mask`  - name of a `usize` constant ANDed with whole words; a non-zero
//              result makes the function return `false`.
//              NOTE(review): the two tests agree only if `$mask` has, in
//              every `$unit`-sized lane, exactly the bits that occur in
//              values `>= $bound` and `$bound` is a power of two - true for
//              `LATIN1_MASK`/0x100; confirm for the masks defined elsewhere.
//
// The generated function returns `true` for an empty slice. It may read the
// whole slice before answering (units are OR-ed into accumulators and tested
// later), so it is not strictly fail-fast.
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // `offset` counts code units (not bytes) consumed so far; `accu`
            // is the bitwise OR of everything read on the non-unrolled paths.
            let mut offset = 0usize;
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            // Only bother with word reads if at least one whole word of units
            // is present.
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of code units between `src` and the next address that
                // is a multiple of ALU_ALIGNMENT (0 if already aligned).
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                // Proceed only if, after the unaligned head, at least one whole
                // word still fits in the slice.
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        // Scalar pass over the unaligned head; the units are
                        // OR-ed together and tested once after the loop.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest offset at which one whole word can still be read.
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        // Safety: the above check lets us perform 4 consecutive reads of
                        // length ALU_ALIGNMENT / unit_size. ALU_ALIGNMENT is the size of usize, and unit_size
                        // is the size of the element `src` points to, so this is equal to performing four
                        // usize reads. `src.add(offset)` is ALU_ALIGNMENT-aligned because the head loop
                        // above advanced `offset` by exactly `until_alignment` units.
                        //
                        // This invariant is upheld on all loop iterations
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        loop {
                            // Four words are OR-ed and tested together, which
                            // gives an early exit once per unrolled iteration.
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            // Safety: this check lets us continue to perform the 4 reads earlier
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole words (fewer than four): accumulate
                    // without testing; the final mask test below covers them.
                    while offset <= len_minus_stride {
                        // Safety: the above check lets us perform one usize read.
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            // Scalar tail (or the whole slice if the word path was skipped).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu & $mask == 0
        }
    };
}
162
// Generates `fn $name(buffer: &[$unit]) -> bool`, a SIMD-vector-at-a-time
// check over a slice of code units.
//
// Parameters:
// * `$name`    - name of the generated function.
// * `$unit`    - code unit type of the slice.
// * `$splat`   - expression producing the initial (all-zero in this file's
//                instantiations) `$simd_ty` accumulator.
// * `$simd_ty` - SIMD vector type read from the slice.
// * `$bound`   - scalar paths fail when a unit (or the OR of units) is
//                `>= $bound`.
// * `$func`    - predicate applied to a `$simd_ty` value; a `false` result
//                makes the generated function return `false`.
//                NOTE(review): presumably tests that every lane is below
//                `$bound`; the predicates are defined outside this chunk.
//
// The generated function returns `true` for an empty slice and is not
// strictly fail-fast (values are OR-ed into accumulators and tested later).
#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // `offset` counts code units (not bytes); `accu` collects the
            // units read on the scalar paths only.
            let mut offset = 0usize;
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of code units between `src` and the next address that
                // is a multiple of SIMD_ALIGNMENT (0 if already aligned).
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                // Proceed only if a whole vector fits after the unaligned head.
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        // Scalar pass over the unaligned head, tested once
                        // after the loop.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest offset at which one whole vector can still be read.
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        // Safety: the above check lets us perform 4 consecutive reads of
                        // length SIMD_STRIDE_SIZE / unit_size. SIMD_STRIDE_SIZE is the size of $simd_ty, and unit_size
                        // is the size of the element `src` points to, so this is equal to performing four
                        // $simd_ty reads. `src.add(offset)` is SIMD_ALIGNMENT-aligned because the head loop
                        // above advanced `offset` by exactly `until_alignment` units.
                        //
                        // This invariant is upheld on all loop iterations
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        loop {
                            // Four vectors are OR-ed and tested together.
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            // Safety: this check lets us continue to perform the 4 reads earlier
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole vectors (fewer than four): accumulate
                    // and test once after the loop.
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        // Safety: the above check lets us perform one $simd_ty read.
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            // Scalar tail (or the whole slice if the vector path was skipped).
            // At this point `accu` is known to be below `$bound` from the head
            // check, so the final comparison effectively tests the tail.
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu < $bound
        }
    };
}
247
// Selects the implementations of `is_ascii_impl`, `is_basic_latin_impl`,
// `is_utf16_latin1_impl` and `utf16_valid_up_to_impl`: SIMD versions when the
// `simd-accel` feature is enabled on a supported target, ALU (`usize`-based)
// versions otherwise.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        use crate::simd_funcs::*;
        use core::simd::u8x16;
        use core::simd::u16x8;

        // Alignment, in bytes, that SIMD loads in this module are aligned to.
        const SIMD_ALIGNMENT: usize = 16;

        // `address & SIMD_ALIGNMENT_MASK` is the misalignment in bytes.
        const SIMD_ALIGNMENT_MASK: usize = 15;

        by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
        by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
        by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);

        // Returns the number of leading code units of `buffer` that form
        // valid UTF-16, i.e. the index of the first unpaired surrogate or
        // `buffer.len()` if there is none.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            // This function is a mess, because it simultaneously tries to do
            // only aligned SIMD (perhaps misguidedly) and needs to deal with
            // the last code unit in a SIMD stride being part of a valid
            // surrogate pair.
            let unit_size = ::core::mem::size_of::<u16>();
            let src = buffer.as_ptr();
            let len = buffer.len();
            let mut offset = 0usize;
            // Each iteration of 'outer (re)establishes alignment at `offset`
            // and then runs the aligned SIMD loop. Breaking out of it hands
            // the rest of the buffer to the scalar check at the bottom.
            'outer: loop {
                // Code units from `src + offset` to the next SIMD_ALIGNMENT
                // boundary (0 if already aligned).
                let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
                                        SIMD_ALIGNMENT_MASK) / unit_size;
                if until_alignment == 0 {
                    // Already aligned: make sure one whole stride fits.
                    if offset + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                } else {
                    let offset_plus_until_alignment = offset + until_alignment;
                    // The scalar check looks one unit past the boundary so
                    // that a high surrogate right before the boundary can be
                    // matched with its low surrogate.
                    let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
                    if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                    let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
                    if up_to < until_alignment {
                        // Unpaired surrogate before the alignment boundary.
                        return offset + up_to;
                    }
                    if last_valid_low {
                        // The extra unit completed a surrogate pair straddling
                        // the boundary; skip it and recompute the alignment.
                        offset = offset_plus_until_alignment_plus_one;
                        continue;
                    }
                    offset = offset_plus_until_alignment;
                }
                // Largest offset at which a whole stride can still be read.
                let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                loop {
                    let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
                    // The read is in bounds (a whole stride fits at `offset`)
                    // and aligned (established above).
                    // NOTE(review): `contains_surrogates` is defined outside
                    // this chunk; presumably true if any lane is a surrogate.
                    if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
                        if offset_plus_stride == len {
                            // No unit after the stride to look ahead into;
                            // let the scalar tail handle this stride.
                            break 'outer;
                        }
                        // Re-check the stride plus one look-ahead unit with
                        // the scalar validator.
                        let offset_plus_stride_plus_one = offset_plus_stride + 1;
                        let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
                        if up_to < SIMD_STRIDE_SIZE / unit_size {
                            // Unpaired surrogate inside the stride.
                            return offset + up_to;
                        }
                        if last_valid_low {
                            // A pair straddles the stride boundary: skip the
                            // low surrogate and realign via the outer loop.
                            offset = offset_plus_stride_plus_one;
                            continue 'outer;
                        }
                    }
                    offset = offset_plus_stride;
                    if offset > len_minus_stride {
                        break 'outer;
                    }
                }
            }
            // Scalar check of whatever is left.
            let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
            offset + up_to
        }
    } else {
        by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
        by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
        by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);

        // Returns the number of leading code units of `buffer` that form
        // valid UTF-16 by delegating to the scalar validator.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            let (up_to, _) = utf16_valid_up_to_alu(buffer);
            up_to
        }
    }
}
333
/// Scalar UTF-16 validation.
///
/// The first return value is the number of code units at the start of
/// `buffer` that form valid UTF-16, i.e. the index of the first unpaired
/// surrogate, or `buffer.len()` if there is none.
///
/// The second return value is true iff the last code unit of the slice was
/// reached and turned out to be a low surrogate that is part of a valid pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let mut pos = 0usize;
    // Whether the most recently consumed item was a complete surrogate pair.
    let mut ended_on_pair = false;
    while let Some(&unit) = buffer.get(pos) {
        match unit {
            // High surrogate: valid only if a low surrogate follows.
            0xD800..=0xDBFF => match buffer.get(pos + 1).copied() {
                Some(0xDC00..=0xDFFF) => {
                    pos += 2;
                    ended_on_pair = true;
                }
                // Followed by something else, or by nothing: unpaired.
                _ => return (pos, false),
            },
            // Low surrogate without a preceding high surrogate: unpaired.
            0xDC00..=0xDFFF => return (pos, false),
            // Not a surrogate.
            _ => {
                pos += 1;
                ended_on_pair = false;
            }
        }
    }
    (pos, ended_on_pair)
}
379
// `is_str_latin1_impl` returns `None` if `buffer` contains only code points
// up to U+00FF and `Some(index)` otherwise. Because the input is a `&str`
// (valid UTF-8), a byte above 0xC3 can only appear as the lead byte of a
// sequence encoding a code point above U+00FF, so the scalar paths test for
// `> 0xC3`.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        // SIMD variant. Note that on the SIMD failure path the returned index
        // is not necessarily the exact position of the offending byte (see
        // below); the visible caller only checks `is_none()`.
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut offset = 0usize;
            let bytes = buffer.as_bytes();
            let len = bytes.len();
            if len >= SIMD_STRIDE_SIZE {
                let src = bytes.as_ptr();
                // Bytes from `src` to the next SIMD_ALIGNMENT boundary.
                let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK;
                if until_alignment + SIMD_STRIDE_SIZE <= len {
                    // Scalar pass over the unaligned head.
                    while until_alignment != 0 {
                        if bytes[offset] > 0xC3 {
                            return Some(offset);
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest offset at which a whole stride can still be read.
                    let len_minus_stride = len - SIMD_STRIDE_SIZE;
                    loop {
                        // In bounds (a whole stride fits at `offset`) and
                        // aligned (the head loop stopped on the boundary).
                        // NOTE(review): `simd_is_str_latin1` is defined
                        // outside this chunk; presumably false if any lane is
                        // above 0xC3.
                        if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
                            // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
                            // Skip continuation bytes (0b10xx_xxxx) so the
                            // returned index lands on a sequence boundary.
                            while bytes[offset] & 0xC0 == 0x80 {
                                offset += 1;
                            }
                            return Some(offset);
                        }
                        offset += SIMD_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail (or the whole buffer if the SIMD path was skipped).
            for i in offset..len {
                if bytes[i] > 0xC3 {
                    return Some(i);
                }
            }
            None
        }
    } else {
        // Scalar variant built on `validate_ascii`.
        // NOTE(review): `validate_ascii` is defined outside this chunk;
        // presumably it returns `Some((byte, index))` for the first non-ASCII
        // byte and `None` if the slice is all ASCII.
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut bytes = buffer.as_bytes();
            // Index into the original buffer corresponding to `bytes[0]`.
            let mut total = 0;
            loop {
                if let Some((byte, offset)) = validate_ascii(bytes) {
                    total += offset;
                    if byte > 0xC3 {
                        return Some(total);
                    }
                    // In valid UTF-8 a non-ASCII lead byte of at most 0xC3
                    // starts a two-byte sequence, so skip both bytes.
                    bytes = &bytes[offset + 2..];
                    total += 2;
                } else {
                    return None;
                }
            }
        }
    }
}
442
443#[inline(always)]
444fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
445    let mut bytes = buffer;
446    let mut total = 0;
447    loop {
448        if let Some((byte, offset)) = validate_ascii(bytes) {
449            total += offset;
450            if in_inclusive_range8(byte, 0xC2, 0xC3) {
451                let next = offset + 1;
452                if next == bytes.len() {
453                    return Some(total);
454                }
455                if bytes[next] & 0xC0 != 0x80 {
456                    return Some(total);
457                }
458                bytes = &bytes[offset + 2..];
459                total += 2;
460            } else {
461                return Some(total);
462            }
463        } else {
464            return None;
465        }
466    }
467}
468
469cfg_if! {
470    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
471        #[inline(always)]
472        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
473            let mut offset = 0usize;
474            let len = buffer.len();
475            if len >= SIMD_STRIDE_SIZE / 2 {
476                let src = buffer.as_ptr();
477                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
478                                           SIMD_ALIGNMENT_MASK) / 2;
479                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
480                    while until_alignment != 0 {
481                        if is_utf16_code_unit_bidi(buffer[offset]) {
482                            return true;
483                        }
484                        offset += 1;
485                        until_alignment -= 1;
486                    }
487                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
488                    loop {
489                        if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
490                            return true;
491                        }
492                        offset += SIMD_STRIDE_SIZE / 2;
493                        if offset > len_minus_stride {
494                            break;
495                        }
496                    }
497                }
498            }
499            for &u in &buffer[offset..] {
500                if is_utf16_code_unit_bidi(u) {
501                    return true;
502                }
503            }
504            false
505        }
506    } else {
507        #[inline(always)]
508        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
509            for &u in buffer {
510                if is_utf16_code_unit_bidi(u) {
511                    return true;
512                }
513            }
514            false
515        }
516    }
517}
518
// `check_utf16_for_latin1_and_bidi_impl` classifies a UTF-16 buffer:
// * `Latin1Bidi::Latin1` if no code unit is above 0xFF;
// * otherwise, starting from the first place a unit above 0xFF was detected,
//   the rest of the buffer is scanned for bidi code units and the result is
//   `Latin1Bidi::Bidi` if one is found, `Latin1Bidi::LeftToRight` if not.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        // SIMD variant: scalar unaligned head, aligned `u16x8` strides,
        // scalar tail.
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                // Code units from `src` to the next SIMD_ALIGNMENT boundary.
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // This transition isn't optimal, since the alignment is
                            // recomputed, but not tweaking further today.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest offset at which a whole stride can still be read.
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        // In bounds (a whole stride fits at `offset`) and
                        // aligned (the head loop stopped on the boundary).
                        let mut s = unsafe { *(src.add(offset) as *const u16x8) };
                        if !simd_is_latin1(s) {
                            // Non-Latin1 found: from here on only the bidi
                            // question remains, so switch to a bidi-only scan
                            // that reuses the stride already loaded.
                            loop {
                                if is_u16x8_bidi(s) {
                                    return Latin1Bidi::Bidi;
                                }
                                offset += SIMD_STRIDE_SIZE / 2;
                                if offset > len_minus_stride {
                                    // Fewer than a stride left: scalar tail.
                                    for &u in &buffer[offset..] {
                                        if is_utf16_code_unit_bidi(u) {
                                            return Latin1Bidi::Bidi;
                                        }
                                    }
                                    return Latin1Bidi::LeftToRight;
                                }
                                s = unsafe { *(src.add(offset) as *const u16x8) };
                            }
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail (or the whole buffer if the SIMD path was skipped):
            // look for the first unit above 0xFF, then scan the remainder,
            // including that unit, for bidi.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    } else {
        // ALU variant: scalar unaligned head, aligned `usize` words tested
        // against `LATIN1_MASK`, scalar tail.
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= ALU_ALIGNMENT / 2 {
                let src = buffer.as_ptr();
                // Code units from `src` to the next ALU_ALIGNMENT boundary.
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
                                           ALU_ALIGNMENT_MASK) / 2;
                if until_alignment + ALU_ALIGNMENT / 2 <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // First non-Latin1 unit: the rest is a pure bidi
                            // question.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest offset at which a whole word can still be read.
                    let len_minus_stride = len - ALU_ALIGNMENT / 2;
                    loop {
                        // In bounds (a whole word fits at `offset`) and
                        // aligned (the head loop stopped on the boundary).
                        // A non-zero AND means some unit in the word has a
                        // non-zero high byte, i.e. is above 0xFF.
                        if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += ALU_ALIGNMENT / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail (or the whole buffer if the word path was skipped):
            // look for the first unit above 0xFF, then scan the remainder,
            // including that unit, for bidi.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    }
}
649
/// Checks whether the buffer is all-ASCII.
///
/// Returns `true` if every byte in `buffer` is below 0x80 (an empty buffer
/// counts as all-ASCII) and `false` otherwise.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_ascii(buffer: &[u8]) -> bool {
    // Delegates to the implementation selected at compile time (SIMD or ALU).
    is_ascii_impl(buffer)
}
657
/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
/// only ASCII characters).
///
/// Returns `true` if every code unit in `buffer` is below 0x80 (an empty
/// buffer counts as all-Basic Latin) and `false` otherwise.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_basic_latin(buffer: &[u16]) -> bool {
    // Delegates to the implementation selected at compile time (SIMD or ALU).
    is_basic_latin_impl(buffer)
}
666
/// Checks whether the buffer is valid UTF-8 representing only code points
/// less than or equal to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
/// invalidity or code points above U+00FF are discovered.)
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
    // The implementation reports the index of the first problem, if any;
    // only its absence matters here.
    is_utf8_latin1_impl(buffer).is_none()
}
675
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if code
/// points above U+00FF are discovered.)
pub fn is_str_latin1(buffer: &str) -> bool {
    // The implementation reports an index when a code point above U+00FF is
    // found; only its absence matters here.
    is_str_latin1_impl(buffer).is_none()
}
684
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Returns `true` if every code unit in `buffer` is below 0x100 (an empty
/// buffer counts as Latin1) and `false` otherwise.
///
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
    // Delegates to the implementation selected at compile time (SIMD or ALU).
    is_utf16_latin1_impl(buffer)
}
693
694/// Checks whether a potentially-invalid UTF-8 buffer contains code points
695/// that trigger right-to-left processing.
696///
697/// The check is done on a Unicode block basis without regard to assigned
698/// vs. unassigned code points in the block. Hebrew presentation forms in
699/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it is treated as right-to-left). Additionally,
701/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
702/// for. Control characters that are technically bidi controls but do not
703/// cause right-to-left behavior without the presence of right-to-left
704/// characters or right-to-left controls are not checked for. As a special
705/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
706///
707/// Returns `true` if the input is invalid UTF-8 or the input contains an
708/// RTL character. Returns `false` if the input is valid UTF-8 and contains
709/// no RTL characters.
710#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
711#[inline]
712pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
713    // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
714    // than UTF-8 validation followed by `is_str_bidi()` for German,
715    // Russian and Japanese. However, this is considerably slower for Thai.
716    // Chances are that the compiler makes some branch predictions that are
717    // unfortunate for Thai. Not spending the time to manually optimize
718    // further at this time, since it's unclear if this variant even has
719    // use cases. However, this is worth revisiting once Rust gets the
720    // ability to annotate relative priorities of match arms.
721
722    // U+058F: D6 8F
723    // U+0590: D6 90
724    // U+08FF: E0 A3 BF
725    // U+0900: E0 A4 80
726    //
727    // U+200F: E2 80 8F
728    // U+202B: E2 80 AB
729    // U+202E: E2 80 AE
730    // U+2067: E2 81 A7
731    //
732    // U+FB1C: EF AC 9C
733    // U+FB1D: EF AC 9D
734    // U+FDFF: EF B7 BF
735    // U+FE00: EF B8 80
736    //
737    // U+FE6F: EF B9 AF
738    // U+FE70: EF B9 B0
739    // U+FEFE: EF BB BE
740    // U+FEFF: EF BB BF
741    //
742    // U+107FF: F0 90 9F BF
743    // U+10800: F0 90 A0 80
744    // U+10FFF: F0 90 BF BF
745    // U+11000: F0 91 80 80
746    //
747    // U+1E7FF: F0 9E 9F BF
748    // U+1E800: F0 9E A0 80
749    // U+1EFFF: F0 9E BF BF
750    // U+1F000: F0 9F 80 80
751    let mut src = buffer;
752    'outer: loop {
753        if let Some((mut byte, mut read)) = validate_ascii(src) {
754            // Check for the longest sequence to avoid checking twice for the
755            // multi-byte sequences.
756            if read + 4 <= src.len() {
757                'inner: loop {
758                    // At this point, `byte` is not included in `read`.
759                    match byte {
760                        0..=0x7F => {
761                            // ASCII: go back to SIMD.
762                            read += 1;
763                            src = &src[read..];
764                            continue 'outer;
765                        }
766                        0xC2..=0xD5 => {
767                            // Two-byte
768                            let second = unsafe { *(src.get_unchecked(read + 1)) };
769                            if !in_inclusive_range8(second, 0x80, 0xBF) {
770                                return true;
771                            }
772                            read += 2;
773                        }
774                        0xD6 => {
775                            // Two-byte
776                            let second = unsafe { *(src.get_unchecked(read + 1)) };
777                            if !in_inclusive_range8(second, 0x80, 0xBF) {
778                                return true;
779                            }
780                            // XXX consider folding the above and below checks
781                            if second > 0x8F {
782                                return true;
783                            }
784                            read += 2;
785                        }
786                        // two-byte starting with 0xD7 and above is bidi
787                        0xE1 | 0xE3..=0xEC | 0xEE => {
788                            // Three-byte normal
789                            let second = unsafe { *(src.get_unchecked(read + 1)) };
790                            let third = unsafe { *(src.get_unchecked(read + 2)) };
791                            if ((UTF8_DATA.table[usize::from(second)]
792                                & unsafe {
793                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
794                                })
795                                | (third >> 6))
796                                != 2
797                            {
798                                return true;
799                            }
800                            read += 3;
801                        }
802                        0xE2 => {
803                            // Three-byte normal, potentially bidi
804                            let second = unsafe { *(src.get_unchecked(read + 1)) };
805                            let third = unsafe { *(src.get_unchecked(read + 2)) };
806                            if ((UTF8_DATA.table[usize::from(second)]
807                                & unsafe {
808                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
809                                })
810                                | (third >> 6))
811                                != 2
812                            {
813                                return true;
814                            }
815                            if second == 0x80 {
816                                if third == 0x8F || third == 0xAB || third == 0xAE {
817                                    return true;
818                                }
819                            } else if second == 0x81 {
820                                if third == 0xA7 {
821                                    return true;
822                                }
823                            }
824                            read += 3;
825                        }
826                        0xEF => {
827                            // Three-byte normal, potentially bidi
828                            let second = unsafe { *(src.get_unchecked(read + 1)) };
829                            let third = unsafe { *(src.get_unchecked(read + 2)) };
830                            if ((UTF8_DATA.table[usize::from(second)]
831                                & unsafe {
832                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
833                                })
834                                | (third >> 6))
835                                != 2
836                            {
837                                return true;
838                            }
839                            if in_inclusive_range8(second, 0xAC, 0xB7) {
840                                if second == 0xAC {
841                                    if third > 0x9C {
842                                        return true;
843                                    }
844                                } else {
845                                    return true;
846                                }
847                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
848                                if second == 0xB9 {
849                                    if third > 0xAF {
850                                        return true;
851                                    }
852                                } else if second == 0xBB {
853                                    if third != 0xBF {
854                                        return true;
855                                    }
856                                } else {
857                                    return true;
858                                }
859                            }
860                            read += 3;
861                        }
862                        0xE0 => {
863                            // Three-byte special lower bound, potentially bidi
864                            let second = unsafe { *(src.get_unchecked(read + 1)) };
865                            let third = unsafe { *(src.get_unchecked(read + 2)) };
866                            if ((UTF8_DATA.table[usize::from(second)]
867                                & unsafe {
868                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
869                                })
870                                | (third >> 6))
871                                != 2
872                            {
873                                return true;
874                            }
875                            // XXX can this be folded into the above validity check
876                            if second < 0xA4 {
877                                return true;
878                            }
879                            read += 3;
880                        }
881                        0xED => {
882                            // Three-byte special upper bound
883                            let second = unsafe { *(src.get_unchecked(read + 1)) };
884                            let third = unsafe { *(src.get_unchecked(read + 2)) };
885                            if ((UTF8_DATA.table[usize::from(second)]
886                                & unsafe {
887                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
888                                })
889                                | (third >> 6))
890                                != 2
891                            {
892                                return true;
893                            }
894                            read += 3;
895                        }
896                        0xF1..=0xF4 => {
897                            // Four-byte normal
898                            let second = unsafe { *(src.get_unchecked(read + 1)) };
899                            let third = unsafe { *(src.get_unchecked(read + 2)) };
900                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
901                            if (u16::from(
902                                UTF8_DATA.table[usize::from(second)]
903                                    & unsafe {
904                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
905                                    },
906                            ) | u16::from(third >> 6)
907                                | (u16::from(fourth & 0xC0) << 2))
908                                != 0x202
909                            {
910                                return true;
911                            }
912                            read += 4;
913                        }
914                        0xF0 => {
915                            // Four-byte special lower bound, potentially bidi
916                            let second = unsafe { *(src.get_unchecked(read + 1)) };
917                            let third = unsafe { *(src.get_unchecked(read + 2)) };
918                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
919                            if (u16::from(
920                                UTF8_DATA.table[usize::from(second)]
921                                    & unsafe {
922                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
923                                    },
924                            ) | u16::from(third >> 6)
925                                | (u16::from(fourth & 0xC0) << 2))
926                                != 0x202
927                            {
928                                return true;
929                            }
930                            if unlikely(second == 0x90 || second == 0x9E) {
931                                let third = src[read + 2];
932                                if third >= 0xA0 {
933                                    return true;
934                                }
935                            }
936                            read += 4;
937                        }
938                        _ => {
939                            // Invalid lead or bidi-only lead
940                            return true;
941                        }
942                    }
943                    if read + 4 > src.len() {
944                        if read == src.len() {
945                            return false;
946                        }
947                        byte = src[read];
948                        break 'inner;
949                    }
950                    byte = src[read];
951                    continue 'inner;
952                }
953            }
954            // We can't have a complete 4-byte sequence, but we could still have
955            // a complete shorter sequence.
956
957            // At this point, `byte` is not included in `read`.
958            match byte {
959                0..=0x7F => {
960                    // ASCII: go back to SIMD.
961                    read += 1;
962                    src = &src[read..];
963                    continue 'outer;
964                }
965                0xC2..=0xD5 => {
966                    // Two-byte
967                    let new_read = read + 2;
968                    if new_read > src.len() {
969                        return true;
970                    }
971                    let second = unsafe { *(src.get_unchecked(read + 1)) };
972                    if !in_inclusive_range8(second, 0x80, 0xBF) {
973                        return true;
974                    }
975                    read = new_read;
976                    // We need to deal with the case where we came here with 3 bytes
977                    // left, so we need to take a look at the last one.
978                    src = &src[read..];
979                    continue 'outer;
980                }
981                0xD6 => {
982                    // Two-byte, potentially bidi
983                    let new_read = read + 2;
984                    if new_read > src.len() {
985                        return true;
986                    }
987                    let second = unsafe { *(src.get_unchecked(read + 1)) };
988                    if !in_inclusive_range8(second, 0x80, 0xBF) {
989                        return true;
990                    }
991                    // XXX consider folding the above and below checks
992                    if second > 0x8F {
993                        return true;
994                    }
995                    read = new_read;
996                    // We need to deal with the case where we came here with 3 bytes
997                    // left, so we need to take a look at the last one.
998                    src = &src[read..];
999                    continue 'outer;
1000                }
1001                // two-byte starting with 0xD7 and above is bidi
1002                0xE1 | 0xE3..=0xEC | 0xEE => {
1003                    // Three-byte normal
1004                    let new_read = read + 3;
1005                    if new_read > src.len() {
1006                        return true;
1007                    }
1008                    let second = unsafe { *(src.get_unchecked(read + 1)) };
1009                    let third = unsafe { *(src.get_unchecked(read + 2)) };
1010                    if ((UTF8_DATA.table[usize::from(second)]
1011                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1012                        | (third >> 6))
1013                        != 2
1014                    {
1015                        return true;
1016                    }
1017                }
1018                0xE2 => {
1019                    // Three-byte normal, potentially bidi
1020                    let new_read = read + 3;
1021                    if new_read > src.len() {
1022                        return true;
1023                    }
1024                    let second = unsafe { *(src.get_unchecked(read + 1)) };
1025                    let third = unsafe { *(src.get_unchecked(read + 2)) };
1026                    if ((UTF8_DATA.table[usize::from(second)]
1027                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1028                        | (third >> 6))
1029                        != 2
1030                    {
1031                        return true;
1032                    }
1033                    if second == 0x80 {
1034                        if third == 0x8F || third == 0xAB || third == 0xAE {
1035                            return true;
1036                        }
1037                    } else if second == 0x81 {
1038                        if third == 0xA7 {
1039                            return true;
1040                        }
1041                    }
1042                }
1043                0xEF => {
1044                    // Three-byte normal, potentially bidi
1045                    let new_read = read + 3;
1046                    if new_read > src.len() {
1047                        return true;
1048                    }
1049                    let second = unsafe { *(src.get_unchecked(read + 1)) };
1050                    let third = unsafe { *(src.get_unchecked(read + 2)) };
1051                    if ((UTF8_DATA.table[usize::from(second)]
1052                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1053                        | (third >> 6))
1054                        != 2
1055                    {
1056                        return true;
1057                    }
1058                    if in_inclusive_range8(second, 0xAC, 0xB7) {
1059                        if second == 0xAC {
1060                            if third > 0x9C {
1061                                return true;
1062                            }
1063                        } else {
1064                            return true;
1065                        }
1066                    } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1067                        if second == 0xB9 {
1068                            if third > 0xAF {
1069                                return true;
1070                            }
1071                        } else if second == 0xBB {
1072                            if third != 0xBF {
1073                                return true;
1074                            }
1075                        } else {
1076                            return true;
1077                        }
1078                    }
1079                }
1080                0xE0 => {
1081                    // Three-byte special lower bound, potentially bidi
1082                    let new_read = read + 3;
1083                    if new_read > src.len() {
1084                        return true;
1085                    }
1086                    let second = unsafe { *(src.get_unchecked(read + 1)) };
1087                    let third = unsafe { *(src.get_unchecked(read + 2)) };
1088                    if ((UTF8_DATA.table[usize::from(second)]
1089                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1090                        | (third >> 6))
1091                        != 2
1092                    {
1093                        return true;
1094                    }
1095                    // XXX can this be folded into the above validity check
1096                    if second < 0xA4 {
1097                        return true;
1098                    }
1099                }
1100                0xED => {
1101                    // Three-byte special upper bound
1102                    let new_read = read + 3;
1103                    if new_read > src.len() {
1104                        return true;
1105                    }
1106                    let second = unsafe { *(src.get_unchecked(read + 1)) };
1107                    let third = unsafe { *(src.get_unchecked(read + 2)) };
1108                    if ((UTF8_DATA.table[usize::from(second)]
1109                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1110                        | (third >> 6))
1111                        != 2
1112                    {
1113                        return true;
1114                    }
1115                }
1116                _ => {
1117                    // Invalid lead, 4-byte lead or 2-byte bidi-only lead
1118                    return true;
1119                }
1120            }
1121            return false;
1122        } else {
1123            return false;
1124        }
1125    }
1126}
1127
1128/// Checks whether a valid UTF-8 buffer contains code points that trigger
1129/// right-to-left processing.
1130///
1131/// The check is done on a Unicode block basis without regard to assigned
1132/// vs. unassigned code points in the block. Hebrew presentation forms in
1133/// the Alphabetic Presentation Forms block are treated as if they formed
1134/// a block on their own (i.e. it treated as right-to-left). Additionally,
1135/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1136/// for. Control characters that are technically bidi controls but do not
1137/// cause right-to-left behavior without the presence of right-to-left
1138/// characters or right-to-left controls are not checked for. As a special
1139/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1140#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
1141#[inline]
1142pub fn is_str_bidi(buffer: &str) -> bool {
1143    // U+058F: D6 8F
1144    // U+0590: D6 90
1145    // U+08FF: E0 A3 BF
1146    // U+0900: E0 A4 80
1147    //
1148    // U+200F: E2 80 8F
1149    // U+202B: E2 80 AB
1150    // U+202E: E2 80 AE
1151    // U+2067: E2 81 A7
1152    //
1153    // U+FB1C: EF AC 9C
1154    // U+FB1D: EF AC 9D
1155    // U+FDFF: EF B7 BF
1156    // U+FE00: EF B8 80
1157    //
1158    // U+FE6F: EF B9 AF
1159    // U+FE70: EF B9 B0
1160    // U+FEFE: EF BB BE
1161    // U+FEFF: EF BB BF
1162    //
1163    // U+107FF: F0 90 9F BF
1164    // U+10800: F0 90 A0 80
1165    // U+10FFF: F0 90 BF BF
1166    // U+11000: F0 91 80 80
1167    //
1168    // U+1E7FF: F0 9E 9F BF
1169    // U+1E800: F0 9E A0 80
1170    // U+1EFFF: F0 9E BF BF
1171    // U+1F000: F0 9F 80 80
1172    let mut bytes = buffer.as_bytes();
1173    'outer: loop {
1174        // TODO: Instead of just validating ASCII using SIMD, use SIMD
1175        // to check for non-ASCII lead bytes, too, to quickly conclude
1176        // that the vector consist entirely of CJK and below-Hebrew
1177        // code points.
1178        // Unfortunately, scripts above Arabic but below CJK share
1179        // lead bytes with RTL.
1180        if let Some((mut byte, mut read)) = validate_ascii(bytes) {
1181            'inner: loop {
1182                // At this point, `byte` is not included in `read`.
1183                if byte < 0xE0 {
1184                    if byte >= 0x80 {
1185                        // Two-byte
1186                        // Adding `unlikely` here improved throughput on
1187                        // Russian plain text by 33%!
1188                        if unlikely(byte >= 0xD6) {
1189                            if byte == 0xD6 {
1190                                let second = bytes[read + 1];
1191                                if second > 0x8F {
1192                                    return true;
1193                                }
1194                            } else {
1195                                return true;
1196                            }
1197                        }
1198                        read += 2;
1199                    } else {
1200                        // ASCII: write and go back to SIMD.
1201                        read += 1;
1202                        // Intuitively, we should go back to the outer loop only
1203                        // if byte is 0x30 or above, so as to avoid trashing on
1204                        // ASCII space, comma and period in non-Latin context.
1205                        // However, the extra branch seems to cost more than it's
1206                        // worth.
1207                        bytes = &bytes[read..];
1208                        continue 'outer;
1209                    }
1210                } else if byte < 0xF0 {
1211                    // Three-byte
1212                    if unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) {
1213                        let second = bytes[read + 1];
1214                        if byte == 0xE0 {
1215                            if second < 0xA4 {
1216                                return true;
1217                            }
1218                        } else if byte == 0xE2 {
1219                            let third = bytes[read + 2];
1220                            if second == 0x80 {
1221                                if third == 0x8F || third == 0xAB || third == 0xAE {
1222                                    return true;
1223                                }
1224                            } else if second == 0x81 {
1225                                if third == 0xA7 {
1226                                    return true;
1227                                }
1228                            }
1229                        } else {
1230                            debug_assert_eq!(byte, 0xEF);
1231                            if in_inclusive_range8(second, 0xAC, 0xB7) {
1232                                if second == 0xAC {
1233                                    let third = bytes[read + 2];
1234                                    if third > 0x9C {
1235                                        return true;
1236                                    }
1237                                } else {
1238                                    return true;
1239                                }
1240                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1241                                if second == 0xB9 {
1242                                    let third = bytes[read + 2];
1243                                    if third > 0xAF {
1244                                        return true;
1245                                    }
1246                                } else if second == 0xBB {
1247                                    let third = bytes[read + 2];
1248                                    if third != 0xBF {
1249                                        return true;
1250                                    }
1251                                } else {
1252                                    return true;
1253                                }
1254                            }
1255                        }
1256                    }
1257                    read += 3;
1258                } else {
1259                    // Four-byte
1260                    let second = bytes[read + 1];
1261                    if unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) {
1262                        let third = bytes[read + 2];
1263                        if third >= 0xA0 {
1264                            return true;
1265                        }
1266                    }
1267                    read += 4;
1268                }
1269                // The comparison is always < or == and never >, but including
1270                // > here to let the compiler assume that < is true if this
1271                // comparison is false.
1272                if read >= bytes.len() {
1273                    return false;
1274                }
1275                byte = bytes[read];
1276                continue 'inner;
1277            }
1278        } else {
1279            return false;
1280        }
1281    }
1282}
1283
/// Checks whether a UTF-16 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left).
/// Additionally, the four RIGHT-TO-LEFT FOO controls in General Punctuation
/// are checked for. Control characters that are technically bidi controls
/// but do not cause right-to-left behavior without the presence of
/// right-to-left characters or right-to-left controls are not checked for.
/// As a special case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input contains an RTL character or an unpaired
/// high surrogate that could be the high half of an RTL character.
/// Returns `false` if the input contains neither RTL characters nor
/// unpaired high surrogates that could be higher halves of RTL characters.
pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
    // Pure delegation: the result of the implementation helper is returned
    // unchanged.
    is_utf16_bidi_impl(buffer)
}
1304
1305/// Checks whether a scalar value triggers right-to-left processing.
1306///
1307/// The check is done on a Unicode block basis without regard to assigned
1308/// vs. unassigned code points in the block. Hebrew presentation forms in
1309/// the Alphabetic Presentation Forms block are treated as if they formed
1310/// a block on their own (i.e. it treated as right-to-left). Additionally,
1311/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1312/// for. Control characters that are technically bidi controls but do not
1313/// cause right-to-left behavior without the presence of right-to-left
1314/// characters or right-to-left controls are not checked for. As a special
1315/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1316#[inline(always)]
1317pub fn is_char_bidi(c: char) -> bool {
1318    // Controls:
1319    // Every control with RIGHT-TO-LEFT in its name in
1320    // https://www.unicode.org/charts/PDF/U2000.pdf
1321    // U+200F RLM
1322    // U+202B RLE
1323    // U+202E RLO
1324    // U+2067 RLI
1325    //
1326    // BMP RTL:
1327    // https://www.unicode.org/roadmaps/bmp/
1328    // U+0590...U+08FF
1329    // U+FB1D...U+FDFF Hebrew presentation forms and
1330    //                 Arabic Presentation Forms A
1331    // U+FE70...U+FEFE Arabic Presentation Forms B (excl. BOM)
1332    //
1333    // Supplementary RTL:
1334    // https://www.unicode.org/roadmaps/smp/
1335    // U+10800...U+10FFF (Lead surrogate U+D802 or U+D803)
1336    // U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B)
1337    let code_point = u32::from(c);
1338    if code_point < 0x0590 {
1339        // Below Hebrew
1340        return false;
1341    }
1342    if in_range32(code_point, 0x0900, 0xFB1D) {
1343        // Above Arabic Extended-A and below Hebrew presentation forms
1344        if in_inclusive_range32(code_point, 0x200F, 0x2067) {
1345            // In the range that contains the RTL controls
1346            return code_point == 0x200F
1347                || code_point == 0x202B
1348                || code_point == 0x202E
1349                || code_point == 0x2067;
1350        }
1351        return false;
1352    }
1353    if code_point > 0x1EFFF {
1354        // Above second astral RTL. (Emoji is here.)
1355        return false;
1356    }
1357    if in_range32(code_point, 0x11000, 0x1E800) {
1358        // Between astral RTL blocks
1359        return false;
1360    }
1361    if in_range32(code_point, 0xFEFF, 0x10800) {
1362        // Above Arabic Presentations Forms B (excl. BOM) and below first
1363        // astral RTL
1364        return false;
1365    }
1366    if in_range32(code_point, 0xFE00, 0xFE70) {
1367        // Between Arabic Presentations Forms
1368        return false;
1369    }
1370    true
1371}
1372
1373/// Checks whether a UTF-16 code unit triggers right-to-left processing.
1374///
1375/// The check is done on a Unicode block basis without regard to assigned
1376/// vs. unassigned code points in the block. Hebrew presentation forms in
1377/// the Alphabetic Presentation Forms block are treated as if they formed
1378/// a block on their own (i.e. it treated as right-to-left). Additionally,
1379/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1380/// for. Control characters that are technically bidi controls but do not
1381/// cause right-to-left behavior without the presence of right-to-left
1382/// characters or right-to-left controls are not checked for. As a special
1383/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1384///
1385/// Since supplementary-plane right-to-left blocks are identifiable from the
1386/// high surrogate without examining the low surrogate, this function returns
1387/// `true` for such high surrogates making the function suitable for handling
1388/// supplementary-plane text without decoding surrogate pairs to scalar
1389/// values. Obviously, such high surrogates are then reported as right-to-left
1390/// even if actually unpaired.
1391#[inline(always)]
1392pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
1393    if u < 0x0590 {
1394        // Below Hebrew
1395        return false;
1396    }
1397    if in_range16(u, 0x0900, 0xD802) {
1398        // Above Arabic Extended-A and below first RTL surrogate
1399        if in_inclusive_range16(u, 0x200F, 0x2067) {
1400            // In the range that contains the RTL controls
1401            return u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067;
1402        }
1403        return false;
1404    }
1405    if in_range16(u, 0xD83C, 0xFB1D) {
1406        // Between astral RTL high surrogates and Hebrew presentation forms
1407        // (Emoji is here)
1408        return false;
1409    }
1410    if in_range16(u, 0xD804, 0xD83A) {
1411        // Between RTL high surragates
1412        return false;
1413    }
1414    if u > 0xFEFE {
1415        // Above Arabic Presentation Forms (excl. BOM)
1416        return false;
1417    }
1418    if in_range16(u, 0xFE00, 0xFE70) {
1419        // Between Arabic Presentations Forms
1420        return false;
1421    }
1422    true
1423}
1424
1425/// Checks whether a potentially invalid UTF-8 buffer contains code points
1426/// that trigger right-to-left processing or is all-Latin1.
1427///
1428/// Possibly more efficient than performing the checks separately.
1429///
1430/// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
1431/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
1432/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1433pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
1434    if let Some(offset) = is_utf8_latin1_impl(buffer) {
1435        if is_utf8_bidi(&buffer[offset..]) {
1436            Latin1Bidi::Bidi
1437        } else {
1438            Latin1Bidi::LeftToRight
1439        }
1440    } else {
1441        Latin1Bidi::Latin1
1442    }
1443}
1444
1445/// Checks whether a valid UTF-8 buffer contains code points
1446/// that trigger right-to-left processing or is all-Latin1.
1447///
1448/// Possibly more efficient than performing the checks separately.
1449///
1450/// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
1451/// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
1452/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1453pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
1454    // The transition from the latin1 check to the bidi check isn't
1455    // optimal but not tweaking it to perfection today.
1456    if let Some(offset) = is_str_latin1_impl(buffer) {
1457        if is_str_bidi(&buffer[offset..]) {
1458            Latin1Bidi::Bidi
1459        } else {
1460            Latin1Bidi::LeftToRight
1461        }
1462    } else {
1463        Latin1Bidi::Latin1
1464    }
1465}
1466
1467/// Checks whether a potentially invalid UTF-16 buffer contains code points
1468/// that trigger right-to-left processing or is all-Latin1.
1469///
1470/// Possibly more efficient than performing the checks separately.
1471///
1472/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
1473/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
1474/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1475pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
1476    check_utf16_for_latin1_and_bidi_impl(buffer)
1477}
1478
1479/// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
1480/// with the REPLACEMENT CHARACTER.
1481///
1482/// The length of the destination buffer must be at least the length of the
1483/// source buffer _plus one_.
1484///
1485/// Returns the number of `u16`s written.
1486///
1487/// # Panics
1488///
1489/// Panics if the destination buffer is shorter than stated above.
1490pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
1491    // TODO: Can the requirement for dst to be at least one unit longer
1492    // be eliminated?
1493    assert!(dst.len() > src.len());
1494    let mut decoder = Utf8Decoder::new_inner();
1495    let mut total_read = 0usize;
1496    let mut total_written = 0usize;
1497    loop {
1498        let (result, read, written) =
1499            decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
1500        total_read += read;
1501        total_written += written;
1502        match result {
1503            DecoderResult::InputEmpty => {
1504                return total_written;
1505            }
1506            DecoderResult::OutputFull => {
1507                unreachable!("The assert at the top of the function should have caught this.");
1508            }
1509            DecoderResult::Malformed(_, _) => {
1510                // There should always be space for the U+FFFD, because
1511                // otherwise we'd have gotten OutputFull already.
1512                dst[total_written] = 0xFFFD;
1513                total_written += 1;
1514            }
1515        }
1516    }
1517}
1518
/// Converts valid UTF-8 to valid UTF-16.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// Returns the number of `u16`s written.
///
/// The implementation alternates between a bulk ASCII copy (outer loop) and
/// a byte-at-a-time decoder for non-ASCII sequences (inner loop). Because
/// `src` is a `&str`, the input is known to be well-formed UTF-8, which is
/// what the unchecked reads of continuation bytes rely on.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    let bytes = src.as_bytes();
    // `read` indexes into `bytes`, `written` into `dst`. Every branch below
    // advances `written` by no more than it advances `read` (1:1 for ASCII,
    // 1 unit for 2 or 3 bytes, 2 units for 4 bytes), so `written <= read`
    // holds throughout; together with the assert above this keeps the
    // unchecked writes to `dst` in bounds.
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Bulk-copy the ASCII run at the current position and pick up the
        // first non-ASCII byte, if any.
        let mut byte = {
            let src_remaining = &bytes[read..];
            let dst_remaining = &mut dst[written..];
            let length = src_remaining.len();
            // SAFETY: `dst_remaining` is at least `length` units long, since
            // `dst.len() >= src.len()` and `written <= read`.
            // NOTE(review): presumably the helper requires both pointers to
            // be valid for `length` units; confirm against its definition.
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    // The whole remainder was ASCII and has been copied.
                    written += length;
                    return written;
                }
                Some((non_ascii, consumed)) => {
                    // `consumed` ASCII bytes were copied; `non_ascii` is the
                    // byte that stopped the run and is not yet counted.
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        'inner: loop {
            // At this point, `byte` is not included in `read`.
            if byte < 0xE0 {
                if byte >= 0x80 {
                    // Two-byte
                    // SAFETY: `src` is valid UTF-8, so a two-byte lead is
                    // followed by one continuation byte inside `bytes`.
                    let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                    let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                    // SAFETY: `written <= read < src.len() <= dst.len()`.
                    unsafe { *(dst.get_unchecked_mut(written)) = point };
                    read += 2;
                    written += 1;
                } else {
                    // ASCII: write and go back to SIMD.
                    // SAFETY: `written <= read < src.len() <= dst.len()`.
                    unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                    read += 1;
                    written += 1;
                    // Intuitively, we should go back to the outer loop only
                    // if byte is 0x30 or above, so as to avoid trashing on
                    // ASCII space, comma and period in non-Latin context.
                    // However, the extra branch seems to cost more than it's
                    // worth.
                    continue 'outer;
                }
            } else if byte < 0xF0 {
                // Three-byte
                // SAFETY: `src` is valid UTF-8, so a three-byte lead is
                // followed by two continuation bytes inside `bytes`.
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                // SAFETY: `written <= read < src.len() <= dst.len()`.
                unsafe { *(dst.get_unchecked_mut(written)) = point };
                read += 3;
                written += 1;
            } else {
                // Four-byte
                // SAFETY: `src` is valid UTF-8, so a four-byte lead is
                // followed by three continuation bytes inside `bytes`.
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // 0xD7C0 is 0xD800 - (0x10000 >> 10): adding it to
                // `point >> 10` subtracts the supplementary-plane offset and
                // adds the high-surrogate base in one step.
                // SAFETY: four source bytes become two units, so
                // `written + 1 < read + 4 <= src.len() <= dst.len()`.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;
            }
            // The comparison is always < or == and never >, but including
            // > here to let the compiler assume that < is true if this
            // comparison is false.
            if read >= src.len() {
                return written;
            }
            // Stay in the byte-at-a-time decoder; an ASCII byte sends
            // control back to the bulk path via the branch above.
            byte = bytes[read];
            continue 'inner;
        }
    }
}
1615
1616/// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
1617///
1618/// The length of the destination buffer must be at least the length of the
1619/// source buffer.
1620///
1621/// Returns the number of `u16`s written or `None` if the input was invalid.
1622///
1623/// When the input was invalid, some output may have been written.
1624///
1625/// # Panics
1626///
1627/// Panics if the destination buffer is shorter than stated above.
1628pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> {
1629    assert!(
1630        dst.len() >= src.len(),
1631        "Destination must not be shorter than the source."
1632    );
1633    let (read, written) = convert_utf8_to_utf16_up_to_invalid(src, dst);
1634    if read == src.len() {
1635        return Some(written);
1636    }
1637    None
1638}
1639
1640/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1641/// with the REPLACEMENT CHARACTER with potentially insufficient output
1642/// space.
1643///
1644/// Returns the number of code units read and the number of bytes written.
1645///
1646/// Guarantees that the bytes in the destination beyond the number of
1647/// bytes claimed as written by the second item of the return tuple
1648/// are left unmodified.
1649///
1650/// Not all code units are read if there isn't enough output space.
1651///
1652/// Note  that this method isn't designed for general streamability but for
1653/// not allocating memory for the worst case up front. Specifically,
1654/// if the input starts with or ends with an unpaired surrogate, those are
1655/// replaced with the REPLACEMENT CHARACTER.
1656///
1657/// Matches the semantics of `TextEncoder.encodeInto()` from the
1658/// Encoding Standard.
1659///
1660/// # Safety
1661///
1662/// If you want to convert into a `&mut str`, use
1663/// `convert_utf16_to_str_partial()` instead of using this function
1664/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
1665#[inline(always)]
1666pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
1667    // The two functions called below are marked `inline(never)` to make
1668    // transitions from the hot part (first function) into the cold part
1669    // (second function) go through a return and another call to discouge
1670    // the CPU from speculating from the hot code into the cold code.
1671    // Letting the transitions be mere intra-function jumps, even to
1672    // basic blocks out-of-lined to the end of the function would wipe
1673    // away a quarter of Arabic encode performance on Haswell!
1674    let (read, written) = convert_utf16_to_utf8_partial_inner(src, dst);
1675    if likely(read == src.len()) {
1676        return (read, written);
1677    }
1678    let (tail_read, tail_written) =
1679        convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]);
1680    (read + tail_read, written + tail_written)
1681}
1682
1683/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1684/// with the REPLACEMENT CHARACTER.
1685///
1686/// The length of the destination buffer must be at least the length of the
1687/// source buffer times three.
1688///
1689/// Returns the number of bytes written.
1690///
1691/// # Panics
1692///
1693/// Panics if the destination buffer is shorter than stated above.
1694///
1695/// # Safety
1696///
1697/// If you want to convert into a `&mut str`, use `convert_utf16_to_str()`
1698/// instead of using this function together with the `unsafe` method
1699/// `as_bytes_mut()` on `&mut str`.
1700#[inline(always)]
1701pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
1702    assert!(dst.len() >= src.len() * 3);
1703    let (read, written) = convert_utf16_to_utf8_partial(src, dst);
1704    debug_assert_eq!(read, src.len());
1705    written
1706}
1707
1708/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1709/// with the REPLACEMENT CHARACTER such that the validity of the output is
1710/// signaled using the Rust type system with potentially insufficient output
1711/// space.
1712///
1713/// Returns the number of code units read and the number of bytes written.
1714///
1715/// Not all code units are read if there isn't enough output space.
1716///
1717/// Note  that this method isn't designed for general streamability but for
1718/// not allocating memory for the worst case up front. Specifically,
1719/// if the input starts with or ends with an unpaired surrogate, those are
1720/// replaced with the REPLACEMENT CHARACTER.
1721pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
1722    let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1723    let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
1724    let len = bytes.len();
1725    let mut trail = written;
1726    while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1727        bytes[trail] = 0;
1728        trail += 1;
1729    }
1730    (read, written)
1731}
1732
1733/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1734/// with the REPLACEMENT CHARACTER such that the validity of the output is
1735/// signaled using the Rust type system.
1736///
1737/// The length of the destination buffer must be at least the length of the
1738/// source buffer times three.
1739///
1740/// Returns the number of bytes written.
1741///
1742/// # Panics
1743///
1744/// Panics if the destination buffer is shorter than stated above.
1745#[inline(always)]
1746pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
1747    assert!(dst.len() >= src.len() * 3);
1748    let (read, written) = convert_utf16_to_str_partial(src, dst);
1749    debug_assert_eq!(read, src.len());
1750    written
1751}
1752
1753/// Converts bytes whose unsigned value is interpreted as Unicode code point
1754/// (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
1755///
1756/// The length of the destination buffer must be at least the length of the
1757/// source buffer.
1758///
1759/// The number of `u16`s written equals the length of the source buffer.
1760///
1761/// # Panics
1762///
1763/// Panics if the destination buffer is shorter than stated above.
1764pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
1765    assert!(
1766        dst.len() >= src.len(),
1767        "Destination must not be shorter than the source."
1768    );
1769    // TODO: On aarch64, the safe version autovectorizes to the same unpacking
1770    // instructions and this code, but, yet, the autovectorized version is
1771    // faster.
1772    unsafe {
1773        unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1774    }
1775}
1776
1777/// Converts bytes whose unsigned value is interpreted as Unicode code point
1778/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
1779/// output space.
1780///
1781/// Returns the number of bytes read and the number of bytes written.
1782///
1783/// If the output isn't large enough, not all input is consumed.
1784///
1785/// # Safety
1786///
1787/// If you want to convert into a `&mut str`, use
1788/// `convert_utf16_to_str_partial()` instead of using this function
1789/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
1790pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
1791    let src_len = src.len();
1792    let src_ptr = src.as_ptr();
1793    let dst_ptr = dst.as_mut_ptr();
1794    let dst_len = dst.len();
1795    let mut total_read = 0usize;
1796    let mut total_written = 0usize;
1797    loop {
1798        // src can't advance more than dst
1799        let src_left = src_len - total_read;
1800        let dst_left = dst_len - total_written;
1801        let min_left = ::core::cmp::min(src_left, dst_left);
1802        if let Some((non_ascii, consumed)) = unsafe {
1803            ascii_to_ascii(
1804                src_ptr.add(total_read),
1805                dst_ptr.add(total_written),
1806                min_left,
1807            )
1808        } {
1809            total_read += consumed;
1810            total_written += consumed;
1811            if total_written.checked_add(2).unwrap() > dst_len {
1812                return (total_read, total_written);
1813            }
1814
1815            total_read += 1; // consume `non_ascii`
1816
1817            dst[total_written] = (non_ascii >> 6) | 0xC0;
1818            total_written += 1;
1819            dst[total_written] = (non_ascii & 0x3F) | 0x80;
1820            total_written += 1;
1821            continue;
1822        }
1823        return (total_read + min_left, total_written + min_left);
1824    }
1825}
1826
1827/// Converts bytes whose unsigned value is interpreted as Unicode code point
1828/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1829///
1830/// The length of the destination buffer must be at least the length of the
1831/// source buffer times two.
1832///
1833/// Returns the number of bytes written.
1834///
1835/// # Panics
1836///
1837/// Panics if the destination buffer is shorter than stated above.
1838///
1839/// # Safety
1840///
1841/// Note that this function may write garbage beyond the number of bytes
1842/// indicated by the return value, so using a `&mut str` interpreted as
1843/// `&mut [u8]` as the destination is not safe. If you want to convert into
1844/// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
1845#[inline]
1846pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
1847    assert!(
1848        dst.len() >= src.len() * 2,
1849        "Destination must not be shorter than the source times two."
1850    );
1851    let (read, written) = convert_latin1_to_utf8_partial(src, dst);
1852    debug_assert_eq!(read, src.len());
1853    written
1854}
1855
1856/// Converts bytes whose unsigned value is interpreted as Unicode code point
1857/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1858/// output is signaled using the Rust type system with potentially insufficient
1859/// output space.
1860///
1861/// Returns the number of bytes read and the number of bytes written.
1862///
1863/// If the output isn't large enough, not all input is consumed.
1864#[inline]
1865pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
1866    let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1867    let (read, written) = convert_latin1_to_utf8_partial(src, bytes);
1868    let len = bytes.len();
1869    let mut trail = written;
1870    let max = ::core::cmp::min(len, trail + MAX_STRIDE_SIZE);
1871    while trail < max {
1872        bytes[trail] = 0;
1873        trail += 1;
1874    }
1875    while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1876        bytes[trail] = 0;
1877        trail += 1;
1878    }
1879    (read, written)
1880}
1881
1882/// Converts bytes whose unsigned value is interpreted as Unicode code point
1883/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1884/// output is signaled using the Rust type system.
1885///
1886/// The length of the destination buffer must be at least the length of the
1887/// source buffer times two.
1888///
1889/// Returns the number of bytes written.
1890///
1891/// # Panics
1892///
1893/// Panics if the destination buffer is shorter than stated above.
1894#[inline]
1895pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
1896    assert!(
1897        dst.len() >= src.len() * 2,
1898        "Destination must not be shorter than the source times two."
1899    );
1900    let (read, written) = convert_latin1_to_str_partial(src, dst);
1901    debug_assert_eq!(read, src.len());
1902    written
1903}
1904
1905/// If the input is valid UTF-8 representing only Unicode code points from
1906/// U+0000 to U+00FF, inclusive, converts the input into output that
1907/// represents the value of each code point as the unsigned byte value of
1908/// each output byte.
1909///
1910/// If the input does not fulfill the condition stated above, this function
1911/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
1912/// does something that is memory-safe without any promises about any
1913/// properties of the output. In particular, callers shouldn't assume the
1914/// output to be the same across crate versions or CPU architectures and
1915/// should not assume that non-ASCII input can't map to ASCII output.
1916///
1917/// The length of the destination buffer must be at least the length of the
1918/// source buffer.
1919///
1920/// Returns the number of bytes written.
1921///
1922/// # Panics
1923///
1924/// Panics if the destination buffer is shorter than stated above.
1925///
1926/// If debug assertions are enabled (and not fuzzing) and the input is
1927/// not in the range U+0000 to U+00FF, inclusive.
1928pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
1929    assert!(
1930        dst.len() >= src.len(),
1931        "Destination must not be shorter than the source."
1932    );
1933    non_fuzz_debug_assert!(is_utf8_latin1(src));
1934    let src_len = src.len();
1935    let src_ptr = src.as_ptr();
1936    let dst_ptr = dst.as_mut_ptr();
1937    let mut total_read = 0usize;
1938    let mut total_written = 0usize;
1939    loop {
1940        // dst can't advance more than src
1941        let src_left = src_len - total_read;
1942        if let Some((non_ascii, consumed)) = unsafe {
1943            ascii_to_ascii(
1944                src_ptr.add(total_read),
1945                dst_ptr.add(total_written),
1946                src_left,
1947            )
1948        } {
1949            total_read += consumed + 1;
1950            total_written += consumed;
1951
1952            if total_read == src_len {
1953                return total_written;
1954            }
1955
1956            let trail = src[total_read];
1957            total_read += 1;
1958
1959            dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F);
1960            total_written += 1;
1961            continue;
1962        }
1963        return total_written + src_left;
1964    }
1965}
1966
1967/// If the input is valid UTF-16 representing only Unicode code points from
1968/// U+0000 to U+00FF, inclusive, converts the input into output that
1969/// represents the value of each code point as the unsigned byte value of
1970/// each output byte.
1971///
1972/// If the input does not fulfill the condition stated above, does something
1973/// that is memory-safe without any promises about any properties of the
1974/// output and will probably assert in debug builds in future versions.
1975/// In particular, callers shouldn't assume the output to be the same across
1976/// crate versions or CPU architectures and should not assume that non-ASCII
1977/// input can't map to ASCII output.
1978///
1979/// The length of the destination buffer must be at least the length of the
1980/// source buffer.
1981///
1982/// The number of bytes written equals the length of the source buffer.
1983///
1984/// # Panics
1985///
1986/// Panics if the destination buffer is shorter than stated above.
1987///
1988/// (Probably in future versions if debug assertions are enabled (and not
1989/// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
1990pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
1991    assert!(
1992        dst.len() >= src.len(),
1993        "Destination must not be shorter than the source."
1994    );
1995    // non_fuzz_debug_assert!(is_utf16_latin1(src));
1996    unsafe {
1997        pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1998    }
1999}
2000
/// Converts bytes whose unsigned value is interpreted as a Unicode code
/// point (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        // SAFETY: the whole input passed the ASCII check, and ASCII is
        // valid UTF-8.
        let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) };
        return Cow::Borrowed(s);
    }
    let (head, tail) = bytes.split_at(up_to);
    // Worst case: every byte of the tail expands to two bytes of UTF-8.
    let capacity = head.len() + tail.len() * 2;
    let mut vec = Vec::with_capacity(capacity);
    // Copy the ASCII prefix verbatim, then zero-fill the conversion area.
    // Both steps stay within the reserved capacity, so there is still only
    // one allocation, and no uninitialized memory is ever exposed through a
    // `&mut [u8]` (unlike growing the vector with `set_len`).
    vec.extend_from_slice(head);
    vec.resize(capacity, 0u8);
    let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    // SAFETY: the prefix is ASCII and the remainder was produced by the
    // Latin1-to-UTF-8 converter; anything past `written` was truncated away.
    Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
}
2028
/// If the input is valid UTF-8 representing only Unicode code points from
/// U+0000 to U+00FF, inclusive, converts the input into output in which the
/// unsigned value of each byte is the value of the corresponding code point.
///
/// If the input does not fulfill the condition stated above, this function
/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
/// does something that is memory-safe without any promises about any
/// properties of the output. In particular, callers shouldn't assume the
/// output to be the same across crate versions or CPU architectures and
/// should not assume that non-ASCII input can't map to ASCII output.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
    let bytes = string.as_bytes();
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        return Cow::Borrowed(bytes);
    }
    let (head, tail) = bytes.split_at(up_to);
    // The Latin1 output is never longer than the UTF-8 input.
    let capacity = bytes.len();
    let mut vec = Vec::with_capacity(capacity);
    // Copy the ASCII prefix verbatim, then zero-fill the conversion area.
    // Both steps stay within the reserved capacity, so there is still only
    // one allocation, and no uninitialized memory is ever exposed through a
    // `&mut [u8]` (unlike growing the vector with `set_len`).
    vec.extend_from_slice(head);
    vec.resize(capacity, 0u8);
    let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    Cow::Owned(vec)
}
2065
2066/// Returns the index of the first unpaired surrogate or, if the input is
2067/// valid UTF-16 in its entirety, the length of the input.
2068pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
2069    utf16_valid_up_to_impl(buffer)
2070}
2071
2072/// Returns the index of first byte that starts an invalid byte
2073/// sequence or a non-Latin1 byte sequence, or the length of the
2074/// string if there are neither.
2075pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize {
2076    is_utf8_latin1_impl(buffer).unwrap_or(buffer.len())
2077}
2078
2079/// Returns the index of first byte that starts a non-Latin1 byte
2080/// sequence, or the length of the string if there are none.
2081pub fn str_latin1_up_to(buffer: &str) -> usize {
2082    is_str_latin1_impl(buffer).unwrap_or_else(|| buffer.len())
2083}
2084
2085/// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
2086#[inline]
2087pub fn ensure_utf16_validity(buffer: &mut [u16]) {
2088    let mut offset = 0;
2089    loop {
2090        offset += utf16_valid_up_to(&buffer[offset..]);
2091        if offset == buffer.len() {
2092            return;
2093        }
2094        buffer[offset] = 0xFFFD;
2095        offset += 1;
2096    }
2097}
2098
2099/// Copies ASCII from source to destination up to the first non-ASCII byte
2100/// (or the end of the input if it is ASCII in its entirety).
2101///
2102/// The length of the destination buffer must be at least the length of the
2103/// source buffer.
2104///
2105/// Returns the number of bytes written.
2106///
2107/// # Panics
2108///
2109/// Panics if the destination buffer is shorter than stated above.
2110pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize {
2111    assert!(
2112        dst.len() >= src.len(),
2113        "Destination must not be shorter than the source."
2114    );
2115    if let Some((_, consumed)) =
2116        unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2117    {
2118        consumed
2119    } else {
2120        src.len()
2121    }
2122}
2123
2124/// Copies ASCII from source to destination zero-extending it to UTF-16 up to
2125/// the first non-ASCII byte (or the end of the input if it is ASCII in its
2126/// entirety).
2127///
2128/// The length of the destination buffer must be at least the length of the
2129/// source buffer.
2130///
2131/// Returns the number of `u16`s written.
2132///
2133/// # Panics
2134///
2135/// Panics if the destination buffer is shorter than stated above.
2136pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize {
2137    assert!(
2138        dst.len() >= src.len(),
2139        "Destination must not be shorter than the source."
2140    );
2141    if let Some((_, consumed)) =
2142        unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2143    {
2144        consumed
2145    } else {
2146        src.len()
2147    }
2148}
2149
2150/// Copies Basic Latin from source to destination narrowing it to ASCII up to
2151/// the first non-Basic Latin code unit (or the end of the input if it is
2152/// Basic Latin in its entirety).
2153///
2154/// The length of the destination buffer must be at least the length of the
2155/// source buffer.
2156///
2157/// Returns the number of bytes written.
2158///
2159/// # Panics
2160///
2161/// Panics if the destination buffer is shorter than stated above.
2162pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize {
2163    assert!(
2164        dst.len() >= src.len(),
2165        "Destination must not be shorter than the source."
2166    );
2167    if let Some((_, consumed)) =
2168        unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2169    {
2170        consumed
2171    } else {
2172        src.len()
2173    }
2174}
2175
2176// Any copyright to the test code below this comment is dedicated to the
2177// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
2178
2179#[cfg(all(test, feature = "alloc"))]
2180mod tests {
2181    use super::*;
2182
// Every suffix of the byte sequence 0x00..=0x7F must be classified as ASCII.
#[test]
fn test_is_ascii_success() {
    let ascii: Vec<u8> = (0u8..128).collect();
    for start in 0..ascii.len() {
        assert!(is_ascii(&ascii[start..]));
    }
}
2194
// A slice holding at least one 0xA0 byte must never be classified as ASCII,
// wherever the slice starts and however many bytes have been overwritten.
#[test]
fn test_is_ascii_fail() {
    let mut bytes: Vec<u8> = (0u8..128).collect();
    let total = bytes.len();
    for start in 0..total {
        let tail = &mut bytes[start..];
        // Overwrite one more position per step and re-check the whole tail.
        for pos in 0..tail.len() {
            tail[pos] = 0xA0;
            assert!(!is_ascii(tail));
        }
    }
}
2210
// Every suffix of the code unit sequence 0x0000..=0x007F must be classified
// as Basic Latin.
#[test]
fn test_is_basic_latin_success() {
    let basic_latin: Vec<u16> = (0u16..128).collect();
    for start in 0..basic_latin.len() {
        assert!(is_basic_latin(&basic_latin[start..]));
    }
}
2222
// A slice holding at least one 0x00A0 code unit must never be classified as
// Basic Latin, wherever the slice starts.
#[test]
fn test_is_basic_latin_fail() {
    let mut units: Vec<u16> = (0u16..128).collect();
    let total = units.len();
    for start in 0..total {
        let tail = &mut units[start..];
        // Overwrite one more position per step and re-check the whole tail.
        for pos in 0..tail.len() {
            tail[pos] = 0xA0;
            assert!(!is_basic_latin(tail));
        }
    }
}
2238
// Every suffix of the code unit sequence 0x0000..=0x00FF must be reported as
// Latin1 by both the boolean check and the combined Latin1/bidi check.
#[test]
fn test_is_utf16_latin1_success() {
    let latin1: Vec<u16> = (0u16..256).collect();
    for start in 0..latin1.len() {
        let suffix = &latin1[start..];
        assert!(is_utf16_latin1(suffix));
        assert_eq!(check_utf16_for_latin1_and_bidi(suffix), Latin1Bidi::Latin1);
    }
}
2254
// A slice holding at least one code unit at or above 0x0100 must not be
// reported as Latin1 by either check.
#[test]
fn test_is_utf16_latin1_fail() {
    // Miri is too slow for the full range, so a shorter buffer is used there.
    let len: usize = if cfg!(miri) { 64 } else { 256 };
    let mut units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..len {
        let tail = &mut units[start..];
        for pos in 0..tail.len() {
            tail[pos] = 0x100 + pos as u16;
            assert!(!is_utf16_latin1(tail));
            assert_ne!(check_utf16_for_latin1_and_bidi(tail), Latin1Bidi::Latin1);
        }
    }
}
2272
// Strings built from suffixes of the code point sequence starting at U+0000
// (all below U+0100) must be reported as Latin1 by both `str` checks.
#[test]
fn test_is_str_latin1_success() {
    // Miri is too slow for the full range, so a shorter buffer is used there.
    let len: usize = if cfg!(miri) { 64 } else { 256 };
    let units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..len {
        let text = String::from_utf16(&units[start..]).unwrap();
        assert!(is_str_latin1(text.as_str()));
        assert_eq!(
            check_str_for_latin1_and_bidi(text.as_str()),
            Latin1Bidi::Latin1
        );
    }
}
2287
// A string holding at least one code point at or above U+0100 must not be
// reported as Latin1 by either `str` check.
#[test]
fn test_is_str_latin1_fail() {
    // Miri is too slow for the full range, so a shorter buffer is used there.
    let len: usize = if cfg!(miri) { 32 } else { 256 };
    let mut units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..len {
        let tail = &mut units[start..];
        for pos in 0..tail.len() {
            tail[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(tail).unwrap();
            assert!(!is_str_latin1(text.as_str()));
            assert_ne!(
                check_str_for_latin1_and_bidi(text.as_str()),
                Latin1Bidi::Latin1
            );
        }
    }
}
2306
// UTF-8 encodings of suffixes of the code point sequence starting at U+0000
// (all below U+0100) must be reported as Latin1 by both byte-slice checks.
#[test]
fn test_is_utf8_latin1_success() {
    // Miri is too slow for the full range, so a shorter buffer is used there.
    let len: usize = if cfg!(miri) { 64 } else { 256 };
    let units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..len {
        let text = String::from_utf16(&units[start..]).unwrap();
        let utf8 = text.as_bytes();
        assert!(is_utf8_latin1(utf8));
        assert_eq!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
    }
}
2324
// UTF-8 holding at least one code point at or above U+0100 must not be
// reported as Latin1 by either byte-slice check.
#[test]
fn test_is_utf8_latin1_fail() {
    // Miri is too slow for the full range, so a shorter buffer is used there.
    let len: usize = if cfg!(miri) { 32 } else { 256 };
    let mut units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..len {
        let tail = &mut units[start..];
        for pos in 0..tail.len() {
            tail[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(tail).unwrap();
            let utf8 = text.as_bytes();
            assert!(!is_utf8_latin1(utf8));
            assert_ne!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
        }
    }
}
2346
// Malformed UTF-8 (a truncated two-byte sequence, a 0xFF byte, or a lead byte
// followed by 0xFF) must never be reported as Latin1.
#[test]
fn test_is_utf8_latin1_invalid() {
    let malformed: [&[u8]; 6] = [
        b"\xC3",
        b"a\xC3",
        b"\xFF",
        b"a\xFF",
        b"\xC3\xFF",
        b"a\xC3\xFF",
    ];
    for &input in malformed.iter() {
        assert!(!is_utf8_latin1(input), "accepted {:?}", input);
    }
}
2356
// The UTF-8 to UTF-16 conversion must agree with `str::encode_utf16` on text
// mixing ASCII, a two-byte, a three-byte and a four-byte sequence.
#[test]
fn test_convert_utf8_to_utf16() {
    let text = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    // The output buffer is one code unit longer than the input's byte length.
    let mut utf16 = vec![0u16; text.len() + 1];
    let written = convert_utf8_to_utf16(text.as_bytes(), &mut utf16[..]);
    utf16.truncate(written);
    let expected: Vec<u16> = text.encode_utf16().collect();
    assert_eq!(utf16, expected);
}
2367
// The `str` to UTF-16 conversion must agree with `str::encode_utf16`.
#[test]
fn test_convert_str_to_utf16() {
    let text = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    // The output buffer has as many code units as the input has bytes.
    let mut utf16 = vec![0u16; text.len()];
    let written = convert_str_to_utf16(text, &mut utf16[..]);
    utf16.truncate(written);
    let expected: Vec<u16> = text.encode_utf16().collect();
    assert_eq!(utf16, expected);
}
2378
// A partial conversion into a deliberately short output window, followed by
// a conversion of the unread remainder, must reproduce the original UTF-8.
#[test]
fn test_convert_utf16_to_utf8_partial() {
    let expected = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let utf16: Vec<u16> = expected.encode_utf16().collect();
    let mut utf8 = vec![0u8; utf16.len() * 3 + 1];
    // First pass: offer only the first 24 bytes of the output buffer.
    let (read, written) = convert_utf16_to_utf8_partial(&utf16[..], &mut utf8[..24]);
    // Second pass: convert what was not read into the rest of the buffer.
    let rest = convert_utf16_to_utf8(&utf16[read..], &mut utf8[written..]);
    utf8.truncate(written + rest);
    assert_eq!(utf8, expected.as_bytes());
}
2390
// Converting the UTF-16 form of a string back to UTF-8 must reproduce the
// string's bytes.
#[test]
fn test_convert_utf16_to_utf8() {
    let expected = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let utf16: Vec<u16> = expected.encode_utf16().collect();
    // The output buffer is three bytes per code unit plus one.
    let mut utf8 = vec![0u8; utf16.len() * 3 + 1];
    let written = convert_utf16_to_utf8(&utf16[..], &mut utf8[..]);
    utf8.truncate(written);
    assert_eq!(utf8, expected.as_bytes());
}
2401
// Every Latin1 byte 0x00..=0xFF must map to the UTF-16 code unit of the same
// numeric value.
#[test]
fn test_convert_latin1_to_utf16() {
    let latin1: Vec<u8> = (0..256).map(|i| i as u8).collect();
    let expected: Vec<u16> = (0u16..256).collect();
    let mut utf16 = vec![0u16; latin1.len()];
    convert_latin1_to_utf16(&latin1[..], &mut utf16[..]);
    assert_eq!(utf16, expected);
}
2417
// With a two-byte output buffer, "a" takes one byte and the remaining byte is
// too little for U+00FF (two bytes in UTF-8), so exactly one input byte must
// be read and one output byte written.
#[test]
fn test_convert_latin1_to_utf8_partial() {
    // Repeat syntax (`[0u8; 2]`) gives a zero-filled two-byte buffer. The
    // former list syntax (`[0u8, 2]`) produced the bytes 0 and 2 instead.
    let mut dst = [0u8; 2];
    let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
    assert_eq!(read, 1);
    assert_eq!(written, 1);
}
2425
// Converting every Latin1 byte 0x00..=0xFF to UTF-8 must match the UTF-8
// encoding of the code points U+0000..=U+00FF.
#[test]
fn test_convert_latin1_to_utf8() {
    let latin1: Vec<u8> = (0..256).map(|i| i as u8).collect();
    let code_units: Vec<u16> = (0u16..256).collect();
    let expected = String::from_utf16(&code_units[..]).unwrap();
    // The output buffer is two bytes per input byte.
    let mut utf8 = vec![0u8; latin1.len() * 2];
    let written = convert_latin1_to_utf8(&latin1[..], &mut utf8[..]);
    utf8.truncate(written);
    assert_eq!(&utf8[..], expected.as_bytes());
}
2443
// Converting the UTF-8 encoding of U+0000..=U+00FF to Latin1 must yield the
// bytes 0x00..=0xFF.
#[test]
fn test_convert_utf8_to_latin1_lossy() {
    let expected: Vec<u8> = (0..256).map(|i| i as u8).collect();
    let code_units: Vec<u16> = (0u16..256).collect();
    let text = String::from_utf16(&code_units[..]).unwrap();
    // The output buffer has as many bytes as the UTF-8 input.
    let mut latin1 = vec![0u8; text.len()];
    let written = convert_utf8_to_latin1_lossy(text.as_bytes(), &mut latin1[..]);
    latin1.truncate(written);
    assert_eq!(latin1, expected);
}
2461
// Only compiled with debug assertions and outside fuzzing builds: feeding
// U+0100, which is outside the Latin1 range, is expected to panic.
#[cfg(all(debug_assertions, not(fuzzing)))]
#[test]
#[should_panic]
fn test_convert_utf8_to_latin1_lossy_panics() {
    let non_latin1 = "\u{100}";
    let mut scratch = [0u8; 16];
    let _ = convert_utf8_to_latin1_lossy(non_latin1.as_bytes(), &mut scratch[..]);
}
2469
// Every UTF-16 code unit 0x0000..=0x00FF must map to the Latin1 byte of the
// same numeric value.
#[test]
fn test_convert_utf16_to_latin1_lossy() {
    let utf16: Vec<u16> = (0u16..256).collect();
    let expected: Vec<u8> = (0..256).map(|i| i as u8).collect();
    let mut latin1 = vec![0u8; utf16.len()];
    convert_utf16_to_latin1_lossy(&utf16[..], &mut latin1[..]);
    assert_eq!(latin1, expected);
}
2485
// NOTE(review): `#[should_panic]` is commented out below, so despite its name
// this test only checks that the call returns for a non-Latin1 code unit.
// Confirm whether a panic expectation should be restored (possibly gated
// like the UTF-8 counterpart).
#[test]
// #[should_panic]
fn test_convert_utf16_to_latin1_lossy_panics() {
    let non_latin1 = [0x0100u16];
    let mut scratch = [0u8; 16];
    let _ = convert_utf16_to_latin1_lossy(&non_latin1, &mut scratch[..]);
}
2492
// `utf16_valid_up_to` must return the full length for valid input and the
// index of the first unpaired surrogate otherwise.
#[test]
fn test_utf16_valid_up_to() {
    // Entirely valid, including one surrogate pair.
    let valid: &[u16] = &[
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2603, 0xD83D, 0xDCA9, 0x00B6,
    ];
    assert_eq!(utf16_valid_up_to(valid), 16);

    // High surrogate at index 14 followed by a non-surrogate.
    let lone_high: &[u16] = &[
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2603, 0xD83D, 0x00B6,
    ];
    assert_eq!(utf16_valid_up_to(lone_high), 14);

    // Low surrogate at index 14 without a preceding high surrogate.
    let lone_low: &[u16] = &[
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2603, 0xDCA9, 0x00B6,
    ];
    assert_eq!(utf16_valid_up_to(lone_low), 14);

    // High surrogate as the very last code unit (index 15).
    let lone_high_at_end: &[u16] = &[
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2603, 0x00B6, 0xD83D,
    ];
    assert_eq!(utf16_valid_up_to(lone_high_at_end), 15);
}
2516
// `ensure_utf16_validity` must replace unpaired surrogates with U+FFFD and
// leave a proper surrogate pair and all other code units untouched.
#[test]
fn test_ensure_utf16_validity() {
    // 31 code units, all zero except for the surrogates placed below.
    let mut units = vec![0u16; 31];
    units[1] = 0xD83D; // unpaired high surrogate
    units[5] = 0xD83D; // high half of a valid pair
    units[6] = 0xDCA9; // low half of a valid pair
    units[13] = 0xDCA9; // unpaired low surrogate

    // Only the two unpaired surrogates are expected to change.
    let mut expected = units.clone();
    expected[1] = 0xFFFD;
    expected[13] = 0xFFFD;

    ensure_utf16_validity(&mut units[..]);
    assert_eq!(units, expected);
}
2532
// Table-driven check of `is_char_bidi` on characters expected to be rejected
// and characters expected to be accepted.
#[test]
fn test_is_char_bidi() {
    let not_bidi: [char; 7] = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    let bidi: [char; 15] = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_char_bidi(c), "U+{:04X} reported as bidi", c as u32);
    }
    for &c in bidi.iter() {
        assert!(is_char_bidi(c), "U+{:04X} not reported as bidi", c as u32);
    }
}
2558
// Table-driven check of `is_utf16_code_unit_bidi` on code units expected to
// be rejected and code units expected to be accepted.
#[test]
fn test_is_utf16_code_unit_bidi() {
    let not_bidi = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    let bidi = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in not_bidi.iter() {
        assert!(
            !is_utf16_code_unit_bidi(unit),
            "{:#06X} reported as bidi",
            unit
        );
    }
    for &unit in bidi.iter() {
        assert!(
            is_utf16_code_unit_bidi(unit),
            "{:#06X} not reported as bidi",
            unit
        );
    }
}
2585
// Table-driven check of `is_str_bidi`: each input places one character of
// interest between two runs of ASCII letters.
#[test]
fn test_is_str_bidi() {
    let not_bidi: [&str; 7] = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    let bidi: [&str; 15] = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert!(!is_str_bidi(text), "{:?} reported as bidi", text);
    }
    for &text in bidi.iter() {
        assert!(is_str_bidi(text), "{:?} not reported as bidi", text);
    }
}
2611
// Table-driven check of `is_utf8_bidi`: each input is the UTF-8 encoding of
// one character of interest between two runs of ASCII letters.
#[test]
fn test_is_utf8_bidi() {
    let not_bidi: [&str; 7] = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    let bidi: [&str; 15] = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert!(!is_utf8_bidi(text.as_bytes()), "{:?} reported as bidi", text);
    }
    for &text in bidi.iter() {
        assert!(
            is_utf8_bidi(text.as_bytes()),
            "{:?} not reported as bidi",
            text
        );
    }
}
2681
// Table-driven check of `is_utf16_bidi`: each input places one code unit of
// interest between two runs of eight ASCII letters, plus one final case with
// two non-ASCII code units in the middle.
#[test]
fn test_is_utf16_bidi() {
    // Builds the 17-unit fixture "bcdefghi" + `middle` + "bcdefghi".
    fn surrounded(middle: u16) -> [u16; 17] {
        [
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, middle, 0x62, 0x63, 0x64, 0x65, 0x66,
            0x67, 0x68, 0x69,
        ]
    }

    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &middle in not_bidi.iter() {
        assert!(
            !is_utf16_bidi(&surrounded(middle)[..]),
            "input with {:#06X} reported as bidi",
            middle
        );
    }
    for &middle in bidi.iter() {
        assert!(
            is_utf16_bidi(&surrounded(middle)[..]),
            "input with {:#06X} not reported as bidi",
            middle
        );
    }

    // U+0590 immediately followed by U+3041.
    assert!(is_utf16_bidi(&[
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
        0x66, 0x67, 0x68, 0x69,
    ]));
}
2782
// Table-driven check of `check_str_for_latin1_and_bidi`: the first group must
// produce anything but `Latin1Bidi::Bidi`, the second exactly
// `Latin1Bidi::Bidi`.
#[test]
fn test_check_str_for_latin1_and_bidi() {
    let not_bidi: [&str; 7] = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    let bidi: [&str; 15] = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert_ne!(
            check_str_for_latin1_and_bidi(text),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }
    for &text in bidi.iter() {
        assert_eq!(
            check_str_for_latin1_and_bidi(text),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }
}
2874
// Table-driven check of `check_utf8_for_latin1_and_bidi` on UTF-8 bytes: the
// first group must produce anything but `Latin1Bidi::Bidi`, the second
// exactly `Latin1Bidi::Bidi`.
#[test]
fn test_check_utf8_for_latin1_and_bidi() {
    let not_bidi: [&str; 7] = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    let bidi: [&str; 15] = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert_ne!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }
    for &text in bidi.iter() {
        assert_eq!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }
}
2966
2967    #[test]
2968    fn test_check_utf16_for_latin1_and_bidi() {
2969        assert_ne!(
2970            check_utf16_for_latin1_and_bidi(&[
2971                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65,
2972                0x66, 0x67, 0x68, 0x69,
2973            ]),
2974            Latin1Bidi::Bidi
2975        );
2976        assert_ne!(
2977            check_utf16_for_latin1_and_bidi(&[
2978                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65,
2979                0x66, 0x67, 0x68, 0x69,
2980            ]),
2981            Latin1Bidi::Bidi
2982        );
2983        assert_ne!(
2984            check_utf16_for_latin1_and_bidi(&[
2985                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65,
2986                0x66, 0x67, 0x68, 0x69,
2987            ]),
2988            Latin1Bidi::Bidi
2989        );
2990        assert_ne!(
2991            check_utf16_for_latin1_and_bidi(&[
2992                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65,
2993                0x66, 0x67, 0x68, 0x69,
2994            ]),
2995            Latin1Bidi::Bidi
2996        );
2997        assert_ne!(
2998            check_utf16_for_latin1_and_bidi(&[
2999                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65,
3000                0x66, 0x67, 0x68, 0x69,
3001            ]),
3002            Latin1Bidi::Bidi
3003        );
3004        assert_ne!(
3005            check_utf16_for_latin1_and_bidi(&[
3006                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65,
3007                0x66, 0x67, 0x68, 0x69,
3008            ]),
3009            Latin1Bidi::Bidi
3010        );
3011        assert_ne!(
3012            check_utf16_for_latin1_and_bidi(&[
3013                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65,
3014                0x66, 0x67, 0x68, 0x69,
3015            ]),
3016            Latin1Bidi::Bidi
3017        );
3018        assert_eq!(
3019            check_utf16_for_latin1_and_bidi(&[
3020                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65,
3021                0x66, 0x67, 0x68, 0x69,
3022            ]),
3023            Latin1Bidi::Bidi
3024        );
3025        assert_eq!(
3026            check_utf16_for_latin1_and_bidi(&[
3027                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65,
3028                0x66, 0x67, 0x68, 0x69,
3029            ]),
3030            Latin1Bidi::Bidi
3031        );
3032        assert_eq!(
3033            check_utf16_for_latin1_and_bidi(&[
3034                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65,
3035                0x66, 0x67, 0x68, 0x69,
3036            ]),
3037            Latin1Bidi::Bidi
3038        );
3039        assert_eq!(
3040            check_utf16_for_latin1_and_bidi(&[
3041                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65,
3042                0x66, 0x67, 0x68, 0x69,
3043            ]),
3044            Latin1Bidi::Bidi
3045        );
3046        assert_eq!(
3047            check_utf16_for_latin1_and_bidi(&[
3048                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65,
3049                0x66, 0x67, 0x68, 0x69,
3050            ]),
3051            Latin1Bidi::Bidi
3052        );
3053        assert_eq!(
3054            check_utf16_for_latin1_and_bidi(&[
3055                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65,
3056                0x66, 0x67, 0x68, 0x69,
3057            ]),
3058            Latin1Bidi::Bidi
3059        );
3060        assert_eq!(
3061            check_utf16_for_latin1_and_bidi(&[
3062                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65,
3063                0x66, 0x67, 0x68, 0x69,
3064            ]),
3065            Latin1Bidi::Bidi
3066        );
3067        assert_eq!(
3068            check_utf16_for_latin1_and_bidi(&[
3069                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65,
3070                0x66, 0x67, 0x68, 0x69,
3071            ]),
3072            Latin1Bidi::Bidi
3073        );
3074        assert_eq!(
3075            check_utf16_for_latin1_and_bidi(&[
3076                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65,
3077                0x66, 0x67, 0x68, 0x69,
3078            ]),
3079            Latin1Bidi::Bidi
3080        );
3081        assert_eq!(
3082            check_utf16_for_latin1_and_bidi(&[
3083                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65,
3084                0x66, 0x67, 0x68, 0x69,
3085            ]),
3086            Latin1Bidi::Bidi
3087        );
3088        assert_eq!(
3089            check_utf16_for_latin1_and_bidi(&[
3090                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65,
3091                0x66, 0x67, 0x68, 0x69,
3092            ]),
3093            Latin1Bidi::Bidi
3094        );
3095        assert_eq!(
3096            check_utf16_for_latin1_and_bidi(&[
3097                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65,
3098                0x66, 0x67, 0x68, 0x69,
3099            ]),
3100            Latin1Bidi::Bidi
3101        );
3102        assert_eq!(
3103            check_utf16_for_latin1_and_bidi(&[
3104                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65,
3105                0x66, 0x67, 0x68, 0x69,
3106            ]),
3107            Latin1Bidi::Bidi
3108        );
3109        assert_eq!(
3110            check_utf16_for_latin1_and_bidi(&[
3111                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65,
3112                0x66, 0x67, 0x68, 0x69,
3113            ]),
3114            Latin1Bidi::Bidi
3115        );
3116        assert_eq!(
3117            check_utf16_for_latin1_and_bidi(&[
3118                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65,
3119                0x66, 0x67, 0x68, 0x69,
3120            ]),
3121            Latin1Bidi::Bidi
3122        );
3123        assert_eq!(
3124            check_utf16_for_latin1_and_bidi(&[
3125                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65,
3126                0x66, 0x67, 0x68, 0x69,
3127            ]),
3128            Latin1Bidi::Bidi
3129        );
3130
3131        assert_eq!(
3132            check_utf16_for_latin1_and_bidi(&[
3133                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64,
3134                0x65, 0x66, 0x67, 0x68, 0x69,
3135            ]),
3136            Latin1Bidi::Bidi
3137        );
3138    }
3139
3140    #[inline(always)]
3141    pub fn reference_is_char_bidi(c: char) -> bool {
3142        match c {
3143            '\u{0590}'..='\u{08FF}'
3144            | '\u{FB1D}'..='\u{FDFF}'
3145            | '\u{FE70}'..='\u{FEFE}'
3146            | '\u{10800}'..='\u{10FFF}'
3147            | '\u{1E800}'..='\u{1EFFF}'
3148            | '\u{200F}'
3149            | '\u{202B}'
3150            | '\u{202E}'
3151            | '\u{2067}' => true,
3152            _ => false,
3153        }
3154    }
3155
3156    #[inline(always)]
3157    pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
3158        match u {
3159            0x0590..=0x08FF
3160            | 0xFB1D..=0xFDFF
3161            | 0xFE70..=0xFEFE
3162            | 0xD802
3163            | 0xD803
3164            | 0xD83A
3165            | 0xD83B
3166            | 0x200F
3167            | 0x202B
3168            | 0x202E
3169            | 0x2067 => true,
3170            _ => false,
3171        }
3172    }
3173
3174    #[test]
3175    #[cfg_attr(miri, ignore)] // Miri is too slow
3176    fn test_is_char_bidi_thoroughly() {
3177        for i in 0..0xD800u32 {
3178            let c: char = ::core::char::from_u32(i).unwrap();
3179            assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
3180        }
3181        for i in 0xE000..0x110000u32 {
3182            let c: char = ::core::char::from_u32(i).unwrap();
3183            assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
3184        }
3185    }
3186
3187    #[test]
3188    #[cfg_attr(miri, ignore)] // Miri is too slow
3189    fn test_is_utf16_code_unit_bidi_thoroughly() {
3190        for i in 0..0x10000u32 {
3191            let u = i as u16;
3192            assert_eq!(
3193                is_utf16_code_unit_bidi(u),
3194                reference_is_utf16_code_unit_bidi(u)
3195            );
3196        }
3197    }
3198
3199    #[test]
3200    #[cfg_attr(miri, ignore)] // Miri is too slow
3201    fn test_is_str_bidi_thoroughly() {
3202        let mut buf = [0; 4];
3203        for i in 0..0xD800u32 {
3204            let c: char = ::core::char::from_u32(i).unwrap();
3205            assert_eq!(
3206                is_str_bidi(c.encode_utf8(&mut buf[..])),
3207                reference_is_char_bidi(c)
3208            );
3209        }
3210        for i in 0xE000..0x110000u32 {
3211            let c: char = ::core::char::from_u32(i).unwrap();
3212            assert_eq!(
3213                is_str_bidi(c.encode_utf8(&mut buf[..])),
3214                reference_is_char_bidi(c)
3215            );
3216        }
3217    }
3218
3219    #[test]
3220    #[cfg_attr(miri, ignore)] // Miri is too slow
3221    fn test_is_utf8_bidi_thoroughly() {
3222        let mut buf = [0; 8];
3223        for i in 0..0xD800u32 {
3224            let c: char = ::core::char::from_u32(i).unwrap();
3225            let expect = reference_is_char_bidi(c);
3226            {
3227                let len = {
3228                    let bytes = c.encode_utf8(&mut buf[..]).as_bytes();
3229                    assert_eq!(is_utf8_bidi(bytes), expect);
3230                    bytes.len()
3231                };
3232                {
3233                    let tail = &mut buf[len..];
3234                    for b in tail.iter_mut() {
3235                        *b = 0;
3236                    }
3237                }
3238            }
3239            assert_eq!(is_utf8_bidi(&buf[..]), expect);
3240        }
3241        for i in 0xE000..0x110000u32 {
3242            let c: char = ::core::char::from_u32(i).unwrap();
3243            let expect = reference_is_char_bidi(c);
3244            {
3245                let len = {
3246                    let bytes = c.encode_utf8(&mut buf[..]).as_bytes();
3247                    assert_eq!(is_utf8_bidi(bytes), expect);
3248                    bytes.len()
3249                };
3250                {
3251                    let tail = &mut buf[len..];
3252                    for b in tail.iter_mut() {
3253                        *b = 0;
3254                    }
3255                }
3256            }
3257            assert_eq!(is_utf8_bidi(&buf[..]), expect);
3258        }
3259    }
3260
3261    #[test]
3262    #[cfg_attr(miri, ignore)] // Miri is too slow
3263    fn test_is_utf16_bidi_thoroughly() {
3264        let mut buf = [0; 32];
3265        for i in 0..0x10000u32 {
3266            let u = i as u16;
3267            buf[15] = u;
3268            assert_eq!(
3269                is_utf16_bidi(&buf[..]),
3270                reference_is_utf16_code_unit_bidi(u)
3271            );
3272        }
3273    }
3274
3275    #[test]
3276    fn test_is_utf8_bidi_edge_cases() {
3277        assert!(!is_utf8_bidi(b"\xD5\xBF\x61"));
3278        assert!(!is_utf8_bidi(b"\xD6\x80\x61"));
3279        assert!(!is_utf8_bidi(b"abc"));
3280        assert!(is_utf8_bidi(b"\xD5\xBF\xC2"));
3281        assert!(is_utf8_bidi(b"\xD6\x80\xC2"));
3282        assert!(is_utf8_bidi(b"ab\xC2"));
3283    }
3284
3285    #[test]
3286    fn test_decode_latin1() {
3287        match decode_latin1(b"ab") {
3288            Cow::Borrowed(s) => {
3289                assert_eq!(s, "ab");
3290            }
3291            Cow::Owned(_) => {
3292                unreachable!("Should have borrowed");
3293            }
3294        }
3295        assert_eq!(decode_latin1(b"a\xE4"), "a\u{E4}");
3296    }
3297
3298    #[test]
3299    fn test_encode_latin1_lossy() {
3300        match encode_latin1_lossy("ab") {
3301            Cow::Borrowed(s) => {
3302                assert_eq!(s, b"ab");
3303            }
3304            Cow::Owned(_) => {
3305                unreachable!("Should have borrowed");
3306            }
3307        }
3308        assert_eq!(encode_latin1_lossy("a\u{E4}"), &(b"a\xE4")[..]);
3309    }
3310
3311    #[test]
3312    fn test_convert_utf8_to_utf16_without_replacement() {
3313        let mut buf = [0u16; 5];
3314        assert_eq!(
3315            convert_utf8_to_utf16_without_replacement(b"ab", &mut buf[..2]),
3316            Some(2)
3317        );
3318        assert_eq!(buf[0], u16::from(b'a'));
3319        assert_eq!(buf[1], u16::from(b'b'));
3320        assert_eq!(buf[2], 0);
3321        assert_eq!(
3322            convert_utf8_to_utf16_without_replacement(b"\xC3\xA4c", &mut buf[..3]),
3323            Some(2)
3324        );
3325        assert_eq!(buf[0], 0xE4);
3326        assert_eq!(buf[1], u16::from(b'c'));
3327        assert_eq!(buf[2], 0);
3328        assert_eq!(
3329            convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83", &mut buf[..3]),
3330            Some(1)
3331        );
3332        assert_eq!(buf[0], 0x2603);
3333        assert_eq!(buf[1], u16::from(b'c'));
3334        assert_eq!(buf[2], 0);
3335        assert_eq!(
3336            convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83d", &mut buf[..4]),
3337            Some(2)
3338        );
3339        assert_eq!(buf[0], 0x2603);
3340        assert_eq!(buf[1], u16::from(b'd'));
3341        assert_eq!(buf[2], 0);
3342        assert_eq!(
3343            convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83\xC3\xA4", &mut buf[..5]),
3344            Some(2)
3345        );
3346        assert_eq!(buf[0], 0x2603);
3347        assert_eq!(buf[1], 0xE4);
3348        assert_eq!(buf[2], 0);
3349        assert_eq!(
3350            convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8E", &mut buf[..4]),
3351            Some(2)
3352        );
3353        assert_eq!(buf[0], 0xD83D);
3354        assert_eq!(buf[1], 0xDCCE);
3355        assert_eq!(buf[2], 0);
3356        assert_eq!(
3357            convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8Ee", &mut buf[..5]),
3358            Some(3)
3359        );
3360        assert_eq!(buf[0], 0xD83D);
3361        assert_eq!(buf[1], 0xDCCE);
3362        assert_eq!(buf[2], u16::from(b'e'));
3363        assert_eq!(
3364            convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
3365            None
3366        );
3367    }
3368}
encoding_rs/mem.rs

encoding_rs/
mem.rs