encoding_rs/
single_byte.rs

1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::*;
11use crate::ascii::*;
12use crate::data::position;
13use crate::handles::*;
14use crate::variant::*;
15
/// Decoder state for a single-byte encoding.
///
/// The decode methods pass bytes below 0x80 through as ASCII and translate
/// bytes 0x80..=0xFF through `table`.
pub struct SingleByteDecoder {
    /// Lookup table for the upper half of the byte range: entry `b - 0x80`
    /// holds the UTF-16 code unit for byte `b`. A zero entry is treated by
    /// the decode methods as "unmapped" and reported as malformed input.
    table: &'static [u16; 128],
}
19
20impl SingleByteDecoder {
21    pub fn new(data: &'static [u16; 128]) -> VariantDecoder {
22        VariantDecoder::SingleByte(SingleByteDecoder { table: data })
23    }
24
25    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
26        Some(byte_length)
27    }
28
29    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
30        byte_length.checked_mul(3)
31    }
32
33    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
34        byte_length.checked_mul(3)
35    }
36
37    pub fn decode_to_utf8_raw(
38        &mut self,
39        src: &[u8],
40        dst: &mut [u8],
41        _last: bool,
42    ) -> (DecoderResult, usize, usize) {
43        let mut source = ByteSource::new(src);
44        let mut dest = Utf8Destination::new(dst);
45        'outermost: loop {
46            match dest.copy_ascii_from_check_space_bmp(&mut source) {
47                CopyAsciiResult::Stop(ret) => return ret,
48                CopyAsciiResult::GoOn((mut non_ascii, mut handle)) => 'middle: loop {
49                    // Start non-boilerplate
50                    //
51                    // Since the non-ASCIIness of `non_ascii` is hidden from
52                    // the optimizer, it can't figure out that it's OK to
53                    // statically omit the bound check when accessing
54                    // `[u16; 128]` with an index
55                    // `non_ascii as usize - 0x80usize`.
56                    //
57                    // Safety: `non_ascii` is a u8 byte >=0x80, from the invariants
58                    // on Utf8Destination::copy_ascii_from_check_space_bmp()
59                    let mapped =
60                        unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
61                    // let mapped = self.table[non_ascii as usize - 0x80usize];
62                    if mapped == 0u16 {
63                        return (
64                            DecoderResult::Malformed(1, 0),
65                            source.consumed(),
66                            handle.written(),
67                        );
68                    }
69                    let dest_again = handle.write_bmp_excl_ascii(mapped);
70                    // End non-boilerplate
71                    match source.check_available() {
72                        Space::Full(src_consumed) => {
73                            return (
74                                DecoderResult::InputEmpty,
75                                src_consumed,
76                                dest_again.written(),
77                            );
78                        }
79                        Space::Available(source_handle) => {
80                            match dest_again.check_space_bmp() {
81                                Space::Full(dst_written) => {
82                                    return (
83                                        DecoderResult::OutputFull,
84                                        source_handle.consumed(),
85                                        dst_written,
86                                    );
87                                }
88                                Space::Available(mut destination_handle) => {
89                                    let (mut b, unread_handle) = source_handle.read();
90                                    let source_again = unread_handle.commit();
91                                    'innermost: loop {
92                                        if b > 127 {
93                                            non_ascii = b;
94                                            handle = destination_handle;
95                                            continue 'middle;
96                                        }
97                                        // Testing on Haswell says that we should write the
98                                        // byte unconditionally instead of trying to unread it
99                                        // to make it part of the next SIMD stride.
100                                        let dest_again_again = destination_handle.write_ascii(b);
101                                        if b < 60 {
102                                            // We've got punctuation
103                                            match source_again.check_available() {
104                                                Space::Full(src_consumed_again) => {
105                                                    return (
106                                                        DecoderResult::InputEmpty,
107                                                        src_consumed_again,
108                                                        dest_again_again.written(),
109                                                    );
110                                                }
111                                                Space::Available(source_handle_again) => {
112                                                    match dest_again_again.check_space_bmp() {
113                                                        Space::Full(dst_written_again) => {
114                                                            return (
115                                                                DecoderResult::OutputFull,
116                                                                source_handle_again.consumed(),
117                                                                dst_written_again,
118                                                            );
119                                                        }
120                                                        Space::Available(
121                                                            destination_handle_again,
122                                                        ) => {
123                                                            let (b_again, _unread_handle_again) =
124                                                                source_handle_again.read();
125                                                            b = b_again;
126                                                            destination_handle =
127                                                                destination_handle_again;
128                                                            continue 'innermost;
129                                                        }
130                                                    }
131                                                }
132                                            }
133                                        }
134                                        // We've got markup or ASCII text
135                                        continue 'outermost;
136                                    }
137                                }
138                            }
139                        }
140                    }
141                },
142            }
143        }
144    }
145
146    pub fn decode_to_utf16_raw(
147        &mut self,
148        src: &[u8],
149        dst: &mut [u16],
150        _last: bool,
151    ) -> (DecoderResult, usize, usize) {
152        let (pending, length) = if dst.len() < src.len() {
153            (DecoderResult::OutputFull, dst.len())
154        } else {
155            (DecoderResult::InputEmpty, src.len())
156        };
157        // Safety invariant: converted <= length. Quite often we have `converted < length`
158        // which will be separately marked.
159        let mut converted = 0usize;
160        'outermost: loop {
161            match unsafe {
162                // Safety: length is the minimum length, `src/dst + x` will always be valid for reads/writes of `len - x`
163                ascii_to_basic_latin(
164                    src.as_ptr().add(converted),
165                    dst.as_mut_ptr().add(converted),
166                    length - converted,
167                )
168            } {
169                None => {
170                    return (pending, length, length);
171                }
172                Some((mut non_ascii, consumed)) => {
173                    // Safety invariant: `converted <= length` upheld, since this can only consume
174                    // up to `length - converted` bytes.
175                    //
176                    // Furthermore, in this context,
177                    // we can assume `converted < length` since this branch is only ever hit when
178                    // ascii_to_basic_latin fails to consume the entire slice
179                    converted += consumed;
180                    'middle: loop {
181                        // `converted` doesn't count the reading of `non_ascii` yet.
182                        // Since the non-ASCIIness of `non_ascii` is hidden from
183                        // the optimizer, it can't figure out that it's OK to
184                        // statically omit the bound check when accessing
185                        // `[u16; 128]` with an index
186                        // `non_ascii as usize - 0x80usize`.
187                        //
188                        // Safety: We can rely on `non_ascii` being between `0x80` and `0xFF` due to
189                        // the invariants of `ascii_to_basic_latin()`, and our table has enough space for that.
190                        let mapped =
191                            unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
192                        // let mapped = self.table[non_ascii as usize - 0x80usize];
193                        if mapped == 0u16 {
194                            return (
195                                DecoderResult::Malformed(1, 0),
196                                converted + 1, // +1 `for non_ascii`
197                                converted,
198                            );
199                        }
200                        unsafe {
201                            // Safety: As mentioned above, `converted < length`
202                            *(dst.get_unchecked_mut(converted)) = mapped;
203                        }
204                        // Safety: `converted <= length` upheld, since `converted < length` before this
205                        converted += 1;
206                        // Next, handle ASCII punctuation and non-ASCII without
207                        // going back to ASCII acceleration. Non-ASCII scripts
208                        // use ASCII punctuation, so this avoid going to
209                        // acceleration just for punctuation/space and then
210                        // failing. This is a significant boost to non-ASCII
211                        // scripts.
212                        // TODO: Split out Latin converters without this part
213                        // this stuff makes Latin script-conversion slower.
214                        if converted == length {
215                            return (pending, length, length);
216                        }
217                        // Safety: We are back to `converted < length` because of the == above
218                        // and can perform this check.
219                        let mut b = unsafe { *(src.get_unchecked(converted)) };
220                        // Safety: `converted < length` is upheld for this loop
221                        'innermost: loop {
222                            if b > 127 {
223                                non_ascii = b;
224                                continue 'middle;
225                            }
226                            // Testing on Haswell says that we should write the
227                            // byte unconditionally instead of trying to unread it
228                            // to make it part of the next SIMD stride.
229                            unsafe {
230                                // Safety: `converted < length` is true for this loop
231                                *(dst.get_unchecked_mut(converted)) = u16::from(b);
232                            }
233                            // Safety: We are now at `converted <= length`. We should *not* `continue`
234                            // the loop without reverifying
235                            converted += 1;
236                            if b < 60 {
237                                // We've got punctuation
238                                if converted == length {
239                                    return (pending, length, length);
240                                }
241                                // Safety: we're back to `converted <= length` because of the == above
242                                b = unsafe { *(src.get_unchecked(converted)) };
243                                // Safety: The loop continues as `converted < length`
244                                continue 'innermost;
245                            }
246                            // We've got markup or ASCII text
247                            continue 'outermost;
248                        }
249                    }
250                }
251            }
252        }
253    }
254
255    pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> usize {
256        let mut bytes = buffer;
257        let mut total = 0;
258        loop {
259            if let Some((non_ascii, offset)) = validate_ascii(bytes) {
260                total += offset;
261                // Safety: We can rely on `non_ascii` being between `0x80` and `0xFF` due to
262                // the invariants of `ascii_to_basic_latin()`, and our table has enough space for that.
263                let mapped = unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
264                if mapped != u16::from(non_ascii) {
265                    return total;
266                }
267                total += 1;
268                bytes = &bytes[offset + 1..];
269            } else {
270                return total;
271            }
272        }
273    }
274}
275
/// Encoder state for a single-byte encoding.
///
/// Besides the decode table, it stores one "run": a range of consecutive
/// code units that map to consecutive bytes, which `encode_u16` resolves by
/// arithmetic before falling back to searching the table.
pub struct SingleByteEncoder {
    /// The 128-entry table for bytes 0x80..=0xFF; entry `i` is the code unit
    /// for byte `0x80 + i`. Encoding searches it in reverse direction
    /// (code unit -> index).
    table: &'static [u16; 128],
    /// First code unit of the run.
    run_bmp_offset: usize,
    /// Table index (byte value minus 0x80) at which the run starts.
    run_byte_offset: usize,
    /// Number of consecutive code units / bytes in the run.
    run_length: usize,
}
282
283impl SingleByteEncoder {
284    pub fn new(
285        encoding: &'static Encoding,
286        data: &'static [u16; 128],
287        run_bmp_offset: u16,
288        run_byte_offset: u8,
289        run_length: u8,
290    ) -> Encoder {
291        Encoder::new(
292            encoding,
293            VariantEncoder::SingleByte(SingleByteEncoder {
294                table: data,
295                run_bmp_offset: run_bmp_offset as usize,
296                run_byte_offset: run_byte_offset as usize,
297                run_length: run_length as usize,
298            }),
299        )
300    }
301
302    pub fn max_buffer_length_from_utf16_without_replacement(
303        &self,
304        u16_length: usize,
305    ) -> Option<usize> {
306        Some(u16_length)
307    }
308
309    pub fn max_buffer_length_from_utf8_without_replacement(
310        &self,
311        byte_length: usize,
312    ) -> Option<usize> {
313        Some(byte_length)
314    }
315
316    #[inline(always)]
317    fn encode_u16(&self, code_unit: u16) -> Option<u8> {
318        // First, we see if the code unit falls into a run of consecutive
319        // code units that can be mapped by offset. This is very efficient
320        // for most non-Latin encodings as well as Latin1-ish encodings.
321        //
322        // For encodings that don't fit this pattern, the run (which may
323        // have the length of just one) just establishes the starting point
324        // for the next rule.
325        //
326        // Next, we do a forward linear search in the part of the index
327        // after the run. Even in non-Latin1-ish Latin encodings (except
328        // macintosh), the lower case letters are here.
329        //
330        // Next, we search the third quadrant up to the start of the run
331        // (upper case letters in Latin encodings except macintosh, in
332        // Greek and in KOI encodings) and then the second quadrant,
333        // except if the run stared before the third quadrant, we search
334        // the second quadrant up to the run.
335        //
336        // Last, we search the first quadrant, which has unused controls
337        // or punctuation in most encodings. This is bad for macintosh
338        // and IBM866, but those are rare.
339
340        // Run of consecutive units
341        let unit_as_usize = code_unit as usize;
342        let offset = unit_as_usize.wrapping_sub(self.run_bmp_offset);
343        if offset < self.run_length {
344            return Some((128 + self.run_byte_offset + offset) as u8);
345        }
346
347        // Search after the run
348        let tail_start = self.run_byte_offset + self.run_length;
349        if let Some(pos) = position(&self.table[tail_start..], code_unit) {
350            return Some((128 + tail_start + pos) as u8);
351        }
352
353        if self.run_byte_offset >= 64 {
354            // Search third quadrant before the run
355            if let Some(pos) = position(&self.table[64..self.run_byte_offset], code_unit) {
356                return Some(((128 + 64) + pos) as u8);
357            }
358
359            // Search second quadrant
360            if let Some(pos) = position(&self.table[32..64], code_unit) {
361                return Some(((128 + 32) + pos) as u8);
362            }
363        } else if let Some(pos) = position(&self.table[32..self.run_byte_offset], code_unit) {
364            // windows-1252, windows-874, ISO-8859-15 and ISO-8859-5
365            // Search second quadrant before the run
366            return Some(((128 + 32) + pos) as u8);
367        }
368
369        // Search first quadrant
370        if let Some(pos) = position(&self.table[..32], code_unit) {
371            return Some((128 + pos) as u8);
372        }
373
374        None
375    }
376
    // NOTE(review): the macro is defined elsewhere in the crate; judging by
    // the identifiers passed below it generates the `encode_from_utf8_raw`
    // method, reading `str` input through `Utf8Source` — confirm against the
    // macro definition.
    //
    // The block is the step run for a non-ASCII BMP code unit bound to
    // `bmp`: encode it with `encode_u16` and write the resulting byte, or
    // return an unmappable result together with the amounts consumed from
    // `source` and written through `handle`.
    ascii_compatible_bmp_encoder_function!(
        {
            match self.encode_u16(bmp) {
                Some(byte) => handle.write_one(byte),
                None => {
                    return (
                        EncoderResult::unmappable_from_bmp(bmp),
                        source.consumed(),
                        handle.written(),
                    );
                }
            }
        },
        bmp,
        self,
        source,
        handle,
        copy_ascii_to_check_space_one,
        check_space_one,
        encode_from_utf8_raw,
        str,
        Utf8Source,
        true
    );
401
402    pub fn encode_from_utf16_raw(
403        &mut self,
404        src: &[u16],
405        dst: &mut [u8],
406        _last: bool,
407    ) -> (EncoderResult, usize, usize) {
408        let (pending, length) = if dst.len() < src.len() {
409            (EncoderResult::OutputFull, dst.len())
410        } else {
411            (EncoderResult::InputEmpty, src.len())
412        };
413        // Safety invariant: converted <= length. Quite often we have `converted < length`
414        // which will be separately marked.
415        let mut converted = 0usize;
416        'outermost: loop {
417            match unsafe {
418                // Safety: length is the minimum length, `src/dst + x` will always be valid for reads/writes of `len - x`
419                basic_latin_to_ascii(
420                    src.as_ptr().add(converted),
421                    dst.as_mut_ptr().add(converted),
422                    length - converted,
423                )
424            } {
425                None => {
426                    return (pending, length, length);
427                }
428                Some((mut non_ascii, consumed)) => {
429                    // Safety invariant: `converted <= length` upheld, since this can only consume
430                    // up to `length - converted` bytes.
431                    //
432                    // Furthermore, in this context,
433                    // we can assume `converted < length` since this branch is only ever hit when
434                    // ascii_to_basic_latin fails to consume the entire slice
435                    converted += consumed;
436                    'middle: loop {
437                        // `converted` doesn't count the reading of `non_ascii` yet.
438                        match self.encode_u16(non_ascii) {
439                            Some(byte) => {
440                                unsafe {
441                                    // Safety: we're allowed this access since `converted < length`
442                                    *(dst.get_unchecked_mut(converted)) = byte;
443                                }
444                                converted += 1;
445                                // `converted <= length` now
446                            }
447                            None => {
448                                // At this point, we need to know if we
449                                // have a surrogate.
450                                let high_bits = non_ascii & 0xFC00u16;
451                                if high_bits == 0xD800u16 {
452                                    // high surrogate
453                                    if converted + 1 == length {
454                                        // End of buffer. This surrogate is unpaired.
455                                        return (
456                                            EncoderResult::Unmappable('\u{FFFD}'),
457                                            converted + 1, // +1 `for non_ascii`
458                                            converted,
459                                        );
460                                    }
461                                    // Safety: convered < length from outside the match, and `converted + 1 != length`,
462                                    // So `converted + 1 < length` as well. We're in bounds
463                                    let second =
464                                        u32::from(unsafe { *src.get_unchecked(converted + 1) });
465                                    if second & 0xFC00u32 != 0xDC00u32 {
466                                        return (
467                                            EncoderResult::Unmappable('\u{FFFD}'),
468                                            converted + 1, // +1 `for non_ascii`
469                                            converted,
470                                        );
471                                    }
472                                    // The next code unit is a low surrogate.
473                                    let astral: char = unsafe {
474                                        // Safety: We can rely on non_ascii being 0xD800-0xDBFF since the high bits are 0xD800
475                                        // Then, (non_ascii << 10 - 0xD800 << 10) becomes between (0 to 0x3FF) << 10, which is between
476                                        // 0x400 to 0xffc00. Adding the 0x10000 gives a range of 0x10400 to 0x10fc00. Subtracting the 0xDC00
477                                        // gives 0x2800 to 0x102000
478                                        // The second term is between 0xDC00 and 0xDFFF from the check above. This gives a maximum
479                                        // possible range of (0x10400 + 0xDC00) to (0x102000 + 0xDFFF) which is 0x1E000 to 0x10ffff.
480                                        // This is in range.
481                                        //
482                                        // From a Unicode principles perspective this can also be verified as we have checked that `non_ascii` is a high surrogate
483                                        // (0xD800..=0xDBFF), and that `second` is a low surrogate (`0xDC00..=0xDFFF`), and we are applying reverse of the UTC16 transformation
484                                        // algorithm <https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF>, by applying the high surrogate - 0xD800 to the
485                                        // high ten bits, and the low surrogate - 0xDc00 to the low ten bits, and then adding 0x10000
486                                        ::core::char::from_u32_unchecked(
487                                            (u32::from(non_ascii) << 10) + second
488                                                - (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32),
489                                        )
490                                    };
491                                    return (
492                                        EncoderResult::Unmappable(astral),
493                                        converted + 2, // +2 `for non_ascii` and `second`
494                                        converted,
495                                    );
496                                }
497                                if high_bits == 0xDC00u16 {
498                                    // Unpaired low surrogate
499                                    return (
500                                        EncoderResult::Unmappable('\u{FFFD}'),
501                                        converted + 1, // +1 `for non_ascii`
502                                        converted,
503                                    );
504                                }
505                                return (
506                                    EncoderResult::unmappable_from_bmp(non_ascii),
507                                    converted + 1, // +1 `for non_ascii`
508                                    converted,
509                                );
510                                // Safety: This branch diverges, so no need to uphold invariants on `converted`
511                            }
512                        }
513                        // Next, handle ASCII punctuation and non-ASCII without
514                        // going back to ASCII acceleration. Non-ASCII scripts
515                        // use ASCII punctuation, so this avoid going to
516                        // acceleration just for punctuation/space and then
517                        // failing. This is a significant boost to non-ASCII
518                        // scripts.
519                        // TODO: Split out Latin converters without this part
520                        // this stuff makes Latin script-conversion slower.
521                        if converted == length {
522                            return (pending, length, length);
523                        }
524                        // Safety: we're back to `converted < length` due to the == above and can perform
525                        // the unchecked read
526                        let mut unit = unsafe { *(src.get_unchecked(converted)) };
527                        'innermost: loop {
528                            // Safety: This loop always begins with `converted < length`, see
529                            // the invariant outside and the comment on the continue below
530                            if unit > 127 {
531                                non_ascii = unit;
532                                continue 'middle;
533                            }
534                            // Testing on Haswell says that we should write the
535                            // byte unconditionally instead of trying to unread it
536                            // to make it part of the next SIMD stride.
537                            unsafe {
538                                // Safety: Can rely on converted < length
539                                *(dst.get_unchecked_mut(converted)) = unit as u8;
540                            }
541                            converted += 1;
542                            // `converted <= length` here
543                            if unit < 60 {
544                                // We've got punctuation
545                                if converted == length {
546                                    return (pending, length, length);
547                                }
548                                // Safety: `converted < length` due to the == above. The read is safe.
549                                unit = unsafe { *(src.get_unchecked(converted)) };
550                                // Safety: This only happens if `converted < length`, maintaining it
551                                continue 'innermost;
552                            }
553                            // We've got markup or ASCII text
554                            continue 'outermost;
555                            // Safety: All other routes to here diverge so the continue is the only
556                            // way to run the innermost loop.
557                        }
558                    }
559                }
560            }
561        }
562    }
563}
564
565// Any copyright to the test code below this comment is dedicated to the
566// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
567
568#[cfg(all(test, feature = "alloc"))]
569mod tests {
570    use super::super::testing::*;
571    use super::super::*;
572
573    #[test]
574    fn test_windows_1255_ca() {
575        decode(WINDOWS_1255, b"\xCA", "\u{05BA}");
576        encode(WINDOWS_1255, "\u{05BA}", b"\xCA");
577    }
578
579    #[test]
580    fn test_ascii_punctuation() {
581        let bytes = b"\xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4. \xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4.";
582        let characters = "\u{0391}\u{03C5}\u{03C4}\u{03CC} \
583                          \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
584                          \u{03C4}\u{03B5}\u{03C3}\u{03C4}. \u{0391}\u{03C5}\u{03C4}\u{03CC} \
585                          \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
586                          \u{03C4}\u{03B5}\u{03C3}\u{03C4}.";
587        decode(WINDOWS_1253, bytes, characters);
588        encode(WINDOWS_1253, characters, bytes);
589    }
590
591    #[test]
592    fn test_decode_malformed() {
593        decode(
594            WINDOWS_1253,
595            b"\xC1\xF5\xD2\xF4\xFC",
596            "\u{0391}\u{03C5}\u{FFFD}\u{03C4}\u{03CC}",
597        );
598    }
599
600    #[test]
601    fn test_encode_unmappables() {
602        encode(
603            WINDOWS_1253,
604            "\u{0391}\u{03C5}\u{2603}\u{03C4}\u{03CC}",
605            b"\xC1\xF5&#9731;\xF4\xFC",
606        );
607        encode(
608            WINDOWS_1253,
609            "\u{0391}\u{03C5}\u{1F4A9}\u{03C4}\u{03CC}",
610            b"\xC1\xF5&#128169;\xF4\xFC",
611        );
612    }
613
/// Encodes UTF-16 input containing a lone surrogate and expects `&#65533;`
/// at its position. Cases, in order: a lone low surrogate mid-input, a lone
/// high surrogate mid-input, and a lone high surrogate at the very end.
#[test]
fn test_encode_unpaired_surrogates() {
    let cases: [(&[u16], &[u8]); 3] = [
        (
            &[0x0391u16, 0x03C5u16, 0xDCA9u16, 0x03C4u16, 0x03CCu16],
            b"\xC1\xF5&#65533;\xF4\xFC",
        ),
        (
            &[0x0391u16, 0x03C5u16, 0xD83Du16, 0x03C4u16, 0x03CCu16],
            b"\xC1\xF5&#65533;\xF4\xFC",
        ),
        (
            &[0x0391u16, 0x03C5u16, 0x03C4u16, 0x03CCu16, 0xD83Du16],
            b"\xC1\xF5\xF4\xFC&#65533;",
        ),
    ];
    for &(units, expected) in cases.iter() {
        encode_from_utf16(WINDOWS_1253, units, expected);
    }
}
632
/// Every byte value from 0x80 through 0xFF, in ascending order.
///
/// Entry `i` holds the byte `0x80 + i`, so the array lines up index-for-index
/// with a 128-entry single-byte table. The rows below are grouped by high
/// nibble, which is why rustfmt is told to leave the layout alone.
#[rustfmt::skip]
pub const HIGH_BYTES: &[u8; 128] = &[
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
    0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
];
644
645    fn decode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
646        let mut with_replacement = [0u16; 128];
647        let mut it = data.iter().enumerate();
648        loop {
649            match it.next() {
650                Some((i, code_point)) => {
651                    if *code_point == 0 {
652                        with_replacement[i] = 0xFFFD;
653                    } else {
654                        with_replacement[i] = *code_point;
655                    }
656                }
657                None => {
658                    break;
659                }
660            }
661        }
662
663        decode_to_utf16(encoding, HIGH_BYTES, &with_replacement[..]);
664    }
665
666    fn encode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
667        let mut with_zeros = [0u8; 128];
668        let mut it = data.iter().enumerate();
669        loop {
670            match it.next() {
671                Some((i, code_point)) => {
672                    if *code_point == 0 {
673                        with_zeros[i] = 0;
674                    } else {
675                        with_zeros[i] = HIGH_BYTES[i];
676                    }
677                }
678                None => {
679                    break;
680                }
681            }
682        }
683
684        encode_from_utf16(encoding, data, &with_zeros[..]);
685    }
686
/// Feeds two consecutive low surrogates to a windows-1253 encoder in a single
/// final call and expects one `&#65533;` per surrogate, with both code units
/// consumed and the error flag set.
#[test]
fn test_single_byte_from_two_low_surrogates() {
    let input = [0xDC00u16, 0xDEDEu16];
    let expected = b"&#65533;&#65533;";
    // 40 bytes is more than the 16 bytes of expected output.
    let mut buffer = [0u8; 40];

    let (result, read, written, had_errors) = WINDOWS_1253
        .new_encoder()
        .encode_from_utf16(&input, &mut buffer[..], true);

    assert_eq!(result, CoderResult::InputEmpty);
    assert_eq!(read, 2);
    assert_eq!(written, expected.len());
    assert!(had_errors);
    assert_eq!(&buffer[..written], expected);
}
700
701    // These tests are so self-referential that they are pretty useless.
702
703    // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
704    // Instead, please regenerate using generate-encoding-data.py
705
706    #[test]
707    fn test_single_byte_decode() {
        // Runs the `decode_single_byte` helper once per single-byte encoding,
        // pairing each encoding constant with its table in
        // `data::SINGLE_BYTE_DATA`. This function is part of the generated
        // region: change generate-encoding-data.py rather than this list.
708        decode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
709        decode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
710        if cfg!(miri) {
711            // Miri is too slow to run every table, so under Miri only the
            // two encodings above are checked.
712            return;
713        }
714        decode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
715        decode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
716        decode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
717        decode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
718        decode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
719        decode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
720        decode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
721        decode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
722        decode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
723        decode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
724        decode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
725        decode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
726        decode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
727        decode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
728        decode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
729        decode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
730        decode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
731        decode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
732        decode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
733        decode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
734        decode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
735        decode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
736        decode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
737        decode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
738        decode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
739    }
740
741    #[test]
742    fn test_single_byte_encode() {
        // Runs the `encode_single_byte` helper once per single-byte encoding,
        // pairing each encoding constant with its table in
        // `data::SINGLE_BYTE_DATA`. This function is part of the generated
        // region: change generate-encoding-data.py rather than this list.
743        encode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
744        encode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
745        if cfg!(miri) {
746            // Miri is too slow to run every table, so under Miri only the
            // two encodings above are checked.
747            return;
748        }
749        encode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
750        encode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
751        encode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
752        encode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
753        encode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
754        encode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
755        encode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
756        encode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
757        encode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
758        encode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
759        encode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
760        encode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
761        encode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
762        encode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
763        encode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
764        encode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
765        encode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
766        encode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
767        encode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
768        encode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
769        encode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
770        encode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
771        encode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
772        encode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
773        encode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
774    }
775    // END GENERATED CODE
776}
encoding_rs/single_byte.rs

encoding_rs/
single_byte.rs