1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348
use crate::{
engine::{general_purpose::INVALID_VALUE, DecodeEstimate, DecodePaddingMode},
DecodeError, PAD_BYTE,
};
// decode logic operates on chunks of 8 input bytes without padding
const INPUT_CHUNK_LEN: usize = 8;
const DECODED_CHUNK_LEN: usize = 6;
// we read a u64 and write a u64, but a u64 of input only yields 6 bytes of output, so the last
// 2 bytes of any output u64 should not be counted as written to (but must be available in a
// slice).
const DECODED_CHUNK_SUFFIX: usize = 2;
// how many u64's of input to handle at a time
const CHUNKS_PER_FAST_LOOP_BLOCK: usize = 4;
const INPUT_BLOCK_LEN: usize = CHUNKS_PER_FAST_LOOP_BLOCK * INPUT_CHUNK_LEN;
// includes the trailing 2 bytes for the final u64 write
const DECODED_BLOCK_LEN: usize =
CHUNKS_PER_FAST_LOOP_BLOCK * DECODED_CHUNK_LEN + DECODED_CHUNK_SUFFIX;
#[doc(hidden)]
pub struct GeneralPurposeEstimate {
/// Total number of decode chunks, including a possibly partial last chunk
num_chunks: usize,
decoded_len_estimate: usize,
}
impl GeneralPurposeEstimate {
pub(crate) fn new(encoded_len: usize) -> Self {
Self {
num_chunks: encoded_len
.checked_add(INPUT_CHUNK_LEN - 1)
.expect("Overflow when calculating number of chunks in input")
/ INPUT_CHUNK_LEN,
decoded_len_estimate: encoded_len
.checked_add(3)
.expect("Overflow when calculating decoded len estimate")
/ 4
* 3,
}
}
}
impl DecodeEstimate for GeneralPurposeEstimate {
fn decoded_len_estimate(&self) -> usize {
self.decoded_len_estimate
}
}
/// Helper to avoid duplicating num_chunks calculation, which is costly on short inputs.
/// Returns the number of bytes written, or an error.
// We're on the fragile edge of compiler heuristics here. If this is not inlined, slow. If this is
// inlined(always), a different slow. plain ol' inline makes the benchmarks happiest at the moment,
// but this is fragile and the best setting changes with only minor code modifications.
#[inline]
pub(crate) fn decode_helper(
input: &[u8],
estimate: GeneralPurposeEstimate,
output: &mut [u8],
decode_table: &[u8; 256],
decode_allow_trailing_bits: bool,
padding_mode: DecodePaddingMode,
) -> Result<usize, DecodeError> {
let remainder_len = input.len() % INPUT_CHUNK_LEN;
// Because the fast decode loop writes in groups of 8 bytes (unrolled to
// CHUNKS_PER_FAST_LOOP_BLOCK times 8 bytes, where possible) and outputs 8 bytes at a time (of
// which only 6 are valid data), we need to be sure that we stop using the fast decode loop
// soon enough that there will always be 2 more bytes of valid data written after that loop.
let trailing_bytes_to_skip = match remainder_len {
// if input is a multiple of the chunk size, ignore the last chunk as it may have padding,
// and the fast decode logic cannot handle padding
0 => INPUT_CHUNK_LEN,
// 1 and 5 trailing bytes are illegal: can't decode 6 bits of input into a byte
1 | 5 => {
// trailing whitespace is so common that it's worth it to check the last byte to
// possibly return a better error message
if let Some(b) = input.last() {
if *b != PAD_BYTE && decode_table[*b as usize] == INVALID_VALUE {
return Err(DecodeError::InvalidByte(input.len() - 1, *b));
}
}
return Err(DecodeError::InvalidLength);
}
// This will decode to one output byte, which isn't enough to overwrite the 2 extra bytes
// written by the fast decode loop. So, we have to ignore both these 2 bytes and the
// previous chunk.
2 => INPUT_CHUNK_LEN + 2,
// If this is 3 un-padded chars, then it would actually decode to 2 bytes. However, if this
// is an erroneous 2 chars + 1 pad char that would decode to 1 byte, then it should fail
// with an error, not panic from going past the bounds of the output slice, so we let it
// use stage 3 + 4.
3 => INPUT_CHUNK_LEN + 3,
// This can also decode to one output byte because it may be 2 input chars + 2 padding
// chars, which would decode to 1 byte.
4 => INPUT_CHUNK_LEN + 4,
// Everything else is a legal decode len (given that we don't require padding), and will
// decode to at least 2 bytes of output.
_ => remainder_len,
};
// rounded up to include partial chunks
let mut remaining_chunks = estimate.num_chunks;
let mut input_index = 0;
let mut output_index = 0;
{
let length_of_fast_decode_chunks = input.len().saturating_sub(trailing_bytes_to_skip);
// Fast loop, stage 1
// manual unroll to CHUNKS_PER_FAST_LOOP_BLOCK of u64s to amortize slice bounds checks
if let Some(max_start_index) = length_of_fast_decode_chunks.checked_sub(INPUT_BLOCK_LEN) {
while input_index <= max_start_index {
let input_slice = &input[input_index..(input_index + INPUT_BLOCK_LEN)];
let output_slice = &mut output[output_index..(output_index + DECODED_BLOCK_LEN)];
decode_chunk(
&input_slice[0..],
input_index,
decode_table,
&mut output_slice[0..],
)?;
decode_chunk(
&input_slice[8..],
input_index + 8,
decode_table,
&mut output_slice[6..],
)?;
decode_chunk(
&input_slice[16..],
input_index + 16,
decode_table,
&mut output_slice[12..],
)?;
decode_chunk(
&input_slice[24..],
input_index + 24,
decode_table,
&mut output_slice[18..],
)?;
input_index += INPUT_BLOCK_LEN;
output_index += DECODED_BLOCK_LEN - DECODED_CHUNK_SUFFIX;
remaining_chunks -= CHUNKS_PER_FAST_LOOP_BLOCK;
}
}
// Fast loop, stage 2 (aka still pretty fast loop)
// 8 bytes at a time for whatever we didn't do in stage 1.
if let Some(max_start_index) = length_of_fast_decode_chunks.checked_sub(INPUT_CHUNK_LEN) {
while input_index < max_start_index {
decode_chunk(
&input[input_index..(input_index + INPUT_CHUNK_LEN)],
input_index,
decode_table,
&mut output
[output_index..(output_index + DECODED_CHUNK_LEN + DECODED_CHUNK_SUFFIX)],
)?;
output_index += DECODED_CHUNK_LEN;
input_index += INPUT_CHUNK_LEN;
remaining_chunks -= 1;
}
}
}
// Stage 3
// If input length was such that a chunk had to be deferred until after the fast loop
// because decoding it would have produced 2 trailing bytes that wouldn't then be
// overwritten, we decode that chunk here. This way is slower but doesn't write the 2
// trailing bytes.
// However, we still need to avoid the last chunk (partial or complete) because it could
// have padding, so we always do 1 fewer to avoid the last chunk.
for _ in 1..remaining_chunks {
decode_chunk_precise(
&input[input_index..],
input_index,
decode_table,
&mut output[output_index..(output_index + DECODED_CHUNK_LEN)],
)?;
input_index += INPUT_CHUNK_LEN;
output_index += DECODED_CHUNK_LEN;
}
// always have one more (possibly partial) block of 8 input
debug_assert!(input.len() - input_index > 1 || input.is_empty());
debug_assert!(input.len() - input_index <= 8);
super::decode_suffix::decode_suffix(
input,
input_index,
output,
output_index,
decode_table,
decode_allow_trailing_bits,
padding_mode,
)
}
/// Decode 8 bytes of input into 6 bytes of output. 8 bytes of output will be written, but only the
/// first 6 of those contain meaningful data.
///
/// `input` is the bytes to decode, of which the first 8 bytes will be processed.
/// `index_at_start_of_input` is the offset in the overall input (used for reporting errors
/// accurately)
/// `decode_table` is the lookup table for the particular base64 alphabet.
/// `output` will have its first 8 bytes overwritten, of which only the first 6 are valid decoded
/// data.
// yes, really inline (worth 30-50% speedup)
#[inline(always)]
fn decode_chunk(
input: &[u8],
index_at_start_of_input: usize,
decode_table: &[u8; 256],
output: &mut [u8],
) -> Result<(), DecodeError> {
let morsel = decode_table[input[0] as usize];
if morsel == INVALID_VALUE {
return Err(DecodeError::InvalidByte(index_at_start_of_input, input[0]));
}
let mut accum = (morsel as u64) << 58;
let morsel = decode_table[input[1] as usize];
if morsel == INVALID_VALUE {
return Err(DecodeError::InvalidByte(
index_at_start_of_input + 1,
input[1],
));
}
accum |= (morsel as u64) << 52;
let morsel = decode_table[input[2] as usize];
if morsel == INVALID_VALUE {
return Err(DecodeError::InvalidByte(
index_at_start_of_input + 2,
input[2],
));
}
accum |= (morsel as u64) << 46;
let morsel = decode_table[input[3] as usize];
if morsel == INVALID_VALUE {
return Err(DecodeError::InvalidByte(
index_at_start_of_input + 3,
input[3],
));
}
accum |= (morsel as u64) << 40;
let morsel = decode_table[input[4] as usize];
if morsel == INVALID_VALUE {
return Err(DecodeError::InvalidByte(
index_at_start_of_input + 4,
input[4],
));
}
accum |= (morsel as u64) << 34;
let morsel = decode_table[input[5] as usize];
if morsel == INVALID_VALUE {
return Err(DecodeError::InvalidByte(
index_at_start_of_input + 5,
input[5],
));
}
accum |= (morsel as u64) << 28;
let morsel = decode_table[input[6] as usize];
if morsel == INVALID_VALUE {
return Err(DecodeError::InvalidByte(
index_at_start_of_input + 6,
input[6],
));
}
accum |= (morsel as u64) << 22;
let morsel = decode_table[input[7] as usize];
if morsel == INVALID_VALUE {
return Err(DecodeError::InvalidByte(
index_at_start_of_input + 7,
input[7],
));
}
accum |= (morsel as u64) << 16;
write_u64(output, accum);
Ok(())
}
/// Decode an 8-byte chunk, but only write the 6 bytes actually decoded instead of including 2
/// trailing garbage bytes.
#[inline]
fn decode_chunk_precise(
input: &[u8],
index_at_start_of_input: usize,
decode_table: &[u8; 256],
output: &mut [u8],
) -> Result<(), DecodeError> {
let mut tmp_buf = [0_u8; 8];
decode_chunk(
input,
index_at_start_of_input,
decode_table,
&mut tmp_buf[..],
)?;
output[0..6].copy_from_slice(&tmp_buf[0..6]);
Ok(())
}
#[inline]
fn write_u64(output: &mut [u8], value: u64) {
output[..8].copy_from_slice(&value.to_be_bytes());
}
#[cfg(test)]
mod tests {
use super::*;
use crate::engine::general_purpose::STANDARD;
#[test]
fn decode_chunk_precise_writes_only_6_bytes() {
let input = b"Zm9vYmFy"; // "foobar"
let mut output = [0_u8, 1, 2, 3, 4, 5, 6, 7];
decode_chunk_precise(&input[..], 0, &STANDARD.decode_table, &mut output).unwrap();
assert_eq!(&vec![b'f', b'o', b'o', b'b', b'a', b'r', 6, 7], &output);
}
#[test]
fn decode_chunk_writes_8_bytes() {
let input = b"Zm9vYmFy"; // "foobar"
let mut output = [0_u8, 1, 2, 3, 4, 5, 6, 7];
decode_chunk(&input[..], 0, &STANDARD.decode_table, &mut output).unwrap();
assert_eq!(&vec![b'f', b'o', b'o', b'b', b'a', b'r', 0, 0], &output);
}
}