hashbrown/raw/sse2.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
use super::bitmask::BitMask;
use super::EMPTY;
use core::mem;
use core::num::NonZeroU16;
#[cfg(target_arch = "x86")]
use core::arch::x86;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64 as x86;
pub(crate) type BitMaskWord = u16;
pub(crate) type NonZeroBitMaskWord = NonZeroU16;
pub(crate) const BITMASK_STRIDE: usize = 1;
pub(crate) const BITMASK_MASK: BitMaskWord = 0xffff;
pub(crate) const BITMASK_ITER_MASK: BitMaskWord = !0;
/// Abstraction over a group of control bytes which can be scanned in
/// parallel.
///
/// This implementation uses a 128-bit SSE value.
#[derive(Copy, Clone)]
pub(crate) struct Group(x86::__m128i);
// FIXME: https://github.com/rust-lang/rust-clippy/issues/3859
#[allow(clippy::use_self)]
impl Group {
/// Number of bytes in the group.
pub(crate) const WIDTH: usize = mem::size_of::<Self>();
/// Returns a full group of empty bytes, suitable for use as the initial
/// value for an empty hash table.
///
/// This is guaranteed to be aligned to the group size.
#[inline]
#[allow(clippy::items_after_statements)]
pub(crate) const fn static_empty() -> &'static [u8; Group::WIDTH] {
#[repr(C)]
struct AlignedBytes {
_align: [Group; 0],
bytes: [u8; Group::WIDTH],
}
const ALIGNED_BYTES: AlignedBytes = AlignedBytes {
_align: [],
bytes: [EMPTY; Group::WIDTH],
};
&ALIGNED_BYTES.bytes
}
/// Loads a group of bytes starting at the given address.
#[inline]
#[allow(clippy::cast_ptr_alignment)] // unaligned load
pub(crate) unsafe fn load(ptr: *const u8) -> Self {
Group(x86::_mm_loadu_si128(ptr.cast()))
}
/// Loads a group of bytes starting at the given address, which must be
/// aligned to `mem::align_of::<Group>()`.
#[inline]
#[allow(clippy::cast_ptr_alignment)]
pub(crate) unsafe fn load_aligned(ptr: *const u8) -> Self {
// FIXME: use align_offset once it stabilizes
debug_assert_eq!(ptr as usize & (mem::align_of::<Self>() - 1), 0);
Group(x86::_mm_load_si128(ptr.cast()))
}
/// Stores the group of bytes to the given address, which must be
/// aligned to `mem::align_of::<Group>()`.
#[inline]
#[allow(clippy::cast_ptr_alignment)]
pub(crate) unsafe fn store_aligned(self, ptr: *mut u8) {
// FIXME: use align_offset once it stabilizes
debug_assert_eq!(ptr as usize & (mem::align_of::<Self>() - 1), 0);
x86::_mm_store_si128(ptr.cast(), self.0);
}
/// Returns a `BitMask` indicating all bytes in the group which have
/// the given value.
#[inline]
pub(crate) fn match_byte(self, byte: u8) -> BitMask {
#[allow(
clippy::cast_possible_wrap, // byte: u8 as i8
// byte: i32 as u16
// note: _mm_movemask_epi8 returns a 16-bit mask in a i32, the
// upper 16-bits of the i32 are zeroed:
clippy::cast_sign_loss,
clippy::cast_possible_truncation
)]
unsafe {
let cmp = x86::_mm_cmpeq_epi8(self.0, x86::_mm_set1_epi8(byte as i8));
BitMask(x86::_mm_movemask_epi8(cmp) as u16)
}
}
/// Returns a `BitMask` indicating all bytes in the group which are
/// `EMPTY`.
#[inline]
pub(crate) fn match_empty(self) -> BitMask {
self.match_byte(EMPTY)
}
/// Returns a `BitMask` indicating all bytes in the group which are
/// `EMPTY` or `DELETED`.
#[inline]
pub(crate) fn match_empty_or_deleted(self) -> BitMask {
#[allow(
// byte: i32 as u16
// note: _mm_movemask_epi8 returns a 16-bit mask in a i32, the
// upper 16-bits of the i32 are zeroed:
clippy::cast_sign_loss,
clippy::cast_possible_truncation
)]
unsafe {
// A byte is EMPTY or DELETED iff the high bit is set
BitMask(x86::_mm_movemask_epi8(self.0) as u16)
}
}
/// Returns a `BitMask` indicating all bytes in the group which are full.
#[inline]
pub(crate) fn match_full(&self) -> BitMask {
self.match_empty_or_deleted().invert()
}
/// Performs the following transformation on all bytes in the group:
/// - `EMPTY => EMPTY`
/// - `DELETED => EMPTY`
/// - `FULL => DELETED`
#[inline]
pub(crate) fn convert_special_to_empty_and_full_to_deleted(self) -> Self {
// Map high_bit = 1 (EMPTY or DELETED) to 1111_1111
// and high_bit = 0 (FULL) to 1000_0000
//
// Here's this logic expanded to concrete values:
// let special = 0 > byte = 1111_1111 (true) or 0000_0000 (false)
// 1111_1111 | 1000_0000 = 1111_1111
// 0000_0000 | 1000_0000 = 1000_0000
#[allow(
clippy::cast_possible_wrap, // byte: 0x80_u8 as i8
)]
unsafe {
let zero = x86::_mm_setzero_si128();
let special = x86::_mm_cmpgt_epi8(zero, self.0);
Group(x86::_mm_or_si128(
special,
x86::_mm_set1_epi8(0x80_u8 as i8),
))
}
}
}