sha2/sha256/x86_sha.rs

1//! SHA-256 `x86`/`x86_64` backend
2
3#![allow(clippy::many_single_char_names, unsafe_op_in_unsafe_fn)]
4
5#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
6compile_error!("x86-sha backend can be used only on x86 and x86_64 target arches");
7
8#[cfg(target_arch = "x86")]
9use core::arch::x86::*;
10#[cfg(target_arch = "x86_64")]
11use core::arch::x86_64::*;
12
/// Computes the next four message-schedule words `W[i..i+4]` from the
/// previous sixteen (held as four `__m128i` vectors of four words each).
///
/// Implements the SHA-256 message schedule via the dedicated SHA-NI
/// intrinsics: `sha256msg1` applies the sigma0 term, `sha256msg2` the
/// sigma1 term; the `alignr` supplies the `W[i-7]` addend lane-aligned.
#[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
    // Partial schedule: W[t-16] + sigma0(W[t-15]) for the four lanes.
    let t1 = _mm_sha256msg1_epu32(v0, v1);
    // Concatenate v3:v2 and shift right 4 bytes -> the W[t-7] words.
    let t2 = _mm_alignr_epi8(v3, v2, 4);
    let t3 = _mm_add_epi32(t1, t2);
    // Finish with the sigma1(W[t-2]) term to produce the new four words.
    _mm_sha256msg2_epu32(t3, v3)
}
20
// Performs four SHA-256 rounds using two SHA256RNDS2 instructions (each does
// two rounds). `$abef`/`$cdgh` hold the working state in the packed
// (A,B,E,F)/(C,D,G,H) layout the instruction requires; `$rest` holds the
// four message-schedule words for these rounds, and `$i` selects the
// matching group of round constants from `K32X4`.
macro_rules! rounds4 {
    ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{
        let k = crate::consts::K32X4[$i];
        // `_mm_set_epi32` places its first argument in the highest lane, so
        // the constants end up in the same lane order as the W words.
        let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32);
        // W + K for all four rounds at once.
        let t1 = _mm_add_epi32($rest, kv);
        // Rounds 1-2 consume the low two W+K lanes.
        $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1);
        // Shuffle 0x0E moves the high two W+K lanes down for rounds 3-4.
        let t2 = _mm_shuffle_epi32(t1, 0x0E);
        $abef = _mm_sha256rnds2_epu32($abef, $cdgh, t2);
    }};
}
31
// Extends the message schedule by four words (stored into `$w4`) from the
// previous sixteen words in `$w0..$w3`, then runs the corresponding four
// rounds with round-constant group `$i`. Callers rotate the five `w`
// registers between invocations to keep the sliding 16-word window.
macro_rules! schedule_rounds4 {
    (
        $abef:ident, $cdgh:ident,
        $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr,
        $i: expr
    ) => {{
        $w4 = schedule($w0, $w1, $w2, $w3);
        rounds4!($abef, $cdgh, $w4, $i);
    }};
}
42
/// SHA-256 compression function over one or more 64-byte blocks, using the
/// x86 SHA extension. `state` is the eight 32-bit hash words (a..h) in
/// standard order; it is updated in place.
///
/// # Safety
/// The caller must ensure the CPU supports the `sha`, `sse2`, `ssse3`, and
/// `sse4.1` target features.
// we use unaligned loads with `__m128i` pointers
#[allow(clippy::cast_ptr_alignment)]
#[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
pub(super) unsafe fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
    // PSHUFB mask that byte-swaps each 32-bit lane: converts the big-endian
    // message words of the input block to native little-endian lanes.
    #[allow(non_snake_case)]
    let MASK: __m128i = _mm_set_epi64x(
        0x0C0D_0E0F_0809_0A0Bu64 as i64,
        0x0405_0607_0001_0203u64 as i64,
    );

    // Load the state as two vectors; lane names read highest-to-lowest
    // (e.g. `dcba` = [d, c, b, a]).
    let state_ptr: *const __m128i = state.as_ptr().cast();
    let dcba = _mm_loadu_si128(state_ptr.add(0));
    let hgfe = _mm_loadu_si128(state_ptr.add(1));

    // Rearrange into the (A,B,E,F)/(C,D,G,H) packing that SHA256RNDS2
    // expects: swap word pairs / reverse, then recombine halves.
    let cdab = _mm_shuffle_epi32(dcba, 0xB1);
    let efgh = _mm_shuffle_epi32(hgfe, 0x1B);
    let mut abef = _mm_alignr_epi8(cdab, efgh, 8);
    let mut cdgh = _mm_blend_epi16(efgh, cdab, 0xF0);

    for block in blocks {
        // Save the incoming state for the final feed-forward addition.
        let abef_save = abef;
        let cdgh_save = cdgh;

        // Load the 64-byte block as four vectors of four message words,
        // byte-swapped from big-endian via MASK.
        let block_ptr: *const __m128i = block.as_ptr().cast();
        let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(0)), MASK);
        let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(1)), MASK);
        let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(2)), MASK);
        let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(3)), MASK);
        let mut w4;

        // 64 rounds = 16 groups of 4. The first four groups use the raw
        // message words; the rest extend the schedule on the fly, rotating
        // w0..w4 so the newest words always land in the last argument.
        rounds4!(abef, cdgh, w0, 0);
        rounds4!(abef, cdgh, w1, 1);
        rounds4!(abef, cdgh, w2, 2);
        rounds4!(abef, cdgh, w3, 3);
        schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4);
        schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5);
        schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6);
        schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7);
        schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8);
        schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9);
        schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10);
        schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11);
        schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12);
        schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13);
        schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14);
        schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15);

        // Feed-forward: add the block's starting state (Davies-Meyer).
        abef = _mm_add_epi32(abef, abef_save);
        cdgh = _mm_add_epi32(cdgh, cdgh_save);
    }

    // Invert the initial packing to restore standard a..h word order.
    let feba = _mm_shuffle_epi32(abef, 0x1B);
    let dchg = _mm_shuffle_epi32(cdgh, 0xB1);
    let dcba = _mm_blend_epi16(feba, dchg, 0xF0);
    let hgef = _mm_alignr_epi8(dchg, feba, 8);

    let state_ptr_mut: *mut __m128i = state.as_mut_ptr().cast();
    _mm_storeu_si128(state_ptr_mut.add(0), dcba);
    _mm_storeu_si128(state_ptr_mut.add(1), hgef);
}