1#![allow(clippy::many_single_char_names, unsafe_op_in_unsafe_fn)]
4
5#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
6compile_error!("x86-sha backend can be used only on x86 and x86_64 target arches");
7
8#[cfg(target_arch = "x86")]
9use core::arch::x86::*;
10#[cfg(target_arch = "x86_64")]
11use core::arch::x86_64::*;
12
13#[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
14unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
15 let t1 = _mm_sha256msg1_epu32(v0, v1);
16 let t2 = _mm_alignr_epi8(v3, v2, 4);
17 let t3 = _mm_add_epi32(t1, t2);
18 _mm_sha256msg2_epu32(t3, v3)
19}
20
21macro_rules! rounds4 {
22 ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{
23 let k = crate::consts::K32X4[$i];
24 let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32);
25 let t1 = _mm_add_epi32($rest, kv);
26 $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1);
27 let t2 = _mm_shuffle_epi32(t1, 0x0E);
28 $abef = _mm_sha256rnds2_epu32($abef, $cdgh, t2);
29 }};
30}
31
32macro_rules! schedule_rounds4 {
33 (
34 $abef:ident, $cdgh:ident,
35 $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr,
36 $i: expr
37 ) => {{
38 $w4 = schedule($w0, $w1, $w2, $w3);
39 rounds4!($abef, $cdgh, $w4, $i);
40 }};
41}
42
43#[allow(clippy::cast_ptr_alignment)]
45#[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
46pub(super) unsafe fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
47 #[allow(non_snake_case)]
48 let MASK: __m128i = _mm_set_epi64x(
49 0x0C0D_0E0F_0809_0A0Bu64 as i64,
50 0x0405_0607_0001_0203u64 as i64,
51 );
52
53 let state_ptr: *const __m128i = state.as_ptr().cast();
54 let dcba = _mm_loadu_si128(state_ptr.add(0));
55 let hgfe = _mm_loadu_si128(state_ptr.add(1));
56
57 let cdab = _mm_shuffle_epi32(dcba, 0xB1);
58 let efgh = _mm_shuffle_epi32(hgfe, 0x1B);
59 let mut abef = _mm_alignr_epi8(cdab, efgh, 8);
60 let mut cdgh = _mm_blend_epi16(efgh, cdab, 0xF0);
61
62 for block in blocks {
63 let abef_save = abef;
64 let cdgh_save = cdgh;
65
66 let block_ptr: *const __m128i = block.as_ptr().cast();
67 let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(0)), MASK);
68 let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(1)), MASK);
69 let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(2)), MASK);
70 let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(3)), MASK);
71 let mut w4;
72
73 rounds4!(abef, cdgh, w0, 0);
74 rounds4!(abef, cdgh, w1, 1);
75 rounds4!(abef, cdgh, w2, 2);
76 rounds4!(abef, cdgh, w3, 3);
77 schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4);
78 schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5);
79 schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6);
80 schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7);
81 schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8);
82 schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9);
83 schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10);
84 schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11);
85 schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12);
86 schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13);
87 schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14);
88 schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15);
89
90 abef = _mm_add_epi32(abef, abef_save);
91 cdgh = _mm_add_epi32(cdgh, cdgh_save);
92 }
93
94 let feba = _mm_shuffle_epi32(abef, 0x1B);
95 let dchg = _mm_shuffle_epi32(cdgh, 0xB1);
96 let dcba = _mm_blend_epi16(feba, dchg, 0xF0);
97 let hgef = _mm_alignr_epi8(dchg, feba, 8);
98
99 let state_ptr_mut: *mut __m128i = state.as_mut_ptr().cast();
100 _mm_storeu_si128(state_ptr_mut.add(0), dcba);
101 _mm_storeu_si128(state_ptr_mut.add(1), hgef);
102}