1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
use super::fallback;

// We only use AVX when we can detect at runtime whether it's available, which
// requires std.
#[cfg(feature = "std")]
mod avx;
mod sse2;

/// This macro employs a gcc-like "ifunc" trick whereby upon first calling
/// `memchr` (for example), CPU feature detection will be performed at runtime
/// to determine the best implementation to use. After CPU feature detection
/// is done, we replace `memchr`'s function pointer with the selection. Upon
/// subsequent invocations, the CPU-specific routine is invoked directly, which
/// skips the CPU feature detection and subsequent branch that's required.
///
/// While this typically doesn't matter for rare occurrences or when used on
/// larger haystacks, `memchr` can be called in tight loops where the overhead
/// of this branch can actually add up *and is measurable*. This trick was
/// necessary to bring this implementation up to glibc's speeds for the 'tiny'
/// benchmarks, for example.
///
/// At some point, I expect the Rust ecosystem will get a nice macro for doing
/// exactly this, at which point, we can replace our hand-jammed version of it.
///
/// N.B. The ifunc strategy does prevent function inlining of course, but
/// on modern CPUs, you'll probably end up with the AVX2 implementation,
/// which probably can't be inlined anyway---unless you've compiled your
/// entire program with AVX2 enabled. However, even then, the various memchr
/// implementations aren't exactly small, so inlining might not help anyway!
///
/// # Arguments
///
/// * `$fnty` - the function pointer type that the stored raw pointer is
///   transmuted to before it is called.
/// * `$name` - the routine to dispatch to; it is resolved as `avx::$name`,
///   `sse2::$name` or `fallback::$name`.
/// * `$haystack` - identifier of the `&[u8]` passed as the final argument.
/// * `$needle` - one or more identifiers of the `u8` needle bytes, passed
///   ahead of the haystack in the order given.
///
/// The expansion is a block expression whose value is the result of calling
/// the selected routine.
///
/// # Safety
///
/// Callers must ensure that `$fnty` is a function pointer type. Since the
/// stored pointer is transmuted to `$fnty` and then called, `$fnty` must also
/// describe the signature of the `$name` routines being dispatched to.
#[cfg(feature = "std")]
macro_rules! unsafe_ifunc {
    ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{
        use std::{mem, sync::atomic::{AtomicPtr, Ordering}};

        // Type-erased form of a function pointer, so that it can be kept in
        // an `AtomicPtr`.
        type FnRaw = *mut ();

        // Holds the routine to call. It starts out pointing at `detect`,
        // which overwrites it with the selected routine the first time it
        // runs. This static is declared inside the expansion, so every
        // function that invokes the macro gets its own slot.
        static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw);

        // First-call path: pick an implementation, cache it in `FN` and then
        // forward the current call to it.
        fn detect($($needle: u8),+, haystack: &[u8]) -> Option<usize> {
            // AVX2 is gated on both a cfg flag and a runtime CPU check, while
            // SSE2 is gated on its cfg flag alone. Anything else gets the
            // fallback routine.
            // NOTE(review): the `memchr_runtime_*` cfgs are not set in this
            // file; presumably a build script emits them -- confirm.
            let fun =
                if cfg!(memchr_runtime_avx) && is_x86_feature_detected!("avx2") {
                    avx::$name as FnRaw
                } else if cfg!(memchr_runtime_sse2) {
                    sse2::$name as FnRaw
                } else {
                    fallback::$name as FnRaw
                };
            // Publish the choice so later calls skip detection entirely.
            // NOTE(review): `Relaxed` looks sufficient because a racing
            // reader observes either `detect` or an already selected routine,
            // both of which are valid to call -- confirm.
            FN.store(fun as FnRaw, Ordering::Relaxed);
            // SAFETY: By virtue of the caller contract, $fnty is a function
            // pointer, which is always safe to transmute with a *mut ().
            // Also, if `fun` is the AVX routine, then it is guaranteed to be
            // supported since we checked the avx2 feature.
            unsafe {
                mem::transmute::<FnRaw, $fnty>(fun)($($needle),+, haystack)
            }
        }

        // SAFETY: By virtue of the caller contract, $fnty is a function
        // pointer, which is always safe to transmute with a *mut (). Also, if
        // `fun` is the AVX routine, then it is guaranteed to be supported
        // since we checked the avx2 feature.
        unsafe {
            // Either `detect` (nothing selected yet) or the routine that a
            // previous call to `detect` stored.
            let fun = FN.load(Ordering::Relaxed);
            mem::transmute::<FnRaw, $fnty>(fun)($($needle),+, $haystack)
        }
    }}
}

/// When std isn't available to provide runtime CPU feature detection, or if
/// runtime CPU feature detection has been explicitly disabled, then just
/// call our optimized SSE2 routine directly. SSE2 is available on all x86_64
/// targets, so no CPU feature detection is necessary.
///
/// This definition takes the same arguments as the std-enabled one so that
/// call sites do not need to change. `$fnty` is accepted but never used in
/// the expansion, since the routine is called by name rather than through a
/// function pointer.
///
/// # Safety
///
/// There are no safety requirements for this definition of the macro. It is
/// safe for all inputs since it is restricted to either the fallback routine
/// or the SSE routine, which is always safe to call on x86_64.
#[cfg(not(feature = "std"))]
macro_rules! unsafe_ifunc {
    ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{
        if cfg!(memchr_runtime_sse2) {
            // SAFETY: As stated in the macro documentation, SSE2 is available
            // on all x86_64 targets, so the SSE2 routine can always be called.
            // NOTE(review): this presumes `memchr_runtime_sse2` is only set
            // for x86_64 builds -- confirm where that cfg is emitted.
            unsafe { sse2::$name($($needle),+, $haystack) }
        } else {
            fallback::$name($($needle),+, $haystack)
        }
    }}
}

/// Runs the `memchr` search for the needle byte `n1` over `haystack`.
///
/// This is a thin dispatcher: `unsafe_ifunc!` decides which module's `memchr`
/// routine does the work, and that routine's result is returned unchanged.
///
/// NOTE(review): presumably the result is the index of the first byte equal
/// to `n1`, or `None` when there is no such byte. The search itself lives in
/// the backend modules, so confirm against them.
#[inline(always)]
pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    // Signature of the backend routine first, then its name and arguments.
    unsafe_ifunc!(
        fn(u8, &[u8]) -> Option<usize>,
        memchr, haystack, n1
    )
}

/// Runs the `memchr2` search for the needle bytes `n1` and `n2` over
/// `haystack`.
///
/// This is a thin dispatcher: `unsafe_ifunc!` decides which module's
/// `memchr2` routine does the work, and that routine's result is returned
/// unchanged.
///
/// NOTE(review): presumably the result is the index of the first byte equal
/// to either needle, or `None` when there is no such byte. The search itself
/// lives in the backend modules, so confirm against them.
#[inline(always)]
pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    // Signature of the backend routine first, then its name and arguments.
    unsafe_ifunc!(
        fn(u8, u8, &[u8]) -> Option<usize>,
        memchr2, haystack, n1, n2
    )
}

/// Runs the `memchr3` search for the needle bytes `n1`, `n2` and `n3` over
/// `haystack`.
///
/// This is a thin dispatcher: `unsafe_ifunc!` decides which module's
/// `memchr3` routine does the work, and that routine's result is returned
/// unchanged.
///
/// NOTE(review): presumably the result is the index of the first byte equal
/// to any of the three needles, or `None` when there is no such byte. The
/// search itself lives in the backend modules, so confirm against them.
#[inline(always)]
pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
    // Signature of the backend routine first, then its name and arguments.
    unsafe_ifunc!(
        fn(u8, u8, u8, &[u8]) -> Option<usize>,
        memchr3, haystack, n1, n2, n3
    )
}

/// Runs the `memrchr` search for the needle byte `n1` over `haystack`.
///
/// This is a thin dispatcher: `unsafe_ifunc!` decides which module's
/// `memrchr` routine does the work, and that routine's result is returned
/// unchanged.
///
/// NOTE(review): presumably the result is the index of the last byte equal
/// to `n1` (a reverse search), or `None` when there is no such byte. The
/// search itself lives in the backend modules, so confirm against them.
#[inline(always)]
pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    // Signature of the backend routine first, then its name and arguments.
    unsafe_ifunc!(
        fn(u8, &[u8]) -> Option<usize>,
        memrchr, haystack, n1
    )
}

/// Runs the `memrchr2` search for the needle bytes `n1` and `n2` over
/// `haystack`.
///
/// This is a thin dispatcher: `unsafe_ifunc!` decides which module's
/// `memrchr2` routine does the work, and that routine's result is returned
/// unchanged.
///
/// NOTE(review): presumably the result is the index of the last byte equal
/// to either needle (a reverse search), or `None` when there is no such
/// byte. The search itself lives in the backend modules, so confirm against
/// them.
#[inline(always)]
pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    // Signature of the backend routine first, then its name and arguments.
    unsafe_ifunc!(
        fn(u8, u8, &[u8]) -> Option<usize>,
        memrchr2, haystack, n1, n2
    )
}

/// Runs the `memrchr3` search for the needle bytes `n1`, `n2` and `n3` over
/// `haystack`.
///
/// This is a thin dispatcher: `unsafe_ifunc!` decides which module's
/// `memrchr3` routine does the work, and that routine's result is returned
/// unchanged.
///
/// NOTE(review): presumably the result is the index of the last byte equal
/// to any of the three needles (a reverse search), or `None` when there is
/// no such byte. The search itself lives in the backend modules, so confirm
/// against them.
#[inline(always)]
pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
    // Signature of the backend routine first, then its name and arguments.
    unsafe_ifunc!(
        fn(u8, u8, u8, &[u8]) -> Option<usize>,
        memrchr3, haystack, n1, n2, n3
    )
}