8 vpbroadcastb ymm0, xmm0
30global __fast_memset_aligned__
31__fast_memset_aligned__:
37 vpbroadcastb ymm0, xmm0
63global __fast__memcpy__
93global __fast__memcpy_aligned__
94__fast__memcpy_aligned__:
103 prefetchnta [rsi+256]
124global __fast__strncmp__
134 vmovdqu ymm0, [rdi] ; load 32 byte s1
135 vmovdqu ymm1, [rsi] ; load 32 byte s2
137 vpcmpeqb ymm2, ymm0, ymm1 ; compare byte per byte
138 vpmovmskb eax, ymm2 ; mask 32-bit
143 ; null terminator check
144 vpxor ymm3, ymm3, ymm3 ; register zero
145 vpcmpeqb ymm4, ymm0, ymm3
172 ; find the first differing byte within the 32-byte block
187 ; scan for null terminator
211global __fast__memchr__
214 ; rdi = buf, rsi = c (char), rdx = len
219 vpbroadcastb ymm0, xmm0
225 prefetchnta [rdi+256]
226 vmovdqu ymm1, [rdi] ; load 32 byte from buf
228 vpcmpeqb ymm2, ymm1, ymm0 ; compare per byte with target
229 vpmovmskb eax, ymm2 ; bitmask: bit=1 if match
239 ; find the first matching byte within the 32-byte block
240 bsf eax, eax ; bit scan forward