Voxia OS v0.0.1
Hobby Project Operating System Targeting x86-64
Loading...
Searching...
No Matches
str_simd.asm
Go to the documentation of this file.
;-----------------------------------------------------------------------
; __fast_memset__ -- fill a buffer with one byte value (AVX2 block stores).
; ABI:   register usage follows System V AMD64 argument order
; In:    rdi = destination (no alignment requirement)
;        esi = fill value; only the low byte (sil) is stored
;        rdx = byte count (0 is allowed)
; Out:   rax = original destination pointer (same as C memset; harmless
;              if the C-side prototype returns void)
; Clobb: rcx, rdx, rdi, ymm0, flags
; Needs: AVX2 (vpbroadcastb), only executed when count >= 32
;-----------------------------------------------------------------------
global __fast_memset__
__fast_memset__:
        mov     rax, rdi                ; return value = dst

        mov     rcx, rdx
        shr     rcx, 5                  ; rcx = number of whole 32-byte blocks
        jz      .tail                   ; count < 32 (including 0): no AVX needed

        vmovd   xmm0, esi               ; VEX form: no legacy-SSE/AVX state mixing
        vpbroadcastb ymm0, xmm0         ; ymm0 = fill byte replicated 32 times

align 32
.loop:
        vmovdqu [rdi], ymm0
        add     rdi, 32
        dec     rcx
        jnz     .loop

        vzeroupper                      ; leave upper YMM state clean for SSE callers

.tail:
        and     edx, 31                 ; edx = 0..31 bytes left over
        jz      .done
.tail_loop:
        mov     [rdi], sil
        inc     rdi
        dec     edx
        jnz     .tail_loop
.done:
        ret
29
;-----------------------------------------------------------------------
; __fast_memset_aligned__ -- fill a 32-byte-aligned buffer with one byte
; value using non-temporal (cache-bypassing) AVX stores.
; ABI:   register usage follows System V AMD64 argument order
; In:    rdi = destination; MUST be 32-byte aligned when count >= 32
;              (vmovntdq raises #GP on a misaligned address)
;        esi = fill value; only the low byte (sil) is stored
;        rdx = byte count (0 is allowed)
; Out:   rax = original destination pointer (harmless if the C-side
;              prototype returns void)
; Clobb: rcx, rdx, rdi, ymm0, flags
; Needs: AVX2 (vpbroadcastb), only executed when count >= 32
;-----------------------------------------------------------------------
global __fast_memset_aligned__
__fast_memset_aligned__:
        mov     rax, rdi                ; return value = dst

        mov     rcx, rdx
        shr     rcx, 5                  ; rcx = number of whole 32-byte blocks
        jz      .tail                   ; count < 32 (including 0): plain byte stores

        vmovd   xmm0, esi
        vpbroadcastb ymm0, xmm0         ; ymm0 = fill byte replicated 32 times

align 32
.loop:
        ; No prefetch of the destination: pulling the line into cache would
        ; defeat the purpose of the non-temporal store that overwrites it.
        vmovntdq [rdi], ymm0
        add     rdi, 32
        dec     rcx
        jnz     .loop

        sfence                          ; NT stores are weakly ordered: make them
                                        ; globally visible before any later store
        vzeroupper                      ; leave upper YMM state clean for SSE callers

.tail:
        and     edx, 31                 ; edx = 0..31 bytes left over
        jz      .done
.tail_loop:
        mov     [rdi], sil
        inc     rdi
        dec     edx
        jnz     .tail_loop
.done:
        ret
62
;-----------------------------------------------------------------------
; __fast__memcpy__ -- copy bytes front-to-back, 32 at a time with AVX,
; then byte-by-byte for the remainder.
; ABI:   register usage follows System V AMD64 argument order
; In:    rdi = destination (no alignment requirement)
;        rsi = source      (no alignment requirement)
;        rdx = byte count (0 is allowed)
; Out:   rax = original destination pointer (same as C memcpy; harmless
;              if the C-side prototype returns void)
; Clobb: rcx, rdx, rsi, rdi, ymm0, flags
; Note:  the copy always runs forward, so a destination that overlaps
;        the source at a higher address is not handled.
;-----------------------------------------------------------------------
global __fast__memcpy__
__fast__memcpy__:
        mov     rax, rdi                ; return value = dst

        mov     rcx, rdx
        shr     rcx, 5                  ; rcx = number of whole 32-byte blocks
        jz      .tail                   ; count < 32 (including 0): no AVX needed

align 32
.loop:
        prefetchnta [rsi + 256]         ; hint only; never faults
        vmovdqu ymm0, [rsi]
        vmovdqu [rdi], ymm0
        add     rsi, 32
        add     rdi, 32
        dec     rcx
        jnz     .loop

        vzeroupper                      ; leave upper YMM state clean for SSE callers

.tail:
        and     edx, 31                 ; edx = 0..31 bytes left over
        jz      .done
.tail_loop:
        movzx   ecx, byte [rsi]         ; ecx as scratch: rax holds the return value
        mov     [rdi], cl
        inc     rsi
        inc     rdi
        dec     edx
        jnz     .tail_loop
.done:
        ret
92
;-----------------------------------------------------------------------
; __fast__memcpy_aligned__ -- copy bytes front-to-back using aligned
; 32-byte AVX loads/stores, then byte-by-byte for the remainder.
; ABI:   register usage follows System V AMD64 argument order
; In:    rdi = destination; MUST be 32-byte aligned when count >= 32
;        rsi = source;      MUST be 32-byte aligned when count >= 32
;              (vmovdqa raises #GP on a misaligned address)
;        rdx = byte count (0 is allowed)
; Out:   rax = original destination pointer (harmless if the C-side
;              prototype returns void)
; Clobb: rcx, rdx, rsi, rdi, ymm0, flags
; Note:  the copy always runs forward, so a destination that overlaps
;        the source at a higher address is not handled.
;-----------------------------------------------------------------------
global __fast__memcpy_aligned__
__fast__memcpy_aligned__:
        mov     rax, rdi                ; return value = dst

        mov     rcx, rdx
        shr     rcx, 5                  ; rcx = number of whole 32-byte blocks
        jz      .tail                   ; count < 32 (including 0): no AVX needed

align 32
.loop:
        prefetchnta [rsi + 256]         ; hint only; never faults
        vmovdqa ymm0, [rsi]
        vmovdqa [rdi], ymm0
        add     rsi, 32
        add     rdi, 32
        dec     rcx
        jnz     .loop

        vzeroupper                      ; leave upper YMM state clean for SSE callers

.tail:
        and     edx, 31                 ; edx = 0..31 bytes left over
        jz      .done
.tail_loop:
        movzx   ecx, byte [rsi]         ; ecx as scratch: rax holds the return value
        mov     [rdi], cl
        inc     rsi
        inc     rdi
        dec     edx
        jnz     .tail_loop
.done:
        ret
122
123
;-----------------------------------------------------------------------
; __fast__strncmp__ -- compare at most n bytes of two NUL-terminated
; strings, 32 bytes at a time with AVX2 where that is safe.
; ABI:   register usage follows System V AMD64 argument order
; In:    rdi = s1, rsi = s2, rdx = n (0 is allowed)
; Out:   eax = 0 if the strings are equal up to n bytes or up to the
;              first NUL in s1; otherwise (unsigned char)s1[i] -
;              (unsigned char)s2[i] at the first differing index i
; Clobb: rcx, rdx, rsi, rdi, r8, ymm0-ymm3, flags
;        (no callee-saved register is touched)
; Needs: AVX2 (256-bit vpcmpeqb / vpmovmskb)
;-----------------------------------------------------------------------
global __fast__strncmp__
section .text
__fast__strncmp__:
        test    rdx, rdx
        jz      .equal

        vpxor   xmm3, xmm3, xmm3        ; ymm3 = 0 (a VEX.128 write clears bits 255:128)

.loop32:
        cmp     rdx, 32
        jb      .short

        ; The strings may end long before n bytes. A 32-byte load that starts
        ; within the last 31 bytes of a 4 KiB page would touch the next page,
        ; which may be unmapped, so such blocks are compared bytewise instead.
        mov     eax, edi
        and     eax, 4095               ; offset of s1 inside its page
        cmp     eax, 4096 - 32
        ja      .page_edge
        mov     eax, esi
        and     eax, 4095               ; offset of s2 inside its page
        cmp     eax, 4096 - 32
        ja      .page_edge

        vmovdqu ymm0, [rdi]             ; 32 bytes of s1
        vpcmpeqb ymm1, ymm0, [rsi]      ; 0xFF where s1[i] == s2[i]
        vpcmpeqb ymm2, ymm0, ymm3       ; 0xFF where s1[i] == 0
        vpmovmskb eax, ymm1
        vpmovmskb ecx, ymm2
        not     eax                     ; bit i set = bytes differ at i
        or      eax, ecx                ; ... or s1 has its terminator at i
        jnz     .stop32

        add     rdi, 32
        add     rsi, 32
        sub     rdx, 32
        jmp     .loop32

.stop32:
        ; First index where the comparison ends. If that byte is an equal
        ; NUL in both strings the subtraction yields 0.
        bsf     eax, eax
        movzx   ecx, byte [rsi + rax]
        movzx   eax, byte [rdi + rax]
        sub     eax, ecx
        vzeroupper
        ret

.short:
        ; Fewer than 32 bytes remain (possibly none).
        test    rdx, rdx
        jz      .equal
        mov     r8, rdx                 ; compare exactly the remaining bytes
        jmp     .bytewise
.page_edge:
        mov     r8d, 32                 ; compare one 32-byte chunk bytewise
.bytewise:
        sub     rdx, r8                 ; rdx = bytes still to do after this chunk
.byte_loop:
        movzx   eax, byte [rdi]
        movzx   ecx, byte [rsi]
        cmp     eax, ecx
        jne     .differ
        test    eax, eax
        jz      .equal                  ; both strings ended here
        inc     rdi
        inc     rsi
        dec     r8
        jnz     .byte_loop
        test    rdx, rdx
        jnz     .loop32                 ; more to compare: retry the vector path

.equal:
        xor     eax, eax
        vzeroupper
        ret

.differ:
        sub     eax, ecx                ; both operands already zero-extended
        vzeroupper
        ret
210
;-----------------------------------------------------------------------
; __fast__memchr__ -- find the first occurrence of a byte in a buffer.
; In:    rdi = buffer, esi = byte to find (low byte only), rdx = length
; Out:   rax = address of the first matching byte, or 0 if none
; Clobb: rdx, rdi, ymm0, ymm1, flags
;-----------------------------------------------------------------------
global __fast__memchr__
section .text
__fast__memchr__:
        test    rdx, rdx
        jz      .miss                   ; empty buffer

        movd    xmm0, esi
        vpbroadcastb ymm0, xmm0         ; ymm0 = target byte replicated 32 times
        jmp     .check_block

.block:
        prefetchnta [rdi + 256]
        vpcmpeqb ymm1, ymm0, [rdi]      ; 0xFF in every lane equal to the target
        vpmovmskb eax, ymm1             ; one mask bit per byte
        test    eax, eax
        jnz     .hit_block
        add     rdi, 32
        sub     rdx, 32
.check_block:
        cmp     rdx, 32
        jae     .block                  ; a whole 32-byte block is still available

        ; 0..31 bytes remain: scan them one at a time.
        test    rdx, rdx
        jz      .miss
.scan:
        cmp     [rdi], sil
        je      .hit_byte
        inc     rdi
        dec     rdx
        jnz     .scan

.miss:
        xor     eax, eax                ; not found -> null pointer
        vzeroupper
        ret

.hit_block:
        bsf     eax, eax                ; index of the lowest set mask bit
        add     rax, rdi                ; rax = block start + index
        vzeroupper
        ret

.hit_byte:
        mov     rax, rdi
        vzeroupper
        ret