-
Notifications
You must be signed in to change notification settings - Fork 6
/
memcmp32.asm
366 lines (321 loc) · 12.8 KB
/
memcmp32.asm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
;************************* memcmp32.asm *************************************
; Author: Agner Fog
; Date created: 2013-10-03
; Last modified: 2013-10-03
; Description:
; Faster version of the standard memcmp function:
;
; int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
;
; Compares two memory blocks of size count.
; The return value is zero if the two memory blocks ptr1 and ptr2 are equal
; The return value is positive if the first differing byte of ptr1 is bigger
; than ptr2 when compared as unsigned bytes.
; The return value is negative if the first differing byte of ptr1 is smaller
; than ptr2 when compared as unsigned bytes.
;
; Overriding standard function memcmp:
; The alias ?OVR_memcmp is changed to _memcmp in the object file if
; it is desired to override the standard library function memcmp.
;
; Optimization:
; Uses XMM registers if SSE2 is available, uses YMM registers if AVX2.
;
; The latest version of this file is available at:
; www.agner.org/optimize/asmexamples.zip
; Copyright (c) 2013 GNU General Public License www.gnu.org/licenses
;******************************************************************************
global _A_memcmp: function ; Function memcmp
global ?OVR_memcmp: function ; ?OVR removed if standard function memcmp overridden
; Direct entries to CPU-specific versions
global _memcmp386: function ; version for old CPUs without SSE
global _memcmpSSE2: function ; SSE2 version
global _memcmpAVX2: function ; AVX2 version
; Imported from instrset32.asm
extern _InstructionSet ; Instruction set for CPU dispatcher
SECTION .text align=16
; extern "C" int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
; Function entry:
; Tail-jumps through the memcmpDispatch pointer (in .data). On the very first
; call that pointer holds memcmpCPUDispatch, which detects the CPU, patches
; the pointer, and continues; all later calls go straight to the chosen version.
_A_memcmp:
?OVR_memcmp:
%IFNDEF POSITIONINDEPENDENT
jmp dword [memcmpDispatch] ; Go to appropriate version, depending on instruction set
%ELSE ; Position-independent code
call get_thunk_edx ; get reference point for position-independent code
RP: ; reference point edx = offset RP
; Make the following instruction with address relative to RP:
jmp dword [edx+memcmpDispatch-RP] ; indirect jump via dispatch pointer, addressed relative to RP
%ENDIF
align 16
;------------------------------------------------------------------------------
; int memcmpAVX2 (const void * ptr1, const void * ptr2, size_t count)
; AVX2 version: compares 32 bytes per iteration with ymm registers.
; ABI: cdecl. After the two pushes below the args are at
;   [esp+12] = ptr1, [esp+16] = ptr2, [esp+20] = count.
; Register roles throughout:
;   esi = ptr1 + count, edi = ptr2 + count (pointers to END of blocks)
;   ecx = negative byte index, counting up towards 0
;   edx = 0FFFFH, mask used to invert the 16-bit pmovmskb results
; Returns in eax: 0 if equal, else (unsigned) byte1 - byte2 at the first
; differing position. Preserves esi/edi via push/pop; clobbers eax,ecx,edx,
; xmm0-2/ymm0-1 and flags. vzeroupper is issued on every exit path that
; may have dirtied the ymm upper state.
;------------------------------------------------------------------------------
_memcmpAVX2: ; AVX2 version. Use ymm register
memcmpAVX2@: ; internal reference
push esi
push edi
mov esi, [esp+12] ; ptr1
mov edi, [esp+16] ; ptr2
mov ecx, [esp+20] ; size
add esi, ecx ; use negative index from end of memory block
add edi, ecx
neg ecx ; ecx = -size
jz A900 ; size == 0: blocks trivially equal
mov edx, 0FFFFH ; 16-bit mask for the xmm tail compares below
cmp ecx, -32
ja A100 ; fewer than 32 bytes total: skip ymm loop (ymm never touched)
A000: ; loop comparing 32 bytes
vmovdqu ymm1, [esi+ecx]
vpcmpeqb ymm0, ymm1, [edi+ecx] ; compare 32 bytes
vpmovmskb eax, ymm0 ; get byte mask
xor eax, -1 ; not eax would not set flags
jnz A700 ; difference found
add ecx, 32
jz A900 ; finished, equal
cmp ecx, -32
jna A000 ; next 32 bytes
vzeroupper ; end ymm state
A100: ; less than 32 bytes left
cmp ecx, -16
ja A200
movdqu xmm1, [esi+ecx]
movdqu xmm2, [edi+ecx]
pcmpeqb xmm1, xmm2 ; compare 16 bytes
pmovmskb eax, xmm1 ; get byte mask
xor eax, edx ; not ax
jnz A701 ; difference found
add ecx, 16
jz A901 ; finished, equal
A200: ; less than 16 bytes left
cmp ecx, -8
ja A300
; compare 8 bytes (movq zeroes the upper half of each xmm, so those
; lanes compare equal and are cleared by the xor with 0FFFFH)
movq xmm1, [esi+ecx]
movq xmm2, [edi+ecx]
pcmpeqb xmm1, xmm2 ; compare 8 bytes
pmovmskb eax, xmm1 ; get byte mask
xor eax, edx ; not ax
jnz A701 ; difference found
add ecx, 8
jz A901
A300: ; less than 8 bytes left
cmp ecx, -4
ja A400
; compare 4 bytes (upper 12 xmm bytes are zero in both, compare equal)
movd xmm1, [esi+ecx]
movd xmm2, [edi+ecx]
pcmpeqb xmm1, xmm2 ; compare 4 bytes
pmovmskb eax, xmm1 ; get byte mask
xor eax, edx ; not ax
jnz A701 ; difference found
add ecx, 4
jz A901
A400: ; less than 4 bytes left
cmp ecx, -2
ja A500
movzx eax, word [esi+ecx]
movzx edx, word [edi+ecx]
sub eax, edx ; nonzero iff one of the two bytes differs
jnz A800 ; difference in byte 0 or 1
add ecx, 2
jz A901
A500: ; less than 2 bytes left
test ecx, ecx
jz A901 ; no bytes left
A600: ; one byte left
movzx eax, byte [esi+ecx]
movzx edx, byte [edi+ecx]
sub eax, edx ; return result
pop edi
pop esi
ret
A700: ; difference found. find position
vzeroupper ; leaving the ymm loop: clean upper state before return
A701:
bsf eax, eax ; index of first differing byte within the chunk
add ecx, eax ; ecx = negative index of the differing byte
movzx eax, byte [esi+ecx]
movzx edx, byte [edi+ecx]
sub eax, edx ; return result
pop edi
pop esi
ret
A800: ; difference in byte 0 or 1
; al is the low-byte difference: neg al sets CF iff al != 0,
; so sbb adds 1 to ecx only when the low bytes were equal
; (i.e. the difference is in byte 1).
neg al
sbb ecx, -1 ; add 1 to ecx if al == 0
movzx eax, byte [esi+ecx]
movzx edx, byte [edi+ecx]
sub eax, edx ; return result
pop edi
pop esi
ret
A900: ; equal
vzeroupper
A901: xor eax, eax ; return 0
pop edi
pop esi
ret
;------------------------------------------------------------------------------
; int memcmpSSE2 (const void * ptr1, const void * ptr2, size_t count)
; SSE2 version: compares 16 bytes per iteration with xmm registers.
; Same calling convention, register roles and return value as the AVX2
; version above: esi/edi point to the END of each block, ecx is a negative
; index counting up to 0, edx = 0FFFFH inverts the pmovmskb results.
;------------------------------------------------------------------------------
_memcmpSSE2: ; SSE2 version. Use xmm register
memcmpSSE2@: ; internal reference
push esi
push edi
mov esi, [esp+12] ; ptr1
mov edi, [esp+16] ; ptr2
mov ecx, [esp+20] ; size
add esi, ecx ; use negative index from end of memory block
add edi, ecx
neg ecx ; ecx = -size
jz S900 ; size == 0: blocks trivially equal
mov edx, 0FFFFH ; 16-bit mask for inverting pmovmskb results
cmp ecx, -16
ja S200 ; fewer than 16 bytes total: skip main loop
S100: ; loop comparing 16 bytes
movdqu xmm1, [esi+ecx]
movdqu xmm2, [edi+ecx]
pcmpeqb xmm1, xmm2 ; compare 16 bytes
pmovmskb eax, xmm1 ; get byte mask
xor eax, edx ; not ax
jnz S700 ; difference found
add ecx, 16
jz S900 ; finished, equal
cmp ecx, -16
jna S100 ; next 16 bytes
S200: ; less than 16 bytes left
cmp ecx, -8
ja S300
; compare 8 bytes (movq zeroes upper xmm half; those lanes compare equal)
movq xmm1, [esi+ecx]
movq xmm2, [edi+ecx]
pcmpeqb xmm1, xmm2 ; compare 8 bytes
pmovmskb eax, xmm1 ; get byte mask
xor eax, edx ; not ax
jnz S700 ; difference found
add ecx, 8
jz S900
S300: ; less than 8 bytes left
cmp ecx, -4
ja S400
; compare 4 bytes (upper 12 xmm bytes are zero in both operands)
movd xmm1, [esi+ecx]
movd xmm2, [edi+ecx]
pcmpeqb xmm1, xmm2 ; compare 4 bytes
pmovmskb eax, xmm1 ; get byte mask
xor eax, edx ; not ax
jnz S700 ; difference found
add ecx, 4
jz S900
S400: ; less than 4 bytes left
cmp ecx, -2
ja S500
movzx eax, word [esi+ecx]
movzx edx, word [edi+ecx]
sub eax, edx ; nonzero iff one of the two bytes differs
jnz S800 ; difference in byte 0 or 1
add ecx, 2
jz S900
S500: ; less than 2 bytes left
test ecx, ecx
jz S900 ; no bytes left
; one byte left
movzx eax, byte [esi+ecx]
movzx edx, byte [edi+ecx]
sub eax, edx ; return result
pop edi
pop esi
ret
S700: ; difference found. find position
bsf eax, eax ; index of first differing byte within the chunk
add ecx, eax ; ecx = negative index of the differing byte
movzx eax, byte [esi+ecx]
movzx edx, byte [edi+ecx]
sub eax, edx ; return result
pop edi
pop esi
ret
S800: ; difference in byte 0 or 1
; neg al sets CF iff al != 0; sbb then bumps ecx by 1 only when the
; low bytes were equal, i.e. the difference is in byte 1.
neg al
sbb ecx, -1 ; add 1 to ecx if al == 0
S820: movzx eax, byte [esi+ecx]
movzx edx, byte [edi+ecx]
sub eax, edx ; return result
pop edi
pop esi
ret
S900: ; equal
xor eax, eax ; return 0
pop edi
pop esi
ret
;------------------------------------------------------------------------------
; int memcmp386 (const void * ptr1, const void * ptr2, size_t count)
; Plain 80386 fallback using repe cmpsd / cmpsb string instructions.
; Same cdecl interface and return-value convention as the versions above.
;------------------------------------------------------------------------------
_memcmp386: ; 80386 version
memcmp386@: ; internal reference
; This is not perfectly optimized because it is unlikely to ever be used
push esi
push edi
mov esi, [esp+12] ; ptr1
mov edi, [esp+16] ; ptr2
mov ecx, [esp+20] ; size
mov edx, ecx ; keep original size for the remainder below
shr ecx, 2 ; size/4 = number of dwords
; Note on flags: if ecx == 0 here, repe cmpsd executes nothing and ZF
; is still set from the shr result, so jnz correctly falls through.
repe cmpsd ; compare dwords
jnz M700
mov ecx, edx
and ecx, 3 ; remainder
M600: repe cmpsb ; compare bytes
je M800 ; equal
movzx eax, byte [esi-1] ; esi, edi point past the differing byte. find difference
movzx edx, byte [edi-1]
sub eax, edx ; calculate return value
pop edi
pop esi
ret
M700: ; dwords differ. search in last 4 bytes
; back up over the differing dword and re-scan it bytewise
mov ecx, 4
sub esi, ecx
sub edi, ecx
jmp M600
M800: ; equal. return zero
xor eax, eax
pop edi
pop esi
ret
; CPU dispatching for memcmp. This is executed only once:
; it queries _InstructionSet, stores the address of the best memcmp
; version into memcmpDispatch, then tail-jumps into that version so the
; first caller's arguments (still on the stack) are handled normally.
memcmpCPUDispatch:
%IFNDEF POSITIONINDEPENDENT
call _InstructionSet ; get supported instruction set
; Point to generic version of memcmp
mov dword [memcmpDispatch], memcmp386@
cmp eax, 4 ; check SSE2
jb Q100
; SSE2 supported
mov dword [memcmpDispatch], memcmpSSE2@
cmp eax, 13 ; check AVX2
jb Q100
; AVX2 supported
mov dword [memcmpDispatch], memcmpAVX2@
Q100: ; Continue in appropriate version of memcmp
jmp dword [memcmpDispatch]
%ELSE ; Position-independent version
; edx still holds the RP reference point set up in _A_memcmp
push edx ; preserve edx across the call
call _InstructionSet
pop edx
; Point to generic version of memcmp
lea ecx, [edx+memcmp386@-RP]
cmp eax, 4 ; check SSE2
jb Q100
; Point to SSE2 version of memcmp
lea ecx, [edx+memcmpSSE2@-RP]
cmp eax, 13 ; check AVX2
jb Q100
; Point to AVX2 version of memcmp
lea ecx, [edx+memcmpAVX2@-RP]
Q100: mov [edx+memcmpDispatch-RP], ecx ; patch dispatch pointer (RP-relative store)
; Continue in appropriate version of memcmp
jmp ecx
get_thunk_edx: ; load caller address into edx for position-independent code
mov edx, [esp] ; return address = offset RP in the caller
ret
%ENDIF
SECTION .data
align 16
; Pointer to appropriate version.
; This initially points to memcmpCPUDispatch. memcmpCPUDispatch will
; change this to the appropriate version of memcmp, so that
; memcmpCPUDispatch is only executed once:
memcmpDispatch DD memcmpCPUDispatch ; dispatch pointer, patched on first call