1 // Copyright 2021 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // This is an implementation based on the s390x
6 // implementation.
7
8 // Find a separator with 2 <= len <= 32 within a string.
9 // Separators with lengths of 2, 3 or 4 are handled
10 // specially.
11
// This works on power8 and above. The loads and
// compares are done in big endian order
// since that allows the use of VCLZD, and allows
// the same implementation to work on big and little
// endian platforms with minimal conditional changes.
17
18 // NOTE: There is a power9 implementation that
19 // improves performance by 10-15% on little
20 // endian for some of the benchmarks, but
21 // work is still needed for a big endian
22 // implementation on power9.
23
24 //go:build ppc64 || ppc64le
25 // +build ppc64 ppc64le
26
27 #include "go_asm.h"
28 #include "textflag.h"
29
// byteswap<> is the 16-byte permute control vector that indexbody<>
// loads into SWAP. VLOADSWAP applies it after each LXVD2X load so that
// the loaded bytes end up in the order needed on POWER8. The pattern
// differs between the big endian (GOARCH_ppc64) and little endian builds.

#ifdef GOARCH_ppc64
DATA byteswap<>+0(SB)/8, $0x0001020304050607
DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
#else
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
#endif

GLOBL byteswap<>+0(SB), RODATA, $16

// VLOADSWAP loads the 16 bytes at addr+off and leaves them in big
// endian order. No alignment check of the address is needed.
// vsr is the VSX register written by the load and vr is the vector
// register permuted in place; callers pass the two names of the same
// register (for example V0 and V0_). SWAP must already hold byteswap<>.
#define VLOADSWAP(addr, off, vr, vsr) \
	LXVD2X (addr)(off), vsr; \
	VPERM vr, vr, SWAP, vr
48
// ·Index loads its arguments into the registers expected by the index
// bodies and tail-branches to one of them. The frame holds a (base at 0,
// length at 8), b (base at 24, length at 32) and the result slot at 48.
// On little endian builds the POWER9 body is used when the CPU feature
// flag is set; in every other case the POWER8 body is used.
TEXT ·Index(SB), NOSPLIT|NOFRAME, $0-56
	MOVD $ret+48(FP), R14  // R14 = address of the result slot
	MOVD b_len+32(FP), R6  // R6 = length of the separator
	MOVD b_base+24(FP), R5 // R5 = pointer to the separator
	MOVD a_len+8(FP), R4   // R4 = length of the byte array
	MOVD a_base+0(FP), R3  // R3 = pointer to the byte array

#ifdef GOARCH_ppc64le
	// Read the HasPOWER9 flag; anything other than 1 selects POWER8.
	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
	CMP R7, $1
	BNE usepower8
	BR indexbodyp9<>(SB)

#endif
usepower8:
	BR indexbody<>(SB)
65
// ·IndexString is the string counterpart of ·Index. It loads its
// arguments into the registers expected by the index bodies and
// tail-branches to one of them. The frame holds a (base at 0, length
// at 8), b (base at 16, length at 24) and the result slot at 32.
//
// On little endian builds the POWER9 body is selected when the
// HasPOWER9 feature flag is set, exactly as ·Index does; otherwise the
// POWER8 body is used.
TEXT ·IndexString(SB), NOSPLIT|NOFRAME, $0-40
	MOVD a_base+0(FP), R3  // R3 = string
	MOVD a_len+8(FP), R4   // R4 = length
	MOVD b_base+16(FP), R5 // R5 = separator pointer
	MOVD b_len+24(FP), R6  // R6 = separator length
	MOVD $ret+32(FP), R14  // R14 = &ret

#ifdef GOARCH_ppc64le
	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
	CMP R7, $1             // HasPOWER9 set?
	BNE power8             // No: use the POWER8 body
	BR indexbodyp9<>(SB)   // Yes: use the POWER9 body

#endif
power8:
	BR indexbody<>(SB)
82
// Register conventions used by the index bodies.
//
// s is the string being searched, sep is the string searched for.
//   R3  = &s[0]
//   R4  = len(s)
//   R5  = &sep[0]
//   R6  = len(sep)
//   R14 = &ret (where the index of the match, or -1, is stored)
//   R7  = working address within s
//   R16 = constant 16
//   R17 = constant 17
//   R18 = constant 18
//   R19 = constant 1
//   R26 = LASTBYTE, R3+R4 (the address just past the end of s)
//   R27 = LASTSTR, LASTBYTE-len(sep) (last start address to compare)
//   R8, R9 = scratch
//   V0  = sep, left justified and zero filled
//   CR4 = result of comparing len(sep) with 16

// General purpose register aliases.
#define LASTBYTE R26
#define LASTSTR R27

// Vector register aliases.
#define SEPMASK V17
#define ONES V20
#define SWAP V21

// VSX register names used where an instruction takes a VSX operand
// for one of the vector registers above or for V0-V9.
#define SWAP_ VS53
#define V0_ VS32
#define V1_ VS33
#define V2_ VS34
#define V3_ VS35
#define V4_ VS36
#define V5_ VS37
#define V6_ VS38
#define V7_ VS39
#define V8_ VS40
#define V9_ VS41
// indexbody<> searches s for the first occurrence of sep using 16 byte
// vector loads that are permuted into big endian order (VLOADSWAP).
//
// Inputs:  R3=&s[0], R4=len(s), R5=&sep[0], R6=len(sep), R14=&ret.
// Output:  the byte index of the first match, or -1 if there is none,
//          is stored through R14.
//
// len(sep) == 0 and len(sep) > len(s) both store -1. Separator lengths
// 2, 3 and 4 have dedicated loops that test 16 start positions per
// iteration; other lengths up to 16 use a masked compare one start
// position at a time; lengths 17..32 use two overlapping 16 byte
// compares. len(sep) > 32 is not implemented and stores to address 0
// to crash.
TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
	CMP R6, R4                // Compare lengths
	BGT notfound              // If sep len is > string, notfound
	ADD R4, R3, LASTBYTE      // LASTBYTE = &s[len(s)], just past the end of s
	SUB R6, LASTBYTE, LASTSTR // LASTSTR = &s[len(s)-len(sep)] (last valid start index)
	CMP R6, $0                // Check sep len
	BEQ notfound              // sep len 0 -- not found
	MOVD R3, R7               // Copy of string addr
	MOVD $16, R16             // Index value 16
	MOVD $17, R17             // Index value 17
	MOVD $18, R18             // Index value 18
	MOVD $1, R19              // Index value 1
	MOVD $byteswap<>+00(SB), R8
	VSPLTISB $0xFF, ONES      // splat all 1s
	LXVD2X (R8)(R0), SWAP_    // Set up swap string used by VLOADSWAP

	CMP R6, $16, CR4          // CR4 for len(sep) >= 16
	VOR ONES, ONES, SEPMASK   // Set up full SEPMASK
	BGE CR4, loadge16         // Load for len(sep) >= 16
	SUB R6, R16, R9           // 16-len of sep
	SLD $3, R9                // Set up for VSLO
	MTVSRD R9, V9_            // Set up for VSLO
	VSLDOI $8, V9, V9, V9     // Set up for VSLO
	VSLO ONES, V9, SEPMASK    // Mask for separator len(sep) < 16

loadge16:
	ANDCC $15, R5, R9         // Find byte offset of sep
	ADD R9, R6, R10           // Add sep len
	CMP R10, $16              // Check if sep len+offset > 16

	// Only take the unaligned load when sep really extends past its
	// 16 byte container. When offset+len is exactly 16 the aligned load
	// below already covers all of sep, while a 16 byte load at R5 would
	// read into the following 16 byte block, which holds no sep bytes
	// and may be on a different page.
	BGT sepcross16            // Sep crosses 16 byte boundary

	RLDICR $0, R5, $59, R8    // Adjust addr to 16 byte container
	VLOADSWAP(R8, R0, V0, V0_) // Load 16 bytes @R8 into V0
	SLD $3, R9                // Set up shift count for VSLO
	MTVSRD R9, V8_            // Set up shift count for VSLO
	VSLDOI $8, V8, V8, V8
	VSLO V0, V8, V0           // Shift by start byte

	VAND V0, SEPMASK, V0      // Mask separator (< 16)
	BR index2plus

sepcross16:
	VLOADSWAP(R5, R0, V0, V0_) // Load 16 bytes @R5 into V0

	VAND V0, SEPMASK, V0      // mask out separator
	BLE CR4, index2to16       // len(sep) <= 16: generic masked compare
	BR index17plus            // Handle sep > 16

index2plus:
	CMP R6, $2                // Check length of sep
	BNE index3plus            // If not 2, check for 3
	ADD $16, R7, R9           // Check if next 16 bytes past last
	CMP R9, LASTBYTE          // compare with last
	BGE index2to16            // 2 <= len(string) <= 16
	MOVD $0xff00, R21         // Mask for later
	MTVSRD R21, V25           // Move to Vreg
	VSPLTH $3, V25, V31       // Splat mask
	VSPLTH $0, V0, V1         // Splat 1st 2 bytes of sep
	VSPLTISB $0, V10          // Clear V10

	// First case: 2 byte separator
	// V1: 2 byte separator splatted
	// V2: 16 bytes at addr
	// V3: 16 bytes at addr+1
	// Compare 2 byte separator at start
	// and at start+1. Use VSEL to combine
	// those results to find the first
	// matching start byte, returning
	// that value when found. Loop as
	// long as len(string) > 16
index2loop2:
	VLOADSWAP(R7, R19, V3, V3_) // Load 16 bytes @R7+1 into V3

index2loop:
	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
	VCMPEQUH V1, V2, V5       // Search for sep
	VCMPEQUH V1, V3, V6       // Search for sep offset by 1
	VSEL V6, V5, V31, V7      // merge even and odd indices
	VCLZD V7, V18             // find index of first match
	MFVSRD V18, R25           // get first value
	CMP R25, $64              // Found if < 64
	BLT foundR25              // Return byte index where found
	VSLDOI $8, V18, V18, V18  // Adjust 2nd value
	MFVSRD V18, R25           // get second value
	CMP R25, $64              // Found if < 64
	ADD $64, R25              // Update byte offset
	BLT foundR25              // Return value
	ADD $16, R7               // R7+=16 Update string pointer
	ADD $17, R7, R9           // R9=R7+17 since loop unrolled
	CMP R9, LASTBYTE          // Compare addr+17 against last byte
	BLT index2loop2           // If < last, continue loop
	CMP R7, LASTBYTE          // Compare addr against last byte
	BLT index2to16            // Bytes remain: handle specially
	VLOADSWAP(R7, R0, V3, V3_) // Load 16 bytes @R7 into V3
	VSLDOI $1, V3, V10, V3    // Shift left by 1 byte
	BR index2loop

index3plus:
	CMP R6, $3                // Check if sep == 3
	BNE index4plus            // If not check larger
	ADD $19, R7, R9           // Find bytes for use in this loop
	CMP R9, LASTBYTE          // Compare against last byte
	BGE index2to16            // Remaining string 2<=len<=16
	MOVD $0xff00, R21         // Set up mask for upcoming loop
	MTVSRD R21, V25           // Move mask to Vreg
	VSPLTH $3, V25, V31       // Splat mask
	VSPLTH $0, V0, V1         // Splat 1st two bytes of sep
	VSPLTB $2, V0, V8         // Splat 3rd byte of sep

	// Loop to process 3 byte separator.
	// string[0:16] is in V2
	// string[16:18] is left justified in V3
	// sep[0:2] splatted in V1
	// sep[2] splatted in V8
	// Build vectors for string, string+1
	// and string+2. Compare string, string+1
	// against first 2 bytes of separator
	// splatted, and string+2 against 3rd
	// byte splatted. Merge the results with
	// VSEL to find the first byte of a match.

	// Special handling for last 16 bytes if the
	// string fits in 16 byte multiple.
index3loop2:
	MOVD $2, R21              // Set up index for 2
	VSPLTISB $0, V10          // Clear V10
	VLOADSWAP(R7, R21, V3, V3_) // Load 16 bytes @R7+2 into V3
	VSLDOI $14, V3, V10, V3   // Left justify next 2 bytes

index3loop:
	VLOADSWAP(R7, R0, V2, V2_) // Load with correct order
	VSLDOI $1, V2, V3, V4     // string[1:17]
	VSLDOI $2, V2, V3, V9     // string[2:18]
	VCMPEQUH V1, V2, V5       // compare hw even indices
	VCMPEQUH V1, V4, V6       // compare hw odd indices
	VCMPEQUB V8, V9, V10      // compare 3rd to last byte
	VSEL V6, V5, V31, V7      // Find 1st matching byte using mask
	VAND V7, V10, V7          // AND matched bytes with matched 3rd byte
	VCLZD V7, V18             // Find first nonzero indexes
	MFVSRD V18, R25           // Move 1st doubleword
	CMP R25, $64              // If < 64 found
	BLT foundR25              // Return matching index
	VSLDOI $8, V18, V18, V18  // Move value
	MFVSRD V18, R25           // Move 2nd doubleword
	CMP R25, $64              // If < 64 found
	ADD $64, R25              // Update byte index
	BLT foundR25              // Return matching index
	ADD $16, R7               // R7+=16 string ptr
	ADD $19, R7, R9           // Number of string bytes for loop
	CMP R9, LASTBYTE          // Compare against last byte of string
	BLT index3loop2           // If within, continue this loop
	CMP R7, LASTSTR           // Compare against last start byte
	BLT index2to16            // Process remainder
	VSPLTISB $0, V3           // Special case for last 16 bytes
	BR index3loop             // Continue this loop

	// Loop to process 4 byte separator
	// string[0:16] in V2
	// string[16:19] left justified in V3
	// sep[0:4] splatted in V1
	// Set up vectors with strings at offsets
	// 0, 1, 2, 3 and compare against the 4 byte
	// separator also splatted. Use VSEL with the
	// compare results to find the first byte where
	// a separator match is found.
index4plus:
	CMP R6, $4                // Check if 4 byte separator
	BNE index5plus            // If not next higher
	ADD $20, R7, R9           // Check string size to load
	CMP R9, LASTBYTE          // Verify string length
	BGE index2to16            // If not large enough, process remaining
	MOVD $2, R15              // Set up index

	// Set up masks for use with VSEL
	MOVD $0xff, R21           // Set up mask 0xff000000ff000000...
	SLD $24, R21
	MTVSRD R21, V10
	VSPLTW $1, V10, V29
	VSLDOI $2, V29, V29, V30  // Mask 0x0000ff000000ff00...
	MOVD $0xffff, R21
	SLD $16, R21
	MTVSRD R21, V10
	VSPLTW $1, V10, V31       // Mask 0xffff0000ffff0000...
	VSPLTW $0, V0, V1         // Splat 1st word of separator

index4loop:
	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2

next4:
	VSPLTISB $0, V10          // Clear
	MOVD $3, R9               // Number of bytes beyond 16
	VLOADSWAP(R7, R9, V3, V3_) // Load 16 bytes @R7+3 into V3
	VSLDOI $13, V3, V10, V3   // Shift left last 3 bytes
	VSLDOI $1, V2, V3, V4     // V4=(V2:V3)<<1
	VSLDOI $2, V2, V3, V9     // V9=(V2:V3)<<2
	VSLDOI $3, V2, V3, V10    // V10=(V2:V3)<<3
	VCMPEQUW V1, V2, V5       // compare index 0, 4, ... with sep
	VCMPEQUW V1, V4, V6       // compare index 1, 5, ... with sep
	VCMPEQUW V1, V9, V11      // compare index 2, 6, ... with sep
	VCMPEQUW V1, V10, V12     // compare index 3, 7, ... with sep
	VSEL V6, V5, V29, V13     // merge index 0, 1, 4, 5, using mask
	VSEL V12, V11, V30, V14   // merge index 2, 3, 6, 7, using mask
	VSEL V14, V13, V31, V7    // final merge
	VCLZD V7, V18             // Find first index for each half
	MFVSRD V18, R25           // Isolate value
	CMP R25, $64              // If < 64, found
	BLT foundR25              // Return found index
	VSLDOI $8, V18, V18, V18  // Move for MFVSRD
	MFVSRD V18, R25           // Isolate other value
	CMP R25, $64              // If < 64, found
	ADD $64, R25              // Update index for high doubleword
	BLT foundR25              // Return found index
	ADD $16, R7               // R7+=16 for next string
	ADD $20, R7, R9           // R7+20 for all bytes to load
	CMP R9, LASTBYTE          // Past end? Maybe check for extra?
	BLT index4loop            // If not, continue loop
	CMP R7, LASTSTR           // Check remainder
	BLE index2to16            // Process remainder
	BR notfound               // Not found

index5plus:
	CMP R6, $16               // Check for sep > 16
	BGT index17plus           // Handle large sep

	// Assumption is that the separator is smaller than the string at this point
index2to16:
	CMP R7, LASTSTR           // Compare last start byte
	BGT notfound              // last takes len(sep) into account

	ADD $16, R7, R9           // Check for last byte of string
	CMP R9, LASTBYTE
	BGT index2to16tail

	// At least 16 bytes of string left
	// Mask the number of bytes in sep
index2to16loop:
	VLOADSWAP(R7, R0, V1, V1_) // Load 16 bytes @R7 into V1

compare:
	VAND V1, SEPMASK, V2      // Mask out sep size
	VCMPEQUBCC V0, V2, V3     // Compare masked string
	BLT CR6, found            // All equal
	ADD $1, R7                // Update ptr to next byte
	CMP R7, LASTSTR           // Still less than last start byte
	BGT notfound              // Not found
	ADD $16, R7, R9           // Verify remaining bytes
	CMP R9, LASTBYTE          // At least 16
	BLT index2to16loop        // Try again

	// 16 or fewer bytes remaining in string
	// Separator >= 2
index2to16tail:
	ADD R3, R4, R9            // End of string
	SUB R7, R9, R9            // Number of bytes left
	ANDCC $15, R7, R10        // 16 byte offset
	ADD R10, R9, R11          // offset + len
	CMP R11, $16              // offset + len <= 16?
	BLE short                 // Does not cross 16 bytes
	VLOADSWAP(R7, R0, V1, V1_) // Load 16 bytes @R7 into V1
	BR index2to16next         // Continue on

short:
	RLDICR $0, R7, $59, R9    // Adjust addr to 16 byte container
	VLOADSWAP(R9, R0, V1, V1_) // Load 16 bytes @R9 into V1
	SLD $3, R10               // Set up shift
	MTVSRD R10, V8_           // Set up shift
	VSLDOI $8, V8, V8, V8
	VSLO V1, V8, V1           // Shift by start byte
	VSPLTISB $0, V25          // Clear for later use

index2to16next:
	VAND V1, SEPMASK, V2      // Just compare size of sep
	VCMPEQUBCC V0, V2, V3     // Compare sep and partial string
	BLT CR6, found            // Found
	ADD $1, R7                // Not found, try next partial string
	CMP R7, LASTSTR           // Check for end of string
	BGT notfound              // If at end, then not found
	VSLDOI $1, V1, V25, V1    // Shift string left by 1 byte
	BR index2to16next         // Check the next partial string

index17plus:
	CMP R6, $32               // Check if 16 < len(sep) <= 32
	BGT index33plus
	SUB $16, R6, R9           // Extra > 16
	SLD $56, R9, R10          // Shift to use in VSLO
	MTVSRD R10, V9_           // Set up for VSLO
	VLOADSWAP(R5, R9, V1, V1_) // Load 16 bytes @R5+R9 into V1
	VSLO V1, V9, V1           // Shift left
	VSPLTISB $0xff, V7        // Splat 1s
	VSPLTISB $0, V27          // Splat 0

index17to32loop:
	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2

next17:
	VLOADSWAP(R7, R9, V3, V3_) // Load 16 bytes @R7+R9 into V3
	VSLO V3, V9, V3           // Shift left
	VCMPEQUB V0, V2, V4       // Compare first 16 bytes
	VCMPEQUB V1, V3, V5       // Compare extra over 16 bytes
	VAND V4, V5, V6           // Check if both equal
	VCMPEQUBCC V6, V7, V8     // All equal?
	BLT CR6, found            // Yes
	ADD $1, R7                // On to next byte
	CMP R7, LASTSTR           // Check if last start byte
	BGT notfound              // If too high, not found
	BR index17to32loop        // Continue

notfound:
	MOVD $-1, R8              // Return -1 if not found
	MOVD R8, (R14)
	RET

index33plus:
	MOVD $0, (R0)             // Case not implemented
	RET                       // Crash before return

	// R25 holds the bit offset of the match within the 16 bytes at R7.
foundR25:
	SRD $3, R25               // Convert from bits to bytes
	ADD R25, R7               // Add to current string address
	SUB R3, R7                // Subtract from start of string
	MOVD R7, (R14)            // Return byte where found
	RET

	// R7 holds the address of the match.
found:
	SUB R3, R7                // Return byte where found
	MOVD R7, (R14)
	RET
442
// indexbodyp9<> is the POWER9 variant of the index search body. It
// follows the same steps as indexbody<> but loads with LXVB16X, so no
// byte swap permute is needed, and uses MFVSRLD and MTVSRWS in place of
// the shift/splat sequences.
//
// Inputs:  R3=&s[0], R4=len(s), R5=&sep[0], R6=len(sep), R14=&ret.
// Output:  the byte index of the first match, or -1 if there is none,
//          is stored through R14.
//
// len(sep) == 0 and len(sep) > len(s) both store -1. Separator lengths
// 2, 3 and 4 have dedicated loops that test 16 start positions per
// iteration; other lengths up to 16 use a masked compare one start
// position at a time; lengths 17..32 use two overlapping 16 byte
// compares. len(sep) > 32 is not implemented and stores to address 0
// to crash.
TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
	CMP R6, R4                // Compare lengths
	BGT notfound              // If sep len is > string, notfound
	ADD R4, R3, LASTBYTE      // LASTBYTE = &s[len(s)], just past the end of s
	SUB R6, LASTBYTE, LASTSTR // LASTSTR = &s[len(s)-len(sep)] (last valid start index)
	CMP R6, $0                // Check sep len
	BEQ notfound              // sep len 0 -- not found
	MOVD R3, R7               // Copy of string addr
	MOVD $16, R16             // Index value 16
	MOVD $17, R17             // Index value 17
	MOVD $18, R18             // Index value 18
	MOVD $1, R19              // Index value 1
	VSPLTISB $0xFF, ONES      // splat all 1s

	CMP R6, $16, CR4          // CR4 for len(sep) >= 16
	VOR ONES, ONES, SEPMASK   // Set up full SEPMASK
	BGE CR4, loadge16         // Load for len(sep) >= 16
	SUB R6, R16, R9           // 16-len of sep
	SLD $3, R9                // Set up for VSLO
	MTVSRD R9, V9_            // Set up for VSLO
	VSLDOI $8, V9, V9, V9     // Set up for VSLO
	VSLO ONES, V9, SEPMASK    // Mask for separator len(sep) < 16

loadge16:
	ANDCC $15, R5, R9         // Find byte offset of sep
	ADD R9, R6, R10           // Add sep len
	CMP R10, $16              // Check if sep len+offset > 16

	// Only take the unaligned load when sep really extends past its
	// 16 byte container. When offset+len is exactly 16 the aligned load
	// below already covers all of sep, while a 16 byte load at R5 would
	// read into the following 16 byte block, which holds no sep bytes
	// and may be on a different page.
	BGT sepcross16            // Sep crosses 16 byte boundary

	RLDICR $0, R5, $59, R8    // Adjust addr to 16 byte container
	LXVB16X (R8)(R0), V0_     // Load 16 bytes @R8 into V0
	SLD $3, R9                // Set up shift count for VSLO
	MTVSRD R9, V8_            // Set up shift count for VSLO
	VSLDOI $8, V8, V8, V8
	VSLO V0, V8, V0           // Shift by start byte

	VAND V0, SEPMASK, V0      // Mask separator (< 16)
	BR index2plus

sepcross16:
	LXVB16X (R5)(R0), V0_     // Load 16 bytes @R5 into V0

	VAND V0, SEPMASK, V0      // mask out separator
	BLE CR4, index2to16       // len(sep) <= 16: generic masked compare
	BR index17plus            // Handle sep > 16

index2plus:
	CMP R6, $2                // Check length of sep
	BNE index3plus            // If not 2, check for 3
	ADD $16, R7, R9           // Check if next 16 bytes past last
	CMP R9, LASTBYTE          // compare with last
	BGE index2to16            // 2 <= len(string) <= 16
	MOVD $0xff00, R21         // Mask for later
	MTVSRD R21, V25           // Move to Vreg
	VSPLTH $3, V25, V31       // Splat mask
	VSPLTH $0, V0, V1         // Splat 1st 2 bytes of sep
	VSPLTISB $0, V10          // Clear V10

	// First case: 2 byte separator
	// V1: 2 byte separator splatted
	// V2: 16 bytes at addr
	// V3: 16 bytes at addr+1
	// Compare 2 byte separator at start
	// and at start+1. Use VSEL to combine
	// those results to find the first
	// matching start byte, returning
	// that value when found. Loop as
	// long as len(string) > 16
index2loop2:
	LXVB16X (R7)(R19), V3_    // Load 16 bytes @R7+1 into V3

index2loop:
	LXVB16X (R7)(R0), V2_     // Load 16 bytes @R7 into V2
	VCMPEQUH V1, V2, V5       // Search for sep
	VCMPEQUH V1, V3, V6       // Search for sep offset by 1
	VSEL V6, V5, V31, V7      // merge even and odd indices
	VCLZD V7, V18             // find index of first match
	MFVSRD V18, R25           // get first value
	CMP R25, $64              // Found if < 64
	BLT foundR25              // Return byte index where found

	MFVSRLD V18, R25          // get second value
	CMP R25, $64              // Found if < 64
	ADD $64, R25              // Update byte offset
	BLT foundR25              // Return value
	ADD $16, R7               // R7+=16 Update string pointer
	ADD $17, R7, R9           // R9=R7+17 since loop unrolled
	CMP R9, LASTBYTE          // Compare addr+17 against last byte
	BLT index2loop2           // If < last, continue loop
	CMP R7, LASTBYTE          // Compare addr against last byte
	BLT index2to16            // Bytes remain: handle specially
	LXVB16X (R7)(R0), V3_     // Load 16 bytes @R7 into V3
	VSLDOI $1, V3, V10, V3    // Shift left by 1 byte
	BR index2loop

index3plus:
	CMP R6, $3                // Check if sep == 3
	BNE index4plus            // If not check larger
	ADD $19, R7, R9           // Find bytes for use in this loop
	CMP R9, LASTBYTE          // Compare against last byte
	BGE index2to16            // Remaining string 2<=len<=16
	MOVD $0xff00, R21         // Set up mask for upcoming loop
	MTVSRD R21, V25           // Move mask to Vreg
	VSPLTH $3, V25, V31       // Splat mask
	VSPLTH $0, V0, V1         // Splat 1st two bytes of sep
	VSPLTB $2, V0, V8         // Splat 3rd byte of sep

	// Loop to process 3 byte separator.
	// string[0:16] is in V2
	// string[16:18] is left justified in V3
	// sep[0:2] splatted in V1
	// sep[2] splatted in V8
	// Build vectors for string, string+1
	// and string+2. Compare string, string+1
	// against first 2 bytes of separator
	// splatted, and string+2 against 3rd
	// byte splatted. Merge the results with
	// VSEL to find the first byte of a match.

	// Special handling for last 16 bytes if the
	// string fits in 16 byte multiple.
index3loop2:
	MOVD $2, R21              // Set up index for 2
	VSPLTISB $0, V10          // Clear V10
	LXVB16X (R7)(R21), V3_    // Load 16 bytes @R7+2 into V3
	VSLDOI $14, V3, V10, V3   // Left justify next 2 bytes

index3loop:
	LXVB16X (R7)(R0), V2_     // Load 16 bytes @R7 into V2
	VSLDOI $1, V2, V3, V4     // string[1:17]
	VSLDOI $2, V2, V3, V9     // string[2:18]
	VCMPEQUH V1, V2, V5       // compare hw even indices
	VCMPEQUH V1, V4, V6       // compare hw odd indices
	VCMPEQUB V8, V9, V10      // compare 3rd to last byte
	VSEL V6, V5, V31, V7      // Find 1st matching byte using mask
	VAND V7, V10, V7          // AND matched bytes with matched 3rd byte
	VCLZD V7, V18             // Find first nonzero indexes
	MFVSRD V18, R25           // Move 1st doubleword
	CMP R25, $64              // If < 64 found
	BLT foundR25              // Return matching index

	MFVSRLD V18, R25          // Move 2nd doubleword
	CMP R25, $64              // If < 64 found
	ADD $64, R25              // Update byte index
	BLT foundR25              // Return matching index
	ADD $16, R7               // R7+=16 string ptr
	ADD $19, R7, R9           // Number of string bytes for loop
	CMP R9, LASTBYTE          // Compare against last byte of string
	BLT index3loop2           // If within, continue this loop
	CMP R7, LASTSTR           // Compare against last start byte
	BLT index2to16            // Process remainder
	VSPLTISB $0, V3           // Special case for last 16 bytes
	BR index3loop             // Continue this loop

	// Loop to process 4 byte separator
	// string[0:16] in V2
	// string[16:19] left justified in V3
	// sep[0:4] splatted in V1
	// Set up vectors with strings at offsets
	// 0, 1, 2, 3 and compare against the 4 byte
	// separator also splatted. Use VSEL with the
	// compare results to find the first byte where
	// a separator match is found.
index4plus:
	CMP R6, $4                // Check if 4 byte separator
	BNE index5plus            // If not next higher
	ADD $20, R7, R9           // Check string size to load
	CMP R9, LASTBYTE          // Verify string length
	BGE index2to16            // If not large enough, process remaining
	MOVD $2, R15              // Set up index

	// Set up masks for use with VSEL
	MOVD $0xff, R21           // Set up mask 0xff000000ff000000...
	SLD $24, R21
	MTVSRWS R21, V29

	VSLDOI $2, V29, V29, V30  // Mask 0x0000ff000000ff00...
	MOVD $0xffff, R21
	SLD $16, R21
	MTVSRWS R21, V31          // Mask 0xffff0000ffff0000...

	VSPLTW $0, V0, V1         // Splat 1st word of separator

index4loop:
	LXVB16X (R7)(R0), V2_     // Load 16 bytes @R7 into V2

next4:
	VSPLTISB $0, V10          // Clear
	MOVD $3, R9               // Number of bytes beyond 16
	LXVB16X (R7)(R9), V3_     // Load 16 bytes @R7+3 into V3
	VSLDOI $13, V3, V10, V3   // Shift left last 3 bytes
	VSLDOI $1, V2, V3, V4     // V4=(V2:V3)<<1
	VSLDOI $2, V2, V3, V9     // V9=(V2:V3)<<2
	VSLDOI $3, V2, V3, V10    // V10=(V2:V3)<<3
	VCMPEQUW V1, V2, V5       // compare index 0, 4, ... with sep
	VCMPEQUW V1, V4, V6       // compare index 1, 5, ... with sep
	VCMPEQUW V1, V9, V11      // compare index 2, 6, ... with sep
	VCMPEQUW V1, V10, V12     // compare index 3, 7, ... with sep
	VSEL V6, V5, V29, V13     // merge index 0, 1, 4, 5, using mask
	VSEL V12, V11, V30, V14   // merge index 2, 3, 6, 7, using mask
	VSEL V14, V13, V31, V7    // final merge
	VCLZD V7, V18             // Find first index for each half
	MFVSRD V18, R25           // Isolate value
	CMP R25, $64              // If < 64, found
	BLT foundR25              // Return found index

	MFVSRLD V18, R25          // Isolate other value
	CMP R25, $64              // If < 64, found
	ADD $64, R25              // Update index for high doubleword
	BLT foundR25              // Return found index
	ADD $16, R7               // R7+=16 for next string
	ADD $20, R7, R9           // R7+20 for all bytes to load
	CMP R9, LASTBYTE          // Past end? Maybe check for extra?
	BLT index4loop            // If not, continue loop
	CMP R7, LASTSTR           // Check remainder
	BLE index2to16            // Process remainder
	BR notfound               // Not found

index5plus:
	CMP R6, $16               // Check for sep > 16
	BGT index17plus           // Handle large sep

	// Assumption is that the separator is smaller than the string at this point
index2to16:
	CMP R7, LASTSTR           // Compare last start byte
	BGT notfound              // last takes len(sep) into account

	ADD $16, R7, R9           // Check for last byte of string
	CMP R9, LASTBYTE
	BGT index2to16tail

	// At least 16 bytes of string left
	// Mask the number of bytes in sep
index2to16loop:
	LXVB16X (R7)(R0), V1_     // Load 16 bytes @R7 into V1

compare:
	VAND V1, SEPMASK, V2      // Mask out sep size
	VCMPEQUBCC V0, V2, V3     // Compare masked string
	BLT CR6, found            // All equal
	ADD $1, R7                // Update ptr to next byte
	CMP R7, LASTSTR           // Still less than last start byte
	BGT notfound              // Not found
	ADD $16, R7, R9           // Verify remaining bytes
	CMP R9, LASTBYTE          // At least 16
	BLT index2to16loop        // Try again

	// 16 or fewer bytes remaining in string
	// Separator >= 2
index2to16tail:
	ADD R3, R4, R9            // End of string
	SUB R7, R9, R9            // Number of bytes left
	ANDCC $15, R7, R10        // 16 byte offset
	ADD R10, R9, R11          // offset + len
	CMP R11, $16              // offset + len <= 16?
	BLE short                 // Does not cross 16 bytes
	LXVB16X (R7)(R0), V1_     // Load 16 bytes @R7 into V1
	BR index2to16next         // Continue on

short:
	RLDICR $0, R7, $59, R9    // Adjust addr to 16 byte container
	LXVB16X (R9)(R0), V1_     // Load 16 bytes @R9 into V1
	SLD $3, R10               // Set up shift
	MTVSRD R10, V8_           // Set up shift
	VSLDOI $8, V8, V8, V8
	VSLO V1, V8, V1           // Shift by start byte
	VSPLTISB $0, V25          // Clear for later use

index2to16next:
	VAND V1, SEPMASK, V2      // Just compare size of sep
	VCMPEQUBCC V0, V2, V3     // Compare sep and partial string
	BLT CR6, found            // Found
	ADD $1, R7                // Not found, try next partial string
	CMP R7, LASTSTR           // Check for end of string
	BGT notfound              // If at end, then not found
	VSLDOI $1, V1, V25, V1    // Shift string left by 1 byte
	BR index2to16next         // Check the next partial string

index17plus:
	CMP R6, $32               // Check if 16 < len(sep) <= 32
	BGT index33plus
	SUB $16, R6, R9           // Extra > 16
	SLD $56, R9, R10          // Shift to use in VSLO
	MTVSRD R10, V9_           // Set up for VSLO
	LXVB16X (R5)(R9), V1_     // Load 16 bytes @R5+R9 into V1
	VSLO V1, V9, V1           // Shift left
	VSPLTISB $0xff, V7        // Splat 1s
	VSPLTISB $0, V27          // Splat 0

index17to32loop:
	LXVB16X (R7)(R0), V2_     // Load 16 bytes @R7 into V2

next17:
	LXVB16X (R7)(R9), V3_     // Load 16 bytes @R7+R9 into V3
	VSLO V3, V9, V3           // Shift left
	VCMPEQUB V0, V2, V4       // Compare first 16 bytes
	VCMPEQUB V1, V3, V5       // Compare extra over 16 bytes
	VAND V4, V5, V6           // Check if both equal
	VCMPEQUBCC V6, V7, V8     // All equal?
	BLT CR6, found            // Yes
	ADD $1, R7                // On to next byte
	CMP R7, LASTSTR           // Check if last start byte
	BGT notfound              // If too high, not found
	BR index17to32loop        // Continue

notfound:
	MOVD $-1, R8              // Return -1 if not found
	MOVD R8, (R14)
	RET

index33plus:
	MOVD $0, (R0)             // Case not implemented
	RET                       // Crash before return

	// R25 holds the bit offset of the match within the 16 bytes at R7.
foundR25:
	SRD $3, R25               // Convert from bits to bytes
	ADD R25, R7               // Add to current string address
	SUB R3, R7                // Subtract from start of string
	MOVD R7, (R14)            // Return byte where found
	RET

	// R7 holds the address of the match.
found:
	SUB R3, R7                // Return byte where found
	MOVD R7, (R14)
	RET
769
View as plain text