Black Lives Matter. Support the Equal Justice Initiative.

Text file src/runtime/memclr_amd64.s

Documentation: runtime

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !plan9
     6  // +build !plan9
     7  
     8  #include "go_asm.h"
     9  #include "textflag.h"
    10  
    11  // See memclrNoHeapPointers Go doc for important implementation constraints.
    12  
    13  // func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
    14  // ABIInternal for performance.
    15  TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
        	// Register roles throughout this function:
        	//   DI  = ptr (destination), BX = n (byte count remaining)
        	//   AX  = 0, source for the 1/2/4/8-byte stores
        	//   X15 = 0, source for the 16-byte MOVOU stores (see note below)
        	//   Y0  = 0, source for the 32-byte AVX2 stores
    16  #ifdef GOEXPERIMENT_regabiargs
    17  	// AX = ptr
    18  	// BX = n
    19  	MOVQ	AX, DI	// DI = ptr
    20  #else
    21  	MOVQ	ptr+0(FP), DI
    22  	MOVQ	n+8(FP), BX
    23  #endif
    24  	XORQ	AX, AX
        	// AX = 0: store source for all the sub-16-byte cases below.
    25  
    26  	// MOVOU seems always faster than REP STOSQ.
    27  tail:
    28  	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
        	// Size dispatch. Each _X case clears the buffer with stores
        	// placed from both ends: one run starting at ptr and one ending
        	// exactly at ptr+n, overlapping when n is not a multiple of the
        	// store width. This avoids any byte-granularity tail loop.
    29  	TESTQ	BX, BX
    30  	JEQ	_0
    31  	CMPQ	BX, $2
    32  	JBE	_1or2
    33  	CMPQ	BX, $4
    34  	JBE	_3or4
    35  	CMPQ	BX, $8
    36  	JB	_5through7
    37  	JE	_8
    38  	CMPQ	BX, $16
    39  	JBE	_9through16
        	// All cases from here on use X15 as a 16-byte zero source.
        	// Under GOEXPERIMENT_regabig the ABI keeps X15 == 0 on entry to
        	// ABIInternal code, so it is cleared explicitly only when that
        	// experiment is off.
    40  #ifndef GOEXPERIMENT_regabig
    41  	PXOR	X15, X15
    42  #endif
    43  	CMPQ	BX, $32
    44  	JBE	_17through32
    45  	CMPQ	BX, $64
    46  	JBE	_33through64
    47  	CMPQ	BX, $128
    48  	JBE	_65through128
    49  	CMPQ	BX, $256
    50  	JBE	_129through256
        	// n > 256 from here on: prefer the AVX2 paths when the CPU
        	// supports them, otherwise fall through to the SSE loop.
    51  	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
    52  	JE loop_preheader_avx2
    53  	// TODO: for really big clears, use MOVNTDQ, even without AVX2.
    54  
        // SSE path: clear 256 bytes per iteration while n >= 256, then
        // jump back to tail to dispatch on the remaining 0..255 bytes.
    55  loop:
    56  	MOVOU	X15, 0(DI)
    57  	MOVOU	X15, 16(DI)
    58  	MOVOU	X15, 32(DI)
    59  	MOVOU	X15, 48(DI)
    60  	MOVOU	X15, 64(DI)
    61  	MOVOU	X15, 80(DI)
    62  	MOVOU	X15, 96(DI)
    63  	MOVOU	X15, 112(DI)
    64  	MOVOU	X15, 128(DI)
    65  	MOVOU	X15, 144(DI)
    66  	MOVOU	X15, 160(DI)
    67  	MOVOU	X15, 176(DI)
    68  	MOVOU	X15, 192(DI)
    69  	MOVOU	X15, 208(DI)
    70  	MOVOU	X15, 224(DI)
    71  	MOVOU	X15, 240(DI)
    72  	SUBQ	$256, BX
    73  	ADDQ	$256, DI
    74  	CMPQ	BX, $256
    75  	JAE	loop
    76  	JMP	tail
    77  
    78  loop_preheader_avx2:
    79  	VPXOR Y0, Y0, Y0
        	// Y0 = 0: 32-byte zero source for both AVX2 loops below.
    80  	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
    81  	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
    82  	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
    83  	CMPQ    BX, $0x2000000
    84  	JAE     loop_preheader_avx2_huge
        // AVX2 path for 256 < n < 32MB: 128 bytes per iteration with
        // ordinary (cached) stores.
    85  loop_avx2:
    86  	VMOVDQU	Y0, 0(DI)
    87  	VMOVDQU	Y0, 32(DI)
    88  	VMOVDQU	Y0, 64(DI)
    89  	VMOVDQU	Y0, 96(DI)
    90  	SUBQ	$128, BX
    91  	ADDQ	$128, DI
    92  	CMPQ	BX, $128
    93  	JAE	loop_avx2
        	// 0 <= BX < 128 bytes remain. These four stores end exactly at
        	// DI+BX and overlap backward into memory already cleared by the
        	// loop (safe: the original n was > 256, so at least 128 bytes
        	// before DI are zero).
    94  	VMOVDQU  Y0, -32(DI)(BX*1)
    95  	VMOVDQU  Y0, -64(DI)(BX*1)
    96  	VMOVDQU  Y0, -96(DI)(BX*1)
    97  	VMOVDQU  Y0, -128(DI)(BX*1)
    98  	VZEROUPPER
    99  	RET
   100  loop_preheader_avx2_huge:
   101  	// Align to 32 byte boundary
   102  	VMOVDQU  Y0, 0(DI)
   103  	MOVQ	DI, SI
   104  	ADDQ	$32, DI
   105  	ANDQ	$~31, DI
   106  	SUBQ	DI, SI
   107  	ADDQ	SI, BX
        	// DI is now rounded up to a 32-byte boundary. SI = old DI - new
        	// DI (a value in [-32, -1]), so the ADDQ shrinks BX by the 1..32
        	// bytes the unaligned store above already cleared.
        // Non-temporal path for n >= 32MB: VMOVNTDQ requires the 32-byte
        // alignment established above and bypasses the cache.
   108  loop_avx2_huge:
   109  	VMOVNTDQ	Y0, 0(DI)
   110  	VMOVNTDQ	Y0, 32(DI)
   111  	VMOVNTDQ	Y0, 64(DI)
   112  	VMOVNTDQ	Y0, 96(DI)
   113  	SUBQ	$128, BX
   114  	ADDQ	$128, DI
   115  	CMPQ	BX, $128
   116  	JAE	loop_avx2_huge
   117  	// In the description of MOVNTDQ in [1]
   118  	// "... fencing operation implemented with the SFENCE or MFENCE instruction
   119  	// should be used in conjunction with MOVNTDQ instructions..."
   120  	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
   121  	SFENCE
        	// Final 0..127 bytes: ordinary overlapping stores ending exactly
        	// at DI+BX, as in the non-huge AVX2 tail above.
   122  	VMOVDQU  Y0, -32(DI)(BX*1)
   123  	VMOVDQU  Y0, -64(DI)(BX*1)
   124  	VMOVDQU  Y0, -96(DI)(BX*1)
   125  	VMOVDQU  Y0, -128(DI)(BX*1)
   126  	VZEROUPPER
   127  	RET
   128  
        // n == 1 or 2: byte stores at ptr and ptr+n-1 (same byte when n == 1).
   129  _1or2:
   130  	MOVB	AX, (DI)
   131  	MOVB	AX, -1(DI)(BX*1)
   132  	RET
   133  _0:
   134  	RET
        // n == 3 or 4: 2-byte stores at ptr and ptr+n-2 (overlap when n == 3).
   135  _3or4:
   136  	MOVW	AX, (DI)
   137  	MOVW	AX, -2(DI)(BX*1)
   138  	RET
        // 5 <= n <= 7: overlapping 4-byte stores at ptr and ptr+n-4.
   139  _5through7:
   140  	MOVL	AX, (DI)
   141  	MOVL	AX, -4(DI)(BX*1)
   142  	RET
   143  _8:
   144  	// We need a separate case for 8 to make sure we clear pointers atomically.
   145  	MOVQ	AX, (DI)
   146  	RET
        // 9 <= n <= 16: overlapping 8-byte stores at ptr and ptr+n-8.
   147  _9through16:
   148  	MOVQ	AX, (DI)
   149  	MOVQ	AX, -8(DI)(BX*1)
   150  	RET
        // 17 <= n <= 32: overlapping 16-byte stores at ptr and ptr+n-16.
   151  _17through32:
   152  	MOVOU	X15, (DI)
   153  	MOVOU	X15, -16(DI)(BX*1)
   154  	RET
        // 33 <= n <= 64: two 16-byte stores from each end.
   155  _33through64:
   156  	MOVOU	X15, (DI)
   157  	MOVOU	X15, 16(DI)
   158  	MOVOU	X15, -32(DI)(BX*1)
   159  	MOVOU	X15, -16(DI)(BX*1)
   160  	RET
        // 65 <= n <= 128: four 16-byte stores from each end.
   161  _65through128:
   162  	MOVOU	X15, (DI)
   163  	MOVOU	X15, 16(DI)
   164  	MOVOU	X15, 32(DI)
   165  	MOVOU	X15, 48(DI)
   166  	MOVOU	X15, -64(DI)(BX*1)
   167  	MOVOU	X15, -48(DI)(BX*1)
   168  	MOVOU	X15, -32(DI)(BX*1)
   169  	MOVOU	X15, -16(DI)(BX*1)
   170  	RET
        // 129 <= n <= 256: eight 16-byte stores from each end.
   171  _129through256:
   172  	MOVOU	X15, (DI)
   173  	MOVOU	X15, 16(DI)
   174  	MOVOU	X15, 32(DI)
   175  	MOVOU	X15, 48(DI)
   176  	MOVOU	X15, 64(DI)
   177  	MOVOU	X15, 80(DI)
   178  	MOVOU	X15, 96(DI)
   179  	MOVOU	X15, 112(DI)
   180  	MOVOU	X15, -128(DI)(BX*1)
   181  	MOVOU	X15, -112(DI)(BX*1)
   182  	MOVOU	X15, -96(DI)(BX*1)
   183  	MOVOU	X15, -80(DI)(BX*1)
   184  	MOVOU	X15, -64(DI)(BX*1)
   185  	MOVOU	X15, -48(DI)(BX*1)
   186  	MOVOU	X15, -32(DI)(BX*1)
   187  	MOVOU	X15, -16(DI)(BX*1)
   188  	RET
   189  

View as plain text