Text file src/runtime/memmove_amd64.s

     1  // Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
     2  // https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
     3  //
     4  //         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
     5  //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
     6  //         Portions Copyright 2009 The Go Authors. All rights reserved.
     7  //
     8  // Permission is hereby granted, free of charge, to any person obtaining a copy
     9  // of this software and associated documentation files (the "Software"), to deal
    10  // in the Software without restriction, including without limitation the rights
    11  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    12  // copies of the Software, and to permit persons to whom the Software is
    13  // furnished to do so, subject to the following conditions:
    14  //
    15  // The above copyright notice and this permission notice shall be included in
    16  // all copies or substantial portions of the Software.
    17  //
    18  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    19  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    20  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
    21  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    22  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    23  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    24  // THE SOFTWARE.
    25  
    26  //go:build !plan9
    27  // +build !plan9
    28  
    29  #include "go_asm.h"
    30  #include "textflag.h"
    31  
    32  // See memmove Go doc for important implementation constraints.
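        // In particular, any pointer-aligned, pointer-sized word must be copied
        // with a single, indivisible write so that concurrent readers (including
        // the garbage collector) never observe a half-written pointer. That is
        // why sizes such as 8 get their own register-based case below instead of
        // falling into a byte loop.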
    33  
    34  // func memmove(to, from unsafe.Pointer, n uintptr)
    35  // ABIInternal for performance.
    36  TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
    37  #ifdef GOEXPERIMENT_regabiargs
    38  	// AX = to
    39  	// BX = from
    40  	// CX = n
    41  	MOVQ	AX, DI
    42  	MOVQ	BX, SI
    43  	MOVQ	CX, BX
    44  #else
    45  	MOVQ	to+0(FP), DI
    46  	MOVQ	from+8(FP), SI
    47  	MOVQ	n+16(FP), BX
    48  #endif
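        	// From here on: DI = to, SI = from, BX = n (bytes remaining).
        	// CX is used as the REP counter and as scratch.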
    49  
    50  	// REP instructions have a high startup cost, so we handle small sizes
    51  	// with some straightline code. The REP MOVSQ instruction is really fast
    52  	// for large sizes. The cutover is approximately 2K.
    53  tail:
    54  	// move_129through256 or smaller work whether or not the source and the
    55  	// destination memory regions overlap because they load all data into
    56  	// registers before writing it back.  move_256through2048 on the other
    57  	// hand can be used only when the memory regions don't overlap or the copy
    58  	// direction is forward.
    59  	//
    60  	// A BSR+branch table makes almost all memmove/memclr benchmarks worse. Not worth doing.
    61  	TESTQ	BX, BX
    62  	JEQ	move_0
    63  	CMPQ	BX, $2
    64  	JBE	move_1or2
    65  	CMPQ	BX, $4
    66  	JB	move_3
    67  	JBE	move_4
    68  	CMPQ	BX, $8
    69  	JB	move_5through7
    70  	JE	move_8
    71  	CMPQ	BX, $16
    72  	JBE	move_9through16
    73  	CMPQ	BX, $32
    74  	JBE	move_17through32
    75  	CMPQ	BX, $64
    76  	JBE	move_33through64
    77  	CMPQ	BX, $128
    78  	JBE	move_65through128
    79  	CMPQ	BX, $256
    80  	JBE	move_129through256
    81  
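        	// n > 256 from here on. If the runtime decided at startup that AVX
        	// copies are profitable on this CPU (runtime·useAVXmemmove), take the
        	// AVX path; otherwise fall through to the REP-based copies.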
    82  	TESTB	$1, runtime·useAVXmemmove(SB)
    83  	JNZ	avxUnaligned
    84  
    85  /*
    86   * check and set for backwards
    87   */
    88  	CMPQ	SI, DI
    89  	JLS	back
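        	// If the source address is at or below the destination, a forward copy
        	// could overwrite source bytes before they have been read, so jump to
        	// the overlap check at back.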
    90  
    91  /*
    92   * forward copy loop
    93   */
    94  forward:
    95  	CMPQ	BX, $2048
    96  	JLS	move_256through2048
    97  
    98  	// If REP MOVSB isn't fast, don't use it
    99  	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
   100  	JNE	fwdBy8
   101  
   102  	// Check alignment
   103  	MOVL	SI, AX
   104  	ORL	DI, AX
   105  	TESTL	$7, AX
   106  	JEQ	fwdBy8
   107  
   108  	// Do 1 byte at a time
   109  	MOVQ	BX, CX
   110  	REP;	MOVSB
   111  	RET
   112  
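        	// fwdBy8 copies 8 bytes at a time with REP MOVSQ and then jumps back
        	// to tail to finish the remaining 0-7 bytes with the small-size cases.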
   113  fwdBy8:
   114  	// Do 8 bytes at a time
   115  	MOVQ	BX, CX
   116  	SHRQ	$3, CX
   117  	ANDQ	$7, BX
   118  	REP;	MOVSQ
   119  	JMP	tail
   120  
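        	// back handles a possibly-overlapping copy with the destination above
        	// the source: if the regions really do overlap, copy 8-byte words from
        	// the high end toward the low end so that no source byte is
        	// overwritten before it is read.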
   121  back:
   122  /*
   123   * check overlap
   124   */
   125  	MOVQ	SI, CX
   126  	ADDQ	BX, CX
   127  	CMPQ	CX, DI
   128  	JLS	forward
   129  /*
   130   * whole thing backwards has
   131   * adjusted addresses
   132   */
   133  	ADDQ	BX, DI
   134  	ADDQ	BX, SI
   135  	STD
   136  
   137  /*
   138   * copy
   139   */
   140  	MOVQ	BX, CX
   141  	SHRQ	$3, CX
   142  	ANDQ	$7, BX
   143  
   144  	SUBQ	$8, DI
   145  	SUBQ	$8, SI
   146  	REP;	MOVSQ
   147  
   148  	CLD
   149  	ADDQ	$8, DI
   150  	ADDQ	$8, SI
   151  	SUBQ	BX, DI
   152  	SUBQ	BX, SI
   153  	JMP	tail
   154  
   155  move_1or2:
   156  	MOVB	(SI), AX
   157  	MOVB	-1(SI)(BX*1), CX
   158  	MOVB	AX, (DI)
   159  	MOVB	CX, -1(DI)(BX*1)
   160  	RET
   161  move_0:
   162  	RET
   163  move_4:
   164  	MOVL	(SI), AX
   165  	MOVL	AX, (DI)
   166  	RET
   167  move_3:
   168  	MOVW	(SI), AX
   169  	MOVB	2(SI), CX
   170  	MOVW	AX, (DI)
   171  	MOVB	CX, 2(DI)
   172  	RET
   173  move_5through7:
   174  	MOVL	(SI), AX
   175  	MOVL	-4(SI)(BX*1), CX
   176  	MOVL	AX, (DI)
   177  	MOVL	CX, -4(DI)(BX*1)
   178  	RET
   179  move_8:
   180  	// We need a separate case for 8 to make sure we write pointers atomically.
   181  	MOVQ	(SI), AX
   182  	MOVQ	AX, (DI)
   183  	RET
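        	// The cases from 9 bytes up through 256 bytes share one trick: load a
        	// block from the front of the region and a (possibly overlapping)
        	// block from the back into registers, then store both. Because every
        	// load happens before any store, the result is correct even when the
        	// source and destination overlap.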
   184  move_9through16:
   185  	MOVQ	(SI), AX
   186  	MOVQ	-8(SI)(BX*1), CX
   187  	MOVQ	AX, (DI)
   188  	MOVQ	CX, -8(DI)(BX*1)
   189  	RET
   190  move_17through32:
   191  	MOVOU	(SI), X0
   192  	MOVOU	-16(SI)(BX*1), X1
   193  	MOVOU	X0, (DI)
   194  	MOVOU	X1, -16(DI)(BX*1)
   195  	RET
   196  move_33through64:
   197  	MOVOU	(SI), X0
   198  	MOVOU	16(SI), X1
   199  	MOVOU	-32(SI)(BX*1), X2
   200  	MOVOU	-16(SI)(BX*1), X3
   201  	MOVOU	X0, (DI)
   202  	MOVOU	X1, 16(DI)
   203  	MOVOU	X2, -32(DI)(BX*1)
   204  	MOVOU	X3, -16(DI)(BX*1)
   205  	RET
   206  move_65through128:
   207  	MOVOU	(SI), X0
   208  	MOVOU	16(SI), X1
   209  	MOVOU	32(SI), X2
   210  	MOVOU	48(SI), X3
   211  	MOVOU	-64(SI)(BX*1), X4
   212  	MOVOU	-48(SI)(BX*1), X5
   213  	MOVOU	-32(SI)(BX*1), X6
   214  	MOVOU	-16(SI)(BX*1), X7
   215  	MOVOU	X0, (DI)
   216  	MOVOU	X1, 16(DI)
   217  	MOVOU	X2, 32(DI)
   218  	MOVOU	X3, 48(DI)
   219  	MOVOU	X4, -64(DI)(BX*1)
   220  	MOVOU	X5, -48(DI)(BX*1)
   221  	MOVOU	X6, -32(DI)(BX*1)
   222  	MOVOU	X7, -16(DI)(BX*1)
   223  	RET
   224  move_129through256:
   225  	MOVOU	(SI), X0
   226  	MOVOU	16(SI), X1
   227  	MOVOU	32(SI), X2
   228  	MOVOU	48(SI), X3
   229  	MOVOU	64(SI), X4
   230  	MOVOU	80(SI), X5
   231  	MOVOU	96(SI), X6
   232  	MOVOU	112(SI), X7
   233  	MOVOU	-128(SI)(BX*1), X8
   234  	MOVOU	-112(SI)(BX*1), X9
   235  	MOVOU	-96(SI)(BX*1), X10
   236  	MOVOU	-80(SI)(BX*1), X11
   237  	MOVOU	-64(SI)(BX*1), X12
   238  	MOVOU	-48(SI)(BX*1), X13
   239  	MOVOU	-32(SI)(BX*1), X14
   240  	MOVOU	-16(SI)(BX*1), X15
   241  	MOVOU	X0, (DI)
   242  	MOVOU	X1, 16(DI)
   243  	MOVOU	X2, 32(DI)
   244  	MOVOU	X3, 48(DI)
   245  	MOVOU	X4, 64(DI)
   246  	MOVOU	X5, 80(DI)
   247  	MOVOU	X6, 96(DI)
   248  	MOVOU	X7, 112(DI)
   249  	MOVOU	X8, -128(DI)(BX*1)
   250  	MOVOU	X9, -112(DI)(BX*1)
   251  	MOVOU	X10, -96(DI)(BX*1)
   252  	MOVOU	X11, -80(DI)(BX*1)
   253  	MOVOU	X12, -64(DI)(BX*1)
   254  	MOVOU	X13, -48(DI)(BX*1)
   255  	MOVOU	X14, -32(DI)(BX*1)
   256  	MOVOU	X15, -16(DI)(BX*1)
   257  #ifdef GOEXPERIMENT_regabig
   258  	// X15 must be zero on return
   259  	PXOR	X15, X15
   260  #endif
   261  	RET
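        	// move_256through2048 streams 256-byte chunks through X0-X15 while at
        	// least 256 bytes remain, then jumps back to tail for the rest. Under
        	// the register ABI, X15 is reserved as the zero register, which is why
        	// it is cleared again before leaving this block.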
   262  move_256through2048:
   263  	SUBQ	$256, BX
   264  	MOVOU	(SI), X0
   265  	MOVOU	16(SI), X1
   266  	MOVOU	32(SI), X2
   267  	MOVOU	48(SI), X3
   268  	MOVOU	64(SI), X4
   269  	MOVOU	80(SI), X5
   270  	MOVOU	96(SI), X6
   271  	MOVOU	112(SI), X7
   272  	MOVOU	128(SI), X8
   273  	MOVOU	144(SI), X9
   274  	MOVOU	160(SI), X10
   275  	MOVOU	176(SI), X11
   276  	MOVOU	192(SI), X12
   277  	MOVOU	208(SI), X13
   278  	MOVOU	224(SI), X14
   279  	MOVOU	240(SI), X15
   280  	MOVOU	X0, (DI)
   281  	MOVOU	X1, 16(DI)
   282  	MOVOU	X2, 32(DI)
   283  	MOVOU	X3, 48(DI)
   284  	MOVOU	X4, 64(DI)
   285  	MOVOU	X5, 80(DI)
   286  	MOVOU	X6, 96(DI)
   287  	MOVOU	X7, 112(DI)
   288  	MOVOU	X8, 128(DI)
   289  	MOVOU	X9, 144(DI)
   290  	MOVOU	X10, 160(DI)
   291  	MOVOU	X11, 176(DI)
   292  	MOVOU	X12, 192(DI)
   293  	MOVOU	X13, 208(DI)
   294  	MOVOU	X14, 224(DI)
   295  	MOVOU	X15, 240(DI)
   296  	CMPQ	BX, $256
   297  	LEAQ	256(SI), SI
   298  	LEAQ	256(DI), DI
   299  	JGE	move_256through2048
   300  #ifdef GOEXPERIMENT_regabig
   301  	// X15 must be zero on return
   302  	PXOR	X15, X15
   303  #endif
   304  	JMP	tail
   305  
   306  avxUnaligned:
   307  	// There are two implementations of the move algorithm.
   308  	// The first is for non-overlapping memory regions; it copies forward.
   309  	// The second is for overlapping regions; it copies backward.
   310  	MOVQ	DI, CX
   311  	SUBQ	SI, CX
   312  	// Now CX contains the distance between SRC and DEST.
   313  	CMPQ	CX, BX
   314  	// If the distance is less than the region length, the regions overlap.
   315  	JC	copy_backward
   316  
   317  	// Non-temporal copy would be better for big sizes.
   318  	CMPQ	BX, $0x100000
   319  	JAE	gobble_big_data_fwd
   320  
   321  	// Memory layout on the source side
   322  	// SI                                       CX
   323  	// |<---------BX before correction--------->|
   324  	// |       |<--BX corrected-->|             |
   325  	// |       |                  |<--- AX  --->|
   326  	// |<-R11->|                  |<-128 bytes->|
   327  	// +----------------------------------------+
   328  	// | Head  | Body             | Tail        |
   329  	// +-------+------------------+-------------+
   330  	// ^       ^                  ^
   331  	// |       |                  |
   332  	// Save head into Y4          Save tail into X5..X12
   333  	//         |
   334  	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
   335  	// Algorithm:
   336  	// 1. Unaligned save of the tail's 128 bytes
   337  	// 2. Unaligned save of the head's 32  bytes
   338  	// 3. Destination-aligned copying of body (128 bytes per iteration)
   339  	// 4. Store the head at its destination
   340  	// 5. Store the tail at its destination
   341  	// For small sizes it is important to keep the processor's pipeline busy,
   342  	// because the cost of copying the unaligned head and tail is comparable
   343  	// to the cost of the main loop, so the tail saves below are interleaved
   344  	// with the setup code. A cleaner implementation of the same algorithm,
   345  	// where the cost of the unaligned parts is negligible, follows the
   346  	// gobble_big_data_fwd label.
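        	// A rough sketch of steps 1-5 in Go-like pseudocode (illustrative
        	// only; these helpers are not real runtime functions):
        	//
        	//	tail := load128(src + n - 128)     // step 1
        	//	head := load32(src)                // step 2
        	//	adj := alignUp32(dst) - dst        // R11 below
        	//	copyAligned(dst+adj, src+adj, ...) // step 3, gobble_128_loop
        	//	store32(dst, head)                 // step 4
        	//	store128(dst + n - 128, tail)      // step 5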
   347  	LEAQ	(SI)(BX*1), CX
   348  	MOVQ	DI, R10
   349  	// CX points to the end of the buffer, so we need to go back slightly. We will use negative offsets here.
   350  	MOVOU	-0x80(CX), X5
   351  	MOVOU	-0x70(CX), X6
   352  	MOVQ	$0x80, AX
   353  	// Align destination address
   354  	ANDQ	$-32, DI
   355  	ADDQ	$32, DI
   356  	// Continue tail saving.
   357  	MOVOU	-0x60(CX), X7
   358  	MOVOU	-0x50(CX), X8
   359  	// Make R11 the delta between the aligned and unaligned destination addresses.
   360  	MOVQ	DI, R11
   361  	SUBQ	R10, R11
   362  	// Continue tail saving.
   363  	MOVOU	-0x40(CX), X9
   364  	MOVOU	-0x30(CX), X10
   365  	// Adjust the bytes-to-copy count, since the unaligned parts are handled separately.
   366  	SUBQ	R11, BX
   367  	// Continue tail saving.
   368  	MOVOU	-0x20(CX), X11
   369  	MOVOU	-0x10(CX), X12
   370  	// The tail will be stored to its destination after the main body is copied.
   371  	// Now save the unaligned head.
   372  	VMOVDQU	(SI), Y4
   373  	// Adjust source address to point past head.
   374  	ADDQ	R11, SI
   375  	SUBQ	AX, BX
   376  	// The destination-aligned copy of the body follows.
   377  gobble_128_loop:
   378  	VMOVDQU	(SI), Y0
   379  	VMOVDQU	0x20(SI), Y1
   380  	VMOVDQU	0x40(SI), Y2
   381  	VMOVDQU	0x60(SI), Y3
   382  	ADDQ	AX, SI
   383  	VMOVDQA	Y0, (DI)
   384  	VMOVDQA	Y1, 0x20(DI)
   385  	VMOVDQA	Y2, 0x40(DI)
   386  	VMOVDQA	Y3, 0x60(DI)
   387  	ADDQ	AX, DI
   388  	SUBQ	AX, BX
   389  	JA	gobble_128_loop
   390  	// Now we can store unaligned parts.
   391  	ADDQ	AX, BX
   392  	ADDQ	DI, BX
   393  	VMOVDQU	Y4, (R10)
   394  	VZEROUPPER
   395  	MOVOU	X5, -0x80(BX)
   396  	MOVOU	X6, -0x70(BX)
   397  	MOVOU	X7, -0x60(BX)
   398  	MOVOU	X8, -0x50(BX)
   399  	MOVOU	X9, -0x40(BX)
   400  	MOVOU	X10, -0x30(BX)
   401  	MOVOU	X11, -0x20(BX)
   402  	MOVOU	X12, -0x10(BX)
   403  	RET
   404  
   405  gobble_big_data_fwd:
   406  	// Forward copying for big regions.
   407  	// It uses non-temporal move instructions.
   408  	// The details of this algorithm are commented above, in the small-size version.
   409  	LEAQ	(SI)(BX*1), CX
   410  	MOVOU	-0x80(SI)(BX*1), X5
   411  	MOVOU	-0x70(CX), X6
   412  	MOVOU	-0x60(CX), X7
   413  	MOVOU	-0x50(CX), X8
   414  	MOVOU	-0x40(CX), X9
   415  	MOVOU	-0x30(CX), X10
   416  	MOVOU	-0x20(CX), X11
   417  	MOVOU	-0x10(CX), X12
   418  	VMOVDQU	(SI), Y4
   419  	MOVQ	DI, R8
   420  	ANDQ	$-32, DI
   421  	ADDQ	$32, DI
   422  	MOVQ	DI, R10
   423  	SUBQ	R8, R10
   424  	SUBQ	R10, BX
   425  	ADDQ	R10, SI
   426  	LEAQ	(DI)(BX*1), CX
   427  	SUBQ	$0x80, BX
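        	// The main loop below moves 128 bytes per iteration, loading through
        	// the cache but storing with non-temporal instructions, so that copies
        	// of 1MB or more don't evict the caller's working set from the caches.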
   428  gobble_mem_fwd_loop:
   429  	PREFETCHNTA 0x1C0(SI)
   430  	PREFETCHNTA 0x280(SI)
   431  	// The prefetch distances were chosen empirically.
   432  	// Prefetch usage follows the approach in section 7.6.6 of [1].
   433  	// [1] 64-ia-32-architectures-optimization-manual.pdf
   434  	// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
   435  	VMOVDQU	(SI), Y0
   436  	VMOVDQU	0x20(SI), Y1
   437  	VMOVDQU	0x40(SI), Y2
   438  	VMOVDQU	0x60(SI), Y3
   439  	ADDQ	$0x80, SI
   440  	VMOVNTDQ Y0, (DI)
   441  	VMOVNTDQ Y1, 0x20(DI)
   442  	VMOVNTDQ Y2, 0x40(DI)
   443  	VMOVNTDQ Y3, 0x60(DI)
   444  	ADDQ	$0x80, DI
   445  	SUBQ	$0x80, BX
   446  	JA		gobble_mem_fwd_loop
   447  	// Non-temporal stores don't follow the normal cache-coherency rules.
   448  	// We need an SFENCE here to make the copied data visible in a timely manner.
   449  	SFENCE
   450  	VMOVDQU	Y4, (R8)
   451  	VZEROUPPER
   452  	MOVOU	X5, -0x80(CX)
   453  	MOVOU	X6, -0x70(CX)
   454  	MOVOU	X7, -0x60(CX)
   455  	MOVOU	X8, -0x50(CX)
   456  	MOVOU	X9, -0x40(CX)
   457  	MOVOU	X10, -0x30(CX)
   458  	MOVOU	X11, -0x20(CX)
   459  	MOVOU	X12, -0x10(CX)
   460  	RET
   461  
   462  copy_backward:
   463  	MOVQ	DI, AX
   464  	// Backward copying works much like the forward copy.
   465  	// First we load the unaligned tail from the beginning of the region.
   466  	MOVOU	(SI), X5
   467  	MOVOU	0x10(SI), X6
   468  	ADDQ	BX, DI
   469  	MOVOU	0x20(SI), X7
   470  	MOVOU	0x30(SI), X8
   471  	LEAQ	-0x20(DI), R10
   472  	MOVQ	DI, R11
   473  	MOVOU	0x40(SI), X9
   474  	MOVOU	0x50(SI), X10
   475  	ANDQ	$0x1F, R11
   476  	MOVOU	0x60(SI), X11
   477  	MOVOU	0x70(SI), X12
   478  	XORQ	R11, DI
   479  	// Point SI to the end of the region
   480  	ADDQ	BX, SI
   481  	// and load the unaligned head into Y4.
   482  	VMOVDQU	-0x20(SI), Y4
   483  	SUBQ	R11, SI
   484  	SUBQ	R11, BX
   485  	// If there is enough data for non-temporal moves, go to the special loop.
   486  	CMPQ	BX, $0x100000
   487  	JA		gobble_big_data_bwd
   488  	SUBQ	$0x80, BX
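        	// Backward main loop: copy 128 bytes per iteration from the high end
        	// of the region toward the low end, storing to the 32-byte-aligned
        	// destination with aligned moves.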
   489  gobble_mem_bwd_loop:
   490  	VMOVDQU	-0x20(SI), Y0
   491  	VMOVDQU	-0x40(SI), Y1
   492  	VMOVDQU	-0x60(SI), Y2
   493  	VMOVDQU	-0x80(SI), Y3
   494  	SUBQ	$0x80, SI
   495  	VMOVDQA	Y0, -0x20(DI)
   496  	VMOVDQA	Y1, -0x40(DI)
   497  	VMOVDQA	Y2, -0x60(DI)
   498  	VMOVDQA	Y3, -0x80(DI)
   499  	SUBQ	$0x80, DI
   500  	SUBQ	$0x80, BX
   501  	JA		gobble_mem_bwd_loop
   502  	// Store the unaligned parts.
   503  	VMOVDQU	Y4, (R10)
   504  	VZEROUPPER
   505  	MOVOU	X5, (AX)
   506  	MOVOU	X6, 0x10(AX)
   507  	MOVOU	X7, 0x20(AX)
   508  	MOVOU	X8, 0x30(AX)
   509  	MOVOU	X9, 0x40(AX)
   510  	MOVOU	X10, 0x50(AX)
   511  	MOVOU	X11, 0x60(AX)
   512  	MOVOU	X12, 0x70(AX)
   513  	RET
   514  
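        	// gobble_big_data_bwd is the same backward loop, but with non-temporal
        	// stores and prefetching for copies of 1MB or more; like the forward
        	// version it needs an SFENCE before the final unaligned stores.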
   515  gobble_big_data_bwd:
   516  	SUBQ	$0x80, BX
   517  gobble_big_mem_bwd_loop:
   518  	PREFETCHNTA -0x1C0(SI)
   519  	PREFETCHNTA -0x280(SI)
   520  	VMOVDQU	-0x20(SI), Y0
   521  	VMOVDQU	-0x40(SI), Y1
   522  	VMOVDQU	-0x60(SI), Y2
   523  	VMOVDQU	-0x80(SI), Y3
   524  	SUBQ	$0x80, SI
   525  	VMOVNTDQ	Y0, -0x20(DI)
   526  	VMOVNTDQ	Y1, -0x40(DI)
   527  	VMOVNTDQ	Y2, -0x60(DI)
   528  	VMOVNTDQ	Y3, -0x80(DI)
   529  	SUBQ	$0x80, DI
   530  	SUBQ	$0x80, BX
   531  	JA	gobble_big_mem_bwd_loop
   532  	SFENCE
   533  	VMOVDQU	Y4, (R10)
   534  	VZEROUPPER
   535  	MOVOU	X5, (AX)
   536  	MOVOU	X6, 0x10(AX)
   537  	MOVOU	X7, 0x20(AX)
   538  	MOVOU	X8, 0x30(AX)
   539  	MOVOU	X9, 0x40(AX)
   540  	MOVOU	X10, 0x50(AX)
   541  	MOVOU	X11, 0x60(AX)
   542  	MOVOU	X12, 0x70(AX)
   543  	RET
   544  
