blob: f82e884928af6643854f64df5899c42ab6fee9b2 [file] [log] [blame]
/* Copyright 2002 Andi Kleen */
#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
/*
* memcpy - Copy a memory block.
*
* Input:
* rdi destination
* rsi source
* rdx count
*
* Output:
* rax original destination
*/
/*
* memcpy_c() - fast string ops (REP MOVSQ) based variant.
*
* This gets patched over the unrolled variant (below) via the
* alternative instructions framework:
*/
.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
movq %rdi, %rax
movl %edx, %ecx
shrl $3, %ecx
andl $7, %edx
rep movsq
movl %edx, %ecx
rep movsb
ret
.Lmemcpy_e:
.previous
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
/*
* Put the number of full 64-byte blocks into %ecx.
* Tail portion is handled at the end:
*/
movq %rdi, %rax
movl %edx, %ecx
shrl $6, %ecx
jz .Lhandle_tail
.p2align 4
.Lloop_64:
/*
* We decrement the loop index here - and the zero-flag is
* checked at the end of the loop (instructions inbetween do
* not change the zero flag):
*/
decl %ecx
/*
* Move in blocks of 4x16 bytes:
*/
movq 0*8(%rsi), %r11
movq 1*8(%rsi), %r8
movq %r11, 0*8(%rdi)
movq %r8, 1*8(%rdi)
movq 2*8(%rsi), %r9
movq 3*8(%rsi), %r10
movq %r9, 2*8(%rdi)
movq %r10, 3*8(%rdi)
movq 4*8(%rsi), %r11
movq 5*8(%rsi), %r8
movq %r11, 4*8(%rdi)
movq %r8, 5*8(%rdi)
movq 6*8(%rsi), %r9
movq 7*8(%rsi), %r10
movq %r9, 6*8(%rdi)
movq %r10, 7*8(%rdi)
leaq 64(%rsi), %rsi
leaq 64(%rdi), %rdi
jnz .Lloop_64
.Lhandle_tail:
movl %edx, %ecx
andl $63, %ecx
shrl $3, %ecx
jz .Lhandle_7
.p2align 4
.Lloop_8:
decl %ecx
movq (%rsi), %r8
movq %r8, (%rdi)
leaq 8(%rdi), %rdi
leaq 8(%rsi), %rsi
jnz .Lloop_8
.Lhandle_7:
movl %edx, %ecx
andl $7, %ecx
jz .Lend
.p2align 4
.Lloop_1:
movb (%rsi), %r8b
movb %r8b, (%rdi)
incq %rdi
incq %rsi
decl %ecx
jnz .Lloop_1
.Lend:
ret
CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
/*
* Some CPUs run faster using the string copy instructions.
* It is also a lot simpler. Use this when possible:
*/
.section .altinstructions, "a"
.align 8
.quad memcpy
.quad .Lmemcpy_c
.byte X86_FEATURE_REP_GOOD
/*
* Replace only beginning, memcpy is used to apply alternatives,
* so it is silly to overwrite itself with nops - reboot is the
* only outcome...
*/
.byte .Lmemcpy_e - .Lmemcpy_c
.byte .Lmemcpy_e - .Lmemcpy_c
.previous