/*
 * ====================================================
 * Copyright (C) 2007 by Ellips BV. All rights reserved.
 *
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */

  #include "x86_64mach.h"

  .global SYM (memcpy)
  SOTYPE_FUNCTION(memcpy)

SYM (memcpy):
  movq    rdi, rax                /* Store destination in return value */
  cmpq    $16, rdx
  jb      byte_copy               /* Fewer than 16 bytes: plain byte copy */

  movq    rdi, r8                 /* Align destination on quad word boundary */
  andq    $7, r8
  jz      quadword_aligned
  movq    $8, rcx
  subq    r8, rcx                 /* Copy 8 - (dest & 7) bytes up to alignment */
  subq    rcx, rdx
  rep     movsb

quadword_aligned:
  cmpq    $256, rdx
  jb      quadword_copy           /* Below 256 bytes, rep movsq is cheaper */

  pushq   rax                     /* Preserve return value and the */
  pushq   r12                     /* callee-saved registers used below */
  pushq   r13
  pushq   r14

  movq    rdx, rcx                /* Copy 128 bytes at a time with minimum cache pollution */
  shrq    $7, rcx

  .p2align 4
loop:
  prefetchnta  768 (rsi)          /* Prefetch source well ahead of the copy */
  prefetchnta  832 (rsi)

  movq       (rsi), rax
  movq     8 (rsi), r8
  movq    16 (rsi), r9
  movq    24 (rsi), r10
  movq    32 (rsi), r11
  movq    40 (rsi), r12
  movq    48 (rsi), r13
  movq    56 (rsi), r14

  movntiq rax,    (rdi)           /* Non-temporal stores bypass the cache so */
  movntiq r8 ,  8 (rdi)           /* the copy does not evict useful lines */
  movntiq r9 , 16 (rdi)
  movntiq r10, 24 (rdi)
  movntiq r11, 32 (rdi)
  movntiq r12, 40 (rdi)
  movntiq r13, 48 (rdi)
  movntiq r14, 56 (rdi)

  movq     64 (rsi), rax
  movq     72 (rsi), r8
  movq     80 (rsi), r9
  movq     88 (rsi), r10
  movq     96 (rsi), r11
  movq    104 (rsi), r12
  movq    112 (rsi), r13
  movq    120 (rsi), r14

  movntiq rax,  64 (rdi)
  movntiq r8 ,  72 (rdi)
  movntiq r9 ,  80 (rdi)
  movntiq r10,  88 (rdi)
  movntiq r11,  96 (rdi)
  movntiq r12, 104 (rdi)
  movntiq r13, 112 (rdi)
  movntiq r14, 120 (rdi)

  leaq    128 (rsi), rsi
  leaq    128 (rdi), rdi

  dec     rcx
  jnz     loop

  sfence                          /* Order the non-temporal stores before returning */
  movq    rdx, rcx
  andq    $127, rcx               /* Copy the remaining 0..127 bytes */
  rep     movsb
  popq    r14
  popq    r13
  popq    r12
  popq    rax
  ret

byte_copy:
  movq    rdx, rcx
  rep     movsb
  ret

quadword_copy:
  movq    rdx, rcx                /* Copy whole quad words */
  shrq    $3, rcx
  .p2align 4
  rep     movsq
  movq    rdx, rcx
  andq    $7, rcx
  rep     movsb                   /* Copy the remaining bytes */
  ret