Replace assembler memcpy with C variant

    Authored by Marco Ammon
    0001-Replacing-assembler-with-C-memcpy.patch 9.44 KiB
    From 981606d5825cb03f392ba37ccb4a0c5d0ef3dba0 Mon Sep 17 00:00:00 2001
    From: Marco Ammon <marco.ammon@fau.de>
    Date: Wed, 7 Aug 2019 14:07:53 +0200
    Subject: [PATCH] Replacing assembler with C memcpy
    
    ---
     Makefile                                     |   2 +-
     arch/x86/lib/Makefile                        |   2 +-
     arch/x86/lib/memcpy_64.S                     | 176 -------------------
     arch/x86/lib/memcpy_plain.c                  |  13 ++
     tools/arch/x86/lib/memcpy_64.S               |  14 +-
     tools/perf/bench/Build                       |   6 +
     tools/perf/bench/mem-memcpy-x86-64-asm-def.h |  10 +-
     tools/perf/bench/memcpy_plain.c              |  18 ++
     tools/perf/bench/memcpy_plain_native.c       |  11 ++
     9 files changed, 63 insertions(+), 189 deletions(-)
     create mode 100644 arch/x86/lib/memcpy_plain.c
     create mode 100644 tools/perf/bench/memcpy_plain.c
     create mode 100644 tools/perf/bench/memcpy_plain_native.c
    
    diff --git a/Makefile b/Makefile
    index d5713e7b1e50..e562ecccaacf 100644
    --- a/Makefile
    +++ b/Makefile
    @@ -2,7 +2,7 @@
     VERSION = 5
     PATCHLEVEL = 0
     SUBLEVEL = 0
    -EXTRAVERSION =
    +EXTRAVERSION =-c-memcpy
     NAME = Shy Crocodile
     
     # *DOCUMENTATION*
    diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
    index 140e61843a07..ea160c5a02a6 100644
    --- a/arch/x86/lib/Makefile
    +++ b/arch/x86/lib/Makefile
    @@ -22,7 +22,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
     
     lib-y := delay.o misc.o cmdline.o cpu.o
     lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
    -lib-y += memcpy_$(BITS).o
    +lib-y += memcpy_$(BITS).o memcpy_plain.o
     lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
     lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
     lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
    diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
    index 3b24dc05251c..6e0bf704ffa4 100644
    --- a/arch/x86/lib/memcpy_64.S
    +++ b/arch/x86/lib/memcpy_64.S
    @@ -7,182 +7,6 @@
     #include <asm/alternative-asm.h>
     #include <asm/export.h>
     
    -/*
    - * We build a jump to memcpy_orig by default which gets NOPped out on
    - * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
    - * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
    - * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
    - */
    -
    -.weak memcpy
    -
    -/*
    - * memcpy - Copy a memory block.
    - *
    - * Input:
    - *  rdi destination
    - *  rsi source
    - *  rdx count
    - *
    - * Output:
    - * rax original destination
    - */
    -ENTRY(__memcpy)
    -ENTRY(memcpy)
    -	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
    -		      "jmp memcpy_erms", X86_FEATURE_ERMS
    -
    -	movq %rdi, %rax
    -	movq %rdx, %rcx
    -	shrq $3, %rcx
    -	andl $7, %edx
    -	rep movsq
    -	movl %edx, %ecx
    -	rep movsb
    -	ret
    -ENDPROC(memcpy)
    -ENDPROC(__memcpy)
    -EXPORT_SYMBOL(memcpy)
    -EXPORT_SYMBOL(__memcpy)
    -
    -/*
    - * memcpy_erms() - enhanced fast string memcpy. This is faster and
    - * simpler than memcpy. Use memcpy_erms when possible.
    - */
    -ENTRY(memcpy_erms)
    -	movq %rdi, %rax
    -	movq %rdx, %rcx
    -	rep movsb
    -	ret
    -ENDPROC(memcpy_erms)
    -
    -ENTRY(memcpy_orig)
    -	movq %rdi, %rax
    -
    -	cmpq $0x20, %rdx
    -	jb .Lhandle_tail
    -
    -	/*
    -	 * We check whether memory false dependence could occur,
    -	 * then jump to corresponding copy mode.
    -	 */
    -	cmp  %dil, %sil
    -	jl .Lcopy_backward
    -	subq $0x20, %rdx
    -.Lcopy_forward_loop:
    -	subq $0x20,	%rdx
    -
    -	/*
    -	 * Move in blocks of 4x8 bytes:
    -	 */
    -	movq 0*8(%rsi),	%r8
    -	movq 1*8(%rsi),	%r9
    -	movq 2*8(%rsi),	%r10
    -	movq 3*8(%rsi),	%r11
    -	leaq 4*8(%rsi),	%rsi
    -
    -	movq %r8,	0*8(%rdi)
    -	movq %r9,	1*8(%rdi)
    -	movq %r10,	2*8(%rdi)
    -	movq %r11,	3*8(%rdi)
    -	leaq 4*8(%rdi),	%rdi
    -	jae  .Lcopy_forward_loop
    -	addl $0x20,	%edx
    -	jmp  .Lhandle_tail
    -
    -.Lcopy_backward:
    -	/*
    -	 * Calculate copy position to tail.
    -	 */
    -	addq %rdx,	%rsi
    -	addq %rdx,	%rdi
    -	subq $0x20,	%rdx
    -	/*
    -	 * At most 3 ALU operations in one cycle,
    -	 * so append NOPS in the same 16 bytes trunk.
    -	 */
    -	.p2align 4
    -.Lcopy_backward_loop:
    -	subq $0x20,	%rdx
    -	movq -1*8(%rsi),	%r8
    -	movq -2*8(%rsi),	%r9
    -	movq -3*8(%rsi),	%r10
    -	movq -4*8(%rsi),	%r11
    -	leaq -4*8(%rsi),	%rsi
    -	movq %r8,		-1*8(%rdi)
    -	movq %r9,		-2*8(%rdi)
    -	movq %r10,		-3*8(%rdi)
    -	movq %r11,		-4*8(%rdi)
    -	leaq -4*8(%rdi),	%rdi
    -	jae  .Lcopy_backward_loop
    -
    -	/*
    -	 * Calculate copy position to head.
    -	 */
    -	addl $0x20,	%edx
    -	subq %rdx,	%rsi
    -	subq %rdx,	%rdi
    -.Lhandle_tail:
    -	cmpl $16,	%edx
    -	jb   .Lless_16bytes
    -
    -	/*
    -	 * Move data from 16 bytes to 31 bytes.
    -	 */
    -	movq 0*8(%rsi), %r8
    -	movq 1*8(%rsi),	%r9
    -	movq -2*8(%rsi, %rdx),	%r10
    -	movq -1*8(%rsi, %rdx),	%r11
    -	movq %r8,	0*8(%rdi)
    -	movq %r9,	1*8(%rdi)
    -	movq %r10,	-2*8(%rdi, %rdx)
    -	movq %r11,	-1*8(%rdi, %rdx)
    -	retq
    -	.p2align 4
    -.Lless_16bytes:
    -	cmpl $8,	%edx
    -	jb   .Lless_8bytes
    -	/*
    -	 * Move data from 8 bytes to 15 bytes.
    -	 */
    -	movq 0*8(%rsi),	%r8
    -	movq -1*8(%rsi, %rdx),	%r9
    -	movq %r8,	0*8(%rdi)
    -	movq %r9,	-1*8(%rdi, %rdx)
    -	retq
    -	.p2align 4
    -.Lless_8bytes:
    -	cmpl $4,	%edx
    -	jb   .Lless_3bytes
    -
    -	/*
    -	 * Move data from 4 bytes to 7 bytes.
    -	 */
    -	movl (%rsi), %ecx
    -	movl -4(%rsi, %rdx), %r8d
    -	movl %ecx, (%rdi)
    -	movl %r8d, -4(%rdi, %rdx)
    -	retq
    -	.p2align 4
    -.Lless_3bytes:
    -	subl $1, %edx
    -	jb .Lend
    -	/*
    -	 * Move data from 1 bytes to 3 bytes.
    -	 */
    -	movzbl (%rsi), %ecx
    -	jz .Lstore_1byte
    -	movzbq 1(%rsi), %r8
    -	movzbq (%rsi, %rdx), %r9
    -	movb %r8b, 1(%rdi)
    -	movb %r9b, (%rdi, %rdx)
    -.Lstore_1byte:
    -	movb %cl, (%rdi)
    -
    -.Lend:
    -	retq
    -ENDPROC(memcpy_orig)
    -
     #ifndef CONFIG_UML
     
     MCSAFE_TEST_CTL
    diff --git a/arch/x86/lib/memcpy_plain.c b/arch/x86/lib/memcpy_plain.c
    new file mode 100644
    index 000000000000..24b247d9c26e
    --- /dev/null
    +++ b/arch/x86/lib/memcpy_plain.c
    @@ -0,0 +1,13 @@
    +#include <linux/types.h>
    +
    +void *memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size) {
    +	void *old_to = to;
    +	while(size--) {
    +		*((char *) to++) = *((char *) from++);
    +	}
    +	return old_to;
    +}
    +
    +void *__memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size) {
    +	return memcpy(to, from, size);
    +}
    diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S
    index 3b24dc05251c..df43623f2a6c 100644
    --- a/tools/arch/x86/lib/memcpy_64.S
    +++ b/tools/arch/x86/lib/memcpy_64.S
    @@ -14,7 +14,7 @@
      * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
      */
     
    -.weak memcpy
    +.weak memcpy_movsq
     
     /*
      * memcpy - Copy a memory block.
    @@ -27,11 +27,7 @@
      * Output:
      * rax original destination
      */
    -ENTRY(__memcpy)
    -ENTRY(memcpy)
    -	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
    -		      "jmp memcpy_erms", X86_FEATURE_ERMS
    -
    +ENTRY(memcpy_movsq)
     	movq %rdi, %rax
     	movq %rdx, %rcx
     	shrq $3, %rcx
    @@ -40,10 +36,8 @@ ENTRY(memcpy)
     	movl %edx, %ecx
     	rep movsb
     	ret
    -ENDPROC(memcpy)
    -ENDPROC(__memcpy)
    -EXPORT_SYMBOL(memcpy)
    -EXPORT_SYMBOL(__memcpy)
    +ENDPROC(memcpy_movsq)
    +EXPORT_SYMBOL(memcpy_movsq)
     
     /*
      * memcpy_erms() - enhanced fast string memcpy. This is faster and
    diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
    index e4e321b6f883..8cbb4787b701 100644
    --- a/tools/perf/bench/Build
    +++ b/tools/perf/bench/Build
    @@ -13,5 +13,11 @@ perf-y += epoll-ctl.o
     perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o
     perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
     perf-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o
    +perf-$(CONFIG_X86_64) += memcpy_plain.o
    +perf-$(CONFIG_X86_64) += memcpy_plain_native.o
    +
    +CFLAGS_memcpy_plain.o = -falign-functions=32 -mno-avx -mno-avx2 -mno-sse -ffreestanding
    +CFLAGS_memcpy_plain_native.o = -falign-functions=32 -O3 -march=corei7 -minline-all-stringops -mstringop-strategy=rep_byte -mno-avx -mno-avx2 -mno-sse
    +CFLAGS_REMOVE_memcpy_plain_native.o = -O2
     
     perf-$(CONFIG_NUMA) += numa.o
    diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
    index 50ae8bd58296..0f39b99c95d4 100644
    --- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
    +++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
    @@ -4,10 +4,18 @@ MEMCPY_FN(memcpy_orig,
     	"x86-64-unrolled",
     	"unrolled memcpy() in arch/x86/lib/memcpy_64.S")
     
    -MEMCPY_FN(__memcpy,
    +MEMCPY_FN(memcpy_movsq,
     	"x86-64-movsq",
     	"movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
     
     MEMCPY_FN(memcpy_erms,
     	"x86-64-movsb",
     	"movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
    +
    +MEMCPY_FN(__memcpy,
    +	"x86-64-c-memcpy",
    +	"C-based memcpy() in arch/x86/lib/memcpy_plain.c")
    +
    +MEMCPY_FN(memcpy_native,
    +	"x86-64-c-memcpy-native",
    +	"C-based memcpy(), natively optimized, in tools/perf/bench/memcpy_plain_native.c")
    diff --git a/tools/perf/bench/memcpy_plain.c b/tools/perf/bench/memcpy_plain.c
    new file mode 100644
    index 000000000000..282551f35f24
    --- /dev/null
    +++ b/tools/perf/bench/memcpy_plain.c
    @@ -0,0 +1,18 @@
    +#include <linux/types.h>
    +
    +void *memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size);
    +void *__memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size);
    +
    +void *memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size) {
    +	char *t = to;
    +	const char *f = from;
    +	while(size--) {
    +		*(t++) = *(f++);
    +	}
    +	return to;
    +
    +}
    +
    +void *__memcpy(void *to, const void *from, size_t size) {
    +	return memcpy(to, from, size);
    +}
    diff --git a/tools/perf/bench/memcpy_plain_native.c b/tools/perf/bench/memcpy_plain_native.c
    new file mode 100644
    index 000000000000..cab84e0404a3
    --- /dev/null
    +++ b/tools/perf/bench/memcpy_plain_native.c
    @@ -0,0 +1,11 @@
    +#include <linux/types.h>
    +
    +void *memcpy_native(void * __restrict__ to, const void * __restrict__ from, size_t size);
    +
    +void *memcpy_native(void * __restrict__ to, const void * __restrict__ from, size_t size) {
    +	void *old_to = to;
    +	while(size--) {
    +		*((char *) to++) = *((char *) from++);
    +	}
    +	return old_to;
    +}
    -- 
    2.22.0
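
For reference, the heart of the patch is the byte-by-byte copy loop added in
memcpy_plain.c. Below is a minimal user-space sketch of that loop with a
self-check against the libc memcpy: the copy loop mirrors the patch, while the
harness, the file name plain_test.c, and the memcpy_plain symbol are
illustrative additions, not part of the patch itself.

    /* plain_test.c - sanity-check the patch's byte-copy loop in user space.
     * Build: gcc -O2 -o plain_test plain_test.c
     */
    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    /* Same loop as the patch's memcpy(), renamed to avoid shadowing libc. */
    static void *memcpy_plain(void *restrict to, const void *restrict from,
                              size_t size)
    {
    	char *t = to;
    	const char *f = from;

    	while (size--)
    		*t++ = *f++;

    	return to;
    }

    int main(void)
    {
    	char src[64], dst[64], ref[64];
    	size_t i, len;

    	for (i = 0; i < sizeof(src); i++)
    		src[i] = (char)i;

    	/* Check every copy length up to 64 bytes against libc memcpy. */
    	for (len = 0; len <= sizeof(src); len++) {
    		memset(dst, 0xAA, sizeof(dst));
    		memset(ref, 0xAA, sizeof(ref));
    		assert(memcpy_plain(dst, src, len) == dst);
    		memcpy(ref, src, len);
    		assert(memcmp(dst, ref, sizeof(dst)) == 0);
    	}

    	puts("memcpy_plain matches libc memcpy for all tested lengths");
    	return 0;
    }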
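Once the patched perf is built, the two routines registered through MEMCPY_FN
should be selectable by name; assuming perf bench mem memcpy's usual
-f/--function switch, something like "perf bench mem memcpy -f x86-64-c-memcpy"
or "-f all" would compare them against the assembler variants. The per-object
CFLAGS in tools/perf/bench/Build are what make the two C objects differ: the
plain object is built freestanding with SSE/AVX disabled, while the native one
swaps the default -O2 for -O3 plus -minline-all-stringops and
-mstringop-strategy=rep_byte, which lets GCC inline the byte loop as a
rep movsb string copy.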