Replace assembler memcpy with C variant
Authored by Marco Ammon
0001-Replacing-assembler-with-C-memcpy.patch (9.44 KiB)
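The patch below replaces the hand-tuned assembler memcpy in arch/x86/lib/memcpy_64.S with a plain byte-wise C implementation, and hooks two C variants (a plain build and a natively optimized build) into perf bench next to the existing x86-64 assembler routines, so the implementations can be benchmarked against each other.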
From 981606d5825cb03f392ba37ccb4a0c5d0ef3dba0 Mon Sep 17 00:00:00 2001
From: Marco Ammon <marco.ammon@fau.de>
Date: Wed, 7 Aug 2019 14:07:53 +0200
Subject: [PATCH] Replacing assembler with C memcpy
---
Makefile | 2 +-
arch/x86/lib/Makefile | 2 +-
arch/x86/lib/memcpy_64.S | 176 -------------------
arch/x86/lib/memcpy_plain.c | 13 ++
tools/arch/x86/lib/memcpy_64.S | 14 +-
tools/perf/bench/Build | 6 +
tools/perf/bench/mem-memcpy-x86-64-asm-def.h | 10 +-
tools/perf/bench/memcpy_plain.c | 18 ++
tools/perf/bench/memcpy_plain_native.c | 11 ++
9 files changed, 63 insertions(+), 189 deletions(-)
create mode 100644 arch/x86/lib/memcpy_plain.c
create mode 100644 tools/perf/bench/memcpy_plain.c
create mode 100644 tools/perf/bench/memcpy_plain_native.c
diff --git a/Makefile b/Makefile
index d5713e7b1e50..e562ecccaacf 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
VERSION = 5
PATCHLEVEL = 0
SUBLEVEL = 0
-EXTRAVERSION =
+EXTRAVERSION =-c-memcpy
NAME = Shy Crocodile
# *DOCUMENTATION*
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 140e61843a07..ea160c5a02a6 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -22,7 +22,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
lib-y := delay.o misc.o cmdline.o cpu.o
lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
-lib-y += memcpy_$(BITS).o
+lib-y += memcpy_$(BITS).o memcpy_plain.o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 3b24dc05251c..6e0bf704ffa4 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -7,182 +7,6 @@
#include <asm/alternative-asm.h>
#include <asm/export.h>
-/*
- * We build a jump to memcpy_orig by default which gets NOPped out on
- * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
- * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
- * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
- */
-
-.weak memcpy
-
-/*
- * memcpy - Copy a memory block.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * rax original destination
- */
-ENTRY(__memcpy)
-ENTRY(memcpy)
- ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
- "jmp memcpy_erms", X86_FEATURE_ERMS
-
- movq %rdi, %rax
- movq %rdx, %rcx
- shrq $3, %rcx
- andl $7, %edx
- rep movsq
- movl %edx, %ecx
- rep movsb
- ret
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-EXPORT_SYMBOL(memcpy)
-EXPORT_SYMBOL(__memcpy)
-
-/*
- * memcpy_erms() - enhanced fast string memcpy. This is faster and
- * simpler than memcpy. Use memcpy_erms when possible.
- */
-ENTRY(memcpy_erms)
- movq %rdi, %rax
- movq %rdx, %rcx
- rep movsb
- ret
-ENDPROC(memcpy_erms)
-
-ENTRY(memcpy_orig)
- movq %rdi, %rax
-
- cmpq $0x20, %rdx
- jb .Lhandle_tail
-
- /*
- * We check whether memory false dependence could occur,
- * then jump to corresponding copy mode.
- */
- cmp %dil, %sil
- jl .Lcopy_backward
- subq $0x20, %rdx
-.Lcopy_forward_loop:
- subq $0x20, %rdx
-
- /*
- * Move in blocks of 4x8 bytes:
- */
- movq 0*8(%rsi), %r8
- movq 1*8(%rsi), %r9
- movq 2*8(%rsi), %r10
- movq 3*8(%rsi), %r11
- leaq 4*8(%rsi), %rsi
-
- movq %r8, 0*8(%rdi)
- movq %r9, 1*8(%rdi)
- movq %r10, 2*8(%rdi)
- movq %r11, 3*8(%rdi)
- leaq 4*8(%rdi), %rdi
- jae .Lcopy_forward_loop
- addl $0x20, %edx
- jmp .Lhandle_tail
-
-.Lcopy_backward:
- /*
- * Calculate copy position to tail.
- */
- addq %rdx, %rsi
- addq %rdx, %rdi
- subq $0x20, %rdx
- /*
- * At most 3 ALU operations in one cycle,
- * so append NOPS in the same 16 bytes trunk.
- */
- .p2align 4
-.Lcopy_backward_loop:
- subq $0x20, %rdx
- movq -1*8(%rsi), %r8
- movq -2*8(%rsi), %r9
- movq -3*8(%rsi), %r10
- movq -4*8(%rsi), %r11
- leaq -4*8(%rsi), %rsi
- movq %r8, -1*8(%rdi)
- movq %r9, -2*8(%rdi)
- movq %r10, -3*8(%rdi)
- movq %r11, -4*8(%rdi)
- leaq -4*8(%rdi), %rdi
- jae .Lcopy_backward_loop
-
- /*
- * Calculate copy position to head.
- */
- addl $0x20, %edx
- subq %rdx, %rsi
- subq %rdx, %rdi
-.Lhandle_tail:
- cmpl $16, %edx
- jb .Lless_16bytes
-
- /*
- * Move data from 16 bytes to 31 bytes.
- */
- movq 0*8(%rsi), %r8
- movq 1*8(%rsi), %r9
- movq -2*8(%rsi, %rdx), %r10
- movq -1*8(%rsi, %rdx), %r11
- movq %r8, 0*8(%rdi)
- movq %r9, 1*8(%rdi)
- movq %r10, -2*8(%rdi, %rdx)
- movq %r11, -1*8(%rdi, %rdx)
- retq
- .p2align 4
-.Lless_16bytes:
- cmpl $8, %edx
- jb .Lless_8bytes
- /*
- * Move data from 8 bytes to 15 bytes.
- */
- movq 0*8(%rsi), %r8
- movq -1*8(%rsi, %rdx), %r9
- movq %r8, 0*8(%rdi)
- movq %r9, -1*8(%rdi, %rdx)
- retq
- .p2align 4
-.Lless_8bytes:
- cmpl $4, %edx
- jb .Lless_3bytes
-
- /*
- * Move data from 4 bytes to 7 bytes.
- */
- movl (%rsi), %ecx
- movl -4(%rsi, %rdx), %r8d
- movl %ecx, (%rdi)
- movl %r8d, -4(%rdi, %rdx)
- retq
- .p2align 4
-.Lless_3bytes:
- subl $1, %edx
- jb .Lend
- /*
- * Move data from 1 bytes to 3 bytes.
- */
- movzbl (%rsi), %ecx
- jz .Lstore_1byte
- movzbq 1(%rsi), %r8
- movzbq (%rsi, %rdx), %r9
- movb %r8b, 1(%rdi)
- movb %r9b, (%rdi, %rdx)
-.Lstore_1byte:
- movb %cl, (%rdi)
-
-.Lend:
- retq
-ENDPROC(memcpy_orig)
-
#ifndef CONFIG_UML
MCSAFE_TEST_CTL
diff --git a/arch/x86/lib/memcpy_plain.c b/arch/x86/lib/memcpy_plain.c
new file mode 100644
index 000000000000..24b247d9c26e
--- /dev/null
+++ b/arch/x86/lib/memcpy_plain.c
@@ -0,0 +1,13 @@
+#include <linux/types.h>
+
+void *memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size) {
+ void *old_to = to;
+ while(size--) {
+ *((char *) to++) = *((char *) from++);
+ }
+ return old_to;
+}
+
+void *__memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size) {
+ return memcpy(to, from, size);
+}
diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S
index 3b24dc05251c..df43623f2a6c 100644
--- a/tools/arch/x86/lib/memcpy_64.S
+++ b/tools/arch/x86/lib/memcpy_64.S
@@ -14,7 +14,7 @@
* to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
*/
-.weak memcpy
+.weak memcpy_movsq
/*
* memcpy - Copy a memory block.
@@ -27,11 +27,7 @@
* Output:
* rax original destination
*/
-ENTRY(__memcpy)
-ENTRY(memcpy)
- ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
- "jmp memcpy_erms", X86_FEATURE_ERMS
-
+ENTRY(memcpy_movsq)
movq %rdi, %rax
movq %rdx, %rcx
shrq $3, %rcx
@@ -40,10 +36,8 @@ ENTRY(memcpy)
movl %edx, %ecx
rep movsb
ret
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-EXPORT_SYMBOL(memcpy)
-EXPORT_SYMBOL(__memcpy)
+ENDPROC(memcpy_movsq)
+EXPORT_SYMBOL(memcpy_movsq)
/*
* memcpy_erms() - enhanced fast string memcpy. This is faster and
diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
index e4e321b6f883..8cbb4787b701 100644
--- a/tools/perf/bench/Build
+++ b/tools/perf/bench/Build
@@ -13,5 +13,11 @@ perf-y += epoll-ctl.o
perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o
perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
perf-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o
+perf-$(CONFIG_X86_64) += memcpy_plain.o
+perf-$(CONFIG_X86_64) += memcpy_plain_native.o
+
+CFLAGS_memcpy_plain.o = -falign-functions=32 -mno-avx -mno-avx2 -mno-sse -ffreestanding
+CFLAGS_memcpy_plain_native.o = -falign-functions=32 -O3 -march=corei7 -minline-all-stringops -mstringop-strategy=rep_byte -mno-avx -mno-avx2 -mno-sse
+CFLAGS_REMOVE_memcpy_plain_native.o = -O2
perf-$(CONFIG_NUMA) += numa.o
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
index 50ae8bd58296..0f39b99c95d4 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -4,10 +4,18 @@ MEMCPY_FN(memcpy_orig,
"x86-64-unrolled",
"unrolled memcpy() in arch/x86/lib/memcpy_64.S")
-MEMCPY_FN(__memcpy,
+MEMCPY_FN(memcpy_movsq,
"x86-64-movsq",
"movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
MEMCPY_FN(memcpy_erms,
"x86-64-movsb",
"movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
+
+MEMCPY_FN(__memcpy,
+ "x86-64-c-memcpy",
+ "C-based memcpy() in arch/x86/lib/memcpy_plain.c")
+
+MEMCPY_FN(memcpy_native,
+ "x86-64-c-memcpy-native",
+ "C-based memcpy(), natively optimized in arch/x86/lib/memcpy_plain.c")
diff --git a/tools/perf/bench/memcpy_plain.c b/tools/perf/bench/memcpy_plain.c
new file mode 100644
index 000000000000..282551f35f24
--- /dev/null
+++ b/tools/perf/bench/memcpy_plain.c
@@ -0,0 +1,18 @@
+#include <linux/types.h>
+
+void *memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size);
+void *__memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size);
+
+void *memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size) {
+ char *t = to;
+ const char *f = from;
+ while(size--) {
+ *(t++) = *(f++);
+ }
+ return to;
+
+}
+
+void *__memcpy(void *to, const void *from, size_t size) {
+ return memcpy(to, from, size);
+}
diff --git a/tools/perf/bench/memcpy_plain_native.c b/tools/perf/bench/memcpy_plain_native.c
new file mode 100644
index 000000000000..cab84e0404a3
--- /dev/null
+++ b/tools/perf/bench/memcpy_plain_native.c
@@ -0,0 +1,11 @@
+#include <linux/types.h>
+
+void *memcpy_native(void * __restrict__ to, const void * __restrict__ from, size_t size);
+
+void *memcpy_native(void * __restrict__ to, const void * __restrict__ from, size_t size) {
+ void *old_to = to;
+ while(size--) {
+ *((char *) to++) = *((char *) from++);
+ }
+ return old_to;
+}
--
2.22.0
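Once perf is rebuilt with the patch applied, the two C variants are selectable under the names registered in mem-memcpy-x86-64-asm-def.h above. A sketch of a comparison run, assuming the standard perf bench mem options (-f/--function, -s/--size, -l/--nr_loops, where "all" runs every registered function):

    # compare all registered memcpy routines on 1 MB buffers, 100 iterations each
    perf bench mem memcpy -f all -s 1MB -l 100

    # run only the plain C variant added by this patch
    perf bench mem memcpy -f x86-64-c-memcpy -s 1MB -l 100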