Combination of native and c-memcpy
Authored by Marco Ammon
Combination-of-native-and-c-memcpy.patch (31.95 KiB)
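What follows is the complete patch against v5.0 ("Shy Crocodile"). It combines Alexey Dobriyan's -march=native build support (a CONFIG_MARCH_NATIVE Kconfig option, the scripts/kconfig/cpuid host helper, and scripts/march-native.sh, which probe the build machine for MOVBE, POPCNT, and ERMS at configure time) with a plain C memcpy(): the hand-written routines in arch/x86/lib/memcpy_64.S are removed in favour of a byte-copy loop in arch/x86/lib/memcpy_plain.c that GCC lowers to "rep movsb" under -mstringop-strategy=rep_byte, and matching rep movsb / rep stosb fast paths are added behind CONFIG_MARCH_NATIVE_REP_MOVSB / CONFIG_MARCH_NATIVE_REP_STOSB across the x86 assembly. perf bench gains the new memcpy variants for comparison.

A minimal userspace sketch of that core idea (function and file names here are mine, not from the patch): built with the flags the patch applies to memcpy_plain.o, i.e. gcc -O3 -minline-all-stringops -mstringop-strategy=rep_byte, GCC emits essentially "mov %rdi,%rax; mov %rdx,%rcx; rep movsb; ret", the same body as the memcpy_erms() routine the patch deletes.

#include <stdio.h>
#include <string.h>

static void *memcpy_plain(void *restrict to, const void *restrict from, size_t size)
{
	char *t = to;
	const char *f = from;

	/* dumb byte loop; the rep_byte stringop strategy turns it into
	 * a single "rep movsb" */
	while (size--)
		*t++ = *f++;
	return to;
}

int main(void)
{
	char src[] = "rep movsb me";
	char dst[sizeof(src)];

	memcpy_plain(dst, src, sizeof(src));
	puts(dst);
	return strcmp(dst, src) ? 1 : 0;
}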
From a8613ce66604d96aee0a7c261ced50113f61a400 Mon Sep 17 00:00:00 2001
From: Marco Ammon <marco.ammon@fau.de>
Date: Wed, 7 Aug 2019 14:07:53 +0200
Subject: [PATCH] Combination of native and c-memcpy
---
Makefile | 16 +-
arch/x86/Kconfig.cpu | 8 +
arch/x86/Makefile | 27 ++-
arch/x86/boot/compressed/head_64.S | 4 +
arch/x86/crypto/des3_ede-asm_64.S | 28 +++
arch/x86/crypto/sha1_ssse3_asm.S | 7 +-
arch/x86/include/asm/arch_hweight.h | 28 ++-
arch/x86/include/asm/page_64.h | 26 +++
arch/x86/include/asm/segment.h | 1 +
arch/x86/kernel/relocate_kernel_64.S | 15 ++
arch/x86/kernel/verify_cpu.S | 27 +++
arch/x86/lib/Makefile | 18 +-
arch/x86/lib/memcpy_64.S | 176 ------------------
arch/x86/lib/memcpy_plain.c | 13 ++
arch/x86/lib/memset_64.S | 15 ++
arch/x86/lib/usercopy_64.c | 16 +-
arch/x86/platform/pvh/head.S | 4 +
drivers/net/wireless/mediatek/mt76/mac80211.c | 2 +-
include/linux/bitops.h | 2 +
lib/Makefile | 2 +
scripts/kconfig/.gitignore | 1 +
scripts/kconfig/Makefile | 7 +-
scripts/kconfig/cpuid.c | 108 +++++++++++
scripts/march-native.sh | 74 ++++++++
tools/arch/x86/lib/memcpy_64.S | 14 +-
tools/perf/bench/Build | 6 +
tools/perf/bench/mem-memcpy-x86-64-asm-def.h | 10 +-
tools/perf/bench/memcpy_plain.c | 18 ++
tools/perf/bench/memcpy_plain_native.c | 11 ++
29 files changed, 482 insertions(+), 202 deletions(-)
create mode 100644 arch/x86/lib/memcpy_plain.c
create mode 100644 scripts/kconfig/cpuid.c
create mode 100755 scripts/march-native.sh
create mode 100644 tools/perf/bench/memcpy_plain.c
create mode 100644 tools/perf/bench/memcpy_plain_native.c
diff --git a/Makefile b/Makefile
index d5713e7b1e50..548d1bc33ceb 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
VERSION = 5
PATCHLEVEL = 0
SUBLEVEL = 0
-EXTRAVERSION =
+EXTRAVERSION =-c-memcpy-native
NAME = Shy Crocodile
# *DOCUMENTATION*
@@ -370,10 +370,10 @@ HOST_LFS_LIBS := $(shell getconf LFS_LIBS 2>/dev/null)
HOSTCC = gcc
HOSTCXX = g++
-KBUILD_HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 \
+KBUILD_HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -march=native -O2 \
-fomit-frame-pointer -std=gnu89 $(HOST_LFS_CFLAGS) \
$(HOSTCFLAGS)
-KBUILD_HOSTCXXFLAGS := -O2 $(HOST_LFS_CFLAGS) $(HOSTCXXFLAGS)
+KBUILD_HOSTCXXFLAGS := -march=native -O2 $(HOST_LFS_CFLAGS) $(HOSTCXXFLAGS)
KBUILD_HOSTLDFLAGS := $(HOST_LFS_LDFLAGS) $(HOSTLDFLAGS)
KBUILD_HOSTLDLIBS := $(HOST_LFS_LIBS) $(HOSTLDLIBS)
@@ -594,6 +594,16 @@ ifeq ($(dot-config),1)
include include/config/auto.conf
endif
+ifdef CONFIG_MARCH_NATIVE
+KBUILD_CFLAGS += -march=native
+endif
+ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+KBUILD_CFLAGS += -mmemcpy-strategy=rep_byte:-1:align,rep_byte:-1:noalign
+endif
+ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+KBUILD_CFLAGS += -mmemset-strategy=rep_byte:-1:align,rep_byte:-1:noalign
+endif
+
# The all: target is the default when no target is given on the
# command line.
# This allow a user to issue only 'make' to build a kernel including modules
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 6adce15268bd..e06a22d3a163 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -287,6 +287,12 @@ config GENERIC_CPU
Generic x86-64 CPU.
Run equally well on all x86-64 CPUs.
+config MARCH_NATIVE
+ bool "-march=native"
+ depends on X86_64
+ ---help---
+ -march=native support.
+
endchoice
config X86_GENERIC
@@ -307,6 +313,7 @@ config X86_INTERNODE_CACHE_SHIFT
int
default "12" if X86_VSMP
default X86_L1_CACHE_SHIFT
+ depends on !MARCH_NATIVE
config X86_L1_CACHE_SHIFT
int
@@ -314,6 +321,7 @@ config X86_L1_CACHE_SHIFT
default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
default "4" if MELAN || M486 || MGEODEGX1
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
+ depends on !MARCH_NATIVE
config X86_F00F_BUG
def_bool y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 9c5a67d1b9c1..9d2605a52fce 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -12,6 +12,28 @@ else
KBUILD_DEFCONFIG := $(ARCH)_defconfig
endif
+CFLAGS_NO_FP :=
+CFLAGS_NO_FP += $(call cc-option,-mno-mmx,)
+CFLAGS_NO_FP += $(call cc-option,-mno-sse,)
+CFLAGS_NO_FP += $(call cc-option,-mno-sse2,)
+CFLAGS_NO_FP += $(call cc-option,-mno-sse3,)
+CFLAGS_NO_FP += $(call cc-option,-mno-ssse3,)
+CFLAGS_NO_FP += $(call cc-option,-mno-sse4,)
+CFLAGS_NO_FP += $(call cc-option,-mno-sse4a,)
+CFLAGS_NO_FP += $(call cc-option,-mno-sse4.1,)
+CFLAGS_NO_FP += $(call cc-option,-mno-sse4.2,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx2,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512f,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512pf,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512er,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512cd,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512vl,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512bw,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512dq,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512ifma,)
+CFLAGS_NO_FP += $(call cc-option,-mno-avx512vbmi,)
+
# For gcc stack alignment is specified with -mpreferred-stack-boundary,
# clang has the option -mstack-alignment for that purpose.
ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
@@ -34,7 +56,7 @@ M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))
REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -DDISABLE_BRANCH_PROFILING \
-Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
- -mno-mmx -mno-sse
+ $(CFLAGS_NO_FP)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
@@ -57,8 +79,7 @@ endif
#
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
#
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
-KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
+KBUILD_CFLAGS += $(CFLAGS_NO_FP)
ifeq ($(CONFIG_X86_32),y)
BITS := 32
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index f62e347862cc..10e3be1ae849 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -517,8 +517,12 @@ relocated:
leaq _bss(%rip), %rdi
leaq _ebss(%rip), %rcx
subq %rdi, %rcx
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ rep stosb
+#else
shrq $3, %rcx
rep stosq
+#endif
/*
* Do the extraction, and jump to the new kernel..
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index 8e49ce117494..007319ea1f62 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -159,6 +159,15 @@
#define dummy2(a, b) /*_*/
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+#define read_block(io, left, right) \
+ movbe (io), left##d; \
+ movbe 4(io), right##d;
+
+#define write_block(io, left, right) \
+ movbe left##d, (io); \
+ movbe right##d, 4(io);
+#else
#define read_block(io, left, right) \
movl (io), left##d; \
movl 4(io), right##d; \
@@ -170,6 +179,7 @@
bswapl right##d; \
movl left##d, (io); \
movl right##d, 4(io);
+#endif
ENTRY(des3_ede_x86_64_crypt_blk)
/* input:
@@ -443,6 +453,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
pushq %rsi /* dst */
/* load input */
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+ movbe 0 * 4(%rdx), RL0d;
+ movbe 1 * 4(%rdx), RR0d;
+ movbe 2 * 4(%rdx), RL1d;
+ movbe 3 * 4(%rdx), RR1d;
+ movbe 4 * 4(%rdx), RL2d;
+ movbe 5 * 4(%rdx), RR2d;
+#else
movl 0 * 4(%rdx), RL0d;
movl 1 * 4(%rdx), RR0d;
movl 2 * 4(%rdx), RL1d;
@@ -456,6 +474,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
bswapl RR1d;
bswapl RL2d;
bswapl RR2d;
+#endif
initial_permutation3(RL, RR);
@@ -516,6 +535,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
final_permutation3(RR, RL);
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+ movbe RR0d, 0 * 4(%rsi);
+ movbe RL0d, 1 * 4(%rsi);
+ movbe RR1d, 2 * 4(%rsi);
+ movbe RL1d, 3 * 4(%rsi);
+ movbe RR2d, 4 * 4(%rsi);
+ movbe RL2d, 5 * 4(%rsi);
+#else
bswapl RR0d;
bswapl RL0d;
bswapl RR1d;
@@ -530,6 +557,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
movl RL1d, 3 * 4(%rsi);
movl RR2d, 4 * 4(%rsi);
movl RL2d, 5 * 4(%rsi);
+#endif
popq %r15;
popq %r14;
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 613d0bfc3d84..9e8d3abc6b57 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -94,10 +94,15 @@
SHA1_PIPELINED_MAIN_BODY
# cleanup workspace
- mov $8, %ecx
mov %rsp, %rdi
xor %eax, %eax
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ mov $64, %ecx
+ rep stosb
+#else
+ mov $8, %ecx
rep stosq
+#endif
mov %rbp, %rsp # deallocate workspace
pop %rbp
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index fc0693569f7a..3e9b45bd5b6c 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -2,6 +2,30 @@
#ifndef _ASM_X86_HWEIGHT_H
#define _ASM_X86_HWEIGHT_H
+#define __HAVE_ARCH_SW_HWEIGHT
+
+#ifdef CONFIG_MARCH_NATIVE_POPCNT
+static inline unsigned int __arch_hweight64(uint64_t x)
+{
+ return __builtin_popcountll(x);
+}
+
+static inline unsigned int __arch_hweight32(uint32_t x)
+{
+ return __builtin_popcount(x);
+}
+
+static inline unsigned int __arch_hweight16(uint16_t x)
+{
+ return __builtin_popcount(x);
+}
+
+static inline unsigned int __arch_hweight8(uint8_t x)
+{
+ return __builtin_popcount(x);
+}
+#else
+
#include <asm/cpufeatures.h>
#ifdef CONFIG_64BIT
@@ -12,8 +36,6 @@
#define REG_OUT "a"
#endif
-#define __HAVE_ARCH_SW_HWEIGHT
-
static __always_inline unsigned int __arch_hweight32(unsigned int w)
{
unsigned int res;
@@ -55,3 +77,5 @@ static __always_inline unsigned long __arch_hweight64(__u64 w)
#endif /* CONFIG_X86_32 */
#endif
+
+#endif
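With CONFIG_MARCH_NATIVE_POPCNT the hweight implementation above reduces to a compiler builtin, and the __sw_hweight* software fallbacks are compiled out later in the patch. A small illustration of why that is safe on such kernels (demo is mine, not from the patch): on POPCNT-capable hardware, -march=native lets GCC compile __builtin_popcountll() to a single popcnt instruction, so no fallback is needed.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t x = 0xdeadbeef;

	/* with -march=native on POPCNT hardware this is one instruction */
	printf("hweight64(%#jx) = %d\n", (uintmax_t)x, __builtin_popcountll(x));
	return 0;
}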
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 939b1cff4a7b..7654d5544e0b 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -40,6 +40,18 @@ extern unsigned long __phys_addr_symbol(unsigned long);
#define pfn_valid(pfn) ((pfn) < max_pfn)
#endif
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+static __always_inline void clear_page(void *page)
+{
+ uint32_t len = PAGE_SIZE;
+ asm volatile (
+ "rep stosb"
+ : "+D" (page), "+c" (len)
+ : "a" (0)
+ : "memory"
+ );
+}
+#else
void clear_page_orig(void *page);
void clear_page_rep(void *page);
void clear_page_erms(void *page);
@@ -53,8 +65,22 @@ static inline void clear_page(void *page)
"0" (page)
: "cc", "memory", "rax", "rcx");
}
+#endif
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+static __always_inline void copy_page(void *to, void *from)
+{
+ uint32_t len = PAGE_SIZE;
+ asm volatile (
+ "rep movsb"
+ : "+D" (to), "+S" (from), "+c" (len)
+ :
+ : "memory"
+ );
+}
+#else
void copy_page(void *to, void *from);
+#endif
#endif /* !__ASSEMBLY__ */
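The clear_page()/copy_page() replacements above lean on GCC inline-asm constraints to feed the string instructions: "+D", "+S" and "+c" pin the destination, source and count to RDI, RSI and RCX (the registers rep movsb/stosb consume and update, hence the read-write "+" marker), "a" (0) supplies the zero fill byte in AL, and the "memory" clobber keeps the compiler from caching buffer contents across the asm. A runnable userspace sketch of the same pattern, x86-64 only (names are mine):

#include <assert.h>
#include <stdint.h>
#include <string.h>

static void clear_region(void *p, uint32_t len)
{
	/* same operand constraints as the patch's clear_page() */
	asm volatile("rep stosb"
		     : "+D" (p), "+c" (len)
		     : "a" (0)
		     : "memory");
}

int main(void)
{
	char buf[4096];

	memset(buf, 0xff, sizeof(buf));
	clear_region(buf, sizeof(buf));
	assert(buf[0] == 0 && buf[sizeof(buf) - 1] == 0);
	return 0;
}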
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index ac3892920419..d314c6b9b632 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -4,6 +4,7 @@
#include <linux/const.h>
#include <asm/alternative.h>
+#include <asm/cpufeatures.h>
/*
* Constructor for a conventional segment GDT (or LDT) entry.
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 11eda21eb697..41912f2713e5 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -268,18 +268,33 @@ swap_pages:
movq %rsi, %rax
movq %r10, %rdi
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ mov $4096, %ecx
+ rep movsb
+#else
movl $512, %ecx
rep ; movsq
+#endif
movq %rax, %rdi
movq %rdx, %rsi
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ mov $4096, %ecx
+ rep movsb
+#else
movl $512, %ecx
rep ; movsq
+#endif
movq %rdx, %rdi
movq %r10, %rsi
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ mov $4096, %ecx
+ rep movsb
+#else
movl $512, %ecx
rep ; movsq
+#endif
lea PAGE_SIZE(%rax), %rsi
jmp 0b
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 3d3c2f71f617..864a35038f74 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -136,6 +136,33 @@ ENTRY(verify_cpu)
movl $1,%eax
ret
.Lverify_cpu_sse_ok:
+
+#ifdef CONFIG_MARCH_NATIVE_POPCNT
+ mov $1, %eax
+ cpuid
+ bt $23, %ecx
+ jnc .Lverify_cpu_no_longmode
+#endif
+
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+ mov $1, %eax
+ cpuid
+ bt $22, %ecx
+ jnc .Lverify_cpu_no_longmode
+#endif
+
+#if defined(CONFIG_MARCH_NATIVE_REP_MOVSB) || defined(CONFIG_MARCH_NATIVE_REP_STOSB)
+ xor %eax, %eax
+ cpuid
+ cmp $7, %eax
+ jb .Lverify_cpu_no_longmode
+ mov $7, %eax
+ xor %ecx, %ecx
+ cpuid
+ bt $9, %ebx
+ jnc .Lverify_cpu_no_longmode
+#endif
+
popf # Restore caller passed flags
xorl %eax, %eax
ret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 140e61843a07..09b8afc764a2 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -22,14 +22,21 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
lib-y := delay.o misc.o cmdline.o cpu.o
lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
-lib-y += memcpy_$(BITS).o
+lib-y += memcpy_$(BITS).o memcpy_plain.o
+
+CFLAGS_memcpy_plain.o = -O3 -minline-all-stringops -mstringop-strategy=rep_byte
+CFLAGS_REMOVE_memcpy_plain.o = -O2 -mmemcpy-strategy=rep_byte:-1:align,rep_byte:-1:noalign
+
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
lib-$(CONFIG_RETPOLINE) += retpoline.o
-obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
+obj-y += msr.o msr-reg.o msr-reg-export.o
+ifneq ($(CONFIG_MARCH_NATIVE_POPCNT),y)
+ obj-y += hweight.o
+endif
obj-y += iomem.o
ifeq ($(CONFIG_X86_32),y)
@@ -45,7 +52,12 @@ endif
else
obj-y += iomap_copy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
- lib-y += clear_page_64.o copy_page_64.o
+ifneq ($(CONFIG_MARCH_NATIVE_REP_STOSB),y)
+ lib-y += clear_page_64.o
+endif
+ifneq ($(CONFIG_MARCH_NATIVE_REP_MOVSB),y)
+ lib-y += copy_page_64.o
+endif
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o
lib-y += cmpxchg16b_emu.o
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 3b24dc05251c..6e0bf704ffa4 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -7,182 +7,6 @@
#include <asm/alternative-asm.h>
#include <asm/export.h>
-/*
- * We build a jump to memcpy_orig by default which gets NOPped out on
- * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
- * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
- * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
- */
-
-.weak memcpy
-
-/*
- * memcpy - Copy a memory block.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * rax original destination
- */
-ENTRY(__memcpy)
-ENTRY(memcpy)
- ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
- "jmp memcpy_erms", X86_FEATURE_ERMS
-
- movq %rdi, %rax
- movq %rdx, %rcx
- shrq $3, %rcx
- andl $7, %edx
- rep movsq
- movl %edx, %ecx
- rep movsb
- ret
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-EXPORT_SYMBOL(memcpy)
-EXPORT_SYMBOL(__memcpy)
-
-/*
- * memcpy_erms() - enhanced fast string memcpy. This is faster and
- * simpler than memcpy. Use memcpy_erms when possible.
- */
-ENTRY(memcpy_erms)
- movq %rdi, %rax
- movq %rdx, %rcx
- rep movsb
- ret
-ENDPROC(memcpy_erms)
-
-ENTRY(memcpy_orig)
- movq %rdi, %rax
-
- cmpq $0x20, %rdx
- jb .Lhandle_tail
-
- /*
- * We check whether memory false dependence could occur,
- * then jump to corresponding copy mode.
- */
- cmp %dil, %sil
- jl .Lcopy_backward
- subq $0x20, %rdx
-.Lcopy_forward_loop:
- subq $0x20, %rdx
-
- /*
- * Move in blocks of 4x8 bytes:
- */
- movq 0*8(%rsi), %r8
- movq 1*8(%rsi), %r9
- movq 2*8(%rsi), %r10
- movq 3*8(%rsi), %r11
- leaq 4*8(%rsi), %rsi
-
- movq %r8, 0*8(%rdi)
- movq %r9, 1*8(%rdi)
- movq %r10, 2*8(%rdi)
- movq %r11, 3*8(%rdi)
- leaq 4*8(%rdi), %rdi
- jae .Lcopy_forward_loop
- addl $0x20, %edx
- jmp .Lhandle_tail
-
-.Lcopy_backward:
- /*
- * Calculate copy position to tail.
- */
- addq %rdx, %rsi
- addq %rdx, %rdi
- subq $0x20, %rdx
- /*
- * At most 3 ALU operations in one cycle,
- * so append NOPS in the same 16 bytes trunk.
- */
- .p2align 4
-.Lcopy_backward_loop:
- subq $0x20, %rdx
- movq -1*8(%rsi), %r8
- movq -2*8(%rsi), %r9
- movq -3*8(%rsi), %r10
- movq -4*8(%rsi), %r11
- leaq -4*8(%rsi), %rsi
- movq %r8, -1*8(%rdi)
- movq %r9, -2*8(%rdi)
- movq %r10, -3*8(%rdi)
- movq %r11, -4*8(%rdi)
- leaq -4*8(%rdi), %rdi
- jae .Lcopy_backward_loop
-
- /*
- * Calculate copy position to head.
- */
- addl $0x20, %edx
- subq %rdx, %rsi
- subq %rdx, %rdi
-.Lhandle_tail:
- cmpl $16, %edx
- jb .Lless_16bytes
-
- /*
- * Move data from 16 bytes to 31 bytes.
- */
- movq 0*8(%rsi), %r8
- movq 1*8(%rsi), %r9
- movq -2*8(%rsi, %rdx), %r10
- movq -1*8(%rsi, %rdx), %r11
- movq %r8, 0*8(%rdi)
- movq %r9, 1*8(%rdi)
- movq %r10, -2*8(%rdi, %rdx)
- movq %r11, -1*8(%rdi, %rdx)
- retq
- .p2align 4
-.Lless_16bytes:
- cmpl $8, %edx
- jb .Lless_8bytes
- /*
- * Move data from 8 bytes to 15 bytes.
- */
- movq 0*8(%rsi), %r8
- movq -1*8(%rsi, %rdx), %r9
- movq %r8, 0*8(%rdi)
- movq %r9, -1*8(%rdi, %rdx)
- retq
- .p2align 4
-.Lless_8bytes:
- cmpl $4, %edx
- jb .Lless_3bytes
-
- /*
- * Move data from 4 bytes to 7 bytes.
- */
- movl (%rsi), %ecx
- movl -4(%rsi, %rdx), %r8d
- movl %ecx, (%rdi)
- movl %r8d, -4(%rdi, %rdx)
- retq
- .p2align 4
-.Lless_3bytes:
- subl $1, %edx
- jb .Lend
- /*
- * Move data from 1 bytes to 3 bytes.
- */
- movzbl (%rsi), %ecx
- jz .Lstore_1byte
- movzbq 1(%rsi), %r8
- movzbq (%rsi, %rdx), %r9
- movb %r8b, 1(%rdi)
- movb %r9b, (%rdi, %rdx)
-.Lstore_1byte:
- movb %cl, (%rdi)
-
-.Lend:
- retq
-ENDPROC(memcpy_orig)
-
#ifndef CONFIG_UML
MCSAFE_TEST_CTL
diff --git a/arch/x86/lib/memcpy_plain.c b/arch/x86/lib/memcpy_plain.c
new file mode 100644
index 000000000000..24b247d9c26e
--- /dev/null
+++ b/arch/x86/lib/memcpy_plain.c
@@ -0,0 +1,13 @@
+#include <linux/types.h>
+
+void *memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size) {
+ void *old_to = to;
+ while(size--) {
+ *((char *) to++) = *((char *) from++);
+ }
+ return old_to;
+}
+
+void *__memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size) {
+ return memcpy(to, from, size);
+}
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 9bc861c71e75..7786d1a65423 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -8,6 +8,20 @@
.weak memset
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ENTRY(memset)
+ENTRY(__memset)
+ mov %esi, %eax
+ mov %rdi, %rsi
+ mov %rdx, %rcx
+ rep stosb
+ mov %rsi, %rax
+ ret
+ENDPROC(memset)
+ENDPROC(__memset)
+EXPORT_SYMBOL(memset)
+EXPORT_SYMBOL(__memset)
+#else
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
@@ -140,3 +154,4 @@ ENTRY(memset_orig)
jmp .Lafter_bad_alignment
.Lfinal:
ENDPROC(memset_orig)
+#endif
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index ee42bb0cbeb3..d89d6ef93dd4 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -15,11 +15,23 @@
unsigned long __clear_user(void __user *addr, unsigned long size)
{
- long __d0;
might_fault();
/* no memory constraint because it doesn't change any memory gcc knows
about */
stac();
+
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ asm volatile (
+ "0: rep stosb\n"
+ "1:\n"
+ _ASM_EXTABLE(0b,1b)
+ : "+D" (addr), "+c" (size)
+ : "a" (0)
+ : "memory"
+ );
+#else
+ {
+ long __d0;
asm volatile(
" testq %[size8],%[size8]\n"
" jz 4f\n"
@@ -41,6 +53,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
_ASM_EXTABLE_UA(1b, 2b)
: [size8] "=&c"(size), [dst] "=&D" (__d0)
: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
+ }
+#endif
clac();
return size;
}
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index 1f8825bbaffb..2737f3e8c021 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -64,9 +64,13 @@ ENTRY(pvh_start_xen)
mov $_pa(pvh_start_info), %edi
mov %ebx, %esi
mov _pa(pvh_start_info_sz), %ecx
+#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
+ rep movsb
+#else
shr $2,%ecx
rep
movsl
+#endif
mov $_pa(early_stack_end), %esp
diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index 7b926dfa6b97..c42e0d3dcab3 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -124,7 +124,7 @@ static void mt76_init_stream_cap(struct mt76_dev *dev,
bool vht)
{
struct ieee80211_sta_ht_cap *ht_cap = &sband->ht_cap;
- int i, nstream = __sw_hweight8(dev->antenna_mask);
+ int i, nstream = hweight8(dev->antenna_mask);
struct ieee80211_sta_vht_cap *vht_cap;
u16 mcs_map = 0;
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 705f7c442691..6f6be5c418f5 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -7,10 +7,12 @@
#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
+#ifndef CONFIG_MARCH_NATIVE_POPCNT
extern unsigned int __sw_hweight8(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);
extern unsigned int __sw_hweight32(unsigned int w);
extern unsigned long __sw_hweight64(__u64 w);
+#endif
/*
* Include this here because some architectures need generic_ffs/fls in
diff --git a/lib/Makefile b/lib/Makefile
index e1b59da71418..a48c5fece180 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -93,7 +93,9 @@ obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
obj-y += logic_pio.o
+ifneq ($(CONFIG_MARCH_NATIVE_POPCNT),y)
obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
+endif
obj-$(CONFIG_BTREE) += btree.o
obj-$(CONFIG_INTERVAL_TREE) += interval_tree.o
diff --git a/scripts/kconfig/.gitignore b/scripts/kconfig/.gitignore
index b5bf92f66d11..411a885ad9b1 100644
--- a/scripts/kconfig/.gitignore
+++ b/scripts/kconfig/.gitignore
@@ -8,6 +8,7 @@
# configuration programs
#
conf
+cpuid
mconf
nconf
qconf
diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile
index 181973509a05..83fecd21e42b 100644
--- a/scripts/kconfig/Makefile
+++ b/scripts/kconfig/Makefile
@@ -65,8 +65,9 @@ simple-targets := oldconfig allnoconfig allyesconfig allmodconfig \
alldefconfig randconfig listnewconfig olddefconfig syncconfig
PHONY += $(simple-targets)
-$(simple-targets): $(obj)/conf
+$(simple-targets): $(obj)/conf $(obj)/cpuid
$< $(silent) --$@ $(Kconfig)
+ $(Q)$(srctree)/scripts/march-native.sh "$(CC)" $(obj)/cpuid
PHONY += savedefconfig defconfig
@@ -149,6 +150,10 @@ $(obj)/zconf.lex.o: $(obj)/zconf.tab.h
HOSTCFLAGS_zconf.lex.o := -I$(src)
HOSTCFLAGS_zconf.tab.o := -I$(src)
+# cpuid: -march=native, CONFIG_MARCH_NATIVE_* detection
+hostprogs-y += cpuid
+cpuid-objs := cpuid.o
+
# conf: Used for defconfig, oldconfig and related targets
hostprogs-y += conf
conf-objs := conf.o $(common-objs)
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
new file mode 100644
index 000000000000..613c3e738f12
--- /dev/null
+++ b/scripts/kconfig/cpuid.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifdef __x86_64__
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static inline bool streq(const char *s1, const char *s2)
+{
+ return strcmp(s1, s2) == 0;
+}
+
+static inline void cpuid(uint32_t eax0, uint32_t *eax, uint32_t *ecx, uint32_t *edx, uint32_t *ebx)
+{
+ asm volatile (
+ "cpuid"
+ : "=a" (*eax), "=c" (*ecx), "=d" (*edx), "=b" (*ebx)
+ : "0" (eax0)
+ );
+}
+
+static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t *ecx, uint32_t *edx, uint32_t *ebx)
+{
+ asm volatile (
+ "cpuid"
+ : "=a" (*eax), "=c" (*ecx), "=d" (*edx), "=b" (*ebx)
+ : "0" (eax0), "1" (ecx0)
+ );
+}
+
+static bool movbe = false;
+static bool popcnt = false;
+static bool rep_movsb = false;
+static bool rep_stosb = false;
+
+static uint32_t eax0_max;
+
+static void intel(void)
+{
+ uint32_t eax, ecx, edx, ebx;
+
+ if (eax0_max >= 1) {
+ cpuid(1, &eax, &ecx, &edx, &ebx);
+// printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+
+ if (ecx & (1 << 22))
+ movbe = true;
+ if (ecx & (1 << 23))
+ popcnt = true;
+ }
+ if (eax0_max >= 7) {
+ cpuid2(7, 0, &eax, &ecx, &edx, &ebx);
+// printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+
+ if (ebx & (1 << 9)) {
+ rep_movsb = true;
+ rep_stosb = true;
+ }
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ const char *opt = argv[1];
+ uint32_t eax, ecx, edx, ebx;
+
+ if (argc != 2)
+ return EXIT_FAILURE;
+
+ cpuid(0, &eax, &ecx, &edx, &ebx);
+// printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
+ eax0_max = eax;
+
+ if (ecx == 0x6c65746e && edx == 0x49656e69 && ebx == 0x756e6547) {
+ intel();
+ }
+
+#define _(x) if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
+ _(movbe);
+ _(popcnt);
+ _(rep_movsb);
+ _(rep_stosb);
+#undef _
+
+ return EXIT_FAILURE;
+}
+#else
+#include <stdlib.h>
+int main(void)
+{
+ return EXIT_FAILURE;
+}
+#endif
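The bit positions probed by cpuid.c are the architectural CPUID feature flags: leaf 1 ECX bit 22 is MOVBE and bit 23 is POPCNT, while leaf 7 (subleaf 0) EBX bit 9 is ERMS, the "enhanced REP MOVSB/STOSB" capability; the vendor test against EBX/EDX/ECX spelling "GenuineIntel" means detection is only attempted on Intel CPUs. For reference, the same probes can be written with GCC's <cpuid.h> helpers (GCC 6+ and clang) instead of hand-rolled inline asm; this variant is mine, not part of the patch:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* leaf 1, ECX: bit 22 = MOVBE, bit 23 = POPCNT */
	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
		printf("movbe:  %d\n", !!(ecx & (1u << 22)));
		printf("popcnt: %d\n", !!(ecx & (1u << 23)));
	}

	/* leaf 7, subleaf 0, EBX: bit 9 = ERMS */
	if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		printf("erms:   %d\n", !!(ebx & (1u << 9)));
	return 0;
}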
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
new file mode 100755
index 000000000000..38710a4f0616
--- /dev/null
+++ b/scripts/march-native.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+# Copyright (c) 2017-2019 Alexey Dobriyan <adobriyan@gmail.com>
+if test "$(uname -m)" != "x86_64"; then
+ exit 0
+fi
+
+CC="$1"
+CPUID="$2"
+CONFIG=".config"
+AUTOCONF1="include/config/auto.conf"
+AUTOCONF2="include/generated/autoconf.h"
+
+if ! grep -q -e '^CONFIG_MARCH_NATIVE=y$' "$CONFIG"; then
+ sed -i -e '/^CONFIG_MARCH_NATIVE/d' "$AUTOCONF1" "$AUTOCONF2" >/dev/null 2>&1
+ exit 0
+fi
+
+if ! $CC -march=native -x c -c -o /dev/null /dev/null >/dev/null 2>&1; then
+ echo >&2 "error: unsupported '-march=native' compiler option"
+ exit 1
+fi
+
+_option() {
+ echo "$1=$2" >>"$CONFIG"
+ echo "$1=$2" >>"$AUTOCONF1"
+ echo "#define $1 $2" >>"$AUTOCONF2"
+}
+
+option() {
+ echo "$1=y" >>"$CONFIG"
+ echo "$1=y" >>"$AUTOCONF1"
+ echo "#define $1 1" >>"$AUTOCONF2"
+}
+
+if test ! -f "$CONFIG" -o ! -f "$AUTOCONF1" -o ! -f "$AUTOCONF2"; then
+ exit 0
+fi
+
+COLLECT_GCC_OPTIONS=$(
+ $CC -march=native -v -E -x c -c /dev/null 2>&1 |\
+ sed -ne '/^COLLECT_GCC_OPTIONS=/{n;p}' |\
+ awk '{$1=$1};1'
+)
+echo "-march=native: $COLLECT_GCC_OPTIONS"
+_option "CONFIG_MARCH_NATIVE_CC_OPTIONS" "\"$COLLECT_GCC_OPTIONS\""
+
+"$CPUID" movbe && option "CONFIG_MARCH_NATIVE_MOVBE"
+"$CPUID" popcnt && option "CONFIG_MARCH_NATIVE_POPCNT"
+"$CPUID" rep_movsb && option "CONFIG_MARCH_NATIVE_REP_MOVSB"
+"$CPUID" rep_stosb && option "CONFIG_MARCH_NATIVE_REP_STOSB"
+
+for i in $COLLECT_GCC_OPTIONS; do
+ case $i in
+ */cc1|-E|-quiet|-v|/dev/null|--param|-fstack-protector*)
+ ;;
+
+ l1-cache-line-size=64)
+ _option "CONFIG_X86_L1_CACHE_SHIFT" 6
+ _option "CONFIG_X86_INTERNODE_CACHE_SHIFT" 6
+ ;;
+
+ l1-cache-size=*);;
+ l2-cache-size=*);;
+
+ -march=*);;
+ -mtune=*);;
+
+ -m*);;
+ -mno-*);;
+
+ *)
+ echo >&2 "warning: unexpected -march=native option '$i'"
+ esac
+done
diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S
index 3b24dc05251c..df43623f2a6c 100644
--- a/tools/arch/x86/lib/memcpy_64.S
+++ b/tools/arch/x86/lib/memcpy_64.S
@@ -14,7 +14,7 @@
* to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
*/
-.weak memcpy
+.weak memcpy_movsq
/*
* memcpy - Copy a memory block.
@@ -27,11 +27,7 @@
* Output:
* rax original destination
*/
-ENTRY(__memcpy)
-ENTRY(memcpy)
- ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
- "jmp memcpy_erms", X86_FEATURE_ERMS
-
+ENTRY(memcpy_movsq)
movq %rdi, %rax
movq %rdx, %rcx
shrq $3, %rcx
@@ -40,10 +36,8 @@ ENTRY(memcpy)
movl %edx, %ecx
rep movsb
ret
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-EXPORT_SYMBOL(memcpy)
-EXPORT_SYMBOL(__memcpy)
+ENDPROC(memcpy_movsq)
+EXPORT_SYMBOL(memcpy_movsq)
/*
* memcpy_erms() - enhanced fast string memcpy. This is faster and
diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
index e4e321b6f883..8cbb4787b701 100644
--- a/tools/perf/bench/Build
+++ b/tools/perf/bench/Build
@@ -13,5 +13,11 @@ perf-y += epoll-ctl.o
perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o
perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
perf-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o
+perf-$(CONFIG_X86_64) += memcpy_plain.o
+perf-$(CONFIG_X86_64) += memcpy_plain_native.o
+
+CFLAGS_memcpy_plain.o = -falign-functions=32 -mno-avx -mno-avx2 -mno-sse -ffreestanding
+CFLAGS_memcpy_plain_native.o = -falign-functions=32 -O3 -march=corei7 -minline-all-stringops -mstringop-strategy=rep_byte -mno-avx -mno-avx2 -mno-sse
+CFLAGS_REMOVE_memcpy_plain_native.o = -O2
perf-$(CONFIG_NUMA) += numa.o
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
index 50ae8bd58296..0f39b99c95d4 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -4,10 +4,18 @@ MEMCPY_FN(memcpy_orig,
"x86-64-unrolled",
"unrolled memcpy() in arch/x86/lib/memcpy_64.S")
-MEMCPY_FN(__memcpy,
+MEMCPY_FN(memcpy_movsq,
"x86-64-movsq",
"movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
MEMCPY_FN(memcpy_erms,
"x86-64-movsb",
"movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
+
+MEMCPY_FN(__memcpy,
+ "x86-64-c-memcpy",
+ "C-based memcpy() in arch/x86/lib/memcpy_plain.c")
+
+MEMCPY_FN(memcpy_native,
+ "x86-64-c-memcpy-native",
+ "C-based memcpy(), natively optimized in arch/x86/lib/memcpy_plain.c")
diff --git a/tools/perf/bench/memcpy_plain.c b/tools/perf/bench/memcpy_plain.c
new file mode 100644
index 000000000000..282551f35f24
--- /dev/null
+++ b/tools/perf/bench/memcpy_plain.c
@@ -0,0 +1,18 @@
+#include <linux/types.h>
+
+void *memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size);
+void *__memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size);
+
+void *memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size) {
+ char *t = to;
+ const char *f = from;
+ while(size--) {
+ *(t++) = *(f++);
+ }
+ return to;
+
+}
+
+void *__memcpy(void *to, const void *from, size_t size) {
+ return memcpy(to, from, size);
+}
diff --git a/tools/perf/bench/memcpy_plain_native.c b/tools/perf/bench/memcpy_plain_native.c
new file mode 100644
index 000000000000..cab84e0404a3
--- /dev/null
+++ b/tools/perf/bench/memcpy_plain_native.c
@@ -0,0 +1,11 @@
+#include <linux/types.h>
+
+void *memcpy_native(void * __restrict__ to, const void * __restrict__ from, size_t size);
+
+void *memcpy_native(void * __restrict__ to, const void * __restrict__ from, size_t size) {
+ void *old_to = to;
+ while(size--) {
+ *((char *) to++) = *((char *) from++);
+ }
+ return old_to;
+}
--
2.22.0