
Combination of native and c-memcpy

    Authored by Marco Ammon
    Combination-of-native-and-c-memcpy.patch 31.95 KiB
    From a8613ce66604d96aee0a7c261ced50113f61a400 Mon Sep 17 00:00:00 2001
    From: Marco Ammon <marco.ammon@fau.de>
    Date: Wed, 7 Aug 2019 14:07:53 +0200
    Subject: [PATCH] Combination of native and c-memcpy

    Combine the -march=native configure-time detection (scripts/kconfig/cpuid.c
    and scripts/march-native.sh by Alexey Dobriyan) with a plain C memcpy().
    When CONFIG_MARCH_NATIVE is selected, the detected
    CONFIG_MARCH_NATIVE_{MOVBE,POPCNT,REP_MOVSB,REP_STOSB} options switch the
    hweight*() helpers to __builtin_popcount*(), the byte swapping in
    des3_ede-asm_64.S to MOVBE, and page, BSS, and user-space clears and copies
    to REP MOVSB/STOSB. The assembly memcpy variants in arch/x86/lib/memcpy_64.S
    are replaced by a byte-loop memcpy in arch/x86/lib/memcpy_plain.c built with
    -minline-all-stringops -mstringop-strategy=rep_byte so that GCC expands it
    to REP MOVSB. perf bench gains x86-64-c-memcpy and x86-64-c-memcpy-native
    variants for comparison.
    ---
     Makefile                                      |  16 +-
     arch/x86/Kconfig.cpu                          |   8 +
     arch/x86/Makefile                             |  27 ++-
     arch/x86/boot/compressed/head_64.S            |   4 +
     arch/x86/crypto/des3_ede-asm_64.S             |  28 +++
     arch/x86/crypto/sha1_ssse3_asm.S              |   7 +-
     arch/x86/include/asm/arch_hweight.h           |  28 ++-
     arch/x86/include/asm/page_64.h                |  26 +++
     arch/x86/include/asm/segment.h                |   1 +
     arch/x86/kernel/relocate_kernel_64.S          |  15 ++
     arch/x86/kernel/verify_cpu.S                  |  27 +++
     arch/x86/lib/Makefile                         |  18 +-
     arch/x86/lib/memcpy_64.S                      | 176 ------------------
     arch/x86/lib/memcpy_plain.c                   |  13 ++
     arch/x86/lib/memset_64.S                      |  15 ++
     arch/x86/lib/usercopy_64.c                    |  16 +-
     arch/x86/platform/pvh/head.S                  |   4 +
     drivers/net/wireless/mediatek/mt76/mac80211.c |   2 +-
     include/linux/bitops.h                        |   2 +
     lib/Makefile                                  |   2 +
     scripts/kconfig/.gitignore                    |   1 +
     scripts/kconfig/Makefile                      |   7 +-
     scripts/kconfig/cpuid.c                       | 108 +++++++++++
     scripts/march-native.sh                       |  74 ++++++++
     tools/arch/x86/lib/memcpy_64.S                |  14 +-
     tools/perf/bench/Build                        |   6 +
     tools/perf/bench/mem-memcpy-x86-64-asm-def.h  |  10 +-
     tools/perf/bench/memcpy_plain.c               |  18 ++
     tools/perf/bench/memcpy_plain_native.c        |  11 ++
     29 files changed, 482 insertions(+), 202 deletions(-)
     create mode 100644 arch/x86/lib/memcpy_plain.c
     create mode 100644 scripts/kconfig/cpuid.c
     create mode 100755 scripts/march-native.sh
     create mode 100644 tools/perf/bench/memcpy_plain.c
     create mode 100644 tools/perf/bench/memcpy_plain_native.c
    
    diff --git a/Makefile b/Makefile
    index d5713e7b1e50..548d1bc33ceb 100644
    --- a/Makefile
    +++ b/Makefile
    @@ -2,7 +2,7 @@
     VERSION = 5
     PATCHLEVEL = 0
     SUBLEVEL = 0
    -EXTRAVERSION =
    +EXTRAVERSION =-c-memcpy-native
     NAME = Shy Crocodile
     
     # *DOCUMENTATION*
    @@ -370,10 +370,10 @@ HOST_LFS_LIBS := $(shell getconf LFS_LIBS 2>/dev/null)
     
     HOSTCC       = gcc
     HOSTCXX      = g++
    -KBUILD_HOSTCFLAGS   := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 \
    +KBUILD_HOSTCFLAGS   := -Wall -Wmissing-prototypes -Wstrict-prototypes -march=native -O2 \
     		-fomit-frame-pointer -std=gnu89 $(HOST_LFS_CFLAGS) \
     		$(HOSTCFLAGS)
    -KBUILD_HOSTCXXFLAGS := -O2 $(HOST_LFS_CFLAGS) $(HOSTCXXFLAGS)
    +KBUILD_HOSTCXXFLAGS := -march=native -O2 $(HOST_LFS_CFLAGS) $(HOSTCXXFLAGS)
     KBUILD_HOSTLDFLAGS  := $(HOST_LFS_LDFLAGS) $(HOSTLDFLAGS)
     KBUILD_HOSTLDLIBS   := $(HOST_LFS_LIBS) $(HOSTLDLIBS)
     
    @@ -594,6 +594,16 @@ ifeq ($(dot-config),1)
     include include/config/auto.conf
     endif
     
    +ifdef CONFIG_MARCH_NATIVE
    +KBUILD_CFLAGS += -march=native
    +endif
    +ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
    +KBUILD_CFLAGS += -mmemcpy-strategy=rep_byte:-1:align,rep_byte:-1:noalign
    +endif
    +ifdef CONFIG_MARCH_NATIVE_REP_STOSB
    +KBUILD_CFLAGS += -mmemset-strategy=rep_byte:-1:align,rep_byte:-1:noalign
    +endif
    +
     # The all: target is the default when no target is given on the
     # command line.
     # This allow a user to issue only 'make' to build a kernel including modules
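
    The two strategy flags above pick GCC's inline expansion for memcpy() and
    memset(): "rep_byte" selects REP MOVSB/STOSB, the "-1" size limit applies it
    to copies of any length, and the align/noalign pair covers both alignment
    cases. As a hand-written sketch of what that expansion amounts to (assuming
    an ERMS-capable CPU; this helper is illustrative, not part of the patch):

        #include <stddef.h>

        /* Equivalent of GCC's rep_byte memcpy expansion: one REP MOVSB,
         * which ERMS CPUs microcode into a fast block copy. The insn
         * advances RDI/RSI and decrements RCX in place. */
        static inline void *rep_movsb_copy(void *to, const void *from, size_t len)
        {
            void *ret = to;

            asm volatile("rep movsb"
                         : "+D" (to), "+S" (from), "+c" (len)
                         :
                         : "memory");
            return ret;
        }
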
    diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
    index 6adce15268bd..e06a22d3a163 100644
    --- a/arch/x86/Kconfig.cpu
    +++ b/arch/x86/Kconfig.cpu
    @@ -287,6 +287,12 @@ config GENERIC_CPU
     	  Generic x86-64 CPU.
     	  Run equally well on all x86-64 CPUs.
     
    +config MARCH_NATIVE
    +	bool "-march=native"
    +	depends on X86_64
    +	---help---
    +	  -march=native support.
    +
     endchoice
     
     config X86_GENERIC
    @@ -307,6 +313,7 @@ config X86_INTERNODE_CACHE_SHIFT
     	int
     	default "12" if X86_VSMP
     	default X86_L1_CACHE_SHIFT
    +	depends on !MARCH_NATIVE
     
     config X86_L1_CACHE_SHIFT
     	int
    @@ -314,6 +321,7 @@ config X86_L1_CACHE_SHIFT
     	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
     	default "4" if MELAN || M486 || MGEODEGX1
     	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
    +	depends on !MARCH_NATIVE
     
     config X86_F00F_BUG
     	def_bool y
    diff --git a/arch/x86/Makefile b/arch/x86/Makefile
    index 9c5a67d1b9c1..9d2605a52fce 100644
    --- a/arch/x86/Makefile
    +++ b/arch/x86/Makefile
    @@ -12,6 +12,28 @@ else
             KBUILD_DEFCONFIG := $(ARCH)_defconfig
     endif
     
    +CFLAGS_NO_FP :=
    +CFLAGS_NO_FP += $(call cc-option,-mno-mmx,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-sse,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-sse2,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-sse3,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-ssse3,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-sse4,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-sse4a,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-sse4.1,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-sse4.2,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-avx,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-avx2,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-avx512f,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-avx512pf,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-avx512er,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-avx512cd,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-avx512vl,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-avx512bw,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-avx512dq,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-avx512ifma,)
    +CFLAGS_NO_FP += $(call cc-option,-mno-avx512vbmi,)
    +
     # For gcc stack alignment is specified with -mpreferred-stack-boundary,
     # clang has the option -mstack-alignment for that purpose.
     ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
    @@ -34,7 +56,7 @@ M16_CFLAGS	 := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))
     REALMODE_CFLAGS	:= $(M16_CFLAGS) -g -Os -DDISABLE_BRANCH_PROFILING \
     		   -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
     		   -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
    -		   -mno-mmx -mno-sse
    +		   $(CFLAGS_NO_FP)
     
     REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
     REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
    @@ -57,8 +79,7 @@ endif
     #
     #    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
     #
    -KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
    -KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
    +KBUILD_CFLAGS += $(CFLAGS_NO_FP)
     
     ifeq ($(CONFIG_X86_32),y)
             BITS := 32
    diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
    index f62e347862cc..10e3be1ae849 100644
    --- a/arch/x86/boot/compressed/head_64.S
    +++ b/arch/x86/boot/compressed/head_64.S
    @@ -517,8 +517,12 @@ relocated:
     	leaq    _bss(%rip), %rdi
     	leaq    _ebss(%rip), %rcx
     	subq	%rdi, %rcx
    +#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
    +	rep stosb
    +#else
     	shrq	$3, %rcx
     	rep	stosq
    +#endif
     
     /*
      * Do the extraction, and jump to the new kernel..
    diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
    index 8e49ce117494..007319ea1f62 100644
    --- a/arch/x86/crypto/des3_ede-asm_64.S
    +++ b/arch/x86/crypto/des3_ede-asm_64.S
    @@ -159,6 +159,15 @@
     
     #define dummy2(a, b) /*_*/
     
    +#ifdef CONFIG_MARCH_NATIVE_MOVBE
    +#define read_block(io, left, right) \
    +	movbe	 (io), left##d; \
    +	movbe	4(io), right##d;
    +
    +#define write_block(io, left, right) \
    +	movbe	left##d,   (io); \
    +	movbe	right##d, 4(io);
    +#else
     #define read_block(io, left, right) \
     	movl    (io), left##d; \
     	movl   4(io), right##d; \
    @@ -170,6 +179,7 @@
     	bswapl right##d; \
     	movl   left##d,   (io); \
     	movl   right##d, 4(io);
    +#endif
     
     ENTRY(des3_ede_x86_64_crypt_blk)
     	/* input:
    @@ -443,6 +453,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
     	pushq %rsi /* dst */
     
     	/* load input */
    +#ifdef CONFIG_MARCH_NATIVE_MOVBE
    +	movbe 0 * 4(%rdx), RL0d;
    +	movbe 1 * 4(%rdx), RR0d;
    +	movbe 2 * 4(%rdx), RL1d;
    +	movbe 3 * 4(%rdx), RR1d;
    +	movbe 4 * 4(%rdx), RL2d;
    +	movbe 5 * 4(%rdx), RR2d;
    +#else
     	movl 0 * 4(%rdx), RL0d;
     	movl 1 * 4(%rdx), RR0d;
     	movl 2 * 4(%rdx), RL1d;
    @@ -456,6 +474,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
     	bswapl RR1d;
     	bswapl RL2d;
     	bswapl RR2d;
    +#endif
     
     	initial_permutation3(RL, RR);
     
    @@ -516,6 +535,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
     
     	final_permutation3(RR, RL);
     
    +#ifdef CONFIG_MARCH_NATIVE_MOVBE
    +	movbe RR0d, 0 * 4(%rsi);
    +	movbe RL0d, 1 * 4(%rsi);
    +	movbe RR1d, 2 * 4(%rsi);
    +	movbe RL1d, 3 * 4(%rsi);
    +	movbe RR2d, 4 * 4(%rsi);
    +	movbe RL2d, 5 * 4(%rsi);
    +#else
     	bswapl RR0d;
     	bswapl RL0d;
     	bswapl RR1d;
    @@ -530,6 +557,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
     	movl RL1d, 3 * 4(%rsi);
     	movl RR2d, 4 * 4(%rsi);
     	movl RL2d, 5 * 4(%rsi);
    +#endif
     
     	popq %r15;
     	popq %r14;
    diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
    index 613d0bfc3d84..9e8d3abc6b57 100644
    --- a/arch/x86/crypto/sha1_ssse3_asm.S
    +++ b/arch/x86/crypto/sha1_ssse3_asm.S
    @@ -94,10 +94,15 @@
     	SHA1_PIPELINED_MAIN_BODY
     
     	# cleanup workspace
    -	mov	$8, %ecx
     	mov	%rsp, %rdi
     	xor	%eax, %eax
    +#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
    +	mov	$64, %ecx
    +	rep stosb
    +#else
    +	mov	$8, %ecx
     	rep stosq
    +#endif
     
     	mov	%rbp, %rsp		# deallocate workspace
     	pop	%rbp
    diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
    index fc0693569f7a..3e9b45bd5b6c 100644
    --- a/arch/x86/include/asm/arch_hweight.h
    +++ b/arch/x86/include/asm/arch_hweight.h
    @@ -2,6 +2,30 @@
     #ifndef _ASM_X86_HWEIGHT_H
     #define _ASM_X86_HWEIGHT_H
     
    +#define __HAVE_ARCH_SW_HWEIGHT
    +
    +#ifdef CONFIG_MARCH_NATIVE_POPCNT
    +static inline unsigned int __arch_hweight64(uint64_t x)
    +{
    +	return __builtin_popcountll(x);
    +}
    +
    +static inline unsigned int __arch_hweight32(uint32_t x)
    +{
    +	return __builtin_popcount(x);
    +}
    +
    +static inline unsigned int __arch_hweight16(uint16_t x)
    +{
    +	return __builtin_popcount(x);
    +}
    +
    +static inline unsigned int __arch_hweight8(uint8_t x)
    +{
    +	return __builtin_popcount(x);
    +}
    +#else
    +
     #include <asm/cpufeatures.h>
     
     #ifdef CONFIG_64BIT
    @@ -12,8 +36,6 @@
     #define REG_OUT "a"
     #endif
     
    -#define __HAVE_ARCH_SW_HWEIGHT
    -
     static __always_inline unsigned int __arch_hweight32(unsigned int w)
     {
     	unsigned int res;
    @@ -55,3 +77,5 @@ static __always_inline unsigned long __arch_hweight64(__u64 w)
     #endif /* CONFIG_X86_32 */
     
     #endif
    +
    +#endif
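
    With CONFIG_MARCH_NATIVE_POPCNT the hweight helpers become compiler
    builtins; since the whole kernel is built with -march=native on a
    POPCNT-capable CPU, each call compiles down to a single popcnt instruction
    instead of the software fallback in lib/hweight.c. A small user-space check
    of the same builtins (assumes a host compiler run with -march=native or
    -mpopcnt; not part of the patch):

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
            uint64_t x = 0xdeadbeefcafebabeULL;

            /* One popcnt instruction each when POPCNT is enabled. */
            printf("hweight64 = %u\n", (unsigned)__builtin_popcountll(x));
            printf("hweight8  = %u\n", (unsigned)__builtin_popcount(0xabu));
            return 0;
        }
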
    diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
    index 939b1cff4a7b..7654d5544e0b 100644
    --- a/arch/x86/include/asm/page_64.h
    +++ b/arch/x86/include/asm/page_64.h
    @@ -40,6 +40,18 @@ extern unsigned long __phys_addr_symbol(unsigned long);
     #define pfn_valid(pfn)          ((pfn) < max_pfn)
     #endif
     
    +#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
    +static __always_inline void clear_page(void *page)
    +{
    +	uint32_t len = PAGE_SIZE;
    +	asm volatile (
    +		"rep stosb"
    +		: "+D" (page), "+c" (len)
    +		: "a" (0)
    +		: "memory"
    +	);
    +}
    +#else
     void clear_page_orig(void *page);
     void clear_page_rep(void *page);
     void clear_page_erms(void *page);
    @@ -53,8 +65,22 @@ static inline void clear_page(void *page)
     			   "0" (page)
     			   : "cc", "memory", "rax", "rcx");
     }
    +#endif
     
    +#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
    +static __always_inline void copy_page(void *to, void *from)
    +{
    +	uint32_t len = PAGE_SIZE;
    +	asm volatile (
    +		"rep movsb"
    +		: "+D" (to), "+S" (from), "+c" (len)
    +		:
    +		: "memory"
    +	);
    +}
    +#else
     void copy_page(void *to, void *from);
    +#endif
     
     #endif	/* !__ASSEMBLY__ */
     
    diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
    index ac3892920419..d314c6b9b632 100644
    --- a/arch/x86/include/asm/segment.h
    +++ b/arch/x86/include/asm/segment.h
    @@ -4,6 +4,7 @@
     
     #include <linux/const.h>
     #include <asm/alternative.h>
    +#include <asm/cpufeatures.h>
     
     /*
      * Constructor for a conventional segment GDT (or LDT) entry.
    diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
    index 11eda21eb697..41912f2713e5 100644
    --- a/arch/x86/kernel/relocate_kernel_64.S
    +++ b/arch/x86/kernel/relocate_kernel_64.S
    @@ -268,18 +268,33 @@ swap_pages:
     	movq	%rsi, %rax
     
     	movq	%r10, %rdi
    +#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
    +	mov	$4096, %ecx
    +	rep movsb
    +#else
     	movl	$512, %ecx
     	rep ; movsq
    +#endif
     
     	movq	%rax, %rdi
     	movq	%rdx, %rsi
    +#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
    +	mov	$4096, %ecx
    +	rep movsb
    +#else
     	movl	$512, %ecx
     	rep ; movsq
    +#endif
     
     	movq	%rdx, %rdi
     	movq	%r10, %rsi
    +#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
    +	mov	$4096, %ecx
    +	rep movsb
    +#else
     	movl	$512, %ecx
     	rep ; movsq
    +#endif
     
     	lea	PAGE_SIZE(%rax), %rsi
     	jmp	0b
    diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
    index 3d3c2f71f617..864a35038f74 100644
    --- a/arch/x86/kernel/verify_cpu.S
    +++ b/arch/x86/kernel/verify_cpu.S
    @@ -136,6 +136,33 @@ ENTRY(verify_cpu)
     	movl $1,%eax
     	ret
     .Lverify_cpu_sse_ok:
    +
    +#ifdef CONFIG_MARCH_NATIVE_POPCNT
    +	mov	$1, %eax
    +	cpuid
    +	bt	$23, %ecx
    +	jnc	.Lverify_cpu_no_longmode
    +#endif
    +
    +#ifdef CONFIG_MARCH_NATIVE_MOVBE
    +	mov	$1, %eax
    +	cpuid
    +	bt	$22, %ecx
    +	jnc	.Lverify_cpu_no_longmode
    +#endif
    +
    +#if defined(CONFIG_MARCH_NATIVE_REP_MOVSB) || defined(CONFIG_MARCH_NATIVE_REP_STOSB)
    +	xor	%eax, %eax
    +	cpuid
    +	cmp	$7, %eax
    +	jb	.Lverify_cpu_no_longmode
    +	mov	$7, %eax
    +	xor	%ecx, %ecx
    +	cpuid
    +	bt	$9, %ebx
    +	jnc	.Lverify_cpu_no_longmode
    +#endif
    +
     	popf				# Restore caller passed flags
     	xorl %eax, %eax
     	ret
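
    These early checks make a natively built kernel refuse to boot on a CPU
    lacking the features it was compiled for (reusing the existing no-longmode
    failure path): CPUID.01H:ECX bit 22 is MOVBE, bit 23 is POPCNT, and
    CPUID.07H(ECX=0):EBX bit 9 is ERMS (enhanced REP MOVSB/STOSB), the same
    bits scripts/kconfig/cpuid.c probes at configure time. A user-space sketch
    of the probe, assuming a <cpuid.h> that provides GCC's __get_cpuid and
    __get_cpuid_count helpers (not part of the patch):

        #include <cpuid.h>
        #include <stdio.h>

        int main(void)
        {
            unsigned int eax, ebx, ecx, edx;

            if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
                printf("movbe:  %u\n", (ecx >> 22) & 1);
                printf("popcnt: %u\n", (ecx >> 23) & 1);
            }
            if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
                printf("erms:   %u\n", (ebx >> 9) & 1);
            return 0;
        }
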
    diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
    index 140e61843a07..09b8afc764a2 100644
    --- a/arch/x86/lib/Makefile
    +++ b/arch/x86/lib/Makefile
    @@ -22,14 +22,21 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
     
     lib-y := delay.o misc.o cmdline.o cpu.o
     lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
    -lib-y += memcpy_$(BITS).o
    +lib-y += memcpy_$(BITS).o memcpy_plain.o
    +
    +CFLAGS_memcpy_plain.o = -O3 -minline-all-stringops -mstringop-strategy=rep_byte
    +CFLAGS_REMOVE_memcpy_plain.o = -O2 -mmemcpy-strategy=rep_byte:-1:align,rep_byte:-1:noalign
    +
     lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
     lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
     lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
     lib-$(CONFIG_FUNCTION_ERROR_INJECTION)	+= error-inject.o
     lib-$(CONFIG_RETPOLINE) += retpoline.o
     
    -obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
    +obj-y += msr.o msr-reg.o msr-reg-export.o
    +ifneq ($(CONFIG_MARCH_NATIVE_POPCNT),y)
    +	obj-y += hweight.o
    +endif
     obj-y += iomem.o
     
     ifeq ($(CONFIG_X86_32),y)
    @@ -45,7 +52,12 @@ endif
     else
             obj-y += iomap_copy_64.o
             lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
    -        lib-y += clear_page_64.o copy_page_64.o
    +ifneq ($(CONFIG_MARCH_NATIVE_REP_STOSB),y)
    +        lib-y += clear_page_64.o
    +endif
    +ifneq ($(CONFIG_MARCH_NATIVE_REP_MOVSB),y)
    +	lib-y += copy_page_64.o
    +endif
             lib-y += memmove_64.o memset_64.o
             lib-y += copy_user_64.o
     	lib-y += cmpxchg16b_emu.o
    diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
    index 3b24dc05251c..6e0bf704ffa4 100644
    --- a/arch/x86/lib/memcpy_64.S
    +++ b/arch/x86/lib/memcpy_64.S
    @@ -7,182 +7,6 @@
     #include <asm/alternative-asm.h>
     #include <asm/export.h>
     
    -/*
    - * We build a jump to memcpy_orig by default which gets NOPped out on
    - * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
    - * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
    - * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
    - */
    -
    -.weak memcpy
    -
    -/*
    - * memcpy - Copy a memory block.
    - *
    - * Input:
    - *  rdi destination
    - *  rsi source
    - *  rdx count
    - *
    - * Output:
    - * rax original destination
    - */
    -ENTRY(__memcpy)
    -ENTRY(memcpy)
    -	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
    -		      "jmp memcpy_erms", X86_FEATURE_ERMS
    -
    -	movq %rdi, %rax
    -	movq %rdx, %rcx
    -	shrq $3, %rcx
    -	andl $7, %edx
    -	rep movsq
    -	movl %edx, %ecx
    -	rep movsb
    -	ret
    -ENDPROC(memcpy)
    -ENDPROC(__memcpy)
    -EXPORT_SYMBOL(memcpy)
    -EXPORT_SYMBOL(__memcpy)
    -
    -/*
    - * memcpy_erms() - enhanced fast string memcpy. This is faster and
    - * simpler than memcpy. Use memcpy_erms when possible.
    - */
    -ENTRY(memcpy_erms)
    -	movq %rdi, %rax
    -	movq %rdx, %rcx
    -	rep movsb
    -	ret
    -ENDPROC(memcpy_erms)
    -
    -ENTRY(memcpy_orig)
    -	movq %rdi, %rax
    -
    -	cmpq $0x20, %rdx
    -	jb .Lhandle_tail
    -
    -	/*
    -	 * We check whether memory false dependence could occur,
    -	 * then jump to corresponding copy mode.
    -	 */
    -	cmp  %dil, %sil
    -	jl .Lcopy_backward
    -	subq $0x20, %rdx
    -.Lcopy_forward_loop:
    -	subq $0x20,	%rdx
    -
    -	/*
    -	 * Move in blocks of 4x8 bytes:
    -	 */
    -	movq 0*8(%rsi),	%r8
    -	movq 1*8(%rsi),	%r9
    -	movq 2*8(%rsi),	%r10
    -	movq 3*8(%rsi),	%r11
    -	leaq 4*8(%rsi),	%rsi
    -
    -	movq %r8,	0*8(%rdi)
    -	movq %r9,	1*8(%rdi)
    -	movq %r10,	2*8(%rdi)
    -	movq %r11,	3*8(%rdi)
    -	leaq 4*8(%rdi),	%rdi
    -	jae  .Lcopy_forward_loop
    -	addl $0x20,	%edx
    -	jmp  .Lhandle_tail
    -
    -.Lcopy_backward:
    -	/*
    -	 * Calculate copy position to tail.
    -	 */
    -	addq %rdx,	%rsi
    -	addq %rdx,	%rdi
    -	subq $0x20,	%rdx
    -	/*
    -	 * At most 3 ALU operations in one cycle,
    -	 * so append NOPS in the same 16 bytes trunk.
    -	 */
    -	.p2align 4
    -.Lcopy_backward_loop:
    -	subq $0x20,	%rdx
    -	movq -1*8(%rsi),	%r8
    -	movq -2*8(%rsi),	%r9
    -	movq -3*8(%rsi),	%r10
    -	movq -4*8(%rsi),	%r11
    -	leaq -4*8(%rsi),	%rsi
    -	movq %r8,		-1*8(%rdi)
    -	movq %r9,		-2*8(%rdi)
    -	movq %r10,		-3*8(%rdi)
    -	movq %r11,		-4*8(%rdi)
    -	leaq -4*8(%rdi),	%rdi
    -	jae  .Lcopy_backward_loop
    -
    -	/*
    -	 * Calculate copy position to head.
    -	 */
    -	addl $0x20,	%edx
    -	subq %rdx,	%rsi
    -	subq %rdx,	%rdi
    -.Lhandle_tail:
    -	cmpl $16,	%edx
    -	jb   .Lless_16bytes
    -
    -	/*
    -	 * Move data from 16 bytes to 31 bytes.
    -	 */
    -	movq 0*8(%rsi), %r8
    -	movq 1*8(%rsi),	%r9
    -	movq -2*8(%rsi, %rdx),	%r10
    -	movq -1*8(%rsi, %rdx),	%r11
    -	movq %r8,	0*8(%rdi)
    -	movq %r9,	1*8(%rdi)
    -	movq %r10,	-2*8(%rdi, %rdx)
    -	movq %r11,	-1*8(%rdi, %rdx)
    -	retq
    -	.p2align 4
    -.Lless_16bytes:
    -	cmpl $8,	%edx
    -	jb   .Lless_8bytes
    -	/*
    -	 * Move data from 8 bytes to 15 bytes.
    -	 */
    -	movq 0*8(%rsi),	%r8
    -	movq -1*8(%rsi, %rdx),	%r9
    -	movq %r8,	0*8(%rdi)
    -	movq %r9,	-1*8(%rdi, %rdx)
    -	retq
    -	.p2align 4
    -.Lless_8bytes:
    -	cmpl $4,	%edx
    -	jb   .Lless_3bytes
    -
    -	/*
    -	 * Move data from 4 bytes to 7 bytes.
    -	 */
    -	movl (%rsi), %ecx
    -	movl -4(%rsi, %rdx), %r8d
    -	movl %ecx, (%rdi)
    -	movl %r8d, -4(%rdi, %rdx)
    -	retq
    -	.p2align 4
    -.Lless_3bytes:
    -	subl $1, %edx
    -	jb .Lend
    -	/*
    -	 * Move data from 1 bytes to 3 bytes.
    -	 */
    -	movzbl (%rsi), %ecx
    -	jz .Lstore_1byte
    -	movzbq 1(%rsi), %r8
    -	movzbq (%rsi, %rdx), %r9
    -	movb %r8b, 1(%rdi)
    -	movb %r9b, (%rdi, %rdx)
    -.Lstore_1byte:
    -	movb %cl, (%rdi)
    -
    -.Lend:
    -	retq
    -ENDPROC(memcpy_orig)
    -
     #ifndef CONFIG_UML
     
     MCSAFE_TEST_CTL
    diff --git a/arch/x86/lib/memcpy_plain.c b/arch/x86/lib/memcpy_plain.c
    new file mode 100644
    index 000000000000..24b247d9c26e
    --- /dev/null
    +++ b/arch/x86/lib/memcpy_plain.c
    @@ -0,0 +1,13 @@
    +#include <linux/types.h>
    +
    +void *memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size) {
    +	void *old_to = to;
    +	while(size--) {
    +		*((char *) to++) = *((char *) from++);
    +	}
    +	return old_to;
    +}
    +
    +void *__memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size) {
    +	return memcpy(to, from, size);
    +}
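
    The per-file CFLAGS in arch/x86/lib/Makefile (-O3 -minline-all-stringops
    -mstringop-strategy=rep_byte) are what keep this byte loop fast: at -O3 GCC
    is expected to recognize the loop as a block copy and, with those flags,
    expand it inline as REP MOVSB rather than emitting a call. Note the
    increments on the void * arguments rely on a GCC extension. A standalone
    harness for sanity-checking the semantics (hypothetical test code, not part
    of the patch):

        #include <assert.h>
        #include <stddef.h>
        #include <string.h>

        /* Same byte loop as above, minus the kernel headers. */
        static void *plain_memcpy(void *restrict to, const void *restrict from,
                                  size_t size)
        {
            char *t = to;
            const char *f = from;

            while (size--)
                *t++ = *f++;
            return to;
        }

        int main(void)
        {
            char src[] = "combination of native and c-memcpy";
            char dst[sizeof(src)];

            assert(plain_memcpy(dst, src, sizeof(src)) == dst);
            assert(memcmp(dst, src, sizeof(src)) == 0);
            return 0;
        }
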
    diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
    index 9bc861c71e75..7786d1a65423 100644
    --- a/arch/x86/lib/memset_64.S
    +++ b/arch/x86/lib/memset_64.S
    @@ -8,6 +8,20 @@
     
     .weak memset
     
    +#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
    +ENTRY(memset)
    +ENTRY(__memset)
    +	mov	%esi, %eax
    +	mov	%rdi, %rsi
    +	mov	%rdx, %rcx
    +	rep stosb
    +	mov	%rsi, %rax
    +	ret
    +ENDPROC(memset)
    +ENDPROC(__memset)
    +EXPORT_SYMBOL(memset)
    +EXPORT_SYMBOL(__memset)
    +#else
     /*
      * ISO C memset - set a memory block to a byte value. This function uses fast
      * string to get better performance than the original function. The code is
    @@ -140,3 +154,4 @@ ENTRY(memset_orig)
     	jmp .Lafter_bad_alignment
     .Lfinal:
     ENDPROC(memset_orig)
    +#endif
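
    The REP STOSB variant shuffles registers because the instruction consumes
    RDI and RCX: the fill byte moves from ESI into EAX, RDI is parked in RSI so
    the original destination survives as the return value, and the count moves
    from RDX into RCX. The same operation in C with inline assembly (a sketch,
    not part of the patch):

        #include <stddef.h>

        /* C equivalent of the REP STOSB memset above: AL holds the fill
         * byte; the insn advances RDI and decrements RCX in place. */
        static inline void *rep_stosb_memset(void *dst, int c, size_t n)
        {
            void *ret = dst;

            asm volatile("rep stosb"
                         : "+D" (dst), "+c" (n)
                         : "a" (c)
                         : "memory");
            return ret;
        }
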
    diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
    index ee42bb0cbeb3..d89d6ef93dd4 100644
    --- a/arch/x86/lib/usercopy_64.c
    +++ b/arch/x86/lib/usercopy_64.c
    @@ -15,11 +15,23 @@
     
     unsigned long __clear_user(void __user *addr, unsigned long size)
     {
    -	long __d0;
     	might_fault();
     	/* no memory constraint because it doesn't change any memory gcc knows
     	   about */
     	stac();
    +
    +#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
    +	asm volatile (
    +		"0:     rep stosb\n"
    +		"1:\n"
    +		_ASM_EXTABLE(0b,1b)
    +		: "+D" (addr), "+c" (size)
    +		: "a" (0)
    +		: "memory"
    +	);
    +#else
    +	{
    +	long __d0;
     	asm volatile(
     		"	testq  %[size8],%[size8]\n"
     		"	jz     4f\n"
    @@ -41,6 +53,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
     		_ASM_EXTABLE_UA(1b, 2b)
     		: [size8] "=&c"(size), [dst] "=&D" (__d0)
     		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
    +	}
    +#endif
     	clac();
     	return size;
     }
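
    The single exception-table entry is what makes the fault semantics work
    out: if REP STOSB faults on an unmapped user page, execution resumes at
    label 1 with RCX holding the number of bytes not yet cleared, which is
    exactly the value __clear_user() is specified to return.
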
    diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
    index 1f8825bbaffb..2737f3e8c021 100644
    --- a/arch/x86/platform/pvh/head.S
    +++ b/arch/x86/platform/pvh/head.S
    @@ -64,9 +64,13 @@ ENTRY(pvh_start_xen)
     	mov $_pa(pvh_start_info), %edi
     	mov %ebx, %esi
     	mov _pa(pvh_start_info_sz), %ecx
    +#ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
    +	rep movsb
    +#else
     	shr $2,%ecx
     	rep
     	movsl
    +#endif
     
     	mov $_pa(early_stack_end), %esp
     
    diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
    index 7b926dfa6b97..c42e0d3dcab3 100644
    --- a/drivers/net/wireless/mediatek/mt76/mac80211.c
    +++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
    @@ -124,7 +124,7 @@ static void mt76_init_stream_cap(struct mt76_dev *dev,
     				 bool vht)
     {
     	struct ieee80211_sta_ht_cap *ht_cap = &sband->ht_cap;
    -	int i, nstream = __sw_hweight8(dev->antenna_mask);
    +	int i, nstream = hweight8(dev->antenna_mask);
     	struct ieee80211_sta_vht_cap *vht_cap;
     	u16 mcs_map = 0;
     
    diff --git a/include/linux/bitops.h b/include/linux/bitops.h
    index 705f7c442691..6f6be5c418f5 100644
    --- a/include/linux/bitops.h
    +++ b/include/linux/bitops.h
    @@ -7,10 +7,12 @@
     #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
     #define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
     
    +#ifndef CONFIG_MARCH_NATIVE_POPCNT
     extern unsigned int __sw_hweight8(unsigned int w);
     extern unsigned int __sw_hweight16(unsigned int w);
     extern unsigned int __sw_hweight32(unsigned int w);
     extern unsigned long __sw_hweight64(__u64 w);
    +#endif
     
     /*
      * Include this here because some architectures need generic_ffs/fls in
    diff --git a/lib/Makefile b/lib/Makefile
    index e1b59da71418..a48c5fece180 100644
    --- a/lib/Makefile
    +++ b/lib/Makefile
    @@ -93,7 +93,9 @@ obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
     
     obj-y += logic_pio.o
     
    +ifneq ($(CONFIG_MARCH_NATIVE_POPCNT),y)
     obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
    +endif
     
     obj-$(CONFIG_BTREE) += btree.o
     obj-$(CONFIG_INTERVAL_TREE) += interval_tree.o
    diff --git a/scripts/kconfig/.gitignore b/scripts/kconfig/.gitignore
    index b5bf92f66d11..411a885ad9b1 100644
    --- a/scripts/kconfig/.gitignore
    +++ b/scripts/kconfig/.gitignore
    @@ -8,6 +8,7 @@
     # configuration programs
     #
     conf
    +cpuid
     mconf
     nconf
     qconf
    diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile
    index 181973509a05..83fecd21e42b 100644
    --- a/scripts/kconfig/Makefile
    +++ b/scripts/kconfig/Makefile
    @@ -65,8 +65,9 @@ simple-targets := oldconfig allnoconfig allyesconfig allmodconfig \
     	alldefconfig randconfig listnewconfig olddefconfig syncconfig
     PHONY += $(simple-targets)
     
    -$(simple-targets): $(obj)/conf
    +$(simple-targets): $(obj)/conf $(obj)/cpuid
     	$< $(silent) --$@ $(Kconfig)
    +	$(Q)$(srctree)/scripts/march-native.sh "$(CC)" $(obj)/cpuid
     
     PHONY += savedefconfig defconfig
     
    @@ -149,6 +150,10 @@ $(obj)/zconf.lex.o: $(obj)/zconf.tab.h
     HOSTCFLAGS_zconf.lex.o	:= -I$(src)
     HOSTCFLAGS_zconf.tab.o	:= -I$(src)
     
    +# cpuid: -march=native, CONFIG_MARCH_NATIVE_* detection
    +hostprogs-y	+= cpuid
    +cpuid-objs	:= cpuid.o
    +
     # conf: Used for defconfig, oldconfig and related targets
     hostprogs-y	+= conf
     conf-objs	:= conf.o $(common-objs)
    diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
    new file mode 100644
    index 000000000000..613c3e738f12
    --- /dev/null
    +++ b/scripts/kconfig/cpuid.c
    @@ -0,0 +1,108 @@
    +/*
    + * Copyright (c) 2017 Alexey Dobriyan <adobriyan@gmail.com>
    + *
    + * Permission to use, copy, modify, and distribute this software for any
    + * purpose with or without fee is hereby granted, provided that the above
    + * copyright notice and this permission notice appear in all copies.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
    + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
    + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
    + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
    + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
    + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
    + */
    +#ifdef __x86_64__
    +#include <stdbool.h>
    +#include <stdint.h>
    +#include <stdio.h>
    +#include <stdlib.h>
    +#include <string.h>
    +
    +static inline bool streq(const char *s1, const char *s2)
    +{
    +	return strcmp(s1, s2) == 0;
    +}
    +
    +static inline void cpuid(uint32_t eax0, uint32_t *eax, uint32_t *ecx, uint32_t *edx, uint32_t *ebx)
    +{
    +	asm volatile (
    +		"cpuid"
    +		: "=a" (*eax), "=c" (*ecx), "=d" (*edx), "=b" (*ebx)
    +		: "0" (eax0)
    +	);
    +}
    +
    +static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t *ecx, uint32_t *edx, uint32_t *ebx)
    +{
    +	asm volatile (
    +		"cpuid"
    +		: "=a" (*eax), "=c" (*ecx), "=d" (*edx), "=b" (*ebx)
    +		: "0" (eax0), "1" (ecx0)
    +	);
    +}
    +
    +static bool movbe	= false;
    +static bool popcnt	= false;
    +static bool rep_movsb	= false;
    +static bool rep_stosb	= false;
    +
    +static uint32_t eax0_max;
    +
    +static void intel(void)
    +{
    +	uint32_t eax, ecx, edx, ebx;
    +
    +	if (eax0_max >= 1) {
    +		cpuid(1, &eax, &ecx, &edx, &ebx);
    +//		printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
    +
    +		if (ecx & (1 << 22))
    +			movbe = true;
    +		if (ecx & (1 << 23))
    +			popcnt = true;
    +	}
    +	if (eax0_max >= 7) {
    +		cpuid2(7, 0, &eax, &ecx, &edx, &ebx);
    +//		printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
    +
    +		if (ebx & (1 << 9)) {
    +			rep_movsb = true;
    +			rep_stosb = true;
    +		}
    +	}
    +}
    +
    +int main(int argc, char *argv[])
    +{
    +	const char *opt = argv[1];
    +	uint32_t eax, ecx, edx, ebx;
    +
    +	if (argc != 2)
    +		return EXIT_FAILURE;
    +
    +	cpuid(0, &eax, &ecx, &edx, &ebx);
    +//	printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
    +	eax0_max = eax;
    +
    +	if (ecx == 0x6c65746e && edx == 0x49656e69 && ebx == 0x756e6547) {
    +		intel();
    +	}
    +
    +#define _(x)	if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
    +	_(movbe);
    +	_(popcnt);
    +	_(rep_movsb);
    +	_(rep_stosb);
    +#undef _
    +
    +	return EXIT_FAILURE;
    +}
    +#else
    +#include <stdlib.h>
    +int main(void)
    +{
    +	return EXIT_FAILURE;
    +}
    +#endif
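
    The helper exits 0 when the named feature is present and non-zero otherwise
    (always non-zero on non-x86-64 hosts), which is the shape march-native.sh
    below relies on for its '"$CPUID" movbe && option ...' lines. Built
    standalone with something like 'gcc -O2 -o cpuid cpuid.c', a quick
    './cpuid rep_movsb && echo erms' gives the same answer interactively.
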
    diff --git a/scripts/march-native.sh b/scripts/march-native.sh
    new file mode 100755
    index 000000000000..38710a4f0616
    --- /dev/null
    +++ b/scripts/march-native.sh
    @@ -0,0 +1,74 @@
    +#!/bin/sh
    +# Copyright (c) 2017-2019 Alexey Dobriyan <adobriyan@gmail.com>
    +if test "$(uname -m)" != "x86_64"; then
    +	exit 0
    +fi
    +
    +CC="$1"
    +CPUID="$2"
    +CONFIG=".config"
    +AUTOCONF1="include/config/auto.conf"
    +AUTOCONF2="include/generated/autoconf.h"
    +
    +if ! grep -q -e '^CONFIG_MARCH_NATIVE=y$' "$CONFIG"; then
    +	sed -i -e '/^CONFIG_MARCH_NATIVE/d' "$AUTOCONF1" "$AUTOCONF2" >/dev/null 2>&1
    +	exit 0
    +fi
    +
    +if ! $CC -march=native -x c -c -o /dev/null /dev/null >/dev/null 2>&1; then
    +	echo >&2 "error: unsupported '-march=native' compiler option"
    +	exit 1
    +fi
    +
    +_option() {
    +	echo "$1=$2"		>>"$CONFIG"
    +	echo "$1=$2"		>>"$AUTOCONF1"
    +	echo "#define $1 $2"	>>"$AUTOCONF2"
    +}
    +
    +option() {
    +	echo "$1=y"		>>"$CONFIG"
    +	echo "$1=y"		>>"$AUTOCONF1"
    +	echo "#define $1 1"	>>"$AUTOCONF2"
    +}
    +
    +if test ! -f "$CONFIG" -o ! -f "$AUTOCONF1" -o ! -f "$AUTOCONF2"; then
    +	exit 0
    +fi
    +
    +COLLECT_GCC_OPTIONS=$(
    +	$CC -march=native -v -E -x c -c /dev/null 2>&1	|\
    +	sed -ne '/^COLLECT_GCC_OPTIONS=/{n;p}'			|\
    +	awk '{$1=$1};1'
    +)
    +echo "-march=native: $COLLECT_GCC_OPTIONS"
    +_option "CONFIG_MARCH_NATIVE_CC_OPTIONS" "\"$COLLECT_GCC_OPTIONS\""
    +
    +"$CPUID" movbe		&& option "CONFIG_MARCH_NATIVE_MOVBE"
    +"$CPUID" popcnt		&& option "CONFIG_MARCH_NATIVE_POPCNT"
    +"$CPUID" rep_movsb	&& option "CONFIG_MARCH_NATIVE_REP_MOVSB"
    +"$CPUID" rep_stosb	&& option "CONFIG_MARCH_NATIVE_REP_STOSB"
    +
    +for i in $COLLECT_GCC_OPTIONS; do
    +	case $i in
    +		*/cc1|-E|-quiet|-v|/dev/null|--param|-fstack-protector*)
    +			;;
    +
    +		l1-cache-line-size=64)
    +			_option "CONFIG_X86_L1_CACHE_SHIFT"		6
    +			_option "CONFIG_X86_INTERNODE_CACHE_SHIFT"	6
    +			;;
    +
    +		l1-cache-size=*);;
    +		l2-cache-size=*);;
    +
    +		-march=*);;
    +		-mtune=*);;
    +
    +		-m*);;
    +		-mno-*);;
    +
    +		*)
    +			echo >&2 "warning: unexpected -march=native option '$i'"
    +	esac
    +done
    diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S
    index 3b24dc05251c..df43623f2a6c 100644
    --- a/tools/arch/x86/lib/memcpy_64.S
    +++ b/tools/arch/x86/lib/memcpy_64.S
    @@ -14,7 +14,7 @@
      * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
      */
     
    -.weak memcpy
    +.weak memcpy_movsq
     
     /*
      * memcpy - Copy a memory block.
    @@ -27,11 +27,7 @@
      * Output:
      * rax original destination
      */
    -ENTRY(__memcpy)
    -ENTRY(memcpy)
    -	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
    -		      "jmp memcpy_erms", X86_FEATURE_ERMS
    -
    +ENTRY(memcpy_movsq)
     	movq %rdi, %rax
     	movq %rdx, %rcx
     	shrq $3, %rcx
    @@ -40,10 +36,8 @@ ENTRY(memcpy)
     	movl %edx, %ecx
     	rep movsb
     	ret
    -ENDPROC(memcpy)
    -ENDPROC(__memcpy)
    -EXPORT_SYMBOL(memcpy)
    -EXPORT_SYMBOL(__memcpy)
    +ENDPROC(memcpy_movsq)
    +EXPORT_SYMBOL(memcpy_movsq)
     
     /*
      * memcpy_erms() - enhanced fast string memcpy. This is faster and
    diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
    index e4e321b6f883..8cbb4787b701 100644
    --- a/tools/perf/bench/Build
    +++ b/tools/perf/bench/Build
    @@ -13,5 +13,11 @@ perf-y += epoll-ctl.o
     perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o
     perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
     perf-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o
    +perf-$(CONFIG_X86_64) += memcpy_plain.o
    +perf-$(CONFIG_X86_64) += memcpy_plain_native.o
    +
    +CFLAGS_memcpy_plain.o = -falign-functions=32 -mno-avx -mno-avx2 -mno-sse -ffreestanding
    +CFLAGS_memcpy_plain_native.o = -falign-functions=32 -O3 -march=corei7 -minline-all-stringops -mstringop-strategy=rep_byte -mno-avx -mno-avx2 -mno-sse
    +CFLAGS_REMOVE_memcpy_plain_native.o = -O2
     
     perf-$(CONFIG_NUMA) += numa.o
    diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
    index 50ae8bd58296..0f39b99c95d4 100644
    --- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
    +++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
    @@ -4,10 +4,18 @@ MEMCPY_FN(memcpy_orig,
     	"x86-64-unrolled",
     	"unrolled memcpy() in arch/x86/lib/memcpy_64.S")
     
    -MEMCPY_FN(__memcpy,
    +MEMCPY_FN(memcpy_movsq,
     	"x86-64-movsq",
     	"movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
     
     MEMCPY_FN(memcpy_erms,
     	"x86-64-movsb",
     	"movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
    +
    +MEMCPY_FN(__memcpy,
    +	"x86-64-c-memcpy",
    +	"C-based memcpy() in arch/x86/lib/memcpy_plain.c")
    +
    +MEMCPY_FN(memcpy_native,
    +	"x86-64-c-memcpy-native",
    +	"C-based memcpy(), natively optimized in arch/x86/lib/memcpy_plain.c")
    diff --git a/tools/perf/bench/memcpy_plain.c b/tools/perf/bench/memcpy_plain.c
    new file mode 100644
    index 000000000000..282551f35f24
    --- /dev/null
    +++ b/tools/perf/bench/memcpy_plain.c
    @@ -0,0 +1,18 @@
    +#include <linux/types.h>
    +
    +void *memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size);
    +void *__memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size);
    +
    +void *memcpy(void * __restrict__ to, const void * __restrict__ from, size_t size) {
    +	char *t = to;
    +	const char *f = from;
    +	while(size--) {
    +		*(t++) = *(f++);
    +	}
    +	return to;
    +
    +}
    +
    +void *__memcpy(void *to, const void *from, size_t size) {
    +	return memcpy(to, from, size);
    +}
    diff --git a/tools/perf/bench/memcpy_plain_native.c b/tools/perf/bench/memcpy_plain_native.c
    new file mode 100644
    index 000000000000..cab84e0404a3
    --- /dev/null
    +++ b/tools/perf/bench/memcpy_plain_native.c
    @@ -0,0 +1,11 @@
    +#include <linux/types.h>
    +
    +void *memcpy_native(void * __restrict__ to, const void * __restrict__ from, size_t size);
    +
    +void *memcpy_native(void * __restrict__ to, const void * __restrict__ from, size_t size) {
    +	void *old_to = to;
    +	while(size--) {
    +		*((char *) to++) = *((char *) from++);
    +	}
    +	return old_to;
    +}
    -- 
    2.22.0