416 lines
12 KiB
Diff
416 lines
12 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: Matteo Croce <technoboy85@gmail.com>
|
|
Date: Wed, 29 Sep 2021 19:22:33 +0200
|
|
Subject: riscv: optimized memmove
|
|
|
|
When the destination buffer is before the source one, or when the
|
|
buffers don't overlap, it's safe to use memcpy() instead, which is
|
|
optimized to use the largest data size possible.
|
|
|
|
Signed-off-by: Matteo Croce <mcroce@microsoft.com>
|
|
---
|
|
arch/riscv/include/asm/string.h | 6 +-
|
|
arch/riscv/kernel/riscv_ksyms.c | 2 -
|
|
arch/riscv/lib/Makefile | 1 -
|
|
arch/riscv/lib/memmove.S | 316 ----------
|
|
arch/riscv/lib/string.c | 23 +
|
|
5 files changed, 26 insertions(+), 322 deletions(-)
|
|
|
|
diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
|
|
index 23fdcbffb6a1..02120466d5a1 100644
|
|
--- a/arch/riscv/include/asm/string.h
|
|
+++ b/arch/riscv/include/asm/string.h
|
|
@@ -16,10 +16,10 @@ extern asmlinkage void *__memset(void *, int, size_t);
|
|
#define __HAVE_ARCH_MEMCPY
|
|
extern void *memcpy(void *dest, const void *src, size_t count);
|
|
extern void *__memcpy(void *dest, const void *src, size_t count);
|
|
-
|
|
#define __HAVE_ARCH_MEMMOVE
|
|
-extern asmlinkage void *memmove(void *, const void *, size_t);
|
|
-extern asmlinkage void *__memmove(void *, const void *, size_t);
|
|
+extern void *memmove(void *dest, const void *src, size_t count);
|
|
+extern void *__memmove(void *dest, const void *src, size_t count);
|
|
+
|
|
/* For those files which don't want to check by kasan. */
|
|
#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
|
|
#define memcpy(dst, src, len) __memcpy(dst, src, len)
|
|
diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c
|
|
index 3f6d512a5b97..361565c4db7e 100644
|
|
--- a/arch/riscv/kernel/riscv_ksyms.c
|
|
+++ b/arch/riscv/kernel/riscv_ksyms.c
|
|
@@ -10,6 +10,4 @@
|
|
* Assembly functions that may be used (directly or indirectly) by modules
|
|
*/
|
|
EXPORT_SYMBOL(memset);
|
|
-EXPORT_SYMBOL(memmove);
|
|
EXPORT_SYMBOL(__memset);
|
|
-EXPORT_SYMBOL(__memmove);
|
|
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
|
|
index 023d78f20960..9166b61cc2dc 100644
|
|
--- a/arch/riscv/lib/Makefile
|
|
+++ b/arch/riscv/lib/Makefile
|
|
@@ -1,7 +1,6 @@
|
|
# SPDX-License-Identifier: GPL-2.0-only
|
|
lib-y += delay.o
|
|
lib-y += memset.o
|
|
-lib-y += memmove.o
|
|
lib-$(CONFIG_MMU) += uaccess.o
|
|
lib-$(CONFIG_64BIT) += tishift.o
|
|
lib-y += string.o
|
|
diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S
|
|
deleted file mode 100644
|
|
index e0609e1f0864..000000000000
|
|
--- a/arch/riscv/lib/memmove.S
|
|
+++ /dev/null
|
|
@@ -1,316 +0,0 @@
|
|
-/* SPDX-License-Identifier: GPL-2.0-only */
|
|
-/*
|
|
- * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
|
|
- */
|
|
-
|
|
-#include <linux/linkage.h>
|
|
-#include <asm/asm.h>
|
|
-
|
|
-SYM_FUNC_START(__memmove)
|
|
-SYM_FUNC_START_WEAK(memmove)
|
|
- /*
|
|
- * Returns
|
|
- * a0 - dest
|
|
- *
|
|
- * Parameters
|
|
- * a0 - Inclusive first byte of dest
|
|
- * a1 - Inclusive first byte of src
|
|
- * a2 - Length of copy n
|
|
- *
|
|
- * Because the return matches the parameter register a0,
|
|
- * we will not clobber or modify that register.
|
|
- *
|
|
- * Note: This currently only works on little-endian.
|
|
- * To port to big-endian, reverse the direction of shifts
|
|
- * in the 2 misaligned fixup copy loops.
|
|
- */
|
|
-
|
|
- /* Return if nothing to do */
|
|
- beq a0, a1, return_from_memmove
|
|
- beqz a2, return_from_memmove
|
|
-
|
|
- /*
|
|
- * Register Uses
|
|
- * Forward Copy: a1 - Index counter of src
|
|
- * Reverse Copy: a4 - Index counter of src
|
|
- * Forward Copy: t3 - Index counter of dest
|
|
- * Reverse Copy: t4 - Index counter of dest
|
|
- * Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
|
|
- * Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
|
|
- * Both Copy Modes: t0 - Link / Temporary for load-store
|
|
- * Both Copy Modes: t1 - Temporary for load-store
|
|
- * Both Copy Modes: t2 - Temporary for load-store
|
|
- * Both Copy Modes: a5 - dest to src alignment offset
|
|
- * Both Copy Modes: a6 - Shift ammount
|
|
- * Both Copy Modes: a7 - Inverse Shift ammount
|
|
- * Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
|
|
- */
|
|
-
|
|
- /*
|
|
- * Solve for some register values now.
|
|
- * Byte copy does not need t5 or t6.
|
|
- */
|
|
- mv t3, a0
|
|
- add t4, a0, a2
|
|
- add a4, a1, a2
|
|
-
|
|
- /*
|
|
- * Byte copy if copying less than (2 * SZREG) bytes. This can
|
|
- * cause problems with the bulk copy implementation and is
|
|
- * small enough not to bother.
|
|
- */
|
|
- andi t0, a2, -(2 * SZREG)
|
|
- beqz t0, byte_copy
|
|
-
|
|
- /*
|
|
- * Now solve for t5 and t6.
|
|
- */
|
|
- andi t5, t3, -SZREG
|
|
- andi t6, t4, -SZREG
|
|
- /*
|
|
- * If dest(Register t3) rounded down to the nearest naturally
|
|
- * aligned SZREG address, does not equal dest, then add SZREG
|
|
- * to find the low-bound of SZREG alignment in the dest memory
|
|
- * region. Note that this could overshoot the dest memory
|
|
- * region if n is less than SZREG. This is one reason why
|
|
- * we always byte copy if n is less than SZREG.
|
|
- * Otherwise, dest is already naturally aligned to SZREG.
|
|
- */
|
|
- beq t5, t3, 1f
|
|
- addi t5, t5, SZREG
|
|
- 1:
|
|
-
|
|
- /*
|
|
- * If the dest and src are co-aligned to SZREG, then there is
|
|
- * no need for the full rigmarole of a full misaligned fixup copy.
|
|
- * Instead, do a simpler co-aligned copy.
|
|
- */
|
|
- xor t0, a0, a1
|
|
- andi t1, t0, (SZREG - 1)
|
|
- beqz t1, coaligned_copy
|
|
- /* Fall through to misaligned fixup copy */
|
|
-
|
|
-misaligned_fixup_copy:
|
|
- bltu a1, a0, misaligned_fixup_copy_reverse
|
|
-
|
|
-misaligned_fixup_copy_forward:
|
|
- jal t0, byte_copy_until_aligned_forward
|
|
-
|
|
- andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
|
|
- slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
|
|
- sub a5, a1, t3 /* Find the difference between src and dest */
|
|
- andi a1, a1, -SZREG /* Align the src pointer */
|
|
- addi a2, t6, SZREG /* The other breakpoint for the unrolled loop*/
|
|
-
|
|
- /*
|
|
- * Compute The Inverse Shift
|
|
- * a7 = XLEN - a6 = XLEN + -a6
|
|
- * 2s complement negation to find the negative: -a6 = ~a6 + 1
|
|
- * Add that to XLEN. XLEN = SZREG * 8.
|
|
- */
|
|
- not a7, a6
|
|
- addi a7, a7, (SZREG * 8 + 1)
|
|
-
|
|
- /*
|
|
- * Fix Misalignment Copy Loop - Forward
|
|
- * load_val0 = load_ptr[0];
|
|
- * do {
|
|
- * load_val1 = load_ptr[1];
|
|
- * store_ptr += 2;
|
|
- * store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
|
|
- *
|
|
- * if (store_ptr == {a2})
|
|
- * break;
|
|
- *
|
|
- * load_val0 = load_ptr[2];
|
|
- * load_ptr += 2;
|
|
- * store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
|
|
- *
|
|
- * } while (store_ptr != store_ptr_end);
|
|
- * store_ptr = store_ptr_end;
|
|
- */
|
|
-
|
|
- REG_L t0, (0 * SZREG)(a1)
|
|
- 1:
|
|
- REG_L t1, (1 * SZREG)(a1)
|
|
- addi t3, t3, (2 * SZREG)
|
|
- srl t0, t0, a6
|
|
- sll t2, t1, a7
|
|
- or t2, t0, t2
|
|
- REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)
|
|
-
|
|
- beq t3, a2, 2f
|
|
-
|
|
- REG_L t0, (2 * SZREG)(a1)
|
|
- addi a1, a1, (2 * SZREG)
|
|
- srl t1, t1, a6
|
|
- sll t2, t0, a7
|
|
- or t2, t1, t2
|
|
- REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)
|
|
-
|
|
- bne t3, t6, 1b
|
|
- 2:
|
|
- mv t3, t6 /* Fix the dest pointer in case the loop was broken */
|
|
-
|
|
- add a1, t3, a5 /* Restore the src pointer */
|
|
- j byte_copy_forward /* Copy any remaining bytes */
|
|
-
|
|
-misaligned_fixup_copy_reverse:
|
|
- jal t0, byte_copy_until_aligned_reverse
|
|
-
|
|
- andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
|
|
- slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
|
|
- sub a5, a4, t4 /* Find the difference between src and dest */
|
|
- andi a4, a4, -SZREG /* Align the src pointer */
|
|
- addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop*/
|
|
-
|
|
- /*
|
|
- * Compute The Inverse Shift
|
|
- * a7 = XLEN - a6 = XLEN + -a6
|
|
- * 2s complement negation to find the negative: -a6 = ~a6 + 1
|
|
- * Add that to XLEN. XLEN = SZREG * 8.
|
|
- */
|
|
- not a7, a6
|
|
- addi a7, a7, (SZREG * 8 + 1)
|
|
-
|
|
- /*
|
|
- * Fix Misalignment Copy Loop - Reverse
|
|
- * load_val1 = load_ptr[0];
|
|
- * do {
|
|
- * load_val0 = load_ptr[-1];
|
|
- * store_ptr -= 2;
|
|
- * store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
|
|
- *
|
|
- * if (store_ptr == {a2})
|
|
- * break;
|
|
- *
|
|
- * load_val1 = load_ptr[-2];
|
|
- * load_ptr -= 2;
|
|
- * store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
|
|
- *
|
|
- * } while (store_ptr != store_ptr_end);
|
|
- * store_ptr = store_ptr_end;
|
|
- */
|
|
-
|
|
- REG_L t1, ( 0 * SZREG)(a4)
|
|
- 1:
|
|
- REG_L t0, (-1 * SZREG)(a4)
|
|
- addi t4, t4, (-2 * SZREG)
|
|
- sll t1, t1, a7
|
|
- srl t2, t0, a6
|
|
- or t2, t1, t2
|
|
- REG_S t2, ( 1 * SZREG)(t4)
|
|
-
|
|
- beq t4, a2, 2f
|
|
-
|
|
- REG_L t1, (-2 * SZREG)(a4)
|
|
- addi a4, a4, (-2 * SZREG)
|
|
- sll t0, t0, a7
|
|
- srl t2, t1, a6
|
|
- or t2, t0, t2
|
|
- REG_S t2, ( 0 * SZREG)(t4)
|
|
-
|
|
- bne t4, t5, 1b
|
|
- 2:
|
|
- mv t4, t5 /* Fix the dest pointer in case the loop was broken */
|
|
-
|
|
- add a4, t4, a5 /* Restore the src pointer */
|
|
- j byte_copy_reverse /* Copy any remaining bytes */
|
|
-
|
|
-/*
|
|
- * Simple copy loops for SZREG co-aligned memory locations.
|
|
- * These also make calls to do byte copies for any unaligned
|
|
- * data at their terminations.
|
|
- */
|
|
-coaligned_copy:
|
|
- bltu a1, a0, coaligned_copy_reverse
|
|
-
|
|
-coaligned_copy_forward:
|
|
- jal t0, byte_copy_until_aligned_forward
|
|
-
|
|
- 1:
|
|
- REG_L t1, ( 0 * SZREG)(a1)
|
|
- addi a1, a1, SZREG
|
|
- addi t3, t3, SZREG
|
|
- REG_S t1, (-1 * SZREG)(t3)
|
|
- bne t3, t6, 1b
|
|
-
|
|
- j byte_copy_forward /* Copy any remaining bytes */
|
|
-
|
|
-coaligned_copy_reverse:
|
|
- jal t0, byte_copy_until_aligned_reverse
|
|
-
|
|
- 1:
|
|
- REG_L t1, (-1 * SZREG)(a4)
|
|
- addi a4, a4, -SZREG
|
|
- addi t4, t4, -SZREG
|
|
- REG_S t1, ( 0 * SZREG)(t4)
|
|
- bne t4, t5, 1b
|
|
-
|
|
- j byte_copy_reverse /* Copy any remaining bytes */
|
|
-
|
|
-/*
|
|
- * These are basically sub-functions within the function. They
|
|
- * are used to byte copy until the dest pointer is in alignment.
|
|
- * At which point, a bulk copy method can be used by the
|
|
- * calling code. These work on the same registers as the bulk
|
|
- * copy loops. Therefore, the register values can be picked
|
|
- * up from where they were left and we avoid code duplication
|
|
- * without any overhead except the call in and return jumps.
|
|
- */
|
|
-byte_copy_until_aligned_forward:
|
|
- beq t3, t5, 2f
|
|
- 1:
|
|
- lb t1, 0(a1)
|
|
- addi a1, a1, 1
|
|
- addi t3, t3, 1
|
|
- sb t1, -1(t3)
|
|
- bne t3, t5, 1b
|
|
- 2:
|
|
- jalr zero, 0x0(t0) /* Return to multibyte copy loop */
|
|
-
|
|
-byte_copy_until_aligned_reverse:
|
|
- beq t4, t6, 2f
|
|
- 1:
|
|
- lb t1, -1(a4)
|
|
- addi a4, a4, -1
|
|
- addi t4, t4, -1
|
|
- sb t1, 0(t4)
|
|
- bne t4, t6, 1b
|
|
- 2:
|
|
- jalr zero, 0x0(t0) /* Return to multibyte copy loop */
|
|
-
|
|
-/*
|
|
- * Simple byte copy loops.
|
|
- * These will byte copy until they reach the end of data to copy.
|
|
- * At that point, they will call to return from memmove.
|
|
- */
|
|
-byte_copy:
|
|
- bltu a1, a0, byte_copy_reverse
|
|
-
|
|
-byte_copy_forward:
|
|
- beq t3, t4, 2f
|
|
- 1:
|
|
- lb t1, 0(a1)
|
|
- addi a1, a1, 1
|
|
- addi t3, t3, 1
|
|
- sb t1, -1(t3)
|
|
- bne t3, t4, 1b
|
|
- 2:
|
|
- ret
|
|
-
|
|
-byte_copy_reverse:
|
|
- beq t4, t3, 2f
|
|
- 1:
|
|
- lb t1, -1(a4)
|
|
- addi a4, a4, -1
|
|
- addi t4, t4, -1
|
|
- sb t1, 0(t4)
|
|
- bne t4, t3, 1b
|
|
- 2:
|
|
-
|
|
-return_from_memmove:
|
|
- ret
|
|
-
|
|
-SYM_FUNC_END(memmove)
|
|
-SYM_FUNC_END(__memmove)
|
|
diff --git a/arch/riscv/lib/string.c b/arch/riscv/lib/string.c
|
|
index 32509ab0232e..bd52581e2068 100644
|
|
--- a/arch/riscv/lib/string.c
|
|
+++ b/arch/riscv/lib/string.c
|
|
@@ -89,3 +89,26 @@ EXPORT_SYMBOL(__memcpy);
|
|
|
|
void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);
|
|
EXPORT_SYMBOL(memcpy);
|
|
+
|
|
+/*
|
|
+ * Check whether a forward copy is safe and call memcpy() in that case,
|
|
+ * otherwise do a simple one-byte-at-a-time backward copy.
|
|
+ */
|
|
+void *__memmove(void *dest, const void *src, size_t count)
|
|
+{
|
|
+ if (dest < src || src + count <= dest)
|
|
+ return __memcpy(dest, src, count);
|
|
+
|
|
+ if (dest > src) {
|
|
+ const char *s = src + count;
|
|
+ char *tmp = dest + count;
|
|
+
|
|
+ while (count--)
|
|
+ *--tmp = *--s;
|
|
+ }
|
|
+ return dest;
|
|
+}
|
|
+EXPORT_SYMBOL(__memmove);
|
|
+
|
|
+void *memmove(void *dest, const void *src, size_t count) __weak __alias(__memmove);
|
|
+EXPORT_SYMBOL(memmove);
|
|
--
|
|
Armbian
|
|
|