From 82dee12d3b6b11714a14ffb46886f693bc745ec6 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wdijkstr@arm.com>
Date: Thu, 2 Dec 2021 18:30:55 +0000
Subject: [PATCH] AArch64: Optimize memcmp

Rewrite memcmp to improve performance. On small and medium inputs performance
is 10-20% better. Large inputs use a SIMD loop processing 64 bytes per
iteration, which is 30-50% faster depending on the size.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
(cherry picked from commit b51eb35c572b015641f03e3682c303f7631279b7)
---
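
Note (illustration only, not part of the patch): the new L(loop64) compares
64 bytes per iteration by XOR-ing four pairs of 16-byte vectors and
max-reducing the result, so any nonzero byte signals a mismatch. A rough
C/NEON sketch of that test follows; the helper name chunks64_differ is
hypothetical, and the real code's pairwise UMAXP reduction additionally
preserves where the first difference lies, which this sketch omits.

  #include <arm_neon.h>
  #include <stdint.h>

  /* Return nonzero iff the 64-byte chunks at p and q differ.  */
  static int chunks64_differ (const uint8_t *p, const uint8_t *q)
  {
    uint8x16_t d0 = veorq_u8 (vld1q_u8 (p),      vld1q_u8 (q));
    uint8x16_t d1 = veorq_u8 (vld1q_u8 (p + 16), vld1q_u8 (q + 16));
    uint8x16_t d2 = veorq_u8 (vld1q_u8 (p + 32), vld1q_u8 (q + 32));
    uint8x16_t d3 = veorq_u8 (vld1q_u8 (p + 48), vld1q_u8 (q + 48));
    uint8x16_t m  = vmaxq_u8 (vmaxq_u8 (d0, d1), vmaxq_u8 (d2, d3));
    return vmaxvq_u8 (m) != 0;  /* horizontal max over all 64 XOR bytes */
  }
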
 sysdeps/aarch64/memcmp.S | 241 ++++++++++++++++++++++-----------------
 1 file changed, 134 insertions(+), 107 deletions(-)
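
Note (illustration only, not part of the patch): the small-size paths and
L(last_bytes) avoid a byte loop by reloading the tail through overlapping
unaligned loads from src1end/src2end; the csel pair picks whichever 8-byte
pair differs first, and any bytes examined twice are already known equal by
then, so the overlap cannot change the result. A hedged C sketch of the
16-byte tail under that assumption, little-endian only (GCC/Clang's
__builtin_bswap64 stands in for the rev instructions; compare_tail16 and
load64 are hypothetical names):

  #include <stdint.h>
  #include <string.h>

  static uint64_t load64 (const uint8_t *p)
  {
    uint64_t v;
    memcpy (&v, p, sizeof v);  /* unaligned 8-byte load */
    return v;
  }

  /* Compare the last 16 bytes of buffers of length len >= 16.  */
  static int compare_tail16 (const uint8_t *s1, const uint8_t *s2, size_t len)
  {
    uint64_t a1 = load64 (s1 + len - 16), b1 = load64 (s2 + len - 16);
    uint64_t a2 = load64 (s1 + len - 8),  b2 = load64 (s2 + len - 8);
    if (a1 != b1) { a2 = a1; b2 = b1; }  /* mirrors the csel pair */
    a2 = __builtin_bswap64 (a2);         /* big-endian order makes the */
    b2 = __builtin_bswap64 (b2);         /* word compare lexicographic */
    return (a2 > b2) - (a2 < b2);
  }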

diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index c1937f6f5c..c7d56a8af0 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -22,105 +22,79 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  */
 
-/* Parameters and result. */
-#define src1		x0
-#define src2		x1
-#define limit		x2
-#define result		w0
-
-/* Internal variables. */
-#define data1		x3
-#define data1w		w3
-#define data1h		x4
-#define data2		x5
-#define data2w		w5
-#define data2h		x6
-#define tmp1		x7
-#define tmp2		x8
-
-ENTRY_ALIGN (memcmp, 6)
+#define src1		x0
+#define src2		x1
+#define limit		x2
+#define result		w0
+
+#define data1		x3
+#define data1w		w3
+#define data2		x4
+#define data2w		w4
+#define data3		x5
+#define data3w		w5
+#define data4		x6
+#define data4w		w6
+#define tmp		x6
+#define src1end		x7
+#define src2end		x8
+
+
+ENTRY (memcmp)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	SIZE_ARG (2)
 
-	subs	limit, limit, 16
+	cmp	limit, 16
 	b.lo	L(less16)
-
-	ldp	data1, data1h, [src1], 16
-	ldp	data2, data2h, [src2], 16
+	ldp	data1, data3, [src1]
+	ldp	data2, data4, [src2]
 	ccmp	data1, data2, 0, ne
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
 
-	subs	limit, limit, 16
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	cmp	limit, 32
 	b.ls	L(last_bytes)
-	cmp	limit, 112
-	b.lo	L(loop16)
-
-	and	tmp1, src1, 15
-	add	limit, limit, tmp1
-	sub	src1, src1, tmp1
-	sub	src2, src2, tmp1
-	subs	limit, limit, 48
+	cmp	limit, 160
+	b.hs	L(loop_align)
+	sub	limit, limit, 32
 
-	/* Compare 128 up bytes using aligned access. */
 	.p2align 4
-L(loop64):
-	ldp	data1, data1h, [src1]
-	ldp	data2, data2h, [src2]
-	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
-
-	ldp	data1, data1h, [src1, 16]
-	ldp	data2, data2h, [src2, 16]
-	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
-
-	ldp	data1, data1h, [src1, 32]
-	ldp	data2, data2h, [src2, 32]
-	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
-
-	ldp	data1, data1h, [src1, 48]
-	ldp	data2, data2h, [src2, 48]
+L(loop32):
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
 	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+	cmp	limit, 16
+	b.ls	L(last_bytes)
 
-	subs	limit, limit, 64
-	add	src1, src1, 64
-	add	src2, src2, 64
-	b.pl	L(loop64)
-	adds	limit, limit, 48
-	b.lo	L(last_bytes)
-
-L(loop16):
-	ldp	data1, data1h, [src1], 16
-	ldp	data2, data2h, [src2], 16
+	ldp	data1, data3, [src1, 32]
+	ldp	data2, data4, [src2, 32]
 	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+	add	src1, src1, 32
+	add	src2, src2, 32
+L(last64):
+	subs	limit, limit, 32
+	b.hi	L(loop32)
 
-	subs	limit, limit, 16
-	b.hi	L(loop16)
 	/* Compare last 1-16 bytes using unaligned access. */
 L(last_bytes):
-	add	src1, src1, limit
-	add	src2, src2, limit
-	ldp	data1, data1h, [src1]
-	ldp	data2, data2h, [src2]
+	ldp	data1, data3, [src1end, -16]
+	ldp	data2, data4, [src2end, -16]
+L(return2):
+	cmp	data1, data2
+	csel	data1, data1, data3, ne
+	csel	data2, data2, data4, ne
 
 	/* Compare data bytes and set return value to 0, -1 or 1. */
-L(return64):
-	cmp	data1, data2
-	csel	data1, data1, data1h, ne
-	csel	data2, data2, data2h, ne
 L(return):
 #ifndef __AARCH64EB__
 	rev	data1, data1
@@ -133,45 +107,98 @@ L(return):
 
 	.p2align 4
 L(less16):
-	adds	limit, limit, 8
-	b.lo	L(less8) //lo:<
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	tbz	limit, 3, L(less8)
 	ldr	data1, [src1]
 	ldr	data2, [src2]
-	/* equal 8 optimized */
-	ccmp	data1, data2, 0, ne
-	b.ne	L(return)
-
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
-	b	L(return)
+	ldr	data3, [src1end, -8]
+	ldr	data4, [src2end, -8]
+	b	L(return2)
 
 	.p2align 4
 L(less8):
-	adds	limit, limit, 4
-	b.lo	L(less4)
+	tbz	limit, 2, L(less4)
 	ldr	data1w, [src1]
 	ldr	data2w, [src2]
-	ccmp	data1w, data2w, 0, ne
-	b.ne	L(return)
-	ldr	data1w, [src1, limit]
-	ldr	data2w, [src2, limit]
-	b	L(return)
+	ldr	data3w, [src1end, -4]
+	ldr	data4w, [src2end, -4]
+	b	L(return2)
 
-	.p2align 4
 L(less4):
-	adds	limit, limit, 4
-	b.eq	L(ret_0)
-
-L(byte_loop):
-	ldrb	data1w, [src1], 1
-	ldrb	data2w, [src2], 1
-	subs	limit, limit, 1
-	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000. */
-	b.eq	L(byte_loop)
+	tbz	limit, 1, L(less2)
+	ldrh	data1w, [src1]
+	ldrh	data2w, [src2]
+	cmp	data1w, data2w
+	b.ne	L(return)
+L(less2):
+	mov	result, 0
+	tbz	limit, 0, L(return_zero)
+	ldrb	data1w, [src1end, -1]
+	ldrb	data2w, [src2end, -1]
 	sub	result, data1w, data2w
+L(return_zero):
 	ret
-L(ret_0):
-	mov	result, 0
+
+L(loop_align):
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
+	cmp	data1, data2
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+
+	/* Align src2 and adjust src1, src2 and limit. */
+	and	tmp, src2, 15
+	sub	tmp, tmp, 16
+	sub	src2, src2, tmp
+	add	limit, limit, tmp
+	sub	src1, src1, tmp
+	sub	limit, limit, 64 + 16
+
+	.p2align 4
+L(loop64):
+	ldr	q0, [src1, 16]
+	ldr	q1, [src2, 16]
+	subs	limit, limit, 64
+	ldr	q2, [src1, 32]
+	ldr	q3, [src2, 32]
+	eor	v0.16b, v0.16b, v1.16b
+	eor	v1.16b, v2.16b, v3.16b
+	ldr	q2, [src1, 48]
+	ldr	q3, [src2, 48]
+	umaxp	v0.16b, v0.16b, v1.16b
+	ldr	q4, [src1, 64]!
+	ldr	q5, [src2, 64]!
+	eor	v1.16b, v2.16b, v3.16b
+	eor	v2.16b, v4.16b, v5.16b
+	umaxp	v1.16b, v1.16b, v2.16b
+	umaxp	v0.16b, v0.16b, v1.16b
+	umaxp	v0.16b, v0.16b, v0.16b
+	fmov	tmp, d0
+	ccmp	tmp, 0, 0, hi
+	b.eq	L(loop64)
+
+	/* If equal, process last 1-64 bytes using scalar loop. */
+	add	limit, limit, 64 + 16
+	cbz	tmp, L(last64)
+
+	/* Determine the 8-byte aligned offset of the first difference. */
+#ifdef __AARCH64EB__
+	rev16	tmp, tmp
+#endif
+	rev	tmp, tmp
+	clz	tmp, tmp
+	bic	tmp, tmp, 7
+	sub	tmp, tmp, 48
+	ldr	data1, [src1, tmp]
+	ldr	data2, [src2, tmp]
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	mov	result, 1
+	cmp	data1, data2
+	cneg	result, result, lo
 	ret
 
 END (memcmp)
-- 
2.27.0