
Commit 8fdef85

ardbiesheuvel authored and Eric Biggers committed
lib/crc: arm64: Simplify intrinsics implementation
NEON intrinsics are useful because they remove the need for manual register
allocation, and the resulting code can be re-compiled and optimized for
different micro-architectures, and shared between arm64 and 32-bit ARM.
However, the strong typing of the vector variables can lead to
incomprehensible gibberish, as is the case with the new CRC64 implementation.

To address this, let's repaint all variables as uint64x2_t to minimize the
number of vreinterpretq_xxx() calls, and to be able to rely on the ^ operator
for exclusive OR operations. This makes the code much more concise and
readable.

While at it, wrap the calls to vmull_p64() et al in order to have a more
consistent calling convention, and encapsulate any remaining vreinterpret()
calls that are still needed.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20260330144630.33026-11-ardb@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
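To illustrate the repainting the message describes, here is a minimal sketch (not part of the patch; the helper names fold_xor_typed and fold_xor_plain are made up here) contrasting a strongly typed XOR with the uint64x2_t form that the GCC/Clang vector extensions allow on AArch64:

#include <arm_neon.h>

/* Illustrative only -- not part of the patch. */

/* Before: every XOR needs explicit reinterpret casts between vector types. */
static inline poly64x2_t fold_xor_typed(poly64x2_t a, poly64x2_t b)
{
	return vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(a),
					     vreinterpretq_u8_p64(b)));
}

/* After: with both operands kept as uint64x2_t, the ^ operator works directly. */
static inline uint64x2_t fold_xor_plain(uint64x2_t a, uint64x2_t b)
{
	return a ^ b;
}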
1 parent f956dc8 commit 8fdef85

1 file changed


File tree

lib/crc/arm64/crc64-neon-inner.c

Lines changed: 32 additions & 45 deletions
@@ -8,71 +8,58 @@
 
 u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);
 
-#define GET_P64_0(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 0))
-#define GET_P64_1(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 1))
-
 /* x^191 mod G, x^127 mod G */
 static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL,
 					0x21e9761e252621acULL };
 /* floor(x^127 / G), (G - x^64) / x */
 static const u64 bconsts_val[2] = { 0x27ecfa329aef9f77ULL,
 				    0x34d926535897936aULL };
 
-u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
+static inline uint64x2_t pmull64(uint64x2_t a, uint64x2_t b)
 {
-	uint64x2_t v0_u64 = { crc, 0 };
-	poly64x2_t v0 = vreinterpretq_p64_u64(v0_u64);
-	poly64x2_t fold_consts =
-		vreinterpretq_p64_u64(vld1q_u64(fold_consts_val));
-	poly64x2_t v1 = vreinterpretq_p64_u8(vld1q_u8(p));
+	return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 0),
+						vgetq_lane_u64(b, 0)));
+}
 
-	v0 = vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(v0),
-					   vreinterpretq_u8_p64(v1)));
-	p += 16;
-	len -= 16;
+static inline uint64x2_t pmull64_high(uint64x2_t a, uint64x2_t b)
+{
+	poly64x2_t l = vreinterpretq_p64_u64(a);
+	poly64x2_t m = vreinterpretq_p64_u64(b);
 
-	do {
-		v1 = vreinterpretq_p64_u8(vld1q_u8(p));
+	return vreinterpretq_u64_p128(vmull_high_p64(l, m));
+}
 
-		poly128_t v2 = vmull_high_p64(fold_consts, v0);
-		poly128_t v0_128 =
-			vmull_p64(GET_P64_0(fold_consts), GET_P64_0(v0));
+static inline uint64x2_t pmull64_hi_lo(uint64x2_t a, uint64x2_t b)
+{
+	return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 1),
+						vgetq_lane_u64(b, 0)));
+}
 
-		uint8x16_t x0 = veorq_u8(vreinterpretq_u8_p128(v0_128),
-					 vreinterpretq_u8_p128(v2));
+u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
+{
+	uint64x2_t fold_consts = vld1q_u64(fold_consts_val);
+	uint64x2_t v0 = { crc, 0 };
+	uint64x2_t zero = { };
 
-		x0 = veorq_u8(x0, vreinterpretq_u8_p64(v1));
-		v0 = vreinterpretq_p64_u8(x0);
+	for (;;) {
+		v0 ^= vreinterpretq_u64_u8(vld1q_u8(p));
 
 		p += 16;
 		len -= 16;
-	} while (len >= 16);
-
-	/* Multiply the 128-bit value by x^64 and reduce it back to 128 bits. */
-	poly64x2_t v7 = vreinterpretq_p64_u64((uint64x2_t){ 0, 0 });
-	poly128_t v1_128 = vmull_p64(GET_P64_1(fold_consts), GET_P64_0(v0));
+		if (len < 16)
+			break;
 
-	uint8x16_t ext_v0 =
-		vextq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p64(v7), 8);
-	uint8x16_t x0 = veorq_u8(ext_v0, vreinterpretq_u8_p128(v1_128));
+		v0 = pmull64(fold_consts, v0) ^ pmull64_high(fold_consts, v0);
+	}
 
-	v0 = vreinterpretq_p64_u8(x0);
+	/* Multiply the 128-bit value by x^64 and reduce it back to 128 bits. */
+	v0 = vextq_u64(v0, zero, 1) ^ pmull64_hi_lo(fold_consts, v0);
 
 	/* Final Barrett reduction */
-	poly64x2_t bconsts = vreinterpretq_p64_u64(vld1q_u64(bconsts_val));
-
-	v1_128 = vmull_p64(GET_P64_0(bconsts), GET_P64_0(v0));
-
-	poly64x2_t v1_64 = vreinterpretq_p64_u8(vreinterpretq_u8_p128(v1_128));
-	poly128_t v3_128 = vmull_p64(GET_P64_1(bconsts), GET_P64_0(v1_64));
-
-	x0 = veorq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p128(v3_128));
-
-	uint8x16_t ext_v2 = vextq_u8(vreinterpretq_u8_p64(v7),
-				     vreinterpretq_u8_p128(v1_128), 8);
+	uint64x2_t bconsts = vld1q_u64(bconsts_val);
+	uint64x2_t final = pmull64(bconsts, v0);
 
-	x0 = veorq_u8(x0, ext_v2);
+	v0 ^= vextq_u64(zero, final, 1) ^ pmull64_hi_lo(bconsts, final);
 
-	v0 = vreinterpretq_p64_u8(x0);
-	return vgetq_lane_u64(vreinterpretq_u64_p64(v0), 1);
+	return vgetq_lane_u64(v0, 1);
 }
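As background for the folding steps in the diff above: vmull_p64()/PMULL performs a carry-less (polynomial) multiplication over GF(2), so partial products are combined with XOR rather than carried addition. Below is a minimal plain-C model of the 64x64 -> 128-bit operation, illustrative only and not part of the patch (the struct and function names are made up here):

#include <stdint.h>

/* 128-bit result of a carry-less multiply, split into two 64-bit halves. */
struct clmul128 { uint64_t lo, hi; };

/*
 * Bitwise model of what vmull_p64() (the PMULL instruction) computes:
 * for every set bit i of b, XOR in a copy of a shifted left by i.
 * No carries propagate between bit positions, which is what makes this
 * multiplication in GF(2)[x] rather than ordinary integer multiplication.
 */
static struct clmul128 clmul64(uint64_t a, uint64_t b)
{
	struct clmul128 r = { 0, 0 };

	for (int i = 0; i < 64; i++) {
		if (b & (1ULL << i)) {
			r.lo ^= a << i;
			if (i)
				r.hi ^= a >> (64 - i);
		}
	}
	return r;
}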

0 commit comments
