
Commit 8fdef85

ardbiesheuvel authored and Eric Biggers committed
lib/crc: arm64: Simplify intrinsics implementation
NEON intrinsics are useful because they remove the need for manual register
allocation, and the resulting code can be re-compiled and optimized for
different micro-architectures, and shared between arm64 and 32-bit ARM.
However, the strong typing of the vector variables can lead to
incomprehensible gibberish, as is the case with the new CRC64 implementation.

To address this, let's repaint all variables as uint64x2_t to minimize the
number of vreinterpretq_xxx() calls, and to be able to rely on the ^ operator
for exclusive OR operations. This makes the code much more concise and
readable.

While at it, wrap the calls to vmull_p64() et al in order to have a more
consistent calling convention, and encapsulate any remaining vreinterpret()
calls that are still needed.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20260330144630.33026-11-ardb@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
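To illustrate the repainting the message describes, here is a minimal sketch (not part of the patch; the helper names fold_xor_typed and fold_xor_plain are made up here) contrasting a strongly typed XOR with the uint64x2_t form that the GCC/Clang vector extensions allow on AArch64:

#include <arm_neon.h>

/* Illustrative only -- not part of the patch. */

/* Before: every XOR needs explicit reinterpret casts between vector types. */
static inline poly64x2_t fold_xor_typed(poly64x2_t a, poly64x2_t b)
{
	return vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(a),
					     vreinterpretq_u8_p64(b)));
}

/* After: with both operands kept as uint64x2_t, the ^ operator works directly. */
static inline uint64x2_t fold_xor_plain(uint64x2_t a, uint64x2_t b)
{
	return a ^ b;
}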
1 parent f956dc8 commit 8fdef85

1 file changed


File tree

lib/crc/arm64/crc64-neon-inner.c

Lines changed: 32 additions & 45 deletions
@@ -8,71 +8,58 @@
 
 u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);
 
-#define GET_P64_0(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 0))
-#define GET_P64_1(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 1))
-
 /* x^191 mod G, x^127 mod G */
 static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL,
 					0x21e9761e252621acULL };
 /* floor(x^127 / G), (G - x^64) / x */
 static const u64 bconsts_val[2] = { 0x27ecfa329aef9f77ULL,
 				    0x34d926535897936aULL };
 
-u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
+static inline uint64x2_t pmull64(uint64x2_t a, uint64x2_t b)
 {
-	uint64x2_t v0_u64 = { crc, 0 };
-	poly64x2_t v0 = vreinterpretq_p64_u64(v0_u64);
-	poly64x2_t fold_consts =
-		vreinterpretq_p64_u64(vld1q_u64(fold_consts_val));
-	poly64x2_t v1 = vreinterpretq_p64_u8(vld1q_u8(p));
+	return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 0),
+						vgetq_lane_u64(b, 0)));
+}
 
-	v0 = vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(v0),
-					   vreinterpretq_u8_p64(v1)));
-	p += 16;
-	len -= 16;
+static inline uint64x2_t pmull64_high(uint64x2_t a, uint64x2_t b)
+{
+	poly64x2_t l = vreinterpretq_p64_u64(a);
+	poly64x2_t m = vreinterpretq_p64_u64(b);
 
-	do {
-		v1 = vreinterpretq_p64_u8(vld1q_u8(p));
+	return vreinterpretq_u64_p128(vmull_high_p64(l, m));
+}
 
-		poly128_t v2 = vmull_high_p64(fold_consts, v0);
-		poly128_t v0_128 =
-			vmull_p64(GET_P64_0(fold_consts), GET_P64_0(v0));
+static inline uint64x2_t pmull64_hi_lo(uint64x2_t a, uint64x2_t b)
+{
+	return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 1),
+						vgetq_lane_u64(b, 0)));
+}
 
-		uint8x16_t x0 = veorq_u8(vreinterpretq_u8_p128(v0_128),
-					 vreinterpretq_u8_p128(v2));
+u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
+{
+	uint64x2_t fold_consts = vld1q_u64(fold_consts_val);
+	uint64x2_t v0 = { crc, 0 };
+	uint64x2_t zero = { };
 
-		x0 = veorq_u8(x0, vreinterpretq_u8_p64(v1));
-		v0 = vreinterpretq_p64_u8(x0);
+	for (;;) {
+		v0 ^= vreinterpretq_u64_u8(vld1q_u8(p));
 
 		p += 16;
 		len -= 16;
-	} while (len >= 16);
-
-	/* Multiply the 128-bit value by x^64 and reduce it back to 128 bits. */
-	poly64x2_t v7 = vreinterpretq_p64_u64((uint64x2_t){ 0, 0 });
-	poly128_t v1_128 = vmull_p64(GET_P64_1(fold_consts), GET_P64_0(v0));
+		if (len < 16)
+			break;
 
-	uint8x16_t ext_v0 =
-		vextq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p64(v7), 8);
-	uint8x16_t x0 = veorq_u8(ext_v0, vreinterpretq_u8_p128(v1_128));
+		v0 = pmull64(fold_consts, v0) ^ pmull64_high(fold_consts, v0);
+	}
 
-	v0 = vreinterpretq_p64_u8(x0);
+	/* Multiply the 128-bit value by x^64 and reduce it back to 128 bits. */
+	v0 = vextq_u64(v0, zero, 1) ^ pmull64_hi_lo(fold_consts, v0);
 
 	/* Final Barrett reduction */
-	poly64x2_t bconsts = vreinterpretq_p64_u64(vld1q_u64(bconsts_val));
-
-	v1_128 = vmull_p64(GET_P64_0(bconsts), GET_P64_0(v0));
-
-	poly64x2_t v1_64 = vreinterpretq_p64_u8(vreinterpretq_u8_p128(v1_128));
-	poly128_t v3_128 = vmull_p64(GET_P64_1(bconsts), GET_P64_0(v1_64));
-
-	x0 = veorq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p128(v3_128));
-
-	uint8x16_t ext_v2 = vextq_u8(vreinterpretq_u8_p64(v7),
-				     vreinterpretq_u8_p128(v1_128), 8);
+	uint64x2_t bconsts = vld1q_u64(bconsts_val);
+	uint64x2_t final = pmull64(bconsts, v0);
 
-	x0 = veorq_u8(x0, ext_v2);
+	v0 ^= vextq_u64(zero, final, 1) ^ pmull64_hi_lo(bconsts, final);
 
-	v0 = vreinterpretq_p64_u8(x0);
-	return vgetq_lane_u64(vreinterpretq_u64_p64(v0), 1);
+	return vgetq_lane_u64(v0, 1);
 }
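As background for the folding steps in the diff above: vmull_p64()/PMULL performs a carry-less (polynomial) multiplication over GF(2), so partial products are combined with XOR rather than carried addition. Below is a minimal plain-C model of the 64x64 -> 128-bit operation, illustrative only and not part of the patch (the struct and function names are made up here):

#include <stdint.h>

/* 128-bit result of a carry-less multiply, split into two 64-bit halves. */
struct clmul128 { uint64_t lo, hi; };

/*
 * Bitwise model of what vmull_p64() (the PMULL instruction) computes:
 * for every set bit i of b, XOR in a copy of a shifted left by i.
 * No carries propagate between bit positions, which is what makes this
 * multiplication in GF(2)[x] rather than ordinary integer multiplication.
 */
static struct clmul128 clmul64(uint64_t a, uint64_t b)
{
	struct clmul128 r = { 0, 0 };

	for (int i = 0; i < 64; i++) {
		if (b & (1ULL << i)) {
			r.lo ^= a << i;
			if (i)
				r.hi ^= a >> (64 - i);
		}
	}
	return r;
}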

0 commit comments
