Skip to content

Commit d142ab3

Browse files
committed
Merge tag 'crc-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux
Pull CRC updates from Eric Biggers:

 - Several improvements related to crc_kunit, to align with the standard
   KUnit conventions and make it easier for developers and CI systems to
   run this test suite

 - Add an arm64-optimized implementation of CRC64-NVME

 - Remove unused code for big endian arm64

* tag 'crc-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux:
  lib/crc: arm64: Simplify intrinsics implementation
  lib/crc: arm64: Use existing macros for kernel-mode FPU cflags
  lib/crc: arm64: Drop unnecessary chunking logic from crc64
  lib/crc: arm64: Assume a little-endian kernel
  lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation
  lib/crc: arm64: Drop check for CONFIG_KERNEL_MODE_NEON
  crypto: crc32c - Remove another outdated comment
  crypto: crc32c - Remove more outdated usage information
  kunit: configs: Enable all CRC tests in all_tests.config
  lib/crc: tests: Add a .kunitconfig file
  lib/crc: tests: Add CRC_ENABLE_ALL_FOR_KUNIT
  lib/crc: tests: Make crc_kunit test only the enabled CRC variants
2 parents 370c388 + 8fdef85 commit d142ab3

10 files changed

Lines changed: 172 additions & 65 deletions

File tree

crypto/crc32c.c

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
// SPDX-License-Identifier: GPL-2.0-or-later
22
/*
3-
* Cryptographic API.
4-
*
5-
* CRC32C chksum
3+
* crypto_shash support for CRC-32C
64
*
75
*@Article{castagnoli-crc,
86
* author = { Guy Castagnoli and Stefan Braeuer and Martin Herrman},
@@ -15,16 +13,6 @@
1513
* pages = {},
1614
* month = {June},
1715
*}
18-
* Used by the iSCSI driver, possibly others, and derived from
19-
* the iscsi-crc.c module of the linux-iscsi driver at
20-
* http://linux-iscsi.sourceforge.net.
21-
*
22-
* Following the example of lib/crc32, this function is intended to be
23-
* flexible and useful for all users. Modules that currently have their
24-
* own crc32c, but hopefully may be able to use this one are:
25-
* net/sctp (please add all your doco to here if you change to
26-
* use this one!)
27-
* <endoflist>
2816
*
2917
* Copyright (c) 2004 Cisco Systems, Inc.
3018
* Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
@@ -49,11 +37,6 @@ struct chksum_desc_ctx {
4937
u32 crc;
5038
};
5139

52-
/*
53-
* Steps through buffer one byte at a time, calculates reflected
54-
* crc using table.
55-
*/
56-
5740
static int chksum_init(struct shash_desc *desc)
5841
{
5942
struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);

lib/crc/.kunitconfig

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
CONFIG_KUNIT=y
2+
CONFIG_CRC_ENABLE_ALL_FOR_KUNIT=y
3+
CONFIG_CRC_KUNIT_TEST=y

lib/crc/Kconfig

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ config CRC_T10DIF_ARCH
4848
bool
4949
depends on CRC_T10DIF && CRC_OPTIMIZATIONS
5050
default y if ARM && KERNEL_MODE_NEON
51-
default y if ARM64 && KERNEL_MODE_NEON
51+
default y if ARM64
5252
default y if PPC64 && ALTIVEC
5353
default y if RISCV && RISCV_ISA_ZBC
5454
default y if X86
@@ -82,6 +82,7 @@ config CRC64
8282
config CRC64_ARCH
8383
bool
8484
depends on CRC64 && CRC_OPTIMIZATIONS
85+
default y if ARM64
8586
default y if RISCV && RISCV_ISA_ZBC && 64BIT
8687
default y if X86_64
8788

@@ -99,18 +100,27 @@ config CRC_OPTIMIZATIONS
99100

100101
config CRC_KUNIT_TEST
101102
tristate "KUnit tests for CRC functions" if !KUNIT_ALL_TESTS
102-
depends on KUNIT
103+
depends on KUNIT && (CRC7 || CRC16 || CRC_T10DIF || CRC32 || CRC64)
103104
default KUNIT_ALL_TESTS
105+
help
106+
Unit tests for the CRC library functions.
107+
108+
This is intended to help people writing architecture-specific
109+
optimized versions. If unsure, say N.
110+
111+
config CRC_ENABLE_ALL_FOR_KUNIT
112+
tristate "Enable all CRC functions for KUnit test"
113+
depends on KUNIT
104114
select CRC7
105115
select CRC16
106116
select CRC_T10DIF
107117
select CRC32
108118
select CRC64
109119
help
110-
Unit tests for the CRC library functions.
120+
Enable all CRC functions that have test code in CRC_KUNIT_TEST.
111121

112-
This is intended to help people writing architecture-specific
113-
optimized versions. If unsure, say N.
122+
Enable this only if you'd like the CRC KUnit test suite to test all
123+
the CRC variants, even ones that wouldn't otherwise need to be built.
114124

115125
config CRC_BENCHMARK
116126
bool "Benchmark for the CRC functions"

lib/crc/Makefile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,14 @@ obj-$(CONFIG_CRC64) += crc64.o
3838
crc64-y := crc64-main.o
3939
ifeq ($(CONFIG_CRC64_ARCH),y)
4040
CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH)
41+
42+
CFLAGS_REMOVE_arm64/crc64-neon-inner.o += $(CC_FLAGS_NO_FPU)
43+
CFLAGS_arm64/crc64-neon-inner.o += $(CC_FLAGS_FPU) -march=armv8-a+crypto
44+
crc64-$(CONFIG_ARM64) += arm64/crc64-neon-inner.o
45+
4146
crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o
4247
crc64-$(CONFIG_X86) += x86/crc64-pclmul.o
43-
endif
48+
endif # CONFIG_CRC64_ARCH
4449

4550
obj-y += tests/
4651

lib/crc/arm64/crc-t10dif-core.S

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -181,13 +181,13 @@ SYM_FUNC_END(__pmull_p8_16x64)
181181

182182
pmull16x64_\p fold_consts, \reg1, v8
183183

184-
CPU_LE( rev64 v11.16b, v11.16b )
185-
CPU_LE( rev64 v12.16b, v12.16b )
184+
rev64 v11.16b, v11.16b
185+
rev64 v12.16b, v12.16b
186186

187187
pmull16x64_\p fold_consts, \reg2, v9
188188

189-
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
190-
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
189+
ext v11.16b, v11.16b, v11.16b, #8
190+
ext v12.16b, v12.16b, v12.16b, #8
191191

192192
eor \reg1\().16b, \reg1\().16b, v8.16b
193193
eor \reg2\().16b, \reg2\().16b, v9.16b
@@ -220,22 +220,22 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
220220
ldp q4, q5, [buf, #0x40]
221221
ldp q6, q7, [buf, #0x60]
222222
add buf, buf, #0x80
223-
CPU_LE( rev64 v0.16b, v0.16b )
224-
CPU_LE( rev64 v1.16b, v1.16b )
225-
CPU_LE( rev64 v2.16b, v2.16b )
226-
CPU_LE( rev64 v3.16b, v3.16b )
227-
CPU_LE( rev64 v4.16b, v4.16b )
228-
CPU_LE( rev64 v5.16b, v5.16b )
229-
CPU_LE( rev64 v6.16b, v6.16b )
230-
CPU_LE( rev64 v7.16b, v7.16b )
231-
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
232-
CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
233-
CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
234-
CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
235-
CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
236-
CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
237-
CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
238-
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
223+
rev64 v0.16b, v0.16b
224+
rev64 v1.16b, v1.16b
225+
rev64 v2.16b, v2.16b
226+
rev64 v3.16b, v3.16b
227+
rev64 v4.16b, v4.16b
228+
rev64 v5.16b, v5.16b
229+
rev64 v6.16b, v6.16b
230+
rev64 v7.16b, v7.16b
231+
ext v0.16b, v0.16b, v0.16b, #8
232+
ext v1.16b, v1.16b, v1.16b, #8
233+
ext v2.16b, v2.16b, v2.16b, #8
234+
ext v3.16b, v3.16b, v3.16b, #8
235+
ext v4.16b, v4.16b, v4.16b, #8
236+
ext v5.16b, v5.16b, v5.16b, #8
237+
ext v6.16b, v6.16b, v6.16b, #8
238+
ext v7.16b, v7.16b, v7.16b, #8
239239

240240
// XOR the first 16 data *bits* with the initial CRC value.
241241
movi v8.16b, #0
@@ -288,8 +288,8 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
288288
pmull16x64_\p fold_consts, v7, v8
289289
eor v7.16b, v7.16b, v8.16b
290290
ldr q0, [buf], #16
291-
CPU_LE( rev64 v0.16b, v0.16b )
292-
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
291+
rev64 v0.16b, v0.16b
292+
ext v0.16b, v0.16b, v0.16b, #8
293293
eor v7.16b, v7.16b, v0.16b
294294
subs len, len, #16
295295
b.ge .Lfold_16_bytes_loop_\@
@@ -310,8 +310,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
310310
// v0 = last 16 original data bytes
311311
add buf, buf, len
312312
ldr q0, [buf, #-16]
313-
CPU_LE( rev64 v0.16b, v0.16b )
314-
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
313+
rev64 v0.16b, v0.16b
314+
ext v0.16b, v0.16b, v0.16b, #8
315315

316316
// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
317317
adr_l x4, .Lbyteshift_table + 16
@@ -344,8 +344,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
344344

345345
// Load the first 16 data bytes.
346346
ldr q7, [buf], #0x10
347-
CPU_LE( rev64 v7.16b, v7.16b )
348-
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
347+
rev64 v7.16b, v7.16b
348+
ext v7.16b, v7.16b, v7.16b, #8
349349

350350
// XOR the first 16 data *bits* with the initial CRC value.
351351
movi v0.16b, #0
@@ -382,8 +382,8 @@ SYM_FUNC_START(crc_t10dif_pmull_p8)
382382

383383
crc_t10dif_pmull p8
384384

385-
CPU_LE( rev64 v7.16b, v7.16b )
386-
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
385+
rev64 v7.16b, v7.16b
386+
ext v7.16b, v7.16b, v7.16b, #8
387387
str q7, [x3]
388388

389389
frame_pop

lib/crc/arm64/crc32-core.S

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,24 +29,19 @@
2929
.endm
3030

3131
.macro hwordle, reg
32-
CPU_BE( rev16 \reg, \reg )
3332
.endm
3433

3534
.macro hwordbe, reg
36-
CPU_LE( rev \reg, \reg )
35+
rev \reg, \reg
3736
rbit \reg, \reg
38-
CPU_BE( lsr \reg, \reg, #16 )
3937
.endm
4038

4139
.macro le, regs:vararg
42-
.irp r, \regs
43-
CPU_BE( rev \r, \r )
44-
.endr
4540
.endm
4641

4742
.macro be, regs:vararg
4843
.irp r, \regs
49-
CPU_LE( rev \r, \r )
44+
rev \r, \r
5045
.endr
5146
.irp r, \regs
5247
rbit \r, \r

lib/crc/arm64/crc64-neon-inner.c

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
// SPDX-License-Identifier: GPL-2.0-only
2+
/*
3+
* Accelerated CRC64 (NVMe) using ARM NEON C intrinsics
4+
*/
5+
6+
#include <linux/types.h>
7+
#include <asm/neon-intrinsics.h>
8+
9+
/* Forward declaration; definition below in this file. */
u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);
10+
11+
/*
 * Folding constants.  G is presumably the bit-reflected CRC64-NVME
 * generator polynomial — TODO confirm against lib/crc/crc64-main.c.
 */
/* x^191 mod G, x^127 mod G */
12+
static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL,
13+
0x21e9761e252621acULL };
/* Barrett reduction constants. */
/* floor(x^127 / G), (G - x^64) / x */
15+
static const u64 bconsts_val[2] = { 0x27ecfa329aef9f77ULL,
16+
0x34d926535897936aULL };
17+
18+
/*
 * Carryless (polynomial) multiply of the low 64-bit lanes of a and b,
 * returning the 128-bit product as a uint64x2_t.
 */
static inline uint64x2_t pmull64(uint64x2_t a, uint64x2_t b)
{
	u64 a_lo = vgetq_lane_u64(a, 0);
	u64 b_lo = vgetq_lane_u64(b, 0);

	return vreinterpretq_u64_p128(vmull_p64(a_lo, b_lo));
}
23+
24+
/*
 * Carryless (polynomial) multiply of the high 64-bit lanes of a and b,
 * returning the 128-bit product as a uint64x2_t.
 */
static inline uint64x2_t pmull64_high(uint64x2_t a, uint64x2_t b)
{
	return vreinterpretq_u64_p128(
		vmull_high_p64(vreinterpretq_p64_u64(a),
			       vreinterpretq_p64_u64(b)));
}
31+
32+
/*
 * Carryless (polynomial) multiply of the high 64-bit lane of a by the
 * low 64-bit lane of b, returning the 128-bit product as a uint64x2_t.
 */
static inline uint64x2_t pmull64_hi_lo(uint64x2_t a, uint64x2_t b)
{
	u64 a_hi = vgetq_lane_u64(a, 1);
	u64 b_lo = vgetq_lane_u64(b, 0);

	return vreinterpretq_u64_p128(vmull_p64(a_hi, b_lo));
}
37+
38+
/*
 * Compute the CRC64-NVME of the buffer using PMULL-based folding.
 *
 * @crc: initial CRC value
 * @p:   data to checksum
 * @len: number of bytes; the caller (crc64_nvme_arch in crc64.h) passes
 *       a multiple of 16 that is at least 16
 *
 * Must be called with kernel-mode NEON enabled.
 */
u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
{
	const uint64x2_t consts = vld1q_u64(fold_consts_val);
	const uint64x2_t zero = { };
	uint64x2_t acc = { crc, 0 };

	/* Fold the message into the 128-bit accumulator, 16 bytes at a time. */
	do {
		acc ^= vreinterpretq_u64_u8(vld1q_u8(p));
		p += 16;
		len -= 16;
		if (len < 16)
			break;
		acc = pmull64(consts, acc) ^ pmull64_high(consts, acc);
	} while (1);

	/* Multiply the 128-bit value by x^64 and reduce it back to 128 bits. */
	acc = vextq_u64(acc, zero, 1) ^ pmull64_hi_lo(consts, acc);

	/* Final Barrett reduction from 128 bits down to the 64-bit CRC. */
	{
		const uint64x2_t bconsts = vld1q_u64(bconsts_val);
		uint64x2_t quot = pmull64(bconsts, acc);

		acc ^= vextq_u64(zero, quot, 1) ^ pmull64_hi_lo(bconsts, quot);
	}

	return vgetq_lane_u64(acc, 1);
}

lib/crc/arm64/crc64.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
/* SPDX-License-Identifier: GPL-2.0-only */
2+
/*
3+
* CRC64 using ARM64 PMULL instructions
4+
*/
5+
6+
#include <linux/cpufeature.h>
7+
#include <asm/simd.h>
8+
#include <linux/minmax.h>
9+
#include <linux/sizes.h>
10+
11+
/* NEON implementation in arm64/crc64-neon-inner.c. */
u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);
12+
13+
/* No arm64-optimized CRC64-BE: fall back to the generic implementation. */
#define crc64_be_arch crc64_be_generic
14+
15+
/*
 * CRC64-NVME entry point for arm64.
 *
 * For buffers of at least 128 bytes, when the CPU has the PMULL feature
 * and SIMD may be used in the current context, checksum the largest
 * 16-byte-multiple prefix with the NEON implementation and let the
 * generic code handle the remainder (or the whole buffer otherwise).
 */
static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
{
	if (len >= 128 && cpu_have_named_feature(PMULL) &&
	    likely(may_use_simd())) {
		size_t tail = len & 15;		/* leftover bytes (< 16) */
		size_t bulk = len - tail;	/* multiple of 16 */

		scoped_ksimd()
			crc = crc64_nvme_arm64_c(crc, p, bulk);

		p += bulk;
		len = tail;
	}
	return crc64_nvme_generic(crc, p, len);
}

0 commit comments

Comments
 (0)