Commit 6c1b0da1 authored by Ard Biesheuvel, committed by Herbert Xu
Browse files

crypto: arm64/crct10dif - preparatory refactor for 8x8 PMULL version



Reorganize the CRC-T10DIF asm routine so we can easily instantiate an
alternative version based on 8x8 polynomial multiplication in a
subsequent patch.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 598b7d41
Loading
Loading
Loading
Loading
+85 −75
Original line number Diff line number Diff line
@@ -80,7 +80,46 @@

	vzr		.req	v13

ENTRY(crc_t10dif_pmull)
	// fold64: fold the two 128-bit accumulators reg1 and reg2 forward over
	// the next 32 bytes of input.
	//
	//   p          - suffix selecting the multiply helper (__pmull_<p>)
	//   reg1, reg2 - vector registers holding the accumulators; updated
	//                in place
	//
	// Reads v10 as the multiplier operand, post-increments arg2 by 0x20,
	// and overwrites v8, v9, v11 and v12 as scratch. The multiplies, the
	// endian fix-ups and the final XORs for the two registers are
	// interleaved rather than grouped per register.
	.macro		fold64, p, reg1, reg2
	// load the next two 16-byte blocks of input and advance the pointer
	ldp		q11, q12, [arg2], #0x20

	// v8 receives the product from the "2" form of the multiply helper;
	// reg1 is then replaced by the product from the plain form
	__pmull_\p	v8, \reg1, v10, 2
	__pmull_\p	\reg1, \reg1, v10

	// NOTE(review): CPU_LE() presumably emits its argument only on
	// little-endian builds - confirm against its definition.
	// rev64 reverses the bytes inside each 64-bit lane of the new data.
CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	// same pair of multiplies for the second accumulator
	__pmull_\p	v9, \reg2, v10, 2
	__pmull_\p	\reg2, \reg2, v10

	// ext #8 with identical sources swaps the two 64-bit halves; combined
	// with the rev64 above this reverses all 16 bytes of each block
CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	// accumulator = product (plain) ^ product ("2" form) ^ new input block
	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm

	// fold16: fold one 128-bit register into the running result in v7.
	//
	//   p   - suffix selecting the multiply helper (__pmull_<p>)
	//   reg - vector register to fold; overwritten with a partial product
	//   rk  - optional symbol; when given, q10 is reloaded from it after
	//         the multiplies so the following fold uses a new constant
	//
	// Reads v10 as the multiplier operand, uses v8 as scratch and
	// accumulates into v7. x8 is passed to ldr_l when rk is given.
	.macro		fold16, p, reg, rk
	// both multiplies read the current v10, so they must come before the
	// optional reload below
	__pmull_\p	v8, \reg, v10
	__pmull_\p	\reg, \reg, v10, 2
	.ifnb		\rk
	// NOTE(review): ldr_l presumably loads q10 from symbol rk using x8 as
	// a temporary address register - confirm against its definition
	ldr_l		q10, \rk, x8
	.endif
	// v7 ^= both partial products
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, \reg\().16b
	.endm

	// __pmull_p64: polynomial multiply of rn by rm, 128-bit result in rd.
	//
	//   rd, rn, rm - vector register names (without arrangement suffix)
	//   n          - optional; when non-blank (callers pass 2) the pmull2
	//                form on the .2d arrangement is emitted, otherwise
	//                the pmull form on the .1d arrangement
	.macro		__pmull_p64, rd, rn, rm, n
	.ifnb		\n
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.else
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endif
	.endm

	.macro		crc_t10dif_pmull, p
	frame_push	3, 128

	mov		arg1_low32, w0
@@ -96,7 +135,7 @@ ENTRY(crc_t10dif_pmull)
	cmp		arg3, #256

	// for sizes less than 128, we can't fold 64B at a time...
	b.lt		_less_than_128
	b.lt		.L_less_than_128_\@

	// load the initial crc value
	// crc value does not need to be byte-reflected, but it needs
@@ -147,41 +186,19 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
	// buffer. The _fold_64_B_loop will fold 64B at a time
	// until we have 64+y Bytes of buffer


	// fold 64B at a time. This section of the code folds 4 vector
	// registers in parallel
_fold_64_B_loop:

	// fold64 (version hard-wired to pmull/pmull2): fold the two 128-bit
	// accumulators reg1 and reg2 forward over the next 32 bytes of input.
	// Reads v10 as the multiplier operand, post-increments arg2 by 0x20,
	// and overwrites v8, v9, v11 and v12 as scratch.
	.macro		fold64, reg1, reg2
	// load the next two 16-byte blocks of input and advance the pointer
	ldp		q11, q12, [arg2], #0x20

	// v8 = product of the upper 64-bit lanes, reg1 = product of the lower
	pmull2		v8.1q, \reg1\().2d, v10.2d
	pmull		\reg1\().1q, \reg1\().1d, v10.1d

	// NOTE(review): CPU_LE() presumably emits its argument only on
	// little-endian builds - confirm against its definition.
	// rev64 reverses the bytes inside each 64-bit lane of the new data.
CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	// same pair of multiplies for the second accumulator
	pmull2		v9.1q, \reg2\().2d, v10.2d
	pmull		\reg2\().1q, \reg2\().1d, v10.1d

	// ext #8 with identical sources swaps the two 64-bit halves; combined
	// with the rev64 above this reverses all 16 bytes of each block
CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	// accumulator = low product ^ high product ^ new input block
	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm
.L_fold_64_B_loop_\@:

	fold64		v0, v1
	fold64		v2, v3
	fold64		v4, v5
	fold64		v6, v7
	fold64		\p, v0, v1
	fold64		\p, v2, v3
	fold64		\p, v4, v5
	fold64		\p, v6, v7

	subs		arg3, arg3, #128

	// check if there is another 64B in the buffer to be able to fold
	b.lt		_fold_64_B_end
	b.lt		.L_fold_64_B_end_\@

	if_will_cond_yield_neon
	stp		q0, q1, [sp, #.Lframe_local_offset]
@@ -197,9 +214,9 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
	movi		vzr.16b, #0		// init zero register
	endif_yield_neon

	b		_fold_64_B_loop
	b		.L_fold_64_B_loop_\@

_fold_64_B_end:
.L_fold_64_B_end_\@:
	// at this point, the buffer pointer is pointing at the last y Bytes
	// of the buffer. The 128B of folded data is held in 8 of the vector
	// registers: v0 - v7
@@ -209,37 +226,27 @@ _fold_64_B_end:

	ldr_l		q10, rk9, x8

	// fold16 (version hard-wired to pmull/pmull2): fold one 128-bit
	// register into the running result in v7.
	//
	//   reg - vector register to fold; overwritten with a partial product
	//   rk  - optional symbol; when given, q10 is reloaded from it after
	//         the multiplies so the following fold uses a new constant
	//
	// Reads v10 as the multiplier operand and uses v8 as scratch.
	.macro		fold16, reg, rk
	// both multiplies read the current v10, so they must come before the
	// optional reload below
	pmull		v8.1q, \reg\().1d, v10.1d
	pmull2		\reg\().1q, \reg\().2d, v10.2d
	.ifnb		\rk
	// NOTE(review): ldr_l presumably loads q10 from symbol rk using x8 as
	// a temporary address register - confirm against its definition
	ldr_l		q10, \rk, x8
	.endif
	// v7 ^= both partial products
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, \reg\().16b
	.endm

	fold16		v0, rk11
	fold16		v1, rk13
	fold16		v2, rk15
	fold16		v3, rk17
	fold16		v4, rk19
	fold16		v5, rk1
	fold16		v6
	fold16		\p, v0, rk11
	fold16		\p, v1, rk13
	fold16		\p, v2, rk15
	fold16		\p, v3, rk17
	fold16		\p, v4, rk19
	fold16		\p, v5, rk1
	fold16		\p, v6

	// instead of 128, we add 112 (128-16) to the loop counter to save 1
	// instruction from the loop. Instead of a cmp instruction, we use the
	// flags set by adds with the b.lt instruction
	adds		arg3, arg3, #(128-16)
	b.lt		_final_reduction_for_128
	b.lt		.L_final_reduction_for_128_\@

	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
	// and the rest is in memory. We can fold 16 bytes at a time if y>=16
	// continue folding 16B at a time

_16B_reduction_loop:
	pmull		v8.1q, v7.1d, v10.1d
	pmull2		v7.1q, v7.2d, v10.2d
.L_16B_reduction_loop_\@:
	__pmull_\p	v8, v7, v10
	__pmull_\p	v7, v7, v10, 2
	eor		v7.16b, v7.16b, v8.16b

	ldr		q0, [arg2], #16
@@ -251,22 +258,22 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
	// instead of a cmp instruction, we utilize the flags with the
	// jge instruction equivalent of: cmp arg3, 16-16
	// check if there is any more 16B in the buffer to be able to fold
	b.ge		_16B_reduction_loop
	b.ge		.L_16B_reduction_loop_\@

	// now we have 16+z bytes left to reduce, where 0<= z < 16.
	// first, we reduce the data in the v7 register

_final_reduction_for_128:
.L_final_reduction_for_128_\@:
	// check if any more data to fold. If not, compute the CRC of
	// the final 128 bits
	adds		arg3, arg3, #16
	b.eq		_128_done
	b.eq		.L_128_done_\@

	// here we are getting data that is less than 16 bytes.
	// since we know that there was data before the pointer, we can
	// offset the input pointer before the actual point, to receive
	// exactly 16 bytes. after that the registers need to be adjusted.
_get_last_two_regs:
.L_get_last_two_regs_\@:
	add		arg2, arg2, arg3
	ldr		q1, [arg2, #-16]
CPU_LE(	rev64		v1.16b, v1.16b			)
@@ -291,47 +298,46 @@ CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
	bsl		v0.16b, v2.16b, v1.16b

	// fold 16 Bytes
	pmull		v8.1q, v7.1d, v10.1d
	pmull2		v7.1q, v7.2d, v10.2d
	__pmull_\p	v8, v7, v10
	__pmull_\p	v7, v7, v10, 2
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, v0.16b

_128_done:
.L_128_done_\@:
	// compute crc of a 128-bit value
	ldr_l		q10, rk5, x8		// rk5 and rk6 in xmm10

	// 64b fold
	ext		v0.16b, vzr.16b, v7.16b, #8
	mov		v7.d[0], v7.d[1]
	pmull		v7.1q, v7.1d, v10.1d
	__pmull_\p	v7, v7, v10
	eor		v7.16b, v7.16b, v0.16b

	// 32b fold
	ext		v0.16b, v7.16b, vzr.16b, #4
	mov		v7.s[3], vzr.s[0]
	pmull2		v0.1q, v0.2d, v10.2d
	__pmull_\p	v0, v0, v10, 2
	eor		v7.16b, v7.16b, v0.16b

	// barrett reduction
_barrett:
	ldr_l		q10, rk7, x8
	mov		v0.d[0], v7.d[1]

	pmull		v0.1q, v0.1d, v10.1d
	__pmull_\p	v0, v0, v10
	ext		v0.16b, vzr.16b, v0.16b, #12
	pmull2		v0.1q, v0.2d, v10.2d
	__pmull_\p	v0, v0, v10, 2
	ext		v0.16b, vzr.16b, v0.16b, #12
	eor		v7.16b, v7.16b, v0.16b
	mov		w0, v7.s[1]

_cleanup:
.L_cleanup_\@:
	// scale the result back to 16 bits
	lsr		x0, x0, #16
	frame_pop
	ret

_less_than_128:
	cbz		arg3, _cleanup
.L_less_than_128_\@:
	cbz		arg3, .L_cleanup_\@

	movi		v0.16b, #0
	mov		v0.s[3], arg1_low32	// get the initial crc value
@@ -342,20 +348,20 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value

	cmp		arg3, #16
	b.eq		_128_done		// exactly 16 left
	b.lt		_less_than_16_left
	b.eq		.L_128_done_\@		// exactly 16 left
	b.lt		.L_less_than_16_left_\@

	ldr_l		q10, rk1, x8		// rk1 and rk2 in xmm10

	// update the counter. subtract 32 instead of 16 to save one
	// instruction from the loop
	subs		arg3, arg3, #32
	b.ge		_16B_reduction_loop
	b.ge		.L_16B_reduction_loop_\@

	add		arg3, arg3, #16
	b		_get_last_two_regs
	b		.L_get_last_two_regs_\@

_less_than_16_left:
.L_less_than_16_left_\@:
	// shl r9, 4
	adr_l		x0, tbl_shf_table + 16
	sub		x0, x0, arg3
@@ -363,8 +369,12 @@ _less_than_16_left:
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b
	b		_128_done
ENDPROC(crc_t10dif_pmull)
	b		.L_128_done_\@
	.endm

ENTRY(crc_t10dif_pmull_p64)
	crc_t10dif_pmull	p64
ENDPROC(crc_t10dif_pmull_p64)

// precomputed constants
// these constants are precomputed from the poly:
+5 −1
Original line number Diff line number Diff line
@@ -22,7 +22,9 @@

#define CRC_T10DIF_PMULL_CHUNK_SIZE	16U

asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 buf[], u64 len);

static u16 (*crc_t10dif_pmull)(u16 init_crc, const u8 buf[], u64 len);

static int crct10dif_init(struct shash_desc *desc)
{
@@ -85,6 +87,8 @@ static struct shash_alg crc_t10dif_alg = {

/*
 * Module init: point the crc_t10dif_pmull function pointer at the p64
 * assembly routine, then register the shash algorithm.
 *
 * Returns the value returned by crypto_register_shash().
 */
static int __init crc_t10dif_mod_init(void)
{
	/* the p64 variant is the only implementation assigned here */
	crc_t10dif_pmull = crc_t10dif_pmull_p64;

	return crypto_register_shash(&crc_t10dif_alg);
}