Commit 11031c0d authored by Ard Biesheuvel's avatar Ard Biesheuvel Committed by Herbert Xu
Browse files

crypto: arm64/gcm-ce - implement 4 way interleave



To improve performance on cores with deep pipelines such as ThunderX2,
reimplement gcm(aes) using a 4-way interleave rather than the 2-way
interleave we use currently.

This comes down to a complete rewrite of the GCM part of the combined
GCM/GHASH driver, and instead of interleaving two invocations of AES
with the GHASH handling at the instruction level, the new version
uses a more coarse grained approach where each chunk of 64 bytes is
encrypted first and then ghashed (or ghashed and then decrypted in
the converse case).

The core NEON routine is now able to consume inputs of any size,
and tail blocks of less than 64 bytes are handled using overlapping
loads and stores, and processed by the same 4-way encryption and
hashing routines. This gets rid of most of the branches, and avoids
having to return to the C code to handle the tail block using a
stack buffer.

The table below compares the performance of the old driver and the new
one on various micro-architectures and running in various modes.

        |     AES-128      |     AES-192      |     AES-256      |
 #bytes | 512 | 1500 |  4k | 512 | 1500 |  4k | 512 | 1500 |  4k |
 -------+-----+------+-----+-----+------+-----+-----+------+-----+
    TX2 | 35% |  23% | 11% | 34% |  20% |  9% | 38% |  25% | 16% |
   EMAG | 11% |   6% |  3% | 12% |   4% |  2% | 11% |   4% |  2% |
    A72 |  8% |   5% | -4% |  9% |   4% | -5% |  7% |   4% | -5% |
    A53 | 11% |   6% | -1% | 10% |   8% | -1% | 10% |   8% | -2% |

Signed-off-by: default avatarArd Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: default avatarHerbert Xu <herbert@gondor.apana.org.au>
parent ec05a74f
Loading
Loading
Loading
Loading
+344 −157
Original line number Diff line number Diff line
@@ -13,8 +13,8 @@
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

@@ -358,20 +358,37 @@ ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)

	KS0		.req	v12
	KS1		.req	v13
	INP0		.req	v14
	INP1		.req	v15

	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm

	.macro		enc_round, state, key
@@ -379,197 +396,367 @@ ENDPROC(pmull_ghash_update_p8)
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4 K5
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm

	.align		6
	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4], #16
	ld1		{HH.2d}, [x4]
	ld1		{XL.2d}, [x1]
	ldr		x8, [x5, #8]			// load lower counter
	stp		x29, x30, [sp, #-32]!
	mov		x29, sp
	str		x19, [sp, #24]

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	movi		MASK.16b, #0xe1
	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
CPU_LE(	rev		x8, x8		)
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	.if		\enc == 1
	ldr		x10, [sp]
	ld1		{KS0.16b-KS1.16b}, [x10]
	.endif
	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	cbnz		x6, 4f
	ld1		{XL.2d}, [x4]

0:	ld1		{INP0.16b-INP1.16b}, [x3], #32
	cbz		x0, 3f				// tag only?

	rev		x9, x8
	add		x11, x8, #1
	add		x8, x8, #2
	ldr		w8, [x5, #12]			// load lower counter
CPU_LE(	rev		w8, w8		)

	.if		\enc == 1
	eor		INP0.16b, INP0.16b, KS0.16b	// encrypt input
	eor		INP1.16b, INP1.16b, KS1.16b
0:	mov		w9, #4				// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4			// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte     |        |        |        |x       |
	 * 16 bytes    |        |        |        |xxxxxxxx|
	 * 17 bytes    |        |        |xxxxxxxx|x       |
	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	ld1		{KS0.8b}, [x5]			// load upper counter
	rev		x11, x11
	sub		w0, w0, #2
	mov		KS1.8b, KS0.8b
	ins		KS0.d[1], x9			// set lower counter
	ins		KS1.d[1], x11
	bl		pmull_gcm_enc_4x

	rev64		T1.16b, INP1.16b
	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

	cmp		w7, #12
	b.ge		2f				// AES-192/256?
3:	ldp		x19, x10, [sp, #24]
	cbz		x10, 5f				// output tag?

1:	enc_round	KS0, v21
	ext		IN1.16b, T1.16b, T1.16b, #8
	ld1		{INP3.16b}, [x10]		// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	enc_round	KS1, v21
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	mov		w11, #(0x1 << 24)		// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_round	KS0, v22
	eor		T1.16b, T1.16b, IN1.16b
	enc_block	KS0, x7, x6, x12

	enc_round	KS1, v22
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b
	st1		{XL.16b}, [x10]			// store tag

	enc_round	KS0, v23
	pmull		XM2.1q, SHASH2.1d, T1.1d	// (a1 + a0)(b1 + b0)
4:	ldp		x29, x30, [sp], #32
	ret

	enc_round	KS1, v23
	rev64		T1.16b, INP0.16b
	ext		T2.16b, XL.16b, XL.16b, #8
5:
CPU_LE(	rev		w8, w8		)
	str		w8, [x5, #12]			// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	enc_round	KS0, v24
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

	enc_round	KS1, v24
	eor		XL.16b, XL.16b, IN1.16b
	/*
	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	enc_round	KS0, v25
	eor		T1.16b, T1.16b, XL.16b
	/*
	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	enc_round	KS1, v25
	pmull2		XH.1q, HH.2d, XL.2d		// a1 * b1
pmull_gcm_ghash_4x:
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	enc_round	KS0, v26
	pmull		XL.1q, HH.1d, XL.1d		// a0 * b0
	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	enc_round	KS1, v26
	pmull2		XM.1q, SHASH2.2d, T1.2d		// (a1 + a0)(b1 + b0)
	ext		XL.16b, XL.16b, XL.16b, #8

	enc_round	KS0, v27
	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	enc_round	KS1, v27
	eor		XM.16b, XM.16b, XM2.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	enc_round	KS0, v28
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b
	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

	enc_round	KS1, v28
	eor		XM.16b, XM.16b, T2.16b
1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

	enc_round	KS0, v29
	pmull		T2.1q, XL.1d, MASK.1d
2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	enc_round	KS1, v29
	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]
	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	aese		KS0.16b, v30.16b
	eor		XL.16b, XM.16b, T2.16b
	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	aese		KS1.16b, v30.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		KS0.16b, KS0.16b, v31.16b
	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b
	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	eor		KS1.16b, KS1.16b, v31.16b
	eor		XL.16b, XL.16b, T2.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	.if		\enc == 0
	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif
	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	st1		{INP0.16b-INP1.16b}, [x2], #32
	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	cbnz		w0, 0b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

CPU_LE(	rev		x8, x8		)
	st1		{XL.2d}, [x1]
	str		x8, [x5, #8]			// store lower counter
	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	.if		\enc == 1
	st1		{KS0.16b-KS1.16b}, [x10]
	.endif
	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
ENDPROC(pmull_gcm_ghash_4x)

pmull_gcm_enc_4x:
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

2:	b.eq		3f				// AES-192?
	enc_round	KS0, v17
	enc_round	KS1, v17
	enc_round	KS0, v18
	enc_round	KS1, v18
3:	enc_round	KS0, v19
	enc_round	KS1, v19
	enc_round	KS0, v20
	enc_round	KS1, v20
	b		1b
	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

4:	load_round_keys	w7, x6
	b		0b
	.endm
.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds, u8 ks[])
	 */
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)
.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds)
	 */
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)
	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 */
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)
ENDPROC(pmull_gcm_enc_4x)

	.section	".rodata", "a"
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous
+123 −170

File changed.

Preview size limit exceeded, changes collapsed.