Commit bdcb1aef authored by Nicholas Piggin, committed by Michael Ellerman
Browse files

powerpc/64s: Improve RFI L1-D cache flush fallback



The fallback RFI flush is used when firmware does not provide a way
to flush the cache. It's a "displacement flush" that evicts useful
data by displacing it with an uninteresting buffer.

The flush has to take care to work with implementation-specific cache
replacement policies, so the recipe has been in flux. The initial
slow but conservative approach was to touch all lines of a congruence
class, with dependencies between each load. It has since been
determined that a linear pattern of loads without dependencies is
sufficient, and is significantly faster.

Measuring the speed of a null syscall with RFI fallback flush enabled
gives the relative improvement:

P8 - 1.83x
P9 - 1.75x

The flush also becomes simpler and more adaptable to different cache
geometries.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
parent 35adacd6
Loading
Loading
Loading
Loading
+1 −2
Original line number Original line Diff line number Diff line
@@ -239,8 +239,7 @@ struct paca_struct {
	 */
	 */
	u64 exrfi[EX_SIZE] __aligned(0x80);
	u64 exrfi[EX_SIZE] __aligned(0x80);
	void *rfi_flush_fallback_area;
	void *rfi_flush_fallback_area;
	u64 l1d_flush_congruence;
	u64 l1d_flush_size;
	u64 l1d_flush_sets;
#endif
#endif
};
};


+1 −2
Original line number Original line Diff line number Diff line
@@ -239,8 +239,7 @@ int main(void)
	OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
	OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
	OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area);
	OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area);
	OFFSET(PACA_EXRFI, paca_struct, exrfi);
	OFFSET(PACA_EXRFI, paca_struct, exrfi);
	OFFSET(PACA_L1D_FLUSH_CONGRUENCE, paca_struct, l1d_flush_congruence);
	OFFSET(PACA_L1D_FLUSH_SIZE, paca_struct, l1d_flush_size);
	OFFSET(PACA_L1D_FLUSH_SETS, paca_struct, l1d_flush_sets);


#endif
#endif
	OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
	OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
+36 −40
Original line number Original line Diff line number Diff line
@@ -1461,39 +1461,37 @@ TRAMP_REAL_BEGIN(rfi_flush_fallback)
	std	r9,PACA_EXRFI+EX_R9(r13)
	std	r9,PACA_EXRFI+EX_R9(r13)
	std	r10,PACA_EXRFI+EX_R10(r13)
	std	r10,PACA_EXRFI+EX_R10(r13)
	std	r11,PACA_EXRFI+EX_R11(r13)
	std	r11,PACA_EXRFI+EX_R11(r13)
	std	r12,PACA_EXRFI+EX_R12(r13)
	std	r8,PACA_EXRFI+EX_R13(r13)
	mfctr	r9
	mfctr	r9
	ld	r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
	ld	r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
	ld	r11,PACA_L1D_FLUSH_SETS(r13)
	ld	r11,PACA_L1D_FLUSH_SIZE(r13)
	ld	r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
	srdi	r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */
	/*
	 * The load addresses are at staggered offsets within cachelines,
	 * which suits some pipelines better (on others it should not
	 * hurt).
	 */
	addi	r12,r12,8
	mtctr	r11
	mtctr	r11
	DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
	DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */


	/* order ld/st prior to dcbt stop all streams with flushing */
	/* order ld/st prior to dcbt stop all streams with flushing */
	sync
	sync
1:	li	r8,0

	.rept	8 /* 8-way set associative */
	/*
	ldx	r11,r10,r8
	 * The load addresses are at staggered offsets within cachelines,
	add	r8,r8,r12
	 * which suits some pipelines better (on others it should not
	xor	r11,r11,r11	// Ensure r11 is 0 even if fallback area is not
	 * hurt).
	add	r8,r8,r11	// Add 0, this creates a dependency on the ldx
	 */
	.endr
1:
	addi	r10,r10,128 /* 128 byte cache line */
	ld	r11,(0x80 + 8)*0(r10)
	ld	r11,(0x80 + 8)*1(r10)
	ld	r11,(0x80 + 8)*2(r10)
	ld	r11,(0x80 + 8)*3(r10)
	ld	r11,(0x80 + 8)*4(r10)
	ld	r11,(0x80 + 8)*5(r10)
	ld	r11,(0x80 + 8)*6(r10)
	ld	r11,(0x80 + 8)*7(r10)
	addi	r10,r10,0x80*8
	bdnz	1b
	bdnz	1b


	mtctr	r9
	mtctr	r9
	ld	r9,PACA_EXRFI+EX_R9(r13)
	ld	r9,PACA_EXRFI+EX_R9(r13)
	ld	r10,PACA_EXRFI+EX_R10(r13)
	ld	r10,PACA_EXRFI+EX_R10(r13)
	ld	r11,PACA_EXRFI+EX_R11(r13)
	ld	r11,PACA_EXRFI+EX_R11(r13)
	ld	r12,PACA_EXRFI+EX_R12(r13)
	ld	r8,PACA_EXRFI+EX_R13(r13)
	GET_SCRATCH0(r13);
	GET_SCRATCH0(r13);
	rfid
	rfid


@@ -1503,39 +1501,37 @@ TRAMP_REAL_BEGIN(hrfi_flush_fallback)
	std	r9,PACA_EXRFI+EX_R9(r13)
	std	r9,PACA_EXRFI+EX_R9(r13)
	std	r10,PACA_EXRFI+EX_R10(r13)
	std	r10,PACA_EXRFI+EX_R10(r13)
	std	r11,PACA_EXRFI+EX_R11(r13)
	std	r11,PACA_EXRFI+EX_R11(r13)
	std	r12,PACA_EXRFI+EX_R12(r13)
	std	r8,PACA_EXRFI+EX_R13(r13)
	mfctr	r9
	mfctr	r9
	ld	r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
	ld	r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
	ld	r11,PACA_L1D_FLUSH_SETS(r13)
	ld	r11,PACA_L1D_FLUSH_SIZE(r13)
	ld	r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
	srdi	r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */
	/*
	 * The load addresses are at staggered offsets within cachelines,
	 * which suits some pipelines better (on others it should not
	 * hurt).
	 */
	addi	r12,r12,8
	mtctr	r11
	mtctr	r11
	DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
	DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */


	/* order ld/st prior to dcbt stop all streams with flushing */
	/* order ld/st prior to dcbt stop all streams with flushing */
	sync
	sync
1:	li	r8,0

	.rept	8 /* 8-way set associative */
	/*
	ldx	r11,r10,r8
	 * The load addresses are at staggered offsets within cachelines,
	add	r8,r8,r12
	 * which suits some pipelines better (on others it should not
	xor	r11,r11,r11	// Ensure r11 is 0 even if fallback area is not
	 * hurt).
	add	r8,r8,r11	// Add 0, this creates a dependency on the ldx
	 */
	.endr
1:
	addi	r10,r10,128 /* 128 byte cache line */
	ld	r11,(0x80 + 8)*0(r10)
	ld	r11,(0x80 + 8)*1(r10)
	ld	r11,(0x80 + 8)*2(r10)
	ld	r11,(0x80 + 8)*3(r10)
	ld	r11,(0x80 + 8)*4(r10)
	ld	r11,(0x80 + 8)*5(r10)
	ld	r11,(0x80 + 8)*6(r10)
	ld	r11,(0x80 + 8)*7(r10)
	addi	r10,r10,0x80*8
	bdnz	1b
	bdnz	1b


	mtctr	r9
	mtctr	r9
	ld	r9,PACA_EXRFI+EX_R9(r13)
	ld	r9,PACA_EXRFI+EX_R9(r13)
	ld	r10,PACA_EXRFI+EX_R10(r13)
	ld	r10,PACA_EXRFI+EX_R10(r13)
	ld	r11,PACA_EXRFI+EX_R11(r13)
	ld	r11,PACA_EXRFI+EX_R11(r13)
	ld	r12,PACA_EXRFI+EX_R12(r13)
	ld	r8,PACA_EXRFI+EX_R13(r13)
	GET_SCRATCH0(r13);
	GET_SCRATCH0(r13);
	hrfid
	hrfid


+1 −12
Original line number Original line Diff line number Diff line
@@ -875,19 +875,8 @@ static void init_fallback_flush(void)
	memset(l1d_flush_fallback_area, 0, l1d_size * 2);
	memset(l1d_flush_fallback_area, 0, l1d_size * 2);


	for_each_possible_cpu(cpu) {
	for_each_possible_cpu(cpu) {
		/*
		 * The fallback flush is currently coded for 8-way
		 * associativity. Different associativity is possible, but it
		 * will be treated as 8-way and may not evict the lines as
		 * effectively.
		 *
		 * 128 byte lines are mandatory.
		 */
		u64 c = l1d_size / 8;

		paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area;
		paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area;
		paca[cpu].l1d_flush_congruence = c;
		paca[cpu].l1d_flush_size = l1d_size;
		paca[cpu].l1d_flush_sets = c / 128;
	}
	}
}
}


+0 −2
Original line number Original line Diff line number Diff line
@@ -2377,8 +2377,6 @@ static void dump_one_paca(int cpu)
		printf(" slb_cache[%d]:        = 0x%016lx\n", i, p->slb_cache[i]);
		printf(" slb_cache[%d]:        = 0x%016lx\n", i, p->slb_cache[i]);


	DUMP(p, rfi_flush_fallback_area, "px");
	DUMP(p, rfi_flush_fallback_area, "px");
	DUMP(p, l1d_flush_congruence, "llx");
	DUMP(p, l1d_flush_sets, "llx");
#endif
#endif
	DUMP(p, dscr_default, "llx");
	DUMP(p, dscr_default, "llx");
#ifdef CONFIG_PPC_BOOK3E
#ifdef CONFIG_PPC_BOOK3E