Commit 29a843ae authored by Russell King

Merge tag 'arm32-efi-cache-ops-for-rmk' of git://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux into devel-stable

ARMv7 compliant cache maintenance for the decompressor

On v7 and later cores, cache maintenance operations by set/way are only
intended to be used in the context of on/offlining a core, while it has
been taken out of the coherency domain. Any use intended to ensure that
the contents of the cache have made it to main memory is unreliable,
since cacheline migration and non-architected system caches may cause
these contents to linger elsewhere, without being visible in main memory
once the MMU and caches are disabled.

So switch to cache maintenance by virtual address for v7 and later cores.
This makes the 32-bit kernel bootable on systems with L3 system caches
that are not covered by set/way operations, such as Socionext SynQuacer.
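
For illustration, maintenance by VA amounts to walking the region one
cache line at a time, issuing a clean+invalidate for each line and a
barrier at the end. A minimal sketch (register assignments are
illustrative, not taken from the patch; the real loop is in the
__armv7_mmu_cache_flush hunk below):

		@ clean+invalidate the D-cache over [r0, r1) by VA
		@ assumes r2 already holds the D-cache line size
		sub	r3, r2, #1		@ line size mask
		bic	r0, r0, r3		@ align start down to a line
0:		mcr	p15, 0, r0, c7, c14, 1	@ DCCIMVAC: clean+invalidate to PoC
		add	r0, r0, r2		@ advance one line
		cmp	r0, r1
		blo	0b
		mov	r3, #0
		mcr	p15, 0, r3, c7, c10, 4	@ DSB (CP15 encoding)

Because each operation names an address and applies to every cache level
up to the point of coherency, it also covers outer caches such as the
SynQuacer L3 that the set/way loop never touches.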

Tony says:

  I gave these a try on top of the earlier "arm: fix Kbuild issue caused
  by per-task stack protector GCC plugin" and booting still works for
  me on armv7 including appended dtb:

  Tested-by: Tony Lindgren <tony@atomide.com>

Linus says:

  No problem, I have tested it on the following:

  - ARMv7 Cortex A9 x 2 Qualcomm APQ8060 DragonBoard
  - ARM PB11MPCore (4 x 1176)
  - ARMv7 Ux500 Cortex A9 x 2

  The PB11MPCore is again the crucial board, if it works on that
  board it works on anything, most of the time :D

  Tested-by: Linus Walleij <linus.walleij@linaro.org>

Note that the first 2 patches are shared with the efi/core branch in
TIP, which is the reason why this is sent as a pull request rather
than via the patch system.
parents bb6d3fb3 401b368c
+86 −76
@@ -151,6 +151,25 @@
 .L_\@:
 		.endm
 
+		/*
+		 * The kernel build system appends the size of the
+		 * decompressed kernel at the end of the compressed data
+		 * in little-endian form.
+		 */
+		.macro	get_inflated_image_size, res:req, tmp1:req, tmp2:req
+		adr	\res, .Linflated_image_size_offset
+		ldr	\tmp1, [\res]
+		add	\tmp1, \tmp1, \res	@ address of inflated image size
+
+		ldrb	\res, [\tmp1]		@ get_unaligned_le32
+		ldrb	\tmp2, [\tmp1, #1]
+		orr	\res, \res, \tmp2, lsl #8
+		ldrb	\tmp2, [\tmp1, #2]
+		ldrb	\tmp1, [\tmp1, #3]
+		orr	\res, \res, \tmp2, lsl #16
+		orr	\res, \res, \tmp1, lsl #24
+		.endm
+
 		.section ".start", "ax"
 /*
  * sort out different calling conventions
@@ -268,15 +287,15 @@ not_angel:
 		 */
 		mov	r0, pc
 		cmp	r0, r4
-		ldrcc	r0, LC0+32
+		ldrcc	r0, LC0+28
 		addcc	r0, r0, pc
 		cmpcc	r4, r0
 		orrcc	r4, r4, #1		@ remember we skipped cache_on
 		blcs	cache_on
 
 restart:	adr	r0, LC0
-		ldmia	r0, {r1, r2, r3, r6, r10, r11, r12}
-		ldr	sp, [r0, #28]
+		ldmia	r0, {r1, r2, r3, r6, r11, r12}
+		ldr	sp, [r0, #24]
 
 		/*
 		 * We might be running at a different address.  We need
@@ -284,20 +303,8 @@ restart: adr r0, LC0
 		 */
 		sub	r0, r0, r1		@ calculate the delta offset
 		add	r6, r6, r0		@ _edata
-		add	r10, r10, r0		@ inflated kernel size location
 
-		/*
-		 * The kernel build system appends the size of the
-		 * decompressed kernel at the end of the compressed data
-		 * in little-endian form.
-		 */
-		ldrb	r9, [r10, #0]
-		ldrb	lr, [r10, #1]
-		orr	r9, r9, lr, lsl #8
-		ldrb	lr, [r10, #2]
-		ldrb	r10, [r10, #3]
-		orr	r9, r9, lr, lsl #16
-		orr	r9, r9, r10, lsl #24
+		get_inflated_image_size	r9, r10, lr
 
 #ifndef CONFIG_ZBOOT_ROM
 		/* malloc space is above the relocated stack (64k max) */
@@ -521,11 +528,8 @@ dtb_check_done:
 		/* Preserve offset to relocated code. */
 		sub	r6, r9, r6
 
-#ifndef CONFIG_ZBOOT_ROM
-		/* cache_clean_flush may use the stack, so relocate it */
-		add	sp, sp, r6
-#endif
-
+		mov	r0, r9			@ start of relocated zImage
+		add	r1, sp, r6		@ end of relocated zImage
 		bl	cache_clean_flush
 
 		badr	r0, restart
@@ -622,6 +626,11 @@ not_relocated: mov r0, #0
 		add	r2, sp, #0x10000	@ 64k max
 		mov	r3, r7
 		bl	decompress_kernel
+
+		get_inflated_image_size	r1, r2, r3
+
+		mov	r0, r4			@ start of inflated image
+		add	r1, r1, r0		@ end of inflated image
 		bl	cache_clean_flush
 		bl	cache_off
 
@@ -652,13 +661,15 @@ LC0: .word LC0 @ r1
 		.word	__bss_start		@ r2
 		.word	_end			@ r3
 		.word	_edata			@ r6
-		.word	input_data_end - 4	@ r10 (inflated size location)
 		.word	_got_start		@ r11
 		.word	_got_end		@ ip
 		.word	.L_user_stack_end	@ sp
 		.word	_end - restart + 16384 + 1024*1024
 		.size	LC0, . - LC0
 
+.Linflated_image_size_offset:
+		.long	(input_data_end - 4) - .
+
 #ifdef CONFIG_ARCH_RPC
 		.globl	params
 params:		ldr	r0, =0x10000100		@ params_phys for RPC
@@ -667,6 +678,24 @@ params: ldr r0, =0x10000100 @ params_phys for RPC
 		.align
 #endif
 
+/*
+ * dcache_line_size - get the minimum D-cache line size from the CTR register
+ * on ARMv7.
+ */
+		.macro	dcache_line_size, reg, tmp
+#ifdef CONFIG_CPU_V7M
+		movw	\tmp, #:lower16:BASEADDR_V7M_SCB + V7M_SCB_CTR
+		movt	\tmp, #:upper16:BASEADDR_V7M_SCB + V7M_SCB_CTR
+		ldr	\tmp, [\tmp]
+#else
+		mrc	p15, 0, \tmp, c0, c0, 1		@ read ctr
+#endif
+		lsr	\tmp, \tmp, #16
+		and	\tmp, \tmp, #0xf		@ cache line size encoding
+		mov	\reg, #4			@ bytes per word
+		mov	\reg, \reg, lsl \tmp		@ actual cache line size
+		.endm
+
 /*
  * Turn on the cache.  We need to setup some page tables so that we
  * can have both the I and D caches on.
@@ -1159,8 +1188,6 @@ __armv7_mmu_cache_off:
 		bic	r0, r0, #0x000c
 #endif
 		mcr	p15, 0, r0, c1, c0	@ turn MMU and cache off
-		mov	r12, lr
-		bl	__armv7_mmu_cache_flush
 		mov	r0, #0
 #ifdef CONFIG_MMU
 		mcr	p15, 0, r0, c8, c7, 0	@ invalidate whole TLB
@@ -1168,11 +1195,14 @@ __armv7_mmu_cache_off:
 		mcr	p15, 0, r0, c7, c5, 6	@ invalidate BTC
 		mcr	p15, 0, r0, c7, c10, 4	@ DSB
 		mcr	p15, 0, r0, c7, c5, 4	@ ISB
-		mov	pc, r12
+		mov	pc, lr
 
 /*
  * Clean and flush the cache to maintain consistency.
  *
+ * On entry,
+ *  r0 = start address
+ *  r1 = end address (exclusive)
  * On exit,
  *  r1, r2, r3, r9, r10, r11, r12 corrupted
  * This routine must preserve:
@@ -1181,6 +1211,7 @@ __armv7_mmu_cache_off:
 		.align	5
 cache_clean_flush:
 		mov	r3, #16
+		mov	r11, r1
 		b	call_cache_fn
 
 __armv4_mpu_cache_flush:
@@ -1231,51 +1262,16 @@ __armv7_mmu_cache_flush:
 		mcr	p15, 0, r10, c7, c14, 0	@ clean+invalidate D
 		b	iflush
 hierarchical:
-		mcr	p15, 0, r10, c7, c10, 5	@ DMB
-		stmfd	sp!, {r0-r7, r9-r11}
-		mrc	p15, 1, r0, c0, c0, 1	@ read clidr
-		ands	r3, r0, #0x7000000	@ extract loc from clidr
-		mov	r3, r3, lsr #23		@ left align loc bit field
-		beq	finished		@ if loc is 0, then no need to clean
-		mov	r10, #0			@ start clean at cache level 0
-loop1:
-		add	r2, r10, r10, lsr #1	@ work out 3x current cache level
-		mov	r1, r0, lsr r2		@ extract cache type bits from clidr
-		and	r1, r1, #7		@ mask of the bits for current cache only
-		cmp	r1, #2			@ see what cache we have at this level
-		blt	skip			@ skip if no cache, or just i-cache
-		mcr	p15, 2, r10, c0, c0, 0	@ select current cache level in cssr
-		mcr	p15, 0, r10, c7, c5, 4	@ isb to sych the new cssr&csidr
-		mrc	p15, 1, r1, c0, c0, 0	@ read the new csidr
-		and	r2, r1, #7		@ extract the length of the cache lines
-		add	r2, r2, #4		@ add 4 (line length offset)
-		ldr	r4, =0x3ff
-		ands	r4, r4, r1, lsr #3	@ find maximum number on the way size
-		clz	r5, r4			@ find bit position of way size increment
-		ldr	r7, =0x7fff
-		ands	r7, r7, r1, lsr #13	@ extract max number of the index size
-loop2:
-		mov	r9, r4			@ create working copy of max way size
-loop3:
- ARM(		orr	r11, r10, r9, lsl r5	) @ factor way and cache number into r11
- ARM(		orr	r11, r11, r7, lsl r2	) @ factor index number into r11
- THUMB(		lsl	r6, r9, r5		)
- THUMB(		orr	r11, r10, r6		) @ factor way and cache number into r11
- THUMB(		lsl	r6, r7, r2		)
- THUMB(		orr	r11, r11, r6		) @ factor index number into r11
-		mcr	p15, 0, r11, c7, c14, 2	@ clean & invalidate by set/way
-		subs	r9, r9, #1		@ decrement the way
-		bge	loop3
-		subs	r7, r7, #1		@ decrement the index
-		bge	loop2
-skip:
-		add	r10, r10, #2		@ increment cache number
-		cmp	r3, r10
-		bgt	loop1
-finished:
-		ldmfd	sp!, {r0-r7, r9-r11}
-		mov	r10, #0			@ switch back to cache level 0
-		mcr	p15, 2, r10, c0, c0, 0	@ select current cache level in cssr
+		dcache_line_size r1, r2		@ r1 := dcache min line size
+		sub	r2, r1, #1		@ r2 := line size mask
+		bic	r0, r0, r2		@ round down start to line size
+		sub	r11, r11, #1		@ end address is exclusive
+		bic	r11, r11, r2		@ round down end to line size
+0:		cmp	r0, r11			@ finished?
+		bgt	iflush
+		mcr	p15, 0, r0, c7, c14, 1	@ Dcache clean/invalidate by VA
+		add	r0, r0, r1
+		b	0b
 iflush:
 		mcr	p15, 0, r10, c7, c10, 4	@ DSB
 		mcr	p15, 0, r10, c7, c5, 0	@ invalidate I+BTB
@@ -1460,7 +1456,24 @@ ENTRY(efi_stub_entry)

 		@ Preserve return value of efi_entry() in r4
 		mov	r4, r0
+		add	r1, r4, #SZ_2M			@ DT end
 		bl	cache_clean_flush
+
+		ldr	r0, [sp]			@ relocated zImage
+		ldr	r1, =_edata			@ size of zImage
+		add	r1, r1, r0			@ end of zImage
+		bl	cache_clean_flush
+
+		@ The PE/COFF loader might not have cleaned the code we are
+		@ running beyond the PoU, and so calling cache_off below from
+		@ inside the PE/COFF loader allocated region is unsafe. Let's
+		@ assume our own zImage relocation code did a better job, and
+		@ jump into its version of this routine before proceeding.
+		ldr	r0, [sp]			@ relocated zImage
+		ldr	r1, .Ljmp
+		sub	r1, r0, r1
+		mov	pc, r1				@ no mode switch
+0:
 		bl	cache_off
 
 		@ Set parameters for booting zImage according to boot protocol
@@ -1469,18 +1482,15 @@ ENTRY(efi_stub_entry)
 		mov	r0, #0
 		mov	r1, #0xFFFFFFFF
 		mov	r2, r4
-
-		@ Branch to (possibly) relocated zImage that is in [sp]
-		ldr	lr, [sp]
-		ldr	ip, =start_offset
-		add	lr, lr, ip
-		mov	pc, lr				@ no mode switch
+		b	__efi_start
 
 efi_load_fail:
 		@ Return EFI_LOAD_ERROR to EFI firmware on error.
 		ldr	r0, =0x80000001
 		ldmfd	sp!, {ip, pc}
 ENDPROC(efi_stub_entry)
+		.align	2
+.Ljmp:		.long	start - 0b
 #endif
 
 		.align