Merge tag 'asm-generic-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic (e3de671d) · Commits · 戴 / test

arch/arm/include/asm/div64.h

+93 −190

Original line number	Diff line number	Diff line
		@@ -5,9 +5,9 @@
		#include <asm/compiler.h>

		/*
		* The semantics of do_div() are:
		* The semantics of __div64_32() are:
		*
		* uint32_t do_div(uint64_t *n, uint32_t base)
		* uint32_t __div64_32(uint64_t *n, uint32_t base)
		* {
		* uint32_t remainder = *n % base;
		* n = n / base;
		@@ -16,8 +16,9 @@
		*
		* In other words, a 64-bit dividend with a 32-bit divisor producing
		* a 64-bit result and a 32-bit remainder. To accomplish this optimally
		* we call a special __do_div64 helper with completely non standard
		* calling convention for arguments and results (beware).
		* we override the generic version in lib/div64.c to call our __do_div64
		* assembly implementation with completely non standard calling convention
		* for arguments and results (beware).
		*/

		#ifdef __ARMEB__
		@@ -28,199 +29,101 @@
		#define __xh "r1"
		#endif

		#define __do_div_asm(n, base) \
		({ \
		register unsigned int __base asm("r4") = base; \
		register unsigned long long __n asm("r0") = n; \
		register unsigned long long __res asm("r2"); \
		register unsigned int __rem asm(__xh); \
		asm( __asmeq("%0", __xh) \
		__asmeq("%1", "r2") \
		__asmeq("%2", "r0") \
		__asmeq("%3", "r4") \
		"bl __do_div64" \
		: "=r" (__rem), "=r" (__res) \
		: "r" (__n), "r" (__base) \
		: "ip", "lr", "cc"); \
		n = __res; \
		__rem; \
		})

		#if __GNUC__ < 4 \|\| !defined(CONFIG_AEABI)
		static inline uint32_t __div64_32(uint64_t *n, uint32_t base)
		{
		register unsigned int __base asm("r4") = base;
		register unsigned long long __n asm("r0") = *n;
		register unsigned long long __res asm("r2");
		register unsigned int __rem asm(__xh);
		asm( __asmeq("%0", __xh)
		__asmeq("%1", "r2")
		__asmeq("%2", "r0")
		__asmeq("%3", "r4")
		"bl __do_div64"
		: "=r" (__rem), "=r" (__res)
		: "r" (__n), "r" (__base)
		: "ip", "lr", "cc");
		*n = __res;
		return __rem;
		}
		#define __div64_32 __div64_32

		#if !defined(CONFIG_AEABI)

		/*
		* gcc versions earlier than 4.0 are simply too problematic for the
		* optimized implementation below. First there is gcc PR 15089 that
		* tend to trig on more complex constructs, spurious .global __udivsi3
		* are inserted even if none of those symbols are referenced in the
		* generated code, and those gcc versions are not able to do constant
		* propagation on long long values anyway.
		* In OABI configurations, some uses of the do_div function
		* cause gcc to run out of registers. To work around that,
		* we can force the use of the out-of-line version for
		* configurations that build a OABI kernel.
		*/
		#define do_div(n, base) __do_div_asm(n, base)

		#elif __GNUC__ >= 4
		#define do_div(n, base) __div64_32(&(n), base)

		#include <asm/bug.h>
		#else

		/*
		* If the divisor happens to be constant, we determine the appropriate
		* inverse at compile time to turn the division into a few inline
		* multiplications instead which is much faster. And yet only if compiling
		* for ARMv4 or higher (we need umull/umlal) and if the gcc version is
		* sufficiently recent to perform proper long long constant propagation.
		* (It is unfortunate that gcc doesn't perform all this internally.)
		* gcc versions earlier than 4.0 are simply too problematic for the
		* __div64_const32() code in asm-generic/div64.h. First there is
		* gcc PR 15089 that tend to trig on more complex constructs, spurious
		* .global __udivsi3 are inserted even if none of those symbols are
		* referenced in the generated code, and those gcc versions are not able
		* to do constant propagation on long long values anyway.
		*/
		#define do_div(n, base) \
		({ \
		unsigned int __r, __b = (base); \
		if (!__builtin_constant_p(__b) \|\| __b == 0 \|\| \
		(__LINUX_ARM_ARCH__ < 4 && (__b & (__b - 1)) != 0)) { \
		/* non-constant divisor (or zero): slow path */ \
		__r = __do_div_asm(n, __b); \
		} else if ((__b & (__b - 1)) == 0) { \
		/* Trivial: __b is constant and a power of 2 */ \
		/* gcc does the right thing with this code. */ \
		__r = n; \
		__r &= (__b - 1); \
		n /= __b; \
		} else { \
		/* Multiply by inverse of __b: n/b = n(p/b)/p / \
		/* We rely on the fact that most of this code gets */ \
		/* optimized away at compile time due to constant */ \
		/* propagation and only a couple inline assembly */ \
		/* instructions should remain. Better avoid any */ \
		/* code construct that might prevent that. */ \
		unsigned long long __res, __x, __t, __m, __n = n; \
		unsigned int __c, __p, __z = 0; \
		/* preserve low part of n for reminder computation */ \
		__r = __n; \
		/* determine number of bits to represent __b */ \
		__p = 1 << __div64_fls(__b); \
		/* compute __m = ((__p << 64) + __b - 1) / __b */ \
		__m = (~0ULL / __b) * __p; \
		__m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b; \
		/* compute __res = __m(~0ULL/__b__b-1)/(__p << 64) */ \
		__x = ~0ULL / __b * __b - 1; \
		__res = (__m & 0xffffffff) * (__x & 0xffffffff); \
		__res >>= 32; \
		__res += (__m & 0xffffffff) * (__x >> 32); \
		__t = __res; \
		__res += (__x & 0xffffffff) * (__m >> 32); \
		__t = (__res < __t) ? (1ULL << 32) : 0; \
		__res = (__res >> 32) + __t; \
		__res += (__m >> 32) * (__x >> 32); \
		__res /= __p; \
		/* Now sanitize and optimize what we've got. */ \
		if (~0ULL % (__b / (__b & -__b)) == 0) { \
		/* those cases can be simplified with: */ \
		__n /= (__b & -__b); \
		__m = ~0ULL / (__b / (__b & -__b)); \
		__p = 1; \
		__c = 1; \
		} else if (__res != __x / __b) { \
		/* We can't get away without a correction */ \
		/* to compensate for bit truncation errors. */ \
		/* To avoid it we'd need an additional bit */ \
		/* to represent __m which would overflow it. */ \
		/* Instead we do m=p/b and n/b=(nm+m)/p. / \
		__c = 1; \
		/* Compute __m = (__p << 64) / __b */ \
		__m = (~0ULL / __b) * __p; \
		__m += ((~0ULL % __b + 1) * __p) / __b; \
		} else { \
		/* Reduce __m/__p, and try to clear bit 31 */ \
		/* of __m when possible otherwise that'll */ \
		/* need extra overflow handling later. */ \
		unsigned int __bits = -(__m & -__m); \
		__bits \|= __m >> 32; \
		__bits = (~__bits) << 1; \
		/* If __bits == 0 then setting bit 31 is */ \
		/* unavoidable. Simply apply the maximum */ \
		/* possible reduction in that case. */ \
		/* Otherwise the MSB of __bits indicates the */ \
		/* best reduction we should apply. */ \
		if (!__bits) { \
		__p /= (__m & -__m); \
		__m /= (__m & -__m); \
		} else { \
		__p >>= __div64_fls(__bits); \
		__m >>= __div64_fls(__bits); \
		} \
		/* No correction needed. */ \
		__c = 0; \
		} \
		/* Now we have a combination of 2 conditions: */ \
		/* 1) whether or not we need a correction (__c), and */ \
		/* 2) whether or not there might be an overflow in */ \
		/* the cross product (__m & ((1<<63) \| (1<<31))) */ \
		/* Select the best insn combination to perform the */ \
		/* actual __m * __n / (__p << 64) operation. */ \
		if (!__c) { \
		asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" \
		"mov %Q0, #0" \
		: "=&r" (__res) \
		: "r" (__m), "r" (__n) \
		: "cc" ); \
		} else if (!(__m & ((1ULL << 63) \| (1ULL << 31)))) { \
		__res = __m; \
		asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t" \
		"mov %Q0, #0" \
		: "+&r" (__res) \
		: "r" (__m), "r" (__n) \
		: "cc" ); \
		} else { \
		asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" \
		"cmn %Q0, %Q1\n\t" \
		"adcs %R0, %R0, %R1\n\t" \
		"adc %Q0, %3, #0" \
		: "=&r" (__res) \
		: "r" (__m), "r" (__n), "r" (__z) \
		: "cc" ); \
		} \
		if (!(__m & ((1ULL << 63) \| (1ULL << 31)))) { \
		asm ( "umlal %R0, %Q0, %R1, %Q2\n\t" \
		"umlal %R0, %Q0, %Q1, %R2\n\t" \
		"mov %R0, #0\n\t" \
		"umlal %Q0, %R0, %R1, %R2" \
		: "+&r" (__res) \
		: "r" (__m), "r" (__n) \
		: "cc" ); \
		} else { \
		asm ( "umlal %R0, %Q0, %R2, %Q3\n\t" \
		"umlal %R0, %1, %Q2, %R3\n\t" \
		"mov %R0, #0\n\t" \
		"adds %Q0, %1, %Q0\n\t" \
		"adc %R0, %R0, #0\n\t" \
		"umlal %Q0, %R0, %R2, %R3" \
		: "+&r" (__res), "+&r" (__z) \
		: "r" (__m), "r" (__n) \
		: "cc" ); \
		} \
		__res /= __p; \
		/* The reminder can be computed with 32-bit regs */ \
		/* only, and gcc is good at that. */ \
		{ \
		unsigned int __res0 = __res; \
		unsigned int __b0 = __b; \
		__r -= __res0 * __b0; \
		} \
		/* BUG_ON(__r >= __b \|\| __res * __b + __r != n); */ \
		n = __res; \
		} \
		__r; \
		})

		/* our own fls implementation to make sure constant propagation is fine */
		#define __div64_fls(bits) \
		({ \
		unsigned int __left = (bits), __nr = 0; \
		if (__left & 0xffff0000) __nr += 16, __left >>= 16; \
		if (__left & 0x0000ff00) __nr += 8, __left >>= 8; \
		if (__left & 0x000000f0) __nr += 4, __left >>= 4; \
		if (__left & 0x0000000c) __nr += 2, __left >>= 2; \
		if (__left & 0x00000002) __nr += 1; \
		__nr; \
		})

		#define __div64_const32_is_OK (__GNUC__ >= 4)

		static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
		{
		unsigned long long res;
		unsigned int tmp = 0;

		if (!bias) {
		asm ( "umull %Q0, %R0, %Q1, %Q2\n\t"
		"mov %Q0, #0"
		: "=&r" (res)
		: "r" (m), "r" (n)
		: "cc");
		} else if (!(m & ((1ULL << 63) \| (1ULL << 31)))) {
		res = m;
		asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t"
		"mov %Q0, #0"
		: "+&r" (res)
		: "r" (m), "r" (n)
		: "cc");
		} else {
		asm ( "umull %Q0, %R0, %Q1, %Q2\n\t"
		"cmn %Q0, %Q1\n\t"
		"adcs %R0, %R0, %R1\n\t"
		"adc %Q0, %3, #0"
		: "=&r" (res)
		: "r" (m), "r" (n), "r" (tmp)
		: "cc");
		}

		if (!(m & ((1ULL << 63) \| (1ULL << 31)))) {
		asm ( "umlal %R0, %Q0, %R1, %Q2\n\t"
		"umlal %R0, %Q0, %Q1, %R2\n\t"
		"mov %R0, #0\n\t"
		"umlal %Q0, %R0, %R1, %R2"
		: "+&r" (res)
		: "r" (m), "r" (n)
		: "cc");
		} else {
		asm ( "umlal %R0, %Q0, %R2, %Q3\n\t"
		"umlal %R0, %1, %Q2, %R3\n\t"
		"mov %R0, #0\n\t"
		"adds %Q0, %1, %Q0\n\t"
		"adc %R0, %R0, #0\n\t"
		"umlal %Q0, %R0, %R2, %R3"
		: "+&r" (res), "+&r" (tmp)
		: "r" (m), "r" (n)
		: "cc");
		}

		return res;
		}
		#define __arch_xprod_64 __arch_xprod_64

		#include <asm-generic/div64.h>

		#endif

drivers/clk/tegra/clk-divider.c

+2 −2

Original line number	Diff line number	Diff line
		@@ -32,7 +32,7 @@
		static int get_div(struct tegra_clk_frac_div *divider, unsigned long rate,
		unsigned long parent_rate)
		{
		s64 divider_ux1 = parent_rate;
		u64 divider_ux1 = parent_rate;
		u8 flags = divider->flags;
		int mul;

		@@ -54,7 +54,7 @@ static int get_div(struct tegra_clk_frac_div *divider, unsigned long rate,

		divider_ux1 -= mul;

		if (divider_ux1 < 0)
		if ((s64)divider_ux1 < 0)
		return 0;

		if (divider_ux1 > get_max_div(divider))

drivers/gpu/drm/mgag200/mgag200_mode.c

+1 −1

Original line number	Diff line number	Diff line
		@@ -1564,7 +1564,7 @@ static uint32_t mga_vga_calculate_mode_bandwidth(struct drm_display_mode *mode,
		int bits_per_pixel)
		{
		uint32_t total_area, divisor;
		int64_t active_area, pixels_per_second, bandwidth;
		uint64_t active_area, pixels_per_second, bandwidth;
		uint64_t bytes_per_pixel = (bits_per_pixel + 7) / 8;

		divisor = 1024;

drivers/gpu/drm/nouveau/nvkm/subdev/clk/gk20a.c

+1 −2

Original line number	Diff line number	Diff line
		@@ -141,9 +141,8 @@ gk20a_pllg_calc_rate(struct gk20a_clk *clk)

		rate = clk->parent_rate * clk->n;
		divider = clk->m * pl_to_div[clk->pl];
		do_div(rate, divider);

		return rate / 2;
		return rate / divider / 2;
		}

		static int

drivers/hid/hid-sensor-hub.c

+2 −1

Original line number	Diff line number	Diff line
		@@ -218,7 +218,8 @@ int sensor_hub_set_feature(struct hid_sensor_hub_device *hsdev, u32 report_id,
		goto done_proc;
		}

		remaining_bytes = do_div(buffer_size, sizeof(__s32));
		remaining_bytes = buffer_size % sizeof(__s32);
		buffer_size = buffer_size / sizeof(__s32);
		if (buffer_size) {
		for (i = 0; i < buffer_size; ++i) {
		hid_set_field(report->field[field_index], i,

Admin message