Commit 822dfbd0 authored Oct 29, 2020 by Nicolas Pitre Committed by Andrew Boie Nov 09, 2020
lib/os/prf.c: alternate implementation for _ldiv5()



The _ldiv5() is an optimized divide-by-5 function that is smaller and
faster than the generic libgcc implementation.

Yet it can be made even smaller and faster with this replacement
implementation based on a reciprocal multiplication plus some tricks.

For example, here's the assembly from the original code on ARM:

_ldiv5:
        ldr     r3, [r0]
        movw    ip, #52429
        ldr     r1, [r0, #4]
        movt    ip, 52428
        adds    r3, r3, #2
        push    {r4, r5, r6, r7, lr}
        mov     lr, #0
        adc     r1, r1, lr
        adds    r2, lr, lr
        umull   r7, r6, ip, r1
        lsr     r6, r6, #2
        adc     r7, r6, r6
        adds    r2, r2, r2
        adc     r7, r7, r7
        adds    r2, r2, lr
        adc     r7, r7, r6
        subs    r3, r3, r2
        sbc     r7, r1, r7
        lsr     r2, r3, #3
        orr     r2, r2, r7, lsl #29
        umull   r2, r1, ip, r2
        lsr     r2, r1, #2
        lsr     r7, r1, #31
        lsl     r1, r2, #3
        adds    r4, lr, r1
        adc     r5, r6, r7
        adds    r2, r1, r1
        adds    r2, r2, r2
        adds    r2, r2, r1
        subs    r2, r3, r2
        umull   r3, r2, ip, r2
        lsr     r2, r2, #2
        adds    r4, r4, r2
        adc     r5, r5, #0
        strd    r4, [r0]
        pop     {r4, r5, r6, r7, pc}

And here's the resulting assembly with this commit applied:

_ldiv5:
        push    {r4, r5, r6, r7}
        movw    r4, #13107
        ldr     r6, [r0]
        movt    r4, 13107
        ldr     r1, [r0, #4]
        mov     r3, #0
        umull   r6, r7, r6, r4
        add     r2, r4, r4, lsl #1
        umull   r4, r5, r1, r4
        adds    r1, r6, r2
        adc     r2, r7, r2
        adds    ip, r6, r4
        adc     r1, r7, r5
        adds    r2, ip, r2
        adc     r2, r1, r3
        adds    r2, r4, r2
        adc     r3, r5, r3
        strd    r2, [r0]
        pop     {r4, r5, r6, r7}
        bx      lr

So we're down to 20 instructions from 36 initially, with only 2 umull
instructions instead of 3, and slightly smaller stack footprint.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
parent 0a5b2591
Show whitespace changes
Inline Side-by-side
Please to comment
Admin message