diff options
Diffstat (limited to 'compiler-rt/lib/builtins/arm/udivsi3.S')
-rw-r--r-- | compiler-rt/lib/builtins/arm/udivsi3.S | 261 |
1 files changed, 261 insertions, 0 deletions
diff --git a/compiler-rt/lib/builtins/arm/udivsi3.S b/compiler-rt/lib/builtins/arm/udivsi3.S new file mode 100644 index 000000000000..9b1b035b33d6 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/udivsi3.S @@ -0,0 +1,261 @@ +//===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the __udivsi3 (32-bit unsigned integer divide) +// function for the ARM 32-bit architecture. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + .syntax unified + .text + +DEFINE_CODE_STATE + + .p2align 2 +DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_uidiv, __udivsi3) + +@ unsigned int __udivsi3(unsigned int divident, unsigned int divisor) +@ Calculate and return the quotient of the (unsigned) division. + +DEFINE_COMPILERRT_FUNCTION(__udivsi3) +#if __ARM_ARCH_EXT_IDIV__ + tst r1, r1 + beq LOCAL_LABEL(divby0) + udiv r0, r0, r1 + bx lr + +LOCAL_LABEL(divby0): + mov r0, #0 +# ifdef __ARM_EABI__ + b __aeabi_idiv0 +# else + JMP(lr) +# endif + +#else // ! __ARM_ARCH_EXT_IDIV__ + cmp r1, #1 + bcc LOCAL_LABEL(divby0) +#if defined(USE_THUMB_1) + bne LOCAL_LABEL(num_neq_denom) + JMP(lr) +LOCAL_LABEL(num_neq_denom): +#else + IT(eq) + JMPc(lr, eq) +#endif + cmp r0, r1 +#if defined(USE_THUMB_1) + bhs LOCAL_LABEL(num_ge_denom) + movs r0, #0 + JMP(lr) +LOCAL_LABEL(num_ge_denom): +#else + ITT(cc) + movcc r0, #0 + JMPc(lr, cc) +#endif + + // Implement division using binary long division algorithm. + // + // r0 is the numerator, r1 the denominator. + // + // The code before JMP computes the correct shift I, so that + // r0 and (r1 << I) have the highest bit set in the same position. + // At the time of JMP, ip := .Ldiv0block - 12 * I. + // This depends on the fixed instruction size of block. + // For ARM mode, this is 12 Bytes, for THUMB mode 14 Bytes. + // + // block(shift) implements the test-and-update-quotient core. + // It assumes (r0 << shift) can be computed without overflow and + // that (r0 << shift) < 2 * r1. The quotient is stored in r3. + +# if defined(__ARM_FEATURE_CLZ) + clz ip, r0 + clz r3, r1 + // r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. + sub r3, r3, ip +# if defined(USE_THUMB_2) + adr ip, LOCAL_LABEL(div0block) + 1 + sub ip, ip, r3, lsl #1 +# else + adr ip, LOCAL_LABEL(div0block) +# endif + sub ip, ip, r3, lsl #2 + sub ip, ip, r3, lsl #3 + mov r3, #0 + bx ip +# else // No CLZ Feature +# if defined(USE_THUMB_2) +# error THUMB mode requires CLZ or UDIV +# endif +# if defined(USE_THUMB_1) +# define BLOCK_SIZE 10 +# else +# define BLOCK_SIZE 12 +# endif + + mov r2, r0 +# if defined(USE_THUMB_1) + mov ip, r0 + adr r0, LOCAL_LABEL(div0block) + adds r0, #1 +# else + adr ip, LOCAL_LABEL(div0block) +# endif + lsrs r3, r2, #16 + cmp r3, r1 +# if defined(USE_THUMB_1) + blo LOCAL_LABEL(skip_16) + movs r2, r3 + subs r0, r0, #(16 * BLOCK_SIZE) +LOCAL_LABEL(skip_16): +# else + movhs r2, r3 + subhs ip, ip, #(16 * BLOCK_SIZE) +# endif + + lsrs r3, r2, #8 + cmp r3, r1 +# if defined(USE_THUMB_1) + blo LOCAL_LABEL(skip_8) + movs r2, r3 + subs r0, r0, #(8 * BLOCK_SIZE) +LOCAL_LABEL(skip_8): +# else + movhs r2, r3 + subhs ip, ip, #(8 * BLOCK_SIZE) +# endif + + lsrs r3, r2, #4 + cmp r3, r1 +# if defined(USE_THUMB_1) + blo LOCAL_LABEL(skip_4) + movs r2, r3 + subs r0, r0, #(4 * BLOCK_SIZE) +LOCAL_LABEL(skip_4): +# else + movhs r2, r3 + subhs ip, #(4 * BLOCK_SIZE) +# endif + + lsrs r3, r2, #2 + cmp r3, r1 +# if defined(USE_THUMB_1) + blo LOCAL_LABEL(skip_2) + movs r2, r3 + subs r0, r0, #(2 * BLOCK_SIZE) +LOCAL_LABEL(skip_2): +# else + movhs r2, r3 + subhs ip, ip, #(2 * BLOCK_SIZE) +# endif + + // Last block, no need to update r2 or r3. +# if defined(USE_THUMB_1) + lsrs r3, r2, #1 + cmp r3, r1 + blo LOCAL_LABEL(skip_1) + subs r0, r0, #(1 * BLOCK_SIZE) +LOCAL_LABEL(skip_1): + movs r2, r0 + mov r0, ip + movs r3, #0 + JMP (r2) + +# else + cmp r1, r2, lsr #1 + subls ip, ip, #(1 * BLOCK_SIZE) + + movs r3, #0 + + JMP(ip) +# endif +# endif // __ARM_FEATURE_CLZ + + +#define IMM # + // due to the range limit of branch in Thumb1, we have to place the + // block closer +LOCAL_LABEL(divby0): + movs r0, #0 +# if defined(__ARM_EABI__) + push {r7, lr} + bl __aeabi_idiv0 // due to relocation limit, can't use b. + pop {r7, pc} +# else + JMP(lr) +# endif + + +#if defined(USE_THUMB_1) +#define block(shift) \ + lsls r2, r1, IMM shift; \ + cmp r0, r2; \ + blo LOCAL_LABEL(block_skip_##shift); \ + subs r0, r0, r2; \ + LOCAL_LABEL(block_skip_##shift) :; \ + adcs r3, r3 // same as ((r3 << 1) | Carry). Carry is set if r0 >= r2. + + // TODO: if current location counter is not not word aligned, we don't + // need the .p2align and nop + // Label div0block must be word-aligned. First align block 31 + .p2align 2 + nop // Padding to align div0block as 31 blocks = 310 bytes + +#else +#define block(shift) \ + cmp r0, r1, lsl IMM shift; \ + ITT(hs); \ + WIDE(addhs) r3, r3, IMM (1 << shift); \ + WIDE(subhs) r0, r0, r1, lsl IMM shift +#endif + + block(31) + block(30) + block(29) + block(28) + block(27) + block(26) + block(25) + block(24) + block(23) + block(22) + block(21) + block(20) + block(19) + block(18) + block(17) + block(16) + block(15) + block(14) + block(13) + block(12) + block(11) + block(10) + block(9) + block(8) + block(7) + block(6) + block(5) + block(4) + block(3) + block(2) + block(1) +LOCAL_LABEL(div0block): + block(0) + + mov r0, r3 + JMP(lr) +#endif // __ARM_ARCH_EXT_IDIV__ + +END_COMPILERRT_FUNCTION(__udivsi3) + +NO_EXEC_STACK_DIRECTIVE + |