Diffstat (limited to 'contrib/compiler-rt/lib/builtins/hexagon')
30 files changed, 5432 insertions, 0 deletions
diff --git a/contrib/compiler-rt/lib/builtins/hexagon/common_entry_exit_abi1.S b/contrib/compiler-rt/lib/builtins/hexagon/common_entry_exit_abi1.S new file mode 100644 index 000000000000..d5479d2a52fa --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/common_entry_exit_abi1.S @@ -0,0 +1,103 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +/* Functions that implement common sequences in function prologues and epilogues + used to save code size */ + + .macro FUNCTION_BEGIN name + .text + .globl \name + .type \name, @function + .falign +\name: + .endm + + .macro FUNCTION_END name + .size \name, . - \name + .endm + + .macro FALLTHROUGH_TAIL_CALL name0 name1 + .size \name0, . - \name0 + .globl \name1 + .type \name1, @function + .falign +\name1: + .endm + + + + +/* Save r25:24 at fp+#-8 and r27:26 at fp+#-16. */ + + + + +/* The compiler knows that the __save_* functions clobber LR. No other + registers should be used without informing the compiler. */ + +/* Since we can only issue one store per packet, we don't hurt performance by + simply jumping to the right point in this sequence of stores. */ + +FUNCTION_BEGIN __save_r24_through_r27 + memd(fp+#-16) = r27:26 +FALLTHROUGH_TAIL_CALL __save_r24_through_r27 __save_r24_through_r25 + { + memd(fp+#-8) = r25:24 + jumpr lr + } +FUNCTION_END __save_r24_through_r25 + + + + +/* For each of the *_before_tailcall functions, jumpr lr is executed in parallel + with deallocframe. That way, the return gets the old value of lr, which is + where these functions need to return, and at the same time, lr gets the value + it needs going into the tail call. */ + +FUNCTION_BEGIN __restore_r24_through_r27_and_deallocframe_before_tailcall + r27:26 = memd(fp+#-16) +FALLTHROUGH_TAIL_CALL __restore_r24_through_r27_and_deallocframe_before_tailcall __restore_r24_through_r25_and_deallocframe_before_tailcall + { + r25:24 = memd(fp+#-8) + deallocframe + jumpr lr + } +FUNCTION_END __restore_r24_through_r25_and_deallocframe_before_tailcall + + + + +/* Here we use the extra load bandwidth to restore LR early, allowing the return + to occur in parallel with the deallocframe. */ + +FUNCTION_BEGIN __restore_r24_through_r27_and_deallocframe + { + lr = memw(fp+#4) + r27:26 = memd(fp+#-16) + } + { + r25:24 = memd(fp+#-8) + deallocframe + jumpr lr + } +FUNCTION_END __restore_r24_through_r27_and_deallocframe + + + + +/* Here the load bandwidth is maximized. */ + +FUNCTION_BEGIN __restore_r24_through_r25_and_deallocframe + { + r25:24 = memd(fp+#-8) + deallocframe + } + jumpr lr +FUNCTION_END __restore_r24_through_r25_and_deallocframe diff --git a/contrib/compiler-rt/lib/builtins/hexagon/common_entry_exit_abi2.S b/contrib/compiler-rt/lib/builtins/hexagon/common_entry_exit_abi2.S new file mode 100644 index 000000000000..6f470343db49 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/common_entry_exit_abi2.S @@ -0,0 +1,268 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +/* Functions that implement common sequences in function prologues and epilogues + used to save code size */ + + .macro FUNCTION_BEGIN name + .p2align 2 + .section .text.\name,"ax",@progbits + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . - \name + .endm + + .macro FALLTHROUGH_TAIL_CALL name0 name1 + .p2align 2 + .size \name0, . - \name0 + .globl \name1 + .type \name1, @function +\name1: + .endm + + + + +/* Save r17:16 at fp+#-8, r19:18 at fp+#-16, r21:20 at fp+#-24, r23:22 at + fp+#-32, r25:24 at fp+#-40, and r27:26 at fp+#-48. + The compiler knows that the __save_* functions clobber LR. No other + registers should be used without informing the compiler. */ + +FUNCTION_BEGIN __save_r16_through_r27 + { + memd(fp+#-48) = r27:26 + memd(fp+#-40) = r25:24 + } + { + memd(fp+#-32) = r23:22 + memd(fp+#-24) = r21:20 + } + { + memd(fp+#-16) = r19:18 + memd(fp+#-8) = r17:16 + jumpr lr + } +FUNCTION_END __save_r16_through_r27 + +FUNCTION_BEGIN __save_r16_through_r25 + { + memd(fp+#-40) = r25:24 + memd(fp+#-32) = r23:22 + } + { + memd(fp+#-24) = r21:20 + memd(fp+#-16) = r19:18 + } + { + memd(fp+#-8) = r17:16 + jumpr lr + } +FUNCTION_END __save_r16_through_r25 + +FUNCTION_BEGIN __save_r16_through_r23 + { + memd(fp+#-32) = r23:22 + memd(fp+#-24) = r21:20 + } + { + memd(fp+#-16) = r19:18 + memd(fp+#-8) = r17:16 + jumpr lr + } +FUNCTION_END __save_r16_through_r23 + +FUNCTION_BEGIN __save_r16_through_r21 + { + memd(fp+#-24) = r21:20 + memd(fp+#-16) = r19:18 + } + { + memd(fp+#-8) = r17:16 + jumpr lr + } +FUNCTION_END __save_r16_through_r21 + +FUNCTION_BEGIN __save_r16_through_r19 + { + memd(fp+#-16) = r19:18 + memd(fp+#-8) = r17:16 + jumpr lr + } +FUNCTION_END __save_r16_through_r19 + +FUNCTION_BEGIN __save_r16_through_r17 + { + memd(fp+#-8) = r17:16 + jumpr lr + } +FUNCTION_END __save_r16_through_r17 + +/* For each of the *_before_tailcall functions, jumpr lr is executed in parallel + with deallocframe. That way, the return gets the old value of lr, which is + where these functions need to return, and at the same time, lr gets the value + it needs going into the tail call. 
*/ + + +FUNCTION_BEGIN __restore_r16_through_r27_and_deallocframe_before_tailcall + r27:26 = memd(fp+#-48) + { + r25:24 = memd(fp+#-40) + r23:22 = memd(fp+#-32) + } + { + r21:20 = memd(fp+#-24) + r19:18 = memd(fp+#-16) + } + { + r17:16 = memd(fp+#-8) + deallocframe + jumpr lr + } +FUNCTION_END __restore_r16_through_r27_and_deallocframe_before_tailcall + +FUNCTION_BEGIN __restore_r16_through_r25_and_deallocframe_before_tailcall + { + r25:24 = memd(fp+#-40) + r23:22 = memd(fp+#-32) + } + { + r21:20 = memd(fp+#-24) + r19:18 = memd(fp+#-16) + } + { + r17:16 = memd(fp+#-8) + deallocframe + jumpr lr + } +FUNCTION_END __restore_r16_through_r25_and_deallocframe_before_tailcall + +FUNCTION_BEGIN __restore_r16_through_r23_and_deallocframe_before_tailcall + { + r23:22 = memd(fp+#-32) + r21:20 = memd(fp+#-24) + } + r19:18 = memd(fp+#-16) + { + r17:16 = memd(fp+#-8) + deallocframe + jumpr lr + } +FUNCTION_END __restore_r16_through_r23_and_deallocframe_before_tailcall + + +FUNCTION_BEGIN __restore_r16_through_r21_and_deallocframe_before_tailcall + { + r21:20 = memd(fp+#-24) + r19:18 = memd(fp+#-16) + } + { + r17:16 = memd(fp+#-8) + deallocframe + jumpr lr + } +FUNCTION_END __restore_r16_through_r19_and_deallocframe_before_tailcall + +FUNCTION_BEGIN __restore_r16_through_r19_and_deallocframe_before_tailcall + r19:18 = memd(fp+#-16) + { + r17:16 = memd(fp+#-8) + deallocframe + jumpr lr + } +FUNCTION_END __restore_r16_through_r19_and_deallocframe_before_tailcall + +FUNCTION_BEGIN __restore_r16_through_r17_and_deallocframe_before_tailcall + { + r17:16 = memd(fp+#-8) + deallocframe + jumpr lr + } +FUNCTION_END __restore_r16_through_r17_and_deallocframe_before_tailcall + + +FUNCTION_BEGIN __restore_r16_through_r27_and_deallocframe + r27:26 = memd(fp+#-48) + { + r25:24 = memd(fp+#-40) + r23:22 = memd(fp+#-32) + } + { + r21:20 = memd(fp+#-24) + r19:18 = memd(fp+#-16) + } + { + r17:16 = memd(fp+#-8) + dealloc_return + } +FUNCTION_END __restore_r16_through_r27_and_deallocframe + +FUNCTION_BEGIN __restore_r16_through_r25_and_deallocframe + { + r25:24 = memd(fp+#-40) + r23:22 = memd(fp+#-32) + } + { + r21:20 = memd(fp+#-24) + r19:18 = memd(fp+#-16) + } + { + r17:16 = memd(fp+#-8) + dealloc_return + } +FUNCTION_END __restore_r16_through_r25_and_deallocframe + +FUNCTION_BEGIN __restore_r16_through_r23_and_deallocframe + { + r23:22 = memd(fp+#-32) + } + { + r21:20 = memd(fp+#-24) + r19:18 = memd(fp+#-16) + } + { + r17:16 = memd(fp+#-8) + dealloc_return + } +FUNCTION_END __restore_r16_through_r23_and_deallocframe + +FUNCTION_BEGIN __restore_r16_through_r21_and_deallocframe + { + r21:20 = memd(fp+#-24) + r19:18 = memd(fp+#-16) + } + { + r17:16 = memd(fp+#-8) + dealloc_return + } +FUNCTION_END __restore_r16_through_r21_and_deallocframe + +FUNCTION_BEGIN __restore_r16_through_r19_and_deallocframe + { + r19:18 = memd(fp+#-16) + r17:16 = memd(fp+#-8) + } + { + dealloc_return + } +FUNCTION_END __restore_r16_through_r19_and_deallocframe + +FUNCTION_BEGIN __restore_r16_through_r17_and_deallocframe + { + r17:16 = memd(fp+#-8) + dealloc_return + } +FUNCTION_END __restore_r16_through_r17_and_deallocframe + +FUNCTION_BEGIN __deallocframe + dealloc_return +FUNCTION_END __deallocframe diff --git a/contrib/compiler-rt/lib/builtins/hexagon/common_entry_exit_legacy.S b/contrib/compiler-rt/lib/builtins/hexagon/common_entry_exit_legacy.S new file mode 100644 index 000000000000..3258f15a3267 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/common_entry_exit_legacy.S @@ -0,0 +1,157 @@ +//===----------------------Hexagon builtin 
routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + +/* Functions that implement common sequences in function prologues and epilogues + used to save code size */ + + .macro FUNCTION_BEGIN name + .text + .globl \name + .type \name, @function + .falign +\name: + .endm + + .macro FUNCTION_END name + .size \name, . - \name + .endm + + .macro FALLTHROUGH_TAIL_CALL name0 name1 + .size \name0, . - \name0 + .globl \name1 + .type \name1, @function + .falign +\name1: + .endm + + + + +/* Save r27:26 at fp+#-8, r25:24 at fp+#-16, r23:22 at fp+#-24, r21:20 at + fp+#-32, r19:18 at fp+#-40, and r17:16 at fp+#-48. */ + + + + +/* The compiler knows that the __save_* functions clobber LR. No other + registers should be used without informing the compiler. */ + +/* Since we can only issue one store per packet, we don't hurt performance by + simply jumping to the right point in this sequence of stores. */ + +FUNCTION_BEGIN __save_r27_through_r16 + memd(fp+#-48) = r17:16 +FALLTHROUGH_TAIL_CALL __save_r27_through_r16 __save_r27_through_r18 + memd(fp+#-40) = r19:18 +FALLTHROUGH_TAIL_CALL __save_r27_through_r18 __save_r27_through_r20 + memd(fp+#-32) = r21:20 +FALLTHROUGH_TAIL_CALL __save_r27_through_r20 __save_r27_through_r22 + memd(fp+#-24) = r23:22 +FALLTHROUGH_TAIL_CALL __save_r27_through_r22 __save_r27_through_r24 + memd(fp+#-16) = r25:24 + { + memd(fp+#-8) = r27:26 + jumpr lr + } +FUNCTION_END __save_r27_through_r24 + + + + +/* For each of the *_before_sibcall functions, jumpr lr is executed in parallel + with deallocframe. That way, the return gets the old value of lr, which is + where these functions need to return, and at the same time, lr gets the value + it needs going into the sibcall. */ + +FUNCTION_BEGIN __restore_r27_through_r20_and_deallocframe_before_sibcall + { + r21:20 = memd(fp+#-32) + r23:22 = memd(fp+#-24) + } +FALLTHROUGH_TAIL_CALL __restore_r27_through_r20_and_deallocframe_before_sibcall __restore_r27_through_r24_and_deallocframe_before_sibcall + { + r25:24 = memd(fp+#-16) + jump __restore_r27_through_r26_and_deallocframe_before_sibcall + } +FUNCTION_END __restore_r27_through_r24_and_deallocframe_before_sibcall + + + + +FUNCTION_BEGIN __restore_r27_through_r16_and_deallocframe_before_sibcall + r17:16 = memd(fp+#-48) +FALLTHROUGH_TAIL_CALL __restore_r27_through_r16_and_deallocframe_before_sibcall __restore_r27_through_r18_and_deallocframe_before_sibcall + { + r19:18 = memd(fp+#-40) + r21:20 = memd(fp+#-32) + } +FALLTHROUGH_TAIL_CALL __restore_r27_through_r18_and_deallocframe_before_sibcall __restore_r27_through_r22_and_deallocframe_before_sibcall + { + r23:22 = memd(fp+#-24) + r25:24 = memd(fp+#-16) + } +FALLTHROUGH_TAIL_CALL __restore_r27_through_r22_and_deallocframe_before_sibcall __restore_r27_through_r26_and_deallocframe_before_sibcall + { + r27:26 = memd(fp+#-8) + deallocframe + jumpr lr + } +FUNCTION_END __restore_r27_through_r26_and_deallocframe_before_sibcall + + + + +/* Here we use the extra load bandwidth to restore LR early, allowing the return + to occur in parallel with the deallocframe. 
*/ + +FUNCTION_BEGIN __restore_r27_through_r16_and_deallocframe + { + r17:16 = memd(fp+#-48) + r19:18 = memd(fp+#-40) + } +FALLTHROUGH_TAIL_CALL __restore_r27_through_r16_and_deallocframe __restore_r27_through_r20_and_deallocframe + { + r21:20 = memd(fp+#-32) + r23:22 = memd(fp+#-24) + } +FALLTHROUGH_TAIL_CALL __restore_r27_through_r20_and_deallocframe __restore_r27_through_r24_and_deallocframe + { + lr = memw(fp+#4) + r25:24 = memd(fp+#-16) + } + { + r27:26 = memd(fp+#-8) + deallocframe + jumpr lr + } +FUNCTION_END __restore_r27_through_r24_and_deallocframe + + + + +/* Here the load bandwidth is maximized for all three functions. */ + +FUNCTION_BEGIN __restore_r27_through_r18_and_deallocframe + { + r19:18 = memd(fp+#-40) + r21:20 = memd(fp+#-32) + } +FALLTHROUGH_TAIL_CALL __restore_r27_through_r18_and_deallocframe __restore_r27_through_r22_and_deallocframe + { + r23:22 = memd(fp+#-24) + r25:24 = memd(fp+#-16) + } +FALLTHROUGH_TAIL_CALL __restore_r27_through_r22_and_deallocframe __restore_r27_through_r26_and_deallocframe + { + r27:26 = memd(fp+#-8) + deallocframe + } + jumpr lr +FUNCTION_END __restore_r27_through_r26_and_deallocframe diff --git a/contrib/compiler-rt/lib/builtins/hexagon/dfaddsub.S b/contrib/compiler-rt/lib/builtins/hexagon/dfaddsub.S new file mode 100644 index 000000000000..4173f86a4f54 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/dfaddsub.S @@ -0,0 +1,398 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +/* Double Precision Multiply */ + +#define A r1:0 +#define AH r1 +#define AL r0 +#define B r3:2 +#define BH r3 +#define BL r2 + +#define EXPA r4 +#define EXPB r5 +#define EXPB_A r5:4 + +#define ZTMP r7:6 +#define ZTMPH r7 +#define ZTMPL r6 + +#define ATMP r13:12 +#define ATMPH r13 +#define ATMPL r12 + +#define BTMP r9:8 +#define BTMPH r9 +#define BTMPL r8 + +#define ATMP2 r11:10 +#define ATMP2H r11 +#define ATMP2L r10 + +#define EXPDIFF r15 +#define EXTRACTOFF r14 +#define EXTRACTAMT r15:14 + +#define TMP r28 + +#define MANTBITS 52 +#define HI_MANTBITS 20 +#define EXPBITS 11 +#define BIAS 1024 +#define MANTISSA_TO_INT_BIAS 52 +#define SR_BIT_INEXACT 5 + +#ifndef SR_ROUND_OFF +#define SR_ROUND_OFF 22 +#endif + +#define NORMAL p3 +#define BIGB p2 + +#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG +#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG +#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG +#define END(TAG) .size TAG,.-TAG + + .text + .global __hexagon_adddf3 + .global __hexagon_subdf3 + .type __hexagon_adddf3, @function + .type __hexagon_subdf3, @function + +Q6_ALIAS(adddf3) +FAST_ALIAS(adddf3) +FAST2_ALIAS(adddf3) +Q6_ALIAS(subdf3) +FAST_ALIAS(subdf3) +FAST2_ALIAS(subdf3) + + .p2align 5 +__hexagon_adddf3: + { + EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS) + EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS) + ATMP = combine(##0x20000000,#0) + } + { + NORMAL = dfclass(A,#2) + NORMAL = dfclass(B,#2) + BTMP = ATMP + BIGB = cmp.gtu(EXPB,EXPA) // Is B substantially greater than A? 
+ } + { + if (!NORMAL) jump .Ladd_abnormal // If abnormal, go to special code + if (BIGB) A = B // if B >> A, swap A and B + if (BIGB) B = A // If B >> A, swap A and B + if (BIGB) EXPB_A = combine(EXPA,EXPB) // swap exponents + } + { + ATMP = insert(A,#MANTBITS,#EXPBITS-2) // Q1.62 + BTMP = insert(B,#MANTBITS,#EXPBITS-2) // Q1.62 + EXPDIFF = sub(EXPA,EXPB) + ZTMP = combine(#62,#1) + } +#undef BIGB +#undef NORMAL +#define B_POS p3 +#define A_POS p2 +#define NO_STICKIES p1 +.Ladd_continue: + { + EXPDIFF = min(EXPDIFF,ZTMPH) // If exponent difference >= ~60, + // will collapse to sticky bit + ATMP2 = neg(ATMP) + A_POS = cmp.gt(AH,#-1) + EXTRACTOFF = #0 + } + { + if (!A_POS) ATMP = ATMP2 + ATMP2 = extractu(BTMP,EXTRACTAMT) + BTMP = ASR(BTMP,EXPDIFF) +#undef EXTRACTAMT +#undef EXPDIFF +#undef EXTRACTOFF +#define ZERO r15:14 + ZERO = #0 + } + { + NO_STICKIES = cmp.eq(ATMP2,ZERO) + if (!NO_STICKIES.new) BTMPL = or(BTMPL,ZTMPL) + EXPB = add(EXPA,#-BIAS-60) + B_POS = cmp.gt(BH,#-1) + } + { + ATMP = add(ATMP,BTMP) // ADD!!! + ATMP2 = sub(ATMP,BTMP) // Negate and ADD --> SUB!!! + ZTMP = combine(#54,##2045) + } + { + p0 = cmp.gtu(EXPA,ZTMPH) // must be pretty high in case of large cancellation + p0 = !cmp.gtu(EXPA,ZTMPL) + if (!p0.new) jump:nt .Ladd_ovf_unf + if (!B_POS) ATMP = ATMP2 // if B neg, pick difference + } + { + A = convert_d2df(ATMP) // Convert to Double Precision, taking care of flags, etc. So nice! + p0 = cmp.eq(ATMPH,#0) + p0 = cmp.eq(ATMPL,#0) + if (p0.new) jump:nt .Ladd_zero // or maybe conversion handles zero case correctly? + } + { + AH += asl(EXPB,#HI_MANTBITS) + jumpr r31 + } + .falign +__hexagon_subdf3: + { + BH = togglebit(BH,#31) + jump __qdsp_adddf3 + } + + + .falign +.Ladd_zero: + // True zero, full cancellation + // +0 unless round towards negative infinity + { + TMP = USR + A = #0 + BH = #1 + } + { + TMP = extractu(TMP,#2,#22) + BH = asl(BH,#31) + } + { + p0 = cmp.eq(TMP,#2) + if (p0.new) AH = xor(AH,BH) + jumpr r31 + } + .falign +.Ladd_ovf_unf: + // Overflow or Denormal is possible + // Good news: Underflow flag is not possible! + /* + * ATMP has 2's complement value + * + * EXPA has A's exponent, EXPB has EXPA-BIAS-60 + * + * Convert, extract exponent, add adjustment. + * If > 2046, overflow + * If <= 0, denormal + * + * Note that we've not done our zero check yet, so do that too + * + */ + { + A = convert_d2df(ATMP) + p0 = cmp.eq(ATMPH,#0) + p0 = cmp.eq(ATMPL,#0) + if (p0.new) jump:nt .Ladd_zero + } + { + TMP = extractu(AH,#EXPBITS,#HI_MANTBITS) + AH += asl(EXPB,#HI_MANTBITS) + } + { + EXPB = add(EXPB,TMP) + B = combine(##0x00100000,#0) + } + { + p0 = cmp.gt(EXPB,##BIAS+BIAS-2) + if (p0.new) jump:nt .Ladd_ovf + } + { + p0 = cmp.gt(EXPB,#0) + if (p0.new) jumpr:t r31 + TMP = sub(#1,EXPB) + } + { + B = insert(A,#MANTBITS,#0) + A = ATMP + } + { + B = lsr(B,TMP) + } + { + A = insert(B,#63,#0) + jumpr r31 + } + .falign +.Ladd_ovf: + // We get either max finite value or infinity. Either way, overflow+inexact + { + A = ATMP // 2's complement value + TMP = USR + ATMP = combine(##0x7fefffff,#-1) // positive max finite + } + { + EXPB = extractu(TMP,#2,#SR_ROUND_OFF) // rounding bits + TMP = or(TMP,#0x28) // inexact + overflow + BTMP = combine(##0x7ff00000,#0) // positive infinity + } + { + USR = TMP + EXPB ^= lsr(AH,#31) // Does sign match rounding? 
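The .Ladd_continue packets above align the operand with the smaller exponent: its mantissa is shifted right by the (clamped) exponent difference, and any bits that fall off the bottom are folded into a single sticky LSB so that the final rounding still sees that the discarded tail was nonzero. A minimal C sketch of that alignment-with-sticky step, under the assumption of a plain 64-bit mantissa; the function name is illustrative and the real code additionally works on a Q1.62 fixed-point value and clamps the shift near 60:

    #include <stdint.h>

    /* Shift a mantissa right by 'shift' bits, ORing any lost bits into the
     * LSB as a sticky bit so later rounding can still detect inexactness. */
    static uint64_t align_with_sticky(uint64_t mant, unsigned shift)
    {
        if (shift == 0)
            return mant;
        if (shift >= 64)                  /* everything would be shifted out */
            return mant != 0;             /* only the sticky bit survives    */
        uint64_t lost = mant & ((1ULL << shift) - 1);
        return (mant >> shift) | (lost != 0);
    }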
+ TMP = EXPB // unmodified rounding mode + } + { + p0 = !cmp.eq(TMP,#1) // If not round-to-zero and + p0 = !cmp.eq(EXPB,#2) // Not rounding the other way, + if (p0.new) ATMP = BTMP // we should get infinity + } + { + A = insert(ATMP,#63,#0) // insert inf/maxfinite, leave sign + } + { + p0 = dfcmp.eq(A,A) + jumpr r31 + } + +.Ladd_abnormal: + { + ATMP = extractu(A,#63,#0) // strip off sign + BTMP = extractu(B,#63,#0) // strip off sign + } + { + p3 = cmp.gtu(ATMP,BTMP) + if (!p3.new) A = B // sort values + if (!p3.new) B = A // sort values + } + { + // Any NaN --> NaN, possibly raise invalid if sNaN + p0 = dfclass(A,#0x0f) // A not NaN? + if (!p0.new) jump:nt .Linvalid_nan_add + if (!p3) ATMP = BTMP + if (!p3) BTMP = ATMP + } + { + // Infinity + non-infinity number is infinity + // Infinity + infinity --> inf or nan + p1 = dfclass(A,#0x08) // A is infinity + if (p1.new) jump:nt .Linf_add + } + { + p2 = dfclass(B,#0x01) // B is zero + if (p2.new) jump:nt .LB_zero // so return A or special 0+0 + ATMP = #0 + } + // We are left with adding one or more subnormals + { + p0 = dfclass(A,#4) + if (p0.new) jump:nt .Ladd_two_subnormal + ATMP = combine(##0x20000000,#0) + } + { + EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS) + EXPB = #1 + // BTMP already ABS(B) + BTMP = asl(BTMP,#EXPBITS-2) + } +#undef ZERO +#define EXTRACTOFF r14 +#define EXPDIFF r15 + { + ATMP = insert(A,#MANTBITS,#EXPBITS-2) + EXPDIFF = sub(EXPA,EXPB) + ZTMP = combine(#62,#1) + jump .Ladd_continue + } + +.Ladd_two_subnormal: + { + ATMP = extractu(A,#63,#0) + BTMP = extractu(B,#63,#0) + } + { + ATMP = neg(ATMP) + BTMP = neg(BTMP) + p0 = cmp.gt(AH,#-1) + p1 = cmp.gt(BH,#-1) + } + { + if (p0) ATMP = A + if (p1) BTMP = B + } + { + ATMP = add(ATMP,BTMP) + } + { + BTMP = neg(ATMP) + p0 = cmp.gt(ATMPH,#-1) + B = #0 + } + { + if (!p0) A = BTMP + if (p0) A = ATMP + BH = ##0x80000000 + } + { + if (!p0) AH = or(AH,BH) + p0 = dfcmp.eq(A,B) + if (p0.new) jump:nt .Lzero_plus_zero + } + { + jumpr r31 + } + +.Linvalid_nan_add: + { + TMP = convert_df2sf(A) // will generate invalid if sNaN + p0 = dfclass(B,#0x0f) // if B is not NaN + if (p0.new) B = A // make it whatever A is + } + { + BL = convert_df2sf(B) // will generate invalid if sNaN + A = #-1 + jumpr r31 + } + .falign +.LB_zero: + { + p0 = dfcmp.eq(ATMP,A) // is A also zero? + if (!p0.new) jumpr:t r31 // If not, just return A + } + // 0 + 0 is special + // if equal integral values, they have the same sign, which is fine for all rounding + // modes. + // If unequal in sign, we get +0 for all rounding modes except round down +.Lzero_plus_zero: + { + p0 = cmp.eq(A,B) + if (p0.new) jumpr:t r31 + } + { + TMP = USR + } + { + TMP = extractu(TMP,#2,#SR_ROUND_OFF) + A = #0 + } + { + p0 = cmp.eq(TMP,#2) + if (p0.new) AH = ##0x80000000 + jumpr r31 + } +.Linf_add: + // adding infinities is only OK if they are equal + { + p0 = !cmp.eq(AH,BH) // Do they have different signs + p0 = dfclass(B,#8) // And is B also infinite? 
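The .Ladd_ovf path above (and the analogous .Lmul_ovf, .Ldiv_ovf and .Lfma_ovf paths later in this diff) raises overflow and inexact, then chooses between infinity and the largest finite double by checking the rounding mode against the sign of the result. A hedged C restatement of that selection rule, expressed with the standard fenv rounding modes rather than the USR bit encoding; the helper name is illustrative:

    #include <fenv.h>
    #include <float.h>
    #include <math.h>

    /* On overflow, round to infinity unless the current rounding mode points
     * away from it, in which case saturate at the largest finite double. */
    static double overflow_result(int negative)
    {
        int rm = fegetround();
        int to_inf = !(rm == FE_TOWARDZERO ||
                       (negative ? rm == FE_UPWARD : rm == FE_DOWNWARD));
        double mag = to_inf ? INFINITY : DBL_MAX;
        return negative ? -mag : mag;
    }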
+ if (!p0.new) jumpr:t r31 // If not, just a normal inf + } + { + BL = ##0x7f800001 // sNAN + } + { + A = convert_sf2df(BL) // trigger invalid, set NaN + jumpr r31 + } +END(__hexagon_adddf3) diff --git a/contrib/compiler-rt/lib/builtins/hexagon/dfdiv.S b/contrib/compiler-rt/lib/builtins/hexagon/dfdiv.S new file mode 100644 index 000000000000..0c5dbe272c89 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/dfdiv.S @@ -0,0 +1,492 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +/* Double Precision Divide */ + +#define A r1:0 +#define AH r1 +#define AL r0 + +#define B r3:2 +#define BH r3 +#define BL r2 + +#define Q r5:4 +#define QH r5 +#define QL r4 + +#define PROD r7:6 +#define PRODHI r7 +#define PRODLO r6 + +#define SFONE r8 +#define SFDEN r9 +#define SFERROR r10 +#define SFRECIP r11 + +#define EXPBA r13:12 +#define EXPB r13 +#define EXPA r12 + +#define REMSUB2 r15:14 + + + +#define SIGN r28 + +#define Q_POSITIVE p3 +#define NORMAL p2 +#define NO_OVF_UNF p1 +#define P_TMP p0 + +#define RECIPEST_SHIFT 3 +#define QADJ 61 + +#define DFCLASS_NORMAL 0x02 +#define DFCLASS_NUMBER 0x0F +#define DFCLASS_INFINITE 0x08 +#define DFCLASS_ZERO 0x01 +#define DFCLASS_NONZERO (DFCLASS_NUMBER ^ DFCLASS_ZERO) +#define DFCLASS_NONINFINITE (DFCLASS_NUMBER ^ DFCLASS_INFINITE) + +#define DF_MANTBITS 52 +#define DF_EXPBITS 11 +#define SF_MANTBITS 23 +#define SF_EXPBITS 8 +#define DF_BIAS 0x3ff + +#define SR_ROUND_OFF 22 + +#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG +#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG +#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG +#define END(TAG) .size TAG,.-TAG + + .text + .global __hexagon_divdf3 + .type __hexagon_divdf3,@function + Q6_ALIAS(divdf3) + FAST_ALIAS(divdf3) + FAST2_ALIAS(divdf3) + .p2align 5 +__hexagon_divdf3: + { + NORMAL = dfclass(A,#DFCLASS_NORMAL) + NORMAL = dfclass(B,#DFCLASS_NORMAL) + EXPBA = combine(BH,AH) + SIGN = xor(AH,BH) + } +#undef A +#undef AH +#undef AL +#undef B +#undef BH +#undef BL +#define REM r1:0 +#define REMHI r1 +#define REMLO r0 +#define DENOM r3:2 +#define DENOMHI r3 +#define DENOMLO r2 + { + if (!NORMAL) jump .Ldiv_abnormal + PROD = extractu(DENOM,#SF_MANTBITS,#DF_MANTBITS-SF_MANTBITS) + SFONE = ##0x3f800001 + } + { + SFDEN = or(SFONE,PRODLO) + EXPB = extractu(EXPB,#DF_EXPBITS,#DF_MANTBITS-32) + EXPA = extractu(EXPA,#DF_EXPBITS,#DF_MANTBITS-32) + Q_POSITIVE = cmp.gt(SIGN,#-1) + } +#undef SIGN +#define ONE r28 +.Ldenorm_continue: + { + SFRECIP,P_TMP = sfrecipa(SFONE,SFDEN) + SFERROR = and(SFONE,#-2) + ONE = #1 + EXPA = sub(EXPA,EXPB) + } +#undef EXPB +#define RECIPEST r13 + { + SFERROR -= sfmpy(SFRECIP,SFDEN):lib + REMHI = insert(ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) + RECIPEST = ##0x00800000 << RECIPEST_SHIFT + } + { + SFRECIP += sfmpy(SFRECIP,SFERROR):lib + DENOMHI = insert(ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) + SFERROR = and(SFONE,#-2) + } + { + SFERROR -= sfmpy(SFRECIP,SFDEN):lib + QH = #-DF_BIAS+1 + QL = #DF_BIAS-1 + } + { + SFRECIP += sfmpy(SFRECIP,SFERROR):lib + NO_OVF_UNF = cmp.gt(EXPA,QH) + NO_OVF_UNF = !cmp.gt(EXPA,QL) + } + { + RECIPEST = insert(SFRECIP,#SF_MANTBITS,#RECIPEST_SHIFT) + Q = #0 + 
EXPA = add(EXPA,#-QADJ) + } +#undef SFERROR +#undef SFRECIP +#define TMP r10 +#define TMP1 r11 + { + RECIPEST = add(RECIPEST,#((-3) << RECIPEST_SHIFT)) + } + +#define DIV_ITER1B(QSHIFTINSN,QSHIFT,REMSHIFT,EXTRA) \ + { \ + PROD = mpyu(RECIPEST,REMHI); \ + REM = asl(REM,# ## ( REMSHIFT )); \ + }; \ + { \ + PRODLO = # ## 0; \ + REM -= mpyu(PRODHI,DENOMLO); \ + REMSUB2 = mpyu(PRODHI,DENOMHI); \ + }; \ + { \ + Q += QSHIFTINSN(PROD, # ## ( QSHIFT )); \ + REM -= asl(REMSUB2, # ## 32); \ + EXTRA \ + } + + + DIV_ITER1B(ASL,14,15,) + DIV_ITER1B(ASR,1,15,) + DIV_ITER1B(ASR,16,15,) + DIV_ITER1B(ASR,31,15,PROD=# ( 0 );) + +#undef REMSUB2 +#define TMPPAIR r15:14 +#define TMPPAIRHI r15 +#define TMPPAIRLO r14 +#undef RECIPEST +#define EXPB r13 + { + // compare or sub with carry + TMPPAIR = sub(REM,DENOM) + P_TMP = cmp.gtu(DENOM,REM) + // set up amt to add to q + if (!P_TMP.new) PRODLO = #2 + } + { + Q = add(Q,PROD) + if (!P_TMP) REM = TMPPAIR + TMPPAIR = #0 + } + { + P_TMP = cmp.eq(REM,TMPPAIR) + if (!P_TMP.new) QL = or(QL,ONE) + } + { + PROD = neg(Q) + } + { + if (!Q_POSITIVE) Q = PROD + } +#undef REM +#undef REMHI +#undef REMLO +#undef DENOM +#undef DENOMLO +#undef DENOMHI +#define A r1:0 +#define AH r1 +#define AL r0 +#define B r3:2 +#define BH r3 +#define BL r2 + { + A = convert_d2df(Q) + if (!NO_OVF_UNF) jump .Ldiv_ovf_unf + } + { + AH += asl(EXPA,#DF_MANTBITS-32) + jumpr r31 + } + +.Ldiv_ovf_unf: + { + AH += asl(EXPA,#DF_MANTBITS-32) + EXPB = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32) + } + { + PROD = abs(Q) + EXPA = add(EXPA,EXPB) + } + { + P_TMP = cmp.gt(EXPA,##DF_BIAS+DF_BIAS) // overflow + if (P_TMP.new) jump:nt .Ldiv_ovf + } + { + P_TMP = cmp.gt(EXPA,#0) + if (P_TMP.new) jump:nt .Lpossible_unf // round up to normal possible... + } + /* Underflow */ + /* We know what the infinite range exponent should be (EXPA) */ + /* Q is 2's complement, PROD is abs(Q) */ + /* Normalize Q, shift right, add a high bit, convert, change exponent */ + +#define FUDGE1 7 // how much to shift right +#define FUDGE2 4 // how many guard/round to keep at lsbs + + { + EXPB = add(clb(PROD),#-1) // doesn't need to be added in since + EXPA = sub(#FUDGE1,EXPA) // we extract post-converted exponent + TMP = USR + TMP1 = #63 + } + { + EXPB = min(EXPA,TMP1) + TMP1 = or(TMP,#0x030) + PROD = asl(PROD,EXPB) + EXPA = #0 + } + { + TMPPAIR = extractu(PROD,EXPBA) // bits that will get shifted out + PROD = lsr(PROD,EXPB) // shift out bits + B = #1 + } + { + P_TMP = cmp.gtu(B,TMPPAIR) + if (!P_TMP.new) PRODLO = or(BL,PRODLO) + PRODHI = setbit(PRODHI,#DF_MANTBITS-32+FUDGE2) + } + { + Q = neg(PROD) + P_TMP = bitsclr(PRODLO,#(1<<FUDGE2)-1) + if (!P_TMP.new) TMP = TMP1 + } + { + USR = TMP + if (Q_POSITIVE) Q = PROD + TMP = #-DF_BIAS-(DF_MANTBITS+FUDGE2) + } + { + A = convert_d2df(Q) + } + { + AH += asl(TMP,#DF_MANTBITS-32) + jumpr r31 + } + + +.Lpossible_unf: + /* If upper parts of Q were all F's, but abs(A) == 0x00100000_00000000, we rounded up to min_normal */ + /* The answer is correct, but we need to raise Underflow */ + { + B = extractu(A,#63,#0) + TMPPAIR = combine(##0x00100000,#0) // min normal + TMP = #0x7FFF + } + { + P_TMP = dfcmp.eq(TMPPAIR,B) // Is everything zero in the rounded value... + P_TMP = bitsset(PRODHI,TMP) // but a bunch of bits set in the unrounded abs(quotient)? + } + +#if (__HEXAGON_ARCH__ == 60) + TMP = USR // If not, just return + if (!P_TMP) jumpr r31 // Else, we want to set Unf+Inexact + // Note that inexact is already set... 
+#else + { + if (!P_TMP) jumpr r31 // If not, just return + TMP = USR // Else, we want to set Unf+Inexact + } // Note that inexact is already set... +#endif + { + TMP = or(TMP,#0x30) + } + { + USR = TMP + } + { + p0 = dfcmp.eq(A,A) + jumpr r31 + } + +.Ldiv_ovf: + /* + * Raise Overflow, and choose the correct overflow value (saturated normal or infinity) + */ + { + TMP = USR + B = combine(##0x7fefffff,#-1) + AH = mux(Q_POSITIVE,#0,#-1) + } + { + PROD = combine(##0x7ff00000,#0) + QH = extractu(TMP,#2,#SR_ROUND_OFF) + TMP = or(TMP,#0x28) + } + { + USR = TMP + QH ^= lsr(AH,#31) + QL = QH + } + { + p0 = !cmp.eq(QL,#1) // if not round-to-zero + p0 = !cmp.eq(QH,#2) // and not rounding the other way + if (p0.new) B = PROD // go to inf + p0 = dfcmp.eq(B,B) // get exceptions + } + { + A = insert(B,#63,#0) + jumpr r31 + } + +#undef ONE +#define SIGN r28 +#undef NORMAL +#undef NO_OVF_UNF +#define P_INF p1 +#define P_ZERO p2 +.Ldiv_abnormal: + { + P_TMP = dfclass(A,#DFCLASS_NUMBER) + P_TMP = dfclass(B,#DFCLASS_NUMBER) + Q_POSITIVE = cmp.gt(SIGN,#-1) + } + { + P_INF = dfclass(A,#DFCLASS_INFINITE) + P_INF = dfclass(B,#DFCLASS_INFINITE) + } + { + P_ZERO = dfclass(A,#DFCLASS_ZERO) + P_ZERO = dfclass(B,#DFCLASS_ZERO) + } + { + if (!P_TMP) jump .Ldiv_nan + if (P_INF) jump .Ldiv_invalid + } + { + if (P_ZERO) jump .Ldiv_invalid + } + { + P_ZERO = dfclass(A,#DFCLASS_NONZERO) // nonzero + P_ZERO = dfclass(B,#DFCLASS_NONINFINITE) // non-infinite + } + { + P_INF = dfclass(A,#DFCLASS_NONINFINITE) // non-infinite + P_INF = dfclass(B,#DFCLASS_NONZERO) // nonzero + } + { + if (!P_ZERO) jump .Ldiv_zero_result + if (!P_INF) jump .Ldiv_inf_result + } + /* Now we've narrowed it down to (de)normal / (de)normal */ + /* Set up A/EXPA B/EXPB and go back */ +#undef P_ZERO +#undef P_INF +#define P_TMP2 p1 + { + P_TMP = dfclass(A,#DFCLASS_NORMAL) + P_TMP2 = dfclass(B,#DFCLASS_NORMAL) + TMP = ##0x00100000 + } + { + EXPBA = combine(BH,AH) + AH = insert(TMP,#DF_EXPBITS+1,#DF_MANTBITS-32) // clear out hidden bit, sign bit + BH = insert(TMP,#DF_EXPBITS+1,#DF_MANTBITS-32) // clear out hidden bit, sign bit + } + { + if (P_TMP) AH = or(AH,TMP) // if normal, add back in hidden bit + if (P_TMP2) BH = or(BH,TMP) // if normal, add back in hidden bit + } + { + QH = add(clb(A),#-DF_EXPBITS) + QL = add(clb(B),#-DF_EXPBITS) + TMP = #1 + } + { + EXPA = extractu(EXPA,#DF_EXPBITS,#DF_MANTBITS-32) + EXPB = extractu(EXPB,#DF_EXPBITS,#DF_MANTBITS-32) + } + { + A = asl(A,QH) + B = asl(B,QL) + if (!P_TMP) EXPA = sub(TMP,QH) + if (!P_TMP2) EXPB = sub(TMP,QL) + } // recreate values needed by resume coke + { + PROD = extractu(B,#SF_MANTBITS,#DF_MANTBITS-SF_MANTBITS) + } + { + SFDEN = or(SFONE,PRODLO) + jump .Ldenorm_continue + } + +.Ldiv_zero_result: + { + AH = xor(AH,BH) + B = #0 + } + { + A = insert(B,#63,#0) + jumpr r31 + } +.Ldiv_inf_result: + { + p2 = dfclass(B,#DFCLASS_ZERO) + p2 = dfclass(A,#DFCLASS_NONINFINITE) + } + { + TMP = USR + if (!p2) jump 1f + AH = xor(AH,BH) + } + { + TMP = or(TMP,#0x04) // DBZ + } + { + USR = TMP + } +1: + { + B = combine(##0x7ff00000,#0) + p0 = dfcmp.uo(B,B) // take possible exception + } + { + A = insert(B,#63,#0) + jumpr r31 + } +.Ldiv_nan: + { + p0 = dfclass(A,#0x10) + p1 = dfclass(B,#0x10) + if (!p0.new) A = B + if (!p1.new) B = A + } + { + QH = convert_df2sf(A) // get possible invalid exceptions + QL = convert_df2sf(B) + } + { + A = #-1 + jumpr r31 + } + +.Ldiv_invalid: + { + TMP = ##0x7f800001 + } + { + A = convert_sf2df(TMP) // get invalid, get DF qNaN + jumpr r31 + } +END(__hexagon_divdf3) diff --git 
a/contrib/compiler-rt/lib/builtins/hexagon/dffma.S b/contrib/compiler-rt/lib/builtins/hexagon/dffma.S new file mode 100644 index 000000000000..97b885a3bf27 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/dffma.S @@ -0,0 +1,705 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG +#define END(TAG) .size TAG,.-TAG + +/* Double Precision Multiply */ + + +#define A r1:0 +#define AH r1 +#define AL r0 +#define B r3:2 +#define BH r3 +#define BL r2 +#define C r5:4 +#define CH r5 +#define CL r4 + + + +#define BTMP r15:14 +#define BTMPH r15 +#define BTMPL r14 + +#define ATMP r13:12 +#define ATMPH r13 +#define ATMPL r12 + +#define CTMP r11:10 +#define CTMPH r11 +#define CTMPL r10 + +#define PP_LL r9:8 +#define PP_LL_H r9 +#define PP_LL_L r8 + +#define PP_ODD r7:6 +#define PP_ODD_H r7 +#define PP_ODD_L r6 + + +#define PP_HH r17:16 +#define PP_HH_H r17 +#define PP_HH_L r16 + +#define EXPA r18 +#define EXPB r19 +#define EXPBA r19:18 + +#define TMP r28 + +#define P_TMP p0 +#define PROD_NEG p3 +#define EXACT p2 +#define SWAP p1 + +#define MANTBITS 52 +#define HI_MANTBITS 20 +#define EXPBITS 11 +#define BIAS 1023 +#define STACKSPACE 32 + +#define ADJUST 4 + +#define FUDGE 7 +#define FUDGE2 3 + +#ifndef SR_ROUND_OFF +#define SR_ROUND_OFF 22 +#endif + + /* + * First, classify for normal values, and abort if abnormal + * + * Next, unpack mantissa into 0x1000_0000_0000_0000 + mant<<8 + * + * Since we know that the 2 MSBs of the H registers is zero, we should never carry + * the partial products that involve the H registers + * + * Try to buy X slots, at the expense of latency if needed + * + * We will have PP_HH with the upper bits of the product, PP_LL with the lower + * PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts + * PP_HH can have a minimum of 0x0100_0000_0000_0000 + * + * 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS + * + * We need to align CTMP. + * If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow normal add + * If CTMP << PP align CTMP and add 128 bits. Then compute sticky + * If CTMP ~= PP, align CTMP and add 128 bits. May have massive cancellation. + * + * Convert partial product and CTMP to 2's complement prior to addition + * + * After we add, we need to normalize into upper 64 bits, then compute sticky. 
+ * + * + */ + + .text + .global __hexagon_fmadf4 + .type __hexagon_fmadf4,@function + .global __hexagon_fmadf5 + .type __hexagon_fmadf5,@function + .global fma + .type fma,@function + Q6_ALIAS(fmadf5) + .p2align 5 +__hexagon_fmadf4: +__hexagon_fmadf5: +fma: + { + P_TMP = dfclass(A,#2) + P_TMP = dfclass(B,#2) + ATMP = #0 + BTMP = #0 + } + { + ATMP = insert(A,#MANTBITS,#EXPBITS-3) + BTMP = insert(B,#MANTBITS,#EXPBITS-3) + PP_ODD_H = ##0x10000000 + allocframe(#STACKSPACE) + } + { + PP_LL = mpyu(ATMPL,BTMPL) + if (!P_TMP) jump .Lfma_abnormal_ab + ATMPH = or(ATMPH,PP_ODD_H) + BTMPH = or(BTMPH,PP_ODD_H) + } + { + P_TMP = dfclass(C,#2) + if (!P_TMP.new) jump:nt .Lfma_abnormal_c + CTMP = combine(PP_ODD_H,#0) + PP_ODD = combine(#0,PP_LL_H) + } +.Lfma_abnormal_c_restart: + { + PP_ODD += mpyu(BTMPL,ATMPH) + CTMP = insert(C,#MANTBITS,#EXPBITS-3) + memd(r29+#0) = PP_HH + memd(r29+#8) = EXPBA + } + { + PP_ODD += mpyu(ATMPL,BTMPH) + EXPBA = neg(CTMP) + P_TMP = cmp.gt(CH,#-1) + TMP = xor(AH,BH) + } + { + EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS) + EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS) + PP_HH = combine(#0,PP_ODD_H) + if (!P_TMP) CTMP = EXPBA + } + { + PP_HH += mpyu(ATMPH,BTMPH) + PP_LL = combine(PP_ODD_L,PP_LL_L) +#undef PP_ODD +#undef PP_ODD_H +#undef PP_ODD_L +#undef ATMP +#undef ATMPL +#undef ATMPH +#undef BTMP +#undef BTMPL +#undef BTMPH +#define RIGHTLEFTSHIFT r13:12 +#define RIGHTSHIFT r13 +#define LEFTSHIFT r12 + + EXPA = add(EXPA,EXPB) +#undef EXPB +#undef EXPBA +#define EXPC r19 +#define EXPCA r19:18 + EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS) + } + /* PP_HH:PP_LL now has product */ + /* CTMP is negated */ + /* EXPA,B,C are extracted */ + /* + * We need to negate PP + * Since we will be adding with carry later, if we need to negate, + * just invert all bits now, which we can do conditionally and in parallel + */ +#define PP_HH_TMP r15:14 +#define PP_LL_TMP r7:6 + { + EXPA = add(EXPA,#-BIAS+(ADJUST)) + PROD_NEG = !cmp.gt(TMP,#-1) + PP_LL_TMP = #0 + PP_HH_TMP = #0 + } + { + PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry + P_TMP = !cmp.gt(TMP,#-1) + SWAP = cmp.gt(EXPC,EXPA) // If C >> PP + if (SWAP.new) EXPCA = combine(EXPA,EXPC) + } + { + PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry + if (P_TMP) PP_LL = PP_LL_TMP +#undef PP_LL_TMP +#define CTMP2 r7:6 +#define CTMP2H r7 +#define CTMP2L r6 + CTMP2 = #0 + EXPC = sub(EXPA,EXPC) + } + { + if (P_TMP) PP_HH = PP_HH_TMP + P_TMP = cmp.gt(EXPC,#63) + if (SWAP) PP_LL = CTMP2 + if (SWAP) CTMP2 = PP_LL + } +#undef PP_HH_TMP +//#define ONE r15:14 +//#define S_ONE r14 +#define ZERO r15:14 +#define S_ZERO r15 +#undef PROD_NEG +#define P_CARRY p3 + { + if (SWAP) PP_HH = CTMP // Swap C and PP + if (SWAP) CTMP = PP_HH + if (P_TMP) EXPC = add(EXPC,#-64) + TMP = #63 + } + { + // If diff > 63, pre-shift-right by 64... 
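The packets around this point align the 128-bit addend against the partial product: when the exponent difference exceeds 63 the value is pre-shifted right by 64, and every bit that falls off the low end is collected as sticky state for the final rounding. A simplified C sketch of that two-word shift, treating the value as an unsigned magnitude (the assembly shifts the two's-complement value arithmetically); the names are illustrative:

    #include <stdint.h>

    /* Shift the 128-bit pair hi:lo right by 'shift', recording lost bits. */
    static void shift_right_128_sticky(uint64_t *hi, uint64_t *lo,
                                       unsigned shift, int *sticky)
    {
        if (shift >= 128) {                    /* whole value shifted out */
            *sticky |= (*hi | *lo) != 0;
            *hi = *lo = 0;
            return;
        }
        if (shift >= 64) {                     /* pre-shift right by 64 */
            *sticky |= *lo != 0;
            *lo = *hi;
            *hi = 0;
            shift -= 64;
        }
        if (shift) {
            *sticky |= (*lo & ((1ULL << shift) - 1)) != 0;
            *lo = (*lo >> shift) | (*hi << (64 - shift));
            *hi >>= shift;
        }
    }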
+ if (P_TMP) CTMP2 = CTMP + TMP = asr(CTMPH,#31) + RIGHTSHIFT = min(EXPC,TMP) + LEFTSHIFT = #0 + } +#undef C +#undef CH +#undef CL +#define STICKIES r5:4 +#define STICKIESH r5 +#define STICKIESL r4 + { + if (P_TMP) CTMP = combine(TMP,TMP) // sign extension of pre-shift-right-64 + STICKIES = extract(CTMP2,RIGHTLEFTSHIFT) + CTMP2 = lsr(CTMP2,RIGHTSHIFT) + LEFTSHIFT = sub(#64,RIGHTSHIFT) + } + { + ZERO = #0 + TMP = #-2 + CTMP2 |= lsl(CTMP,LEFTSHIFT) + CTMP = asr(CTMP,RIGHTSHIFT) + } + { + P_CARRY = cmp.gtu(STICKIES,ZERO) // If we have sticky bits from C shift + if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP) // make sure adding 1 == OR +#undef ZERO +#define ONE r15:14 +#define S_ONE r14 + ONE = #1 + STICKIES = #0 + } + { + PP_LL = add(CTMP2,PP_LL,P_CARRY):carry // use the carry to add the sticky + } + { + PP_HH = add(CTMP,PP_HH,P_CARRY):carry + TMP = #62 + } + /* + * PP_HH:PP_LL now holds the sum + * We may need to normalize left, up to ??? bits. + * + * I think that if we have massive cancellation, the range we normalize by + * is still limited + */ + { + LEFTSHIFT = add(clb(PP_HH),#-2) + if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f // all sign bits? + } + /* We had all sign bits, shift left by 62. */ + { + CTMP = extractu(PP_LL,#62,#2) + PP_LL = asl(PP_LL,#62) + EXPA = add(EXPA,#-62) // And adjust exponent of result + } + { + PP_HH = insert(CTMP,#62,#0) // Then shift 63 + } + { + LEFTSHIFT = add(clb(PP_HH),#-2) + } + .falign +1: + { + CTMP = asl(PP_HH,LEFTSHIFT) + STICKIES |= asl(PP_LL,LEFTSHIFT) + RIGHTSHIFT = sub(#64,LEFTSHIFT) + EXPA = sub(EXPA,LEFTSHIFT) + } + { + CTMP |= lsr(PP_LL,RIGHTSHIFT) + EXACT = cmp.gtu(ONE,STICKIES) + TMP = #BIAS+BIAS-2 + } + { + if (!EXACT) CTMPL = or(CTMPL,S_ONE) + // If EXPA is overflow/underflow, jump to ovf_unf + P_TMP = !cmp.gt(EXPA,TMP) + P_TMP = cmp.gt(EXPA,#1) + if (!P_TMP.new) jump:nt .Lfma_ovf_unf + } + { + // XXX: FIXME: should PP_HH for check of zero be CTMP? + P_TMP = cmp.gtu(ONE,CTMP) // is result true zero? + A = convert_d2df(CTMP) + EXPA = add(EXPA,#-BIAS-60) + PP_HH = memd(r29+#0) + } + { + AH += asl(EXPA,#HI_MANTBITS) + EXPCA = memd(r29+#8) + if (!P_TMP) dealloc_return // not zero, return + } +.Ladd_yields_zero: + /* We had full cancellation. Return +/- zero (-0 when round-down) */ + { + TMP = USR + A = #0 + } + { + TMP = extractu(TMP,#2,#SR_ROUND_OFF) + PP_HH = memd(r29+#0) + EXPCA = memd(r29+#8) + } + { + p0 = cmp.eq(TMP,#2) + if (p0.new) AH = ##0x80000000 + dealloc_return + } + +#undef RIGHTLEFTSHIFT +#undef RIGHTSHIFT +#undef LEFTSHIFT +#undef CTMP2 +#undef CTMP2H +#undef CTMP2L + +.Lfma_ovf_unf: + { + p0 = cmp.gtu(ONE,CTMP) + if (p0.new) jump:nt .Ladd_yields_zero + } + { + A = convert_d2df(CTMP) + EXPA = add(EXPA,#-BIAS-60) + TMP = EXPA + } +#define NEW_EXPB r7 +#define NEW_EXPA r6 + { + AH += asl(EXPA,#HI_MANTBITS) + NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS) + } + { + NEW_EXPA = add(EXPA,NEW_EXPB) + PP_HH = memd(r29+#0) + EXPCA = memd(r29+#8) +#undef PP_HH +#undef PP_HH_H +#undef PP_HH_L +#undef EXPCA +#undef EXPC +#undef EXPA +#undef PP_LL +#undef PP_LL_H +#undef PP_LL_L +#define EXPA r6 +#define EXPB r7 +#define EXPBA r7:6 +#define ATMP r9:8 +#define ATMPH r9 +#define ATMPL r8 +#undef NEW_EXPB +#undef NEW_EXPA + ATMP = abs(CTMP) + } + { + p0 = cmp.gt(EXPA,##BIAS+BIAS) + if (p0.new) jump:nt .Lfma_ovf + } + { + p0 = cmp.gt(EXPA,#0) + if (p0.new) jump:nt .Lpossible_unf + } + { + // TMP has original EXPA. 
+ // ATMP is corresponding value + // Normalize ATMP and shift right to correct location + EXPB = add(clb(ATMP),#-2) // Amount to left shift to normalize + EXPA = sub(#1+5,TMP) // Amount to right shift to denormalize + p3 = cmp.gt(CTMPH,#-1) + } + /* Underflow */ + /* We know that the infinte range exponent should be EXPA */ + /* CTMP is 2's complement, ATMP is abs(CTMP) */ + { + EXPA = add(EXPA,EXPB) // how much to shift back right + ATMP = asl(ATMP,EXPB) // shift left + AH = USR + TMP = #63 + } + { + EXPB = min(EXPA,TMP) + EXPA = #0 + AL = #0x0030 + } + { + B = extractu(ATMP,EXPBA) + ATMP = asr(ATMP,EXPB) + } + { + p0 = cmp.gtu(ONE,B) + if (!p0.new) ATMPL = or(ATMPL,S_ONE) + ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2) + } + { + CTMP = neg(ATMP) + p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1) + if (!p1.new) AH = or(AH,AL) + B = #0 + } + { + if (p3) CTMP = ATMP + USR = AH + TMP = #-BIAS-(MANTBITS+FUDGE2) + } + { + A = convert_d2df(CTMP) + } + { + AH += asl(TMP,#HI_MANTBITS) + dealloc_return + } +.Lpossible_unf: + { + TMP = ##0x7fefffff + ATMP = abs(CTMP) + } + { + p0 = cmp.eq(AL,#0) + p0 = bitsclr(AH,TMP) + if (!p0.new) dealloc_return:t + TMP = #0x7fff + } + { + p0 = bitsset(ATMPH,TMP) + BH = USR + BL = #0x0030 + } + { + if (p0) BH = or(BH,BL) + } + { + USR = BH + } + { + p0 = dfcmp.eq(A,A) + dealloc_return + } +.Lfma_ovf: + { + TMP = USR + CTMP = combine(##0x7fefffff,#-1) + A = CTMP + } + { + ATMP = combine(##0x7ff00000,#0) + BH = extractu(TMP,#2,#SR_ROUND_OFF) + TMP = or(TMP,#0x28) + } + { + USR = TMP + BH ^= lsr(AH,#31) + BL = BH + } + { + p0 = !cmp.eq(BL,#1) + p0 = !cmp.eq(BH,#2) + } + { + p0 = dfcmp.eq(ATMP,ATMP) + if (p0.new) CTMP = ATMP + } + { + A = insert(CTMP,#63,#0) + dealloc_return + } +#undef CTMP +#undef CTMPH +#undef CTMPL +#define BTMP r11:10 +#define BTMPH r11 +#define BTMPL r10 + +#undef STICKIES +#undef STICKIESH +#undef STICKIESL +#define C r5:4 +#define CH r5 +#define CL r4 + +.Lfma_abnormal_ab: + { + ATMP = extractu(A,#63,#0) + BTMP = extractu(B,#63,#0) + deallocframe + } + { + p3 = cmp.gtu(ATMP,BTMP) + if (!p3.new) A = B // sort values + if (!p3.new) B = A + } + { + p0 = dfclass(A,#0x0f) // A NaN? + if (!p0.new) jump:nt .Lnan + if (!p3) ATMP = BTMP + if (!p3) BTMP = ATMP + } + { + p1 = dfclass(A,#0x08) // A is infinity + p1 = dfclass(B,#0x0e) // B is nonzero + } + { + p0 = dfclass(A,#0x08) // a is inf + p0 = dfclass(B,#0x01) // b is zero + } + { + if (p1) jump .Lab_inf + p2 = dfclass(B,#0x01) + } + { + if (p0) jump .Linvalid + if (p2) jump .Lab_true_zero + TMP = ##0x7c000000 + } + // We are left with a normal or subnormal times a subnormal, A > B + // If A and B are both very small, we will go to a single sticky bit; replace + // A and B lower 63 bits with 0x0010_0000_0000_0000, which yields equivalent results + // if A and B might multiply to something bigger, decrease A exp and increase B exp + // and start over + { + p0 = bitsclr(AH,TMP) + if (p0.new) jump:nt .Lfma_ab_tiny + } + { + TMP = add(clb(BTMP),#-EXPBITS) + } + { + BTMP = asl(BTMP,TMP) + } + { + B = insert(BTMP,#63,#0) + AH -= asl(TMP,#HI_MANTBITS) + } + jump fma + +.Lfma_ab_tiny: + ATMP = combine(##0x00100000,#0) + { + A = insert(ATMP,#63,#0) + B = insert(ATMP,#63,#0) + } + jump fma + +.Lab_inf: + { + B = lsr(B,#63) + p0 = dfclass(C,#0x10) + } + { + A ^= asl(B,#63) + if (p0) jump .Lnan + } + { + p1 = dfclass(C,#0x08) + if (p1.new) jump:nt .Lfma_inf_plus_inf + } + /* A*B is +/- inf, C is finite. 
Return A */ + { + jumpr r31 + } + .falign +.Lfma_inf_plus_inf: + { // adding infinities of different signs is invalid + p0 = dfcmp.eq(A,C) + if (!p0.new) jump:nt .Linvalid + } + { + jumpr r31 + } + +.Lnan: + { + p0 = dfclass(B,#0x10) + p1 = dfclass(C,#0x10) + if (!p0.new) B = A + if (!p1.new) C = A + } + { // find sNaNs + BH = convert_df2sf(B) + BL = convert_df2sf(C) + } + { + BH = convert_df2sf(A) + A = #-1 + jumpr r31 + } + +.Linvalid: + { + TMP = ##0x7f800001 // sp snan + } + { + A = convert_sf2df(TMP) + jumpr r31 + } + +.Lab_true_zero: + // B is zero, A is finite number + { + p0 = dfclass(C,#0x10) + if (p0.new) jump:nt .Lnan + if (p0.new) A = C + } + { + p0 = dfcmp.eq(B,C) // is C also zero? + AH = lsr(AH,#31) // get sign + } + { + BH ^= asl(AH,#31) // form correctly signed zero in B + if (!p0) A = C // If C is not zero, return C + if (!p0) jumpr r31 + } + /* B has correctly signed zero, C is also zero */ +.Lzero_plus_zero: + { + p0 = cmp.eq(B,C) // yes, scalar equals. +0++0 or -0+-0 + if (p0.new) jumpr:t r31 + A = B + } + { + TMP = USR + } + { + TMP = extractu(TMP,#2,#SR_ROUND_OFF) + A = #0 + } + { + p0 = cmp.eq(TMP,#2) + if (p0.new) AH = ##0x80000000 + jumpr r31 + } +#undef BTMP +#undef BTMPH +#undef BTMPL +#define CTMP r11:10 + .falign +.Lfma_abnormal_c: + /* We know that AB is normal * normal */ + /* C is not normal: zero, subnormal, inf, or NaN. */ + { + p0 = dfclass(C,#0x10) // is C NaN? + if (p0.new) jump:nt .Lnan + if (p0.new) A = C // move NaN to A + deallocframe + } + { + p0 = dfclass(C,#0x08) // is C inf? + if (p0.new) A = C // return C + if (p0.new) jumpr:nt r31 + } + // zero or subnormal + // If we have a zero, and we know AB is normal*normal, we can just call normal multiply + { + p0 = dfclass(C,#0x01) // is C zero? + if (p0.new) jump:nt __hexagon_muldf3 + TMP = #1 + } + // Left with: subnormal + // Adjust C and jump back to restart + { + allocframe(#STACKSPACE) // oops, deallocated above, re-allocate frame + CTMP = #0 + CH = insert(TMP,#EXPBITS,#HI_MANTBITS) + jump .Lfma_abnormal_c_restart + } +END(fma) diff --git a/contrib/compiler-rt/lib/builtins/hexagon/dfminmax.S b/contrib/compiler-rt/lib/builtins/hexagon/dfminmax.S new file mode 100644 index 000000000000..41122911f183 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/dfminmax.S @@ -0,0 +1,79 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define A r1:0 +#define B r3:2 +#define ATMP r5:4 + + +#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG +#define END(TAG) .size TAG,.-TAG + +/* + * Min and Max return A if B is NaN, or B if A is NaN + * Otherwise, they return the smaller or bigger value + * + * If values are equal, we want to favor -0.0 for min and +0.0 for max. + */ + +/* + * Compares always return false for NaN + * if (isnan(A)) A = B; if (A > B) A = B will only trigger at most one of those options. 
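The comment above captures the whole min/max strategy: a NaN operand is replaced by the other argument, an ordinary compare (which is always false for NaN) picks the smaller or larger value, and when the operands compare equal the raw bit patterns are OR-ed for min (so -0.0 beats +0.0) or AND-ed for max (so +0.0 beats -0.0). A C sketch of the fmin case under those rules; the helper name is illustrative:

    #include <math.h>
    #include <stdint.h>
    #include <string.h>

    static double min_sketch(double a, double b)
    {
        if (isnan(a)) return b;            /* favor the non-NaN operand */
        if (isnan(b)) return a;
        if (a != b)   return a < b ? a : b;
        uint64_t ua, ub;                   /* a == b: signs may still differ */
        memcpy(&ua, &a, sizeof ua);
        memcpy(&ub, &b, sizeof ub);
        ua |= ub;                          /* OR keeps a set sign bit: -0.0 wins */
        memcpy(&a, &ua, sizeof a);
        return a;
    }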
+ */ + .text + .global __hexagon_mindf3 + .global __hexagon_maxdf3 + .global fmin + .type fmin,@function + .global fmax + .type fmax,@function + .type __hexagon_mindf3,@function + .type __hexagon_maxdf3,@function + Q6_ALIAS(mindf3) + Q6_ALIAS(maxdf3) + .p2align 5 +__hexagon_mindf3: +fmin: + { + p0 = dfclass(A,#0x10) // If A is a number + p1 = dfcmp.gt(A,B) // AND B > A, don't swap + ATMP = A + } + { + if (p0) A = B // if A is NaN use B + if (p1) A = B // gt is always false if either is NaN + p2 = dfcmp.eq(A,B) // if A == B + if (!p2.new) jumpr:t r31 + } + /* A == B, return A|B to select -0.0 over 0.0 */ + { + A = or(ATMP,B) + jumpr r31 + } +END(__hexagon_mindf3) + .falign +__hexagon_maxdf3: +fmax: + { + p0 = dfclass(A,#0x10) + p1 = dfcmp.gt(B,A) + ATMP = A + } + { + if (p0) A = B + if (p1) A = B + p2 = dfcmp.eq(A,B) + if (!p2.new) jumpr:t r31 + } + /* A == B, return A&B to select 0.0 over -0.0 */ + { + A = and(ATMP,B) + jumpr r31 + } +END(__hexagon_maxdf3) diff --git a/contrib/compiler-rt/lib/builtins/hexagon/dfmul.S b/contrib/compiler-rt/lib/builtins/hexagon/dfmul.S new file mode 100644 index 000000000000..fde6d77bdcff --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/dfmul.S @@ -0,0 +1,418 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +/* Double Precision Multiply */ +#define A r1:0 +#define AH r1 +#define AL r0 +#define B r3:2 +#define BH r3 +#define BL r2 + +#define BTMP r5:4 +#define BTMPH r5 +#define BTMPL r4 + +#define PP_ODD r7:6 +#define PP_ODD_H r7 +#define PP_ODD_L r6 + +#define ONE r9:8 +#define S_ONE r8 +#define S_ZERO r9 + +#define PP_HH r11:10 +#define PP_HH_H r11 +#define PP_HH_L r10 + +#define ATMP r13:12 +#define ATMPH r13 +#define ATMPL r12 + +#define PP_LL r15:14 +#define PP_LL_H r15 +#define PP_LL_L r14 + +#define TMP r28 + +#define MANTBITS 52 +#define HI_MANTBITS 20 +#define EXPBITS 11 +#define BIAS 1024 +#define MANTISSA_TO_INT_BIAS 52 + +/* Some constant to adjust normalization amount in error code */ +/* Amount to right shift the partial product to get to a denorm */ +#define FUDGE 5 + +#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG +#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG +#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG +#define END(TAG) .size TAG,.-TAG + +#define SR_ROUND_OFF 22 + .text + .global __hexagon_muldf3 + .type __hexagon_muldf3,@function + Q6_ALIAS(muldf3) + FAST_ALIAS(muldf3) + FAST2_ALIAS(muldf3) + .p2align 5 +__hexagon_muldf3: + { + p0 = dfclass(A,#2) + p0 = dfclass(B,#2) + ATMP = combine(##0x40000000,#0) + } + { + ATMP = insert(A,#MANTBITS,#EXPBITS-1) + BTMP = asl(B,#EXPBITS-1) + TMP = #-BIAS + ONE = #1 + } + { + PP_ODD = mpyu(BTMPL,ATMPH) + BTMP = insert(ONE,#2,#62) + } + /* since we know that the MSB of the H registers is zero, we should never carry */ + /* H <= 2^31-1. L <= 2^32-1. Therefore, HL <= 2^63-2^32-2^31+1 */ + /* Adding 2 HLs, we get 2^64-3*2^32+2 maximum. */ + /* Therefore, we can add 3 2^32-1 values safely without carry. We only need one. 
*/ + { + PP_LL = mpyu(ATMPL,BTMPL) + PP_ODD += mpyu(ATMPL,BTMPH) + } + { + PP_ODD += lsr(PP_LL,#32) + PP_HH = mpyu(ATMPH,BTMPH) + BTMP = combine(##BIAS+BIAS-4,#0) + } + { + PP_HH += lsr(PP_ODD,#32) + if (!p0) jump .Lmul_abnormal + p1 = cmp.eq(PP_LL_L,#0) // 64 lsb's 0? + p1 = cmp.eq(PP_ODD_L,#0) // 64 lsb's 0? + } + /* + * PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts + * PP_HH can have a minimum of 0x1000_0000_0000_0000 or so + */ +#undef PP_ODD +#undef PP_ODD_H +#undef PP_ODD_L +#define EXP10 r7:6 +#define EXP1 r7 +#define EXP0 r6 + { + if (!p1) PP_HH_L = or(PP_HH_L,S_ONE) + EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS) + EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS) + } + { + PP_LL = neg(PP_HH) + EXP0 += add(TMP,EXP1) + TMP = xor(AH,BH) + } + { + if (!p2.new) PP_HH = PP_LL + p2 = cmp.gt(TMP,#-1) + p0 = !cmp.gt(EXP0,BTMPH) + p0 = cmp.gt(EXP0,BTMPL) + if (!p0.new) jump:nt .Lmul_ovf_unf + } + { + A = convert_d2df(PP_HH) + EXP0 = add(EXP0,#-BIAS-58) + } + { + AH += asl(EXP0,#HI_MANTBITS) + jumpr r31 + } + + .falign +.Lpossible_unf: + /* We end up with a positive exponent */ + /* But we may have rounded up to an exponent of 1. */ + /* If the exponent is 1, if we rounded up to it + * we need to also raise underflow + * Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000 + * And the PP should also have more than one bit set + */ + /* Note: ATMP should have abs(PP_HH) */ + /* Note: BTMPL should have 0x7FEFFFFF */ + { + p0 = cmp.eq(AL,#0) + p0 = bitsclr(AH,BTMPL) + if (!p0.new) jumpr:t r31 + BTMPH = #0x7fff + } + { + p0 = bitsset(ATMPH,BTMPH) + BTMPL = USR + BTMPH = #0x030 + } + { + if (p0) BTMPL = or(BTMPL,BTMPH) + } + { + USR = BTMPL + } + { + p0 = dfcmp.eq(A,A) + jumpr r31 + } + .falign +.Lmul_ovf_unf: + { + A = convert_d2df(PP_HH) + ATMP = abs(PP_HH) // take absolute value + EXP1 = add(EXP0,#-BIAS-58) + } + { + AH += asl(EXP1,#HI_MANTBITS) + EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS) + BTMPL = ##0x7FEFFFFF + } + { + EXP1 += add(EXP0,##-BIAS-58) + //BTMPH = add(clb(ATMP),#-2) + BTMPH = #0 + } + { + p0 = cmp.gt(EXP1,##BIAS+BIAS-2) // overflow + if (p0.new) jump:nt .Lmul_ovf + } + { + p0 = cmp.gt(EXP1,#0) + if (p0.new) jump:nt .Lpossible_unf + BTMPH = sub(EXP0,BTMPH) + TMP = #63 // max amount to shift + } + /* Underflow */ + /* + * PP_HH has the partial product with sticky LSB. + * PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts + * PP_HH can have a minimum of 0x1000_0000_0000_0000 or so + * The exponent of PP_HH is in EXP1, which is non-positive (0 or negative) + * That's the exponent that happens after the normalization + * + * EXP0 has the exponent that, when added to the normalized value, is out of range. + * + * Strategy: + * + * * Shift down bits, with sticky bit, such that the bits are aligned according + * to the LZ count and appropriate exponent, but not all the way to mantissa + * field, keep around the last few bits. + * * Put a 1 near the MSB + * * Check the LSBs for inexact; if inexact also set underflow + * * Convert [u]d2df -- will correctly round according to rounding mode + * * Replace exponent field with zero + * + * + */ + + + { + BTMPL = #0 // offset for extract + BTMPH = sub(#FUDGE,BTMPH) // amount to right shift + } + { + p3 = cmp.gt(PP_HH_H,#-1) // is it positive? 
+ BTMPH = min(BTMPH,TMP) // Don't shift more than 63 + PP_HH = ATMP + } + { + TMP = USR + PP_LL = extractu(PP_HH,BTMP) + } + { + PP_HH = asr(PP_HH,BTMPH) + BTMPL = #0x0030 // underflow flag + AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS) + } + { + p0 = cmp.gtu(ONE,PP_LL) // Did we extract all zeros? + if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE) // add sticky bit + PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3) // Add back in a bit so we can use convert instruction + } + { + PP_LL = neg(PP_HH) + p1 = bitsclr(PP_HH_L,#0x7) // Are the LSB's clear? + if (!p1.new) TMP = or(BTMPL,TMP) // If not, Inexact+Underflow + } + { + if (!p3) PP_HH = PP_LL + USR = TMP + } + { + A = convert_d2df(PP_HH) // Do rounding + p0 = dfcmp.eq(A,A) // realize exception + } + { + AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1) // Insert correct exponent + jumpr r31 + } + .falign +.Lmul_ovf: + // We get either max finite value or infinity. Either way, overflow+inexact + { + TMP = USR + ATMP = combine(##0x7fefffff,#-1) // positive max finite + A = PP_HH + } + { + PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF) // rounding bits + TMP = or(TMP,#0x28) // inexact + overflow + BTMP = combine(##0x7ff00000,#0) // positive infinity + } + { + USR = TMP + PP_LL_L ^= lsr(AH,#31) // Does sign match rounding? + TMP = PP_LL_L // unmodified rounding mode + } + { + p0 = !cmp.eq(TMP,#1) // If not round-to-zero and + p0 = !cmp.eq(PP_LL_L,#2) // Not rounding the other way, + if (p0.new) ATMP = BTMP // we should get infinity + p0 = dfcmp.eq(A,A) // Realize FP exception if enabled + } + { + A = insert(ATMP,#63,#0) // insert inf/maxfinite, leave sign + jumpr r31 + } + +.Lmul_abnormal: + { + ATMP = extractu(A,#63,#0) // strip off sign + BTMP = extractu(B,#63,#0) // strip off sign + } + { + p3 = cmp.gtu(ATMP,BTMP) + if (!p3.new) A = B // sort values + if (!p3.new) B = A // sort values + } + { + // Any NaN --> NaN, possibly raise invalid if sNaN + p0 = dfclass(A,#0x0f) // A not NaN? + if (!p0.new) jump:nt .Linvalid_nan + if (!p3) ATMP = BTMP + if (!p3) BTMP = ATMP + } + { + // Infinity * nonzero number is infinity + p1 = dfclass(A,#0x08) // A is infinity + p1 = dfclass(B,#0x0e) // B is nonzero + } + { + // Infinity * zero --> NaN, raise invalid + // Other zeros return zero + p0 = dfclass(A,#0x08) // A is infinity + p0 = dfclass(B,#0x01) // B is zero + } + { + if (p1) jump .Ltrue_inf + p2 = dfclass(B,#0x01) + } + { + if (p0) jump .Linvalid_zeroinf + if (p2) jump .Ltrue_zero // so return zero + TMP = ##0x7c000000 + } + // We are left with a normal or subnormal times a subnormal. A > B + // If A and B are both very small (exp(a) < BIAS-MANTBITS), + // we go to a single sticky bit, which we can round easily. + // If A and B might multiply to something bigger, decrease A exponent and increase + // B exponent and try again + { + p0 = bitsclr(AH,TMP) + if (p0.new) jump:nt .Lmul_tiny + } + { + TMP = cl0(BTMP) + } + { + TMP = add(TMP,#-EXPBITS) + } + { + BTMP = asl(BTMP,TMP) + } + { + B = insert(BTMP,#63,#0) + AH -= asl(TMP,#HI_MANTBITS) + } + jump __hexagon_muldf3 +.Lmul_tiny: + { + TMP = USR + A = xor(A,B) // get sign bit + } + { + TMP = or(TMP,#0x30) // Inexact + Underflow + A = insert(ONE,#63,#0) // put in rounded up value + BTMPH = extractu(TMP,#2,#SR_ROUND_OFF) // get rounding mode + } + { + USR = TMP + p0 = cmp.gt(BTMPH,#1) // Round towards pos/neg inf? 
+ if (!p0.new) AL = #0 // If not, zero + BTMPH ^= lsr(AH,#31) // rounding my way --> set LSB + } + { + p0 = cmp.eq(BTMPH,#3) // if rounding towards right inf + if (!p0.new) AL = #0 // don't go to zero + jumpr r31 + } +.Linvalid_zeroinf: + { + TMP = USR + } + { + A = #-1 + TMP = or(TMP,#2) + } + { + USR = TMP + } + { + p0 = dfcmp.uo(A,A) // force exception if enabled + jumpr r31 + } +.Linvalid_nan: + { + p0 = dfclass(B,#0x0f) // if B is not NaN + TMP = convert_df2sf(A) // will generate invalid if sNaN + if (p0.new) B = A // make it whatever A is + } + { + BL = convert_df2sf(B) // will generate invalid if sNaN + A = #-1 + jumpr r31 + } + .falign +.Ltrue_zero: + { + A = B + B = A + } +.Ltrue_inf: + { + BH = extract(BH,#1,#31) + } + { + AH ^= asl(BH,#31) + jumpr r31 + } +END(__hexagon_muldf3) + +#undef ATMP +#undef ATMPL +#undef ATMPH +#undef BTMP +#undef BTMPL +#undef BTMPH diff --git a/contrib/compiler-rt/lib/builtins/hexagon/dfsqrt.S b/contrib/compiler-rt/lib/builtins/hexagon/dfsqrt.S new file mode 100644 index 000000000000..027d9e1fde43 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/dfsqrt.S @@ -0,0 +1,406 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +/* Double Precision square root */ + +#define EXP r28 + +#define A r1:0 +#define AH r1 +#define AL r0 + +#define SFSH r3:2 +#define SF_S r3 +#define SF_H r2 + +#define SFHALF_SONE r5:4 +#define S_ONE r4 +#define SFHALF r5 +#define SF_D r6 +#define SF_E r7 +#define RECIPEST r8 +#define SFRAD r9 + +#define FRACRAD r11:10 +#define FRACRADH r11 +#define FRACRADL r10 + +#define ROOT r13:12 +#define ROOTHI r13 +#define ROOTLO r12 + +#define PROD r15:14 +#define PRODHI r15 +#define PRODLO r14 + +#define P_TMP p0 +#define P_EXP1 p1 +#define NORMAL p2 + +#define SF_EXPBITS 8 +#define SF_MANTBITS 23 + +#define DF_EXPBITS 11 +#define DF_MANTBITS 52 + +#define DF_BIAS 0x3ff + +#define DFCLASS_ZERO 0x01 +#define DFCLASS_NORMAL 0x02 +#define DFCLASS_DENORMAL 0x02 +#define DFCLASS_INFINITE 0x08 +#define DFCLASS_NAN 0x10 + +#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function +#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function +#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function +#define END(TAG) .size TAG,.-TAG + + .text + .global __hexagon_sqrtdf2 + .type __hexagon_sqrtdf2,@function + .global __hexagon_sqrt + .type __hexagon_sqrt,@function + Q6_ALIAS(sqrtdf2) + Q6_ALIAS(sqrt) + FAST_ALIAS(sqrtdf2) + FAST_ALIAS(sqrt) + FAST2_ALIAS(sqrtdf2) + FAST2_ALIAS(sqrt) + .type sqrt,@function + .p2align 5 +__hexagon_sqrtdf2: +__hexagon_sqrt: + { + PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) + EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32) + SFHALF_SONE = combine(##0x3f000004,#1) + } + { + NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal + NORMAL = cmp.gt(AH,#-1) // and positive? 
+ if (!NORMAL.new) jump:nt .Lsqrt_abnormal + SFRAD = or(SFHALF,PRODLO) + } +#undef NORMAL +.Ldenormal_restart: + { + FRACRAD = A + SF_E,P_TMP = sfinvsqrta(SFRAD) + SFHALF = and(SFHALF,#-16) + SFSH = #0 + } +#undef A +#undef AH +#undef AL +#define ERROR r1:0 +#define ERRORHI r1 +#define ERRORLO r0 + // SF_E : reciprocal square root + // SF_H : half rsqrt + // sf_S : square root + // SF_D : error term + // SFHALF: 0.5 + { + SF_S += sfmpy(SF_E,SFRAD):lib // s0: root + SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5*y0. Could also decrement exponent... + SF_D = SFHALF +#undef SFRAD +#define SHIFTAMT r9 + SHIFTAMT = and(EXP,#1) + } + { + SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-H*S = 0.5-0.5*~1 + FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) // replace upper bits with hidden + P_EXP1 = cmp.gtu(SHIFTAMT,#0) + } + { + SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt + SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip + SF_D = SFHALF + SHIFTAMT = mux(P_EXP1,#8,#9) + } + { + SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term + FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place + SHIFTAMT = mux(P_EXP1,#3,#2) + } + { + SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt + // cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x). + PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1) + } + { + SF_H = and(SF_H,##0x007fffff) + } + { + SF_H = add(SF_H,##0x00800000 - 3) + SHIFTAMT = mux(P_EXP1,#7,#8) + } + { + RECIPEST = asl(SF_H,SHIFTAMT) + SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0)) + } + { + ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1))) + } + +#undef SFSH // r3:2 +#undef SF_H // r2 +#undef SF_S // r3 +#undef S_ONE // r4 +#undef SFHALF // r5 +#undef SFHALF_SONE // r5:4 +#undef SF_D // r6 +#undef SF_E // r7 + +#define HL r3:2 +#define LL r5:4 +#define HH r7:6 + +#undef P_EXP1 +#define P_CARRY0 p1 +#define P_CARRY1 p2 +#define P_CARRY2 p3 + + /* Iteration 0 */ + /* Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply */ + /* We can shift and subtract instead of shift and add? 
*/ + { + ERROR = asl(FRACRAD,#15) + PROD = mpyu(ROOTHI,ROOTHI) + P_CARRY0 = cmp.eq(r0,r0) + } + { + ERROR -= asl(PROD,#15) + PROD = mpyu(ROOTHI,ROOTLO) + P_CARRY1 = cmp.eq(r0,r0) + } + { + ERROR -= lsr(PROD,#16) + P_CARRY2 = cmp.eq(r0,r0) + } + { + ERROR = mpyu(ERRORHI,RECIPEST) + } + { + ROOT += lsr(ERROR,SHIFTAMT) + SHIFTAMT = add(SHIFTAMT,#16) + ERROR = asl(FRACRAD,#31) // for next iter + } + /* Iteration 1 */ + { + PROD = mpyu(ROOTHI,ROOTHI) + ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed + } + { + ERROR -= asl(PROD,#31) + PROD = mpyu(ROOTLO,ROOTLO) + } + { + ERROR -= lsr(PROD,#33) + } + { + ERROR = mpyu(ERRORHI,RECIPEST) + } + { + ROOT += lsr(ERROR,SHIFTAMT) + SHIFTAMT = add(SHIFTAMT,#16) + ERROR = asl(FRACRAD,#47) // for next iter + } + /* Iteration 2 */ + { + PROD = mpyu(ROOTHI,ROOTHI) + } + { + ERROR -= asl(PROD,#47) + PROD = mpyu(ROOTHI,ROOTLO) + } + { + ERROR -= asl(PROD,#16) // bidir shr 31-47 + PROD = mpyu(ROOTLO,ROOTLO) + } + { + ERROR -= lsr(PROD,#17) // 64-47 + } + { + ERROR = mpyu(ERRORHI,RECIPEST) + } + { + ROOT += lsr(ERROR,SHIFTAMT) + } +#undef ERROR +#undef PROD +#undef PRODHI +#undef PRODLO +#define REM_HI r15:14 +#define REM_HI_HI r15 +#define REM_LO r1:0 +#undef RECIPEST +#undef SHIFTAMT +#define TWOROOT_LO r9:8 + /* Adjust Root */ + { + HL = mpyu(ROOTHI,ROOTLO) + LL = mpyu(ROOTLO,ROOTLO) + REM_HI = #0 + REM_LO = #0 + } + { + HL += lsr(LL,#33) + LL += asl(HL,#33) + P_CARRY0 = cmp.eq(r0,r0) + } + { + HH = mpyu(ROOTHI,ROOTHI) + REM_LO = sub(REM_LO,LL,P_CARRY0):carry + TWOROOT_LO = #1 + } + { + HH += lsr(HL,#31) + TWOROOT_LO += asl(ROOT,#1) + } +#undef HL +#undef LL +#define REM_HI_TMP r3:2 +#define REM_HI_TMP_HI r3 +#define REM_LO_TMP r5:4 + { + REM_HI = sub(FRACRAD,HH,P_CARRY0):carry + REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry +#undef FRACRAD +#undef HH +#define ZERO r11:10 +#define ONE r7:6 + ONE = #1 + ZERO = #0 + } + { + REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry + ONE = add(ROOT,ONE) + EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp + } + { + // If carry set, no borrow: result was still positive + if (P_CARRY1) ROOT = ONE + if (P_CARRY1) REM_LO = REM_LO_TMP + if (P_CARRY1) REM_HI = REM_HI_TMP + } + { + REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry + ONE = #1 + EXP = asr(EXP,#1) // divide signed exp by 2 + } + { + REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry + ONE = add(ROOT,ONE) + } + { + if (P_CARRY2) ROOT = ONE + if (P_CARRY2) REM_LO = REM_LO_TMP + // since tworoot <= 2^32, remhi must be zero +#undef REM_HI_TMP +#undef REM_HI_TMP_HI +#define S_ONE r2 +#define ADJ r3 + S_ONE = #1 + } + { + P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero + if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if so, it's exact... hopefully + ADJ = cl0(ROOT) + EXP = add(EXP,#-63) + } +#undef REM_LO +#define RET r1:0 +#define RETHI r1 + { + RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag + EXP = add(EXP,ADJ) // add back bias + } + { + RETHI += asl(EXP,#DF_MANTBITS-32) // add exponent adjust + jumpr r31 + } +#undef REM_LO_TMP +#undef REM_HI_TMP +#undef REM_HI_TMP_HI +#undef REM_LO +#undef REM_HI +#undef TWOROOT_LO + +#undef RET +#define A r1:0 +#define AH r1 +#define AL r1 +#undef S_ONE +#define TMP r3:2 +#define TMPHI r3 +#define TMPLO r2 +#undef P_CARRY0 +#define P_NEG p1 + + +#define SFHALF r5 +#define SFRAD r9 +.Lsqrt_abnormal: + { + P_TMP = dfclass(A,#DFCLASS_ZERO) // zero? 
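/* ==================================================================== *
   Editor's note (illustrative, not part of the imported source): the
   abnormal path below is a plain case split.  Roughly, in C (floating
   point flag side effects omitted; the real code also raises invalid
   where required):

   #include <math.h>

   static double sqrt_abnormal_sketch(double a)
   {
       if (a == 0.0)   return a;                  // +0 or -0 is returned unchanged
       if (isnan(a))   return a + a;              // NaN in, quiet NaN out
       if (a < 0.0)    return __builtin_nan("");  // negative: invalid, NaN result
       if (isinf(a))   return a;                  // +inf stays +inf
       // remaining case: a positive denormal.  The code below normalizes the
       // mantissa and restarts the main path; rescaling models the same result.
       return sqrt(ldexp(a, 120)) * 0x1p-60;
   }
 * ==================================================================== */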
+ if (P_TMP.new) jumpr:t r31 + } + { + P_TMP = dfclass(A,#DFCLASS_NAN) + if (P_TMP.new) jump:nt .Lsqrt_nan + } + { + P_TMP = cmp.gt(AH,#-1) + if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg + if (!P_TMP.new) EXP = ##0x7F800001 // sNaN + } + { + P_TMP = dfclass(A,#DFCLASS_INFINITE) + if (P_TMP.new) jumpr:nt r31 + } + // If we got here, we're denormal + // prepare to restart + { + A = extractu(A,#DF_MANTBITS,#0) // Extract mantissa + } + { + EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize? + } + { + A = asl(A,EXP) // Shift mantissa + EXP = sub(#1,EXP) // Form exponent + } + { + AH = insert(EXP,#1,#DF_MANTBITS-32) // insert lsb of exponent + } + { + TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1) + SFHALF = ##0x3f000004 // form half constant + } + { + SFRAD = or(SFHALF,TMPLO) // form sf value + SFHALF = and(SFHALF,#-16) + jump .Ldenormal_restart // restart + } +.Lsqrt_nan: + { + EXP = convert_df2sf(A) // if sNaN, get invalid + A = #-1 // qNaN + jumpr r31 + } +.Lsqrt_invalid_neg: + { + A = convert_sf2df(EXP) // Invalid,NaNval + jumpr r31 + } +END(__hexagon_sqrt) +END(__hexagon_sqrtdf2) diff --git a/contrib/compiler-rt/lib/builtins/hexagon/divdi3.S b/contrib/compiler-rt/lib/builtins/hexagon/divdi3.S new file mode 100644 index 000000000000..49ee8104f305 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/divdi3.S @@ -0,0 +1,85 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . 
- \name + .endm + + +FUNCTION_BEGIN __hexagon_divdi3 + { + p2 = tstbit(r1,#31) + p3 = tstbit(r3,#31) + } + { + r1:0 = abs(r1:0) + r3:2 = abs(r3:2) + } + { + r6 = cl0(r1:0) // count leading 0's of dividend (numerator) + r7 = cl0(r3:2) // count leading 0's of divisor (denominator) + r5:4 = r3:2 // divisor moved into working registers + r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder + } + { + p3 = xor(p2,p3) + r10 = sub(r7,r6) // left shift count for bit & divisor + r1:0 = #0 // initialize quotient to 0 + r15:14 = #1 // initialize bit to 1 + } + { + r11 = add(r10,#1) // loop count is 1 more than shift count + r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb + r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor + } + { + p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend + loop0(1f,r11) // register loop + } + { + if (p0) jump .hexagon_divdi3_return // if divisor > dividend, we're done, so return + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder + } + { + r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder + r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8) + } + { + r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8) + r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6) + } + { + r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration + r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration + }:endloop0 + +.hexagon_divdi3_return: + { + r3:2 = neg(r1:0) + } + { + r1:0 = vmux(p3,r3:2,r1:0) + jumpr r31 + } +FUNCTION_END __hexagon_divdi3 + + .globl __qdsp_divdi3 + .set __qdsp_divdi3, __hexagon_divdi3 diff --git a/contrib/compiler-rt/lib/builtins/hexagon/divsi3.S b/contrib/compiler-rt/lib/builtins/hexagon/divsi3.S new file mode 100644 index 000000000000..8e159baa192f --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/divsi3.S @@ -0,0 +1,84 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + + + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . 
- \name + .endm + + +FUNCTION_BEGIN __hexagon_divsi3 + { + p0 = cmp.ge(r0,#0) + p1 = cmp.ge(r1,#0) + r1 = abs(r0) + r2 = abs(r1) + } + { + r3 = cl0(r1) + r4 = cl0(r2) + r5 = sub(r1,r2) + p2 = cmp.gtu(r2,r1) + } +#if (__HEXAGON_ARCH__ == 60) + { + r0 = #0 + p1 = xor(p0,p1) + p0 = cmp.gtu(r2,r5) + } + if (p2) jumpr r31 +#else + { + r0 = #0 + p1 = xor(p0,p1) + p0 = cmp.gtu(r2,r5) + if (p2) jumpr r31 + } +#endif + { + r0 = mux(p1,#-1,#1) + if (p0) jumpr r31 + r4 = sub(r4,r3) + r3 = #1 + } + { + r0 = #0 + r3:2 = vlslw(r3:2,r4) + loop0(1f,r4) + } + .falign +1: + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r1 = sub(r1,r2) + if (!p0.new) r0 = add(r0,r3) + r3:2 = vlsrw(r3:2,#1) + }:endloop0 + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r0 = add(r0,r3) + if (!p1) jumpr r31 + } + { + r0 = neg(r0) + jumpr r31 + } +FUNCTION_END __hexagon_divsi3 + + .globl __qdsp_divsi3 + .set __qdsp_divsi3, __hexagon_divsi3 diff --git a/contrib/compiler-rt/lib/builtins/hexagon/fabs_opt.S b/contrib/compiler-rt/lib/builtins/hexagon/fabs_opt.S new file mode 100644 index 000000000000..b09b00734d98 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/fabs_opt.S @@ -0,0 +1,37 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +.macro FUNCTION_BEGIN name +.text +.p2align 5 +.globl \name +.type \name, @function +\name: +.endm + +.macro FUNCTION_END name +.size \name, . - \name +.endm + +FUNCTION_BEGIN fabs + { + r1 = clrbit(r1, #31) + jumpr r31 + } +FUNCTION_END fabs + +FUNCTION_BEGIN fabsf + { + r0 = clrbit(r0, #31) + jumpr r31 + } +FUNCTION_END fabsf + + .globl fabsl + .set fabsl, fabs diff --git a/contrib/compiler-rt/lib/builtins/hexagon/fastmath2_dlib_asm.S b/contrib/compiler-rt/lib/builtins/hexagon/fastmath2_dlib_asm.S new file mode 100644 index 000000000000..9286df06c26d --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/fastmath2_dlib_asm.S @@ -0,0 +1,491 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. 
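/* ==================================================================== *
   Editor's note (illustrative, not part of the imported source): the
   fast2_* routines in this file operate on a non-IEEE "QDOUBLE" value, as
   the embedded C reference code below shows: a 64-bit word whose upper
   bits hold a signed fixed-point mantissa and whose low 16 bits hold a
   signed exponent.  A minimal C model of that packing -- MANTMASK here is
   an assumption consistent with the reference code, not a value taken
   from any header:

   typedef long long lint;
   #define EXP_MASK   0x000000000000FFFFLL
   #define MANTMASK   (~0xFFFFLL)              // assumed: all bits above the exponent

   static lint qdouble_pack(lint mant, int exp)
   {
       return (mant & MANTMASK) | (((lint)exp) & EXP_MASK);
   }
   static int  qdouble_exp(lint q)   { return (short)q; }   // Q6_R_sxth_R(q)
   static lint qdouble_mant(lint q)  { return q & MANTMASK; }
 * ==================================================================== */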
+// +//===----------------------------------------------------------------------===// +/* ==================================================================== */ +/* FUNCTIONS Optimized double floating point operators */ +/* ==================================================================== */ +/* c = dadd_asm(a, b) */ +/* ==================================================================== * +fast2_QDOUBLE fast2_dadd(fast2_QDOUBLE a,fast2_QDOUBLE b) { + fast2_QDOUBLE c; + lint manta = a & MANTMASK; + int expa = Q6_R_sxth_R(a) ; + lint mantb = b & MANTMASK; + int expb = Q6_R_sxth_R(b) ; + int exp, expdiff, j, k, hi, lo, cn; + lint mant; + + expdiff = (int) Q6_P_vabsdiffh_PP(a, b); + expdiff = Q6_R_sxth_R(expdiff) ; + if (expdiff > 63) { expdiff = 62;} + if (expa > expb) { + exp = expa + 1; + expa = 1; + expb = expdiff + 1; + } else { + exp = expb + 1; + expb = 1; + expa = expdiff + 1; + } + mant = (manta>>expa) + (mantb>>expb); + + hi = (int) (mant>>32); + lo = (int) (mant); + + k = Q6_R_normamt_R(hi); + if(hi == 0 || hi == -1) k = 31+Q6_R_normamt_R(lo); + + mant = (mant << k); + cn = (mant == 0x8000000000000000LL); + exp = exp - k + cn; + + if (mant == 0 || mant == -1) exp = 0x8001; + c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK); + return(c); + } + * ==================================================================== */ + .text + .global fast2_dadd_asm + .type fast2_dadd_asm, @function +fast2_dadd_asm: +#define manta R0 +#define mantexpa R1:0 +#define lmanta R1:0 +#define mantb R2 +#define mantexpb R3:2 +#define lmantb R3:2 +#define expa R4 +#define expb R5 +#define mantexpd R7:6 +#define expd R6 +#define exp R8 +#define c63 R9 +#define lmant R1:0 +#define manth R1 +#define mantl R0 +#define minmin R11:10 // exactly 0x000000000000008001LL +#define minminl R10 +#define k R4 +#define ce P0 + .falign + { + mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL + c63 = #62 + expa = SXTH(manta) + expb = SXTH(mantb) + } { + expd = SXTH(expd) + ce = CMP.GT(expa, expb); + if ( ce.new) exp = add(expa, #1) + if (!ce.new) exp = add(expb, #1) + } { + if ( ce) expa = #1 + if (!ce) expb = #1 + manta.L = #0 + expd = MIN(expd, c63) + } { + if (!ce) expa = add(expd, #1) + if ( ce) expb = add(expd, #1) + mantb.L = #0 + minmin = #0 + } { + lmanta = ASR(lmanta, expa) + lmantb = ASR(lmantb, expb) + } { + lmant = add(lmanta, lmantb) + minminl.L = #0x8001 + } { + k = clb(lmant) + c63 = #58 + } { + k = add(k, #-1) + p0 = cmp.gt(k, c63) + } { + mantexpa = ASL(lmant, k) + exp = SUB(exp, k) + if(p0) jump .Ldenorma + } { + manta = insert(exp, #16, #0) + jumpr r31 + } +.Ldenorma: + { + mantexpa = minmin + jumpr r31 + } +/* =================================================================== * + fast2_QDOUBLE fast2_dsub(fast2_QDOUBLE a,fast2_QDOUBLE b) { + fast2_QDOUBLE c; + lint manta = a & MANTMASK; + int expa = Q6_R_sxth_R(a) ; + lint mantb = b & MANTMASK; + int expb = Q6_R_sxth_R(b) ; + int exp, expdiff, j, k; + lint mant; + + expdiff = (int) Q6_P_vabsdiffh_PP(a, b); + expdiff = Q6_R_sxth_R(expdiff) ; + if (expdiff > 63) { expdiff = 62;} + if (expa > expb) { + exp = expa + 1; + expa = 1; + expb = expdiff + 1; + } else { + exp = expb + 1; + expb = 1; + expa = expdiff + 1; + } + mant = (manta>>expa) - (mantb>>expb); + k = Q6_R_clb_P(mant)-1; + mant = (mant << k); + exp = exp - k; + if (mant == 0 || mant == -1) exp = 0x8001; + c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK); + return(c); + } + * ==================================================================== */ + .text + .global 
fast2_dsub_asm + .type fast2_dsub_asm, @function +fast2_dsub_asm: + +#define manta R0 +#define mantexpa R1:0 +#define lmanta R1:0 +#define mantb R2 +#define mantexpb R3:2 +#define lmantb R3:2 +#define expa R4 +#define expb R5 +#define mantexpd R7:6 +#define expd R6 +#define exp R8 +#define c63 R9 +#define lmant R1:0 +#define manth R1 +#define mantl R0 +#define minmin R11:10 // exactly 0x000000000000008001LL +#define minminl R10 +#define k R4 +#define ce P0 + .falign + { + mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL + c63 = #62 + expa = SXTH(manta) + expb = SXTH(mantb) + } { + expd = SXTH(expd) + ce = CMP.GT(expa, expb); + if ( ce.new) exp = add(expa, #1) + if (!ce.new) exp = add(expb, #1) + } { + if ( ce) expa = #1 + if (!ce) expb = #1 + manta.L = #0 + expd = MIN(expd, c63) + } { + if (!ce) expa = add(expd, #1) + if ( ce) expb = add(expd, #1) + mantb.L = #0 + minmin = #0 + } { + lmanta = ASR(lmanta, expa) + lmantb = ASR(lmantb, expb) + } { + lmant = sub(lmanta, lmantb) + minminl.L = #0x8001 + } { + k = clb(lmant) + c63 = #58 + } { + k = add(k, #-1) + p0 = cmp.gt(k, c63) + } { + mantexpa = ASL(lmant, k) + exp = SUB(exp, k) + if(p0) jump .Ldenorm + } { + manta = insert(exp, #16, #0) + jumpr r31 + } +.Ldenorm: + { + mantexpa = minmin + jumpr r31 + } +/* ==================================================================== * + fast2_QDOUBLE fast2_dmpy(fast2_QDOUBLE a,fast2_QDOUBLE b) { + fast2_QDOUBLE c; + lint manta = a & MANTMASK; + int expa = Q6_R_sxth_R(a) ; + lint mantb = b & MANTMASK; + int expb = Q6_R_sxth_R(b) ; + int exp, k; + lint mant; + int hia, hib, hi, lo; + unsigned int loa, lob; + + hia = (int)(a >> 32); + loa = Q6_R_extractu_RII((int)manta, 31, 1); + hib = (int)(b >> 32); + lob = Q6_R_extractu_RII((int)mantb, 31, 1); + + mant = Q6_P_mpy_RR(hia, lob); + mant = Q6_P_mpyacc_RR(mant,hib, loa); + mant = (mant >> 30) + (Q6_P_mpy_RR(hia, hib)<<1); + + hi = (int) (mant>>32); + + k = Q6_R_normamt_R(hi); + mant = mant << k; + exp = expa + expb - k; + if (mant == 0 || mant == -1) exp = 0x8001; + c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK); + return(c); + } + * ==================================================================== */ + .text + .global fast2_dmpy_asm + .type fast2_dmpy_asm, @function +fast2_dmpy_asm: + +#define mantal R0 +#define mantah R1 +#define mantexpa R1:0 +#define mantbl R2 +#define mantbh R3 +#define mantexpb R3:2 +#define expa R4 +#define expb R5 +#define c8001 R12 +#define mantexpd R7:6 +#define mantdh R7 +#define exp R8 +#define lmantc R11:10 +#define kb R9 +#define guard R11 +#define mantal_ R12 +#define mantbl_ R13 +#define min R15:14 +#define minh R15 + + .falign + { + mantbl_= lsr(mantbl, #16) + expb = sxth(mantbl) + expa = sxth(mantal) + mantal_= lsr(mantal, #16) + } + { + lmantc = mpy(mantah, mantbh) + mantexpd = mpy(mantah, mantbl_) + mantal.L = #0x0 + min = #0 + } + { + lmantc = add(lmantc, lmantc) + mantexpd+= mpy(mantbh, mantal_) + mantbl.L = #0x0 + minh.H = #0x8000 + } + { + mantexpd = asr(mantexpd, #15) + c8001.L = #0x8001 + p1 = cmp.eq(mantexpa, mantexpb) + } + { + mantexpd = add(mantexpd, lmantc) + exp = add(expa, expb) + p2 = cmp.eq(mantexpa, min) + } + { + kb = clb(mantexpd) + mantexpb = abs(mantexpd) + guard = #58 + } + { + p1 = and(p1, p2) + exp = sub(exp, kb) + kb = add(kb, #-1) + p0 = cmp.gt(kb, guard) + } + { + exp = add(exp, #1) + mantexpa = asl(mantexpd, kb) + if(p1) jump .Lsat //rarely happens + } + { + mantal = insert(exp,#16, #0) + if(!p0) jumpr r31 + } + { + mantal = insert(c8001,#16, #0) + jumpr r31 + } 
+.Lsat: + { + mantexpa = #-1 + } + { + mantexpa = lsr(mantexpa, #1) + } + { + mantal = insert(exp,#16, #0) + jumpr r31 + } + +/* ==================================================================== * + int fast2_qd2f(fast2_QDOUBLE a) { + int exp; + long long int manta; + int ic, rnd, mantb; + + manta = a>>32; + exp = Q6_R_sxth_R(a) ; + ic = 0x80000000 & manta; + manta = Q6_R_abs_R_sat(manta); + mantb = (manta + rnd)>>7; + rnd = 0x40 + exp = (exp + 126); + if((manta & 0xff) == rnd) rnd = 0x00; + if((manta & 0x7fffffc0) == 0x7fffffc0) { + manta = 0x0; exp++; + } else { + manta= mantb & 0x007fffff; + } + exp = (exp << 23) & 0x7fffffc0; + ic = Q6_R_addacc_RR(ic, exp, manta); + return (ic); + } + * ==================================================================== */ + + .text + .global fast2_qd2f_asm + .type fast2_qd2f_asm, @function +fast2_qd2f_asm: +#define mantah R1 +#define mantal R0 +#define cff R0 +#define mant R3 +#define expo R4 +#define rnd R5 +#define mask R6 +#define c07f R7 +#define c80 R0 +#define mantb R2 +#define ic R0 + + .falign + { + mant = abs(mantah):sat + expo = sxth(mantal) + rnd = #0x40 + mask.L = #0xffc0 + } + { + cff = extractu(mant, #8, #0) + p2 = cmp.gt(expo, #126) + p3 = cmp.ge(expo, #-126) + mask.H = #0x7fff + } + { + p1 = cmp.eq(cff,#0x40) + if(p1.new) rnd = #0 + expo = add(expo, #126) + if(!p3) jump .Lmin + } + { + p0 = bitsset(mant, mask) + c80.L = #0x0000 + mantb = add(mant, rnd) + c07f = lsr(mask, #8) + } + { + if(p0) expo = add(expo, #1) + if(p0) mant = #0 + mantb = lsr(mantb, #7) + c80.H = #0x8000 + } + { + ic = and(c80, mantah) + mask &= asl(expo, #23) + if(!p0) mant = and(mantb, c07f) + if(p2) jump .Lmax + } + { + ic += add(mask, mant) + jumpr r31 + } +.Lmax: + { + ic.L = #0xffff; + } + { + ic.H = #0x7f7f; + jumpr r31 + } +.Lmin: + { + ic = #0x0 + jumpr r31 + } + +/* ==================================================================== * +fast2_QDOUBLE fast2_f2qd(int ia) { + lint exp; + lint mant; + fast2_QDOUBLE c; + + mant = ((ia << 7) | 0x40000000)&0x7fffff80 ; + if (ia & 0x80000000) mant = -mant; + exp = ((ia >> 23) & 0xFFLL) - 126; + c = (mant<<32) | Q6_R_zxth_R(exp);; + return(c); +} + * ==================================================================== */ + .text + .global fast2_f2qd_asm + .type fast2_f2qd_asm, @function +fast2_f2qd_asm: +#define ia R0 +#define mag R3 +#define mantr R1 +#define expr R0 +#define zero R2 +#define maxneg R5:4 +#define maxnegl R4 + .falign + { + mantr = asl(ia, #7) + p0 = tstbit(ia, #31) + maxneg = #0 + mag = add(ia,ia) + } + { + mantr = setbit(mantr, #30) + expr= extractu(ia,#8,#23) + maxnegl.L = #0x8001 + p1 = cmp.eq(mag, #0) + } + { + mantr= extractu(mantr, #31, #0) + expr= add(expr, #-126) + zero = #0 + if(p1) jump .Lminqd + } + { + expr = zxth(expr) + if(p0) mantr= sub(zero, mantr) + jumpr r31 + } +.Lminqd: + { + R1:0 = maxneg + jumpr r31 + } diff --git a/contrib/compiler-rt/lib/builtins/hexagon/fastmath2_ldlib_asm.S b/contrib/compiler-rt/lib/builtins/hexagon/fastmath2_ldlib_asm.S new file mode 100644 index 000000000000..4192555351a6 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/fastmath2_ldlib_asm.S @@ -0,0 +1,345 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. 
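/* ==================================================================== *
   Editor's note (illustrative, not part of the imported source): the
   fast2_* entry points above appear to follow the standard Hexagon ABI
   for the registers they use, so a plausible C-level view -- prototypes
   inferred from the register usage, not taken from any header -- is:

   extern long long fast2_f2qd_asm(int float_bits);            // float  -> QDOUBLE
   extern int       fast2_qd2f_asm(long long q);               // QDOUBLE -> float
   extern long long fast2_dadd_asm(long long a, long long b);
   extern long long fast2_dmpy_asm(long long a, long long b);

   static float fast2_add_floats(float x, float y)
   {
       int xb, yb, rb;
       float r;
       __builtin_memcpy(&xb, &x, sizeof xb);    // raw float bits, as f2qd/qd2f expect
       __builtin_memcpy(&yb, &y, sizeof yb);
       rb = fast2_qd2f_asm(fast2_dadd_asm(fast2_f2qd_asm(xb),
                                          fast2_f2qd_asm(yb)));
       __builtin_memcpy(&r, &rb, sizeof r);
       return r;
   }
 * ==================================================================== */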
+// +//===----------------------------------------------------------------------===// +/* ==================================================================== * + +fast2_QLDOUBLE fast2_ldadd(fast2_QLDOUBLE a,fast2_QLDOUBLE b) { + fast2_QLDOUBLE c; + lint manta = a & MANTMASK; + int expa = Q6_R_sxth_R(a) ; + lint mantb = b & MANTMASK; + int expb = Q6_R_sxth_R(b) ; + int exp, expdiff, j, k, hi, lo, cn; + lint mant; + + expdiff = (int) Q6_P_vabsdiffh_PP(a, b); + expdiff = Q6_R_sxth_R(expdiff) ; + if (expdiff > 63) { expdiff = 62;} + if (expa > expb) { + exp = expa + 1; + expa = 1; + expb = expdiff + 1; + } else { + exp = expb + 1; + expb = 1; + expa = expdiff + 1; + } + mant = (manta>>expa) + (mantb>>expb); + + hi = (int) (mant>>32); + lo = (int) (mant); + + k = Q6_R_normamt_R(hi); + if(hi == 0 || hi == -1) k = 31+Q6_R_normamt_R(lo); + + mant = (mant << k); + cn = (mant == 0x8000000000000000LL); + exp = exp - k + cn; + + if (mant == 0 || mant == -1) exp = 0x8001; + c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK); + return(c); + } + * ==================================================================== */ + .text + .global fast2_ldadd_asm + .type fast2_ldadd_asm, @function +fast2_ldadd_asm: +#define manta R1:0 +#define lmanta R1:0 +#define mantb R3:2 +#define lmantb R3:2 +#define expa R4 +#define expb R5 +#define expd R6 +#define exp R8 +#define c63 R9 +#define lmant R1:0 +#define k R4 +#define ce P0 +#define zero R3:2 + .falign + { + expa = memw(r29+#8) + expb = memw(r29+#24) + r7 = r0 + } + { + expd = sub(expa, expb):sat + ce = CMP.GT(expa, expb); + if ( ce.new) exp = add(expa, #1) + if (!ce.new) exp = add(expb, #1) + } { + expd = abs(expd):sat + if ( ce) expa = #1 + if (!ce) expb = #1 + c63 = #62 + } { + expd = MIN(expd, c63) + manta = memd(r29+#0) + mantb = memd(r29+#16) + } { + if (!ce) expa = add(expd, #1) + if ( ce) expb = add(expd, #1) + } { + lmanta = ASR(lmanta, expa) + lmantb = ASR(lmantb, expb) + } { + lmant = add(lmanta, lmantb) + zero = #0 + } { + k = clb(lmant) + c63.L =#0x0001 + } { + exp -= add(k, #-1) //exp = exp - (k-1) + k = add(k, #-1) + p0 = cmp.gt(k, #58) + c63.H =#0x8000 + } { + if(!p0)memw(r7+#8) = exp + lmant = ASL(lmant, k) + if(p0) jump .Ldenorma + } { + memd(r7+#0) = lmant + jumpr r31 + } +.Ldenorma: + memd(r7+#0) = zero + { + memw(r7+#8) = c63 + jumpr r31 + } +/* =================================================================== * + fast2_QLDOUBLE fast2_ldsub(fast2_QLDOUBLE a,fast2_QLDOUBLE b) { + fast2_QLDOUBLE c; + lint manta = a & MANTMASK; + int expa = Q6_R_sxth_R(a) ; + lint mantb = b & MANTMASK; + int expb = Q6_R_sxth_R(b) ; + int exp, expdiff, j, k; + lint mant; + + expdiff = (int) Q6_P_vabsdiffh_PP(a, b); + expdiff = Q6_R_sxth_R(expdiff) ; + if (expdiff > 63) { expdiff = 62;} + if (expa > expb) { + exp = expa + 1; + expa = 1; + expb = expdiff + 1; + } else { + exp = expb + 1; + expb = 1; + expa = expdiff + 1; + } + mant = (manta>>expa) - (mantb>>expb); + k = Q6_R_clb_P(mant)-1; + mant = (mant << k); + exp = exp - k; + if (mant == 0 || mant == -1) exp = 0x8001; + c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK); + return(c); + } + * ==================================================================== */ + .text + .global fast2_ldsub_asm + .type fast2_ldsub_asm, @function +fast2_ldsub_asm: +#define manta R1:0 +#define lmanta R1:0 +#define mantb R3:2 +#define lmantb R3:2 +#define expa R4 +#define expb R5 +#define expd R6 +#define exp R8 +#define c63 R9 +#define lmant R1:0 +#define k R4 +#define ce P0 +#define zero R3:2 + .falign + { + expa = memw(r29+#8) 
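/* ==================================================================== *
   Editor's note (a guess, not part of the imported source): unlike the
   register-pair routines in fastmath2_dlib_asm.S, the fast2_ld* routines
   in this file read their operands from the stack (memd(r29+#0) and
   memw(r29+#8) for a, memd(r29+#16) and memw(r29+#24) for b) and store
   the result through the pointer passed in r0.  That is consistent with
   a by-value aggregate such as:

   typedef struct {
       long long mant;   // offset 0
       int       exp;    // offset 8, padded to 16 bytes total
   } fast2_QLDOUBLE;     // name taken from the reference code in this file

   extern fast2_QLDOUBLE fast2_ldadd_asm(fast2_QLDOUBLE a, fast2_QLDOUBLE b);

   The prototype is only inferred from the code; treat it as a sketch.
 * ==================================================================== */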
+ expb = memw(r29+#24) + r7 = r0 + } + { + expd = sub(expa, expb):sat + ce = CMP.GT(expa, expb); + if ( ce.new) exp = add(expa, #1) + if (!ce.new) exp = add(expb, #1) + } { + expd = abs(expd):sat + if ( ce) expa = #1 + if (!ce) expb = #1 + c63 = #62 + } { + expd = min(expd, c63) + manta = memd(r29+#0) + mantb = memd(r29+#16) + } { + if (!ce) expa = add(expd, #1) + if ( ce) expb = add(expd, #1) + } { + lmanta = ASR(lmanta, expa) + lmantb = ASR(lmantb, expb) + } { + lmant = sub(lmanta, lmantb) + zero = #0 + } { + k = clb(lmant) + c63.L =#0x0001 + } { + exp -= add(k, #-1) //exp = exp - (k+1) + k = add(k, #-1) + p0 = cmp.gt(k, #58) + c63.H =#0x8000 + } { + if(!p0)memw(r7+#8) = exp + lmant = asl(lmant, k) + if(p0) jump .Ldenorma_s + } { + memd(r7+#0) = lmant + jumpr r31 + } +.Ldenorma_s: + memd(r7+#0) = zero + { + memw(r7+#8) = c63 + jumpr r31 + } + +/* ==================================================================== * + fast2_QLDOUBLE fast2_ldmpy(fast2_QLDOUBLE a,fast2_QLDOUBLE b) { + fast2_QLDOUBLE c; + lint manta = a & MANTMASK; + int expa = Q6_R_sxth_R(a) ; + lint mantb = b & MANTMASK; + int expb = Q6_R_sxth_R(b) ; + int exp, k; + lint mant; + int hia, hib, hi, lo; + unsigned int loa, lob; + + hia = (int)(a >> 32); + loa = Q6_R_extractu_RII((int)manta, 31, 1); + hib = (int)(b >> 32); + lob = Q6_R_extractu_RII((int)mantb, 31, 1); + + mant = Q6_P_mpy_RR(hia, lob); + mant = Q6_P_mpyacc_RR(mant,hib, loa); + mant = (mant >> 30) + (Q6_P_mpy_RR(hia, hib)<<1); + + hi = (int) (mant>>32); + + k = Q6_R_normamt_R(hi); + mant = mant << k; + exp = expa + expb - k; + if (mant == 0 || mant == -1) exp = 0x8001; + c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK); + return(c); + } + * ==================================================================== */ + .text + .global fast2_ldmpy_asm + .type fast2_ldmpy_asm, @function +fast2_ldmpy_asm: + +#define mantxl_ R9 +#define mantxl R14 +#define mantxh R15 +#define mantx R15:14 +#define mantbl R2 +#define mantbl_ R8 +#define mantbh R3 +#define mantb R3:2 +#define expa R4 +#define expb R5 +#define c8001 R8 +#define mantd R7:6 +#define lmantc R11:10 +#define kp R9 +#define min R13:12 +#define minh R13 +#define max R13:12 +#define maxh R13 +#define ret R0 + + .falign + { + mantx = memd(r29+#0) + mantb = memd(r29+#16) + min = #0 + } + { + mantbl_= extractu(mantbl, #31, #1) + mantxl_= extractu(mantxl, #31, #1) + minh.H = #0x8000 + } + { + lmantc = mpy(mantxh, mantbh) + mantd = mpy(mantxh, mantbl_) + expa = memw(r29+#8) + expb = memw(r29+#24) + } + { + lmantc = add(lmantc, lmantc) + mantd += mpy(mantbh, mantxl_) + } + { + mantd = asr(mantd, #30) + c8001.L = #0x0001 + p1 = cmp.eq(mantx, mantb) + } + { + mantd = add(mantd, lmantc) + expa= add(expa, expb) + p2 = cmp.eq(mantb, min) + } + { + kp = clb(mantd) + c8001.H = #0x8000 + p1 = and(p1, p2) + } + { + expa-= add(kp, #-1) + kp = add(kp, #-1) + if(p1) jump .Lsat + } + { + mantd = asl(mantd, kp) + memw(ret+#8) = expa + p0 = cmp.gt(kp, #58) + if(p0.new) jump:NT .Ldenorm //rarely happens + } + { + memd(ret+#0) = mantd + jumpr r31 + } +.Lsat: + { + max = #0 + expa+= add(kp, #1) + } + { + maxh.H = #0x4000 + memw(ret+#8) = expa + } + { + memd(ret+#0) = max + jumpr r31 + } +.Ldenorm: + { + memw(ret+#8) = c8001 + mantx = #0 + } + { + memd(ret+#0) = mantx + jumpr r31 + } diff --git a/contrib/compiler-rt/lib/builtins/hexagon/fastmath_dlib_asm.S b/contrib/compiler-rt/lib/builtins/hexagon/fastmath_dlib_asm.S new file mode 100644 index 000000000000..215936b783ca --- /dev/null +++ 
b/contrib/compiler-rt/lib/builtins/hexagon/fastmath_dlib_asm.S @@ -0,0 +1,400 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/* ==================================================================== */ +/* FUNCTIONS Optimized double floating point operators */ +/* ==================================================================== */ +/* c = dadd_asm(a, b) */ +/* ==================================================================== + +QDOUBLE dadd(QDOUBLE a,QDOUBLE b) { + QDOUBLE c; + lint manta = a & MANTMASK; + int expa = HEXAGON_R_sxth_R(a) ; + lint mantb = b & MANTMASK; + int expb = HEXAGON_R_sxth_R(b) ; + int exp, expdiff, j, k, hi, lo, cn; + lint mant; + + expdiff = (int) HEXAGON_P_vabsdiffh_PP(a, b); + expdiff = HEXAGON_R_sxth_R(expdiff) ; + if (expdiff > 63) { expdiff = 62;} + if (expa > expb) { + exp = expa + 1; + expa = 1; + expb = expdiff + 1; + } else { + exp = expb + 1; + expb = 1; + expa = expdiff + 1; + } + mant = (manta>>expa) + (mantb>>expb); + + hi = (int) (mant>>32); + lo = (int) (mant); + + k = HEXAGON_R_normamt_R(hi); + if(hi == 0 || hi == -1) k = 31+HEXAGON_R_normamt_R(lo); + + mant = (mant << k); + cn = (mant == 0x8000000000000000LL); + exp = exp - k + cn; + + if (mant == 0 || mant == -1) exp = 0x8001; + c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK); + return(c); + } + * ==================================================================== */ + .text + .global dadd_asm + .type dadd_asm, @function +dadd_asm: + +#define manta R0 +#define mantexpa R1:0 +#define lmanta R1:0 +#define mantb R2 +#define mantexpb R3:2 +#define lmantb R3:2 +#define expa R4 +#define expb R5 +#define mantexpd R7:6 +#define expd R6 +#define exp R8 +#define c63 R9 +#define lmant R1:0 +#define manth R1 +#define mantl R0 +#define zero R7:6 +#define zerol R6 +#define minus R3:2 +#define minusl R2 +#define maxneg R9 +#define minmin R11:10 // exactly 0x800000000000000000LL +#define minminh R11 +#define k R4 +#define kl R5 +#define ce P0 + .falign + { + mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL + c63 = #62 + expa = SXTH(manta) + expb = SXTH(mantb) + } { + expd = SXTH(expd) + ce = CMP.GT(expa, expb); + if ( ce.new) exp = add(expa, #1) + if (!ce.new) exp = add(expb, #1) + } { + if ( ce) expa = #1 + if (!ce) expb = #1 + manta.L = #0 + expd = MIN(expd, c63) + } { + if (!ce) expa = add(expd, #1) + if ( ce) expb = add(expd, #1) + mantb.L = #0 + zero = #0 + } { + lmanta = ASR(lmanta, expa) + lmantb = ASR(lmantb, expb) + minmin = #0 + } { + lmant = add(lmanta, lmantb) + minus = #-1 + minminh.H = #0x8000 + } { + k = NORMAMT(manth) + kl = NORMAMT(mantl) + p0 = cmp.eq(manth, zerol) + p1 = cmp.eq(manth, minusl) + } { + p0 = OR(p0, p1) + if(p0.new) k = add(kl, #31) + maxneg.H = #0 + } { + mantexpa = ASL(lmant, k) + exp = SUB(exp, k) + maxneg.L = #0x8001 + } { + p0 = cmp.eq(mantexpa, zero) + p1 = cmp.eq(mantexpa, minus) + manta.L = #0 + exp = ZXTH(exp) + } { + p2 = cmp.eq(mantexpa, minmin) //is result 0x80....0 + if(p2.new) exp = add(exp, #1) + } +#if (__HEXAGON_ARCH__ == 60) + { + p0 = OR(p0, p1) + if( p0.new) manta = OR(manta,maxneg) + if(!p0.new) manta = OR(manta,exp) + } + jumpr r31 +#else + { + p0 = OR(p0, p1) + if( p0.new) manta = OR(manta,maxneg) + if(!p0.new) manta = OR(manta,exp) + 
jumpr r31 + } +#endif +/* =================================================================== * + QDOUBLE dsub(QDOUBLE a,QDOUBLE b) { + QDOUBLE c; + lint manta = a & MANTMASK; + int expa = HEXAGON_R_sxth_R(a) ; + lint mantb = b & MANTMASK; + int expb = HEXAGON_R_sxth_R(b) ; + int exp, expdiff, j, k, hi, lo, cn; + lint mant; + + expdiff = (int) HEXAGON_P_vabsdiffh_PP(a, b); + expdiff = HEXAGON_R_sxth_R(expdiff) ; + if (expdiff > 63) { expdiff = 62;} + if (expa > expb) { + exp = expa + 1; + expa = 1; + expb = expdiff + 1; + } else { + exp = expb + 1; + expb = 1; + expa = expdiff + 1; + } + mant = (manta>>expa) - (mantb>>expb); + + hi = (int) (mant>>32); + lo = (int) (mant); + + k = HEXAGON_R_normamt_R(hi); + if(hi == 0 || hi == -1) k = 31+HEXAGON_R_normamt_R(lo); + + mant = (mant << k); + cn = (mant == 0x8000000000000000LL); + exp = exp - k + cn; + + if (mant == 0 || mant == -1) exp = 0x8001; + c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK); + return(c); + } + * ==================================================================== */ + .text + .global dsub_asm + .type dsub_asm, @function +dsub_asm: + +#define manta R0 +#define mantexpa R1:0 +#define lmanta R1:0 +#define mantb R2 +#define mantexpb R3:2 +#define lmantb R3:2 +#define expa R4 +#define expb R5 +#define mantexpd R7:6 +#define expd R6 +#define exp R8 +#define c63 R9 +#define lmant R1:0 +#define manth R1 +#define mantl R0 +#define zero R7:6 +#define zerol R6 +#define minus R3:2 +#define minusl R2 +#define maxneg R9 +#define minmin R11:10 // exactly 0x800000000000000000LL +#define minminh R11 +#define k R4 +#define kl R5 +#define ce P0 + .falign + { + mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL + c63 = #62 + expa = SXTH(manta) + expb = SXTH(mantb) + } { + expd = SXTH(expd) + ce = CMP.GT(expa, expb); + if ( ce.new) exp = add(expa, #1) + if (!ce.new) exp = add(expb, #1) + } { + if ( ce) expa = #1 + if (!ce) expb = #1 + manta.L = #0 + expd = MIN(expd, c63) + } { + if (!ce) expa = add(expd, #1) + if ( ce) expb = add(expd, #1) + mantb.L = #0 + zero = #0 + } { + lmanta = ASR(lmanta, expa) + lmantb = ASR(lmantb, expb) + minmin = #0 + } { + lmant = sub(lmanta, lmantb) + minus = #-1 + minminh.H = #0x8000 + } { + k = NORMAMT(manth) + kl = NORMAMT(mantl) + p0 = cmp.eq(manth, zerol) + p1 = cmp.eq(manth, minusl) + } { + p0 = OR(p0, p1) + if(p0.new) k = add(kl, #31) + maxneg.H = #0 + } { + mantexpa = ASL(lmant, k) + exp = SUB(exp, k) + maxneg.L = #0x8001 + } { + p0 = cmp.eq(mantexpa, zero) + p1 = cmp.eq(mantexpa, minus) + manta.L = #0 + exp = ZXTH(exp) + } { + p2 = cmp.eq(mantexpa, minmin) //is result 0x80....0 + if(p2.new) exp = add(exp, #1) + } +#if (__HEXAGON_ARCH__ == 60) + { + p0 = OR(p0, p1) + if( p0.new) manta = OR(manta,maxneg) + if(!p0.new) manta = OR(manta,exp) + } + jumpr r31 +#else + { + p0 = OR(p0, p1) + if( p0.new) manta = OR(manta,maxneg) + if(!p0.new) manta = OR(manta,exp) + jumpr r31 + } +#endif +/* ==================================================================== * + QDOUBLE dmpy(QDOUBLE a,QDOUBLE b) { + QDOUBLE c; + lint manta = a & MANTMASK; + int expa = HEXAGON_R_sxth_R(a) ; + lint mantb = b & MANTMASK; + int expb = HEXAGON_R_sxth_R(b) ; + int exp, k; + lint mant; + int hia, hib, hi, lo; + unsigned int loa, lob; + + hia = (int)(a >> 32); + loa = HEXAGON_R_extractu_RII((int)manta, 31, 1); + hib = (int)(b >> 32); + lob = HEXAGON_R_extractu_RII((int)mantb, 31, 1); + + mant = HEXAGON_P_mpy_RR(hia, lob); + mant = HEXAGON_P_mpyacc_RR(mant,hib, loa); + mant = (mant >> 30) + (HEXAGON_P_mpy_RR(hia, 
hib)<<1); + + hi = (int) (mant>>32); + lo = (int) (mant); + + k = HEXAGON_R_normamt_R(hi); + if(hi == 0 || hi == -1) k = 31+HEXAGON_R_normamt_R(lo); + mant = mant << k; + exp = expa + expb - k; + if (mant == 0 || mant == -1) exp = 0x8001; + c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK); + return(c); + } + * ==================================================================== */ + .text + .global dmpy_asm + .type dmpy_asm, @function +dmpy_asm: + +#define mantal R0 +#define mantah R1 +#define mantexpa R1:0 +#define mantbl R2 +#define mantbh R3 +#define mantexpb R3:2 +#define expa R4 +#define expb R5 +#define mantexpd R7:6 +#define exp R8 +#define lmantc R11:10 +#define mantch R11 +#define mantcl R10 +#define zero0 R7:6 +#define zero0l R6 +#define minus1 R3:2 +#define minus1l R2 +#define maxneg R9 +#define k R4 +#define kl R5 + + .falign + { + mantbl = lsr(mantbl, #16) + mantal = lsr(mantal, #16) + expa = sxth(mantal) + expb = sxth(mantbl) + } + { + lmantc = mpy(mantah, mantbh) + mantexpd = mpy(mantah, mantbl) + } + { + lmantc = add(lmantc, lmantc) //<<1 + mantexpd+= mpy(mantbh, mantal) + } + { + lmantc += asr(mantexpd, #15) + exp = add(expa, expb) + zero0 = #0 + minus1 = #-1 + } + { + k = normamt(mantch) + kl = normamt(mantcl) + p0 = cmp.eq(mantch, zero0l) + p1 = cmp.eq(mantch, minus1l) + } + { + p0 = or(p0, p1) + if(p0.new) k = add(kl, #31) + maxneg.H = #0 + } + { + mantexpa = asl(lmantc, k) + exp = sub(exp, k) + maxneg.L = #0x8001 + } + { + p0 = cmp.eq(mantexpa, zero0) + p1 = cmp.eq(mantexpa, minus1) + mantal.L = #0 + exp = zxth(exp) + } +#if (__HEXAGON_ARCH__ == 60) + { + p0 = or(p0, p1) + if( p0.new) mantal = or(mantal,maxneg) + if(!p0.new) mantal = or(mantal,exp) + } + jumpr r31 +#else + { + p0 = or(p0, p1) + if( p0.new) mantal = or(mantal,maxneg) + if(!p0.new) mantal = or(mantal,exp) + jumpr r31 + } +#endif diff --git a/contrib/compiler-rt/lib/builtins/hexagon/fma_opt.S b/contrib/compiler-rt/lib/builtins/hexagon/fma_opt.S new file mode 100644 index 000000000000..12378f0da04e --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/fma_opt.S @@ -0,0 +1,31 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +.macro FUNCTION_BEGIN name +.text +.p2align 5 +.globl \name +.type \name, @function +\name: +.endm + +.macro FUNCTION_END name +.size \name, . - \name +.endm + +FUNCTION_BEGIN fmaf + r2 += sfmpy(r0, r1) + { + r0 = r2 + jumpr r31 + } +FUNCTION_END fmaf + + .globl fmal + .set fmal, fma diff --git a/contrib/compiler-rt/lib/builtins/hexagon/fmax_opt.S b/contrib/compiler-rt/lib/builtins/hexagon/fmax_opt.S new file mode 100644 index 000000000000..f3a218c9769b --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/fmax_opt.S @@ -0,0 +1,30 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +.macro FUNCTION_BEGIN name +.text +.p2align 5 +.globl \name +.type \name, @function +\name: +.endm + +.macro FUNCTION_END name +.size \name, . 
- \name +.endm + +FUNCTION_BEGIN fmaxf + { + r0 = sfmax(r0, r1) + jumpr r31 + } +FUNCTION_END fmaxf + + .globl fmaxl + .set fmaxl, fmax diff --git a/contrib/compiler-rt/lib/builtins/hexagon/fmin_opt.S b/contrib/compiler-rt/lib/builtins/hexagon/fmin_opt.S new file mode 100644 index 000000000000..ef9b0ff854a2 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/fmin_opt.S @@ -0,0 +1,30 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +.macro FUNCTION_BEGIN name +.text +.p2align 5 +.globl \name +.type \name, @function +\name: +.endm + +.macro FUNCTION_END name +.size \name, . - \name +.endm + +FUNCTION_BEGIN fminf + { + r0 = sfmin(r0, r1) + jumpr r31 + } +FUNCTION_END fminf + + .globl fminl + .set fminl, fmin diff --git a/contrib/compiler-rt/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S b/contrib/compiler-rt/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S new file mode 100644 index 000000000000..fbe09086cd33 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S @@ -0,0 +1,125 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// An optimized version of a memcpy which is equivalent to the following loop: +// +// volatile unsigned *dest; +// unsigned *src; +// +// for (i = 0; i < num_words; ++i) +// *dest++ = *src++; +// +// The corresponding C prototype for this function would be +// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest, +// const unsigned *src, +// unsigned num_words); +// +// *** Both dest and src must be aligned to 32-bit boundaries. *** +// The code does not perform any runtime checks for this, and will fail +// in bad ways if this requirement is not met. +// +// The "forward" in the name refers to the fact that the function copies +// the words going forward in memory. It is incorrect to use this function +// for cases where the original code copied words in any other order. +// +// *** This function is only for the use by the compiler. *** +// The only indended use is for the LLVM compiler to generate calls to +// this function, when a mem-copy loop, like the one above, is detected. + + .text + +// Inputs: +// r0: dest +// r1: src +// r2: num_words + + .globl hexagon_memcpy_forward_vp4cp4n2 + .balign 32 + .type hexagon_memcpy_forward_vp4cp4n2,@function +hexagon_memcpy_forward_vp4cp4n2: + + // Compute r3 to be the number of words remaining in the current page. + // At the same time, compute r4 to be the number of 32-byte blocks + // remaining in the page (for prefetch). + { + r3 = sub(##4096, r1) + r5 = lsr(r2, #3) + } + { + // The word count before end-of-page is in the 12 lowest bits of r3. + // (If the address in r1 was already page-aligned, the bits are 0.) 
+ r3 = extractu(r3, #10, #2) + r4 = extractu(r3, #7, #5) + } + { + r3 = minu(r2, r3) + r4 = minu(r5, r4) + } + { + r4 = or(r4, ##2105344) // 2105344 = 0x202000 + p0 = cmp.eq(r3, #0) + if (p0.new) jump:nt .Lskipprolog + } + l2fetch(r1, r4) + { + loop0(.Lprolog, r3) + r2 = sub(r2, r3) // r2 = number of words left after the prolog. + } + .falign +.Lprolog: + { + r4 = memw(r1++#4) + memw(r0++#4) = r4.new + } :endloop0 +.Lskipprolog: + { + // Let r3 = number of whole pages left (page = 1024 words). + r3 = lsr(r2, #10) + if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain + } + { + loop1(.Lout, r3) + r2 = extractu(r2, #10, #0) // r2 = r2 & 1023 + r3 = ##2105472 // r3 = 0x202080 (prefetch info) + } + // Iterate over pages. + .falign +.Lout: + // Prefetch each individual page. + l2fetch(r1, r3) + loop0(.Lpage, #512) + .falign +.Lpage: + r5:4 = memd(r1++#8) + { + memw(r0++#8) = r4 + memw(r0+#4) = r5 + } :endloop0:endloop1 +.Lskipmain: + { + r3 = ##2105344 // r3 = 0x202000 (prefetch info) + r4 = lsr(r2, #3) // r4 = number of 32-byte blocks remaining. + p0 = cmp.eq(r2, #0) + if (p0.new) jumpr:nt r31 + } + { + r3 = or(r3, r4) + loop0(.Lepilog, r2) + } + l2fetch(r1, r3) + .falign +.Lepilog: + { + r4 = memw(r1++#4) + memw(r0++#4) = r4.new + } :endloop0 + + jumpr r31 + +.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2 diff --git a/contrib/compiler-rt/lib/builtins/hexagon/memcpy_likely_aligned.S b/contrib/compiler-rt/lib/builtins/hexagon/memcpy_likely_aligned.S new file mode 100644 index 000000000000..bbc85c22db08 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/memcpy_likely_aligned.S @@ -0,0 +1,64 @@ +//===------------------------- memcopy routines ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . - \name + .endm + +FUNCTION_BEGIN __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes + { + p0 = bitsclr(r1,#7) + p0 = bitsclr(r0,#7) + if (p0.new) r5:4 = memd(r1) + r3 = #-3 + } + { + if (!p0) jump .Lmemcpy_call + if (p0) memd(r0++#8) = r5:4 + if (p0) r5:4 = memd(r1+#8) + r3 += lsr(r2,#3) + } + { + memd(r0++#8) = r5:4 + r5:4 = memd(r1+#16) + r1 = add(r1,#24) + loop0(1f,r3) + } + .falign +1: + { + memd(r0++#8) = r5:4 + r5:4 = memd(r1++#8) + }:endloop0 + { + memd(r0) = r5:4 + r0 -= add(r2,#-8) + jumpr r31 + } +FUNCTION_END __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes + +.Lmemcpy_call: +#ifdef __PIC__ + jump memcpy@PLT +#else + jump memcpy +#endif + + .globl __qdsp_memcpy_likely_aligned_min32bytes_mult8bytes + .set __qdsp_memcpy_likely_aligned_min32bytes_mult8bytes, \ + __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes diff --git a/contrib/compiler-rt/lib/builtins/hexagon/moddi3.S b/contrib/compiler-rt/lib/builtins/hexagon/moddi3.S new file mode 100644 index 000000000000..12a0595fe465 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/moddi3.S @@ -0,0 +1,83 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. 
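/* ==================================================================== *
   Editor's note (illustrative, not part of the imported source): the
   __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes helper above
   re-checks the 8-byte alignment of both pointers itself and falls back
   to plain memcpy when the check fails; the length constraints are
   implied only by its name.  A C-level model of the call a compiler might
   emit (prototype inferred from the register usage):

   #include <string.h>

   extern void __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes(
       void *dst, const void *src, unsigned bytes);

   static void copy_blob(void *dst, const void *src, unsigned bytes)
   {
       // only profitable when bytes >= 32 and bytes is a multiple of 8;
       // alignment is verified inside the routine itself
       if (bytes >= 32 && (bytes % 8) == 0)
           __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes(dst, src, bytes);
       else
           memcpy(dst, src, bytes);
   }
 * ==================================================================== */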
+// +//===----------------------------------------------------------------------===// + + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . - \name + .endm + + +FUNCTION_BEGIN __hexagon_moddi3 + { + p3 = tstbit(r1,#31) + } + { + r1:0 = abs(r1:0) + r3:2 = abs(r3:2) + } + { + r6 = cl0(r1:0) // count leading 0's of dividend (numerator) + r7 = cl0(r3:2) // count leading 0's of divisor (denominator) + r5:4 = r3:2 // divisor moved into working registers + r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder + } + { + r10 = sub(r7,r6) // left shift count for bit & divisor + r1:0 = #0 // initialize quotient to 0 + r15:14 = #1 // initialize bit to 1 + } + { + r11 = add(r10,#1) // loop count is 1 more than shift count + r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb + r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor + } + { + p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend + loop0(1f,r11) // register loop + } + { + if (p0) jump .hexagon_moddi3_return // if divisor > dividend, we're done, so return + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder + } + { + r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder + r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8) + } + { + r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8) + r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6) + } + { + r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration + r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration + }:endloop0 + +.hexagon_moddi3_return: + { + r1:0 = neg(r3:2) + } + { + r1:0 = vmux(p3,r1:0,r3:2) + jumpr r31 + } +FUNCTION_END __hexagon_moddi3 + + .globl __qdsp_moddi3 + .set __qdsp_moddi3, __hexagon_moddi3 diff --git a/contrib/compiler-rt/lib/builtins/hexagon/modsi3.S b/contrib/compiler-rt/lib/builtins/hexagon/modsi3.S new file mode 100644 index 000000000000..5afda9e2978b --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/modsi3.S @@ -0,0 +1,66 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + + + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . 
- \name + .endm + + +FUNCTION_BEGIN __hexagon_modsi3 + { + p2 = cmp.ge(r0,#0) + r2 = abs(r0) + r1 = abs(r1) + } + { + r3 = cl0(r2) + r4 = cl0(r1) + p0 = cmp.gtu(r1,r2) + } + { + r3 = sub(r4,r3) + if (p0) jumpr r31 + } + { + p1 = cmp.eq(r3,#0) + loop0(1f,r3) + r0 = r2 + r2 = lsl(r1,r3) + } + .falign +1: + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r2) + r2 = lsr(r2,#1) + if (p1) r1 = #0 + }:endloop0 + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r1) + if (p2) jumpr r31 + } + { + r0 = neg(r0) + jumpr r31 + } +FUNCTION_END __hexagon_modsi3 + + .globl __qdsp_modsi3 + .set __qdsp_modsi3, __hexagon_modsi3 diff --git a/contrib/compiler-rt/lib/builtins/hexagon/sfdiv_opt.S b/contrib/compiler-rt/lib/builtins/hexagon/sfdiv_opt.S new file mode 100644 index 000000000000..6bdd4808c2b8 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/sfdiv_opt.S @@ -0,0 +1,66 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . - \name + .endm + +#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG +#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG +#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG + +FUNCTION_BEGIN __hexagon_divsf3 + { + r2,p0 = sfrecipa(r0,r1) + r4 = sffixupd(r0,r1) + r3 = ##0x3f800000 // 1.0 + } + { + r5 = sffixupn(r0,r1) + r3 -= sfmpy(r4,r2):lib // 1-(den/recip) yields error? + r6 = ##0x80000000 + r7 = r3 + } + { + r2 += sfmpy(r3,r2):lib + r3 = r7 + r6 = r5 + r0 = and(r6,r5) + } + { + r3 -= sfmpy(r4,r2):lib + r0 += sfmpy(r5,r2):lib + } + { + r2 += sfmpy(r3,r2):lib + r6 -= sfmpy(r0,r4):lib + } + { + r0 += sfmpy(r6,r2):lib + } + { + r5 -= sfmpy(r0,r4):lib + } + { + r0 += sfmpy(r5,r2,p0):scale + jumpr r31 + } +FUNCTION_END __hexagon_divsf3 + +Q6_ALIAS(divsf3) +FAST_ALIAS(divsf3) +FAST2_ALIAS(divsf3) diff --git a/contrib/compiler-rt/lib/builtins/hexagon/sfsqrt_opt.S b/contrib/compiler-rt/lib/builtins/hexagon/sfsqrt_opt.S new file mode 100644 index 000000000000..7f619002774f --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/sfsqrt_opt.S @@ -0,0 +1,82 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . 
- \name + .endm + +#define RIN r0 +#define S r0 +#define H r1 +#define D r2 +#define E r3 +#define HALF r4 +#define R r5 + +#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG +#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG +#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG + +FUNCTION_BEGIN __hexagon_sqrtf + { + E,p0 = sfinvsqrta(RIN) + R = sffixupr(RIN) + HALF = ##0x3f000000 // 0.5 + r1:0 = combine(#0,#0) // clear S/H + } + { + S += sfmpy(E,R):lib // S0 + H += sfmpy(E,HALF):lib // H0 + D = HALF + E = R + } + { + D -= sfmpy(S,H):lib // d0 + p1 = sfclass(R,#1) // is zero? + //E -= sfmpy(S,S):lib // e0 + } + { + S += sfmpy(S,D):lib // S1 + H += sfmpy(H,D):lib // H1 + D = HALF + E = R + } + { + D -= sfmpy(S,H):lib // d0 + E -= sfmpy(S,S):lib // e0 + } + { + S += sfmpy(H,E):lib // S2 + H += sfmpy(H,D):lib // H2 + D = HALF + E = R + } + { + //D -= sfmpy(S,H):lib // d2 + E -= sfmpy(S,S):lib // e2 + if (p1) r0 = or(r0,R) // sqrt(-0.0) = -0.0 + } + { + S += sfmpy(H,E,p0):scale // S3 + jumpr r31 + } + +FUNCTION_END __hexagon_sqrtf + +Q6_ALIAS(sqrtf) +FAST_ALIAS(sqrtf) +FAST2_ALIAS(sqrtf) diff --git a/contrib/compiler-rt/lib/builtins/hexagon/udivdi3.S b/contrib/compiler-rt/lib/builtins/hexagon/udivdi3.S new file mode 100644 index 000000000000..1ca326b75208 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/udivdi3.S @@ -0,0 +1,71 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . 
- \name + .endm + + +FUNCTION_BEGIN __hexagon_udivdi3 + { + r6 = cl0(r1:0) // count leading 0's of dividend (numerator) + r7 = cl0(r3:2) // count leading 0's of divisor (denominator) + r5:4 = r3:2 // divisor moved into working registers + r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder + } + { + r10 = sub(r7,r6) // left shift count for bit & divisor + r1:0 = #0 // initialize quotient to 0 + r15:14 = #1 // initialize bit to 1 + } + { + r11 = add(r10,#1) // loop count is 1 more than shift count + r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb + r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor + } + { + p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend + loop0(1f,r11) // register loop + } + { + if (p0) jumpr r31 // if divisor > dividend, we're done, so return + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder + } + { + r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder + r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8) + } + { + r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8) + r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6) + } + { + r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration + r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration + }:endloop0 + { + jumpr r31 // return + } +FUNCTION_END __hexagon_udivdi3 + + .globl __qdsp_udivdi3 + .set __qdsp_udivdi3, __hexagon_udivdi3 diff --git a/contrib/compiler-rt/lib/builtins/hexagon/udivmoddi4.S b/contrib/compiler-rt/lib/builtins/hexagon/udivmoddi4.S new file mode 100644 index 000000000000..deb5aae0924d --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/udivmoddi4.S @@ -0,0 +1,71 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . 
- \name + .endm + + +FUNCTION_BEGIN __hexagon_udivmoddi4 + { + r6 = cl0(r1:0) // count leading 0's of dividend (numerator) + r7 = cl0(r3:2) // count leading 0's of divisor (denominator) + r5:4 = r3:2 // divisor moved into working registers + r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder + } + { + r10 = sub(r7,r6) // left shift count for bit & divisor + r1:0 = #0 // initialize quotient to 0 + r15:14 = #1 // initialize bit to 1 + } + { + r11 = add(r10,#1) // loop count is 1 more than shift count + r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb + r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor + } + { + p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend + loop0(1f,r11) // register loop + } + { + if (p0) jumpr r31 // if divisor > dividend, we're done, so return + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder + } + { + r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder + r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8) + } + { + r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8) + r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6) + } + { + r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration + r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration + }:endloop0 + { + jumpr r31 // return + } +FUNCTION_END __hexagon_udivmoddi4 + + .globl __qdsp_udivmoddi4 + .set __qdsp_udivmoddi4, __hexagon_udivmoddi4 diff --git a/contrib/compiler-rt/lib/builtins/hexagon/udivmodsi4.S b/contrib/compiler-rt/lib/builtins/hexagon/udivmodsi4.S new file mode 100644 index 000000000000..25bbe7cd5925 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/udivmodsi4.S @@ -0,0 +1,60 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . - \name + .endm + + +FUNCTION_BEGIN __hexagon_udivmodsi4 + { + r2 = cl0(r0) + r3 = cl0(r1) + r5:4 = combine(#1,#0) + p0 = cmp.gtu(r1,r0) + } + { + r6 = sub(r3,r2) + r4 = r1 + r1:0 = combine(r0,r4) + if (p0) jumpr r31 + } + { + r3:2 = vlslw(r5:4,r6) + loop0(1f,r6) + p0 = cmp.eq(r6,#0) + if (p0.new) r4 = #0 + } + .falign +1: + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r1 = sub(r1,r2) + if (!p0.new) r0 = add(r0,r3) + r3:2 = vlsrw(r3:2,#1) + }:endloop0 + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r1 = sub(r1,r4) + if (!p0.new) r0 = add(r0,r3) + jumpr r31 + } +FUNCTION_END __hexagon_udivmodsi4 + + .globl __qdsp_udivmodsi4 + .set __qdsp_udivmodsi4, __hexagon_udivmodsi4 diff --git a/contrib/compiler-rt/lib/builtins/hexagon/udivsi3.S b/contrib/compiler-rt/lib/builtins/hexagon/udivsi3.S new file mode 100644 index 000000000000..54f0aa409f93 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/udivsi3.S @@ -0,0 +1,56 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . - \name + .endm + + +FUNCTION_BEGIN __hexagon_udivsi3 + { + r2 = cl0(r0) + r3 = cl0(r1) + r5:4 = combine(#1,#0) + p0 = cmp.gtu(r1,r0) + } + { + r6 = sub(r3,r2) + r4 = r1 + r1:0 = combine(r0,r4) + if (p0) jumpr r31 + } + { + r3:2 = vlslw(r5:4,r6) + loop0(1f,r6) + } + .falign +1: + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r1 = sub(r1,r2) + if (!p0.new) r0 = add(r0,r3) + r3:2 = vlsrw(r3:2,#1) + }:endloop0 + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r0 = add(r0,r3) + jumpr r31 + } +FUNCTION_END __hexagon_udivsi3 + + .globl __qdsp_udivsi3 + .set __qdsp_udivsi3, __hexagon_udivsi3 diff --git a/contrib/compiler-rt/lib/builtins/hexagon/umoddi3.S b/contrib/compiler-rt/lib/builtins/hexagon/umoddi3.S new file mode 100644 index 000000000000..f091521414af --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/umoddi3.S @@ -0,0 +1,74 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . - \name + .endm + + +FUNCTION_BEGIN __hexagon_umoddi3 + { + r6 = cl0(r1:0) // count leading 0's of dividend (numerator) + r7 = cl0(r3:2) // count leading 0's of divisor (denominator) + r5:4 = r3:2 // divisor moved into working registers + r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder + } + { + r10 = sub(r7,r6) // left shift count for bit & divisor + r1:0 = #0 // initialize quotient to 0 + r15:14 = #1 // initialize bit to 1 + } + { + r11 = add(r10,#1) // loop count is 1 more than shift count + r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb + r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor + } + { + p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend + loop0(1f,r11) // register loop + } + { + if (p0) jump .hexagon_umoddi3_return // if divisor > dividend, we're done, so return + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder + } + { + r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder + r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8) + } + { + r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8) + r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6) + } + { + r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration + r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration + }:endloop0 + +.hexagon_umoddi3_return: + { + r1:0 = r3:2 + jumpr r31 + } +FUNCTION_END __hexagon_umoddi3 + + .globl __qdsp_umoddi3 + .set __qdsp_umoddi3, __hexagon_umoddi3 diff --git a/contrib/compiler-rt/lib/builtins/hexagon/umodsi3.S b/contrib/compiler-rt/lib/builtins/hexagon/umodsi3.S new file mode 100644 index 000000000000..a8270c2030d5 --- /dev/null +++ b/contrib/compiler-rt/lib/builtins/hexagon/umodsi3.S @@ -0,0 +1,55 @@ 
+//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . - \name + .endm + + +FUNCTION_BEGIN __hexagon_umodsi3 + { + r2 = cl0(r0) + r3 = cl0(r1) + p0 = cmp.gtu(r1,r0) + } + { + r2 = sub(r3,r2) + if (p0) jumpr r31 + } + { + loop0(1f,r2) + p1 = cmp.eq(r2,#0) + r2 = lsl(r1,r2) + } + .falign +1: + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r2) + r2 = lsr(r2,#1) + if (p1) r1 = #0 + }:endloop0 + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r1) + jumpr r31 + } +FUNCTION_END __hexagon_umodsi3 + + .globl __qdsp_umodsi3 + .set __qdsp_umodsi3, __hexagon_umodsi3 |
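The unsigned integer routines in this patch (__hexagon_udivdi3, __hexagon_udivmoddi4, __hexagon_umoddi3 and their 32-bit counterparts) all run the same shift-and-subtract (restoring) division loop that their comments walk through: align the divisor's most significant bit with the dividend's, then try one quotient bit per iteration. The C sketch below mirrors that flow for the 64-bit case; the function and variable names are illustrative only, __builtin_clzll (a GCC/Clang builtin) stands in for the cl0 instruction, and none of the packet-level machinery (loop0 hardware loops, vmux selects, quotient in r1:0 with the remainder left in r3:2) is modeled.

#include <stdint.h>

// Illustrative only -- not the compiler-rt entry point.  A plain-C mirror of
// the shift-and-subtract loop used by __hexagon_udivdi3 / __hexagon_udivmoddi4
// / __hexagon_umoddi3.  den == 0 is undefined, as it is for the builtins.
static uint64_t udivmod64_sketch(uint64_t num, uint64_t den, uint64_t *rem) {
    uint64_t quo = 0;

    if (den > num) {                      // divisor > dividend: quotient is 0
        if (rem) *rem = num;
        return 0;
    }

    // Align the divisor's MSB with the dividend's MSB (the cl0/sub/lsl packets).
    int shift = __builtin_clzll(den) - __builtin_clzll(num);
    uint64_t bit = 1ULL << shift;         // quotient bit currently being tried
    den <<= shift;

    // One trial subtraction per bit position, like loop0(1f, shift + 1).
    for (int i = 0; i <= shift; ++i) {
        if (den <= num) {                 // the cmp.gtu test, inverted
            num -= den;                   // accept: subtract the shifted divisor
            quo += bit;                   // and record this quotient bit
        }
        den >>= 1;
        bit >>= 1;
    }

    if (rem) *rem = num;                  // whatever is left is the remainder
    return quo;
}

For example, udivmod64_sketch(100, 7, &r) returns 14 with r == 2. At most shift + 1 trial subtractions are ever needed, which is why the assembly sets its loop count to one more than the shift amount.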
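__hexagon_modsi3 appears to reduce the signed case to that same unsigned loop: it takes absolute values, runs the shift-and-subtract iteration on them, and then gives the remainder the sign of the dividend (the p2 predicate and the final neg). A sketch under those assumptions, with illustrative names (modsi3_sketch is not an ABI entry point):

#include <stdint.h>

// Illustrative only.  Shows how __hexagon_modsi3 appears to get a signed
// remainder from the unsigned restoring loop.  b == 0 is undefined for the
// real builtin; this sketch just falls through and returns a.
static int32_t modsi3_sketch(int32_t a, int32_t b) {
    uint32_t ua = a < 0 ? 0u - (uint32_t)a : (uint32_t)a;  // abs(r0), safe at INT32_MIN
    uint32_t ub = b < 0 ? 0u - (uint32_t)b : (uint32_t)b;  // abs(r1)

    if (ub != 0 && ub <= ua) {            // otherwise |a| is already the remainder
        int shift = __builtin_clz(ub) - __builtin_clz(ua);
        uint32_t d = ub << shift;         // divisor aligned with the dividend's MSB
        for (int i = 0; i <= shift; ++i) {
            if (d <= ua)                  // trial subtraction, one bit per step
                ua -= d;
            d >>= 1;
        }
    }
    // Like C's %, the result takes the sign of the dividend (the p2/neg path).
    return a < 0 ? -(int32_t)ua : (int32_t)ua;
}

The assembly folds the last trial subtraction into the packet after endloop0 (and uses p1 to neutralize it when the shift count is zero) rather than running an extra loop iteration, but the arithmetic is the same.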
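__hexagon_divsf3 follows the usual Newton-Raphson division recipe: start from a hardware reciprocal estimate (sfrecipa), refine it with r <- r + r*(1 - d*r), then form the quotient and polish it with the residual n - d*q. The sketch below shows only that numeric skeleton in plain C; the seed constants, the fmaf calls standing in for the :lib fused multiplies, and the function name are all illustrative, and none of the sffixupn/sffixupd/:scale handling that lets the real routine deal with zeros, infinities, NaNs, subnormals and final rounding is reproduced, so the result is close but not guaranteed correctly rounded.

#include <math.h>

// Illustrative only -- not the real __hexagon_divsf3.
static float divsf3_sketch(float n, float d) {
    // Crude reciprocal seed on the mantissa (about 48/17 - 32/17*m), standing
    // in for the sfrecipa estimate; assumes d is a nonzero finite normal.
    int ex;
    float m = frexpf(fabsf(d), &ex);                       // |d| = m * 2^ex, 0.5 <= m < 1
    float r = copysignf(ldexpf(2.823529f - 1.882353f * m, -ex), d);

    // Two Newton steps on the reciprocal: r <- r + r*(1 - d*r).
    float e = fmaf(-d, r, 1.0f);
    r = fmaf(r, e, r);
    e = fmaf(-d, r, 1.0f);
    r = fmaf(r, e, r);

    // Form the quotient, then polish it once: q <- q + r*(n - d*q).
    float q = n * r;
    float rem = fmaf(-d, q, n);
    return fmaf(rem, r, q);
}

The final fused residual step is what recovers the low bits of the quotient; the assembly performs that correction twice, with the last one routed through the :scale form and the sfrecipa predicate to settle the special cases.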
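__hexagon_sqrtf runs a coupled Newton iteration after sfinvsqrta supplies a 1/sqrt estimate: S tracks sqrt(x), H tracks 0.5/sqrt(x), and the corrections alternate between d = 0.5 - S*H and e = x - S*S, matching the S0/H0 through S3 labels in the comments. A plain C sketch of that scheme follows, with an illustrative bit-trick seed in place of sfinvsqrta and fmaf in place of the :lib/:scale forms; special operands (negative inputs, infinities, NaNs, -0.0) are only crudely approximated here, unlike in the real routine.

#include <math.h>
#include <stdint.h>

// Crude 1/sqrt seed (the classic bit trick), standing in for sfinvsqrta.
// Assumes a finite positive normal input.
static float rsqrt_seed(float x) {
    union { float f; uint32_t u; } v = { x };
    v.u = 0x5f3759dfu - (v.u >> 1);
    return v.f;
}

// Illustrative only -- not the real __hexagon_sqrtf.
static float sqrtf_sketch(float x) {
    if (x <= 0.0f)
        return x == 0.0f ? x : NAN;       // keeps sqrt(-0.0) == -0.0, rejects x < 0

    float e = rsqrt_seed(x);
    float s = e * x;                       // S0 ~ sqrt(x)
    float h = e * 0.5f;                    // H0 ~ 0.5 / sqrt(x)

    float d = fmaf(-s, h, 0.5f);           // d0 = 0.5 - S*H
    s = fmaf(s, d, s);                     // S1
    h = fmaf(h, d, h);                     // H1

    d = fmaf(-s, h, 0.5f);                 // d1
    float r = fmaf(-s, s, x);              // e1 = x - S*S
    s = fmaf(h, r, s);                     // S2
    h = fmaf(h, d, h);                     // H2

    r = fmaf(-s, s, x);                    // e2
    return fmaf(h, r, s);                  // S3 ~ sqrtf(x)
}

Each stage roughly squares the error of the estimate, so even the coarse bit-trick seed converges to single precision by S3; the hardware seed plus the predicated :scale step is what lets the real routine also get the rounding and special cases right.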