282 lines
6.3 KiB
ArmAsm
282 lines
6.3 KiB
ArmAsm
|
/* Copyright (C) 2008-2021 Free Software Foundation, Inc.
|
||
|
Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
|
||
|
on behalf of Synopsys Inc.
|
||
|
|
||
|
This file is part of GCC.
|
||
|
|
||
|
GCC is free software; you can redistribute it and/or modify it under
|
||
|
the terms of the GNU General Public License as published by the Free
|
||
|
Software Foundation; either version 3, or (at your option) any later
|
||
|
version.
|
||
|
|
||
|
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
||
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||
|
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||
|
for more details.
|
||
|
|
||
|
Under Section 7 of GPL version 3, you are granted additional
|
||
|
permissions described in the GCC Runtime Library Exception, version
|
||
|
3.1, as published by the Free Software Foundation.
|
||
|
|
||
|
You should have received a copy of the GNU General Public License and
|
||
|
a copy of the GCC Runtime Library Exception along with this program;
|
||
|
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
||
|
<http://www.gnu.org/licenses/>. */
|
||
|
|
||
|
/*
|
||
|
- calculate 15..18 bit inverse using a table of approximating polynomials.
|
||
|
precision is higher for polynomials used to evaluate input with larger
|
||
|
value.
|
||
|
- do one Newton-Raphson iteration step to double the precision,
|
||
|
then multiply this with the divisor
|
||
|
-> more time to decide if dividend is subnormal
|
||
|
- the worst error propagation is on the side of the value range
|
||
|
with the least initial defect, thus giving us about 30 bits precision.
|
||
|
*/
|
||
|
#include "arc-ieee-754.h"
|
||
|
|
||
|
/* Debug-only cross-check wrapper; the "#if 0" keeps it out of normal builds.
   When enabled, this becomes the exported __divsf3: it calls __divsf3_c and
   then __divsf3_asm with the same r0/r1 inputs, compares the two r0 results
   and branches to abort if they differ.  Neither __divsf3_c nor abort is
   defined in this file.  */
#if 0 /* DEBUG */
|
||
|
.global __divsf3
|
||
|
FUNC(__divsf3)
|
||
|
.balign 4
|
||
|
__divsf3:
|
||
|
/* The bl instructions below overwrite blink, so it is saved together with
   the r1 argument.  */
push_s blink
|
||
|
push_s r1
|
||
|
bl.d __divsf3_c
|
||
|
/* Delay slot: save the r0 argument as well; it ends up at [sp].  */
push_s r0
|
||
|
/* Reload the original r1, then park the result of the first call in the
   stack slot that held r1.  */
ld_s r1,[sp,4]
|
||
|
st_s r0,[sp,4]
|
||
|
bl.d __divsf3_asm
|
||
|
/* Delay slot: restore the original r0 argument for the second call.  */
pop_s r0
|
||
|
/* r1 receives the parked result of the first call; r0 holds the result of
   the second call.  */
pop_s r1
|
||
|
pop_s blink
|
||
|
cmp r0,r1
|
||
|
#if 1
|
||
|
bne abort
|
||
|
/* Equal results return here; the unconditional branch after it is only a
   backstop, since the bne above already handles the mismatch case.  */
jeq_s [blink]
|
||
|
b abort
|
||
|
#else
|
||
|
bne abort
|
||
|
j_s [blink]
|
||
|
#endif
|
||
|
ENDFUNC(__divsf3)
|
||
|
/* From here on the preprocessor renames the implementation that follows to
   __divsf3_asm, which is the name the wrapper above calls.  */
#define __divsf3 __divsf3_asm
|
||
|
#endif /* DEBUG */
|
||
|
|
||
|
/* Start of the real __divsf3 implementation.  The constant and the lookup
   table that follow are placed ahead of the entry point and are addressed
   pc-relatively from it with hard-coded offsets, so nothing that emits bytes
   may be inserted or resized between here and the __divsf3 label without
   updating those offsets.  */
FUNC(__divsf3)
|
||
|
.balign 4
|
||
|
/* Mask selecting the eight exponent bits of an IEEE single; the entry code
   loads it into r9.  */
.L7f800000:
|
||
|
.long 0x7f800000
|
||
|
/* Reciprocal approximation table: 64 words, selected by bits 22..17 of the
   divisor (its six most significant mantissa bits; a denormal divisor is
   normalized first and indexed by the corresponding bits).  Each word feeds
   the first reciprocal estimate in two ways: the whole word is multiplied
   (MPYHU) with the left-aligned mantissa fraction, and that product is
   subtracted from the same word shifted left by 13.
   NOTE(review): this suggests the upper bits act as a slope and the low 19
   bits as a constant term of a linear approximation -- confirm against the
   table generator, which is not part of this file.  */
.Ldivtab:
|
||
|
.long 0xfc0ffff0
|
||
|
.long 0xf46ffefd
|
||
|
.long 0xed1ffd2a
|
||
|
.long 0xe627fa8e
|
||
|
.long 0xdf7ff73b
|
||
|
.long 0xd917f33b
|
||
|
.long 0xd2f7eea3
|
||
|
.long 0xcd1fe986
|
||
|
.long 0xc77fe3e7
|
||
|
.long 0xc21fdddb
|
||
|
.long 0xbcefd760
|
||
|
.long 0xb7f7d08c
|
||
|
.long 0xb32fc960
|
||
|
.long 0xae97c1ea
|
||
|
.long 0xaa27ba26
|
||
|
.long 0xa5e7b22e
|
||
|
.long 0xa1cfa9fe
|
||
|
.long 0x9ddfa1a0
|
||
|
.long 0x9a0f990c
|
||
|
.long 0x9667905d
|
||
|
.long 0x92df878a
|
||
|
.long 0x8f6f7e84
|
||
|
.long 0x8c27757e
|
||
|
.long 0x88f76c54
|
||
|
.long 0x85df630c
|
||
|
.long 0x82e759c5
|
||
|
.long 0x8007506d
|
||
|
.long 0x7d3f470a
|
||
|
.long 0x7a8f3da2
|
||
|
.long 0x77ef341e
|
||
|
.long 0x756f2abe
|
||
|
.long 0x72f7212d
|
||
|
.long 0x709717ad
|
||
|
.long 0x6e4f0e44
|
||
|
.long 0x6c1704d6
|
||
|
.long 0x69e6fb44
|
||
|
.long 0x67cef1d7
|
||
|
.long 0x65c6e872
|
||
|
.long 0x63cedf18
|
||
|
.long 0x61e6d5cd
|
||
|
.long 0x6006cc6d
|
||
|
.long 0x5e36c323
|
||
|
.long 0x5c76b9f3
|
||
|
.long 0x5abeb0b7
|
||
|
.long 0x5916a79b
|
||
|
.long 0x57769e77
|
||
|
.long 0x55de954d
|
||
|
.long 0x54568c4e
|
||
|
.long 0x52d6834d
|
||
|
.long 0x51667a7f
|
||
|
.long 0x4ffe71b5
|
||
|
.long 0x4e9e68f1
|
||
|
.long 0x4d466035
|
||
|
.long 0x4bf65784
|
||
|
.long 0x4aae4ede
|
||
|
.long 0x496e4646
|
||
|
.long 0x48363dbd
|
||
|
.long 0x47063547
|
||
|
.long 0x45de2ce5
|
||
|
.long 0x44be2498
|
||
|
.long 0x43a61c64
|
||
|
.long 0x4296144a
|
||
|
.long 0x41860c0e
|
||
|
.long 0x407e03ee
|
||
|
/* Slow paths for operands whose exponent field is zero, reached from the
   main body of __divsf3 further down.  On entry r0 = dividend, r1 = divisor,
   r3 = address of .Ldivtab, r6 = divisor mantissa shifted left by 8 with
   bit 31 forced on, and r9 = 0x7f800000, all as set up at the __divsf3
   entry point.  */
__divsf3_support: /* This label makes debugger output saner. */
|
||
|
/* Divisor is zero or denormal.  Normalize its mantissa into r6, redo the
   table lookup and the first estimate for the normalized value, and add the
   normalization shift to the dividend exponent in r2.  */
.Ldenorm_fp1:
|
||
|
/* Drop the leading one that the entry code forced into bit 31 of r6.  */
bclr r6,r6,31
|
||
|
norm.f r12,r6 ; flag for x/0 -> Inf check
|
||
|
add r6,r6,r6
|
||
|
/* r5 = 16 - r12 is the rotate count that brings the six divisor bits just
   below its leading mantissa bit down to bits 5..0 (assuming norm yields the
   left shift that moves the leading one to bit 30 -- confirm against the ISA
   reference); bmsk then isolates them as the table index.  */
rsub r5,r12,16
|
||
|
ror r5,r1,r5
|
||
|
/* r6 = normalized divisor mantissa, leading one in bit 31.  */
asl r6,r6,r12
|
||
|
bmsk r5,r5,5
|
||
|
ld.as r5,[r3,r5]
|
||
|
/* r4 = normalized mantissa with the leading one shifted out, matching the
   fraction operand used by the entry code.  */
add r4,r6,r6
|
||
|
; load latency
|
||
|
MPYHU r7,r5,r4
|
||
|
/* Executed only when the divisor is nonzero (Z clear from norm.f): sets Z
   if bits 30 and 29 of the dividend are both set.  A zero divisor keeps the
   Z from norm.f, so either case takes the branch to .Linf_NaN.  */
bic.ne.f 0, \
|
||
|
0x60000000,r0 ; large number / denorm -> Inf
|
||
|
beq_s .Linf_NaN
|
||
|
asl r5,r5,13
|
||
|
; wb stall
|
||
|
; slow track
|
||
|
/* Same first estimate and first refinement product as in the main body.  */
sub r7,r5,r7
|
||
|
MPYHU r8,r7,r6
|
||
|
/* Move the normalization shift into the exponent position and add it to the
   dividend exponent field; Z from the and.f records whether the dividend
   exponent field is zero as well.  */
asl_s r12,r12,23
|
||
|
and.f r2,r0,r9
|
||
|
add r2,r2,r12
|
||
|
asl r12,r0,8
|
||
|
; wb stall
|
||
|
/* A dividend with a nonzero exponent field rejoins the main path.  The
   delay slot is the MPYHU that opens .Ldenorm_fp0, so that multiply executes
   whether or not the branch is taken.  */
bne.d .Lpast_denorm_fp1
|
||
|
/* Dividend is zero or denormal.  Entered by branch from the main body, or
   by falling through from .Ldenorm_fp1 when both exponent fields are zero.
   r12 = dividend shifted left by 8, r11 = divisor exponent field.  */
.Ldenorm_fp0:
|
||
|
MPYHU r8,r8,r7
|
||
|
bclr r12,r12,31
|
||
|
norm.f r3,r12 ; flag for 0/x -> 0 check
|
||
|
bic.ne.f 0,0x60000000,r1 ; denorm/large number -> 0
|
||
|
beq_s .Lret0
|
||
|
/* Normalize the dividend mantissa (leading one ends up in bit 31 after the
   doubling below).  */
asl_s r12,r12,r3
|
||
|
asl_s r3,r3,23
|
||
|
add_s r12,r12,r12
|
||
|
/* The shift is added to the divisor exponent in r11, which lowers the
   result exponent that the main body later forms as r2 - r11.  */
add r11,r11,r3
|
||
|
b.d .Lpast_denorm_fp0
|
||
|
/* Delay slot: .Lpast_denorm_fp0 expects the dividend mantissa in r3.  */
mov_s r3,r12
|
||
|
/* Return a signed Inf, or a NaN for 0/0.  The only branch to this label in
   this file is in .Ldenorm_fp1, taken when the divisor is zero, or when it
   is denormal while bits 30 and 29 of the dividend are both set.
   r0 = dividend, r1 = divisor, r9 = 0x7f800000.
   NOTE(review): a NaN dividend is nonzero and has bits 30 and 29 set, so as
   far as this code shows NaN/0 and NaN/denormal are returned as a signed
   Inf -- confirm whether that is intended.  */
.balign 4
|
||
|
.Linf_NaN:
|
||
|
bclr.f 0,r0,31 ; 0/0 -> NaN
|
||
|
/* Reduce r0 to the xor of the two sign bits (bits 30..0 cleared); r1 is
   used as scratch.  */
xor_s r0,r0,r1
|
||
|
bmsk r1,r0,30
|
||
|
bic_s r0,r0,r1
|
||
|
/* Dividend was +-0 (Z from the bclr.f above): subtracting one from the
   sign-only word sets all lower bits, so the final word has a nonzero
   mantissa, i.e. a NaN rather than Inf.  */
sub.eq r0,r0,1
|
||
|
j_s.d [blink]
|
||
|
/* Delay slot: set the exponent field to all ones.  */
or r0,r0,r9
|
||
|
/* Return a zero whose sign is the xor of the operand signs.  Reached from
   .Ldenorm_fp0.  r0 = dividend, r1 = divisor (r1 is used as scratch).  */
.Lret0:
|
||
|
xor_s r0,r0,r1
|
||
|
bmsk r1,r0,30
|
||
|
j_s.d [blink]
|
||
|
/* Delay slot: clear bits 30..0 of r0, leaving only the sign bit.  */
bic_s r0,r0,r1
|
||
|
/* Divisor exponent field is all ones (divisor is Inf or NaN).
   r0 = dividend, r1 = divisor, r9 = 0x7f800000, and r2 = exponent field of
   the dividend, computed in the delay slot of the branch that gets here.  */
.Linf_nan_fp1:
|
||
|
/* lsr_s/asl_s by 31 reduce r0 to the dividend sign bit; the bmsk.f between
   them sets Z when the 23 mantissa bits of the divisor are all zero, i.e.
   when the divisor is Inf rather than NaN.  */
lsr_s r0,r0,31
|
||
|
bmsk.f 0,r1,22
|
||
|
asl_s r0,r0,31
|
||
|
/* Nonzero divisor mantissa (NaN divisor): go straight to the NaN return.  */
bne_s 0f ; inf/inf -> nan
|
||
|
/* Divisor is Inf here.  A dividend whose exponent field is not all ones
   gives a signed zero; an Inf or NaN dividend falls through to the NaN
   return.  */
brne r2,r9,.Lsigned0 ; x/inf -> 0, but x/nan -> nan
|
||
|
0: j_s.d [blink]
|
||
|
/* Delay slot: the all-ones word, which is a NaN encoding.  */
mov r0,-1
|
||
|
/* Shared tail: flip the sign bit of r0 if the divisor in r1 is negative,
   then return.
   - Entered as .Lsigned0 (from .Linf_nan_fp1), r0 holds only the dividend
     sign bit, so the result is a signed zero.
   - Entered as .Linf_nan_fp0 (from the main body, dividend exponent field
     all ones), r0 is still the unmodified dividend, so an Inf or NaN
     dividend is passed through with its sign adjusted.  */
.Lsigned0:
|
||
|
.Linf_nan_fp0:
|
||
|
/* N flag = sign bit of the divisor.  */
tst_s r1,r1
|
||
|
j_s.d [blink]
|
||
|
bxor.mi r0,r0,31
|
||
|
/* __divsf3: single-precision soft-float division, r0 = r0 / r1.
   In:   r0 = dividend, r1 = divisor (IEEE single bit patterns).
   Out:  r0 = quotient; returns through blink.
   Registers written here and in the helper paths: r0-r9, r11, r12, flags
   (plus whatever MPYHU uses if it is a macro; it is not defined in this
   file -- presumably it yields the high word of an unsigned 32x32 multiply,
   confirm in the included header).
   NOTE(review): the C-level prototype and ABI are not visible in this file;
   the register roles above are read off the code.
   Outline: a table lookup on the divisor mantissa gives a first reciprocal
   estimate (r7), one refinement step improves it, the dividend mantissa is
   multiplied by it, and the exponent is formed from the difference of the
   operand exponent fields.  Zero, denormal, Inf and NaN operands branch out
   to the helper labels placed before this entry point.  */
.balign 4
|
||
|
.global __divsf3
|
||
|
/* N.B. the spacing between divtab and the sub3 to get its address must
|
||
|
be a multiple of 8. */
|
||
|
__divsf3:
|
||
|
/* lsr here and bmsk_s below leave bits 22..17 of the divisor in r2: the
   index into the 64-entry table.  */
lsr r2,r1,17
|
||
|
/* r3 = pcl - 55*8; per the trailing comment this is meant to be the address
   of .Ldivtab.  The literal 55 has to be kept in step with the amount of
   code between the table and this instruction.  */
sub3 r3,pcl,55;(.-.Ldivtab) >> 3
|
||
|
bmsk_s r2,r2,5
|
||
|
ld.as r5,[r3,r2]
|
||
|
/* r4 = divisor mantissa fraction, left-aligned (sign and exponent bits
   shifted out).  */
asl r4,r1,9
|
||
|
/* r9 = 0x7f800000 (exponent mask), fetched through a hard-coded word offset
   back to .L7f800000.  */
ld.as r9,[pcl,-114]; [pcl,(-((.-.L7f800000) >> 2))] ; 0x7f800000
|
||
|
MPYHU r7,r5,r4
|
||
|
/* r6 = divisor mantissa shifted to the top of the word, with the implicit
   leading one made explicit in bit 31 (bset below).  */
asl r6,r1,8
|
||
|
/* r11 = divisor exponent field; Z is set when it is zero (divisor is zero
   or denormal) and is consumed by the beq below.  */
and.f r11,r1,r9
|
||
|
bset r6,r6,31
|
||
|
asl r5,r5,13
|
||
|
; wb stall
|
||
|
beq .Ldenorm_fp1
|
||
|
/* r7 = first reciprocal estimate: (table word << 13) minus the high word of
   table word * fraction.  */
sub r7,r5,r7
|
||
|
/* r8 = high word of estimate * divisor mantissa, first half of the
   refinement step.  */
MPYHU r8,r7,r6
|
||
|
/* Divisor exponent field all ones: Inf or NaN divisor.  The delay slot
   below computes r2 = dividend exponent field and sets Z for the beq.d that
   follows; the breq itself compares the registers directly.  */
breq.d r11,r9,.Linf_nan_fp1
|
||
|
and.f r2,r0,r9
|
||
|
beq.d .Ldenorm_fp0
|
||
|
/* Delay slot: r12 = dividend shifted left by 8 (mantissa at the top of the
   word).  */
asl r12,r0,8
|
||
|
; wb stall
|
||
|
breq r2,r9,.Linf_nan_fp0
|
||
|
/* r8 = high word of r8 * estimate, second half of the refinement step.  */
MPYHU r8,r8,r7
|
||
|
/* Re-entry point for a denormal divisor with a normal dividend: r2, r6, r7,
   r8, r11 and r12 have been prepared by .Ldenorm_fp1.  */
.Lpast_denorm_fp1:
|
||
|
/* r3 = dividend mantissa with the implicit leading one in bit 31.  */
bset r3,r12,31
|
||
|
/* Re-entry point for a denormal dividend: r3 already holds its normalized
   mantissa and r11 has been adjusted by .Ldenorm_fp0.  */
.Lpast_denorm_fp0:
|
||
|
/* Compare the mantissas.  If the dividend mantissa is not below the divisor
   mantissa (cc, carry clear) it is halved; otherwise the carry left by
   cmp_s is subtracted from the exponent by the sbc further down (assuming
   none of the intervening instructions or the MPYHU macro alter flags).  */
cmp_s r3,r6
|
||
|
lsr.cc r3,r3,1
|
||
|
add_s r2,r2, /* wait for immediate */ \
|
||
|
/* wb stall */ \
|
||
|
0x3f000000
|
||
|
sub r7,r7,r8 ; u1.31 inverse, about 30 bit
|
||
|
/* r3 = high word of dividend mantissa * refined reciprocal: the quotient
   mantissa before rounding.  */
MPYHU r3,r3,r7
|
||
|
/* r2 = dividend exponent + 0x3f000000 - divisor exponent - carry.  */
sbc r2,r2,r11
|
||
|
/* N = xor of the operand signs; r0 = exponent field of r2, with the sign
   bit flipped on when the signs differ.  */
xor.f 0,r0,r1
|
||
|
and r0,r2,r9
|
||
|
bxor.mi r0,r0,31
|
||
|
/* Unsigned compare: catches both exponent overflow and a negative exponent
   that wrapped to a large unsigned value.  */
brhs r2, /* wb stall / wait for immediate */ \
|
||
|
0x7f000000,.Linf_denorm
|
||
|
/* Rounding.  Also entered from .Ldenorm with r3 already shifted right for a
   denormal result and r9 holding that shift count.  */
.Lpast_denorm:
|
||
|
add_s r3,r3,0x22 ; round to nearest or higher
|
||
|
tst r3,0x3c ; check if rounding was unsafe
|
||
|
/* Drop the six bits below the result mantissa.  */
lsr r3,r3,6
|
||
|
jne.d [blink] ; return if rounding was safe.
|
||
|
/* Delay slot (executes on both paths): add the mantissa into the
   sign/exponent word.
   NOTE(review): the leading one of the mantissa appears to carry into the
   exponent field, which would explain adding 0x3f000000 rather than
   0x3f800000 above -- confirm.  */
add_s r0,r0,r3
|
||
|
/* work out exact rounding if we fall through here. */
|
||
|
/* We know that the exact result cannot be represented in single
|
||
|
precision. Find the mid-point between the two nearest
|
||
|
representable values, multiply with the divisor, and check if
|
||
|
the result is larger than the dividend. */
|
||
|
/* r3 = 2*r3 - 1, then the low word of r3 * divisor mantissa.  */
add_s r3,r3,r3
|
||
|
sub_s r3,r3,1
|
||
|
mpyu r3,r3,r6
|
||
|
/* NOTE(review): no instruction between here and the sub.f below reads the
   flags, and sub.f rewrites them, so the effect of this flag update is not
   visible in this file -- confirm whether a carry-consuming instruction
   was intended.  */
asr.f 0,r0,1 ; for round-to-even in case this is a denorm
|
||
|
/* r2 = 25 - r9.  On the normal path r9 is 0x7f800000, whose low five bits
   are zero; on the denormal path r9 is the right-shift count applied to r3.
   NOTE(review): the normal path relies on the shift below using only the
   low bits of r2 -- confirm against the ISA reference.  */
rsub r2,r9,25
|
||
|
asl_s r12,r12,r2
|
||
|
; wb stall
|
||
|
; slow track
|
||
|
sub.f 0,r12,r3
|
||
|
j_s.d [blink]
|
||
|
/* Delay slot: step the result down by one when the shifted dividend minus
   the mid-point product is negative.  */
sub.mi r0,r0,1
|
||
|
/* For denormal results, it is possible that an exact result needs
|
||
|
rounding, and thus the round-to-even rule has to come into play. */
|
||
|
/* Reached when the biased result exponent word in r2 is at or above
   0x7f000000 as an unsigned value.  r0 = sign and exponent field of the
   result so far, r3 = unrounded quotient mantissa, r9 = 0x7f800000.
   Values of r2 below 0xc0000000 are treated as overflow; anything at or
   above it is treated as a negative exponent (denormal or zero result).  */
.Linf_denorm:
|
||
|
brlo r2,0xc0000000,.Linf
|
||
|
.Ldenorm:
|
||
|
/* r2 = exponent as a signed number (arithmetic shift keeps the sign).  */
asr_s r2,r2,23
|
||
|
/* Clear the exponent field of r0, keeping only the sign bit.  */
bic r0,r0,r9
|
||
|
/* r9 = number of bit positions the mantissa has to move right; from here
   on r9 no longer holds the exponent mask.  */
neg r9,r2
|
||
|
/* Shift counts below 25 go back to the common rounding code; the delay slot
   performs the right shift of the mantissa.  */
brlo.d r9,25,.Lpast_denorm
|
||
|
lsr r3,r3,r9
|
||
|
/* Fall through: return +- 0 */
|
||
|
j_s [blink]
|
||
|
/* Overflow: return Inf with the sign already present in r0.  */
.Linf:
|
||
|
j_s.d [blink]
|
||
|
/* Delay slot: force the exponent field to all ones (r9 = 0x7f800000).  */
or r0,r0,r9
|
||
|
ENDFUNC(__divsf3)
|