mirror of https://github.com/betaflight/betaflight.git synced 2025-07-15 04:15:44 +03:00

Adding RP2350 SDK and target framework (#13988)

* Adding RP2350 SDK and target framework

* Spacing

* Removing board definitions
J Blackman 2024-10-23 10:02:48 +11:00 committed by GitHub
parent 462cb05930
commit 2dd6f95aad
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
576 changed files with 435012 additions and 0 deletions


@@ -0,0 +1,253 @@
/*
* Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include "pico/asm_helper.S"
#if HAS_DOUBLE_COPROCESSOR
#include "hardware/dcp_instr.inc.S"
#include "hardware/dcp_canned.inc.S"
pico_default_asm_setup
// todo alignment
//__pre_init __aeabi_float_init, 00020
// factor out save/restore (there is a copy in double code)
.macro float_section name
#if PICO_FLOAT_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm
.macro float_wrapper_section func
float_section WRAPPER_FUNC_NAME(\func)
.endm
// ============== STATE SAVE AND RESTORE ===============
.macro saving_func func
// Note we are usually 32-bit aligned already at this point, as most of the
// function bodies contain exactly two 16-bit instructions: bmi and bx lr.
// We want the PCMP word-aligned.
.p2align 2
// When the engaged flag is set, branch back here to invoke save routine and
// hook lr with the restore routine, then fall back through to the entry
// point. The engaged flag will be clear when checked a second time.
1:
push {lr} // 16-bit instruction
bl generic_save_state // 32-bit instruction
b 1f // 16-bit instruction
// This is the actual entry point:
wrapper_func \func
PCMP apsr_nzcv
bmi 1b
1:
.endm
.macro saving_func_return
bx lr
.endm
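// Rough C model of the wrappers that saving_func / saving_func_return build
// (names below are illustrative, not SDK symbols). PCMP sets N when the DCP is
// already engaged by other code; in that case the current PXMD/PYMD/REFD state
// is spilled to the stack and LR is hooked so the wrapped body returns through
// generic_restore_state:
//
//   float dcp_wrapped_op(float a, float b) {
//       struct { uint32_t xmd[2], ymd[2], efd[2]; } saved;
//       int engaged = dcp_engaged();       // PCMP apsr_nzcv; bmi 1b
//       if (engaged) dcp_save(&saved);     // generic_save_state
//       float r = dcp_op(a, b);            // the dcp_*_m macro body
//       if (engaged) dcp_restore(&saved);  // generic_restore_state, via hooked LR
//       return r;
//   }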
float_section __rp2350_dcp_engaged_state_save_restore
.thumb_func
generic_save_state:
sub sp, #24
push {r0, r1}
// do save here
PXMD r0, r1
strd r0, r1, [sp, #8 + 0]
PYMD r0, r1
strd r0, r1, [sp, #8 + 8]
REFD r0, r1
strd r0, r1, [sp, #8 + 16]
pop {r0, r1}
blx lr
// <- wrapped function returns here
// fall through into restore:
.thumb_func
generic_restore_state:
// do restore here
pop {r12, r14}
WXMD r12, r14
pop {r12, r14}
WYMD r12, r14
pop {r12, r14}
WEFD r12, r14
pop {pc}
// ============== ARITHMETIC FUNCTIONS ===============
float_wrapper_section __aeabi_fadd
saving_func __aeabi_fadd
dcp_fadd_m r0,r0,r1
saving_func_return
float_wrapper_section __aeabi_fsub
saving_func __aeabi_fsub
dcp_fsub_m r0,r0,r1
saving_func_return
float_wrapper_section __aeabi_frsub
saving_func __aeabi_frsub
dcp_fsub_m r0,r1,r0
saving_func_return
float_wrapper_section __aeabi_fmul
saving_func __aeabi_fmul
dcp_fmul_m r0,r0,r1,r0,r1
saving_func_return
float_section fdiv_fast
saving_func fdiv_fast
dcp_fdiv_fast_m r0,r0,r1,r0,r1,r2
saving_func_return
float_wrapper_section __aeabi_fdiv
saving_func __aeabi_fdiv
@ with correct rounding
dcp_fdiv_m r0,r0,r1,r0,r1,r2,r3
saving_func_return
float_section sqrtf_fast
saving_func sqrtf_fast
dcp_fsqrt_fast_m r0,r0,r0,r1,r2,r3
saving_func_return
float_wrapper_section sqrtf
saving_func sqrtf
@ with correct rounding
dcp_fsqrt_m r0,r0,r0,r1,r2,r3
saving_func_return
// todo not a real thing
float_wrapper_section __aeabi_fclassify
saving_func __aeabi_fclassify
dcp_fclassify_m apsr_nzcv,r0
saving_func_return
// ============== CONVERSION FUNCTIONS ===============
float_wrapper_section __aeabi_f2d
saving_func __aeabi_f2d
dcp_float2double_m r0,r1,r0
saving_func_return
float_wrapper_section __aeabi_i2f
saving_func __aeabi_i2f
@ with rounding
dcp_int2float_m r0,r0
saving_func_return
float_wrapper_section __aeabi_ui2f
saving_func __aeabi_ui2f
@ with rounding
dcp_uint2float_m r0,r0
saving_func_return
float_wrapper_section __aeabi_f2iz
saving_func __aeabi_f2iz
@ with truncation towards 0
dcp_float2int_m r0,r0
saving_func_return
float_wrapper_section __aeabi_f2uiz
saving_func __aeabi_f2uiz
@ with truncation towards 0
dcp_float2uint_m r0,r0
saving_func_return
// todo not a real thing
float_wrapper_section __aeabi_f2i_r
saving_func __aeabi_f2i_r
@ with rounding
dcp_float2int_r_m r0,r0
saving_func_return
// todo not a real thing
float_wrapper_section __aeabi_f2ui_r
saving_func __aeabi_f2ui_r
@ with rounding
dcp_float2uint_r_m r0,r0
saving_func_return
// ============== COMPARISON FUNCTIONS ===============
float_wrapper_section __aeabi_fcmpun
saving_func __aeabi_fcmpun
dcp_fcmp_m r0,r0,r1
// extract unordered bit
ubfx r0, r0, #28, #1
saving_func_return
float_wrapper_section __aeabi_fcmp
saving_func __aeabi_cfrcmple
dcp_fcmp_m apsr_nzcv,r1,r0 // with arguments reversed
bvs cmp_nan
saving_func_return
// these next two can be the same function in the absence of exceptions
saving_func __aeabi_cfcmple
dcp_fcmp_m apsr_nzcv,r0,r1
bvs cmp_nan
saving_func_return
// It is not clear from the ABI documentation whether cfcmpeq must set the C flag
// in the same way as cfcmple. If not, we could save the "bvs" below; but we
// err on the side of caution.
saving_func __aeabi_cfcmpeq
dcp_fcmp_m apsr_nzcv,r0,r1
bvs cmp_nan
saving_func_return
// If the result of a flag-setting comparison is "unordered" then we need to set C and clear Z.
// We could conceivably just do lsrs r12,r14,#1, or even cmp r14,r14,lsr#1 as (a) r14 here is a
// return address and r14b0=1 for Thumb mode; (b) we are unlikely to be returning to address 0.
cmp_nan:
movs r12, #3 // r12 does not need to be preserved by the flag-setting comparisons
lsrs r12, #1 // set C, clear Z
saving_func_return
float_wrapper_section __aeabi_fcmpeq
saving_func __aeabi_fcmpeq
dcp_fcmp_m r0,r0,r1
// extract Z
ubfx r0, r0, #30, #1
saving_func_return
float_wrapper_section __aeabi_fcmplt
saving_func __aeabi_fcmplt
dcp_fcmp_m apsr_nzcv,r1,r0
ite hi
movhi r0,#1
movls r0,#0
saving_func_return
float_wrapper_section __aeabi_fcmple
saving_func __aeabi_fcmple
dcp_fcmp_m apsr_nzcv,r1,r0
ite hs
movhs r0,#1
movlo r0,#0
saving_func_return
float_wrapper_section __aeabi_fcmpge
saving_func __aeabi_fcmpge
dcp_fcmp_m apsr_nzcv,r0,r1
ite hs
movhs r0,#1
movlo r0,#0
saving_func_return
float_wrapper_section __aeabi_fcmpgt
saving_func __aeabi_fcmpgt
dcp_fcmp_m apsr_nzcv,r0,r1
ite hi
movhi r0,#1
movls r0,#0
saving_func_return
#endif


@@ -0,0 +1,776 @@
/*
* Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include "pico/asm_helper.S"
#include "pico/runtime_init.h"
#include "pico/bootrom/sf_table.h"
#include "hardware/divider_helper.S"
PICO_RUNTIME_INIT_FUNC_RUNTIME(__aeabi_float_init, PICO_RUNTIME_INIT_AEABI_FLOAT)
pico_default_asm_setup
.macro float_section name
#if PICO_FLOAT_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm
.macro float_wrapper_section func
float_section WRAPPER_FUNC_NAME(\func)
.endm
.macro _float_wrapper_func x
wrapper_func \x
.endm
.macro wrapper_func_f1 x
_float_wrapper_func \x
#if PICO_FLOAT_PROPAGATE_NANS
mov ip, lr
bl __check_nan_f1
mov lr, ip
#endif
.endm
.macro wrapper_func_f2 x
_float_wrapper_func \x
#if PICO_FLOAT_PROPAGATE_NANS
mov ip, lr
bl __check_nan_f2
mov lr, ip
#endif
.endm
.section .text
#if PICO_FLOAT_PROPAGATE_NANS
.thumb_func
__check_nan_f1:
movs r3, #1
lsls r3, #24
lsls r2, r0, #1
adds r2, r3
bhi 1f
bx lr
1:
bx ip
.thumb_func
__check_nan_f2:
movs r3, #1
lsls r3, #24
lsls r2, r0, #1
adds r2, r3
bhi 1f
lsls r2, r1, #1
adds r2, r3
bhi 2f
bx lr
2:
mov r0, r1
1:
bx ip
#endif
.macro table_tail_call SF_TABLE_OFFSET
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
#ifndef NDEBUG
movs r3, #0
mov ip, r3
#endif
#endif
ldr r3, =sf_table
ldr r3, [r3, #\SF_TABLE_OFFSET]
bx r3
.endm
.macro shimmable_table_tail_call SF_TABLE_OFFSET shim
ldr r3, =sf_table
ldr r3, [r3, #\SF_TABLE_OFFSET]
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
mov ip, pc
#endif
bx r3
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
.byte \SF_TABLE_OFFSET, 0xdf
.word \shim
#endif
.endm
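// Roughly equivalent C for a table_tail_call wrapper (illustrative only;
// SF_TABLE_FADD is used as an example offset and the cast assumes the
// two-float AEABI signature):
//
//   typedef float (*sf_fn2)(float, float);
//   float fadd_via_table(float a, float b) {
//       sf_fn2 f = (sf_fn2)(uintptr_t)sf_table[SF_TABLE_FADD / 4];
//       return f(a, b);   // bx r3: tail call through the ROM/shim table
//   }
//
// In shimmable_table_tail_call, the trailing ".byte \SF_TABLE_OFFSET, 0xdf"
// assembles to a Thumb "svc #offset" marker followed by the shim address; on a
// V1 ROM, float_table_shim_on_use_helper locates this marker via IP and patches
// the table entry the first time the wrapper is called (see the table
// initialisation code later in this commit).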
// note generally each function is in a separate section unless there is fall thru or branching between them
// note fadd, fsub, fmul, fdiv are so tiny and just defer to rom so are lumped together so they can share constant pool
// note functions are word aligned except where they are an odd number of linear instructions
// float FUNC_NAME(__aeabi_fadd)(float, float) single-precision addition
float_wrapper_section __aeabi_farithmetic
// float FUNC_NAME(__aeabi_frsub)(float x, float y) single-precision reverse subtraction, y - x
// frsub first because it is the only one that needs alignment
.align 2
wrapper_func __aeabi_frsub
eors r0, r1
eors r1, r0
eors r0, r1
// fall thru
// float FUNC_NAME(__aeabi_fsub)(float x, float y) single-precision subtraction, x - y
wrapper_func_f2 __aeabi_fsub
#if PICO_FLOAT_PROPAGATE_NANS
// we want to return nan for inf-inf or -inf - -inf, but without too much upfront cost
mov r2, r0
eors r2, r1
bmi 1f // different signs
push {r0, r1, lr}
bl 1f
b fdiv_fsub_nan_helper
1:
#endif
table_tail_call SF_TABLE_FSUB
wrapper_func_f2 __aeabi_fadd
table_tail_call SF_TABLE_FADD
// float FUNC_NAME(__aeabi_fdiv)(float n, float d) single-precision division, n / d
wrapper_func_f2 __aeabi_fdiv
#if PICO_FLOAT_PROPAGATE_NANS
push {r0, r1, lr}
bl 1f
b fdiv_fsub_nan_helper
1:
#endif
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
// to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
ldr r2, =(SIO_BASE)
ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
bcs fdiv_save_state
#else
// to avoid worrying about IRQs (or context switches), simply disable interrupts around call
push {r4, lr}
mrs r4, PRIMASK
cpsid i
bl fdiv_shim_call
msr PRIMASK, r4
pop {r4, pc}
#endif
fdiv_shim_call:
table_tail_call SF_TABLE_FDIV
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
fdiv_save_state:
save_div_state_and_lr
bl fdiv_shim_call
ldr r2, =(SIO_BASE)
restore_div_state_and_return
#endif
fdiv_fsub_nan_helper:
#if PICO_FLOAT_PROPAGATE_NANS
pop {r1, r2}
// check for infinite op infinite (or rather check for infinite result with both
// operands being infinite)
lsls r3, r0, #1
asrs r3, r3, #24
adds r3, #1
beq 2f
pop {pc}
2:
lsls r1, #1
asrs r1, r1, #24
lsls r2, #1
asrs r2, r2, #24
ands r1, r2
adds r1, #1
bne 3f
// infinite to nan
movs r1, #1
lsls r1, #22
orrs r0, r1
3:
pop {pc}
#endif
// float FUNC_NAME(__aeabi_fmul)(float, float) single-precision multiplication
wrapper_func_f2 __aeabi_fmul
#if PICO_FLOAT_PROPAGATE_NANS
push {r0, r1, lr}
bl 1f
pop {r1, r2}
// check for multiplication of infinite by zero (or rather check for infinite result with either
// operand 0)
lsls r3, r0, #1
asrs r3, r3, #24
adds r3, #1
beq 2f
pop {pc}
2:
ands r1, r2
bne 3f
// infinite to nan
movs r1, #1
lsls r1, #22
orrs r0, r1
3:
pop {pc}
1:
#endif
table_tail_call SF_TABLE_FMUL
// void FUNC_NAME(__aeabi_cfrcmple)(float, float) reversed 3-way (<, =, ?>) compare [1], result in PSR ZC flags
float_wrapper_section __aeabi_cfcmple
.align 2
wrapper_func __aeabi_cfrcmple
push {r0-r2, lr}
eors r0, r1
eors r1, r0
eors r0, r1
b __aeabi_cfcmple_guts
// NOTE these share an implementation as we have no excepting NaNs.
// void FUNC_NAME(__aeabi_cfcmple)(float, float) 3-way (<, =, ?>) compare [1], result in PSR ZC flags
// void FUNC_NAME(__aeabi_cfcmpeq)(float, float) non-excepting equality comparison [1], result in PSR ZC flags
.align 2
wrapper_func __aeabi_cfcmple
wrapper_func __aeabi_cfcmpeq
push {r0-r2, lr}
__aeabi_cfcmple_guts:
lsls r2,r0,#1
lsrs r2,#24
beq 1f
cmp r2,#0xff
bne 2f
lsls r2, r0, #9
bhi 3f
1:
lsrs r0,#23 @ clear mantissa if denormal or infinite
lsls r0,#23
2:
lsls r2,r1,#1
lsrs r2,#24
beq 1f
cmp r2,#0xff
bne 2f
lsls r2, r1, #9
bhi 3f
1:
lsrs r1,#23 @ clear mantissa if denormal or infinite
lsls r1,#23
2:
movs r2,#1 @ initialise result
eors r1,r0
bmi 2f @ opposite signs? then can proceed on basis of sign of x
eors r1,r0 @ restore y
bpl 1f
cmp r1,r0
pop {r0-r2, pc}
1:
cmp r0,r1
pop {r0-r2, pc}
2:
orrs r1, r0 @ handle 0/-0
adds r1, r1 @ note this always sets C
beq 3f
mvns r0, r0 @ carry inverse of r0 sign
adds r0, r0
3:
pop {r0-r2, pc}
// int FUNC_NAME(__aeabi_fcmpeq)(float, float) result (1, 0) denotes (=, ?<>) [2], use for C == and !=
float_wrapper_section __aeabi_fcmpeq
.align 2
wrapper_func __aeabi_fcmpeq
push {lr}
bl __aeabi_cfcmpeq
beq 1f
movs r0, #0
pop {pc}
1:
movs r0, #1
pop {pc}
// int FUNC_NAME(__aeabi_fcmplt)(float, float) result (1, 0) denotes (<, ?>=) [2], use for C <
float_wrapper_section __aeabi_fcmplt
.align 2
wrapper_func __aeabi_fcmplt
push {lr}
bl __aeabi_cfcmple
sbcs r0, r0
pop {pc}
// int FUNC_NAME(__aeabi_fcmple)(float, float) result (1, 0) denotes (<=, ?>) [2], use for C <=
float_wrapper_section __aeabi_fcmple
.align 2
wrapper_func __aeabi_fcmple
push {lr}
bl __aeabi_cfcmple
bls 1f
movs r0, #0
pop {pc}
1:
movs r0, #1
pop {pc}
// int FUNC_NAME(__aeabi_fcmpge)(float, float) result (1, 0) denotes (>=, ?<) [2], use for C >=
float_wrapper_section __aeabi_fcmpge
.align 2
wrapper_func __aeabi_fcmpge
push {lr}
// because of NaNs it is better to reverse the args than the result
bl __aeabi_cfrcmple
bls 1f
movs r0, #0
pop {pc}
1:
movs r0, #1
pop {pc}
// int FUNC_NAME(__aeabi_fcmpgt)(float, float) result (1, 0) denotes (>, ?<=) [2], use for C >
float_wrapper_section __aeabi_fcmpgt
wrapper_func __aeabi_fcmpgt
push {lr}
// because of NaNs it is better to reverse the args than the result
bl __aeabi_cfrcmple
sbcs r0, r0
pop {pc}
// int FUNC_NAME(__aeabi_fcmpun)(float, float) result (1, 0) denotes (?, <=>) [2], use for C99 isunordered()
float_wrapper_section __aeabi_fcmpun
wrapper_func __aeabi_fcmpun
movs r3, #1
lsls r3, #24
lsls r2, r0, #1
adds r2, r3
bhi 1f
lsls r2, r1, #1
adds r2, r3
bhi 1f
movs r0, #0
bx lr
1:
movs r0, #1
bx lr
// float FUNC_NAME(__aeabi_ui2f)(unsigned) unsigned to float (single precision) conversion
float_wrapper_section __aeabi_ui2f
wrapper_func __aeabi_ui2f
regular_func uint2float
subs r1, r1
cmp r0, #0
bne __aeabi_i2f_main
mov r0, r1
bx lr
float_wrapper_section __aeabi_i2f
// float FUNC_NAME(__aeabi_i2f)(int) integer to float (single precision) conversion
wrapper_func __aeabi_i2f
regular_func int2float
lsrs r1, r0, #31
lsls r1, #31
bpl 1f
negs r0, r0
1:
cmp r0, #0
beq 7f
__aeabi_i2f_main:
mov ip, lr
push {r0, r1}
ldr r3, =sf_clz_func
ldr r3, [r3]
blx r3
pop {r1, r2}
lsls r1, r0
subs r0, #158
negs r0, r0
adds r1,#0x80 @ rounding
bcs 5f @ tripped carry? then have leading 1 in C as required (and result is even so can ignore sticky bits)
lsls r3,r1,#24 @ check bottom 8 bits of r1
beq 6f @ in rounding-tie case?
lsls r1,#1 @ remove leading 1
3:
lsrs r1,#9 @ align mantissa
lsls r0,#23 @ align exponent
orrs r0,r2 @ assemble exponent and mantissa
4:
orrs r0,r1 @ apply sign
1:
bx ip
5:
adds r0,#1 @ correct exponent offset
b 3b
6:
lsrs r1,#9 @ ensure even result
lsls r1,#10
b 3b
7:
bx lr
// int FUNC_NAME(__aeabi_f2iz)(float) float (single precision) to integer C-style conversion [3]
float_wrapper_section __aeabi_f2iz
wrapper_func __aeabi_f2iz
regular_func float2int_z
lsls r1, r0, #1
lsrs r2, r1, #24
movs r3, #0x80
lsls r3, #24
cmp r2, #126
ble 1f
subs r2, #158
bge 2f
asrs r1, r0, #31
lsls r0, #9
lsrs r0, #1
orrs r0, r3
negs r2, r2
lsrs r0, r2
lsls r1, #1
adds r1, #1
muls r0, r1
bx lr
1:
movs r0, #0
bx lr
2:
lsrs r0, #31
adds r0, r3
subs r0, #1
bx lr
cmn r0, r0
bcc float2int
push {lr}
lsls r0, #1
lsrs r0, #1
movs r1, #0
bl __aeabi_f2uiz
cmp r0, #0
bmi 1f
negs r0, r0
pop {pc}
1:
movs r0, #128
lsls r0, #24
pop {pc}
float_section float2int
regular_func float2int
shimmable_table_tail_call SF_TABLE_FLOAT2INT float2int_shim
float_section float2fix
regular_func float2fix
shimmable_table_tail_call SF_TABLE_FLOAT2FIX float2fix_shim
float_section float2ufix
regular_func float2ufix
table_tail_call SF_TABLE_FLOAT2UFIX
// unsigned FUNC_NAME(__aeabi_f2uiz)(float) float (single precision) to unsigned C-style conversion [3]
float_wrapper_section __aeabi_f2uiz
wrapper_func __aeabi_f2uiz
regular_func float2uint_z
table_tail_call SF_TABLE_FLOAT2UINT
float_section fix2float
regular_func fix2float
table_tail_call SF_TABLE_FIX2FLOAT
float_section ufix2float
regular_func ufix2float
table_tail_call SF_TABLE_UFIX2FLOAT
float_section fix642float
regular_func fix642float
shimmable_table_tail_call SF_TABLE_FIX642FLOAT fix642float_shim
float_section ufix642float
regular_func ufix642float
shimmable_table_tail_call SF_TABLE_UFIX642FLOAT ufix642float_shim
// float FUNC_NAME(__aeabi_l2f)(long long) long long to float (single precision) conversion
float_wrapper_section __aeabi_l2f
1:
ldr r2, =__aeabi_i2f
bx r2
wrapper_func __aeabi_l2f
regular_func int642float
asrs r2, r0, #31
cmp r1, r2
beq 1b
shimmable_table_tail_call SF_TABLE_INT642FLOAT int642float_shim
// float FUNC_NAME(__aeabi_ul2f)(unsigned long long) unsigned long long to float (single precision) conversion
float_wrapper_section __aeabi_ul2f
1:
ldr r2, =__aeabi_ui2f
bx r2
wrapper_func __aeabi_ul2f
regular_func uint642float
cmp r1, #0
beq 1b
shimmable_table_tail_call SF_TABLE_UINT642FLOAT uint642float_shim
// long long FUNC_NAME(__aeabi_f2lz)(float) float (single precision) to long long C-style conversion [3]
float_wrapper_section __aeabi_f2lz
wrapper_func __aeabi_f2lz
regular_func float2int64_z
cmn r0, r0
bcc float2int64
push {lr}
lsls r0, #1
lsrs r0, #1
movs r1, #0
bl float2ufix64
cmp r1, #0
bmi 1f
movs r2, #0
negs r0, r0
sbcs r2, r1
mov r1, r2
pop {pc}
1:
movs r1, #128
lsls r1, #24
movs r0, #0
pop {pc}
float_section float2int64
regular_func float2int64
shimmable_table_tail_call SF_TABLE_FLOAT2INT64 float2int64_shim
float_section float2fix64
regular_func float2fix64
shimmable_table_tail_call SF_TABLE_FLOAT2FIX64 float2fix64_shim
// unsigned long long FUNC_NAME(__aeabi_f2ulz)(float) float to unsigned long long C-style conversion [3]
float_wrapper_section __aeabi_f2ulz
wrapper_func __aeabi_f2ulz
regular_func float2uint64_z
shimmable_table_tail_call SF_TABLE_FLOAT2UINT64 float2uint64_shim
float_section float2ufix64
regular_func float2ufix64
shimmable_table_tail_call SF_TABLE_FLOAT2UFIX64 float2ufix64_shim
float_wrapper_section __aeabi_f2d
1:
#if PICO_FLOAT_PROPAGATE_NANS
// copy sign bit and 25 NAN id bits into sign bit and significant ID bits, also setting the high id bit
asrs r1, r0, #3
movs r2, #0xf
lsls r2, #27
orrs r1, r2
lsls r0, #25
bx lr
#endif
wrapper_func __aeabi_f2d
#if PICO_FLOAT_PROPAGATE_NANS
movs r3, #1
lsls r3, #24
lsls r2, r0, #1
adds r2, r3
bhi 1b
#endif
shimmable_table_tail_call SF_TABLE_FLOAT2DOUBLE float2double_shim
float_wrapper_section sqrtf
wrapper_func_f1 sqrtf
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
// check for negative
asrs r1, r0, #23
bmi 1f
#endif
table_tail_call SF_TABLE_FSQRT
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
1:
mvns r0, r1
cmp r0, #255
bne 2f
// -0 or -Denormal return -0 (0x80000000)
lsls r0, #31
bx lr
2:
// return -Inf (0xff800000)
asrs r0, r1, #31
lsls r0, #23
bx lr
#endif
float_wrapper_section cosf
// note we don't use _f1 since we do an infinity/nan check for outside of range
wrapper_func cosf
// rom version only works for -128 < angle < 128
lsls r1, r0, #1
lsrs r1, #24
cmp r1, #127 + 7
bge 1f
2:
table_tail_call SF_TABLE_FCOS
1:
#if PICO_FLOAT_PROPAGATE_NANS
// also check for infinites
cmp r1, #255
bne 3f
// infinite to nan
movs r1, #1
lsls r1, #22
orrs r0, r1
bx lr
3:
#endif
ldr r1, =0x40c90fdb // 2 * M_PI
push {lr}
bl remainderf
pop {r1}
mov lr, r1
b 2b
float_wrapper_section sinf
// note we don't use _f1 since we do an infinity/nan check for outside of range
wrapper_func sinf
// rom version only works for -128 < angle < 128
lsls r1, r0, #1
lsrs r1, #24
cmp r1, #127 + 7
bge 1f
2:
table_tail_call SF_TABLE_FSIN
1:
#if PICO_FLOAT_PROPAGATE_NANS
// also check for infinites
cmp r1, #255
bne 3f
// infinite to nan
movs r1, #1
lsls r1, #22
orrs r0, r1
bx lr
3:
#endif
ldr r1, =0x40c90fdb // 2 * M_PI
push {lr}
bl remainderf
pop {r1}
mov lr, r1
b 2b
float_wrapper_section sincosf
// note we don't use _f1 since we do an infinity/nan check for outside of range
wrapper_func sincosf
push {r1, r2, lr}
// rom version only works for -128 < angle < 128
lsls r3, r0, #1
lsrs r3, #24
cmp r3, #127 + 7
bge 3f
2:
ldr r3, =sf_table
ldr r3, [r3, #SF_TABLE_FSIN]
blx r3
pop {r2, r3}
str r0, [r2]
str r1, [r3]
pop {pc}
#if PICO_FLOAT_PROPAGATE_NANS
.align 2
pop {pc}
#endif
3:
#if PICO_FLOAT_PROPAGATE_NANS
// also check for infinites
cmp r3, #255
bne 4f
// infinite to nan
movs r3, #1
lsls r3, #22
orrs r0, r3
str r0, [r1]
str r0, [r2]
add sp, #12
bx lr
4:
#endif
ldr r1, =0x40c90fdb // 2 * M_PI
push {lr}
bl remainderf
pop {r1}
mov lr, r1
b 2b
float_wrapper_section tanf
// note we don't use _f1 since we do an infinity/nan check for outside of range
wrapper_func tanf
// rom version only works for -128 < angle < 128
lsls r1, r0, #1
lsrs r1, #24
cmp r1, #127 + 7
bge ftan_out_of_range
ftan_in_range:
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
// to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
ldr r2, =(SIO_BASE)
ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
bcs ftan_save_state
#else
// to avoid worrying about IRQs (or context switches), simply disable interrupts around call
push {r4, lr}
mrs r4, PRIMASK
cpsid i
bl ftan_shim_call
msr PRIMASK, r4
pop {r4, pc}
#endif
ftan_shim_call:
table_tail_call SF_TABLE_FTAN
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
ftan_save_state:
save_div_state_and_lr
bl ftan_shim_call
ldr r2, =(SIO_BASE)
restore_div_state_and_return
#endif
ftan_out_of_range:
#if PICO_FLOAT_PROPAGATE_NANS
// also check for infinites
cmp r1, #255
bne 3f
// infinite to nan
movs r1, #1
lsls r1, #22
orrs r0, r1
bx lr
3:
#endif
ldr r1, =0x40c90fdb // 2 * M_PI
push {lr}
bl remainderf
pop {r1}
mov lr, r1
b ftan_in_range
float_wrapper_section atan2f
wrapper_func_f2 atan2f
shimmable_table_tail_call SF_TABLE_FATAN2 fatan2_shim
float_wrapper_section expf
wrapper_func_f1 expf
table_tail_call SF_TABLE_FEXP
float_wrapper_section logf
wrapper_func_f1 logf
table_tail_call SF_TABLE_FLN


@@ -0,0 +1,369 @@
/*
* Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#if !PICO_RP2040
#include "pico/asm_helper.S"
pico_default_asm_setup
.macro float_section name
#if PICO_FLOAT_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm
.macro float_wrapper_section func
float_section WRAPPER_FUNC_NAME(\func)
.endm
float_wrapper_section conv_tof
@ convert int64 to float, rounding
wrapper_func __aeabi_l2f
regular_func int642float
movs r2,#0 @ fall through
@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
regular_func fix642float
cmp r1,#0
bge 10f @ positive? use unsigned code
rsbs r0,#0
sbc r1,r1,r1,lsl#1 @ make positive
cbz r1,7f @ high word is zero?
clz r3,r1
subs r3,#8
bmi 2f
lsls r1,r3
lsls r12,r0,r3 @ bits that will be lost
rsb r3,#32
lsr r0,r3
orr r0,r0,r1
sub r2,r2,r3
rsb r2,#149
adds r12,r12,r12 @ rounding bit into carry
adc r0,r0,r2,lsl#23 @ insert exponent, add rounding
orr r0,r0,#0x80000000
beq 4f @ potential rounding tie?
cmp r2,#0xfe
bhs 3f @ over/underflow?
bx r14
2:
add r3,#33
lsls r12,r1,r3 @ rounding bit in carry, sticky bits in r12
orrs r12,r12,r0 @ all of low word into sticky bits: affects Z but not C
rsb r3,#33
lsr r0,r1,r3
@ push {r14}
@ bl dumpreg
@ pop {r14}
sub r2,r3,r2
add r2,#22+127+32
adc r0,r0,r2,lsl#23 @ insert exponent, add rounding
orr r0,r0,#0x80000000
beq 4f @ potential rounding tie?
cmp r2,#0xfe
it lo
bxlo r14
@ over/underflow?
3:
mov r0,#0x80000000 @ underflow
it ge
movtge r0,#0xff80 @ overflow
1:
bx r14
7:
mov r1,r2
b fix2float_neg
4:
it cs @ rounding tie?
biccs r0,r0,#1 @ force to even if we rounded up
cmp r2,#0xfe
it lo
bxlo r14
b 3b
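@ Worked example (illustrative values): fix2float below takes the value in r0
@ and the number of fraction bits in r1, returning x*2^-n rounded to
@ nearest-even; e.g. fix2float(0x00018000, 16) is 98304/65536 = 1.5 and packs
@ to 0x3fc00000. int2float is the n=0 case.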
@ convert signed 32-bit fix to float, rounding; number of r0 bits after point in r1
.thumb_func
regular_func fix2float
cmp r0,#0
bge ufix2float @ positive? can use unsigned code
rsbs r0,#0 @ make positive
fix2float_neg:
clz r3,r0
subs r3,#8
bmi 2f
lsls r0,r3
add r2,r1,r3
rsb r2,#149
add r0,r0,r2,lsl#23 @ insert exponent
orr r0,#0x80000000
cmp r2,#0xfe
it lo @ over/underflow?
bxlo r14
b 3f
2:
add r3,#33
lsls r12,r0,r3 @ rounding bit in carry, sticky bits in r12
rsb r3,#33
lsr r0,r3
@ push {r14}
@ bl dumpreg
@ pop {r14}
sub r2,r3,r1
add r2,#22+127
adc r0,r0,r2,lsl#23 @ insert exponent
orr r0,#0x80000000
beq 4f @ potential rounding tie?
cmp r2,#0xfe
it lo
bxlo r14
@ over/underflow?
3:
mov r0,#0x80000000 @ underflow
it ge
orrge r0,#0x7f800000 @ overflow
1:
bx r14
4:
it cs @ rounding tie?
biccs r0,r0,#1 @ force to even if we rounded up
cmp r2,#0xfe
it lo
bxlo r14
b 3b
@ convert unsigned 32-bit fix to float, rounding; number of r0 bits after point in r1
regular_func ufix2float
cbz r0,1f @ zero? return it
clz r3,r0
subs r3,#8
bmi 2f
lsls r0,r3
add r2,r1,r3
rsb r2,#149
add r0,r0,r2,lsl#23 @ insert exponent
@ push {r14}
@ bl dumpreg
@ pop {r14}
cmp r2,#0xfe
it lo @ over/underflow?
bxlo r14
b 3f
2:
add r3,#33
lsls r12,r0,r3 @ rounding bit in carry, sticky bits in r12
rsb r3,#33
lsr r0,r3
@ push {r14}
@ bl dumpreg
@ pop {r14}
sub r2,r3,r1
add r2,#22+127
adc r0,r0,r2,lsl#23 @ insert exponent
beq 4f @ potential rounding tie?
cmp r2,#0xfe
it lo
bxlo r14
@ over/underflow?
3:
ite ge
movge r0,#0x7f800000 @ overflow
movlt r0,#0x00000000 @ underflow
1:
bx r14
4:
it cs @ rounding tie?
biccs r0,r0,#1 @ force to even if we rounded up
cmp r2,#0xfe
it lo
bxlo r14
b 3b
@ convert uint64 to float, rounding
wrapper_func __aeabi_ul2f
regular_func uint642float
movs r2,#0 @ fall through
@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
regular_func ufix642float
10:
cbz r1,7f @ high word is zero?
clz r3,r1
subs r3,#8
bmi 2f
lsls r1,r3
lsls r12,r0,r3 @ bits that will be lost
rsb r3,#32
lsr r0,r3
orr r0,r0,r1
sub r2,r2,r3
rsb r2,#149
adds r12,r12,r12 @ rounding bit into carry
adc r0,r0,r2,lsl#23 @ insert exponent, add rounding
beq 4f @ potential rounding tie?
cmp r2,#0xfe
bhs 3f @ over/underflow?
bx r14
2:
add r3,#33
lsls r12,r1,r3 @ rounding bit in carry, sticky bits in r12
orrs r12,r12,r0 @ all of low word into sticky bits: affects Z but not C
rsb r3,#33
lsr r0,r1,r3
@ push {r14}
@ bl dumpreg
@ pop {r14}
sub r2,r3,r2
add r2,#22+127+32
adc r0,r0,r2,lsl#23 @ insert exponent, add rounding
beq 4f @ potential rounding tie?
cmp r2,#0xfe
it lo
bxlo r14
@ over/underflow?
3:
ite ge
movge r0,#0x7f800000 @ overflow
movlt r0,#0x00000000 @ underflow
1:
bx r14
7:
mov r1,r2
b ufix2float
4:
it cs @ rounding tie?
biccs r0,r0,#1 @ force to even if we rounded up
cmp r2,#0xfe
it lo
bxlo r14
b 3b
float_wrapper_section conv_ftoi64
@ convert float to signed int64, rounding towards 0, clamping
wrapper_func __aeabi_f2lz
regular_func float2int64_z
movs r1,#0 @ fall through
@ convert float in r0 to signed fixed point in r0:r1, clamping
regular_func float2fix64_z
subs r1,#0x95 @ remove exponent bias, compensate for mantissa length
asrs r2,r0,#23 @ sign and exponent
sub r3,r2,#1
sub r0,r0,r3,lsl#23 @ install implied 1, clear exponent
uxtb r3,r3
cmp r3,#0xfe
bhs 1f @ 0 or Inf/NaN?
adds r1,r3 @ offset exponent by fix precision; r1 is now required left shift
bmi 4f @ actually a right shift?
subs r3,r1,#32 @ result fits in high 32 bits only?
bge 8f
subs r3,r1,#8 @ result fits in low 32 bits only?
ble 7f
lsls r0,#8
rsbs r1,r3,#32
lsrs r1,r0,r1
lsls r0,r3
cmp r2,#0
it ge
bxge r14
rsbs r0,#0 @ negate if necessary
sbcs r1,r1,r1,lsl#1
bx r14
7:
lsls r0,r0,r1
movs r1,r2,asr#31 @ sign extend
eors r0,r0,r1 @ negate if necessary
subs r0,r0,r1
bx r14
8:
cmp r3,#8 @ overflow?
bge 5f
lsls r0,r0,r3
eor r1,r0,r2,asr#31 @ negate if necessary
add r1,r1,r2,lsr#31
movs r0,#0
bx r14
1:
bhi 3f @ 0?
lsls r1,r0,#9 @ mantissa field
it ne @ NaN?
movne r2,#0 @ treat NaNs as +∞
5:
mvn r1,#0x80000000 @ = 0x7fffffff
add r1,r1,r2,lsr#31 @ so -Inf → 0x80000000, +Inf → 0x7fffffff
mvn r0,r2,asr#31
bx r14
3:
movs r0,#0
movs r1,#0
bx r14
4:
rsbs r1,#0
usat r1,#5,r1
lsrs r0,r0,r1
eors r0,r0,r2,asr#31 @ negate if necessary
adds r0,r0,r2,lsr#31
movs r1,r0,asr#31 @ sign extend
bx r14
float_wrapper_section conv_ftoui64
@ convert float to unsigned int64, rounding towards -Inf, clamping
wrapper_func __aeabi_f2ulz
regular_func float2uint64
regular_func float2uint64_z
movs r1,#0 @ fall through
@ convert float in r0 to unsigned fixed point in r0:r1, clamping
regular_func float2ufix64
//regular_func float2ufix64_z
subs r1,#0x96 @ remove exponent bias, compensate for mantissa length
asrs r2,r0,#23 @ sign and exponent
sub r3,r2,#1
cmp r3,#0xfe
bhs 1f @ -ve, 0 or Inf/NaN?
sub r0,r0,r3,lsl#23 @ install implied 1, clear exponent
adds r1,r2 @ offset exponent by fix precision; r1 is now required left shift
bmi 4f @ actually a right shift?
subs r2,r1,#7
ble 7f @ result (easily) fits in lo 32 bits?
subs r3,r1,#32
bge 8f @ results might fit in hi 32 bits?
lsls r0,r0,#7
rsbs r3,r2,#32
lsrs r1,r0,r3
lsls r0,r0,r2
bx r14
7:
lsls r0,r1
movs r1,#0
bx r14
8:
cmp r1,#32+9 @ overflow?
bge 5f
lsls r1,r0,r3
movs r0,#0
bx r14
5:
mvn r0,#0 @ = 0xffffffff
mvn r1,#0 @ = 0xffffffff
bx r14
4:
rsbs r1,#0
usat r1,#5,r1 @ if shift is long return 0
lsrs r0,r0,r1
movs r1,#0
bx r14
1:
cmp r0,#0xff800000
bhi 5b @ -NaN, return 0xffffffff
cmp r0,#0x00800000
bgt 5b @ +Inf or +NaN, return 0xffffffff
2:
movs r0,#0 @ return 0
movs r1,#0
bx r14
#endif


@@ -0,0 +1,70 @@
/*
* Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include <string.h>
#include "pico/bootrom.h"
#include "pico/bootrom/sf_table.h"
// NOTE THIS FUNCTION TABLE IS NOT PUBLIC OR NECESSARILY COMPLETE...
// IT IS ***NOT*** SAFE TO CALL THESE FUNCTION POINTERS FROM ARBITRARY CODE
uint32_t sf_table[SF_TABLE_V2_SIZE / 2];
void __attribute__((weak)) *sf_clz_func;
#if !(PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED)
static __attribute__((noreturn)) void missing_float_func_shim(void) {
panic("");
}
#endif
void __aeabi_float_init(void) {
int rom_version = rp2040_rom_version();
void *rom_table = rom_data_lookup(rom_table_code('S', 'F'));
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
if (rom_version == 1) {
memcpy(&sf_table, rom_table, SF_TABLE_V1_SIZE);
extern void float_table_shim_on_use_helper(void);
// todo replace NDEBUG with a more exclusive assertion guard
#ifndef NDEBUG
if (*(uint16_t *)0x29ee != 0x0fc4 || // this is packx
*(uint16_t *)0x29c0 != 0x0dc2 || // this is upackx
*(uint16_t *)0x2b96 != 0xb5c0 || // this is cordic_vec
*(uint16_t *)0x2b18 != 0x2500 || // this is packretns
*(uint16_t *)0x2acc != 0xb510 || // this is float2fix
*(uint32_t *)0x2cfc != 0x6487ed51 // pi_q29
) {
panic("");
}
#endif
// this is a little tricky.. we only want to pull in a shim if the corresponding function
// is called. to that end we include a SVC instruction with the table offset as the call number
// followed by the shim function pointer inside the actual wrapper function. that way if the wrapper
// function is garbage collected, so is the shim function.
//
// float_table_shim_on_use_helper expects this SVC instruction in the calling code soon after the address
// pointed to by IP and patches the float_table entry with the real shim the first time the function is called.
for(uint i=SF_TABLE_V1_SIZE/4; i<SF_TABLE_V2_SIZE/4; i++) {
sf_table[i] = (uintptr_t)float_table_shim_on_use_helper;
}
// we shim these for -0 and -denormal handling
sf_table[SF_TABLE_FLOAT2INT/4] = sf_table[SF_TABLE_FLOAT2FIX/4] = (uintptr_t)float_table_shim_on_use_helper;
}
#else
if (rom_version == 1) {
memcpy(&sf_table, rom_table, SF_TABLE_V1_SIZE);
// opting for soft failure for now - you'll get a panic at runtime if you call any of the missing methods
for(uint i=0;i<SF_TABLE_V2_SIZE/4;i++) {
if (!sf_table[i]) sf_table[i] = (uintptr_t)missing_float_func_shim;
}
}
#endif
if (rom_version >= 2) {
assert(*((uint8_t *)rom_table-2) * 4 >= SF_TABLE_V2_SIZE);
memcpy(&sf_table, rom_table, SF_TABLE_V2_SIZE);
}
sf_clz_func = rom_func_lookup(ROM_FUNC_CLZ32);
}


@@ -0,0 +1,582 @@
/*
* Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include "pico/float.h"
// opened a separate issue https://github.com/raspberrypi/pico-sdk/issues/166 to deal with these warnings if at all
GCC_Pragma("GCC diagnostic push")
GCC_Pragma("GCC diagnostic ignored \"-Wconversion\"")
GCC_Pragma("GCC diagnostic ignored \"-Wsign-conversion\"")
typedef uint32_t ui32;
typedef int32_t i32;
#define FPINF ( HUGE_VALF)
#define FMINF (-HUGE_VALF)
#define NANF ((float)NAN)
#define PZERO (+0.0)
#define MZERO (-0.0)
#define PI 3.14159265358979323846
#define LOG2 0.69314718055994530941
// Unfortunately in double precision ln(10) is very close to half-way between two representable numbers
#define LOG10 2.30258509299404568401
#define LOG2E 1.44269504088896340737
#define LOG10E 0.43429448190325182765
#define ONETHIRD 0.33333333333333333333
#define PIf 3.14159265358979323846f
#define LOG2f 0.69314718055994530941f
#define LOG2Ef 1.44269504088896340737f
#define LOG10Ef 0.43429448190325182765f
#define ONETHIRDf 0.33333333333333333333f
#define FUNPACK(x,e,m) e=((x)>>23)&0xff,m=((x)&0x007fffff)|0x00800000
#define FUNPACKS(x,s,e,m) s=((x)>>31),FUNPACK((x),(e),(m))
typedef union {
float f;
ui32 ix;
} float_ui32;
static inline float ui322float(ui32 ix) {
float_ui32 tmp;
tmp.ix = ix;
return tmp.f;
}
static inline ui32 float2ui32(float f) {
float_ui32 tmp;
tmp.f = f;
return tmp.ix;
}
#if PICO_FLOAT_PROPAGATE_NANS
static inline bool fisnan(float x) {
ui32 ix=float2ui32(x);
return ix * 2 > 0xff000000u;
}
#define check_nan_f1(x) if (fisnan((x))) return (x)
#define check_nan_f2(x,y) if (fisnan((x))) return (x); else if (fisnan((y))) return (y);
#else
#define check_nan_f1(x) ((void)0)
#define check_nan_f2(x,y) ((void)0)
#endif
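/*
 * Quick sanity check of the doubled-bits NaN test above (illustrative values,
 * assuming the usual quiet-NaN encoding 0x7fc00000 for NANF): doubling ix
 * discards the sign bit, so only a NaN (exponent all ones, mantissa non-zero)
 * compares strictly greater than 0xff000000u.
 *
 *   fisnan(NANF)   -> true   (0x7fc00000u * 2 == 0xff800000u  >  0xff000000u)
 *   fisnan(FPINF)  -> false  (0x7f800000u * 2 == 0xff000000u, not greater)
 *   fisnan(-1.0f)  -> false  (0xbf800000u * 2 == 0x7f000000u, wraps low)
 */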
static inline int fgetsignexp(float x) {
ui32 ix=float2ui32(x);
return (ix>>23)&0x1ff;
}
static inline int fgetexp(float x) {
ui32 ix=float2ui32(x);
return (ix>>23)&0xff;
}
static inline float fldexp(float x,int de) {
ui32 ix=float2ui32(x),iy;
int e;
e=fgetexp(x);
if(e==0||e==0xff) return x;
e+=de;
if(e<=0) iy=ix&0x80000000; // signed zero for underflow
else if(e>=0xff) iy=(ix&0x80000000)|0x7f800000ULL; // signed infinity on overflow
else iy=ix+((ui32)de<<23);
return ui322float(iy);
}
float WRAPPER_FUNC(ldexpf)(float x, int de) {
check_nan_f1(x);
return fldexp(x, de);
}
static inline float fcopysign(float x,float y) {
ui32 ix=float2ui32(x),iy=float2ui32(y);
ix=((ix&0x7fffffff)|(iy&0x80000000));
return ui322float(ix);
}
float WRAPPER_FUNC(copysignf)(float x, float y) {
check_nan_f2(x,y);
return fcopysign(x, y);
}
static inline int fiszero(float x) { return fgetexp (x)==0; }
//static inline int fispzero(float x) { return fgetsignexp(x)==0; }
//static inline int fismzero(float x) { return fgetsignexp(x)==0x100; }
static inline int fisinf(float x) { return fgetexp (x)==0xff; }
static inline int fispinf(float x) { return fgetsignexp(x)==0xff; }
static inline int fisminf(float x) { return fgetsignexp(x)==0x1ff; }
static inline int fisint(float x) {
ui32 ix=float2ui32(x),m;
int e=fgetexp(x);
if(e==0) return 1; // 0 is an integer
e-=0x7f; // remove exponent bias
if(e<0) return 0; // |x|<1
e=23-e; // bit position in mantissa with significance 1
if(e<=0) return 1; // |x| large, so must be an integer
m=(1<<e)-1; // mask for bits of significance <1
if(ix&m) return 0; // not an integer
return 1;
}
static inline int fisoddint(float x) {
ui32 ix=float2ui32(x),m;
int e=fgetexp(x);
e-=0x7f; // remove exponent bias
if(e<0) return 0; // |x|<1; 0 is not odd
e=23-e; // bit position in mantissa with significance 1
if(e<0) return 0; // |x| large, so must be even
m=(1<<e)-1; // mask for bits of significance <1 (if any)
if(ix&m) return 0; // not an integer
if(e==23) return 1; // value is exactly 1
return (ix>>e)&1;
}
static inline int fisstrictneg(float x) {
ui32 ix=float2ui32(x);
if(fiszero(x)) return 0;
return ix>>31;
}
static inline int fisneg(float x) {
ui32 ix=float2ui32(x);
return ix>>31;
}
static inline float fneg(float x) {
ui32 ix=float2ui32(x);
ix^=0x80000000;
return ui322float(ix);
}
static inline int fispo2(float x) {
ui32 ix=float2ui32(x);
if(fiszero(x)) return 0;
if(fisinf(x)) return 0;
ix&=0x007fffff;
return ix==0;
}
static inline float fnan_or(float x) {
#if PICO_FLOAT_PROPAGATE_NANS
return NANF;
#else
return x;
#endif
}
float WRAPPER_FUNC(truncf)(float x) {
check_nan_f1(x);
ui32 ix=float2ui32(x),m;
int e=fgetexp(x);
e-=0x7f; // remove exponent bias
if(e<0) { // |x|<1
ix&=0x80000000;
return ui322float(ix);
}
e=23-e; // bit position in mantissa with significance 1
if(e<=0) return x; // |x| large, so must be an integer
m=(1<<e)-1; // mask for bits of significance <1
ix&=~m;
return ui322float(ix);
}
float WRAPPER_FUNC(roundf)(float x) {
check_nan_f1(x);
ui32 ix=float2ui32(x),m;
int e=fgetexp(x);
e-=0x7f; // remove exponent bias
if(e<-1) { // |x|<0.5
ix&=0x80000000;
return ui322float(ix);
}
if(e==-1) { // 0.5<=|x|<1
ix&=0x80000000;
ix|=0x3f800000; // ±1
return ui322float(ix);
}
e=23-e; // bit position in mantissa with significance 1, <=23
if(e<=0) return x; // |x| large, so must be an integer
m=1<<(e-1); // mask for bit of significance 0.5
ix+=m;
m=m+m-1; // mask for bits of significance <1
ix&=~m;
return ui322float(ix);
}
float WRAPPER_FUNC(floorf)(float x) {
check_nan_f1(x);
ui32 ix=float2ui32(x),m;
int e=fgetexp(x);
if(e==0) { // x==0
ix&=0x80000000;
return ui322float(ix);
}
e-=0x7f; // remove exponent bias
if(e<0) { // |x|<1, not zero
if(fisneg(x)) return -1;
return PZERO;
}
e=23-e; // bit position in mantissa with significance 1
if(e<=0) return x; // |x| large, so must be an integer
m=(1<<e)-1; // mask for bit of significance <1
if(fisneg(x)) ix+=m; // add 1-ε to magnitude if negative
ix&=~m; // truncate
return ui322float(ix);
}
float WRAPPER_FUNC(ceilf)(float x) {
check_nan_f1(x);
ui32 ix=float2ui32(x),m;
int e=fgetexp(x);
if(e==0) { // x==0
ix&=0x80000000;
return ui322float(ix);
}
e-=0x7f; // remove exponent bias
if(e<0) { // |x|<1, not zero
if(fisneg(x)) return MZERO;
return 1;
}
e=23-e; // bit position in mantissa with significance 1
if(e<=0) return x; // |x| large, so must be an integer
m=(1<<e)-1; // mask for bit of significance <1
if(!fisneg(x)) ix+=m; // add 1-ε to magnitude if positive
ix&=~m; // truncate
return ui322float(ix);
}
float WRAPPER_FUNC(asinf)(float x) {
check_nan_f1(x);
float u;
u=(1.0f-x)*(1.0f+x);
if(fisstrictneg(u)) return fnan_or(FPINF);
return atan2f(x,sqrtf(u));
}
float WRAPPER_FUNC(acosf)(float x) {
check_nan_f1(x);
float u;
u=(1.0f-x)*(1.0f+x);
if(fisstrictneg(u)) return fnan_or(FPINF);
return atan2f(sqrtf(u),x);
}
float WRAPPER_FUNC(atanf)(float x) {
check_nan_f1(x);
if(fispinf(x)) return (float)( PIf/2);
if(fisminf(x)) return (float)(-PIf/2);
return atan2f(x,1.0f);
}
float WRAPPER_FUNC(sinhf)(float x) {
check_nan_f1(x);
return fldexp((expf(x)-expf(fneg(x))),-1);
}
float WRAPPER_FUNC(coshf)(float x) {
check_nan_f1(x);
return fldexp((expf(x)+expf(fneg(x))),-1);
}
float WRAPPER_FUNC(tanhf)(float x) {
check_nan_f1(x);
float u;
int e;
e=fgetexp(x);
if(e>=4+0x7f) { // |x|>=16?
if(!fisneg(x)) return 1; // 1 << exp 2x; avoid generating infinities later
else return -1; // 1 >> exp 2x
}
u=expf(fldexp(x,1));
return (u-1.0f)/(u+1.0f);
}
float WRAPPER_FUNC(asinhf)(float x) {
check_nan_f1(x);
int e;
e=fgetexp(x);
if(e>=16+0x7f) { // |x|>=2^16?
if(!fisneg(x)) return logf( x )+LOG2f; // 1/x^2 << 1
else return fneg(logf(fneg(x))+LOG2f); // 1/x^2 << 1
}
if(x>0) return (float)log(sqrt((double)x*(double)x+1.0)+(double)x);
else return fneg((float)log(sqrt((double)x*(double)x+1.0)-(double)x));
}
float WRAPPER_FUNC(acoshf)(float x) {
check_nan_f1(x);
int e;
if(fisneg(x)) x=fneg(x);
e=fgetexp(x);
if(e>=16+0x7f) return logf(x)+LOG2f; // |x|>=2^16?
return (float)log(sqrt(((double)x+1.0)*((double)x-1.0))+(double)x);
}
float WRAPPER_FUNC(atanhf)(float x) {
check_nan_f1(x);
return fldexp(logf((1.0f+x)/(1.0f-x)),-1);
}
float WRAPPER_FUNC(exp2f)(float x) { check_nan_f1(x); return (float)exp((double)x*LOG2); }
float WRAPPER_FUNC(log2f)(float x) { check_nan_f1(x); return logf(x)*LOG2Ef; }
float WRAPPER_FUNC(exp10f)(float x) { check_nan_f1(x); return (float)exp((double)x*LOG10); }
float WRAPPER_FUNC(log10f)(float x) { check_nan_f1(x); return logf(x)*LOG10Ef; }
float WRAPPER_FUNC(expm1f)(float x) { check_nan_f1(x); return (float)(exp((double)x)-1); }
float WRAPPER_FUNC(log1pf)(float x) { check_nan_f1(x); return (float)(log(1+(double)x)); }
float WRAPPER_FUNC(fmaf)(float x,float y,float z) {
check_nan_f2(x,y);
check_nan_f1(z);
return (float)((double)x*(double)y+(double)z);
} // has double rounding so not exact
// general power, x>0
static inline float fpow_1(float x,float y) {
return (float)exp(log((double)x)*(double)y); // using double-precision intermediates for better accuracy
}
static float fpow_int2(float x,int y) {
float u;
if(y==1) return x;
u=fpow_int2(x,y/2);
u*=u;
if(y&1) u*=x;
return u;
}
// for the case where x not zero or infinity, y small and not zero
static inline float fpowint_1(float x,int y) {
if(y<0) x=1.0f/x,y=-y;
return fpow_int2(x,y);
}
// for the case where x not zero or infinity
static float fpowint_0(float x,int y) {
int e;
if(fisneg(x)) {
if(fisoddint(y)) return fneg(fpowint_0(fneg(x),y));
else return fpowint_0(fneg(x),y);
}
if(fispo2(x)) {
e=fgetexp(x)-0x7f;
if(y>=256) y= 255; // avoid overflow
if(y<-256) y=-256;
y*=e;
return fldexp(1,y);
}
if(y==0) return 1;
if(y>=-32&&y<=32) return fpowint_1(x,y);
return fpow_1(x,y);
}
float WRAPPER_FUNC(powintf)(float x,int y) {
GCC_Pragma("GCC diagnostic push")
GCC_Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
if(x==1.0f||y==0) return 1;
if(x==0.0f) {
if(y>0) {
if(y&1) return x;
else return 0;
}
if((y&1)) return fcopysign(FPINF,x);
return FPINF;
}
GCC_Pragma("GCC diagnostic pop")
check_nan_f1(x);
if(fispinf(x)) {
if(y<0) return 0;
else return FPINF;
}
if(fisminf(x)) {
if(y>0) {
if((y&1)) return FMINF;
else return FPINF;
}
if((y&1)) return MZERO;
else return PZERO;
}
return fpowint_0(x,y);
}
// for the case where y is guaranteed a finite integer, x not zero or infinity
static float fpow_0(float x,float y) {
int e,p;
if(fisneg(x)) {
if(fisoddint(y)) return fneg(fpow_0(fneg(x),y));
else return fpow_0(fneg(x),y);
}
p=(int)y;
if(fispo2(x)) {
e=fgetexp(x)-0x7f;
if(p>=256) p= 255; // avoid overflow
if(p<-256) p=-256;
p*=e;
return fldexp(1,p);
}
if(p==0) return 1;
if(p>=-32&&p<=32) return fpowint_1(x,p);
return fpow_1(x,y);
}
float WRAPPER_FUNC(powf)(float x,float y) {
GCC_Like_Pragma("GCC diagnostic push")
GCC_Like_Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
if(x==1.0f||fiszero(y)) return 1;
check_nan_f2(x,y);
if(x==-1.0f&&fisinf(y)) return 1;
GCC_Like_Pragma("GCC diagnostic pop")
if(fiszero(x)) {
if(!fisneg(y)) {
if(fisoddint(y)) return x;
else return 0;
}
if(fisoddint(y)) return fcopysign(FPINF,x);
return FPINF;
}
if(fispinf(x)) {
if(fisneg(y)) return 0;
else return FPINF;
}
if(fisminf(x)) {
if(!fisneg(y)) {
if(fisoddint(y)) return FMINF;
else return FPINF;
}
if(fisoddint(y)) return MZERO;
else return PZERO;
}
if(fispinf(y)) {
if(fgetexp(x)<0x7f) return PZERO;
else return FPINF;
}
if(fisminf(y)) {
if(fgetexp(x)<0x7f) return FPINF;
else return PZERO;
}
if(fisint(y)) return fpow_0(x,y);
if(fisneg(x)) return FPINF;
return fpow_1(x,y);
}
float WRAPPER_FUNC(hypotf)(float x,float y) {
check_nan_f2(x,y);
int ex,ey;
ex=fgetexp(x); ey=fgetexp(y);
if(ex>=0x7f+50||ey>=0x7f+50) { // overflow, or nearly so
x=fldexp(x,-70),y=fldexp(y,-70);
return fldexp(sqrtf(x*x+y*y), 70);
}
else if(ex<=0x7f-50&&ey<=0x7f-50) { // underflow, or nearly so
x=fldexp(x, 70),y=fldexp(y, 70);
return fldexp(sqrtf(x*x+y*y),-70);
}
return sqrtf(x*x+y*y);
}
float WRAPPER_FUNC(cbrtf)(float x) {
check_nan_f1(x);
int e;
if(fisneg(x)) return fneg(cbrtf(fneg(x)));
if(fiszero(x)) return fcopysign(PZERO,x);
e=fgetexp(x)-0x7f;
e=(e*0x5555+0x8000)>>16; // ~e/3, rounded
x=fldexp(x,-e*3);
x=expf(logf(x)*ONETHIRDf);
return fldexp(x,e);
}
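/*
 * The "(e*0x5555+0x8000)>>16" step above is fixed-point rounding of e/3:
 * 0x5555/0x10000 = 21845/65536 ~ 1/3, and adding 0x8000 rounds to nearest,
 * e.g. e=7 gives (7*21845+32768)>>16 = 185683>>16 = 2 = round(7/3). The input
 * is pre-scaled by 2^(-3e) so the cube root lands near 1, and fldexp(x,e)
 * restores the exponent at the end.
 */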
// reduces mx*2^e modulo my, returning bottom bits of quotient at *pquo
// 2^23<=|mx|,my<2^24, e>=0; 0<=result<my
static i32 frem_0(i32 mx,i32 my,int e,int*pquo) {
int quo=0,q,r=0,s;
if(e>0) {
r=0xffffffffU/(ui32)(my>>7); // reciprocal estimate Q16
}
while(e>0) {
s=e; if(s>12) s=12; // gain up to 12 bits on each iteration
q=(mx>>9)*r; // Q30
q=((q>>(29-s))+1)>>1; // Q(s), rounded
mx=(mx<<s)-my*q;
quo=(quo<<s)+q;
e-=s;
}
if(mx>=my) mx-=my,quo++; // when e==0 mx can be nearly as big as 2my
if(mx>=my) mx-=my,quo++;
if(mx<0) mx+=my,quo--;
if(mx<0) mx+=my,quo--;
if(pquo) *pquo=quo;
return mx;
}
float WRAPPER_FUNC(fmodf)(float x,float y) {
check_nan_f2(x,y);
ui32 ix=float2ui32(x),iy=float2ui32(y);
int sx,ex,ey;
i32 mx,my;
FUNPACKS(ix,sx,ex,mx);
FUNPACK(iy,ey,my);
if(ex==0xff) {
return fnan_or(FPINF);
}
if(ey==0) return FPINF;
if(ex==0) {
if(!fisneg(x)) return PZERO;
return MZERO;
}
if(ex<ey) return x; // |x|<|y|, including case x=±0
mx=frem_0(mx,my,ex-ey,0);
if(sx) mx=-mx;
return fix2float(mx,0x7f-ey+23);
}
float WRAPPER_FUNC(remquof)(float x,float y,int*quo) {
check_nan_f2(x,y);
ui32 ix=float2ui32(x),iy=float2ui32(y);
int sx,sy,ex,ey,q;
i32 mx,my;
FUNPACKS(ix,sx,ex,mx);
FUNPACKS(iy,sy,ey,my);
if(quo) *quo=0;
if(ex==0xff) return FPINF;
if(ey==0) return FPINF;
if(ex==0) return PZERO;
if(ey==0xff) return x;
if(ex<ey-1) return x; // |x|<|y|/2
if(ex==ey-1) {
if(mx<=my) return x; // |x|<=|y|/2, even quotient
// here |y|/2<|x|<|y|
if(!sx) { // x>|y|/2
mx-=my+my;
ey--;
q=1;
} else { // x<-|y|/2
mx=my+my-mx;
ey--;
q=-1;
}
}
else {
if(sx) mx=-mx;
mx=frem_0(mx,my,ex-ey,&q);
if(mx+mx>my || (mx+mx==my&&(q&1)) ) { // |x|>|y|/2, or equality and an odd quotient?
mx-=my;
q++;
}
}
if(sy) q=-q;
if(quo) *quo=q;
return fix2float(mx,0x7f-ey+23);
}
float WRAPPER_FUNC(dremf)(float x,float y) { check_nan_f2(x,y); return remquof(x,y,0); }
float WRAPPER_FUNC(remainderf)(float x,float y) { check_nan_f2(x,y); return remquof(x,y,0); }
GCC_Pragma("GCC diagnostic pop") // conversion

View file

@ -0,0 +1,82 @@
/*
* Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include "pico/asm_helper.S"
#include "pico/bootrom/sf_table.h"
pico_default_asm_setup
wrapper_func __aeabi_fadd
wrapper_func __aeabi_fdiv
wrapper_func __aeabi_fmul
wrapper_func __aeabi_frsub
wrapper_func __aeabi_fsub
wrapper_func __aeabi_cfcmpeq
wrapper_func __aeabi_cfrcmple
wrapper_func __aeabi_cfcmple
wrapper_func __aeabi_fcmpeq
wrapper_func __aeabi_fcmplt
wrapper_func __aeabi_fcmple
wrapper_func __aeabi_fcmpge
wrapper_func __aeabi_fcmpgt
wrapper_func __aeabi_fcmpun
wrapper_func __aeabi_i2f
wrapper_func __aeabi_l2f
wrapper_func __aeabi_ui2f
wrapper_func __aeabi_ul2f
wrapper_func __aeabi_f2iz
wrapper_func __aeabi_f2lz
wrapper_func __aeabi_f2uiz
wrapper_func __aeabi_f2ulz
wrapper_func __aeabi_f2d
wrapper_func sqrtf
wrapper_func cosf
wrapper_func sinf
wrapper_func tanf
wrapper_func atan2f
wrapper_func expf
wrapper_func logf
wrapper_func ldexpf
wrapper_func copysignf
wrapper_func truncf
wrapper_func floorf
wrapper_func ceilf
wrapper_func roundf
wrapper_func sincosf
wrapper_func asinf
wrapper_func acosf
wrapper_func atanf
wrapper_func sinhf
wrapper_func coshf
wrapper_func tanhf
wrapper_func asinhf
wrapper_func acoshf
wrapper_func atanhf
wrapper_func exp2f
wrapper_func log2f
wrapper_func exp10f
wrapper_func log10f
wrapper_func powf
wrapper_func powintf
wrapper_func hypotf
wrapper_func cbrtf
wrapper_func fmodf
wrapper_func dremf
wrapper_func remainderf
wrapper_func remquof
wrapper_func expm1f
wrapper_func log1pf
wrapper_func fmaf
#ifdef __riscv
la a0, str
j panic
#else
push {lr} // keep stack trace sane
ldr r0, =str
bl panic
#endif
str:
.asciz "float support is disabled"


@@ -0,0 +1,801 @@
/*
* Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#if !PICO_RP2040
#include "pico/asm_helper.S"
pico_default_asm_setup
.macro float_section name
#if PICO_FLOAT_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm
.macro float_wrapper_section func
float_section WRAPPER_FUNC_NAME(\func)
.endm
.extern rtwopi
.extern logtab0
.extern exptab0
.extern exptab1
.extern exptab2
.extern trigtab
@ load a 32-bit constant n into register rx
.macro movlong rx,n
movw \rx,#(\n)&0xffff
movt \rx,#((\n)>>16)&0xffff
.endm
float_section frrcore
@ input:
@ r0 mantissa m Q23
@ r1 exponent e>=-32, typically offset by +9
@ output:
@ r0..r1 preserved
@ r6 range reduced result
@ r2,r3,r4,r5 trashed
frr_core:
ldr r2,=rtwopi
asrs r3,r1,#5 @ k=e/32, k<=5 for e offsets up to 9+32
add r2,r2,r3,lsl#2 @ p
and r3,r1,#31 @ s=e%32
mov r4,#1
lsls r4,r4,r3 @ 1<<s
umull r3,r4,r4,r0
@ r2 p
@ r3:r4 u0:u1 = m<<(e%32); u1 is never more than 2<<23
ldr r5,[r2,#12] @ a0=p[3]
umull r5,r6,r5,r4 @ r0=a0*u1 hi, discard lo
@ r6 r0
ldr r5,[r2,#8] @ a1=p[2]
mla r6,r5,r4,r6 @ a1*u1 lo, discard hi
umlal r5,r6,r5,r3 @ a1*u0 hi, discard lo
@ r6 r0
ldr r5,[r2,#4] @ a2=p[1]
mla r6,r5,r3,r6 @ r0+=a2*u0
bx r14
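@ In effect (illustrative summary): r6 comes back as ~frac(x/(2*pi)) in Q32 for
@ the original argument x, so its top two bits give the quadrant; the multiply
@ chain uses a window of three consecutive 32-bit fraction words of 1/(2*pi)
@ from rtwopi, selected by e/32.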
float_wrapper_section expf
2:
@ we could use fadd macro to calculate x+1 here
@ need to add quadratic term
rsb r3,#1 @ 1-(e+9)
lsrs r0,r3 @ Q31
smmulr r1,r0,r0 @ Q30
cmp r12,#0
bne 1f
add r0,r0,r1
lsrs r0,#8
adc r0,r0,#0x3f800000 @ result is just over 1
bx r14
1:
sub r0,r1,r0
asrs r0,#7
adc r0,r0,#0x3f800000 @ result is just under 1
bx r14
7:
movs r1,#1
bfi r0,r1,#23,#9 @ implied 1, clear sign bit
adds r3,#9
bmi 2b @ <~2^-9? return 1+x+x²/2
movlong r1,0xb17217f8 @ ln2 Q32
lsls r0,r3 @ Q32
cmp r12,#0
bne 8f
cmp r0,r1
blo 6f
sub r0,r0,r1
mov r12,#1
b 6f
8:
mvn r12,#0
subs r0,r1,r0
bhs 6f
add r0,r0,r1
mvn r12,#1
b 6f
wrapper_func expf
ands r12,r0,#1<<31 @ save sign
ubfx r3,r0,#23,#8 @ get exponent
subs r3,r3,#0x7f
bmi 7b @ e<0?
cmp r3,#7
bge 20f @ overflow, Inf or NaN?
movs r1,#1
bfi r0,r1,#23,#9 @ implied 1, clear sign bit
lsls r0,r3 @ x Q23
eors r0,r0,r12,asr#31
adds r0,r0,r12,lsr#31 @ negate if sign bit set
mov r1,#0xB8AA @ 1/ln2 Q15, rounded down
add r1,r1,r12,lsr#31 @ rounded up if we have a negative argument
smull r1,r2,r1,r0 @ Q38
movlong r1,0xb17217f8 @ ln2 Q32
asr r12,r2,#6 @ Q0 exponent candidate
lsls r0,#9 @ Q32
mls r0,r12,r1,r0 @ Q32-Q0*Q32=Q32
6:
lsrs r1,r0,#28
ldr r2,=exptab0
add r2,r2,r1,lsl#3
ldmia r2,{r2-r3}
and r2,r2,#63
orr r2,#64 @ y=(t&0x3f)+0x40; Q6
sub r0,r3
lsrs r1,r0,#24
ldr r3,=exptab1+4
ldr r3,[r3,r1,lsl#3]
add r1,#256
muls r2,r2,r1 @ y Q14
sub r0,r3
lsrs r1,r0,#21
ldr r3,=exptab2+4
ldr r3,[r3,r1,lsl#3]
add r1,r1,r1
add r1,#4096
mla r2,r2,r1,r2 @ y Q26
sub r0,r3 @ ε Q32
smmlar r2,r2,r0,r2 @ y(1+ε) Q26
cmp r2,#1<<27
bhs 1f
2:
cmp r12,#0x7f
bgt 23f
cmn r12,#0x7e
blt 23f
lsls r0,r12,#23
adds r0,r0,r2,lsr#3
adc r0,r0,#0x3f000000
bx r14
@ mantissa calculation has overflowed
1:
lsrs r2,#1
adds r12,#1
b 2b
20:
@ process Inf/NaN for fexp
cmp r3,#0x80
bne 22f
lsls r2,r0,#9
ite eq
biceq r0,r0,r0,asr#31 @ +Inf -> + Inf; -Inf -> +0
orrne r0,r0,#0x00400000
bx r14
23:
ands r12,r12,#1<<31
22:
eors r0,r12,#1<<31
subs r0,r0,r0,lsr#8 @ overflow: convert to +0 or +Inf
bx r14
float_wrapper_section logf
1:
movlong r0,0xff800000 @ return -Inf
bx r14
4:
lsls r1,r0,#9
it ne
orrne r0,r0,#0x00400000
bx r14
@ here result may be close to zero
5:
sub r1,r0,#0x3f800000
bmi 7f
cmp r1,#0x8000
bge 6f @ |ε|>~2^-8?
@ here ε positive
clz r12,r1
sub r3,r12,#1
b 8f
7:
rsbs r2,r1,#0
cmp r2,#0x8000
bge 6f @ |ε|>~2^-8?
@ here ε negative
@ r1: x Q24
clz r12,r2
sub r3,r12,#2
8:
sub r12,#10
lsls r0,r1,r3 @ ε Q24+r3
beq 10f @ ε=0?
smmulr r1,r0,r0
asrs r1,r1,r12
sub r0,r0,r1,asr#1 @ - ε²/2
smmulr r1,r1,r0
asrs r2,r1,r12
movs r3,#0x55
mul r2,r2,r3
adds r0,r0,r2,asr#8 @ + ~ ε³/3
rsb r1,r12,#117
b 9f
wrapper_func logf
lsls r3,r0,#1
bcs 1b @ x<0?
lsrs r3,#24
beq 1b @ x==0/denormal?
sub r12,r3,#0x7e
cmp r12,#0x81 @ +Inf/NaN?
beq 4b
push {r14}
cmp r12,#1 @ result will be near zero?
bls 5b
6:
movs r2,#1
bfi r0,r2,#23,#9 @ set implied 1, clear exponent Q52
lsls r0,#8
lsrs r1,r0,#27
ldr r2,=logtab0-16*8
add r2,r2,r1,lsl#3
ldmia r2,{r2-r3}
lsls r2,#26
umlal r2,r0,r2,r0
mvn r1,r0,asr#24
ldr r2,=exptab1+4
ldr r2,[r2,r1,lsl#3]
add r3,r3,r2
lsls r1,#24
umlal r1,r0,r1,r0
ldr r2,=exptab2+4
mvn r1,r0,asr#21
ldr r2,[r2,r1,lsl#3]
lsls r1,#21
orr r1,#0x00100000
umlal r1,r0,r1,r0
add r3,r3,r2
@ r0: ε Q32
@ r3: log y
smmulr r1,r0,r0 @ ε² Q32
sub r3,r0
add r3,r3,r1,lsr#1 @ log u - ε + ε²/2 Q32
movlong r2,0xb17218 @ ln2 Q24, but accurate to 2^-29 or so
cmp r12,#1 @ result will be near zero?
bls 2f
mul r2,r2,r12
mov r1,#125
add r3,#255 @ reduce systematic error
subs r0,r2,r3,lsr#8
9:
bmi 1f
bl fpack_q
10:
pop {r15}
2:
mov r1,#117
bne 3f @ r12=0?
@ here r12=1
movlong r2,0xb17217f8
sub r0,r2,r3
bl fpack_q
pop {r15}
3:
mov r0,r3
bl fpack_q
orrs r0,r0,#1<<31
pop {r15}
1:
rsbs r0,#0
bl fpack_q
orrs r0,r0,#1<<31
pop {r15}
float_wrapper_section atan2f
@ fatan small reduced angle case
@ r0 y/x in IEEE format, 0..2^-8
@ r2 e+6 <0
@ r3 #1
@ r6 flags
20:
rsbs r4,r2,#0 @ -e-6 = shift down of mantissa required to get to Q29 >0
cmp r4,#7 @ -e-6 ≥ 7 [ e ≤ -13 ]
bge 1f @ very small reduced angle? atn θ ~ θ
@ do a power series
bfi r0,r3,#23,#9 @ fix up mantissa Q23-e
lsls r0,#7 @ Q30-e
smmulr r1,r0,r0 @ θ² Q60-2e-Q32=Q28-2e
lsrs r1,r4 @ Q28-2e + (e+6) = Q34-e
smmulr r1,r1,r0 @ θ³ Q34-e+Q30-e-Q32=Q32-2e
mov r3,#0x55555555 @ 1/3 Q32
lsrs r1,r4 @ Q32-2e + e+6 = Q38-e
smmulr r1,r1,r3 @ θ³/3 Q38-e
sub r0,r0,r1,lsr#8 @ Q30-e
cmp r6,#4 @ at least one quadrant to add?
bhs 2f
add r1,r2,#113
bl fpack_q
eors r0,r0,r6,lsl#31 @ fix sign
pop {r4-r6,r15}
@ here reduced angle is < 2^-12
1:
cmp r6,#4
bhs 3f @ at least one quadrant to add?
eors r0,r0,r6,lsl#31 @ no: return y/x with the correct sign
pop {r4-r6,r15}
3:
bfi r0,r3,#23,#9 @ fix up mantissa
lsrs r0,r4 @ Q29
lsls r0,#1 @ Q30
b 40f
2:
lsrs r0,r4 @ Q30-e + e+6 = Q36
lsrs r0,#6 @ Q30
b 40f
@ case where reduced (x',y') has x' infinite
71:
sbfx r4,r0,#23,#8
movs r0,#0
cmn r4,#1 @ y' also infinite?
bne 80f
mov r0,#0x3f800000 @ both infinite: pretend ∞/∞=1
b 80f
@ case where reduced (x',y') has y' zero
70:
ubfx r4,r1,#23,#8
movs r0,#0
cbnz r4,80f @ x' also zero?
tst r6,#4
beq 80f @ already in quadrants 0/±2? then 0/0 result will be correct
tst r6,#2
ite eq
addeq r6,#6
subne r6,#6 @ fix up when in quadrants ±0
b 80f
90:
movs r0,r1
91:
orrs r0,r0,#0x00400000
pop {r4-r6,r15}
wrapper_func atan2f
push {r4-r6,r14}
lsls r4,r1,#1
cmp r4,#0xff000000
bhi 90b @ y NaN?
lsls r4,r0,#1
cmp r4,#0xff000000
bhi 91b @ x NaN?
lsrs r6,r0,#31 @ b31..2: quadrant count; b1: sign to apply before addition; b0: sign to apply after addition
bic r0,#1<<31
@ y now positive
movs r1,r1
bpl 1f
@ here y positive, x negative
adds r6,#10
bic r1,r1,#1<<31
cmp r1,r0
bhi 4f @ |x| > y: 3rd octant
@ |x| < y: 2nd octant
subs r6,#6
b 3f
1:
@ here x and y positive
cmp r1,r0
bhs 4f
@ x < y: 1st octant
adds r6,#6
3:
movs r2,r1 @ exchange x and y
movs r1,r0
movs r0,r2
4:
@ here
@ r0 y'
@ r1 x'
@ where both x' and y' are positive, y'/x' < 1+δ, and the final result is
@ ± (Q.π/2 ± atn y/x) where 0≤Q≤2 is a quadrant count in r6b3..2, the inner negation
@ is given by r6b1 and the outer negation by r6b0. x' can be infinite, or both x' and
@ y' can be infinite, but not y' alone.
sbfx r2,r1,#23,#8
cmn r2,#1
beq 71b @ x' infinite?
ubfx r2,r0,#23,#8
cmp r2,#0
beq 70b @ y' zero?
bl __aeabi_fdiv
80:
@ r0 y/x in IEEE format, 0..1
lsr r2,r0,#23 @ exponent
movs r3,#1
subs r2,#0x7f-6
bmi 20b
bfi r0,r3,#23,#9 @ fix up mantissa
lsls r0,r2
lsls r0,#2
50:
@ from here atan2(y,1) where 1 implied
@ r0 y Q31 0≤y<1+δ
lsrs r2,r0,#16
mul r3,r2,r2 @ y^2
movw r4,#0x895c
muls r2,r2,r4 @ y*0x895c
movw r5,#0x1227
lsrs r3,#14
mls r2,r3,r5,r2
subs r2,#0x330000 @ Chebyshev approximation to atn(y)
lsrs r2,#25
ldr r3,=trigtab+4
add r3,r3,r2,lsl#4
ldmia r3,{r3-r5}
@ r0 y Q30
@ r3 phi0 Q32
@ r4 sphi0 Q31
@ r5 cphi0 Q31
@ r6 flags
@ x0= cphi0 + sphi0*y
@ y0=-sphi0 + cphi0*y
umull r12,r1,r0,r4 @ sphi0*y
umull r12,r2,r0,r5 @ cphi0*y
add r1,r1,r5,lsr#1 @ x0 Q30
sub r2,r2,r4,lsr#1 @ y0 Q30
11:
@ r1 x0 Q30
@ r2 y0 Q30
@ r3 phi0 Q32
@ r6 flags
mov r0,#0xffffffff
lsrs r4,r1,#15
udiv r12,r0,r4 @ rx0 Q17
lsls r4,r2,#6 @ y0 Q36
smmul r4,r4,r12 @ t2=y0*rx0 Q21
lsls r5,r4,#11 @ t2 Q32
smmlar r0,r5,r2,r1
smmlsr r1,r5,r1,r2
@ r0 x1 Q30
@ r1 y1 Q30
@ r3 phi0
@ r4 r2 Q21
@ r12 rx0 Q17
mul r5,r4,r4 @ t2_2 Q42
smull r2,r5,r4,r5 @ t2_3 Q63
add r3,r3,r4,lsl#11 @ Q32
lsls r5,#1 @ Q32
mov r2,#0x55555555 @ 1/3
smmlsr r3,r2,r5,r3 @ t2_3/3
@ r0 x1 Q30
@ r1 y1 Q30
@ r3 phi0+phi1 Q32
@ r12 rx0 Q17
mul r0,r1,r12 @ y1*rx0 Q30+Q17=Q47
add r3,r3,r0,asr#15
@ r3 phi0+phi1+phi2, result over reduced range Q32
@ r6 flags
lsrs r0,r3,#2 @ Q30
@ adc r0,#0 @ rounding
40:
lsl r5,r6,#30 @ b1 -> b31
eor r0,r0,r5,asr#31 @ negate if required
movlong r1,0x6487ED51 @ π/2 Q30
lsr r5,r6,#2 @ quadrants to add
mla r0,r5,r1,r0
mov r1,#0x80-9 @ for packing Q30
60:
bl fpack_q
eors r0,r0,r6,lsl#31
pop {r4-r6,r15}
@=======================================
float_wrapper_section fpack
@ fnegpack: negate and pack
@ fpack_q31:
@ input
@ r0 Q31 result, must not be zero
@ r1 exponent offset [fpack_q only]
@ output
@ r0 IEEE single
@ trashes (r1,)r2
fnegpack_q31:
rsbs r0,r0,#0
fpack_q31:
mov r1,#0x7f-9 @ exponent
fpack_q:
clz r2,r0
rsbs r2,#8
bmi 1f
lsrs r0,r0,r2 @ save rounding bit in carry
add r2,r2,r1
adc r0,r0,r2,lsl#23 @ insert exponent
bx r14
1:
rsb r2,#0
lsls r0,r0,r2
sub r2,r1,r2
add r0,r0,r2,lsl#23
bx r14
float_wrapper_section tanf
wrapper_func tanf
push {r14}
bl sincosf_raw
bl __aeabi_fdiv
pop {r15}
float_wrapper_section fsin_fcos
10: @ process Inf/NaN for sinf and fcos
lsls r1,r0,#9
it eq
orreq r0,#0x80000000
orr r0,#0x00400000 @ turn Inf into NaN
movs r1,r0 @ copy result to cosine output
bx r14
@ case where angle is very small (<2^-32) before reduction
32:
adds r1,r1,#0x7f
it eq
moveq r0,#0 @ flush denormal to zero
movs r1,0x3f800000 @ return x for sine, 1 for cosine
tst r12,#2
it ne
movne r0,r1 @ calculating cosine? move to result registers
bx r14
40:
@ case where range-reduced angle is very small
@ here
@ r0 mantissa
@ r1 exponent+9
@ r6 abs range-reduced angle / 2π < 2^-7 Q32
@ r12b31: dsincos flag
@ r12b30: original argument sign
@ r12b2..1: quadrant count
@ r12b0: sign of reduced angle
push {r12}
movs r12,#0
2:
add r12,#2
add r1,#2
bl frr_core @ repeat range reduction with extra factor of 2^2 (, 2^4, 2^6, 2^8,...)
eors r4,r6,r6,asr#31 @ we want to keep the sign in r6b31 for later
cmp r4,#1<<26 @ ≥ 2^-6?
bhs 1f @ loop until the result is big enough
cmp r12,#32 @ safety net
bne 2b
1:
@ here r4 is the abs range-reduced angle Q32+r12, 2^-6..2^-4 in Q32 terms
@ 2π=6.487ED511 (0B4611A6...)
movlong r5,0x487ED511 @ 2π Q64 high fractional word
umull r2,r0,r4,r5
movs r5,#6 @ 2π integer part
mla r0,r4,r5,r0
@ here
@ r0 θ, abs range reduced angle θ 0..π/4 Q32+r12, 2π * 2^-6..2^-4 in Q32 terms (so top bit is clear)
@ r6b31: sign of reduced angle
@ r12: excess exponent ≥2, multiple of 2
@ r0 / 2^r12 < ~ 2π * 2^-7 so for sin we need to go to term in x^3
smmulr r1,r0,r0 @ θ² Q32+2*r12
lsrs r1,r1,r12 @ θ² Q32+r12
lsrs r1,r1,r12 @ θ² Q32
movs r4,#0x55555555 @ 1/3 Q32
smmulr r4,r1,r4 @ θ²/3 Q32
smmulr r2,r4,r1 @ θ⁴/3 Q32
rsb r3,r1,r2,lsr#2 @ -θ²+θ⁴/12 Q32
asrs r3,#9 @ -θ²/2+θ⁴/24 Q24
adc r3,r3,#0x3f800000 @ IEEE single with rounding
smmulr r2,r4,r0 @ θ³/3 Q32+r12
sub r0,r0,r2,lsr#1 @ θ-θ³/6 Q32+r12
rsb r1,r12,#117
bl fpack_q
@ here
@ r0 packed sin
@ r3 packed cos
@ r6b31: sign of reduced angle
pop {r12} @ get flags
lsrs r6,#31
bfi r12,r6,#0,#1
asrs r2,r12,#1
bmi 23f @ doing sincos?
asrs r2,#1
bcc 21f @ need sine?
@ need cosine:
ands r12,#4
orrs r0,r3,r12,lsl#29 @ insert sign
pop {r4-r6,r15}
21:
eors r12,r12,r12,lsr#2
orrs r0,r0,r12,lsl#31 @ insert sign
pop {r4-r6,r15}
23:
ands r2,r12,#4
orrs r3,r3,r2,lsl#29 @ insert sign
push {r3}
@ drop into sincosf code below...
b 20f
wrapper_func sincosf
push {r1, r2, lr}
bl sincosf_raw
pop {r2, r3}
str r0, [r2]
str r1, [r3]
pop {pc}
sincosf_raw:
ands r12,r0,#1<<31
lsrs r12,#1 @ save argument sign in r12b30
orrs r12,r12,#1<<31 @ flag we want both results in r12b31
b 1f
wrapper_func sinf
lsrs r12,r0,#29 @ negative argument -> +2 quadrants
ands r12,#4
b 1f
wrapper_func cosf
movs r12,#2 @ cos -> +1 quadrant
1:
ubfx r1,r0,#23,#8 @ get exponent
sub r1,r1,#0x7f
cmp r1,#0x80
beq 10b @ Inf or NaN?
cmn r1,#32
blt 32b @ very small argument?
movs r3,#1
bfi r0,r3,#23,#9 @ fix implied 1 in mantissa
push {r4-r6,r14}
add r1,#9 @ e+9
bl frr_core
@ r6 θ/2π 0..1 Q64
lsrs r4,r6,#30 @ quadrant count
adc r4,r4,#0 @ rounded
sub r6,r6,r4,lsl#30 @ now -0.125≤r6<+0.125 Q32
add r12,r12,r4,lsl#1
orr r12,r12,r6,lsr#31 @ sign into r12b0
@ r12b2..1: quadrant count
@ r12b0: sign of reduced angle
eors r6,r6,r6,asr#31 @ absolute value of reduced angle 0≤r7<0.125 Q32
cmp r6,#1<<25 @ θ / 2π < 2^-7?
blo 40b
@ 2π=6.487ED511 (0B4611A6...)
movlong r5,0x487ED511 @ 2π Q64 high fractional word
umull r2,r0,r6,r5
movs r5,#6 @ 2π integer part
mla r0,r6,r5,r0
@ r0 range reduced angle θ 0..π/4 Q32
lsrs r2,r0,#27
ldr r3,=trigtab+4
add r3,r3,r2,lsl#4
ldmia r3,{r1-r3}
subs r0,r1
@ r0: ε Q32 |ε| < 1.17*2^-6
@ r2: sin φ Q31
@ r3: cos φ Q31
asrs r1,r12,#1
bmi 5f @ doing sincosf?
asrs r1,#1
bcs 3f @ need cosine?
@ here need sine
smmulr r1,r0,r0 @ ε² Q32
mov r4,#0x55555555 @ 1/3 Q32
asrs r1,#1
smmlsr r2,r1,r2,r2 @ sin φ - ε²/2*sin φ ~ sin φ cos ε
smmulr r1,r1,r0 @ ε³/2
smmlsr r1,r1,r4,r0 @ ε - ε³/6
smmlar r0,r1,r3,r2 @ sin φ cos ε + cos φ (ε - ε³/6) ~ sin (φ+ε)
@ the sign of the result is r12b0^r12b2
bl fpack_q31
eors r12,r12,r12,lsr#2
orrs r0,r0,r12,lsl#31 @ insert sign
pop {r4-r6,r15}
3:
@ here need cosine
smmulr r1,r0,r0 @ ε² Q32
mov r4,#0x55555555 @ 1/3 Q32
asrs r1,#1
smmlsr r3,r1,r3,r3 @ cos φ - ε²/2*cos φ ~ cos φ cos ε
smmulr r1,r1,r0 @ ε³/2
smmlsr r1,r1,r4,r0 @ ε - ε³/6
smmlsr r0,r1,r2,r3 @ cos φ cos ε - sin φ (ε - ε³/6) ~ cos (φ+ε)
@ the sign of the result is r12b2
bl fpack_q31
ands r12,#4
orrs r0,r0,r12,lsl#29 @ insert sign
pop {r4-r6,r15}
5:
@ here doing sincosf
smmulr r1,r0,r0 @ ε² Q32
mov r6,#0x55555555 @ 1/3 Q32
asrs r1,#1
smmlsr r4,r1,r2,r2 @ sin φ - ε²/2*sin φ ~ sin φ cos ε
smmlsr r5,r1,r3,r3 @ cos φ - ε²/2*cos φ ~ cos φ cos ε
smmulr r1,r1,r0 @ ε³/2
smmlsr r6,r1,r6,r0 @ ε - ε³/6
@ here
@ r2 sin φ
@ r3 cos φ
@ r4 sin φ cos ε
@ r5 cos φ cos ε
@ r6 ε - ε³/6 ~ sin ε
smmlsr r0,r6,r2,r5 @ cos φ cos ε - sin φ (ε - ε³/6) ~ cos (φ+ε)
bl fpack_q31
ands r5,r12,#4
eors r0,r0,r5,lsl#29 @ negate cosine in quadrants 2 and 3
push {r0}
smmlar r0,r6,r3,r4 @ sin φ cos ε + cos φ (ε - ε³/6) ~ sin (φ+ε)
bl fpack_q31
20:
eors r4,r12,r12,lsr#1
eors r4,r4,r12,lsr#2
ands r5,r12,#1<<30
tst r12,#2 @ exchange sine and cosine in odd quadrants
beq 1f
eors r1,r0,r4,lsl#31 @ negate sine on b0^b1^b2
pop {r0}
eors r0,r0,r5,lsl#1 @ negate sine result if argument was negative
pop {r4-r6,r15}
1:
pop {r1}
eors r0,r0,r4,lsl#31 @ negate sine on b0^b1^b2
eors r0,r0,r5,lsl#1 @ negate sine result if argument was negative
pop {r4-r6,r15}
#endif

View file

@ -0,0 +1,856 @@
/*
* Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#if !PICO_RP2040
#include "pico/asm_helper.S"
pico_default_asm_setup
.macro float_section name
#if PICO_FLOAT_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm
.macro float_wrapper_section func
float_section WRAPPER_FUNC_NAME(\func)
.endm
@ load a 32-bit constant n into register rx
.macro movlong rx,n
movw \rx,#(\n)&0xffff
movt \rx,#((\n)>>16)&0xffff
.endm
float_section frrcore_v
// 1/2π to plenty of accuracy
.long 0 @ this allows values of e down to -32
rtwopi:
.long 0,0
.long 0x28BE60DB, 0x9391054A, 0x7F09D5F4, 0x7D4D3770, 0x36D8A566, 0x4F10E410
@ input:
@ r0 mantissa m Q23
@ r1 exponent e>=-32, typically offset by +9
@ output:
@ r0..r1 preserved
@ r6 range reduced result in revolutions Q32
@ r2,r3,r4,r5 trashed
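@ (in effect a Payne-Hanek style reduction: only the three 32-bit words of
@ 1/2π that can still affect the fractional part of m·2^e/2π are multiplied
@ in; more significant words would only contribute whole revolutions, which
@ are discarded)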
.thumb_func
frr_core:
adr r2,rtwopi
asrs r3,r1,#5 @ k=e/32, k<=5 for e offsets up to 9+32
add r2,r2,r3,lsl#2 @ p
and r3,r1,#31 @ s=e%32
mov r4,#1
lsls r4,r4,r3 @ 1<<s
umull r3,r4,r4,r0
@ r2 p
@ r3:r4 u0:u1 = m<<(e%32); u1 is never more than 2<<23
ldr r5,[r2,#12] @ a0=p[3]
umull r5,r6,r5,r4 @ r0=a0*u1 hi, discard lo
@ r6 r0
ldr r5,[r2,#8] @ a1=p[2]
mla r6,r5,r4,r6 @ a1*u1 lo, discard hi
umlal r5,r6,r5,r3 @ a1*u0 hi, discard lo
@ r6 r0
ldr r5,[r2,#4] @ a2=p[1]
mla r6,r5,r3,r6 @ r0+=a2*u0
bx r14
float_wrapper_section expf
wrapper_func expf
@ soft float version, via 2^x
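@ exp(x) = 2^(x/log 2); x/log 2 is split as n + j/32 + ε with 0 ≤ ε < 2^-5:
@ 2^n goes straight into the exponent, 2^(j/32) is looked up in exptab3, and
@ 2^ε ≈ 1 + ε·log 2 + ε²·log²2/2 + ε³·log³2/6 using the k_exp2 constants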
asrs r1,r0,#23
bmi 1f
cmp r1,#0x85
bge 3f
10:
movs r2,#1
bfi r0,r2,#23,#9
subs r1,#0x7e
bmi 2f
lsl r0,r1 @ x Q24
11:
movlong r3,0x5c551d95 @ 1/log(2) Q30
smull r0,r1,r0,r3 @ Q54
adr r2,k_exp2
vldmia r2,{s8-s10}
lsrs r2,r0,#22
bfi r2,r1,#10,#17 @ ε Q32
vmov s0,r2
vcvt.f32.u32 s0,s0,#32
vmul.f32 s1,s0,s0
ubfx r2,r1,#17,#5
vmul.f32 s4,s0,s8
adr r3,exptab3
vmul.f32 s2,s0,s1
ldr r2,[r3,r2,lsl#2]
vmla.f32 s4,s1,s9
asrs r1,#22
vmla.f32 s4,s2,s10
add r2,r2,r1,lsl#23
vmov s0,r2
vmla.f32 s0,s0,s4
vmov r0,s0
bx r14
2: @ x≤0.5
rsbs r1,#0
lsrs r0,r1
@ adc r0,#0 @ rounding not needed
b 11b
3: @ risk of overflow, Inf,NaN
movlong r2,0x42B17218
cmp r0,r2
blo 10b @ in range after all
cmp r0,#0x7f800000
bls 4f @ not NaN?
orrs r0,#0x00400000
bx r14
4:
movlong r0,0x7f800000 @ return +Inf
bx r14
1: @ x<0, r1=0xffffffXX where XX is biased exponent
cmn r1,#0x7b
bge 5f @ risk of underflow, -Inf, -NaN?
13:
movs r2,#1
bfi r0,r2,#23,#9
adds r1,#0x82
bpl 6f
rsbs r1,#0
lsrs r0,r1
adc r0,r0,#0 @ rounding
12:
rsbs r0,#0
b 11b
6:
lsls r0,r1
b 12b
5:
movlong r2,0xC2AEAC4F
cmp r0,r2
bls 13b
cmp r0,#0xff800000
bls 14f
orrs r0,#0x00400000
bx r14
14:
mov r0,#0
bx r14
.align 3
k_exp2:
.float 0.693147181 @ log2
.float 0.240226507 @ log²2/2
.float 0.055504109 @ log³2/6
exptab3: @ pow(2,[0..31]/32)
.word 0x3f800000
.word 0x3f82cd87
.word 0x3f85aac3
.word 0x3f88980f
.word 0x3f8b95c2
.word 0x3f8ea43a
.word 0x3f91c3d3
.word 0x3f94f4f0
.word 0x3f9837f0
.word 0x3f9b8d3a
.word 0x3f9ef532
.word 0x3fa27043
.word 0x3fa5fed7
.word 0x3fa9a15b
.word 0x3fad583f
.word 0x3fb123f6
.word 0x3fb504f3
.word 0x3fb8fbaf
.word 0x3fbd08a4
.word 0x3fc12c4d
.word 0x3fc5672a
.word 0x3fc9b9be
.word 0x3fce248c
.word 0x3fd2a81e
.word 0x3fd744fd
.word 0x3fdbfbb8
.word 0x3fe0ccdf
.word 0x3fe5b907
.word 0x3feac0c7
.word 0x3fefe4ba
.word 0x3ff5257d
.word 0x3ffa83b3
float_wrapper_section logf
wrapper_func logf
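@ log x = e·log 2 + log m with the mantissa m taken relative to the nearer
@ power of two; logtab3 supplies u ≈ 1/m (Q8) and -log u (float), so with
@ ε = m·u - 1 we have log m = log(1+ε) - log u, and log(1+ε) is evaluated
@ as ε - ε²/2 + ε³/3 - ε⁴/4 using the k_log3 constants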
cmp r0,#0x7f800000 @ catch Inf, NaN, -ve
bhs 1f
asrs r1,r0,#23 @ get exponent; C from here is preserved...
beq 2f @ ±0?
mov r2,#1
bfi r0,r2,#23,#9 @ fix implied 1
it cc @ 50% ... to here...
lslcc r0,#1 @ this plus sbc below means we work relative to nearby power of 2
adr r3,#k_log3
vldmia r3,{s8-s10}
@ 0x00c00000 ≤ r0 < 0x017fffff
adr r3,logtab3-24*8+4
add r3,r3,r0,lsr#16 @ look up r0>>19 rounded, preserving flags
bic r3,#7
ldrd r2,r3,[r3]
mul r0,r0,r2 @ ε
vmov s0,s1,r3,r0 @ s0=-log u, s1=ε
vcvt.f32.s32 s1,s1,#32
vmul.f32 s2,s1,s1 @ power series in ε
sbc r1,r1,#0x7e @ ... and here
vmul.f32 s3,s1,s2
lsls r1,#23 @ e Q23
vmul.f32 s4,s2,s2 @ to ε⁴
@ movlong r2,0x58b90bfc @ log 2 Q31, more accurate than we deserve
movw r2,0x0bfc
vmul.f32 s2,s2,s8
movt r2,0x58b9
vmul.f32 s3,s3,s9
smmulr r1,r1,r2 @ Q22
vmul.f32 s4,s4,s10
vmov s7,r1
vsub.f32 s3,s3,s4
vcvt.f32.s32 s7,s7,#22
vsub.f32 s2,s2,s3
vsub.f32 s1,s1,s2
vadd.f32 s0,s0,s1 @ log(1+ε) - log u
vadd.f32 s0,s0,s7 @ e log 2 + log ε - log u
vmov r0,s0
bx r14
1:
bgt 3f @ +NaN?
beq 10f @ +Inf?
2:
cmp r0,#0x80800000 @ -0?
blo 11f
cmp r0,#0xff800000 @ -NaN/-Inf?
3:
orr r0,#0x00400000
bhi 10f
movlong r0,0xffc00000
10:
bx r14
11:
movlong r0,0xff800000
bx r14
.align 3
k_log3:
.float 0.5
.float 0.333333333333333
.float 0.25
.float 0 @ alignment
logtab3:
@ u=64/[48:2:96]; u Q8, -log u F32
.word 0x0155,0xbe92cb01 @ 00003e9b..00004145
.word 0x0148,0xbe7dc8c3 @ 00003ec8..00004158
.word 0x013b,0xbe545f68 @ 00003ec1..00004137
.word 0x012f,0xbe2c99c7 @ 00003ebb..00004119
.word 0x0125,0xbe0a3c2c @ 00003ef3..0000413d
.word 0x011a,0xbdc61a2f @ 00003eca..000040fe
.word 0x0111,0xbd83acc2 @ 00003eeb..0000410d
.word 0x0108,0xbcfc14d8 @ 00003ee8..000040f8
.word 0x0100,0x00000000 @ 00003f00..00004100
.word 0x00f8,0x3d020aec @ 00003ef8..000040e8
.word 0x00f1,0x3d77518e @ 00003f13..000040f5
.word 0x00ea,0x3db80698 @ 00003f12..000040e6
.word 0x00e4,0x3ded393b @ 00003f3c..00004104
.word 0x00dd,0x3e168b08 @ 00003f05..000040bf
.word 0x00d8,0x3e2dfa03 @ 00003f48..000040f8
.word 0x00d2,0x3e4ad2d7 @ 00003f2a..000040ce
.word 0x00cd,0x3e637fde @ 00003f43..000040dd
.word 0x00c8,0x3e7cc8e3 @ 00003f48..000040d8
.word 0x00c3,0x3e8b5ae6 @ 00003f39..000040bf
.word 0x00bf,0x3e95f784 @ 00003f6b..000040e9
.word 0x00ba,0x3ea38c6e @ 00003f36..000040aa
.word 0x00b6,0x3eaeadef @ 00003f46..000040b2
.word 0x00b2,0x3eba0ec4 @ 00003f46..000040aa
.word 0x00ae,0x3ec5b1cd @ 00003f36..00004092
.word 0x00ab,0x3ece995f @ 00003f75..000040cb
float_wrapper_section fsin_fcos
30:
lsls r1,r0,#9
bne 1f @ NaN? return it
orrs r0,r0,#0x80000000 @ Inf: make a NaN
1:
orrs r0,r0,#0x00400000 @ set top mantissa bit of NaN
bx r14
@ heavy-duty range reduction
@ here x≥256, -e in r1
40:
push {r4-r7,r14}
movs r3,#1
bfi r0,r3,#23,#9 @ insert implied 1 in mantissa, clear sign
rsb r1,#9 @ e+9
mov r7,#0x7e @ this will be the exponent of the reduced angle - 1
42:
bl frr_core
@ here r6 is revolutions Q32
lsrs r3,r6,#30 @ quadrant count
adcs r3,r3,#0 @ rounded
add r12,r12,r3
subs r6,r6,r3,lsl#30 @ reduced angle/2π Q32 -.125≤x<+.125
@ comment out from here...
lsls r2,r6,#2 @ Q34
it cs
rsbcs r2,r2,#0 @ absolute value
cmp r2,#1<<28 @ big enough for accuracy?
bhs 41f
@ ... to here for slightly better accuracy
43:
adds r1,r1,#2 @ try again with increased exponent
bl frr_core
eors r2,r6,r6,asr#32 @ absolute value
adc r2,r2,#0
cmp r2,#1<<28 @ big enough yet?
bhs 44f
subs r7,r7,#2
bpl 43b @ safety net
44:
41:
ldr r4,=0xC90FDAA2 @ 2π Q29
umull r2,r4,r2,r4 @ r4 has reduced angle Q34+Q29-Q32=Q31
@ add r4,r4,r2,lsr#31
clz r2,r4 @ normalise
lsls r4,r4,r2
lsrs r4,r4,#8
sub r2,r7,r2
adc r0,r4,r2,lsl#23 @ with rounding
lsrs r1,r0,#23 @ re-extract exponent as there may have been a carry into it
rsbs r1,r1,#0x7f @ prepare exponent for re-entry
lsrs r6,r6,#31
add r3,r0,r6,lsl#31 @ apply sign of reduced angle
pop {r4-r7,r14}
b 5f @ re-enter with no risk of looping
.ltorg
@ light-duty range reduction
@ here argument ≥1
@ r0: argument
@ r1: -e
@ r12: quadrant count
@ required result is sin(r0+r12*π/2)
10:
cmn r1,#0x80
beq 30b @ Inf/NaN
bics r2,r0,r12,lsl#31 @ negative argument, doing sin -> +2 quadrants
it mi
addmi r12,r12,#2
bic r0,r0,#0x80000000 @ make positive: original sign is now captured in quadrant count in r12
@ this may not actually be faster than doing it in integer registers
vmov s0,r0
adr r2,k_sc4
vldmia r2!,{s5-s7}
@ vmul.f32 s4,s4,s0 @ this accurate calculation of the quadrant count does not seem necessary
@ vfma.f32 s4,s5,s0
vmul.f32 s4,s5,s0 @ this is BALGE
cmn r1,#8 @ ≥256?
vrintn.f32.f32 s4,s4 @ round to quadrant count: x<256 so count≤163
ble 40b @ then do heavy-duty range reduction
vfms.f32 s0,s4,s7
vfms.f32 s0,s4,s6
vmov r3,s0 @ reduced angle
vcvt.s32.f32 s3,s4
ubfx r2,r3,#23,#8 @ get exponent
cmp r2,#0x78
blo 40b @ very small result? use heavy-duty reduction to get a more accurate answer
rsbs r1,r2,#0x7f @ ready for re-entry
vmov r2,s3 @ integer quadrant count
add r12,r12,r2
@ prepare to re-enter with no risk of looping
b 5f
k_sc4:
@ 2/π=0.A2F9836E4E441529FC...
.word 0x3f22f983 @ 2/π
@ π/2=1.921FB54442D1846989...
.word 0xb695777a,0x3fc91000 @ these two add up to π/2 with error ~1.6e-13
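@ (two-constant Cody-Waite reduction: subtracting q*hi and then q*lo keeps the
@ reduced angle far more accurate than a single float π/2 constant would allow)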
wrapper_func sincosf
push {r0-r2,r14}
ubfx r1,r0,#23,#8
cmp r1,#0xff @ Inf/NaN?
beq 2f
bl cosf_entry @ this will exit via 1f or 2f...
pop {r1-r2,r14}
str r0,[r14]
@ here C is still set from lsrs r12,r12,#1
bcs 1f
mvns r1,r1
eor r12,r12,r1,lsr#31
@ this is fsc_costail:
@ here calculate cos φ+ε = cosθ
vmul.f32 s5,s7,s1 @ sinφ sinε
vfma.f32 s5,s2,s6 @ sinφ sinε + cosφ(1-cosε)
vsub.f32 s5,s6,s5 @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
vmov.f32 r0,s5
eor r0,r0,r12,lsl#31
str r0,[r2]
pop {r15}
1:
eor r12,r12,r1,lsr#31
@ this is fsc_sintail:
@ here calculate sin φ+ε = sinθ
vmul.f32 s4,s2,s7 @ sinφ(1-cosε)
vfms.f32 s4,s6,s1 @ sinφ(1-cosε) - cosφ sinε
eor r1,r12,r3,lsr#31 @ flip sign if (reduced) argument was negative
vsub.f32 s4,s7,s4 @ cosφ sinε + sinφ cosε
vmov.f32 r0,s4
eor r0,r0,r1,lsl#31
str r0,[r2] @ save sin result
pop {r15}
@ sincos of Inf or NaN
2:
lsls r1,r0,#9
pop {r1-r3,r14}
bne 1f @ NaN? return it
orrs r0,r0,#0x80000000 @ Inf: make a NaN
1:
orrs r0,r0,#0x00400000 @ set top mantissa bit of NaN
str r0,[r2] @ both sin and cos results
str r0,[r3]
bx r14
wrapper_func sinf
@ r12b1..0: quadrant count
movs r12,#0
b 1f
wrapper_func cosf
.thumb_func
cosf_entry:
movs r12,#1 @ cos -> +1 quadrant
1:
ubfx r1,r0,#23,#8 @ get exponent
cbz r1,20f @ 0/denormal?
22:
rsbs r1,r1,#0x7f
bls 10b @ argument ≥1? needs reduction; also Inf/NaN handling
bic r3,r0,r12,lsl#31 @ this would mess up NaNs so do it here
5:
@ here we have a quadrant count in r12 and a signed offset r0 from r12*π/2
bic r0,r3,#0x80000000 @ this would mess up NaNs so do it here
vmov s0,r0
ubfx r0,r0,#18,#5 @ extract top of mantissa
adds r0,r0,#32 @ insert implied 1
lsrs r1,r0,r1 @ to fixed point Q5
ldr r2,=k_sc3
adcs r1,r1,#0 @ rounding
vldmia r2!,{s8-s9}
add r2,r2,r1,lsl#2 @ 12 bytes per entry
add r2,r2,r1,lsl#3
vldmia r2,{s5-s7} @ φ, cosφ, sinφ
vsub.f32 s1,s0,s5 @ ε
vmul.f32 s2,s1,s1 @ ε²
lsrs r12,r12,#1 @ computing cosine?
vmul.f32 s3,s2,s1 @ ε³
bcs 2f
vmul.f32 s2,s2,s8 @ ε²/2! ~ 1-cosε
vmul.f32 s3,s3,s9 @ ε³/3!
vsub.f32 s1,s1,s3 @ ε-ε³/3! ~ sinε
@ here:
@ s1: sinε
@ s2: 1-cosε
@ s6: cosφ
@ s7: sinφ
@ r12: quadrant count
fsc_sintail:
@ here calculate sin φ+ε = sinθ
vmul.f32 s4,s2,s7 @ sinφ(1-cosε)
vfms.f32 s4,s6,s1 @ sinφ(1-cosε) - cosφ sinε
eor r1,r12,r3,lsr#31 @ flip sign if (reduced) argument was negative
vsub.f32 s4,s7,s4 @ cosφ sinε + sinφ cosε
vmov.f32 r0,s4
eor r0,r0,r1,lsl#31
bx r14
20:
and r0,r0,#0x80000000 @ make signed zero
b 22b
.align 2
2:
vmul.f32 s3,s3,s9 @ ε³/3!
vsub.f32 s1,s1,s3 @ ε-ε³/3! ~ sinε
vmul.f32 s2,s2,s8 @ ε²/2! ~ 1-cosε
fsc_costail:
@ here calculate cos φ+ε = cosθ
vmul.f32 s5,s7,s1 @ sinφ sinε
vfma.f32 s5,s2,s6 @ sinφ sinε + cosφ(1-cosε)
vsub.f32 s5,s6,s5 @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
vmov.f32 r0,s5
eor r0,r0,r12,lsl#31
bx r14
.align 3
k_sc3:
.word 0x3EFFFEC1 @ ~ 1/2! with PMC
.word 0x3e2aaa25 @ ~ 1/3! with PMC
trigtab2:
// φ cos φ sin φ
.word 0x00000000,0x3f800000,0x00000000
.word 0x3cfcc961,0x3f7fe0cd,0x3cfcbf1c @ φ=0.03085774 : cos φ=3feffc199ff28ef4 33.3b; sin φ=3f9f97e38006c678 39.2b
.word 0x3d810576,0x3f7f7dfe,0x3d80ef9e @ φ=0.06299870 : cos φ=3fefefbfc00d6b6d 33.3b; sin φ=3fb01df3c000dfd5 40.2b
.word 0x3dbf0c09,0x3f7ee30f,0x3dbec522 @ φ=0.09328467 : cos φ=3fefdc61dff4f58e 33.5b; sin φ=3fb7d8a43ffdf9ac 39.0b
.word 0x3dff24b6,0x3f7e0414,0x3dfe7be2 @ φ=0.12458174 : cos φ=3fefc0827fdaf90f 31.8b; sin φ=3fbfcf7c3ff9dd0c 37.4b
.word 0x3e1f0713,0x3f7ceb48,0x3e1e63a0 @ φ=0.15530042 : cos φ=3fef9d68ffe680a0 32.3b; sin φ=3fc3cc73fffa6d09 36.5b
.word 0x3e40306d,0x3f7b811d,0x3e3f1015 @ φ=0.18768473 : cos φ=3fef70239fe32301 32.1b; sin φ=3fc7e2029ffdbc2c 37.8b
.word 0x3e60ada2,0x3f79dccf,0x3e5ee13e @ φ=0.21941236 : cos φ=3fef3b99e023f5aa 31.8b; sin φ=3fcbdc27bffe216d 38.1b
.word 0x3e800d7b,0x3f7808fa,0x3e7d7196 @ φ=0.25010285 : cos φ=3fef011f401572a6 32.6b; sin φ=3fcfae32c00328bb 37.3b
.word 0x3e8f986e,0x3f75ff65,0x3e8db868 @ φ=0.28045982 : cos φ=3feebfeca0aaaf99 29.6b; sin φ=3fd1b70cfffc1468 36.0b
.word 0x3e9fe1f4,0x3f739e93,0x3e9d4bfd @ φ=0.31227076 : cos φ=3fee73d25fbf733b 31.0b; sin φ=3fd3a97fa0002ced 40.5b
.word 0x3eb054c6,0x3f70f7ae,0x3eacddb3 @ φ=0.34439677 : cos φ=3fee1ef5bfcf70cb 31.4b; sin φ=3fd59bb65fff5c30 38.6b
.word 0x3ebf89c5,0x3f6e4b60,0x3ebb1a0a @ φ=0.37409797 : cos φ=3fedc96bffdebb8a 31.9b; sin φ=3fd763414003344b 36.3b
.word 0x3ecfc426,0x3f6b35ca,0x3eca1c63 @ φ=0.40579337 : cos φ=3fed66b93fe27dc6 32.1b; sin φ=3fd9438c5ffe5d45 37.3b
.word 0x3ee054f2,0x3f67d166,0x3ed93907 @ φ=0.43814808 : cos φ=3fecfa2cbffc16e9 35.0b; sin φ=3fdb2720dffef5b6 37.9b
.word 0x3eeff0dd,0x3f64664b,0x3ee74116 @ φ=0.46863452 : cos φ=3fec8cc95f714272 29.8b; sin φ=3fdce822c00479ad 35.8b
.word 0x3f002b31,0x3f609488,0x3ef5c30f @ φ=0.50065905 : cos φ=3fec1290ffc99208 31.2b; sin φ=3fdeb861dfff3932 38.4b
.word 0x3f07e407,0x3f5cc5a2,0x3f01992b @ φ=0.53082317 : cos φ=3feb98b44034cd46 31.3b; sin φ=3fe033255ffff628 41.7b
.word 0x3f101fc5,0x3f587d8f,0x3f08a165 @ φ=0.56298476 : cos φ=3feb0fb1e0ceda6f 29.3b; sin φ=3fe1142c9ffd5ae4 35.6b
.word 0x3f17f68a,0x3f5434b5,0x3f0f31ca @ φ=0.59360564 : cos φ=3fea8696a038a06f 31.2b; sin φ=3fe1e639400269fb 35.7b
.word 0x3f1fffe2,0x3f4f9b59,0x3f15c8d7 @ φ=0.62499821 : cos φ=3fe9f36b1f428363 29.4b; sin φ=3fe2b91ae001d55d 36.1b
.word 0x3f280646,0x3f4acf6b,0x3f1c37c4 @ φ=0.65634573 : cos φ=3fe959ed61449f08 28.7b; sin φ=3fe386f87ffd9617 35.7b
.word 0x3f303041,0x3f45b9e0,0x3f229ae4 @ φ=0.68823630 : cos φ=3fe8b73c0047ae7a 30.8b; sin φ=3fe4535c7ffdf1ac 36.0b
.word 0x3f381da7,0x3f4098ca,0x3f28a620 @ φ=0.71920246 : cos φ=3fe81319402ae6e1 31.6b; sin φ=3fe514c3ffff423c 37.4b
.word 0x3f3fc72f,0x3f3b76ac,0x3f2e564a @ φ=0.74913305 : cos φ=3fe76ed5809d419f 29.7b; sin φ=3fe5cac93fffaf1d 38.7b
.word 0x3f4813db,0x3f35b6cc,0x3f34526b @ φ=0.78155297 : cos φ=3fe6b6d9800e8b52 33.1b; sin φ=3fe68a4d5ffe89fc 36.5b
.word 0x3f4fc779,0x3f30352f,0x3f39b4d0 @ φ=0.81163746 : cos φ=3fe606a5dfdc2b5b 31.8b; sin φ=3fe73699fffd7fc8 35.7b
.word 0x3f57dd52,0x3f2a4170,0x3f3f2d91 @ φ=0.84322083 : cos φ=3fe5482e011ba752 28.9b; sin φ=3fe7e5b21ffcb223 35.3b
.word 0x3f5fce26,0x3f243e9f,0x3f445dc3 @ φ=0.87423933 : cos φ=3fe487d3e0b9864b 29.5b; sin φ=3fe88bb85ffde6d5 35.9b
.word 0x3f6825f1,0x3f1dc250,0x3f499d1c @ φ=0.90682894 : cos φ=3fe3b849ffea9b8f 32.6b; sin φ=3fe933a38002730d 35.7b
.word 0x3f703be1,0x3f175041,0x3f4e7ebf @ φ=0.93841368 : cos φ=3fe2ea0820791b4e 30.1b; sin φ=3fe9cfd7e0053e65 34.6b
.word 0x3f781078,0x3f10ed71,0x3f5306af @ φ=0.96900129 : cos φ=3fe21dae1fdea23e 31.9b; sin φ=3fea60d5e001b90b 36.2b
.word 0x3f7ff4d4,0x3f0a5aa7,0x3f57649b @ φ=0.99982953 : cos φ=3fe14b54deeaa407 28.9b; sin φ=3feaec9360012825 36.8b
float_wrapper_section tanf
wrapper_func tanf
push {r0,r14}
ubfx r1,r0,#23,#8
cmp r1,#0xff @ Inf/NaN?
beq 2f
bl cosf_entry @ this will exit via sintail or costail...
ldr r1,[sp,#0]
@ here C is still set from lsrs r12,r12,#1
bcs 1f
@ we exited via sintail
@ this is fsc_costail:
@ here calculate cos φ+ε = cosθ
vmul.f32 s5,s7,s1 @ sinφ sinε
vfma.f32 s5,s2,s6 @ sinφ sinε + cosφ(1-cosε)
eors r1,r1,r3
vsub.f32 s5,s6,s5 @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
vdiv.f32 s0,s5,s4
vmov.f32 r0,s0
it pl
eorpl r0,r0,#0x80000000
pop {r1,r15}
1:
@ we exited via costail
@ this is fsc_sintail:
@ here calculate sin φ+ε = sinθ
vmul.f32 s4,s2,s7 @ sinφ(1-cosε)
vfms.f32 s4,s6,s1 @ sinφ(1-cosε) - cosφ sinε
eors r1,r1,r3
vsub.f32 s4,s7,s4 @ cosφ sinε + sinφ cosε
vdiv.f32 s0,s4,s5
vmov.f32 r0,s0
it mi
eormi r0,r0,#0x80000000
pop {r1,r15}
@ tan of Inf or NaN
2:
lsls r1,r0,#9
bne 1f @ NaN? return it
orrs r0,r0,#0x80000000 @ Inf: make a NaN
1:
orrs r0,r0,#0x00400000 @ set top mantissa bit of NaN
pop {r3,r15}
float_wrapper_section atan2f
50:
60:
orrs r0,r1,#0x00400000
bx r14
51:
bne 52f @ NaN?
cmp r3,#0x7f800000 @ y an infinity; x an infinity too?
bne 55f @ no: carry on
@ here x and y are both infinities
b 66f
52:
62:
orrs r0,r0,#0x00400000
bx r14
61:
bne 62b @ NaN?
cmp r3,#0x7f800000 @ y an infinity; x an infinity too?
bne 65f @ no: carry on
66:
@ here x and y are both infinities
subs r0,r0,#1 @ make both finite (and equal) with same sign and retry
subs r1,r1,#1
b 86f
70:
and r3,#0x80000000
cmp r2,#0x00800000
bhs 72f @ y 0 or denormal?
@ here both x and y are zeros
b 85f
71:
and r2,#0x80000000
72:
vmov s0,s1,r2,r3
vdiv.f32 s2,s0,s1 @ restart the division
b 73f @ and go back and check for NaNs
80:
and r3,#0x80000000
cmp r2,#0x00800000
bhs 82f @ y 0 or denormal?
85:
@ here both x and y are zeros
orr r1,r1,0x3f800000 @ retry with x replaced by ~1 with appropriate sign
b 86f
81:
and r2,#0x80000000
82:
vmov s0,s1,r2,r3
vdiv.f32 s2,s1,s0 @ restart the division
b 83f @ and go back and check for NaNs
wrapper_func atan2f
86:
bic r2,r0,#0x80000000
bic r3,r1,#0x80000000
vmov s0,s1,r2,r3
cmp r2,r3 @ |y| vs. |x|
bhi 1f
@ here |x|≥|y| so we need |y|/|x|; octant/xs/ys: 0++,3-+,4--,7+-
vdiv.f32 s2,s0,s1 @ get this division started; result ≤1
cmp r3,#0x00800000
blo 70b @ x 0 or denormal?
cmp r2,#0x00800000
blo 71b @ y 0 or denormal?
73:
cmp r3,#0x7f800000
bhi 50b @ x NaN?
cmp r2,#0x7f800000
bhs 51b @ y Inf or NaN?
55:
cmp r1,#0
ite mi
ldrmi r12,pi @ if x<0, need two extra quadrants
movpl r12,#0
@ inner negation is the sign of x
b 2f
1:
@ here |x|<|y| so we need |x|/|y|; octant/xs/ys: 1++,2-+,5--,6+-
vdiv.f32 s2,s1,s0 @ result <1
cmp r3,#0x00800000
blo 80b @ x 0 or denormal?
cmp r2,#0x00800000
blo 81b @ y 0 or denormal?
83:
cmp r3,#0x7f800000
bhi 60b @ x NaN?
cmp r2,#0x7f800000
bhs 61b @ y Inf or NaN?
65:
ldr r12,piover2 @ always one extra quadrant in this path
eors r1,r1,#0x80000000 @ inner negation is the complement of the sign of x
2:
@ here
@ r0 y
@ r1 ±x
@ r2 |y|
@ r3 |x|
@ s0,s1 = |x|,|y|
@ s2=s0/s1 or s1/s0, 0≤s2≤1
@ r12=quadrant count * π/2
@ where the final result is
@ ± (r12 ± atn s2) where the inner negation is given by r1b31 and the outer negation by r0b31
adr r2,trigtab3
vmov.f32 s3,s2
vcvt.u32.f32 s3,s3,#6
vmov.f32 r3,s3
lsrs r3,r3,#1
adcs r3,r3,#0 @ rounding; set Z if in φ==0 case
add r2,r2,r3,lsl#3
vldr s5,[r2,#4] @ t=tanφ
vmul.f32 s0,s5,s2 @ ty
vsub.f32 s1,s2,s5 @ y-t
vmov.f32 s5,#1.0
vadd.f32 s0,s5,s0 @ 1+ty
beq 9f @ did we look up zeroth table entry?
@ now (s0,s1) = (x,y)
vdiv.f32 s0,s1,s0 @ ε
ldr r2,[r2] @ φ Q29
@ result is now ±(r12±(r2+atn(s0))
cmp r1,#0 @ inner negation
it mi
rsbmi r2,r2,#0
add r2,r12,r2 @ Q29
cmp r0,#0 @ outer negation
it mi
rsbmi r2,r2,#0
cmp r2,#0
bpl 1f
rsbs r2,r2,#0
clz r3,r2
lsls r2,r2,r3
beq 3f
rsb r3,#0x180
b 2f
1:
clz r3,r2
lsls r2,r2,r3
beq 3f
rsb r3,#0x80
2:
lsrs r2,r2,#8 @ rounding bit to carry
adc r2,r2,r3,lsl#23 @ with rounding
3:
vmul.f32 s2,s0,s0 @ ε²
vldr.f32 s3,onethird
vmul.f32 s2,s2,s0 @ ε³
teq r0,r1
vmul.f32 s2,s2,s3 @ ε³/3
vmov.f32 s4,r2
vsub.f32 s0,s0,s2 @ ~atn(ε)
ite pl
vaddpl.f32 s0,s4,s0
vsubmi.f32 s0,s4,s0
vmov.f32 r0,s0
bx r14
9: @ we looked up the zeroth table entry; we could generate slightly more accurate results here
@ now (s0,s1) = (x,y)
vdiv.f32 s0,s1,s0 @ ε
@ result is now ±(r12±(0+atn(s0))
mov r2,r12 @ Q29; in fact r12 is only ±π/2 or ±π so can probably simplify this
cmp r0,#0 @ outer negation
it mi
rsbmi r2,r2,#0
cmp r2,#0
bpl 1f
rsbs r2,r2,#0
clz r3,r2
lsls r2,r2,r3
beq 3f
rsb r3,#0x180
b 2f
1:
clz r3,r2
lsls r2,r2,r3
beq 3f
rsb r3,#0x80
2:
lsrs r2,r2,#8 @ rounding bit to carry
adc r2,r2,r3,lsl#23 @ with rounding
3:
vmul.f32 s2,s0,s0 @ ε²
vldr.f32 s3,onethird
vmul.f32 s2,s2,s0 @ ε³
teq r0,r1
vmul.f32 s2,s2,s3 @ ε³/3
vmov.f32 s4,r2
vsub.f32 s0,s0,s2 @ ~atn(ε)
ite pl
vaddpl.f32 s0,s4,s0
vsubmi.f32 s0,s4,s0
vmov.f32 r0,s0
tst r0,#0x7f800000 @ about to return a denormal?
it ne
bxne r14
and r0,r0,#0x80000000 @ make it zero
bx r14
piover2: .word 0x3243f6a9 @ Q29
pi: .word 0x6487ed51 @ Q29
onethird: .float 0.33333333
trigtab3:
// φ Q29 tan φ SP
.word 0x00000000,0x00000000
.word 0x00ffee23,0x3d0001bb @ φ=0.03124148 : tan φ=3fa000375fffff9d 50.4b
.word 0x01fe88dc,0x3d7f992a @ φ=0.06232112 : tan φ=3faff3253fffea1f 44.5b
.word 0x02fe0a70,0x3dc01203 @ φ=0.09351084 : tan φ=3fb8024060002522 42.8b
.word 0x03fad228,0x3e000368 @ φ=0.12436779 : tan φ=3fc0006cfffffc90 45.2b
.word 0x04f5ab70,0x3e1ffdea @ φ=0.15498897 : tan φ=3fc3ffbd400014d5 42.6b
.word 0x05ed56f8,0x3e3fdddc @ φ=0.18522213 : tan φ=3fc7fbbb80000beb 43.4b
.word 0x06e4cfa0,0x3e601425 @ φ=0.21543103 : tan φ=3fcc02849fffe817 42.4b
.word 0x07d8d3e0,0x3e80215d @ φ=0.24521822 : tan φ=3fd0042b9ffff89f 43.1b
.word 0x08c60460,0x3e9000a5 @ φ=0.27417201 : tan φ=3fd20014a000182b 41.4b
.word 0x09b26770,0x3ea01492 @ φ=0.30302784 : tan φ=3fd402923ffff932 43.2b
.word 0x0a996d50,0x3eb01377 @ φ=0.33122888 : tan φ=3fd6026ee0001062 42.0b
.word 0x0b7a6d10,0x3ebff4a0 @ φ=0.35869458 : tan φ=3fd7fe93ffff8c38 39.1b
.word 0x0c593ce0,0x3ed0019f @ φ=0.38589329 : tan φ=3fda0033e0001354 41.7b
.word 0x0d33ebd0,0x3ee01bbc @ φ=0.41258803 : tan φ=3fdc0377800162a1 37.5b
.word 0x0e087ab0,0x3ef01fbd @ φ=0.43853506 : tan φ=3fde03f79fffddf2 40.9b
.word 0x0ed56180,0x3effef98 @ φ=0.46354747 : tan φ=3fdffdf30000767d 39.1b
.word 0x0fa1de80,0x3f080ebf @ φ=0.48850942 : tan φ=3fe101d7dfffb9fc 38.9b
.word 0x10639d00,0x3f0fec31 @ φ=0.51215982 : tan φ=3fe1fd862000aad5 37.6b
.word 0x112690e0,0x3f180cfd @ φ=0.53595775 : tan φ=3fe3019fa00069ea 38.3b
.word 0x11e014c0,0x3f200065 @ φ=0.55860364 : tan φ=3fe4000ca00022e5 39.9b
.word 0x129651e0,0x3f2808be @ φ=0.58084959 : tan φ=3fe50117c00015a7 40.6b
.word 0x1346d400,0x3f300a7d @ φ=0.60239601 : tan φ=3fe6014f9fffa020 38.4b
.word 0x13efc7c0,0x3f37ee2f @ φ=0.62302005 : tan φ=3fe6fdc5dfff98d7 38.3b
.word 0x14988960,0x3f400c32 @ φ=0.64362019 : tan φ=3fe801863fffff81 46.0b
.word 0x1537a8c0,0x3f47ef42 @ φ=0.66304433 : tan φ=3fe8fde8400062a4 38.4b
.word 0x15d4cc60,0x3f4ff630 @ φ=0.68222636 : tan φ=3fe9fec5ffff76e2 37.9b
.word 0x166ef280,0x3f581534 @ φ=0.70104337 : tan φ=3feb02a680004e91 38.7b
.word 0x16ff75c0,0x3f5fef1e @ φ=0.71868408 : tan φ=3febfde3c0001404 40.7b
.word 0x179116a0,0x3f68184d @ φ=0.73646098 : tan φ=3fed03099ffed6e5 36.8b
.word 0x181b5aa0,0x3f701722 @ φ=0.75333911 : tan φ=3fee02e43fffd351 39.5b
.word 0x18a10560,0x3f781071 @ φ=0.76965588 : tan φ=3fef020e20005c05 38.5b
.word 0x19214060,0x3f7ff451 @ φ=0.78530902 : tan φ=3feffe8a1fffe11b 40.1b
#endif

View file

@ -0,0 +1,318 @@
/*
* Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include "pico/asm_helper.S"
#include "hardware/hazard3.h"
// This file reimplements some common single-precision soft float routines
// from libgcc. It targets the RV32IMBZbkb dialect (plus optionally Xh3bextm)
// and is tuned for Hazard3 execution timings.
// Subnormal values are always flushed to zero on both input and output.
// Rounding is always to nearest (even on tie).
pico_default_asm_setup
.macro float_section name
#if PICO_FLOAT_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm
float_section __addsf3
.global __subsf3
.p2align 2
__subsf3:
binvi a1, a1, 31
.global __addsf3
__addsf3:
// Unpack exponent:
h3.bextmi a2, a0, 23, 8
h3.bextmi a3, a1, 23, 8
// Flush-to-zero => 0 + y = y applies, including nan, with the sole
// exception of y being subnormal (which also needs to be flushed)
beqz a2, __addsf_return_y_flushed
// Don't have to handle this case for x + 0 = 0 because we already know x
// is nonzero
beqz a3, __addsf_return_x
// Unpack significand, plus 3 extra zeroes for working space:
slli a4, a0, 9
slli a5, a1, 9
// check nan/inf on input
li t0, 255
beq a2, t0, __addsf_x_nan_inf
beq a3, t0, __addsf_y_nan_inf
// (finish unpacking significand)
srli a4, a4, 6
srli a5, a5, 6
// If we're still on the straight path then we are adding two normal
// values. Add implicit one (1.xx...xx000)
bseti a4, a4, 23 + 3
bseti a5, a5, 23 + 3
// Negate if sign bit is set
bgez a0, 1f
neg a4, a4
1:
// (tuck this 16-bit instruction here to avoid an alignment penalty)
li t1, 25
bgez a1, 1f
neg a5, a5
1:
bltu a2, a3, __addsf_ye_gt_xe
// The main body is repeated twice with different register assignments.
// lhs is the more-significant addend:
.macro addsf_core packed_lhs, packed_rhs, sig_lhs, sig_rhs, exp_lhs, exp_rhs, rhs_is_x
sub \packed_rhs, \exp_lhs, \exp_rhs
// If there is a large exponent difference then there is no effect on lhs
.if \rhs_is_x
bgeu \packed_rhs, t1, __addsf_return_y
.else
bgeu \packed_rhs, t1, __addsf_return_x
.endif
// Shift rhs down to correct relative significance
sra \packed_lhs, \sig_rhs, \packed_rhs
// Set sticky bit if ones were shifted out
sll \packed_rhs, \packed_lhs, \packed_rhs
sltu \packed_rhs, \packed_rhs, \sig_rhs
or \packed_lhs, \packed_lhs, \packed_rhs
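// (shifting the truncated addend back up and comparing it with the
// original detects whether any one-bits were discarded; the 0/1 result
// is OR'd in as a sticky bit so rounding still sees them)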
// Add significands
add \sig_lhs, \sig_lhs, \packed_lhs
// Detect exact cancellation (may be beyond max normalisation shift; also
// IEEE 754 requires +0 for exact cancellation, no matter input signs)
beqz \sig_lhs, __addsf_return_0
// Convert two's complement back to sign + magnitude
srai \exp_rhs, \sig_lhs, 31
xor \sig_lhs, \sig_lhs, \exp_rhs
sub \sig_lhs, \sig_lhs, \exp_rhs
// Renormalise significand: bit 31 is now implicit one
clz \packed_lhs, \sig_lhs
sll \sig_lhs, \sig_lhs, \packed_lhs
// Adjust exponent
addi \packed_lhs, \packed_lhs, -5
sub \exp_lhs, \exp_lhs, \packed_lhs
// Round to nearest, even on tie (bias upward if above odd number)
bexti \packed_lhs, \sig_lhs, 8
addi \sig_lhs, \sig_lhs, 127
add \sig_lhs, \sig_lhs, \packed_lhs
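// (bits 7:0 sit below the result lsb at bit 8, so adding 0x7f plus the
// current lsb rounds to nearest: a fraction above 0x80 always carries,
// exactly 0x80 carries only when the lsb is already 1, i.e. ties go to even)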
// Exponent may increase by one due to rounding up from all-ones; this is
// detected by clearing of implicit one (there is a carry-out too)
bgez \sig_lhs, 3f
4:
// Detect underflow/overflow
bgeu \exp_lhs, t0, 1f
// Pack and return
packh \exp_lhs, \exp_lhs, \exp_rhs
slli \exp_lhs, \exp_lhs, 23
slli \sig_lhs, \sig_lhs, 1
srli \sig_lhs, \sig_lhs, 9
add a0, \sig_lhs, \exp_lhs
ret
1:
bgez \exp_lhs, 2f
// Signed zero on underflow
slli a0, \exp_rhs, 31
ret
2:
// Signed infinity on overflow
packh a0, t0, \exp_rhs
slli a0, a0, 23
ret
3:
// Exponent increase due to rounding (uncommon)
srli \sig_lhs, \sig_lhs, 1
addi \exp_lhs, \exp_lhs, 1
j 4b
.endm
__addsf_xe_gte_ye:
addsf_core a0, a1, a4, a5, a2, a3, 0
.p2align 2
__addsf_ye_gt_xe:
addsf_core a1, a0, a5, a4, a3, a2, 1
__addsf_x_nan_inf:
// When at least one operand is nan, we must propagate at least one of
// those nan payloads (sign of nan result is unspecified, which we take
// advantage of by implementing x - y as x + -y). Check x nan vs inf:
bnez a4, __addsf_return_x
__addsf_x_inf:
// If x is +-inf, need to distinguish the following cases:
bne a3, t0, __addsf_return_x // y is neither inf nor nan -> return x (propagate inf)
bnez a5, __addsf_return_y // y is nan: -> return y (propagate nan)
xor a5, a0, a1
srli a5, a5, 31
beqz a5, __addsf_return_x // y is inf of same sign -> return either x or y (x is faster)
li a0, -1 // y is inf of different sign -> return nan
ret
__addsf_y_nan_inf:
// Mirror of __addsf_x_nan_inf
bnez a5, __addsf_return_y
__addsf_y_inf:
bne a2, t0, __addsf_return_y
bnez a4, __addsf_return_x
xor a4, a0, a1
srli a4, a4, 31
beqz a4, __addsf_return_x
li a0, -1
ret
__addsf_return_y_flushed:
bnez a3, 1f
srli a1, a1, 23
slli a1, a1, 23
1:
__addsf_return_y:
mv a0, a1
__addsf_return_x:
ret
__addsf_return_0:
li a0, 0
ret
float_section __mulsf3
.global __mulsf3
.p2align 2
__mulsf3:
// Force y to be positive (by possibly negating x) *before* unpacking.
// This allows many special cases to be handled without repacking.
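// (sign(x*y) = sign(x) ^ sign(y), so folding y's sign into x up front means
// y's sign bit can simply be ignored from here on)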
bgez a1, 1f
binvi a0, a0, 31
1:
// Unpack exponent:
h3.bextmi a2, a0, 23, 8
h3.bextmi a3, a1, 23, 8
// Check special cases
li t0, 255
beqz a2, __mulsf_x_0
beqz a3, __mulsf_y_0
beq a2, t0, __mulsf_x_nan_inf
beq a3, t0, __mulsf_y_nan_inf
// Finish unpacking sign
srai a6, a0, 31
// Unpack significand (with implicit one in MSB)
slli a4, a0, 8
slli a5, a1, 8
bseti a4, a4, 31
bseti a5, a5, 31
// Get full 64-bit multiply result in a4:a1 (one cycle each half)
// Going from Q1.23 to Q2.46 (both left-justified)
mul a1, a4, a5
mulhu a4, a4, a5
// Normalise (shift left by either 0 or 1) -- bit 8 is the LSB of the
// final significand (ignoring rounding)
clz a0, a4
sll a4, a4, a0
sub a2, a2, a0
// After normalising we can calculate the final exponent, since rounding
// cannot increase the exponent for multiplication (unlike addition)
add a2, a2, a3
// Subtract redundant bias term (127), add 1 for normalisation correction
addi a2, a2, -126
blez a2, __mulsf_underflow
bge a2, t0, __mulsf_overflow
// Gather sticky bits from low fraction:
snez a1, a1
or a4, a4, a1
// Round to nearest, even on tie (aka bias upward if odd)
bexti a1, a4, 8
add a4, a4, a1
addi a4, a4, 127
// Pack it and ship it
packh a2, a2, a6
slli a2, a2, 23
slli a4, a4, 1
srli a4, a4, 9
add a0, a4, a2
ret
__mulsf_underflow:
// Signed zero
slli a0, a6, 31
ret
__mulsf_overflow:
// Signed inf
packh a0, t0, a6
slli a0, a0, 23
ret
__mulsf_x_0:
// 0 times nan -> propagate nan
// 0 times inf -> generate nan
// 0 times others -> 0 (need to flush significand too as we are FTZ)
bne a3, t0, __mulsf_return_flushed_x
slli a5, a1, 9
beqz a5, 1f
// Propagate nan from y
__mulsf_return_y:
mv a0, a1
ret
1:
// Generate new nan
li a0, -1
ret
__mulsf_y_0:
// Mirror image of x_0 except we still return x for signed 0, since the
// signs were already resolved.
bne a2, t0, __mulsf_return_flushed_x
slli a1, a0, 9
bnez a1, 1f
li a0, -1
1:
ret
__mulsf_return_flushed_x:
// If we don't support subnormals we at least need to flush to a canonical
// zero. This is just a sign bit in bit 31.
srli a0, a0, 31
slli a0, a0, 31
__mulsf_return_x:
ret
__mulsf_x_nan_inf:
// We know that y is not zero and is positive. So...
// x is nan -> return x
// else y is nan -> return y
// else y is inf -> return x
// else y is normal -> return x
// (the order of the first two clauses is actually our free choice)
slli a4, a0, 9
bnez a4, __mulsf_return_x
bne a3, t0, __mulsf_return_x
slli a5, a1, 9
bnez a5, __mulsf_return_y
ret // return x
__mulsf_y_nan_inf:
// We know that x is not zero, nan, nor inf. That just leaves normals.
// y is nan -> return y
// y is inf -> return inf * sgn(x) (since we already merged the signs)
slli a5, a1, 9
bnez a5, __mulsf_return_y
srai a0, a0, 31
packh a0, t0, a0
slli a0, a0, 23
ret
// This is a hack to improve soft float performance for the routines we don't
// implement (e.g. libm) in libraries built against a non-Zbb ISA dialect:
float_section __clzsi2
.global __clzsi2
__clzsi2:
clz a0, a0
ret

View file

@ -0,0 +1,346 @@
/*
* Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include "pico/asm_helper.S"
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
#ifndef PICO_FLOAT_IN_RAM
#define PICO_FLOAT_IN_RAM 0
#endif
pico_default_asm_setup
.macro float_section name
// todo separate flag for shims?
#if PICO_FLOAT_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm
float_section float_table_shim_on_use_helper
regular_func float_table_shim_on_use_helper
push {r0-r2, lr}
mov r0, ip
#ifndef NDEBUG
// sanity check to make sure we weren't called by non (shimmable_) table_tail_call macro
cmp r0, #0
bne 1f
bkpt #0
#endif
1:
ldrh r1, [r0]
lsrs r2, r1, #8
adds r0, #2
cmp r2, #0xdf
bne 1b
uxtb r1, r1 // r1 holds table offset
lsrs r2, r0, #2
bcc 1f
// unaligned
ldrh r2, [r0, #0]
ldrh r0, [r0, #2]
lsls r0, #16
orrs r0, r2
b 2f
1:
ldr r0, [r0]
2:
ldr r2, =sf_table
str r0, [r2, r1]
str r0, [sp, #12]
pop {r0-r2, pc}
float_section 642float_shims
@ convert uint64 to float, rounding
regular_func uint642float_shim
movs r2,#0 @ fall through
@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
regular_func ufix642float_shim
push {r4,r5,r14}
cmp r1,#0
bpl 3f @ positive? we can use signed code
lsls r5,r1,#31 @ contribution to sticky bits
orrs r5,r0
lsrs r0,r1,#1
subs r2,#1
b 4f
@ convert int64 to float, rounding
regular_func int642float_shim
movs r2,#0 @ fall through
@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
regular_func fix642float_shim
push {r4,r5,r14}
3:
movs r5,r0
orrs r5,r1
beq ret_pop45 @ zero? return +0
asrs r5,r1,#31 @ sign bits
2:
asrs r4,r1,#24 @ try shifting 7 bits at a time
cmp r4,r5
bne 1f @ next shift will overflow?
lsls r1,#7
lsrs r4,r0,#25
orrs r1,r4
lsls r0,#7
adds r2,#7
b 2b
1:
movs r5,r0
movs r0,r1
4:
negs r2,r2
adds r2,#32+29
// bl packx
ldr r1, =0x29ef // packx
blx r1
ret_pop45:
pop {r4,r5,r15}
float_section fatan2_shim
regular_func fatan2_shim
push {r4,r5,r14}
ldr r4, =0x29c1 // unpackx
mov ip, r4
@ unpack arguments and shift one down to have common exponent
blx ip
mov r4,r0
mov r0,r1
mov r1,r4
mov r4,r2
mov r2,r3
mov r3,r4
blx ip
lsls r0,r0,#5 @ Q28
lsls r1,r1,#5 @ Q28
adds r4,r2,r3 @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise
asrs r4,#9
adds r4,#1
bmi 2f @ force y to 0 proper, so result will be zero
subs r4,r2,r3 @ calculate shift
bge 1f @ ex>=ey?
negs r4,r4 @ make shift positive
asrs r0,r4
cmp r4,#28
blo 3f
asrs r0,#31
b 3f
1:
asrs r1,r4
cmp r4,#28
blo 3f
2:
@ here |x|>>|y| or both x and y are ±0
cmp r0,#0
bge 4f @ x positive, return signed 0
ldr r3, =0x2cfc @ &pi_q29, circular coefficients
ldr r0,[r3] @ x negative, return +/- pi
asrs r1,#31
eors r0,r1
b 7f
4:
asrs r0,r1,#31
b 7f
3:
movs r2,#0 @ initial angle
ldr r3, =0x2cfc @ &pi_q29, circular coefficients
cmp r0,#0 @ x negative
bge 5f
negs r0,r0 @ rotate to 1st/4th quadrants
negs r1,r1
ldr r2,[r3] @ pi Q29
5:
movs r4,#1 @ m=1
ldr r5, =0x2b97 @ cordic_vec
blx r5 @ also produces magnitude (with scaling factor 1.646760119), which is discarded
mov r0,r2 @ result here is -pi/2..3pi/2 Q29
@ asrs r2,#29
@ subs r0,r2
ldr r3, =0x2cfc @ &pi_q29, circular coefficients
ldr r2,[r3] @ pi Q29
adds r4,r0,r2 @ attempt to fix -3pi/2..-pi case
bcs 6f @ -pi/2..0? leave result as is
subs r4,r0,r2 @ <pi? leave as is
bmi 6f
subs r0,r4,r2 @ >pi: take off 2pi
6:
subs r0,#1 @ fiddle factor so atan2(0,1)==0
7:
movs r2,#0 @ exponent for pack
ldr r3, =0x2b19
bx r3
float_section float232_shims
regular_func float2int_shim
movs r1,#0 @ fall through
regular_func float2fix_shim
// check for -0 or -denormal upfront
asrs r2, r0, #23
adds r2, #128
adds r2, #128
beq 1f
// call original
ldr r2, =0x2acd
bx r2
1:
movs r0, #0
bx lr
float_section float264_shims
regular_func float2int64_shim
movs r1,#0 @ and fall through
regular_func float2fix64_shim
push {r14}
bl f2fix
b d2f64_a
regular_func float2uint64_shim
movs r1,#0 @ and fall through
regular_func float2ufix64_shim
asrs r3,r0,#23 @ negative? return 0
bmi ret_dzero
@ and fall through
@ convert float in r0 to signed fixed point in r0:r1:r3, r1 places after point, rounding towards -Inf
@ result clamped so that r3 can only be 0 or -1
@ trashes r12
.thumb_func
f2fix:
push {r4,r14}
mov r12,r1
asrs r3,r0,#31
lsls r0,#1
lsrs r2,r0,#24
beq 1f @ zero?
cmp r2,#0xff @ Inf?
beq 2f
subs r1,r2,#1
subs r2,#0x7f @ remove exponent bias
lsls r1,#24
subs r0,r1 @ insert implied 1
eors r0,r3
subs r0,r3 @ top two's complement
asrs r1,r0,#4 @ convert to double format
lsls r0,#28
ldr r4, =d2fix_a
bx r4
1:
movs r0,#0
movs r1,r0
movs r3,r0
pop {r4,r15}
2:
mvns r0,r3 @ return max/min value
mvns r1,r3
pop {r4,r15}
ret_dzero:
movs r0,#0
movs r1,#0
bx r14
float_section d2fix_a_float
.weak d2fix_a // weak because it exists in float shims too
.thumb_func
d2fix_a:
@ here
@ r0:r1 two's complement mantissa
@ r2 unbaised exponent
@ r3 mantissa sign extension bits
add r2,r12 @ exponent plus offset for required binary point position
subs r2,#52 @ required shift
bmi 1f @ shift down?
@ here a shift up by r2 places
cmp r2,#12 @ will clamp?
bge 2f
movs r4,r0
lsls r1,r2
lsls r0,r2
negs r2,r2
adds r2,#32 @ complementary shift
lsrs r4,r2
orrs r1,r4
pop {r4,r15}
2:
mvns r0,r3
mvns r1,r3 @ overflow: clamp to extreme fixed-point values
pop {r4,r15}
1:
@ here a shift down by -r2 places
adds r2,#32
bmi 1f @ long shift?
mov r4,r1
lsls r4,r2
negs r2,r2
adds r2,#32 @ complementary shift
asrs r1,r2
lsrs r0,r2
orrs r0,r4
pop {r4,r15}
1:
@ here a long shift down
movs r0,r1
asrs r1,#31 @ shift down 32 places
adds r2,#32
bmi 1f @ very long shift?
negs r2,r2
adds r2,#32
asrs r0,r2
pop {r4,r15}
1:
movs r0,r3 @ result very near zero: use sign extension bits
movs r1,r3
pop {r4,r15}
d2f64_a:
asrs r2,r1,#31
cmp r2,r3
bne 1f @ sign extension bits fail to match sign of result?
pop {r15}
1:
mvns r0,r3
movs r1,#1
lsls r1,#31
eors r1,r1,r0 @ generate extreme fixed-point values
pop {r15}
float_section float2double_shim
regular_func float2double_shim
lsrs r3,r0,#31 @ sign bit
lsls r3,#31
lsls r1,r0,#1
lsrs r2,r1,#24 @ exponent
beq 1f @ zero?
cmp r2,#0xff @ Inf?
beq 2f
lsrs r1,#4 @ exponent and top 20 bits of mantissa
ldr r2,=(0x3ff-0x7f)<<20 @ difference in exponent offsets
adds r1,r2
orrs r1,r3
lsls r0,#29 @ bottom 3 bits of mantissa
bx r14
1:
movs r1,r3 @ return signed zero
3:
movs r0,#0
bx r14
2:
ldr r1,=0x7ff00000 @ return signed infinity
adds r1,r3
b 3b
#endif

View file

@ -0,0 +1,90 @@
/*
* Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#ifndef _PICO_FLOAT_H
#define _PICO_FLOAT_H
#include <math.h>
#include <float.h>
#include "pico.h"
#include "pico/bootrom/sf_table.h"
#ifdef __cplusplus
extern "C" {
#endif
/** \file float.h
* \defgroup pico_float pico_float
*
* \brief Optimized single-precision floating point functions
*
* (Replacement) optimized implementations are provided for the following compiler built-ins
* and math library functions on Arm:
*
* - __aeabi_fadd, __aeabi_fdiv, __aeabi_fmul, __aeabi_frsub, __aeabi_fsub, __aeabi_cfcmpeq, __aeabi_cfrcmple, __aeabi_cfcmple, __aeabi_fcmpeq, __aeabi_fcmplt, __aeabi_fcmple, __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_fcmpun, __aeabi_i2f, __aeabi_l2f, __aeabi_ui2f, __aeabi_ul2f, __aeabi_f2iz, __aeabi_f2lz, __aeabi_f2uiz, __aeabi_f2ulz, __aeabi_f2d, sqrtf, cosf, sinf, tanf, atan2f, expf, logf
* - ldexpf, copysignf, truncf, floorf, ceilf, roundf, asinf, acosf, atanf, sinhf, coshf, tanhf, asinhf, acoshf, atanhf, exp2f, log2f, exp10f, log10f, powf, hypotf, cbrtf, fmodf, dremf, remainderf, remquof, expm1f, log1pf, fmaf
* - powintf, sincosf (GNU extensions)
*
* The following additional optimized functions are also provided:
*
* - int2float, uint2float, int642float, uint642float, fix2float, ufix2float, fix642float, ufix642float
* - float2fix, float2ufix, float2fix64, float2ufix64, float2int, float2uint, float2int64, float2uint64, float2int_z, float2int64_z, float2uint_z, float2uint64_z
* - exp10f, sincosf, powintf
*
* On RP2350 (Arm) the following additional functions are available; the _fast methods are faster but do not round correctly
*
* - float2fix64_z, fdiv_fast, fsqrt_fast,
*
* On RP2350 RISC-V, only a small number of compiler runtime functions are overridden with faster implementations:
*
* - __addsf3, __subsf3, __mulsf3
*/
// None of these functions are available on RISC-V:
#if !defined(__riscv) || PICO_COMBINED_DOCS
float int2float(int32_t f);
float uint2float(uint32_t f);
float int642float(int64_t f);
float uint642float(uint64_t f);
float fix2float(int32_t m, int e);
float ufix2float(uint32_t m, int e);
float fix642float(int64_t m, int e);
float ufix642float(uint64_t m, int e);
// These methods round towards -Infinity.
int32_t float2fix(float f, int e);
uint32_t float2ufix(float f, int e);
int64_t float2fix64(float f, int e);
uint64_t float2ufix64(float f, int e);
int32_t float2int(float f);
uint32_t float2uint(float f);
int64_t float2int64(float f);
uint64_t float2uint64(float f);
// These methods round towards 0.
int32_t float2int_z(float f);
int64_t float2int64_z(float f);
int32_t float2uint_z(float f);
int64_t float2uint64_z(float f);
float exp10f(float x);
void sincosf(float x, float *sinx, float *cosx);
float powintf(float x, int y);
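// Illustrative usage (variable names here are examples only):
//
//   int32_t q16 = 0x00012000;        // 1.125 in Q16.16 fixed point
//   float x = fix2float(q16, 16);    // -> 1.125f
//   int32_t back = float2fix(x, 16); // rounds towards -Infinity
//   float s, c;
//   sincosf(x, &s, &c);              // sine and cosine in one call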
#if !PICO_RP2040 || PICO_COMBINED_DOCS
int64_t float2fix64_z(float f, int e);
float fdiv_fast(float n, float d);
float fsqrt_fast(float f);
#endif
#endif
#ifdef __cplusplus
}
#endif
#endif