mirror of
https://github.com/betaflight/betaflight.git
synced 2025-07-15 04:15:44 +03:00
Adding RP2350 SDK and target framework (#13988)
* Adding RP2350 SDK and target framework * Spacing * Removing board definitions
This commit is contained in:
parent
462cb05930
commit
2dd6f95aad
576 changed files with 435012 additions and 0 deletions
253
lib/main/pico-sdk/rp2_common/pico_float/float_aeabi_dcp.S
Normal file
253
lib/main/pico-sdk/rp2_common/pico_float/float_aeabi_dcp.S
Normal file
|
@ -0,0 +1,253 @@
|
|||
/*
|
||||
* Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#include "pico/asm_helper.S"
|
||||
#if HAS_DOUBLE_COPROCESSOR
|
||||
#include "hardware/dcp_instr.inc.S"
|
||||
#include "hardware/dcp_canned.inc.S"
|
||||
|
||||
pico_default_asm_setup
|
||||
|
||||
// todo alignment
|
||||
//__pre_init __aeabi_float_init, 00020
|
||||
// factor out save/restore (there is a copy in double code)
|
||||
|
||||
.macro float_section name
|
||||
#if PICO_FLOAT_IN_RAM
|
||||
.section RAM_SECTION_NAME(\name), "ax"
|
||||
#else
|
||||
.section SECTION_NAME(\name), "ax"
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro float_wrapper_section func
|
||||
float_section WRAPPER_FUNC_NAME(\func)
|
||||
.endm
|
||||
|
||||
// ============== STATE SAVE AND RESTORE ===============
|
||||
|
||||
.macro saving_func func
|
||||
// Note we are usually 32-bit aligned already at this point, as most of the
|
||||
// function bodies contain exactly two 16-bit instructions: bmi and bx lr.
|
||||
// We want the PCMP word-aligned.
|
||||
.p2align 2
|
||||
// When the engaged flag is set, branch back here to invoke save routine and
|
||||
// hook lr with the restore routine, then fall back through to the entry
|
||||
// point. The engaged flag will be clear when checked a second time.
|
||||
1:
|
||||
push {lr} // 16-bit instruction
|
||||
bl generic_save_state // 32-bit instruction
|
||||
b 1f // 16-bit instruction
|
||||
// This is the actual entry point:
|
||||
wrapper_func \func
|
||||
PCMP apsr_nzcv
|
||||
bmi 1b
|
||||
1:
|
||||
.endm
|
||||
|
||||
.macro saving_func_return
|
||||
bx lr
|
||||
.endm
|
||||
|
||||
float_section __rp2350_dcp_engaged_state_save_restore
|
||||
.thumb_func
|
||||
generic_save_state:
|
||||
sub sp, #24
|
||||
push {r0, r1}
|
||||
// do save here
|
||||
PXMD r0, r1
|
||||
strd r0, r1, [sp, #8 + 0]
|
||||
PYMD r0, r1
|
||||
strd r0, r1, [sp, #8 + 8]
|
||||
REFD r0, r1
|
||||
strd r0, r1, [sp, #8 + 16]
|
||||
pop {r0, r1}
|
||||
blx lr
|
||||
// <- wrapped function returns here
|
||||
// fall through into restore:
|
||||
.thumb_func
|
||||
generic_restore_state:
|
||||
// do restore here
|
||||
pop {r12, r14}
|
||||
WXMD r12, r14
|
||||
pop {r12, r14}
|
||||
WYMD r12, r14
|
||||
pop {r12, r14}
|
||||
WEFD r12, r14
|
||||
pop {pc}
|
||||
|
||||
// ============== ARITHMETIC FUNCTIONS ===============
|
||||
|
||||
float_wrapper_section __aeabi_fadd
|
||||
saving_func __aeabi_fadd
|
||||
dcp_fadd_m r0,r0,r1
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section __aeabi_fsub
|
||||
saving_func __aeabi_fsub
|
||||
dcp_fsub_m r0,r0,r1
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section __aeabi_frsub
|
||||
saving_func __aeabi_frsub
|
||||
dcp_fsub_m r0,r1,r0
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section __aeabi_fmul
|
||||
saving_func __aeabi_fmul
|
||||
dcp_fmul_m r0,r0,r1,r0,r1
|
||||
saving_func_return
|
||||
|
||||
float_section fdiv_fast
|
||||
saving_func fdiv_fast
|
||||
dcp_fdiv_fast_m r0,r0,r1,r0,r1,r2
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section __aeabi_fdiv
|
||||
saving_func __aeabi_fdiv
|
||||
@ with correct rounding
|
||||
dcp_fdiv_m r0,r0,r1,r0,r1,r2,r3
|
||||
saving_func_return
|
||||
|
||||
float_section sqrtf_fast
|
||||
saving_func sqrtf_fast
|
||||
dcp_fsqrt_fast_m r0,r0,r0,r1,r2,r3
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section sqrtf
|
||||
saving_func sqrtf
|
||||
@ with correct rounding
|
||||
dcp_fsqrt_m r0,r0,r0,r1,r2,r3
|
||||
saving_func_return
|
||||
|
||||
// todo not a real thing
|
||||
float_wrapper_section __aeabi_fclassify
|
||||
saving_func __aeabi_fclassify
|
||||
dcp_fclassify_m apsr_nzcv,r0
|
||||
saving_func_return
|
||||
|
||||
// ============== CONVERSION FUNCTIONS ===============
|
||||
|
||||
float_wrapper_section __aeabi_f2d
|
||||
saving_func __aeabi_f2d
|
||||
dcp_float2double_m r0,r1,r0
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section __aeabi_i2f
|
||||
saving_func __aeabi_i2f
|
||||
@ with rounding
|
||||
dcp_int2float_m r0,r0
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section __aeabi_ui2f
|
||||
saving_func __aeabi_ui2f
|
||||
@ with rounding
|
||||
dcp_uint2float_m r0,r0
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section __aeabi_f2iz
|
||||
saving_func __aeabi_f2iz
|
||||
@ with truncation towards 0
|
||||
dcp_float2int_m r0,r0
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section __aeabi_f2uiz
|
||||
saving_func __aeabi_f2uiz
|
||||
@ with truncation towards 0
|
||||
dcp_float2uint_m r0,r0
|
||||
saving_func_return
|
||||
|
||||
// todo not a real thing
|
||||
float_wrapper_section __aeabi_f2i_r
|
||||
saving_func __aeabi_f2i_r
|
||||
@ with rounding
|
||||
dcp_float2int_r_m r0,r0
|
||||
saving_func_return
|
||||
|
||||
// todo not a real thing
|
||||
float_wrapper_section __aeabi_f2ui_r
|
||||
saving_func __aeabi_f2ui_r
|
||||
@ with rounding
|
||||
dcp_float2uint_r_m r0,r0
|
||||
saving_func_return
|
||||
|
||||
// ============== COMPARISON FUNCTIONS ===============
|
||||
|
||||
float_wrapper_section __aeabi_fcmpun
|
||||
saving_func __aeabi_fcmpun
|
||||
dcp_fcmp_m r0,r0,r1
|
||||
// extract unordered bit
|
||||
ubfx r0, r0, #28, #1
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section __aeabi_fcmp
|
||||
saving_func __aeabi_cfrcmple
|
||||
dcp_fcmp_m apsr_nzcv,r1,r0 // with arguments reversed
|
||||
bvs cmp_nan
|
||||
saving_func_return
|
||||
|
||||
// these next two can be the same function in the absence of exceptions
|
||||
saving_func __aeabi_cfcmple
|
||||
dcp_fcmp_m apsr_nzcv,r0,r1
|
||||
bvs cmp_nan
|
||||
saving_func_return
|
||||
|
||||
// It is not clear from the ABI documentation whether cfcmpeq must set the C flag
|
||||
// in the same way as cfcmple. If not, we could save the "bvs" below; but we
|
||||
// err on the side of caution.
|
||||
saving_func __aeabi_cfcmpeq
|
||||
dcp_fcmp_m apsr_nzcv,r0,r1
|
||||
bvs cmp_nan
|
||||
saving_func_return
|
||||
|
||||
// If the result of a flag-setting comparison is "unordered" then we need to set C and clear Z.
|
||||
// We could conceivably just do lsrs r12,r14,#1, or even cmp r14,r14,lsr#1 as (a) r14 here is a
|
||||
// return address and r14b0=1 for Thumb mode; (b) we are unlikely to be returning to address 0.
|
||||
cmp_nan:
|
||||
movs r12, #3 // r12 does not need to be preserved by the flag-setting comparisons
|
||||
lsrs r12, #1 // set C, clear Z
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section __aeabi_fcmpeq
|
||||
saving_func __aeabi_fcmpeq
|
||||
dcp_fcmp_m r0,r0,r1
|
||||
// extract Z
|
||||
ubfx r0, r0, #30, #1
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section __aeabi_fcmplt
|
||||
saving_func __aeabi_fcmplt
|
||||
dcp_fcmp_m apsr_nzcv,r1,r0
|
||||
ite hi
|
||||
movhi r0,#1
|
||||
movls r0,#0
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section __aeabi_fcmple
|
||||
saving_func __aeabi_fcmple
|
||||
dcp_fcmp_m apsr_nzcv,r1,r0
|
||||
ite hs
|
||||
movhs r0,#1
|
||||
movlo r0,#0
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section __aeabi_fcmpge
|
||||
saving_func __aeabi_fcmpge
|
||||
dcp_fcmp_m apsr_nzcv,r0,r1
|
||||
ite hs
|
||||
movhs r0,#1
|
||||
movlo r0,#0
|
||||
saving_func_return
|
||||
|
||||
float_wrapper_section __aeabi_fcmpgt
|
||||
saving_func __aeabi_fcmpgt
|
||||
dcp_fcmp_m apsr_nzcv,r0,r1
|
||||
ite hi
|
||||
movhi r0,#1
|
||||
movls r0,#0
|
||||
saving_func_return
|
||||
|
||||
#endif
|
776
lib/main/pico-sdk/rp2_common/pico_float/float_aeabi_rp2040.S
Normal file
776
lib/main/pico-sdk/rp2_common/pico_float/float_aeabi_rp2040.S
Normal file
|
@ -0,0 +1,776 @@
|
|||
/*
|
||||
* Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#include "pico/asm_helper.S"
|
||||
#include "pico/runtime_init.h"
|
||||
#include "pico/bootrom/sf_table.h"
|
||||
#include "hardware/divider_helper.S"
|
||||
|
||||
PICO_RUNTIME_INIT_FUNC_RUNTIME(__aeabi_float_init, PICO_RUNTIME_INIT_AEABI_FLOAT)
|
||||
|
||||
pico_default_asm_setup
|
||||
|
||||
.macro float_section name
|
||||
#if PICO_FLOAT_IN_RAM
|
||||
.section RAM_SECTION_NAME(\name), "ax"
|
||||
#else
|
||||
.section SECTION_NAME(\name), "ax"
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro float_wrapper_section func
|
||||
float_section WRAPPER_FUNC_NAME(\func)
|
||||
.endm
|
||||
|
||||
.macro _float_wrapper_func x
|
||||
wrapper_func \x
|
||||
.endm
|
||||
|
||||
.macro wrapper_func_f1 x
|
||||
_float_wrapper_func \x
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
mov ip, lr
|
||||
bl __check_nan_f1
|
||||
mov lr, ip
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro wrapper_func_f2 x
|
||||
_float_wrapper_func \x
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
mov ip, lr
|
||||
bl __check_nan_f2
|
||||
mov lr, ip
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.section .text
|
||||
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
.thumb_func
|
||||
__check_nan_f1:
|
||||
movs r3, #1
|
||||
lsls r3, #24
|
||||
lsls r2, r0, #1
|
||||
adds r2, r3
|
||||
bhi 1f
|
||||
bx lr
|
||||
1:
|
||||
bx ip
|
||||
|
||||
.thumb_func
|
||||
__check_nan_f2:
|
||||
movs r3, #1
|
||||
lsls r3, #24
|
||||
lsls r2, r0, #1
|
||||
adds r2, r3
|
||||
bhi 1f
|
||||
lsls r2, r1, #1
|
||||
adds r2, r3
|
||||
bhi 2f
|
||||
bx lr
|
||||
2:
|
||||
mov r0, r1
|
||||
1:
|
||||
bx ip
|
||||
#endif
|
||||
|
||||
.macro table_tail_call SF_TABLE_OFFSET
|
||||
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
|
||||
#ifndef NDEBUG
|
||||
movs r3, #0
|
||||
mov ip, r3
|
||||
#endif
|
||||
#endif
|
||||
ldr r3, =sf_table
|
||||
ldr r3, [r3, #\SF_TABLE_OFFSET]
|
||||
bx r3
|
||||
.endm
|
||||
|
||||
.macro shimmable_table_tail_call SF_TABLE_OFFSET shim
|
||||
ldr r3, =sf_table
|
||||
ldr r3, [r3, #\SF_TABLE_OFFSET]
|
||||
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
|
||||
mov ip, pc
|
||||
#endif
|
||||
bx r3
|
||||
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
|
||||
.byte \SF_TABLE_OFFSET, 0xdf
|
||||
.word \shim
|
||||
#endif
|
||||
.endm
|
||||
|
||||
|
||||
// note generally each function is in a separate section unless there is fall thru or branching between them
|
||||
// note fadd, fsub, fmul, fdiv are so tiny and just defer to rom so are lumped together so they can share constant pool
|
||||
|
||||
// note functions are word aligned except where they are an odd number of linear instructions
|
||||
|
||||
// float FUNC_NAME(__aeabi_fadd)(float, float) single-precision addition
|
||||
float_wrapper_section __aeabi_farithmetic
|
||||
// float FUNC_NAME(__aeabi_frsub)(float x, float y) single-precision reverse subtraction, y - x
|
||||
|
||||
// frsub first because it is the only one that needs alignment
|
||||
.align 2
|
||||
wrapper_func __aeabi_frsub
|
||||
eors r0, r1
|
||||
eors r1, r0
|
||||
eors r0, r1
|
||||
// fall thru
|
||||
|
||||
// float FUNC_NAME(__aeabi_fsub)(float x, float y) single-precision subtraction, x - y
|
||||
wrapper_func_f2 __aeabi_fsub
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
// we want to return nan for inf-inf or -inf - -inf, but without too much upfront cost
|
||||
mov r2, r0
|
||||
eors r2, r1
|
||||
bmi 1f // different signs
|
||||
push {r0, r1, lr}
|
||||
bl 1f
|
||||
b fdiv_fsub_nan_helper
|
||||
1:
|
||||
#endif
|
||||
table_tail_call SF_TABLE_FSUB
|
||||
|
||||
wrapper_func_f2 __aeabi_fadd
|
||||
table_tail_call SF_TABLE_FADD
|
||||
|
||||
// float FUNC_NAME(__aeabi_fdiv)(float n, float d) single-precision division, n / d
|
||||
wrapper_func_f2 __aeabi_fdiv
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
push {r0, r1, lr}
|
||||
bl 1f
|
||||
b fdiv_fsub_nan_helper
|
||||
1:
|
||||
#endif
|
||||
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
|
||||
// to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
|
||||
ldr r2, =(SIO_BASE)
|
||||
ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
|
||||
lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
|
||||
bcs fdiv_save_state
|
||||
#else
|
||||
// to avoid worrying about IRQs (or context switches), simply disable interrupts around call
|
||||
push {r4, lr}
|
||||
mrs r4, PRIMASK
|
||||
cpsid i
|
||||
bl fdiv_shim_call
|
||||
msr PRIMASK, r4
|
||||
pop {r4, pc}
|
||||
#endif
|
||||
fdiv_shim_call:
|
||||
table_tail_call SF_TABLE_FDIV
|
||||
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
|
||||
fdiv_save_state:
|
||||
save_div_state_and_lr
|
||||
bl fdiv_shim_call
|
||||
ldr r2, =(SIO_BASE)
|
||||
restore_div_state_and_return
|
||||
#endif
|
||||
|
||||
fdiv_fsub_nan_helper:
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
pop {r1, r2}
|
||||
|
||||
// check for infinite op infinite (or rather check for infinite result with both
|
||||
// operands being infinite)
|
||||
lsls r3, r0, #1
|
||||
asrs r3, r3, #24
|
||||
adds r3, #1
|
||||
beq 2f
|
||||
pop {pc}
|
||||
2:
|
||||
lsls r1, #1
|
||||
asrs r1, r1, #24
|
||||
lsls r2, #1
|
||||
asrs r2, r2, #24
|
||||
ands r1, r2
|
||||
adds r1, #1
|
||||
bne 3f
|
||||
// infinite to nan
|
||||
movs r1, #1
|
||||
lsls r1, #22
|
||||
orrs r0, r1
|
||||
3:
|
||||
pop {pc}
|
||||
#endif
|
||||
|
||||
// float FUNC_NAME(__aeabi_fmul)(float, float) single-precision multiplication
|
||||
wrapper_func_f2 __aeabi_fmul
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
push {r0, r1, lr}
|
||||
bl 1f
|
||||
pop {r1, r2}
|
||||
|
||||
// check for multiplication of infinite by zero (or rather check for infinite result with either
|
||||
// operand 0)
|
||||
lsls r3, r0, #1
|
||||
asrs r3, r3, #24
|
||||
adds r3, #1
|
||||
beq 2f
|
||||
pop {pc}
|
||||
2:
|
||||
ands r1, r2
|
||||
bne 3f
|
||||
// infinite to nan
|
||||
movs r1, #1
|
||||
lsls r1, #22
|
||||
orrs r0, r1
|
||||
3:
|
||||
pop {pc}
|
||||
1:
|
||||
#endif
|
||||
table_tail_call SF_TABLE_FMUL
|
||||
|
||||
// void FUNC_NAME(__aeabi_cfrcmple)(float, float) reversed 3-way (<, =, ?>) compare [1], result in PSR ZC flags
|
||||
float_wrapper_section __aeabi_cfcmple
|
||||
.align 2
|
||||
wrapper_func __aeabi_cfrcmple
|
||||
push {r0-r2, lr}
|
||||
eors r0, r1
|
||||
eors r1, r0
|
||||
eors r0, r1
|
||||
b __aeabi_cfcmple_guts
|
||||
|
||||
// NOTE these share an implementation as we have no excepting NaNs.
|
||||
// void FUNC_NAME(__aeabi_cfcmple)(float, float) 3-way (<, =, ?>) compare [1], result in PSR ZC flags
|
||||
// void FUNC_NAME(__aeabi_cfcmpeq)(float, float) non-excepting equality comparison [1], result in PSR ZC flags
|
||||
.align 2
|
||||
wrapper_func __aeabi_cfcmple
|
||||
wrapper_func __aeabi_cfcmpeq
|
||||
push {r0-r2, lr}
|
||||
|
||||
__aeabi_cfcmple_guts:
|
||||
lsls r2,r0,#1
|
||||
lsrs r2,#24
|
||||
beq 1f
|
||||
cmp r2,#0xff
|
||||
bne 2f
|
||||
lsls r2, r0, #9
|
||||
bhi 3f
|
||||
1:
|
||||
lsrs r0,#23 @ clear mantissa if denormal or infinite
|
||||
lsls r0,#23
|
||||
2:
|
||||
lsls r2,r1,#1
|
||||
lsrs r2,#24
|
||||
beq 1f
|
||||
cmp r2,#0xff
|
||||
bne 2f
|
||||
lsls r2, r1, #9
|
||||
bhi 3f
|
||||
1:
|
||||
lsrs r1,#23 @ clear mantissa if denormal or infinite
|
||||
lsls r1,#23
|
||||
2:
|
||||
movs r2,#1 @ initialise result
|
||||
eors r1,r0
|
||||
bmi 2f @ opposite signs? then can proceed on basis of sign of x
|
||||
eors r1,r0 @ restore y
|
||||
bpl 1f
|
||||
cmp r1,r0
|
||||
pop {r0-r2, pc}
|
||||
1:
|
||||
cmp r0,r1
|
||||
pop {r0-r2, pc}
|
||||
2:
|
||||
orrs r1, r0 @ handle 0/-0
|
||||
adds r1, r1 @ note this always sets C
|
||||
beq 3f
|
||||
mvns r0, r0 @ carry inverse of r0 sign
|
||||
adds r0, r0
|
||||
3:
|
||||
pop {r0-r2, pc}
|
||||
|
||||
|
||||
// int FUNC_NAME(__aeabi_fcmpeq)(float, float) result (1, 0) denotes (=, ?<>) [2], use for C == and !=
|
||||
float_wrapper_section __aeabi_fcmpeq
|
||||
.align 2
|
||||
wrapper_func __aeabi_fcmpeq
|
||||
push {lr}
|
||||
bl __aeabi_cfcmpeq
|
||||
beq 1f
|
||||
movs r0, #0
|
||||
pop {pc}
|
||||
1:
|
||||
movs r0, #1
|
||||
pop {pc}
|
||||
|
||||
// int FUNC_NAME(__aeabi_fcmplt)(float, float) result (1, 0) denotes (<, ?>=) [2], use for C <
|
||||
float_wrapper_section __aeabi_fcmplt
|
||||
.align 2
|
||||
wrapper_func __aeabi_fcmplt
|
||||
push {lr}
|
||||
bl __aeabi_cfcmple
|
||||
sbcs r0, r0
|
||||
pop {pc}
|
||||
|
||||
// int FUNC_NAME(__aeabi_fcmple)(float, float) result (1, 0) denotes (<=, ?>) [2], use for C <=
|
||||
float_wrapper_section __aeabi_fcmple
|
||||
.align 2
|
||||
wrapper_func __aeabi_fcmple
|
||||
push {lr}
|
||||
bl __aeabi_cfcmple
|
||||
bls 1f
|
||||
movs r0, #0
|
||||
pop {pc}
|
||||
1:
|
||||
movs r0, #1
|
||||
pop {pc}
|
||||
|
||||
// int FUNC_NAME(__aeabi_fcmpge)(float, float) result (1, 0) denotes (>=, ?<) [2], use for C >=
|
||||
float_wrapper_section __aeabi_fcmpge
|
||||
.align 2
|
||||
wrapper_func __aeabi_fcmpge
|
||||
push {lr}
|
||||
// because of NaNs it is better to reverse the args than the result
|
||||
bl __aeabi_cfrcmple
|
||||
bls 1f
|
||||
movs r0, #0
|
||||
pop {pc}
|
||||
1:
|
||||
movs r0, #1
|
||||
pop {pc}
|
||||
|
||||
// int FUNC_NAME(__aeabi_fcmpgt)(float, float) result (1, 0) denotes (>, ?<=) [2], use for C >
|
||||
float_wrapper_section __aeabi_fcmpgt
|
||||
wrapper_func __aeabi_fcmpgt
|
||||
push {lr}
|
||||
// because of NaNs it is better to reverse the args than the result
|
||||
bl __aeabi_cfrcmple
|
||||
sbcs r0, r0
|
||||
pop {pc}
|
||||
|
||||
// int FUNC_NAME(__aeabi_fcmpun)(float, float) result (1, 0) denotes (?, <=>) [2], use for C99 isunordered()
|
||||
float_wrapper_section __aeabi_fcmpun
|
||||
wrapper_func __aeabi_fcmpun
|
||||
movs r3, #1
|
||||
lsls r3, #24
|
||||
lsls r2, r0, #1
|
||||
adds r2, r3
|
||||
bhi 1f
|
||||
lsls r2, r1, #1
|
||||
adds r2, r3
|
||||
bhi 1f
|
||||
movs r0, #0
|
||||
bx lr
|
||||
1:
|
||||
movs r0, #1
|
||||
bx lr
|
||||
|
||||
|
||||
// float FUNC_NAME(__aeabi_ui2f)(unsigned) unsigned to float (single precision) conversion
|
||||
float_wrapper_section __aeabi_ui2f
|
||||
wrapper_func __aeabi_ui2f
|
||||
regular_func uint2float
|
||||
subs r1, r1
|
||||
cmp r0, #0
|
||||
bne __aeabi_i2f_main
|
||||
mov r0, r1
|
||||
bx lr
|
||||
|
||||
float_wrapper_section __aeabi_i2f
|
||||
// float FUNC_NAME(__aeabi_i2f)(int) integer to float (single precision) conversion
|
||||
wrapper_func __aeabi_i2f
|
||||
regular_func int2float
|
||||
lsrs r1, r0, #31
|
||||
lsls r1, #31
|
||||
bpl 1f
|
||||
negs r0, r0
|
||||
1:
|
||||
cmp r0, #0
|
||||
beq 7f
|
||||
__aeabi_i2f_main:
|
||||
|
||||
mov ip, lr
|
||||
push {r0, r1}
|
||||
ldr r3, =sf_clz_func
|
||||
ldr r3, [r3]
|
||||
blx r3
|
||||
pop {r1, r2}
|
||||
lsls r1, r0
|
||||
subs r0, #158
|
||||
negs r0, r0
|
||||
|
||||
adds r1,#0x80 @ rounding
|
||||
bcs 5f @ tripped carry? then have leading 1 in C as required (and result is even so can ignore sticky bits)
|
||||
|
||||
lsls r3,r1,#24 @ check bottom 8 bits of r1
|
||||
beq 6f @ in rounding-tie case?
|
||||
lsls r1,#1 @ remove leading 1
|
||||
3:
|
||||
lsrs r1,#9 @ align mantissa
|
||||
lsls r0,#23 @ align exponent
|
||||
orrs r0,r2 @ assemble exponent and mantissa
|
||||
4:
|
||||
orrs r0,r1 @ apply sign
|
||||
1:
|
||||
bx ip
|
||||
5:
|
||||
adds r0,#1 @ correct exponent offset
|
||||
b 3b
|
||||
6:
|
||||
lsrs r1,#9 @ ensure even result
|
||||
lsls r1,#10
|
||||
b 3b
|
||||
7:
|
||||
bx lr
|
||||
|
||||
|
||||
// int FUNC_NAME(__aeabi_f2iz)(float) float (single precision) to integer C-style conversion [3]
|
||||
float_wrapper_section __aeabi_f2iz
|
||||
wrapper_func __aeabi_f2iz
|
||||
regular_func float2int_z
|
||||
lsls r1, r0, #1
|
||||
lsrs r2, r1, #24
|
||||
movs r3, #0x80
|
||||
lsls r3, #24
|
||||
cmp r2, #126
|
||||
ble 1f
|
||||
subs r2, #158
|
||||
bge 2f
|
||||
asrs r1, r0, #31
|
||||
lsls r0, #9
|
||||
lsrs r0, #1
|
||||
orrs r0, r3
|
||||
negs r2, r2
|
||||
lsrs r0, r2
|
||||
lsls r1, #1
|
||||
adds r1, #1
|
||||
muls r0, r1
|
||||
bx lr
|
||||
1:
|
||||
movs r0, #0
|
||||
bx lr
|
||||
2:
|
||||
lsrs r0, #31
|
||||
adds r0, r3
|
||||
subs r0, #1
|
||||
bx lr
|
||||
|
||||
cmn r0, r0
|
||||
bcc float2int
|
||||
push {lr}
|
||||
lsls r0, #1
|
||||
lsrs r0, #1
|
||||
movs r1, #0
|
||||
bl __aeabi_f2uiz
|
||||
cmp r0, #0
|
||||
bmi 1f
|
||||
negs r0, r0
|
||||
pop {pc}
|
||||
1:
|
||||
movs r0, #128
|
||||
lsls r0, #24
|
||||
pop {pc}
|
||||
|
||||
float_section float2int
|
||||
regular_func float2int
|
||||
shimmable_table_tail_call SF_TABLE_FLOAT2INT float2int_shim
|
||||
|
||||
float_section float2fix
|
||||
regular_func float2fix
|
||||
shimmable_table_tail_call SF_TABLE_FLOAT2FIX float2fix_shim
|
||||
|
||||
float_section float2ufix
|
||||
regular_func float2ufix
|
||||
table_tail_call SF_TABLE_FLOAT2UFIX
|
||||
|
||||
// unsigned FUNC_NAME(__aeabi_f2uiz)(float) float (single precision) to unsigned C-style conversion [3]
|
||||
float_wrapper_section __aeabi_f2uiz
|
||||
wrapper_func __aeabi_f2uiz
|
||||
regular_func float2uint_z
|
||||
table_tail_call SF_TABLE_FLOAT2UINT
|
||||
|
||||
float_section fix2float
|
||||
regular_func fix2float
|
||||
table_tail_call SF_TABLE_FIX2FLOAT
|
||||
|
||||
float_section ufix2float
|
||||
regular_func ufix2float
|
||||
table_tail_call SF_TABLE_UFIX2FLOAT
|
||||
|
||||
float_section fix642float
|
||||
regular_func fix642float
|
||||
shimmable_table_tail_call SF_TABLE_FIX642FLOAT fix642float_shim
|
||||
|
||||
float_section ufix642float
|
||||
regular_func ufix642float
|
||||
shimmable_table_tail_call SF_TABLE_UFIX642FLOAT ufix642float_shim
|
||||
|
||||
// float FUNC_NAME(__aeabi_l2f)(long long) long long to float (single precision) conversion
|
||||
float_wrapper_section __aeabi_l2f
|
||||
1:
|
||||
ldr r2, =__aeabi_i2f
|
||||
bx r2
|
||||
wrapper_func __aeabi_l2f
|
||||
regular_func int642float
|
||||
asrs r2, r0, #31
|
||||
cmp r1, r2
|
||||
beq 1b
|
||||
shimmable_table_tail_call SF_TABLE_INT642FLOAT int642float_shim
|
||||
|
||||
// float FUNC_NAME(__aeabi_l2f)(long long) long long to float (single precision) conversion
|
||||
float_wrapper_section __aeabi_ul2f
|
||||
1:
|
||||
ldr r2, =__aeabi_ui2f
|
||||
bx r2
|
||||
wrapper_func __aeabi_ul2f
|
||||
regular_func uint642float
|
||||
cmp r1, #0
|
||||
beq 1b
|
||||
shimmable_table_tail_call SF_TABLE_UINT642FLOAT uint642float_shim
|
||||
|
||||
// long long FUNC_NAME(__aeabi_f2lz)(float) float (single precision) to long long C-style conversion [3]
|
||||
float_wrapper_section __aeabi_f2lz
|
||||
wrapper_func __aeabi_f2lz
|
||||
regular_func float2int64_z
|
||||
cmn r0, r0
|
||||
bcc float2int64
|
||||
push {lr}
|
||||
lsls r0, #1
|
||||
lsrs r0, #1
|
||||
movs r1, #0
|
||||
bl float2ufix64
|
||||
cmp r1, #0
|
||||
bmi 1f
|
||||
movs r2, #0
|
||||
negs r0, r0
|
||||
sbcs r2, r1
|
||||
mov r1, r2
|
||||
pop {pc}
|
||||
1:
|
||||
movs r1, #128
|
||||
lsls r1, #24
|
||||
movs r0, #0
|
||||
pop {pc}
|
||||
|
||||
float_section float2int64
|
||||
regular_func float2int64
|
||||
shimmable_table_tail_call SF_TABLE_FLOAT2INT64 float2int64_shim
|
||||
|
||||
float_section float2fix64
|
||||
regular_func float2fix64
|
||||
shimmable_table_tail_call SF_TABLE_FLOAT2FIX64 float2fix64_shim
|
||||
|
||||
// unsigned long long FUNC_NAME(__aeabi_f2ulz)(float) float to unsigned long long C-style conversion [3]
|
||||
float_wrapper_section __aeabi_f2ulz
|
||||
wrapper_func __aeabi_f2ulz
|
||||
regular_func float2uint64_z
|
||||
shimmable_table_tail_call SF_TABLE_FLOAT2UINT64 float2uint64_shim
|
||||
|
||||
float_section float2ufix64
|
||||
regular_func float2ufix64
|
||||
shimmable_table_tail_call SF_TABLE_FLOAT2UFIX64 float2ufix64_shim
|
||||
|
||||
float_wrapper_section __aeabi_f2d
|
||||
1:
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
// copy sign bit and 25 NAN id bits into sign bit and significant ID bits, also setting the high id bit
|
||||
asrs r1, r0, #3
|
||||
movs r2, #0xf
|
||||
lsls r2, #27
|
||||
orrs r1, r2
|
||||
lsls r0, #25
|
||||
bx lr
|
||||
#endif
|
||||
wrapper_func __aeabi_f2d
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
movs r3, #1
|
||||
lsls r3, #24
|
||||
lsls r2, r0, #1
|
||||
adds r2, r3
|
||||
bhi 1b
|
||||
#endif
|
||||
shimmable_table_tail_call SF_TABLE_FLOAT2DOUBLE float2double_shim
|
||||
|
||||
float_wrapper_section sqrtf
|
||||
wrapper_func_f1 sqrtf
|
||||
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
|
||||
// check for negative
|
||||
asrs r1, r0, #23
|
||||
bmi 1f
|
||||
#endif
|
||||
table_tail_call SF_TABLE_FSQRT
|
||||
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
|
||||
1:
|
||||
mvns r0, r1
|
||||
cmp r0, #255
|
||||
bne 2f
|
||||
// -0 or -Denormal return -0 (0x80000000)
|
||||
lsls r0, #31
|
||||
bx lr
|
||||
2:
|
||||
// return -Inf (0xff800000)
|
||||
asrs r0, r1, #31
|
||||
lsls r0, #23
|
||||
bx lr
|
||||
#endif
|
||||
|
||||
float_wrapper_section cosf
|
||||
// note we don't use _f1 since we do an infinity/nan check for outside of range
|
||||
wrapper_func cosf
|
||||
// rom version only works for -128 < angle < 128
|
||||
lsls r1, r0, #1
|
||||
lsrs r1, #24
|
||||
cmp r1, #127 + 7
|
||||
bge 1f
|
||||
2:
|
||||
table_tail_call SF_TABLE_FCOS
|
||||
1:
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
// also check for infinites
|
||||
cmp r1, #255
|
||||
bne 3f
|
||||
// infinite to nan
|
||||
movs r1, #1
|
||||
lsls r1, #22
|
||||
orrs r0, r1
|
||||
bx lr
|
||||
3:
|
||||
#endif
|
||||
ldr r1, =0x40c90fdb // 2 * M_PI
|
||||
push {lr}
|
||||
bl remainderf
|
||||
pop {r1}
|
||||
mov lr, r1
|
||||
b 2b
|
||||
|
||||
float_wrapper_section sinf
|
||||
// note we don't use _f1 since we do an infinity/nan check for outside of range
|
||||
wrapper_func sinf
|
||||
// rom version only works for -128 < angle < 128
|
||||
lsls r1, r0, #1
|
||||
lsrs r1, #24
|
||||
cmp r1, #127 + 7
|
||||
bge 1f
|
||||
2:
|
||||
table_tail_call SF_TABLE_FSIN
|
||||
1:
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
// also check for infinites
|
||||
cmp r1, #255
|
||||
bne 3f
|
||||
// infinite to nan
|
||||
movs r1, #1
|
||||
lsls r1, #22
|
||||
orrs r0, r1
|
||||
bx lr
|
||||
3:
|
||||
#endif
|
||||
ldr r1, =0x40c90fdb // 2 * M_PI
|
||||
push {lr}
|
||||
bl remainderf
|
||||
pop {r1}
|
||||
mov lr, r1
|
||||
b 2b
|
||||
|
||||
float_wrapper_section sincosf
|
||||
// note we don't use _f1 since we do an infinity/nan check for outside of range
|
||||
wrapper_func sincosf
|
||||
push {r1, r2, lr}
|
||||
// rom version only works for -128 < angle < 128
|
||||
lsls r3, r0, #1
|
||||
lsrs r3, #24
|
||||
cmp r3, #127 + 7
|
||||
bge 3f
|
||||
2:
|
||||
ldr r3, =sf_table
|
||||
ldr r3, [r3, #SF_TABLE_FSIN]
|
||||
blx r3
|
||||
pop {r2, r3}
|
||||
str r0, [r2]
|
||||
str r1, [r3]
|
||||
pop {pc}
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
.align 2
|
||||
pop {pc}
|
||||
#endif
|
||||
3:
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
// also check for infinites
|
||||
cmp r3, #255
|
||||
bne 4f
|
||||
// infinite to nan
|
||||
movs r3, #1
|
||||
lsls r3, #22
|
||||
orrs r0, r3
|
||||
str r0, [r1]
|
||||
str r0, [r2]
|
||||
add sp, #12
|
||||
bx lr
|
||||
4:
|
||||
#endif
|
||||
ldr r1, =0x40c90fdb // 2 * M_PI
|
||||
push {lr}
|
||||
bl remainderf
|
||||
pop {r1}
|
||||
mov lr, r1
|
||||
b 2b
|
||||
|
||||
float_wrapper_section tanf
|
||||
// note we don't use _f1 since we do an infinity/nan check for outside of range
|
||||
wrapper_func tanf
|
||||
// rom version only works for -128 < angle < 128
|
||||
lsls r1, r0, #1
|
||||
lsrs r1, #24
|
||||
cmp r1, #127 + 7
|
||||
bge ftan_out_of_range
|
||||
ftan_in_range:
|
||||
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
|
||||
// to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
|
||||
ldr r2, =(SIO_BASE)
|
||||
ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
|
||||
lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
|
||||
bcs ftan_save_state
|
||||
#else
|
||||
// to avoid worrying about IRQs (or context switches), simply disable interrupts around call
|
||||
push {r4, lr}
|
||||
mrs r4, PRIMASK
|
||||
cpsid i
|
||||
bl ftan_shim_call
|
||||
msr PRIMASK, r4
|
||||
pop {r4, pc}
|
||||
#endif
|
||||
ftan_shim_call:
|
||||
table_tail_call SF_TABLE_FTAN
|
||||
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
|
||||
ftan_save_state:
|
||||
save_div_state_and_lr
|
||||
bl ftan_shim_call
|
||||
ldr r2, =(SIO_BASE)
|
||||
restore_div_state_and_return
|
||||
#endif
|
||||
ftan_out_of_range:
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
// also check for infinites
|
||||
cmp r1, #255
|
||||
bne 3f
|
||||
// infinite to nan
|
||||
movs r1, #1
|
||||
lsls r1, #22
|
||||
orrs r0, r1
|
||||
bx lr
|
||||
3:
|
||||
#endif
|
||||
ldr r1, =0x40c90fdb // 2 * M_PI
|
||||
push {lr}
|
||||
bl remainderf
|
||||
pop {r1}
|
||||
mov lr, r1
|
||||
b ftan_in_range
|
||||
|
||||
float_wrapper_section atan2f
|
||||
wrapper_func_f2 atan2f
|
||||
shimmable_table_tail_call SF_TABLE_FATAN2 fatan2_shim
|
||||
|
||||
float_wrapper_section expf
|
||||
wrapper_func_f1 expf
|
||||
table_tail_call SF_TABLE_FEXP
|
||||
|
||||
float_wrapper_section logf
|
||||
wrapper_func_f1 logf
|
||||
table_tail_call SF_TABLE_FLN
|
369
lib/main/pico-sdk/rp2_common/pico_float/float_conv_m33.S
Normal file
369
lib/main/pico-sdk/rp2_common/pico_float/float_conv_m33.S
Normal file
|
@ -0,0 +1,369 @@
|
|||
/*
|
||||
* Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#if !PICO_RP2040
|
||||
#include "pico/asm_helper.S"
|
||||
|
||||
pico_default_asm_setup
|
||||
|
||||
.macro float_section name
|
||||
#if PICO_FLOAT_IN_RAM
|
||||
.section RAM_SECTION_NAME(\name), "ax"
|
||||
#else
|
||||
.section SECTION_NAME(\name), "ax"
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro float_wrapper_section func
|
||||
float_section WRAPPER_FUNC_NAME(\func)
|
||||
.endm
|
||||
|
||||
float_wrapper_section conv_tof
|
||||
|
||||
@ convert int64 to float, rounding
|
||||
wrapper_func __aeabi_l2f
|
||||
regular_func int642float
|
||||
movs r2,#0 @ fall through
|
||||
@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
|
||||
regular_func fix642float
|
||||
cmp r1,#0
|
||||
bge 10f @ positive? use unsigned code
|
||||
rsbs r0,#0
|
||||
sbc r1,r1,r1,lsl#1 @ make positive
|
||||
cbz r1,7f @ high word is zero?
|
||||
clz r3,r1
|
||||
subs r3,#8
|
||||
bmi 2f
|
||||
lsls r1,r3
|
||||
lsls r12,r0,r3 @ bits that will be lost
|
||||
rsb r3,#32
|
||||
lsr r0,r3
|
||||
orr r0,r0,r1
|
||||
sub r2,r2,r3
|
||||
rsb r2,#149
|
||||
adds r12,r12,r12 @ rounding bit into carry
|
||||
adc r0,r0,r2,lsl#23 @ insert exponent, add rounding
|
||||
orr r0,r0,#0x80000000
|
||||
beq 4f @ potential rounding tie?
|
||||
cmp r2,#0xfe
|
||||
bhs 3f @ over/underflow?
|
||||
bx r14
|
||||
2:
|
||||
add r3,#33
|
||||
lsls r12,r1,r3 @ rounding bit in carry, sticky bits in r12
|
||||
orrs r12,r12,r0 @ all of low word into sticky bits: affects Z but not C
|
||||
rsb r3,#33
|
||||
lsr r0,r1,r3
|
||||
@ push {r14}
|
||||
@ bl dumpreg
|
||||
@ pop {r14}
|
||||
sub r2,r3,r2
|
||||
add r2,#22+127+32
|
||||
adc r0,r0,r2,lsl#23 @ insert exponent, add rounding
|
||||
orr r0,r0,#0x80000000
|
||||
beq 4f @ potential rounding tie?
|
||||
cmp r2,#0xfe
|
||||
it lo
|
||||
bxlo r14
|
||||
@ over/underflow?
|
||||
3:
|
||||
mov r0,#0x80000000 @ underflow
|
||||
it ge
|
||||
movtge r0,#0xff80 @ overflow
|
||||
1:
|
||||
bx r14
|
||||
7:
|
||||
mov r1,r2
|
||||
b fix2float_neg
|
||||
4:
|
||||
it cs @ rounding tie?
|
||||
biccs r0,r0,#1 @ force to even if we rounded up
|
||||
cmp r2,#0xfe
|
||||
it lo
|
||||
bxlo r14
|
||||
b 3b
|
||||
|
||||
@ convert signed 32-bit fix to float, rounding; number of r0 bits after point in r1
|
||||
.thumb_func
|
||||
regular_func fix2float
|
||||
cmp r0,#0
|
||||
bge ufix2float @ positive? can use unsigned code
|
||||
rsbs r0,#0 @ make positive
|
||||
fix2float_neg:
|
||||
clz r3,r0
|
||||
subs r3,#8
|
||||
bmi 2f
|
||||
lsls r0,r3
|
||||
add r2,r1,r3
|
||||
rsb r2,#149
|
||||
add r0,r0,r2,lsl#23 @ insert exponent
|
||||
orr r0,#0x80000000
|
||||
cmp r2,#0xfe
|
||||
it lo @ over/underflow?
|
||||
bxlo r14
|
||||
b 3f
|
||||
2:
|
||||
add r3,#33
|
||||
lsls r12,r0,r3 @ rounding bit in carry, sticky bits in r12
|
||||
rsb r3,#33
|
||||
lsr r0,r3
|
||||
@ push {r14}
|
||||
@ bl dumpreg
|
||||
@ pop {r14}
|
||||
sub r2,r3,r1
|
||||
add r2,#22+127
|
||||
adc r0,r0,r2,lsl#23 @ insert exponent
|
||||
orr r0,#0x80000000
|
||||
beq 4f @ potential rounding tie?
|
||||
cmp r2,#0xfe
|
||||
it lo
|
||||
bxlo r14
|
||||
@ over/underflow?
|
||||
3:
|
||||
mov r0,#0x80000000 @ underflow
|
||||
it ge
|
||||
orrge r0,#0x7f800000 @ overflow
|
||||
1:
|
||||
bx r14
|
||||
4:
|
||||
it cs @ rounding tie?
|
||||
biccs r0,r0,#1 @ force to even if we rounded up
|
||||
cmp r2,#0xfe
|
||||
it lo
|
||||
bxlo r14
|
||||
b 3b
|
||||
|
||||
@ convert unsigned 32-bit fix to float, rounding; number of r0 bits after point in r1
|
||||
regular_func ufix2float
|
||||
cbz r0,1f @ zero? return it
|
||||
clz r3,r0
|
||||
subs r3,#8
|
||||
bmi 2f
|
||||
lsls r0,r3
|
||||
add r2,r1,r3
|
||||
rsb r2,#149
|
||||
add r0,r0,r2,lsl#23 @ insert exponent
|
||||
@ push {r14}
|
||||
@ bl dumpreg
|
||||
@ pop {r14}
|
||||
cmp r2,#0xfe
|
||||
it lo @ over/underflow?
|
||||
bxlo r14
|
||||
b 3f
|
||||
2:
|
||||
add r3,#33
|
||||
lsls r12,r0,r3 @ rounding bit in carry, sticky bits in r12
|
||||
rsb r3,#33
|
||||
lsr r0,r3
|
||||
@ push {r14}
|
||||
@ bl dumpreg
|
||||
@ pop {r14}
|
||||
sub r2,r3,r1
|
||||
add r2,#22+127
|
||||
adc r0,r0,r2,lsl#23 @ insert exponent
|
||||
beq 4f @ potential rounding tie?
|
||||
cmp r2,#0xfe
|
||||
it lo
|
||||
bxlo r14
|
||||
@ over/underflow?
|
||||
3:
|
||||
ite ge
|
||||
movge r0,#0x7f800000 @ overflow
|
||||
movlt r0,#0x00000000 @ underflow
|
||||
1:
|
||||
bx r14
|
||||
4:
|
||||
it cs @ rounding tie?
|
||||
biccs r0,r0,#1 @ force to even if we rounded up
|
||||
cmp r2,#0xfe
|
||||
it lo
|
||||
bxlo r14
|
||||
b 3b
|
||||
|
||||
@ convert uint64 to float, rounding
|
||||
wrapper_func __aeabi_ul2f
|
||||
regular_func uint642float
|
||||
movs r2,#0 @ fall through
|
||||
@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
|
||||
regular_func ufix642float
|
||||
10:
|
||||
cbz r1,7f @ high word is zero?
|
||||
clz r3,r1
|
||||
subs r3,#8
|
||||
bmi 2f
|
||||
lsls r1,r3
|
||||
lsls r12,r0,r3 @ bits that will be lost
|
||||
rsb r3,#32
|
||||
lsr r0,r3
|
||||
orr r0,r0,r1
|
||||
sub r2,r2,r3
|
||||
rsb r2,#149
|
||||
adds r12,r12,r12 @ rounding bit into carry
|
||||
adc r0,r0,r2,lsl#23 @ insert exponent, add rounding
|
||||
beq 4f @ potential rounding tie?
|
||||
cmp r2,#0xfe
|
||||
bhs 3f @ over/underflow?
|
||||
bx r14
|
||||
2:
|
||||
add r3,#33
|
||||
lsls r12,r1,r3 @ rounding bit in carry, sticky bits in r12
|
||||
orrs r12,r12,r0 @ all of low word into sticky bits: affects Z but not C
|
||||
rsb r3,#33
|
||||
lsr r0,r1,r3
|
||||
@ push {r14}
|
||||
@ bl dumpreg
|
||||
@ pop {r14}
|
||||
sub r2,r3,r2
|
||||
add r2,#22+127+32
|
||||
adc r0,r0,r2,lsl#23 @ insert exponent, add rounding
|
||||
beq 4f @ potential rounding tie?
|
||||
cmp r2,#0xfe
|
||||
it lo
|
||||
bxlo r14
|
||||
@ over/underflow?
|
||||
3:
|
||||
ite ge
|
||||
movge r0,#0x7f800000 @ overflow
|
||||
movlt r0,#0x00000000 @ underflow
|
||||
1:
|
||||
bx r14
|
||||
7:
|
||||
mov r1,r2
|
||||
b ufix2float
|
||||
4:
|
||||
it cs @ rounding tie?
|
||||
biccs r0,r0,#1 @ force to even if we rounded up
|
||||
cmp r2,#0xfe
|
||||
it lo
|
||||
bxlo r14
|
||||
b 3b
|
||||
|
||||
float_wrapper_section conv_ftoi64
|
||||
|
||||
@ convert float to signed int64, rounding towards 0, clamping
|
||||
wrapper_func __aeabi_f2lz
|
||||
regular_func float2int64_z
|
||||
movs r1,#0 @ fall through
|
||||
@ convert float in r0 to signed fixed point in r0:r1, clamping
|
||||
regular_func float2fix64_z
|
||||
subs r1,#0x95 @ remove exponent bias, compensate for mantissa length
|
||||
asrs r2,r0,#23 @ sign and exponent
|
||||
sub r3,r2,#1
|
||||
sub r0,r0,r3,lsl#23 @ install implied 1, clear exponent
|
||||
uxtb r3,r3
|
||||
cmp r3,#0xfe
|
||||
bhs 1f @ 0 or Inf/NaN?
|
||||
adds r1,r3 @ offset exponent by fix precision; r1 is now required left shift
|
||||
bmi 4f @ actually a right shift?
|
||||
subs r3,r1,#32 @ result fits in high 32 bits only?
|
||||
bge 8f
|
||||
subs r3,r1,#8 @ result fits in low 32 bits only?
|
||||
ble 7f
|
||||
lsls r0,#8
|
||||
rsbs r1,r3,#32
|
||||
lsrs r1,r0,r1
|
||||
lsls r0,r3
|
||||
cmp r2,#0
|
||||
it ge
|
||||
bxge r14
|
||||
rsbs r0,#0 @ negate if necessary
|
||||
sbcs r1,r1,r1,lsl#1
|
||||
bx r14
|
||||
7:
|
||||
lsls r0,r0,r1
|
||||
movs r1,r2,asr#31 @ sign extend
|
||||
eors r0,r0,r1 @ negate if necessary
|
||||
subs r0,r0,r1
|
||||
bx r14
|
||||
8:
|
||||
cmp r3,#8 @ overflow?
|
||||
bge 5f
|
||||
lsls r0,r0,r3
|
||||
eor r1,r0,r2,asr#31 @ negate if necessary
|
||||
add r1,r1,r2,lsr#31
|
||||
movs r0,#0
|
||||
bx r14
|
||||
1:
|
||||
bhi 3f @ 0?
|
||||
lsls r1,r0,#9 @ mantissa field
|
||||
it ne @ NaN?
|
||||
movne r2,#0 @ treat NaNs as +∞
|
||||
5:
|
||||
mvn r1,#0x80000000 @ = 0x7fffffff
|
||||
add r1,r1,r2,lsr#31 @ so -Inf → 0x80000000, +Inf → 0x7fffffff
|
||||
mvn r0,r2,asr#31
|
||||
bx r14
|
||||
3:
|
||||
movs r0,#0
|
||||
movs r1,#0
|
||||
bx r14
|
||||
4:
|
||||
rsbs r1,#0
|
||||
usat r1,#5,r1
|
||||
lsrs r0,r0,r1
|
||||
eors r0,r0,r2,asr#31 @ negate if necessary
|
||||
adds r0,r0,r2,lsr#31
|
||||
movs r1,r0,asr#31 @ sign extend
|
||||
bx r14
|
||||
|
||||
float_wrapper_section conv_ftoui64
|
||||
|
||||
@ convert float to unsigned int64, rounding towards -Inf, clamping
|
||||
wrapper_func __aeabi_f2ulz
|
||||
regular_func float2uint64
|
||||
regular_func float2uint64_z
|
||||
movs r1,#0 @ fall through
|
||||
@ convert float in r0 to unsigned fixed point in r0:r1, clamping
|
||||
regular_func float2ufix64
|
||||
//regular_func float2ufix64_z
|
||||
subs r1,#0x96 @ remove exponent bias, compensate for mantissa length
|
||||
asrs r2,r0,#23 @ sign and exponent
|
||||
sub r3,r2,#1
|
||||
cmp r3,#0xfe
|
||||
bhs 1f @ -ve, 0 or Inf/NaN?
|
||||
sub r0,r0,r3,lsl#23 @ install implied 1, clear exponent
|
||||
adds r1,r2 @ offset exponent by fix precision; r1 is now required left shift
|
||||
bmi 4f @ actually a right shift?
|
||||
subs r2,r1,#7
|
||||
ble 7f @ result (easily) fits in lo 32 bits?
|
||||
subs r3,r1,#32
|
||||
bge 8f @ results might fit in hi 32 bits?
|
||||
lsls r0,r0,#7
|
||||
rsbs r3,r2,#32
|
||||
lsrs r1,r0,r3
|
||||
lsls r0,r0,r2
|
||||
bx r14
|
||||
7:
|
||||
lsls r0,r1
|
||||
movs r1,#0
|
||||
bx r14
|
||||
8:
|
||||
cmp r1,#32+9 @ overflow?
|
||||
bge 5f
|
||||
lsls r1,r0,r3
|
||||
movs r0,#0
|
||||
bx r14
|
||||
5:
|
||||
mvn r0,#0 @ = 0xffffffff
|
||||
mvn r1,#0 @ = 0xffffffff
|
||||
bx r14
|
||||
4:
|
||||
rsbs r1,#0
|
||||
usat r1,#5,r1 @ if shift is long return 0
|
||||
lsrs r0,r0,r1
|
||||
movs r1,#0
|
||||
bx r14
|
||||
1:
|
||||
cmp r0,#0xff800000
|
||||
bhi 5b @ -NaN, return 0xffffffff
|
||||
cmp r0,#0x00800000
|
||||
bgt 5b @ +Inf or +NaN, return 0xfffffff
|
||||
2:
|
||||
movs r0,#0 @ return 0
|
||||
movs r1,#0
|
||||
bx r14
|
||||
|
||||
#endif
|
|
@ -0,0 +1,70 @@
|
|||
/*
|
||||
* Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include "pico/bootrom.h"
|
||||
#include "pico/bootrom/sf_table.h"
|
||||
|
||||
// NOTE THIS FUNCTION TABLE IS NOT PUBLIC OR NECESSARILY COMPLETE...
|
||||
// IT IS ***NOT*** SAFE TO CALL THESE FUNCTION POINTERS FROM ARBITRARY CODE
|
||||
uint32_t sf_table[SF_TABLE_V2_SIZE / 2];
|
||||
void __attribute__((weak)) *sf_clz_func;
|
||||
|
||||
#if !(PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED)
|
||||
static __attribute__((noreturn)) void missing_float_func_shim(void) {
|
||||
panic("");
|
||||
}
|
||||
#endif
|
||||
|
||||
void __aeabi_float_init(void) {
|
||||
int rom_version = rp2040_rom_version();
|
||||
void *rom_table = rom_data_lookup(rom_table_code('S', 'F'));
|
||||
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
|
||||
if (rom_version == 1) {
|
||||
memcpy(&sf_table, rom_table, SF_TABLE_V1_SIZE);
|
||||
extern void float_table_shim_on_use_helper(void);
|
||||
// todo replace NDEBUG with a more exclusive assertion guard
|
||||
#ifndef NDEBUG
|
||||
if (*(uint16_t *)0x29ee != 0x0fc4 || // this is packx
|
||||
*(uint16_t *)0x29c0 != 0x0dc2 || // this is upackx
|
||||
*(uint16_t *)0x2b96 != 0xb5c0 || // this is cordic_vec
|
||||
*(uint16_t *)0x2b18 != 0x2500 || // this is packretns
|
||||
*(uint16_t *)0x2acc != 0xb510 || // this is float2fix
|
||||
*(uint32_t *)0x2cfc != 0x6487ed51 // pi_q29
|
||||
) {
|
||||
panic("");
|
||||
}
|
||||
#endif
|
||||
|
||||
// this is a little tricky.. we only want to pull in a shim if the corresponding function
|
||||
// is called. to that end we include a SVC instruction with the table offset as the call number
|
||||
// followed by the shim function pointer inside the actual wrapper function. that way if the wrapper
|
||||
// function is garbage collected, so is the shim function.
|
||||
//
|
||||
// float_table_shim_on_use_helper expects this SVC instruction in the calling code soon after the address
|
||||
// pointed to by IP and patches the float_table entry with the real shim the first time the function is called.
|
||||
|
||||
for(uint i=SF_TABLE_V1_SIZE/4; i<SF_TABLE_V2_SIZE/4; i++) {
|
||||
sf_table[i] = (uintptr_t)float_table_shim_on_use_helper;
|
||||
}
|
||||
// we shim these for -0 and -denormal handling
|
||||
sf_table[SF_TABLE_FLOAT2INT/4] = sf_table[SF_TABLE_FLOAT2FIX/4] = (uintptr_t)float_table_shim_on_use_helper;
|
||||
}
|
||||
#else
|
||||
if (rom_version == 1) {
|
||||
memcpy(&sf_table, rom_table, SF_TABLE_V1_SIZE);
|
||||
// opting for soft failure for now - you'll get a panic at runtime if you call any of the missing methods
|
||||
for(uint i=0;i<SF_TABLE_V2_SIZE/4;i++) {
|
||||
if (!sf_table[i]) sf_table[i] = (uintptr_t)missing_float_func_shim;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (rom_version >= 2) {
|
||||
assert(*((uint8_t *)rom_table-2) * 4 >= SF_TABLE_V2_SIZE);
|
||||
memcpy(&sf_table, rom_table, SF_TABLE_V2_SIZE);
|
||||
}
|
||||
sf_clz_func = rom_func_lookup(ROM_FUNC_CLZ32);
|
||||
}
|
582
lib/main/pico-sdk/rp2_common/pico_float/float_math.c
Normal file
582
lib/main/pico-sdk/rp2_common/pico_float/float_math.c
Normal file
|
@ -0,0 +1,582 @@
|
|||
/*
|
||||
* Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#include "pico/float.h"
|
||||
|
||||
// opened a separate issue https://github.com/raspberrypi/pico-sdk/issues/166 to deal with these warnings if at all
|
||||
GCC_Pragma("GCC diagnostic push")
|
||||
GCC_Pragma("GCC diagnostic ignored \"-Wconversion\"")
|
||||
GCC_Pragma("GCC diagnostic ignored \"-Wsign-conversion\"")
|
||||
|
||||
typedef uint32_t ui32;
|
||||
typedef int32_t i32;
|
||||
|
||||
#define FPINF ( HUGE_VALF)
|
||||
#define FMINF (-HUGE_VALF)
|
||||
#define NANF ((float)NAN)
|
||||
#define PZERO (+0.0)
|
||||
#define MZERO (-0.0)
|
||||
|
||||
#define PI 3.14159265358979323846
|
||||
#define LOG2 0.69314718055994530941
|
||||
// Unfortunately in double precision ln(10) is very close to half-way between to representable numbers
|
||||
#define LOG10 2.30258509299404568401
|
||||
#define LOG2E 1.44269504088896340737
|
||||
#define LOG10E 0.43429448190325182765
|
||||
#define ONETHIRD 0.33333333333333333333
|
||||
|
||||
#define PIf 3.14159265358979323846f
|
||||
#define LOG2f 0.69314718055994530941f
|
||||
#define LOG2Ef 1.44269504088896340737f
|
||||
#define LOG10Ef 0.43429448190325182765f
|
||||
#define ONETHIRDf 0.33333333333333333333f
|
||||
|
||||
#define FUNPACK(x,e,m) e=((x)>>23)&0xff,m=((x)&0x007fffff)|0x00800000
|
||||
#define FUNPACKS(x,s,e,m) s=((x)>>31),FUNPACK((x),(e),(m))
|
||||
|
||||
typedef union {
|
||||
float f;
|
||||
ui32 ix;
|
||||
} float_ui32;
|
||||
|
||||
static inline float ui322float(ui32 ix) {
|
||||
float_ui32 tmp;
|
||||
tmp.ix = ix;
|
||||
return tmp.f;
|
||||
}
|
||||
|
||||
static inline ui32 float2ui32(float f) {
|
||||
float_ui32 tmp;
|
||||
tmp.f = f;
|
||||
return tmp.ix;
|
||||
}
|
||||
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
static inline bool fisnan(float x) {
|
||||
ui32 ix=float2ui32(x);
|
||||
return ix * 2 > 0xff000000u;
|
||||
}
|
||||
|
||||
#define check_nan_f1(x) if (fisnan((x))) return (x)
|
||||
#define check_nan_f2(x,y) if (fisnan((x))) return (x); else if (fisnan((y))) return (y);
|
||||
#else
|
||||
#define check_nan_f1(x) ((void)0)
|
||||
#define check_nan_f2(x,y) ((void)0)
|
||||
#endif
|
||||
|
||||
static inline int fgetsignexp(float x) {
|
||||
ui32 ix=float2ui32(x);
|
||||
return (ix>>23)&0x1ff;
|
||||
}
|
||||
|
||||
static inline int fgetexp(float x) {
|
||||
ui32 ix=float2ui32(x);
|
||||
return (ix>>23)&0xff;
|
||||
}
|
||||
|
||||
static inline float fldexp(float x,int de) {
|
||||
ui32 ix=float2ui32(x),iy;
|
||||
int e;
|
||||
e=fgetexp(x);
|
||||
if(e==0||e==0xff) return x;
|
||||
e+=de;
|
||||
if(e<=0) iy=ix&0x80000000; // signed zero for underflow
|
||||
else if(e>=0xff) iy=(ix&0x80000000)|0x7f800000ULL; // signed infinity on overflow
|
||||
else iy=ix+((ui32)de<<23);
|
||||
return ui322float(iy);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(ldexpf)(float x, int de) {
|
||||
check_nan_f1(x);
|
||||
return fldexp(x, de);
|
||||
}
|
||||
|
||||
static inline float fcopysign(float x,float y) {
|
||||
ui32 ix=float2ui32(x),iy=float2ui32(y);
|
||||
ix=((ix&0x7fffffff)|(iy&0x80000000));
|
||||
return ui322float(ix);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(copysignf)(float x, float y) {
|
||||
check_nan_f2(x,y);
|
||||
return fcopysign(x, y);
|
||||
}
|
||||
|
||||
static inline int fiszero(float x) { return fgetexp (x)==0; }
|
||||
//static inline int fispzero(float x) { return fgetsignexp(x)==0; }
|
||||
//static inline int fismzero(float x) { return fgetsignexp(x)==0x100; }
|
||||
static inline int fisinf(float x) { return fgetexp (x)==0xff; }
|
||||
static inline int fispinf(float x) { return fgetsignexp(x)==0xff; }
|
||||
static inline int fisminf(float x) { return fgetsignexp(x)==0x1ff; }
|
||||
|
||||
static inline int fisint(float x) {
|
||||
ui32 ix=float2ui32(x),m;
|
||||
int e=fgetexp(x);
|
||||
if(e==0) return 1; // 0 is an integer
|
||||
e-=0x7f; // remove exponent bias
|
||||
if(e<0) return 0; // |x|<1
|
||||
e=23-e; // bit position in mantissa with significance 1
|
||||
if(e<=0) return 1; // |x| large, so must be an integer
|
||||
m=(1<<e)-1; // mask for bits of significance <1
|
||||
if(ix&m) return 0; // not an integer
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline int fisoddint(float x) {
|
||||
ui32 ix=float2ui32(x),m;
|
||||
int e=fgetexp(x);
|
||||
e-=0x7f; // remove exponent bias
|
||||
if(e<0) return 0; // |x|<1; 0 is not odd
|
||||
e=23-e; // bit position in mantissa with significance 1
|
||||
if(e<0) return 0; // |x| large, so must be even
|
||||
m=(1<<e)-1; // mask for bits of significance <1 (if any)
|
||||
if(ix&m) return 0; // not an integer
|
||||
if(e==23) return 1; // value is exactly 1
|
||||
return (ix>>e)&1;
|
||||
}
|
||||
|
||||
static inline int fisstrictneg(float x) {
|
||||
ui32 ix=float2ui32(x);
|
||||
if(fiszero(x)) return 0;
|
||||
return ix>>31;
|
||||
}
|
||||
|
||||
static inline int fisneg(float x) {
|
||||
ui32 ix=float2ui32(x);
|
||||
return ix>>31;
|
||||
}
|
||||
|
||||
static inline float fneg(float x) {
|
||||
ui32 ix=float2ui32(x);
|
||||
ix^=0x80000000;
|
||||
return ui322float(ix);
|
||||
}
|
||||
|
||||
static inline int fispo2(float x) {
|
||||
ui32 ix=float2ui32(x);
|
||||
if(fiszero(x)) return 0;
|
||||
if(fisinf(x)) return 0;
|
||||
ix&=0x007fffff;
|
||||
return ix==0;
|
||||
}
|
||||
|
||||
static inline float fnan_or(float x) {
|
||||
#if PICO_FLOAT_PROPAGATE_NANS
|
||||
return NANF;
|
||||
#else
|
||||
return x;
|
||||
#endif
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(truncf)(float x) {
|
||||
check_nan_f1(x);
|
||||
ui32 ix=float2ui32(x),m;
|
||||
int e=fgetexp(x);
|
||||
e-=0x7f; // remove exponent bias
|
||||
if(e<0) { // |x|<1
|
||||
ix&=0x80000000;
|
||||
return ui322float(ix);
|
||||
}
|
||||
e=23-e; // bit position in mantissa with significance 1
|
||||
if(e<=0) return x; // |x| large, so must be an integer
|
||||
m=(1<<e)-1; // mask for bits of significance <1
|
||||
ix&=~m;
|
||||
return ui322float(ix);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(roundf)(float x) {
|
||||
check_nan_f1(x);
|
||||
ui32 ix=float2ui32(x),m;
|
||||
int e=fgetexp(x);
|
||||
e-=0x7f; // remove exponent bias
|
||||
if(e<-1) { // |x|<0.5
|
||||
ix&=0x80000000;
|
||||
return ui322float(ix);
|
||||
}
|
||||
if(e==-1) { // 0.5<=|x|<1
|
||||
ix&=0x80000000;
|
||||
ix|=0x3f800000; // ±1
|
||||
return ui322float(ix);
|
||||
}
|
||||
e=23-e; // bit position in mantissa with significance 1, <=23
|
||||
if(e<=0) return x; // |x| large, so must be an integer
|
||||
m=1<<(e-1); // mask for bit of significance 0.5
|
||||
ix+=m;
|
||||
m=m+m-1; // mask for bits of significance <1
|
||||
ix&=~m;
|
||||
return ui322float(ix);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(floorf)(float x) {
|
||||
check_nan_f1(x);
|
||||
ui32 ix=float2ui32(x),m;
|
||||
int e=fgetexp(x);
|
||||
if(e==0) { // x==0
|
||||
ix&=0x80000000;
|
||||
return ui322float(ix);
|
||||
}
|
||||
e-=0x7f; // remove exponent bias
|
||||
if(e<0) { // |x|<1, not zero
|
||||
if(fisneg(x)) return -1;
|
||||
return PZERO;
|
||||
}
|
||||
e=23-e; // bit position in mantissa with significance 1
|
||||
if(e<=0) return x; // |x| large, so must be an integer
|
||||
m=(1<<e)-1; // mask for bit of significance <1
|
||||
if(fisneg(x)) ix+=m; // add 1-ε to magnitude if negative
|
||||
ix&=~m; // truncate
|
||||
return ui322float(ix);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(ceilf)(float x) {
|
||||
check_nan_f1(x);
|
||||
ui32 ix=float2ui32(x),m;
|
||||
int e=fgetexp(x);
|
||||
if(e==0) { // x==0
|
||||
ix&=0x80000000;
|
||||
return ui322float(ix);
|
||||
}
|
||||
e-=0x7f; // remove exponent bias
|
||||
if(e<0) { // |x|<1, not zero
|
||||
if(fisneg(x)) return MZERO;
|
||||
return 1;
|
||||
}
|
||||
e=23-e; // bit position in mantissa with significance 1
|
||||
if(e<=0) return x; // |x| large, so must be an integer
|
||||
m=(1<<e)-1; // mask for bit of significance <1
|
||||
if(!fisneg(x)) ix+=m; // add 1-ε to magnitude if positive
|
||||
ix&=~m; // truncate
|
||||
return ui322float(ix);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(asinf)(float x) {
|
||||
check_nan_f1(x);
|
||||
float u;
|
||||
u=(1.0f-x)*(1.0f+x);
|
||||
if(fisstrictneg(u)) return fnan_or(FPINF);
|
||||
return atan2f(x,sqrtf(u));
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(acosf)(float x) {
|
||||
check_nan_f1(x);
|
||||
float u;
|
||||
u=(1.0f-x)*(1.0f+x);
|
||||
if(fisstrictneg(u)) return fnan_or(FPINF);
|
||||
return atan2f(sqrtf(u),x);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(atanf)(float x) {
|
||||
check_nan_f1(x);
|
||||
if(fispinf(x)) return (float)( PIf/2);
|
||||
if(fisminf(x)) return (float)(-PIf/2);
|
||||
return atan2f(x,1.0f);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(sinhf)(float x) {
|
||||
check_nan_f1(x);
|
||||
return fldexp((expf(x)-expf(fneg(x))),-1);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(coshf)(float x) {
|
||||
check_nan_f1(x);
|
||||
return fldexp((expf(x)+expf(fneg(x))),-1);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(tanhf)(float x) {
|
||||
check_nan_f1(x);
|
||||
float u;
|
||||
int e;
|
||||
e=fgetexp(x);
|
||||
if(e>=4+0x7f) { // |x|>=16?
|
||||
if(!fisneg(x)) return 1; // 1 << exp 2x; avoid generating infinities later
|
||||
else return -1; // 1 >> exp 2x
|
||||
}
|
||||
u=expf(fldexp(x,1));
|
||||
return (u-1.0f)/(u+1.0f);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(asinhf)(float x) {
|
||||
check_nan_f1(x);
|
||||
int e;
|
||||
e=fgetexp(x);
|
||||
if(e>=16+0x7f) { // |x|>=2^16?
|
||||
if(!fisneg(x)) return logf( x )+LOG2f; // 1/x^2 << 1
|
||||
else return fneg(logf(fneg(x))+LOG2f); // 1/x^2 << 1
|
||||
}
|
||||
if(x>0) return (float)log(sqrt((double)x*(double)x+1.0)+(double)x);
|
||||
else return fneg((float)log(sqrt((double)x*(double)x+1.0)-(double)x));
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(acoshf)(float x) {
|
||||
check_nan_f1(x);
|
||||
int e;
|
||||
if(fisneg(x)) x=fneg(x);
|
||||
e=fgetexp(x);
|
||||
if(e>=16+0x7f) return logf(x)+LOG2f; // |x|>=2^16?
|
||||
return (float)log(sqrt(((double)x+1.0)*((double)x-1.0))+(double)x);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(atanhf)(float x) {
|
||||
check_nan_f1(x);
|
||||
return fldexp(logf((1.0f+x)/(1.0f-x)),-1);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(exp2f)(float x) { check_nan_f1(x); return (float)exp((double)x*LOG2); }
|
||||
float WRAPPER_FUNC(log2f)(float x) { check_nan_f1(x); return logf(x)*LOG2Ef; }
|
||||
float WRAPPER_FUNC(exp10f)(float x) { check_nan_f1(x); return (float)exp((double)x*LOG10); }
|
||||
float WRAPPER_FUNC(log10f)(float x) { check_nan_f1(x); return logf(x)*LOG10Ef; }
|
||||
|
||||
float WRAPPER_FUNC(expm1f)(float x) { check_nan_f1(x); return (float)(exp((double)x)-1); }
|
||||
float WRAPPER_FUNC(log1pf)(float x) { check_nan_f1(x); return (float)(log(1+(double)x)); }
|
||||
float WRAPPER_FUNC(fmaf)(float x,float y,float z) {
|
||||
check_nan_f2(x,y);
|
||||
check_nan_f1(z);
|
||||
return (float)((double)x*(double)y+(double)z);
|
||||
} // has double rounding so not exact
|
||||
|
||||
// general power, x>0
|
||||
static inline float fpow_1(float x,float y) {
|
||||
return (float)exp(log((double)x)*(double)y); // using double-precision intermediates for better accuracy
|
||||
}
|
||||
|
||||
static float fpow_int2(float x,int y) {
|
||||
float u;
|
||||
if(y==1) return x;
|
||||
u=fpow_int2(x,y/2);
|
||||
u*=u;
|
||||
if(y&1) u*=x;
|
||||
return u;
|
||||
}
|
||||
|
||||
// for the case where x not zero or infinity, y small and not zero
|
||||
static inline float fpowint_1(float x,int y) {
|
||||
if(y<0) x=1.0f/x,y=-y;
|
||||
return fpow_int2(x,y);
|
||||
}
|
||||
|
||||
// for the case where x not zero or infinity
|
||||
static float fpowint_0(float x,int y) {
|
||||
int e;
|
||||
if(fisneg(x)) {
|
||||
if(fisoddint(y)) return fneg(fpowint_0(fneg(x),y));
|
||||
else return fpowint_0(fneg(x),y);
|
||||
}
|
||||
if(fispo2(x)) {
|
||||
e=fgetexp(x)-0x7f;
|
||||
if(y>=256) y= 255; // avoid overflow
|
||||
if(y<-256) y=-256;
|
||||
y*=e;
|
||||
return fldexp(1,y);
|
||||
}
|
||||
if(y==0) return 1;
|
||||
if(y>=-32&&y<=32) return fpowint_1(x,y);
|
||||
return fpow_1(x,y);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(powintf)(float x,int y) {
|
||||
GCC_Pragma("GCC diagnostic push")
|
||||
GCC_Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
|
||||
if(x==1.0f||y==0) return 1;
|
||||
if(x==0.0f) {
|
||||
if(y>0) {
|
||||
if(y&1) return x;
|
||||
else return 0;
|
||||
}
|
||||
if((y&1)) return fcopysign(FPINF,x);
|
||||
return FPINF;
|
||||
}
|
||||
GCC_Pragma("GCC diagnostic pop")
|
||||
check_nan_f1(x);
|
||||
if(fispinf(x)) {
|
||||
if(y<0) return 0;
|
||||
else return FPINF;
|
||||
}
|
||||
if(fisminf(x)) {
|
||||
if(y>0) {
|
||||
if((y&1)) return FMINF;
|
||||
else return FPINF;
|
||||
}
|
||||
if((y&1)) return MZERO;
|
||||
else return PZERO;
|
||||
}
|
||||
return fpowint_0(x,y);
|
||||
}
|
||||
|
||||
// for the case where y is guaranteed a finite integer, x not zero or infinity
|
||||
static float fpow_0(float x,float y) {
|
||||
int e,p;
|
||||
if(fisneg(x)) {
|
||||
if(fisoddint(y)) return fneg(fpow_0(fneg(x),y));
|
||||
else return fpow_0(fneg(x),y);
|
||||
}
|
||||
p=(int)y;
|
||||
if(fispo2(x)) {
|
||||
e=fgetexp(x)-0x7f;
|
||||
if(p>=256) p= 255; // avoid overflow
|
||||
if(p<-256) p=-256;
|
||||
p*=e;
|
||||
return fldexp(1,p);
|
||||
}
|
||||
if(p==0) return 1;
|
||||
if(p>=-32&&p<=32) return fpowint_1(x,p);
|
||||
return fpow_1(x,y);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(powf)(float x,float y) {
|
||||
GCC_Like_Pragma("GCC diagnostic push")
|
||||
GCC_Like_Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
|
||||
if(x==1.0f||fiszero(y)) return 1;
|
||||
check_nan_f2(x,y);
|
||||
if(x==-1.0f&&fisinf(y)) return 1;
|
||||
GCC_Like_Pragma("GCC diagnostic pop")
|
||||
if(fiszero(x)) {
|
||||
if(!fisneg(y)) {
|
||||
if(fisoddint(y)) return x;
|
||||
else return 0;
|
||||
}
|
||||
if(fisoddint(y)) return fcopysign(FPINF,x);
|
||||
return FPINF;
|
||||
}
|
||||
if(fispinf(x)) {
|
||||
if(fisneg(y)) return 0;
|
||||
else return FPINF;
|
||||
}
|
||||
if(fisminf(x)) {
|
||||
if(!fisneg(y)) {
|
||||
if(fisoddint(y)) return FMINF;
|
||||
else return FPINF;
|
||||
}
|
||||
if(fisoddint(y)) return MZERO;
|
||||
else return PZERO;
|
||||
}
|
||||
if(fispinf(y)) {
|
||||
if(fgetexp(x)<0x7f) return PZERO;
|
||||
else return FPINF;
|
||||
}
|
||||
if(fisminf(y)) {
|
||||
if(fgetexp(x)<0x7f) return FPINF;
|
||||
else return PZERO;
|
||||
}
|
||||
if(fisint(y)) return fpow_0(x,y);
|
||||
if(fisneg(x)) return FPINF;
|
||||
return fpow_1(x,y);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(hypotf)(float x,float y) {
|
||||
check_nan_f2(x,y);
|
||||
int ex,ey;
|
||||
ex=fgetexp(x); ey=fgetexp(y);
|
||||
if(ex>=0x7f+50||ey>=0x7f+50) { // overflow, or nearly so
|
||||
x=fldexp(x,-70),y=fldexp(y,-70);
|
||||
return fldexp(sqrtf(x*x+y*y), 70);
|
||||
}
|
||||
else if(ex<=0x7f-50&&ey<=0x7f-50) { // underflow, or nearly so
|
||||
x=fldexp(x, 70),y=fldexp(y, 70);
|
||||
return fldexp(sqrtf(x*x+y*y),-70);
|
||||
}
|
||||
return sqrtf(x*x+y*y);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(cbrtf)(float x) {
|
||||
check_nan_f1(x);
|
||||
int e;
|
||||
if(fisneg(x)) return fneg(cbrtf(fneg(x)));
|
||||
if(fiszero(x)) return fcopysign(PZERO,x);
|
||||
e=fgetexp(x)-0x7f;
|
||||
e=(e*0x5555+0x8000)>>16; // ~e/3, rounded
|
||||
x=fldexp(x,-e*3);
|
||||
x=expf(logf(x)*ONETHIRDf);
|
||||
return fldexp(x,e);
|
||||
}
|
||||
|
||||
// reduces mx*2^e modulo my, returning bottom bits of quotient at *pquo
|
||||
// 2^23<=|mx|,my<2^24, e>=0; 0<=result<my
|
||||
static i32 frem_0(i32 mx,i32 my,int e,int*pquo) {
|
||||
int quo=0,q,r=0,s;
|
||||
if(e>0) {
|
||||
r=0xffffffffU/(ui32)(my>>7); // reciprocal estimate Q16
|
||||
}
|
||||
while(e>0) {
|
||||
s=e; if(s>12) s=12; // gain up to 12 bits on each iteration
|
||||
q=(mx>>9)*r; // Q30
|
||||
q=((q>>(29-s))+1)>>1; // Q(s), rounded
|
||||
mx=(mx<<s)-my*q;
|
||||
quo=(quo<<s)+q;
|
||||
e-=s;
|
||||
}
|
||||
if(mx>=my) mx-=my,quo++; // when e==0 mx can be nearly as big as 2my
|
||||
if(mx>=my) mx-=my,quo++;
|
||||
if(mx<0) mx+=my,quo--;
|
||||
if(mx<0) mx+=my,quo--;
|
||||
if(pquo) *pquo=quo;
|
||||
return mx;
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(fmodf)(float x,float y) {
|
||||
check_nan_f2(x,y);
|
||||
ui32 ix=float2ui32(x),iy=float2ui32(y);
|
||||
int sx,ex,ey;
|
||||
i32 mx,my;
|
||||
FUNPACKS(ix,sx,ex,mx);
|
||||
FUNPACK(iy,ey,my);
|
||||
if(ex==0xff) {
|
||||
return fnan_or(FPINF);
|
||||
}
|
||||
if(ey==0) return FPINF;
|
||||
if(ex==0) {
|
||||
if(!fisneg(x)) return PZERO;
|
||||
return MZERO;
|
||||
}
|
||||
if(ex<ey) return x; // |x|<|y|, including case x=±0
|
||||
mx=frem_0(mx,my,ex-ey,0);
|
||||
if(sx) mx=-mx;
|
||||
return fix2float(mx,0x7f-ey+23);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(remquof)(float x,float y,int*quo) {
|
||||
check_nan_f2(x,y);
|
||||
ui32 ix=float2ui32(x),iy=float2ui32(y);
|
||||
int sx,sy,ex,ey,q;
|
||||
i32 mx,my;
|
||||
FUNPACKS(ix,sx,ex,mx);
|
||||
FUNPACKS(iy,sy,ey,my);
|
||||
if(quo) *quo=0;
|
||||
if(ex==0xff) return FPINF;
|
||||
if(ey==0) return FPINF;
|
||||
if(ex==0) return PZERO;
|
||||
if(ey==0xff) return x;
|
||||
if(ex<ey-1) return x; // |x|<|y|/2
|
||||
if(ex==ey-1) {
|
||||
if(mx<=my) return x; // |x|<=|y|/2, even quotient
|
||||
// here |y|/2<|x|<|y|
|
||||
if(!sx) { // x>|y|/2
|
||||
mx-=my+my;
|
||||
ey--;
|
||||
q=1;
|
||||
} else { // x<-|y|/2
|
||||
mx=my+my-mx;
|
||||
ey--;
|
||||
q=-1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if(sx) mx=-mx;
|
||||
mx=frem_0(mx,my,ex-ey,&q);
|
||||
if(mx+mx>my || (mx+mx==my&&(q&1)) ) { // |x|>|y|/2, or equality and an odd quotient?
|
||||
mx-=my;
|
||||
q++;
|
||||
}
|
||||
}
|
||||
if(sy) q=-q;
|
||||
if(quo) *quo=q;
|
||||
return fix2float(mx,0x7f-ey+23);
|
||||
}
|
||||
|
||||
float WRAPPER_FUNC(dremf)(float x,float y) { check_nan_f2(x,y); return remquof(x,y,0); }
|
||||
|
||||
float WRAPPER_FUNC(remainderf)(float x,float y) { check_nan_f2(x,y); return remquof(x,y,0); }
|
||||
|
||||
GCC_Pragma("GCC diagnostic pop") // conversion
|
82
lib/main/pico-sdk/rp2_common/pico_float/float_none.S
Normal file
82
lib/main/pico-sdk/rp2_common/pico_float/float_none.S
Normal file
|
@ -0,0 +1,82 @@
|
|||
/*
|
||||
* Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#include "pico/asm_helper.S"
|
||||
#include "pico/bootrom/sf_table.h"
|
||||
|
||||
pico_default_asm_setup
|
||||
|
||||
wrapper_func __aeabi_fadd
|
||||
wrapper_func __aeabi_fdiv
|
||||
wrapper_func __aeabi_fmul
|
||||
wrapper_func __aeabi_frsub
|
||||
wrapper_func __aeabi_fsub
|
||||
wrapper_func __aeabi_cfcmpeq
|
||||
wrapper_func __aeabi_cfrcmple
|
||||
wrapper_func __aeabi_cfcmple
|
||||
wrapper_func __aeabi_fcmpeq
|
||||
wrapper_func __aeabi_fcmplt
|
||||
wrapper_func __aeabi_fcmple
|
||||
wrapper_func __aeabi_fcmpge
|
||||
wrapper_func __aeabi_fcmpgt
|
||||
wrapper_func __aeabi_fcmpun
|
||||
wrapper_func __aeabi_i2f
|
||||
wrapper_func __aeabi_l2f
|
||||
wrapper_func __aeabi_ui2f
|
||||
wrapper_func __aeabi_ul2f
|
||||
wrapper_func __aeabi_f2iz
|
||||
wrapper_func __aeabi_f2lz
|
||||
wrapper_func __aeabi_f2uiz
|
||||
wrapper_func __aeabi_f2ulz
|
||||
wrapper_func __aeabi_f2d
|
||||
wrapper_func sqrtf
|
||||
wrapper_func cosf
|
||||
wrapper_func sinf
|
||||
wrapper_func tanf
|
||||
wrapper_func atan2f
|
||||
wrapper_func expf
|
||||
wrapper_func logf
|
||||
wrapper_func ldexpf
|
||||
wrapper_func copysignf
|
||||
wrapper_func truncf
|
||||
wrapper_func floorf
|
||||
wrapper_func ceilf
|
||||
wrapper_func roundf
|
||||
wrapper_func sincosf
|
||||
wrapper_func asinf
|
||||
wrapper_func acosf
|
||||
wrapper_func atanf
|
||||
wrapper_func sinhf
|
||||
wrapper_func coshf
|
||||
wrapper_func tanhf
|
||||
wrapper_func asinhf
|
||||
wrapper_func acoshf
|
||||
wrapper_func atanhf
|
||||
wrapper_func exp2f
|
||||
wrapper_func log2f
|
||||
wrapper_func exp10f
|
||||
wrapper_func log10f
|
||||
wrapper_func powf
|
||||
wrapper_func powintf
|
||||
wrapper_func hypotf
|
||||
wrapper_func cbrtf
|
||||
wrapper_func fmodf
|
||||
wrapper_func dremf
|
||||
wrapper_func remainderf
|
||||
wrapper_func remquof
|
||||
wrapper_func expm1f
|
||||
wrapper_func log1pf
|
||||
wrapper_func fmaf
|
||||
#ifdef __riscv
|
||||
la a0, str
|
||||
j panic
|
||||
#else
|
||||
push {lr} // keep stack trace sane
|
||||
ldr r0, =str
|
||||
bl panic
|
||||
#endif
|
||||
str:
|
||||
.asciz "float support is disabled"
|
801
lib/main/pico-sdk/rp2_common/pico_float/float_sci_m33.S
Normal file
801
lib/main/pico-sdk/rp2_common/pico_float/float_sci_m33.S
Normal file
|
@ -0,0 +1,801 @@
|
|||
/*
|
||||
* Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#if !PICO_RP2040
|
||||
#include "pico/asm_helper.S"
|
||||
|
||||
pico_default_asm_setup
|
||||
|
||||
.macro float_section name
|
||||
#if PICO_FLOAT_IN_RAM
|
||||
.section RAM_SECTION_NAME(\name), "ax"
|
||||
#else
|
||||
.section SECTION_NAME(\name), "ax"
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro float_wrapper_section func
|
||||
float_section WRAPPER_FUNC_NAME(\func)
|
||||
.endm
|
||||
|
||||
.extern rtwopi
|
||||
.extern logtab0
|
||||
.extern exptab0
|
||||
.extern exptab1
|
||||
.extern exptab2
|
||||
.extern trigtab
|
||||
|
||||
@ load a 32-bit constant n into register rx
|
||||
.macro movlong rx,n
|
||||
movw \rx,#(\n)&0xffff
|
||||
movt \rx,#((\n)>>16)&0xffff
|
||||
.endm
|
||||
|
||||
float_section frrcore
|
||||
|
||||
@ input:
|
||||
@ r0 mantissa m Q23
|
||||
@ r1 exponent e>=-32, typically offset by +9
|
||||
@ output:
|
||||
@ r0..r1 preserved
|
||||
@ r6 range reduced result
|
||||
@ r2,r3,r4,r5 trashed
|
||||
frr_core:
|
||||
ldr r2,=rtwopi
|
||||
asrs r3,r1,#5 @ k=e/32, k<=5 for e offsets up to 9+32
|
||||
add r2,r2,r3,lsl#2 @ p
|
||||
and r3,r1,#31 @ s=e%32
|
||||
mov r4,#1
|
||||
lsls r4,r4,r3 @ 1<<s
|
||||
umull r3,r4,r4,r0
|
||||
@ r2 p
|
||||
@ r3:r4 u0:u1 = m<<(e%32); u1 is never more than 2<<23
|
||||
ldr r5,[r2,#12] @ a0=p[3]
|
||||
umull r5,r6,r5,r4 @ r0=a0*u1 hi, discard lo
|
||||
@ r6 r0
|
||||
ldr r5,[r2,#8] @ a1=p[2]
|
||||
mla r6,r5,r4,r6 @ a1*u1 lo, discard hi
|
||||
umlal r5,r6,r5,r3 @ a1*u0 hi, discard lo
|
||||
@ r6 r0
|
||||
ldr r5,[r2,#4] @ a2=p[1]
|
||||
mla r6,r5,r3,r6 @ r0+=a2*u0
|
||||
bx r14
|
||||
|
||||
|
||||
float_wrapper_section expf
|
||||
|
||||
2:
|
||||
@ we could use fadd macro to calculate x+1 here
|
||||
@ need to add quadratic term
|
||||
rsb r3,#1 @ 1-(e+9)
|
||||
lsrs r0,r3 @ Q31
|
||||
smmulr r1,r0,r0 @ Q30
|
||||
cmp r12,#0
|
||||
bne 1f
|
||||
add r0,r0,r1
|
||||
lsrs r0,#8
|
||||
adc r0,r0,#0x3f800000 @ result is just over 1
|
||||
bx r14
|
||||
1:
|
||||
sub r0,r1,r0
|
||||
asrs r0,#7
|
||||
adc r0,r0,#0x3f800000 @ result is just under 1
|
||||
bx r14
|
||||
|
||||
7:
|
||||
movs r1,#1
|
||||
bfi r0,r1,#23,#9 @ implied 1, clear sign bit
|
||||
adds r3,#9
|
||||
bmi 2b @ <~2^-9? return 1+x+x²/2
|
||||
movlong r1,0xb17217f8 @ ln2 Q32
|
||||
lsls r0,r3 @ Q32
|
||||
cmp r12,#0
|
||||
bne 8f
|
||||
cmp r0,r1
|
||||
blo 6f
|
||||
sub r0,r0,r1
|
||||
mov r12,#1
|
||||
b 6f
|
||||
|
||||
8:
|
||||
mvn r12,#0
|
||||
subs r0,r1,r0
|
||||
bhs 6f
|
||||
add r0,r0,r1
|
||||
mvn r12,#1
|
||||
b 6f
|
||||
|
||||
wrapper_func expf
|
||||
ands r12,r0,#1<<31 @ save sign
|
||||
ubfx r3,r0,#23,#8 @ get exponent
|
||||
subs r3,r3,#0x7f
|
||||
bmi 7b @ e<0?
|
||||
cmp r3,#7
|
||||
bge 20f @ overflow, Inf or NaN?
|
||||
movs r1,#1
|
||||
bfi r0,r1,#23,#9 @ implied 1, clear sign bit
|
||||
lsls r0,r3 @ x Q23
|
||||
eors r0,r0,r12,asr#31
|
||||
adds r0,r0,r12,lsr#31 @ negate if sign bit set
|
||||
mov r1,#0xB8AA @ 1/ln2 Q15, rounded down
|
||||
add r1,r1,r12,lsr#31 @ rounded up if we have a negative argument
|
||||
smull r1,r2,r1,r0 @ Q38
|
||||
movlong r1,0xb17217f8 @ ln2 Q32
|
||||
asr r12,r2,#6 @ Q0 exponent candidate
|
||||
lsls r0,#9 @ Q32
|
||||
mls r0,r12,r1,r0 @ Q32-Q0*Q32=Q32
|
||||
|
||||
6:
|
||||
lsrs r1,r0,#28
|
||||
ldr r2,=exptab0
|
||||
add r2,r2,r1,lsl#3
|
||||
ldmia r2,{r2-r3}
|
||||
and r2,r2,#63
|
||||
orr r2,#64 @ y=(t&0x3f)+0x40; Q6
|
||||
sub r0,r3
|
||||
|
||||
lsrs r1,r0,#24
|
||||
ldr r3,=exptab1+4
|
||||
ldr r3,[r3,r1,lsl#3]
|
||||
add r1,#256
|
||||
muls r2,r2,r1 @ y Q14
|
||||
sub r0,r3
|
||||
|
||||
lsrs r1,r0,#21
|
||||
ldr r3,=exptab2+4
|
||||
ldr r3,[r3,r1,lsl#3]
|
||||
add r1,r1,r1
|
||||
add r1,#4096
|
||||
mla r2,r2,r1,r2 @ y Q26
|
||||
sub r0,r3 @ ε Q32
|
||||
smmlar r2,r2,r0,r2 @ y(1+ε) Q26
|
||||
|
||||
cmp r2,#1<<27
|
||||
bhs 1f
|
||||
2:
|
||||
cmp r12,#0x7f
|
||||
bgt 23f
|
||||
cmn r12,#0x7e
|
||||
blt 23f
|
||||
lsls r0,r12,#23
|
||||
adds r0,r0,r2,lsr#3
|
||||
adc r0,r0,#0x3f000000
|
||||
bx r14
|
||||
|
||||
@ mantissa calculation has overflowed
|
||||
1:
|
||||
lsrs r2,#1
|
||||
adds r12,#1
|
||||
b 2b
|
||||
|
||||
20:
|
||||
@ process Inf/NaN for fexp
|
||||
cmp r3,#0x80
|
||||
bne 22f
|
||||
lsls r2,r0,#9
|
||||
ite eq
|
||||
biceq r0,r0,r0,asr#31 @ +Inf -> + Inf; -Inf -> +0
|
||||
orrne r0,r0,#0x00400000
|
||||
bx r14
|
||||
|
||||
23:
|
||||
ands r12,r12,#1<<31
|
||||
22:
|
||||
eors r0,r12,#1<<31
|
||||
subs r0,r0,r0,lsr#8 @ overflow: convert to +0 or +Inf
|
||||
bx r14
|
||||
|
||||
|
||||
|
||||
|
||||
float_wrapper_section logf
|
||||
|
||||
1:
|
||||
movlong r0,0xff800000 @ return -Inf
|
||||
bx r14
|
||||
|
||||
4:
|
||||
lsls r1,r0,#9
|
||||
it ne
|
||||
orrne r0,r0,#0x00400000
|
||||
bx r14
|
||||
|
||||
@ here result may be close to zero
|
||||
5:
|
||||
sub r1,r0,#0x3f800000
|
||||
bmi 7f
|
||||
cmp r1,#0x8000
|
||||
bge 6f @ |ε|>~2^-8?
|
||||
@ here ε positive
|
||||
clz r12,r1
|
||||
sub r3,r12,#1
|
||||
b 8f
|
||||
|
||||
7:
|
||||
rsbs r2,r1,#0
|
||||
cmp r2,#0x8000
|
||||
bge 6f @ |ε|>~2^-8?
|
||||
@ here ε negative
|
||||
@ r1: x Q24
|
||||
clz r12,r2
|
||||
sub r3,r12,#2
|
||||
8:
|
||||
sub r12,#10
|
||||
lsls r0,r1,r3 @ ε Q24+r3
|
||||
beq 10f @ ε=0?
|
||||
smmulr r1,r0,r0
|
||||
asrs r1,r1,r12
|
||||
sub r0,r0,r1,asr#1 @ - ε²/2
|
||||
smmulr r1,r1,r0
|
||||
asrs r2,r1,r12
|
||||
movs r3,#0x55
|
||||
mul r2,r2,r3
|
||||
adds r0,r0,r2,asr#8 @ + ~ ε³/3
|
||||
rsb r1,r12,#117
|
||||
b 9f
|
||||
|
||||
wrapper_func logf
|
||||
lsls r3,r0,#1
|
||||
bcs 1b @ x<0?
|
||||
lsrs r3,#24
|
||||
beq 1b @ x==0/denormal?
|
||||
sub r12,r3,#0x7e
|
||||
cmp r12,#0x81 @ +Inf/NaN?
|
||||
beq 4b
|
||||
push {r14}
|
||||
cmp r12,#1 @ result will be near zero?
|
||||
bls 5b
|
||||
6:
|
||||
movs r2,#1
|
||||
bfi r0,r2,#23,#9 @ set implied 1, clear exponent Q52
|
||||
|
||||
lsls r0,#8
|
||||
|
||||
lsrs r1,r0,#27
|
||||
ldr r2,=logtab0-16*8
|
||||
add r2,r2,r1,lsl#3
|
||||
ldmia r2,{r2-r3}
|
||||
lsls r2,#26
|
||||
umlal r2,r0,r2,r0
|
||||
|
||||
mvn r1,r0,asr#24
|
||||
ldr r2,=exptab1+4
|
||||
ldr r2,[r2,r1,lsl#3]
|
||||
add r3,r3,r2
|
||||
lsls r1,#24
|
||||
umlal r1,r0,r1,r0
|
||||
|
||||
ldr r2,=exptab2+4
|
||||
mvn r1,r0,asr#21
|
||||
ldr r2,[r2,r1,lsl#3]
|
||||
lsls r1,#21
|
||||
orr r1,#0x00100000
|
||||
umlal r1,r0,r1,r0
|
||||
add r3,r3,r2
|
||||
|
||||
@ r0: ε Q32
|
||||
@ r3: log y
|
||||
smmulr r1,r0,r0 @ ε² Q32
|
||||
sub r3,r0
|
||||
add r3,r3,r1,lsr#1 @ log u - ε + ε²/2 Q32
|
||||
movlong r2,0xb17218 @ ln2 Q24, but accurate to 2^-29 or so
|
||||
cmp r12,#1 @ result will be near zero?
|
||||
bls 2f
|
||||
mul r2,r2,r12
|
||||
mov r1,#125
|
||||
add r3,#255 @ reduce systematic error
|
||||
subs r0,r2,r3,lsr#8
|
||||
9:
|
||||
bmi 1f
|
||||
bl fpack_q
|
||||
10:
|
||||
pop {r15}
|
||||
|
||||
2:
|
||||
mov r1,#117
|
||||
bne 3f @ r12=0?
|
||||
@ here r12=1
|
||||
movlong r2,0xb17217f8
|
||||
sub r0,r2,r3
|
||||
bl fpack_q
|
||||
pop {r15}
|
||||
|
||||
3:
|
||||
mov r0,r3
|
||||
bl fpack_q
|
||||
orrs r0,r0,#1<<31
|
||||
pop {r15}
|
||||
|
||||
1:
|
||||
rsbs r0,#0
|
||||
bl fpack_q
|
||||
orrs r0,r0,#1<<31
|
||||
pop {r15}
|
||||
|
||||
|
||||
float_wrapper_section atan2f
|
||||
|
||||
@ fatan small reduced angle case
|
||||
@ r0 y/x in IEEE format, 0..2^-8
|
||||
@ r2 e+6 <0
|
||||
@ r3 #1
|
||||
@ r6 flags
|
||||
20:
|
||||
rsbs r4,r2,#0 @ -e-6 = shift down of mantissa required to get to Q29 >0
|
||||
cmp r4,#7 @ -e-6 ≥ 7 [ e ≤ -13 ]
|
||||
bge 1f @ very small reduced angle? atn θ ~ θ
|
||||
@ do a power series
|
||||
bfi r0,r3,#23,#9 @ fix up mantissa Q23-e
|
||||
lsls r0,#7 @ Q30-e
|
||||
smmulr r1,r0,r0 @ θ² Q60-2e-Q32=Q28-2e
|
||||
lsrs r1,r4 @ Q28-2e + (e+6) = Q34-e
|
||||
smmulr r1,r1,r0 @ θ³ Q34-e+Q30-e-Q32=Q32-2e
|
||||
mov r3,#0x55555555 @ 1/3 Q32
|
||||
lsrs r1,r4 @ Q32-2e + e+6 = Q38-e
|
||||
smmulr r1,r1,r3 @ θ³/3 Q38-e
|
||||
sub r0,r0,r1,lsr#8 @ Q30-e
|
||||
cmp r6,#4 @ at least one quadrant to add?
|
||||
bhs 2f
|
||||
add r1,r2,#113
|
||||
bl fpack_q
|
||||
eors r0,r0,r6,lsl#31 @ fix sign
|
||||
pop {r4-r6,r15}
|
||||
|
||||
@ here reduced angle is < 2^-12
|
||||
1:
|
||||
cmp r6,#4
|
||||
bhs 3f @ at least one quadrant to add?
|
||||
eors r0,r0,r6,lsl#31 @ no: return y/x with the correct sign
|
||||
pop {r4-r6,r15}
|
||||
|
||||
3:
|
||||
bfi r0,r3,#23,#9 @ fix up mantissa
|
||||
lsrs r0,r4 @ Q29
|
||||
lsls r0,#1 @ Q30
|
||||
b 40f
|
||||
|
||||
2:
|
||||
lsrs r0,r4 @ Q30-e + e+6 = Q36
|
||||
lsrs r0,#6 @ Q30
|
||||
b 40f
|
||||
|
||||
@ case where reduced (x',y') has x' infinite
|
||||
71:
|
||||
sbfx r4,r0,#23,#8
|
||||
movs r0,#0
|
||||
cmn r4,#1 @ y' also infinite?
|
||||
bne 80f
|
||||
mov r0,#0x3f800000 @ both infinite: pretend ∞/∞=1
|
||||
b 80f
|
||||
|
||||
@ case where reduced (x',y') has y' zero
|
||||
70:
|
||||
ubfx r4,r1,#23,#8
|
||||
movs r0,#0
|
||||
cbnz r4,80f @ x' also zero?
|
||||
tst r6,#4
|
||||
beq 80f @ already in quadrants 0/±2? then 0/0 result will be correct
|
||||
tst r6,#2
|
||||
ite eq
|
||||
addeq r6,#6
|
||||
subne r6,#6 @ fix up when in quadrants ±0
|
||||
b 80f
|
||||
|
||||
90:
|
||||
movs r0,r1
|
||||
91:
|
||||
orrs r0,r0,#0x00400000
|
||||
pop {r4-r6,r15}
|
||||
|
||||
wrapper_func atan2f
|
||||
push {r4-r6,r14}
|
||||
lsls r4,r1,#1
|
||||
cmp r4,#0xff000000
|
||||
bhi 90b @ y NaN?
|
||||
lsls r4,r0,#1
|
||||
cmp r4,#0xff000000
|
||||
bhi 91b @ x NaN?
|
||||
lsrs r6,r0,#31 @ b31..2: quadrant count; b1: sign to apply before addition; b0: sign to apply after addition
|
||||
bic r0,#1<<31
|
||||
@ y now positive
|
||||
movs r1,r1
|
||||
bpl 1f
|
||||
|
||||
@ here y positive, x negative
|
||||
adds r6,#10
|
||||
bic r1,r1,#1<<31
|
||||
cmp r1,r0
|
||||
bhi 4f @ |x| > y: 3rd octant
|
||||
@ |x| < y: 2nd octant
|
||||
subs r6,#6
|
||||
b 3f
|
||||
|
||||
1:
|
||||
@ here x and y positive
|
||||
cmp r1,r0
|
||||
bhs 4f
|
||||
@ x < y: 1st octant
|
||||
adds r6,#6
|
||||
3:
|
||||
movs r2,r1 @ exchange x and y
|
||||
movs r1,r0
|
||||
movs r0,r2
|
||||
4:
|
||||
@ here
|
||||
@ r0 y'
|
||||
@ r1 x'
|
||||
@ where both x' and y' are positive, y'/x' < 1+δ, and the final result is
|
||||
@ ± (Q.π/2 ± atn y/x) where 0≤Q≤2 is a quadrant count in r6b3..2, the inner negation
|
||||
@ is given by r6b1 and the outer negation by r6b0. x' can be infinite, or both x' and
|
||||
@ y' can be infinite, but not y' alone.
|
||||
|
||||
sbfx r2,r1,#23,#8
|
||||
cmn r2,#1
|
||||
beq 71b @ x' infinite?
|
||||
ubfx r2,r0,#23,#8
|
||||
cmp r2,#0
|
||||
beq 70b @ y' zero?
|
||||
bl __aeabi_fdiv
|
||||
80:
|
||||
@ r0 y/x in IEEE format, 0..1
|
||||
lsr r2,r0,#23 @ exponent
|
||||
movs r3,#1
|
||||
subs r2,#0x7f-6
|
||||
bmi 20b
|
||||
bfi r0,r3,#23,#9 @ fix up mantissa
|
||||
lsls r0,r2
|
||||
lsls r0,#2
|
||||
50:
|
||||
|
||||
@ from here atan2(y,1) where 1 implied
|
||||
@ r0 y Q31 0≤y<1+δ
|
||||
|
||||
lsrs r2,r0,#16
|
||||
mul r3,r2,r2 @ y^2
|
||||
movw r4,#0x895c
|
||||
muls r2,r2,r4 @ y*0x895c
|
||||
movw r5,#0x1227
|
||||
lsrs r3,#14
|
||||
mls r2,r3,r5,r2
|
||||
subs r2,#0x330000 @ Chebyshev approximation to atn(y)
|
||||
lsrs r2,#25
|
||||
ldr r3,=trigtab+4
|
||||
add r3,r3,r2,lsl#4
|
||||
ldmia r3,{r3-r5}
|
||||
@ r0 y Q30
|
||||
@ r3 phi0 Q32
|
||||
@ r4 sphi0 Q31
|
||||
@ r5 cphi0 Q31
|
||||
@ r6 flags
|
||||
@ x0= cphi0 + sphi0*y
|
||||
@ y0=-sphi0 + cphi0*y
|
||||
umull r12,r1,r0,r4 @ sphi0*y
|
||||
umull r12,r2,r0,r5 @ cphi0*y
|
||||
add r1,r1,r5,lsr#1 @ x0 Q30
|
||||
sub r2,r2,r4,lsr#1 @ y0 Q30
|
||||
11:
|
||||
@ r1 x0 Q30
|
||||
@ r2 y0 Q30
|
||||
@ r3 phi0 Q32
|
||||
@ r6 flags
|
||||
mov r0,#0xffffffff
|
||||
lsrs r4,r1,#15
|
||||
udiv r12,r0,r4 @ rx0 Q17
|
||||
lsls r4,r2,#6 @ y0 Q36
|
||||
smmul r4,r4,r12 @ t2=y0*rx0 Q21
|
||||
lsls r5,r4,#11 @ t2 Q32
|
||||
smmlar r0,r5,r2,r1
|
||||
smmlsr r1,r5,r1,r2
|
||||
@ r0 x1 Q30
|
||||
@ r1 y1 Q30
|
||||
@ r3 phi0
|
||||
@ r4 r2 Q21
|
||||
@ r12 rx0 Q17
|
||||
mul r5,r4,r4 @ t2_2 Q42
|
||||
smull r2,r5,r4,r5 @ t2_3 Q63
|
||||
add r3,r3,r4,lsl#11 @ Q32
|
||||
lsls r5,#1 @ Q32
|
||||
mov r2,#0x55555555 @ 1/3
|
||||
smmlsr r3,r2,r5,r3 @ t2_3/3
|
||||
|
||||
@ r0 x1 Q30
|
||||
@ r1 y1 Q30
|
||||
@ r3 phi0+phi1 Q32
|
||||
@ r12 rx0 Q17
|
||||
mul r0,r1,r12 @ y1*rx0 Q30+Q17=Q47
|
||||
add r3,r3,r0,asr#15
|
||||
@ r3 phi0+phi1+phi2, result over reduced range Q32
|
||||
@ r6 flags
|
||||
|
||||
lsrs r0,r3,#2 @ Q30
|
||||
@ adc r0,#0 @ rounding
|
||||
40:
|
||||
lsl r5,r6,#30 @ b1 -> b31
|
||||
eor r0,r0,r5,asr#31 @ negate if required
|
||||
movlong r1,0x6487ED51 @ π/2 Q30
|
||||
|
||||
lsr r5,r6,#2 @ quadrants to add
|
||||
mla r0,r5,r1,r0
|
||||
mov r1,#0x80-9 @ for packing Q30
|
||||
60:
|
||||
bl fpack_q
|
||||
eors r0,r0,r6,lsl#31
|
||||
pop {r4-r6,r15}
|
||||
|
||||
@=======================================
|
||||
|
||||
float_wrapper_section fpack
|
||||
|
||||
@ fnegpack: negate and pack
|
||||
@ fpack_q31:
|
||||
@ input
|
||||
@ r0 Q31 result, must not be zero
|
||||
@ r1 exponent offset [fpack_q only]
|
||||
@ output
|
||||
@ r0 IEEE single
|
||||
@ trashes (r1,)r2
|
||||
fnegpack_q31:
|
||||
rsbs r0,r0,#0
|
||||
fpack_q31:
|
||||
mov r1,#0x7f-9 @ exponent
|
||||
fpack_q:
|
||||
clz r2,r0
|
||||
rsbs r2,#8
|
||||
bmi 1f
|
||||
lsrs r0,r0,r2 @ save rounding bit in carry
|
||||
add r2,r2,r1
|
||||
adc r0,r0,r2,lsl#23 @ insert exponent
|
||||
bx r14
|
||||
|
||||
1:
|
||||
rsb r2,#0
|
||||
lsls r0,r0,r2
|
||||
sub r2,r1,r2
|
||||
add r0,r0,r2,lsl#23
|
||||
bx r14
|
||||
|
||||
float_wrapper_section tanf
|
||||
|
||||
wrapper_func tanf
|
||||
push {r14}
|
||||
bl sincosf_raw
|
||||
bl __aeabi_fdiv
|
||||
pop {r15}
|
||||
|
||||
float_wrapper_section fsin_fcos
|
||||
|
||||
10: @ process Inf/NaN for sinf and fcos
|
||||
lsls r1,r0,#9
|
||||
it eq
|
||||
orreq r0,#0x80000000
|
||||
orr r0,#0x00400000 @ turn Inf into NaN
|
||||
movs r1,r0 @ copy result to cosine output
|
||||
bx r14
|
||||
|
||||
@ case where angle is very small (<2^-32) before reduction
|
||||
32:
|
||||
adds r1,r1,#0x7f
|
||||
it eq
|
||||
moveq r0,#0 @ flush denormal to zero
|
||||
|
||||
movs r1,0x3f800000 @ return x for sine, 1 for cosine
|
||||
tst r12,#2
|
||||
it ne
|
||||
movne r0,r1 @ calculating cosine? move to result registers
|
||||
bx r14
|
||||
|
||||
40:
|
||||
@ case where range-reduced angle is very small
|
||||
@ here
|
||||
@ r0 mantissa
|
||||
@ r1 exponent+9
|
||||
@ r6 abs range-reduced angle / 2π < 2^-7 Q32
|
||||
@ r12b31: dsincos flag
|
||||
@ r12b30: original argument sign
|
||||
@ r12b2..1: quadrant count
|
||||
@ r12b0: sign of reduced angle
|
||||
push {r12}
|
||||
movs r12,#0
|
||||
2:
|
||||
add r12,#2
|
||||
add r1,#2
|
||||
bl frr_core @ repeat range reduction with extra factor of 2^2 (, 2^4, 2^6, 2^8,...)
|
||||
eors r4,r6,r6,asr#31 @ we want to keep the sign in r6b31 for later
|
||||
cmp r4,#1<<26 @ ≥ 2^-6?
|
||||
bhs 1f @ loop until the result is big enough
|
||||
cmp r12,#32 @ safety net
|
||||
bne 2b
|
||||
1:
|
||||
@ here r4 is the abs range-reduced angle Q32+r12, 2^-6..2^-4 in Q32 terms
|
||||
|
||||
@ 2π=6.487ED511 (0B4611A6...)
|
||||
movlong r5,0x487ED511 @ 2π Q64 high fractional word
|
||||
umull r2,r0,r4,r5
|
||||
movs r5,#6 @ 2π integer part
|
||||
mla r0,r4,r5,r0
|
||||
|
||||
@ here
|
||||
@ r0 θ, abs range reduced angle θ 0..π/4 Q32+r12, 2π * 2^-6..2^-4 in Q32 terms (so top bit is clear)
|
||||
@ r6b31: sign of reduced angle
|
||||
@ r12: excess exponent ≥2, multiple of 2
|
||||
@ r0 / 2^r12 < ~ 2π * 2^-7 so for sin we need to go to term in x^3
|
||||
|
||||
smmulr r1,r0,r0 @ θ² Q32+2*r12
|
||||
lsrs r1,r1,r12 @ θ² Q32+r12
|
||||
lsrs r1,r1,r12 @ θ² Q32
|
||||
movs r4,#0x55555555 @ 1/3 Q32
|
||||
smmulr r4,r1,r4 @ θ²/3 Q32
|
||||
smmulr r2,r4,r1 @ θ⁴/3 Q32
|
||||
rsb r3,r1,r2,lsr#2 @ -θ²+θ⁴/12) Q32
|
||||
asrs r3,#9 @ -θ²/2+θ⁴/24 Q24
|
||||
adc r3,r3,#0x3f800000 @ IEEE single with rounding
|
||||
|
||||
smmulr r2,r4,r0 @ θ³/3 Q32+r12
|
||||
sub r0,r0,r2,lsr#1 @ θ-θ³/6 Q32+r12
|
||||
rsb r1,r12,#117
|
||||
bl fpack_q
|
||||
@ here
|
||||
@ r0 packed sin
|
||||
@ r3 packed cos
|
||||
@ r6b31: sign of reduced angle
|
||||
pop {r12} @ get flags
|
||||
lsrs r6,#31
|
||||
bfi r12,r6,#0,#1
|
||||
|
||||
asrs r2,r12,#1
|
||||
bmi 23f @ doing sincos?
|
||||
asrs r2,#1
|
||||
bcc 21f @ need sine?
|
||||
@ need cosine:
|
||||
ands r12,#4
|
||||
orrs r0,r3,r12,lsl#29 @ insert sign
|
||||
pop {r4-r6,r15}
|
||||
|
||||
21:
|
||||
eors r12,r12,r12,lsr#2
|
||||
orrs r0,r0,r12,lsl#31 @ insert sign
|
||||
pop {r4-r6,r15}
|
||||
|
||||
23:
|
||||
ands r2,r12,#4
|
||||
orrs r3,r3,r2,lsl#29 @ insert sign
|
||||
push {r3}
|
||||
@ drop into sincosf code below...
|
||||
b 20f
|
||||
|
||||
wrapper_func sincosf
|
||||
push {r1, r2, lr}
|
||||
bl sincosf_raw
|
||||
pop {r2, r3}
|
||||
str r0, [r2]
|
||||
str r1, [r3]
|
||||
pop {pc}
|
||||
|
||||
sincosf_raw:
|
||||
ands r12,r0,#1<<31
|
||||
lsrs r12,#1 @ save argument sign in r12b30
|
||||
orrs r12,r12,#1<<31 @ flag we want both results in r12b31
|
||||
b 1f
|
||||
|
||||
wrapper_func sinf
|
||||
lsrs r12,r0,#29 @ negative argument -> +2 quadrants
|
||||
ands r12,#4
|
||||
b 1f
|
||||
|
||||
wrapper_func cosf
|
||||
movs r12,#2 @ cos -> +1 quadrant
|
||||
1:
|
||||
ubfx r1,r0,#23,#8 @ get exponent
|
||||
sub r1,r1,#0x7f
|
||||
cmp r1,#0x80
|
||||
beq 10b @ Inf or NaN?
|
||||
cmn r1,#32
|
||||
blt 32b @ very small argument?
|
||||
movs r3,#1
|
||||
bfi r0,r3,#23,#9 @ fix implied 1 in mantissa
|
||||
push {r4-r6,r14}
|
||||
add r1,#9 @ e+9
|
||||
bl frr_core
|
||||
@ r6 θ/2π 0..1 Q64
|
||||
lsrs r4,r6,#30 @ quadrant count
|
||||
adc r4,r4,#0 @ rounded
|
||||
sub r6,r6,r4,lsl#30 @ now -0.125≤r6<+0.125 Q32
|
||||
add r12,r12,r4,lsl#1
|
||||
orr r12,r12,r6,lsr#31 @ sign into r12b0
|
||||
@ r12b2..1: quadrant count
|
||||
@ r12b0: sign of reduced angle
|
||||
eors r6,r6,r6,asr#31 @ absolute value of reduced angle 0≤r7<0.125 Q32
|
||||
cmp r6,#1<<25 @ θ / 2π < 2^-7?
|
||||
blo 40b
|
||||
|
||||
@ 2π=6.487ED511 (0B4611A6...)
|
||||
movlong r5,0x487ED511 @ 2π Q64 high fractional word
|
||||
umull r2,r0,r6,r5
|
||||
movs r5,#6 @ 2π integer part
|
||||
mla r0,r6,r5,r0
|
||||
@ r0 range reduced angle θ 0..π/4 Q32
|
||||
lsrs r2,r0,#27
|
||||
ldr r3,=trigtab+4
|
||||
add r3,r3,r2,lsl#4
|
||||
ldmia r3,{r1-r3}
|
||||
subs r0,r1
|
||||
@ r0: ε Q32 |ε| < 1.17*2^-6
|
||||
@ r2: sin φ Q31
|
||||
@ r3: cos φ Q31
|
||||
asrs r1,r12,#1
|
||||
bmi 5f @ doing sincosf?
|
||||
asrs r1,#1
|
||||
bcs 3f @ need cosine?
|
||||
@ here need sine
|
||||
smmulr r1,r0,r0 @ ε² Q32
|
||||
mov r4,#0x55555555 @ 1/3 Q32
|
||||
asrs r1,#1
|
||||
smmlsr r2,r1,r2,r2 @ sin φ - ε²/2*sin φ ~ sin φ cos ε
|
||||
smmulr r1,r1,r0 @ ε³/2
|
||||
smmlsr r1,r1,r4,r0 @ ε - ε³/6
|
||||
|
||||
smmlar r0,r1,r3,r2 @ sin φ cos ε + cos φ (ε - ε³/6) ~ sin (φ+ε)
|
||||
@ the sign of the result is r12b0^r12b2
|
||||
bl fpack_q31
|
||||
eors r12,r12,r12,lsr#2
|
||||
orrs r0,r0,r12,lsl#31 @ insert sign
|
||||
pop {r4-r6,r15}
|
||||
|
||||
3:
|
||||
@ here need cosine
|
||||
smmulr r1,r0,r0 @ ε² Q32
|
||||
mov r4,#0x55555555 @ 1/3 Q32
|
||||
asrs r1,#1
|
||||
smmlsr r3,r1,r3,r3 @ cos φ - ε²/2*cos φ ~ cos φ cos ε
|
||||
smmulr r1,r1,r0 @ ε³/2
|
||||
smmlsr r1,r1,r4,r0 @ ε - ε³/6
|
||||
|
||||
smmlsr r0,r1,r2,r3 @ cos φ cos ε - sin φ (ε - ε³/6) ~ cos (φ+ε)
|
||||
@ the sign of the result is r12b2
|
||||
bl fpack_q31
|
||||
ands r12,#4
|
||||
orrs r0,r0,r12,lsl#29 @ insert sign
|
||||
pop {r4-r6,r15}
|
||||
|
||||
5:
|
||||
@ here doing sincosf
|
||||
smmulr r1,r0,r0 @ ε² Q32
|
||||
mov r6,#0x55555555 @ 1/3 Q32
|
||||
asrs r1,#1
|
||||
smmlsr r4,r1,r2,r2 @ sin φ - ε²/2*sin φ ~ sin φ cos ε
|
||||
smmlsr r5,r1,r3,r3 @ cos φ - ε²/2*cos φ ~ cos φ cos ε
|
||||
smmulr r1,r1,r0 @ ε³/2
|
||||
smmlsr r6,r1,r6,r0 @ ε - ε³/6
|
||||
@ here
|
||||
@ r2 sin φ
|
||||
@ r3 cos φ
|
||||
@ r4 sin φ cos ε
|
||||
@ r5 cos φ cos ε
|
||||
@ r6 ε - ε³/6 ~ sin ε
|
||||
smmlsr r0,r6,r2,r5 @ cos φ cos ε - sin φ (ε - ε³/6) ~ cos (φ+ε)
|
||||
bl fpack_q31
|
||||
ands r5,r12,#4
|
||||
eors r0,r0,r5,lsl#29 @ negate cosine in quadrants 2 and 3
|
||||
push {r0}
|
||||
smmlar r0,r6,r3,r4 @ sin φ cos ε + cos φ (ε - ε³/6) ~ sin (φ+ε)
|
||||
bl fpack_q31
|
||||
20:
|
||||
eors r4,r12,r12,lsr#1
|
||||
eors r4,r4,r12,lsr#2
|
||||
ands r5,r12,#1<<30
|
||||
tst r12,#2 @ exchange sine and cosine in odd quadrants
|
||||
beq 1f
|
||||
eors r1,r0,r4,lsl#31 @ negate sine on b0^b1^b2
|
||||
pop {r0}
|
||||
eors r0,r0,r5,lsl#1 @ negate sine result if argument was negative
|
||||
pop {r4-r6,r15}
|
||||
1:
|
||||
pop {r1}
|
||||
eors r0,r0,r4,lsl#31 @ negate sine on b0^b1^b2
|
||||
eors r0,r0,r5,lsl#1 @ negate sine result if argument was negative
|
||||
pop {r4-r6,r15}
|
||||
|
||||
#endif
|
856
lib/main/pico-sdk/rp2_common/pico_float/float_sci_m33_vfp.S
Normal file
856
lib/main/pico-sdk/rp2_common/pico_float/float_sci_m33_vfp.S
Normal file
|
@ -0,0 +1,856 @@
|
|||
/*
|
||||
* Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#if !PICO_RP2040
|
||||
#include "pico/asm_helper.S"
|
||||
|
||||
pico_default_asm_setup
|
||||
|
||||
.macro float_section name
|
||||
#if PICO_FLOAT_IN_RAM
|
||||
.section RAM_SECTION_NAME(\name), "ax"
|
||||
#else
|
||||
.section SECTION_NAME(\name), "ax"
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro float_wrapper_section func
|
||||
float_section WRAPPER_FUNC_NAME(\func)
|
||||
.endm
|
||||
|
||||
@ load a 32-bit constant n into register rx
|
||||
.macro movlong rx,n
|
||||
movw \rx,#(\n)&0xffff
|
||||
movt \rx,#((\n)>>16)&0xffff
|
||||
.endm
|
||||
|
||||
float_section frrcore_v
|
||||
|
||||
// 1/2π to plenty of accuracy
|
||||
.long 0 @ this allows values of e down to -32
|
||||
rtwopi:
|
||||
.long 0,0
|
||||
.long 0x28BE60DB, 0x9391054A, 0x7F09D5F4, 0x7D4D3770, 0x36D8A566, 0x4F10E410
|
||||
|
||||
@ input:
|
||||
@ r0 mantissa m Q23
|
||||
@ r1 exponent e>=-32, typically offset by +9
|
||||
@ output:
|
||||
@ r0..r1 preserved
|
||||
@ r6 range reduced result in revolutions Q32
|
||||
@ r2,r3,r4,r5 trashed
|
||||
.thumb_func
|
||||
frr_core:
|
||||
adr r2,rtwopi
|
||||
asrs r3,r1,#5 @ k=e/32, k<=5 for e offsets up to 9+32
|
||||
add r2,r2,r3,lsl#2 @ p
|
||||
and r3,r1,#31 @ s=e%32
|
||||
mov r4,#1
|
||||
lsls r4,r4,r3 @ 1<<s
|
||||
umull r3,r4,r4,r0
|
||||
@ r2 p
|
||||
@ r3:r4 u0:u1 = m<<(e%32); u1 is never more than 2<<23
|
||||
ldr r5,[r2,#12] @ a0=p[3]
|
||||
umull r5,r6,r5,r4 @ r0=a0*u1 hi, discard lo
|
||||
@ r6 r0
|
||||
ldr r5,[r2,#8] @ a1=p[2]
|
||||
mla r6,r5,r4,r6 @ a1*u1 lo, discard hi
|
||||
umlal r5,r6,r5,r3 @ a1*u0 hi, discard lo
|
||||
@ r6 r0
|
||||
ldr r5,[r2,#4] @ a2=p[1]
|
||||
mla r6,r5,r3,r6 @ r0+=a2*u0
|
||||
bx r14
|
||||
|
||||
float_wrapper_section expf
|
||||
|
||||
wrapper_func expf
|
||||
@ soft float version, via 2^x
|
||||
asrs r1,r0,#23
|
||||
bmi 1f
|
||||
cmp r1,#0x85
|
||||
bge 3f
|
||||
10:
|
||||
movs r2,#1
|
||||
bfi r0,r2,#23,#9
|
||||
subs r1,#0x7e
|
||||
bmi 2f
|
||||
lsl r0,r1 @ x Q24
|
||||
11:
|
||||
movlong r3,0x5c551d95 @ 1/log(2) Q30
|
||||
smull r0,r1,r0,r3 @ Q54
|
||||
adr r2,k_exp2
|
||||
vldmia r2,{s8-s10}
|
||||
lsrs r2,r0,#22
|
||||
bfi r2,r1,#10,#17 @ ε Q32
|
||||
vmov s0,r2
|
||||
vcvt.f32.u32 s0,s0,#32
|
||||
vmul.f32 s1,s0,s0
|
||||
ubfx r2,r1,#17,#5
|
||||
vmul.f32 s4,s0,s8
|
||||
adr r3,exptab3
|
||||
vmul.f32 s2,s0,s1
|
||||
ldr r2,[r3,r2,lsl#2]
|
||||
vmla.f32 s4,s1,s9
|
||||
asrs r1,#22
|
||||
vmla.f32 s4,s2,s10
|
||||
add r2,r2,r1,lsl#23
|
||||
vmov s0,r2
|
||||
vmla.f32 s0,s0,s4
|
||||
vmov r0,s0
|
||||
bx r14
|
||||
|
||||
2: @ x≤0.5
|
||||
rsbs r1,#0
|
||||
lsrs r0,r1
|
||||
@ adc r0,#0 @ rounding not needed
|
||||
b 11b
|
||||
|
||||
3: @ risk of overflow, Inf,NaN
|
||||
movlong r2,0x42B17218
|
||||
cmp r0,r2
|
||||
blo 10b @ in range after all
|
||||
cmp r0,#0x7f800000
|
||||
bls 4f @ not NaN?
|
||||
orrs r0,#0x00400000
|
||||
bx r14
|
||||
|
||||
4:
|
||||
movlong r0,0x7f800000 @ return +Inf
|
||||
bx r14
|
||||
|
||||
1: @ x<0, r1=0xffffffXX where XX is biased exponent
|
||||
cmn r1,#0x7b
|
||||
bge 5f @ risk of underflow, -Inf, -NaN?
|
||||
13:
|
||||
movs r2,#1
|
||||
bfi r0,r2,#23,#9
|
||||
adds r1,#0x82
|
||||
bpl 6f
|
||||
rsbs r1,#0
|
||||
lsrs r0,r1
|
||||
adc r0,r0,#0 @ rounding
|
||||
12:
|
||||
rsbs r0,#0
|
||||
b 11b
|
||||
|
||||
6:
|
||||
lsls r0,r1
|
||||
b 12b
|
||||
|
||||
5:
|
||||
movlong r2,0xC2AEAC4F
|
||||
cmp r0,r2
|
||||
bls 13b
|
||||
cmp r0,#0xff800000
|
||||
bls 14f
|
||||
orrs r0,#0x00400000
|
||||
bx r14
|
||||
14:
|
||||
mov r0,#0
|
||||
bx r14
|
||||
|
||||
.align 3
|
||||
k_exp2:
|
||||
.float 0.693147181 @ log2
|
||||
.float 0.240226507 @ log²2/2
|
||||
.float 0.055504109 @ log³2/6
|
||||
|
||||
exptab3: @ pow(2,[0..31]/32)
|
||||
.word 0x3f800000
|
||||
.word 0x3f82cd87
|
||||
.word 0x3f85aac3
|
||||
.word 0x3f88980f
|
||||
.word 0x3f8b95c2
|
||||
.word 0x3f8ea43a
|
||||
.word 0x3f91c3d3
|
||||
.word 0x3f94f4f0
|
||||
.word 0x3f9837f0
|
||||
.word 0x3f9b8d3a
|
||||
.word 0x3f9ef532
|
||||
.word 0x3fa27043
|
||||
.word 0x3fa5fed7
|
||||
.word 0x3fa9a15b
|
||||
.word 0x3fad583f
|
||||
.word 0x3fb123f6
|
||||
.word 0x3fb504f3
|
||||
.word 0x3fb8fbaf
|
||||
.word 0x3fbd08a4
|
||||
.word 0x3fc12c4d
|
||||
.word 0x3fc5672a
|
||||
.word 0x3fc9b9be
|
||||
.word 0x3fce248c
|
||||
.word 0x3fd2a81e
|
||||
.word 0x3fd744fd
|
||||
.word 0x3fdbfbb8
|
||||
.word 0x3fe0ccdf
|
||||
.word 0x3fe5b907
|
||||
.word 0x3feac0c7
|
||||
.word 0x3fefe4ba
|
||||
.word 0x3ff5257d
|
||||
.word 0x3ffa83b3
|
||||
|
||||
float_wrapper_section logf
|
||||
|
||||
wrapper_func logf
|
||||
cmp r0,#0x7f800000 @ catch Inf, NaN, -ve
|
||||
bhs 1f
|
||||
asrs r1,r0,#23 @ get exponent; C from here is preserved...
|
||||
beq 2f @ ±0?
|
||||
mov r2,#1
|
||||
bfi r0,r2,#23,#9 @ fix implied 1
|
||||
it cc @ 50% ... to here...
|
||||
lslcc r0,#1 @ this plus sbc below means we work relative to nearby power of 2
|
||||
adr r3,#k_log3
|
||||
vldmia r3,{s8-s10}
|
||||
@ 0x00c00000 ≤ r0 < 0x017fffff
|
||||
adr r3,logtab3-24*8+4
|
||||
add r3,r3,r0,lsr#16 @ look up r0>>19 rounded, preserving flags
|
||||
bic r3,#7
|
||||
|
||||
ldrd r2,r3,[r3]
|
||||
mul r0,r0,r2 @ ε
|
||||
vmov s0,s1,r3,r0 @ s0=-log u, s1=ε
|
||||
|
||||
vcvt.f32.s32 s1,s1,#32
|
||||
vmul.f32 s2,s1,s1 @ power series in ε
|
||||
sbc r1,r1,#0x7e @ ... and here
|
||||
vmul.f32 s3,s1,s2
|
||||
lsls r1,#23 @ e Q23
|
||||
vmul.f32 s4,s2,s2 @ to ε⁴
|
||||
@ movlong r2,0x58b90bfc @ log 2 Q31, more accurate than we deserve
|
||||
movw r2,0x0bfc
|
||||
vmul.f32 s2,s2,s8
|
||||
movt r2,0x58b9
|
||||
vmul.f32 s3,s3,s9
|
||||
smmulr r1,r1,r2 @ Q22
|
||||
vmul.f32 s4,s4,s10
|
||||
vmov s7,r1
|
||||
vsub.f32 s3,s3,s4
|
||||
vcvt.f32.s32 s7,s7,#22
|
||||
vsub.f32 s2,s2,s3
|
||||
vsub.f32 s1,s1,s2
|
||||
vadd.f32 s0,s0,s1 @ log ε - log u
|
||||
vadd.f32 s0,s0,s7 @ e log 2 + log ε - log u
|
||||
vmov r0,s0
|
||||
bx r14
|
||||
|
||||
1:
|
||||
bgt 3f @ +NaN?
|
||||
beq 10f @ +Inf?
|
||||
2:
|
||||
cmp r0,#0x80800000 @ -0?
|
||||
blo 11f
|
||||
cmp r0,#0xff800000 @ -NaN/-Inf?
|
||||
3:
|
||||
orr r0,#0x00400000
|
||||
bhi 10f
|
||||
movlong r0,0xffc00000
|
||||
10:
|
||||
bx r14
|
||||
11:
|
||||
movlong r0,0xff800000
|
||||
bx r14
|
||||
|
||||
.align 3
|
||||
k_log3:
|
||||
.float 0.5
|
||||
.float 0.333333333333333
|
||||
.float 0.25
|
||||
.float 0 @ alignment
|
||||
|
||||
logtab3:
|
||||
@ u=64/[48:2:96]; u Q8, -log u F32
|
||||
.word 0x0155,0xbe92cb01 @ 00003e9b..00004145
|
||||
.word 0x0148,0xbe7dc8c3 @ 00003ec8..00004158
|
||||
.word 0x013b,0xbe545f68 @ 00003ec1..00004137
|
||||
.word 0x012f,0xbe2c99c7 @ 00003ebb..00004119
|
||||
.word 0x0125,0xbe0a3c2c @ 00003ef3..0000413d
|
||||
.word 0x011a,0xbdc61a2f @ 00003eca..000040fe
|
||||
.word 0x0111,0xbd83acc2 @ 00003eeb..0000410d
|
||||
.word 0x0108,0xbcfc14d8 @ 00003ee8..000040f8
|
||||
.word 0x0100,0x00000000 @ 00003f00..00004100
|
||||
.word 0x00f8,0x3d020aec @ 00003ef8..000040e8
|
||||
.word 0x00f1,0x3d77518e @ 00003f13..000040f5
|
||||
.word 0x00ea,0x3db80698 @ 00003f12..000040e6
|
||||
.word 0x00e4,0x3ded393b @ 00003f3c..00004104
|
||||
.word 0x00dd,0x3e168b08 @ 00003f05..000040bf
|
||||
.word 0x00d8,0x3e2dfa03 @ 00003f48..000040f8
|
||||
.word 0x00d2,0x3e4ad2d7 @ 00003f2a..000040ce
|
||||
.word 0x00cd,0x3e637fde @ 00003f43..000040dd
|
||||
.word 0x00c8,0x3e7cc8e3 @ 00003f48..000040d8
|
||||
.word 0x00c3,0x3e8b5ae6 @ 00003f39..000040bf
|
||||
.word 0x00bf,0x3e95f784 @ 00003f6b..000040e9
|
||||
.word 0x00ba,0x3ea38c6e @ 00003f36..000040aa
|
||||
.word 0x00b6,0x3eaeadef @ 00003f46..000040b2
|
||||
.word 0x00b2,0x3eba0ec4 @ 00003f46..000040aa
|
||||
.word 0x00ae,0x3ec5b1cd @ 00003f36..00004092
|
||||
.word 0x00ab,0x3ece995f @ 00003f75..000040cb
|
||||
|
||||
float_wrapper_section fsin_fcos
|
||||
|
||||
30:
|
||||
lsls r1,r0,#9
|
||||
bne 1f @ NaN? return it
|
||||
orrs r0,r0,#0x80000000 @ Inf: make a NaN
|
||||
1:
|
||||
orrs r0,r0,#0x00400000 @ set top mantissa bit of NaN
|
||||
bx r14
|
||||
|
||||
@ heavy-duty range reduction
|
||||
@ here x≥256, -e in r1
|
||||
40:
|
||||
push {r4-r7,r14}
|
||||
movs r3,#1
|
||||
bfi r0,r3,#23,#9 @ insert implied 1 in mantissa, clear sign
|
||||
rsb r1,#9 @ e+9
|
||||
mov r7,#0x7e @ this will be the exponent of the reduced angle - 1
|
||||
42:
|
||||
bl frr_core
|
||||
@ here r6 is revolutions Q32
|
||||
lsrs r3,r6,#30 @ quadrant count
|
||||
adcs r3,r3,#0 @ rounded
|
||||
add r12,r12,r3
|
||||
subs r6,r6,r3,lsl#30 @ reduced angle/2π Q32 -.125≤x<+.125
|
||||
@ comment out from here...
|
||||
lsls r2,r6,#2 @ Q34
|
||||
it cs
|
||||
rsbcs r2,r2,#0 @ absolute value
|
||||
cmp r2,#1<<28 @ big enough for accuracy?
|
||||
bhs 41f
|
||||
@ ... to here for slightly better accuracy
|
||||
43:
|
||||
adds r1,r1,#2 @ try again with increased exponent
|
||||
bl frr_core
|
||||
eors r2,r6,r6,asr#32 @ absolute value
|
||||
adc r2,r2,#0
|
||||
cmp r2,#1<<28 @ big enough yet?
|
||||
bhs 44f
|
||||
subs r7,r7,#2
|
||||
bpl 43b @ safety net
|
||||
44:
|
||||
|
||||
41:
|
||||
ldr r4,=0xC90FDAA2 @ 2π Q29
|
||||
umull r2,r4,r2,r4 @ r4 has reduced angle Q34+Q29-Q32=Q31
|
||||
@ add r4,r4,r2,lsr#31
|
||||
clz r2,r4 @ normalise
|
||||
lsls r4,r4,r2
|
||||
lsrs r4,r4,#8
|
||||
sub r2,r7,r2
|
||||
adc r0,r4,r2,lsl#23 @ with rounding
|
||||
lsrs r1,r0,#23 @ re-extract exponent as there may have been a carry into it
|
||||
rsbs r1,r1,#0x7f @ prepare exponent for re-entry
|
||||
lsrs r6,r6,#31
|
||||
add r3,r0,r6,lsl#31 @ apply sign of reduced angle
|
||||
pop {r4-r7,r14}
|
||||
b 5f @ re-enter with no risk of looping
|
||||
|
||||
.ltorg
|
||||
|
||||
@ light-duty range reduction
|
||||
@ here argument ≥1
|
||||
@ r0: argument
|
||||
@ r1: -e
|
||||
@ r12: quadrant count
|
||||
@ required result is sin(r0+r12*π/2)
|
||||
10:
|
||||
cmn r1,#0x80
|
||||
beq 30b @ Inf/NaN
|
||||
bics r2,r0,r12,lsl#31 @ negative argument,doing sin -> +2 quadrants
|
||||
it mi
|
||||
addmi r12,r12,#2
|
||||
bic r0,r0,#0x80000000 @ make positive: original sign is now captured in quadrant count in r12
|
||||
|
||||
@ this may not actually be faster than doing it in integer registers
|
||||
vmov s0,r0
|
||||
adr r2,k_sc4
|
||||
vldmia r2!,{s5-s7}
|
||||
@ vmul.f32 s4,s4,s0 @ this accurate calculation of the quadrant count does not seem necessary
|
||||
@ vfma.f32 s4,s5,s0
|
||||
vmul.f32 s4,s5,s0 @ this is BALGE
|
||||
cmn r1,#8 @ ≥256?
|
||||
vrintn.f32.f32 s4,s4 @ round to quadrant count: x<256 so count≤163
|
||||
ble 40b @ then do heavy-duty range reduction
|
||||
vfms.f32 s0,s4,s7
|
||||
vfms.f32 s0,s4,s6
|
||||
vmov r3,s0 @ reduced angle
|
||||
vcvt.s32.f32 s3,s4
|
||||
ubfx r2,r3,#23,#8 @ get exponent
|
||||
cmp r2,#0x78
|
||||
blo 40b @ very small result? use heavy-duty reduction to get a more accurate answer
|
||||
rsbs r1,r2,#0x7f @ ready for re-entry
|
||||
vmov r2,s3 @ integer quadrant count
|
||||
add r12,r12,r2
|
||||
@ prepare to re-enter with no risk of looping
|
||||
b 5f
|
||||
|
||||
k_sc4:
|
||||
@ 2/π=0.A2F9836E4E441529FC...
|
||||
.word 0x3f22f983 @ 2/π
|
||||
@ π/2=1.921FB54442D1846989...
|
||||
.word 0xb695777a,0x3fc91000 @ these two add up to π/2 with error ~1.6e-13
|
||||
|
||||
wrapper_func sincosf
|
||||
|
||||
push {r0-r2,r14}
|
||||
ubfx r1,r0,#23,#8
|
||||
cmp r1,#0xff @ Inf/NaN?
|
||||
beq 2f
|
||||
bl cosf_entry @ this will exit via 1f or 2f...
|
||||
pop {r1-r2,r14}
|
||||
str r0,[r14]
|
||||
@ here C is still set from lsrs r12,r12,#1
|
||||
bcs 1f
|
||||
mvns r1,r1
|
||||
eor r12,r12,r1,lsr#31
|
||||
@ this is fsc_costail:
|
||||
@ here calculate cos φ+ε = cosθ
|
||||
vmul.f32 s5,s7,s1 @ sinφ sinε
|
||||
vfma.f32 s5,s2,s6 @ sinφ sinε + cosφ(1-cosε)
|
||||
vsub.f32 s5,s6,s5 @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
|
||||
vmov.f32 r0,s5
|
||||
eor r0,r0,r12,lsl#31
|
||||
str r0,[r2]
|
||||
pop {r15}
|
||||
|
||||
1:
|
||||
eor r12,r12,r1,lsr#31
|
||||
@ this is fsc_sintail:
|
||||
@ here calculate sin φ+ε = sinθ
|
||||
vmul.f32 s4,s2,s7 @ sinφ(1-cosε)
|
||||
vfms.f32 s4,s6,s1 @ sinφ(1-cosε) - cosφ sinε
|
||||
eor r1,r12,r3,lsr#31 @ flip sign if (reduced) argument was negative
|
||||
vsub.f32 s4,s7,s4 @ cosφ sinε + sinφ cosε
|
||||
vmov.f32 r0,s4
|
||||
eor r0,r0,r1,lsl#31
|
||||
str r0,[r2] @ save cos result
|
||||
pop {r15}
|
||||
|
||||
@ sincos of Inf or NaN
|
||||
2:
|
||||
lsls r1,r0,#9
|
||||
pop {r1-r3,r14}
|
||||
bne 1f @ NaN? return it
|
||||
orrs r0,r0,#0x80000000 @ Inf: make a NaN
|
||||
1:
|
||||
orrs r0,r0,#0x00400000 @ set top mantissa bit of NaN
|
||||
str r0,[r2] @ both sin and cos results
|
||||
str r0,[r3]
|
||||
bx r14
|
||||
|
||||
wrapper_func sinf
|
||||
@ r12b1..0: quadrant count
|
||||
movs r12,#0
|
||||
b 1f
|
||||
|
||||
wrapper_func cosf
|
||||
.thumb_func
|
||||
cosf_entry:
|
||||
movs r12,#1 @ cos -> +1 quadrant
|
||||
1:
|
||||
ubfx r1,r0,#23,#8 @ get exponent
|
||||
cbz r1,20f @ 0/denormal?
|
||||
22:
|
||||
rsbs r1,r1,#0x7f
|
||||
bls 10b @ argument ≥1? needs reduction; also Inf/NaN handling
|
||||
bic r3,r0,r12,lsl#31 @ this would mess up NaNs so do it here
|
||||
5:
|
||||
@ here we have a quadrant count in r12 and a signed offset r0 from r12*π/2
|
||||
bic r0,r3,#0x80000000 @ this would mess up NaNs so do it here
|
||||
vmov s0,r0
|
||||
ubfx r0,r0,#18,#5 @ extract top of mantissa
|
||||
adds r0,r0,#32 @ insert implied 1
|
||||
lsrs r1,r0,r1 @ to fixed point Q5
|
||||
ldr r2,=k_sc3
|
||||
adcs r1,r1,#0 @ rounding
|
||||
vldmia r2!,{s8-s9}
|
||||
add r2,r2,r1,lsl#2 @ 12 bytes per entry
|
||||
add r2,r2,r1,lsl#3
|
||||
|
||||
vldmia r2,{s5-s7} @ φ, cosφ, sinφ
|
||||
vsub.f32 s1,s0,s5 @ ε
|
||||
vmul.f32 s2,s1,s1 @ ε²
|
||||
lsrs r12,r12,#1 @ computing cosine?
|
||||
vmul.f32 s3,s2,s1 @ ε³
|
||||
bcs 2f
|
||||
|
||||
vmul.f32 s2,s2,s8 @ ε²/2! ~ 1-cosε
|
||||
vmul.f32 s3,s3,s9 @ ε³/3!
|
||||
vsub.f32 s1,s1,s3 @ ε-ε³/3! ~ sinε
|
||||
|
||||
@ here:
|
||||
@ s1: sinε
|
||||
@ s2: 1-cosε
|
||||
@ s6: cosφ
|
||||
@ s7: sinφ
|
||||
@ r12: quadrant count
|
||||
fsc_sintail:
|
||||
@ here calculate sin φ+ε = sinθ
|
||||
vmul.f32 s4,s2,s7 @ sinφ(1-cosε)
|
||||
vfms.f32 s4,s6,s1 @ sinφ(1-cosε) - cosφ sinε
|
||||
eor r1,r12,r3,lsr#31 @ flip sign if (reduced) argument was negative
|
||||
vsub.f32 s4,s7,s4 @ cosφ sinε + sinφ cosε
|
||||
vmov.f32 r0,s4
|
||||
eor r0,r0,r1,lsl#31
|
||||
bx r14
|
||||
|
||||
20:
|
||||
and r0,r0,#0x80000000 @ make signed zero
|
||||
b 22b
|
||||
|
||||
.align 2
|
||||
2:
|
||||
vmul.f32 s3,s3,s9 @ ε³/3!
|
||||
vsub.f32 s1,s1,s3 @ ε-ε³/3! ~ sinε
|
||||
vmul.f32 s2,s2,s8 @ ε²/2! ~ 1-cosε
|
||||
fsc_costail:
|
||||
@ here calculate cos φ+ε = cosθ
|
||||
vmul.f32 s5,s7,s1 @ sinφ sinε
|
||||
vfma.f32 s5,s2,s6 @ sinφ sinε + cosφ(1-cosε)
|
||||
vsub.f32 s5,s6,s5 @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
|
||||
vmov.f32 r0,s5
|
||||
eor r0,r0,r12,lsl#31
|
||||
bx r14
|
||||
|
||||
.align 3
|
||||
k_sc3:
|
||||
.word 0x3EFFFEC1 @ ~ 1/2! with PMC
|
||||
.word 0x3e2aaa25 @ ~ 1/3! with PMC
|
||||
|
||||
trigtab2:
|
||||
// φ cos φ sin φ
|
||||
.word 0x00000000,0x3f800000,0x00000000
|
||||
.word 0x3cfcc961,0x3f7fe0cd,0x3cfcbf1c @ φ=0.03085774 : cos φ=3feffc199ff28ef4 33.3b; sin φ=3f9f97e38006c678 39.2b
|
||||
.word 0x3d810576,0x3f7f7dfe,0x3d80ef9e @ φ=0.06299870 : cos φ=3fefefbfc00d6b6d 33.3b; sin φ=3fb01df3c000dfd5 40.2b
|
||||
.word 0x3dbf0c09,0x3f7ee30f,0x3dbec522 @ φ=0.09328467 : cos φ=3fefdc61dff4f58e 33.5b; sin φ=3fb7d8a43ffdf9ac 39.0b
|
||||
.word 0x3dff24b6,0x3f7e0414,0x3dfe7be2 @ φ=0.12458174 : cos φ=3fefc0827fdaf90f 31.8b; sin φ=3fbfcf7c3ff9dd0c 37.4b
|
||||
.word 0x3e1f0713,0x3f7ceb48,0x3e1e63a0 @ φ=0.15530042 : cos φ=3fef9d68ffe680a0 32.3b; sin φ=3fc3cc73fffa6d09 36.5b
|
||||
.word 0x3e40306d,0x3f7b811d,0x3e3f1015 @ φ=0.18768473 : cos φ=3fef70239fe32301 32.1b; sin φ=3fc7e2029ffdbc2c 37.8b
|
||||
.word 0x3e60ada2,0x3f79dccf,0x3e5ee13e @ φ=0.21941236 : cos φ=3fef3b99e023f5aa 31.8b; sin φ=3fcbdc27bffe216d 38.1b
|
||||
.word 0x3e800d7b,0x3f7808fa,0x3e7d7196 @ φ=0.25010285 : cos φ=3fef011f401572a6 32.6b; sin φ=3fcfae32c00328bb 37.3b
|
||||
.word 0x3e8f986e,0x3f75ff65,0x3e8db868 @ φ=0.28045982 : cos φ=3feebfeca0aaaf99 29.6b; sin φ=3fd1b70cfffc1468 36.0b
|
||||
.word 0x3e9fe1f4,0x3f739e93,0x3e9d4bfd @ φ=0.31227076 : cos φ=3fee73d25fbf733b 31.0b; sin φ=3fd3a97fa0002ced 40.5b
|
||||
.word 0x3eb054c6,0x3f70f7ae,0x3eacddb3 @ φ=0.34439677 : cos φ=3fee1ef5bfcf70cb 31.4b; sin φ=3fd59bb65fff5c30 38.6b
|
||||
.word 0x3ebf89c5,0x3f6e4b60,0x3ebb1a0a @ φ=0.37409797 : cos φ=3fedc96bffdebb8a 31.9b; sin φ=3fd763414003344b 36.3b
|
||||
.word 0x3ecfc426,0x3f6b35ca,0x3eca1c63 @ φ=0.40579337 : cos φ=3fed66b93fe27dc6 32.1b; sin φ=3fd9438c5ffe5d45 37.3b
|
||||
.word 0x3ee054f2,0x3f67d166,0x3ed93907 @ φ=0.43814808 : cos φ=3fecfa2cbffc16e9 35.0b; sin φ=3fdb2720dffef5b6 37.9b
|
||||
.word 0x3eeff0dd,0x3f64664b,0x3ee74116 @ φ=0.46863452 : cos φ=3fec8cc95f714272 29.8b; sin φ=3fdce822c00479ad 35.8b
|
||||
.word 0x3f002b31,0x3f609488,0x3ef5c30f @ φ=0.50065905 : cos φ=3fec1290ffc99208 31.2b; sin φ=3fdeb861dfff3932 38.4b
|
||||
.word 0x3f07e407,0x3f5cc5a2,0x3f01992b @ φ=0.53082317 : cos φ=3feb98b44034cd46 31.3b; sin φ=3fe033255ffff628 41.7b
|
||||
.word 0x3f101fc5,0x3f587d8f,0x3f08a165 @ φ=0.56298476 : cos φ=3feb0fb1e0ceda6f 29.3b; sin φ=3fe1142c9ffd5ae4 35.6b
|
||||
.word 0x3f17f68a,0x3f5434b5,0x3f0f31ca @ φ=0.59360564 : cos φ=3fea8696a038a06f 31.2b; sin φ=3fe1e639400269fb 35.7b
|
||||
.word 0x3f1fffe2,0x3f4f9b59,0x3f15c8d7 @ φ=0.62499821 : cos φ=3fe9f36b1f428363 29.4b; sin φ=3fe2b91ae001d55d 36.1b
|
||||
.word 0x3f280646,0x3f4acf6b,0x3f1c37c4 @ φ=0.65634573 : cos φ=3fe959ed61449f08 28.7b; sin φ=3fe386f87ffd9617 35.7b
|
||||
.word 0x3f303041,0x3f45b9e0,0x3f229ae4 @ φ=0.68823630 : cos φ=3fe8b73c0047ae7a 30.8b; sin φ=3fe4535c7ffdf1ac 36.0b
|
||||
.word 0x3f381da7,0x3f4098ca,0x3f28a620 @ φ=0.71920246 : cos φ=3fe81319402ae6e1 31.6b; sin φ=3fe514c3ffff423c 37.4b
|
||||
.word 0x3f3fc72f,0x3f3b76ac,0x3f2e564a @ φ=0.74913305 : cos φ=3fe76ed5809d419f 29.7b; sin φ=3fe5cac93fffaf1d 38.7b
|
||||
.word 0x3f4813db,0x3f35b6cc,0x3f34526b @ φ=0.78155297 : cos φ=3fe6b6d9800e8b52 33.1b; sin φ=3fe68a4d5ffe89fc 36.5b
|
||||
.word 0x3f4fc779,0x3f30352f,0x3f39b4d0 @ φ=0.81163746 : cos φ=3fe606a5dfdc2b5b 31.8b; sin φ=3fe73699fffd7fc8 35.7b
|
||||
.word 0x3f57dd52,0x3f2a4170,0x3f3f2d91 @ φ=0.84322083 : cos φ=3fe5482e011ba752 28.9b; sin φ=3fe7e5b21ffcb223 35.3b
|
||||
.word 0x3f5fce26,0x3f243e9f,0x3f445dc3 @ φ=0.87423933 : cos φ=3fe487d3e0b9864b 29.5b; sin φ=3fe88bb85ffde6d5 35.9b
|
||||
.word 0x3f6825f1,0x3f1dc250,0x3f499d1c @ φ=0.90682894 : cos φ=3fe3b849ffea9b8f 32.6b; sin φ=3fe933a38002730d 35.7b
|
||||
.word 0x3f703be1,0x3f175041,0x3f4e7ebf @ φ=0.93841368 : cos φ=3fe2ea0820791b4e 30.1b; sin φ=3fe9cfd7e0053e65 34.6b
|
||||
.word 0x3f781078,0x3f10ed71,0x3f5306af @ φ=0.96900129 : cos φ=3fe21dae1fdea23e 31.9b; sin φ=3fea60d5e001b90b 36.2b
|
||||
.word 0x3f7ff4d4,0x3f0a5aa7,0x3f57649b @ φ=0.99982953 : cos φ=3fe14b54deeaa407 28.9b; sin φ=3feaec9360012825 36.8b
|
||||
|
||||
float_wrapper_section tanf
|
||||
|
||||
wrapper_func tanf
|
||||
push {r0,r14}
|
||||
ubfx r1,r0,#23,#8
|
||||
cmp r1,#0xff @ Inf/NaN?
|
||||
beq 2f
|
||||
bl cosf_entry @ this will exit via sintail or costail...
|
||||
ldr r1,[sp,#0]
|
||||
@ here C is still set from lsrs r12,r12,#1
|
||||
bcs 1f
|
||||
@ we exited via sintail
|
||||
@ this is fsc_costail:
|
||||
@ here calculate cos φ+ε = cosθ
|
||||
vmul.f32 s5,s7,s1 @ sinφ sinε
|
||||
vfma.f32 s5,s2,s6 @ sinφ sinε + cosφ(1-cosε)
|
||||
eors r1,r1,r3
|
||||
vsub.f32 s5,s6,s5 @ cosφ - (sinφ sinε + cosφ(1-cosε)) = cosφ cosε - sinφ sinε
|
||||
vdiv.f32 s0,s5,s4
|
||||
vmov.f32 r0,s0
|
||||
it pl
|
||||
eorpl r0,r0,#0x80000000
|
||||
pop {r1,r15}
|
||||
|
||||
1:
|
||||
@ we exited via costail
|
||||
@ this is fsc_sintail:
|
||||
@ here calculate sin φ+ε = sinθ
|
||||
vmul.f32 s4,s2,s7 @ sinφ(1-cosε)
|
||||
vfms.f32 s4,s6,s1 @ sinφ(1-cosε) - cosφ sinε
|
||||
eors r1,r1,r3
|
||||
vsub.f32 s4,s7,s4 @ cosφ sinε + sinφ cosε
|
||||
vdiv.f32 s0,s4,s5
|
||||
vmov.f32 r0,s0
|
||||
it mi
|
||||
eormi r0,r0,#0x80000000
|
||||
pop {r1,r15}
|
||||
|
||||
@ tan of Inf or NaN
|
||||
2:
|
||||
lsls r1,r0,#9
|
||||
bne 1f @ NaN? return it
|
||||
orrs r0,r0,#0x80000000 @ Inf: make a NaN
|
||||
1:
|
||||
orrs r0,r0,#0x00400000 @ set top mantissa bit of NaN
|
||||
pop {r3,r15}
|
||||
|
||||
|
||||
float_wrapper_section atan2f
|
||||
|
||||
50:
|
||||
60:
|
||||
orrs r0,r1,#0x00400000
|
||||
bx r14
|
||||
|
||||
51:
|
||||
bne 52f @ NaN?
|
||||
cmp r3,#0x7f800000 @ y an infinity; x an infinity too?
|
||||
bne 55f @ no: carry on
|
||||
@ here x and y are both infinities
|
||||
b 66f
|
||||
|
||||
52:
|
||||
62:
|
||||
orrs r0,r0,#0x00400000
|
||||
bx r14
|
||||
|
||||
61:
|
||||
bne 62b @ NaN?
|
||||
cmp r3,#0x7f800000 @ y an infinity; x an infinity too?
|
||||
bne 65f @ no: carry on
|
||||
66:
|
||||
@ here x and y are both infinities
|
||||
subs r0,r0,#1 @ make both finite (and equal) with same sign and retry
|
||||
subs r1,r1,#1
|
||||
b 86f
|
||||
|
||||
70:
|
||||
and r3,#0x80000000
|
||||
cmp r2,#0x00800000
|
||||
bhs 72f @ y 0 or denormal?
|
||||
@ here both x and y are zeros
|
||||
b 85f
|
||||
71:
|
||||
and r2,#0x80000000
|
||||
72:
|
||||
vmov s0,s1,r2,r3
|
||||
vdiv.f32 s2,s0,s1 @ restart the division
|
||||
b 73f @ and go back and check for NaNs
|
||||
|
||||
80:
|
||||
and r3,#0x80000000
|
||||
cmp r2,#0x00800000
|
||||
bhs 82f @ y 0 or denormal?
|
||||
85:
|
||||
@ here both x and y are zeros
|
||||
orr r1,r1,0x3f800000 @ retry with x replaced by ~1 with appropriate sign
|
||||
b 86f
|
||||
|
||||
81:
|
||||
and r2,#0x80000000
|
||||
82:
|
||||
vmov s0,s1,r2,r3
|
||||
vdiv.f32 s2,s1,s0 @ restart the division
|
||||
b 83f @ and go back and check for NaNs
|
||||
|
||||
wrapper_func atan2f
|
||||
86:
|
||||
bic r2,r0,#0x80000000
|
||||
bic r3,r1,#0x80000000
|
||||
vmov s0,s1,r2,r3
|
||||
cmp r2,r3 @ |y| vs. |x|
|
||||
bhi 1f
|
||||
@ here |x|≥|y| so we need |y|/|x|; octant/xs/ys: 0++,3-+,4--,7+-
|
||||
vdiv.f32 s2,s0,s1 @ get this division started; result ≤1
|
||||
cmp r3,#0x00800000
|
||||
blo 70b @ x 0 or denormal?
|
||||
cmp r2,#0x00800000
|
||||
blo 71b @ y 0 or denormal?
|
||||
73:
|
||||
cmp r3,#0x7f800000
|
||||
bhi 50b @ x NaN?
|
||||
cmp r2,#0x7f800000
|
||||
bhs 51b @ y Inf or NaN?
|
||||
55:
|
||||
cmp r1,#0
|
||||
ite mi
|
||||
ldrmi r12,pi @ if x<0, need two extra quadrants
|
||||
movpl r12,#0
|
||||
@ inner negation is the sign of x
|
||||
b 2f
|
||||
|
||||
1:
|
||||
@ here |x|<|y| so we need |x|/|y|; octant/xs/ys: 1++,2-+,5--,6+-
|
||||
vdiv.f32 s2,s1,s0 @ result <1
|
||||
cmp r3,#0x00800000
|
||||
blo 80b @ x 0 or denormal?
|
||||
cmp r2,#0x00800000
|
||||
blo 81b @ y 0 or denormal?
|
||||
83:
|
||||
cmp r3,#0x7f800000
|
||||
bhi 60b @ x NaN?
|
||||
cmp r2,#0x7f800000
|
||||
bhs 61b @ y Inf or NaN?
|
||||
65:
|
||||
ldr r12,piover2 @ always one extra quadrant in this path
|
||||
eors r1,r1,#0x80000000 @ inner negation is the complement of the sign of x
|
||||
|
||||
2:
|
||||
@ here
|
||||
@ r0 y
|
||||
@ r1 ±x
|
||||
@ r2 |y|
|
||||
@ r3 |x|
|
||||
@ s0,s1 = |x|,|y|
|
||||
@ s2=s0/s1 or s1/s0, 0≤s2≤1
|
||||
@ r12=quadrant count * π/2
|
||||
@ where the final result is
|
||||
@ ± (r12 ± atn s2) where the inner negation is given by r1b31 and the outer negation by r0b31
|
||||
|
||||
adr r2,trigtab3
|
||||
vmov.f32 s3,s2
|
||||
vcvt.u32.f32 s3,s3,#6
|
||||
vmov.f32 r3,s3
|
||||
lsrs r3,r3,#1
|
||||
adcs r3,r3,#0 @ rounding; set Z if in φ==0 case
|
||||
add r2,r2,r3,lsl#3
|
||||
vldr s5,[r2,#4] @ t=tanφ
|
||||
vmul.f32 s0,s5,s2 @ ty
|
||||
vsub.f32 s1,s2,s5 @ y-t
|
||||
vmov.f32 s5,#1.0
|
||||
vadd.f32 s0,s5,s0 @ 1+ty
|
||||
beq 9f @ did we look up zeroth table entry?
|
||||
|
||||
@ now (s0,s1) = (x,y)
|
||||
vdiv.f32 s0,s1,s0 @ ε
|
||||
ldr r2,[r2] @ φ Q29
|
||||
@ result is now ±(r12±(r2+atn(s0))
|
||||
cmp r1,#0 @ inner negation
|
||||
it mi
|
||||
rsbmi r2,r2,#0
|
||||
add r2,r12,r2 @ Q29
|
||||
cmp r0,#0 @ outer negation
|
||||
it mi
|
||||
rsbmi r2,r2,#0
|
||||
cmp r2,#0
|
||||
bpl 1f
|
||||
rsbs r2,r2,#0
|
||||
clz r3,r2
|
||||
lsls r2,r2,r3
|
||||
beq 3f
|
||||
rsb r3,#0x180
|
||||
b 2f
|
||||
1:
|
||||
clz r3,r2
|
||||
lsls r2,r2,r3
|
||||
beq 3f
|
||||
rsb r3,#0x80
|
||||
2:
|
||||
lsrs r2,r2,#8 @ rounding bit to carry
|
||||
adc r2,r2,r3,lsl#23 @ with rounding
|
||||
3:
|
||||
vmul.f32 s2,s0,s0 @ ε²
|
||||
vldr.f32 s3,onethird
|
||||
vmul.f32 s2,s2,s0 @ ε³
|
||||
teq r0,r1
|
||||
vmul.f32 s2,s2,s3 @ ε³/3
|
||||
vmov.f32 s4,r2
|
||||
vsub.f32 s0,s0,s2 @ ~atn(ε)
|
||||
ite pl
|
||||
vaddpl.f32 s0,s4,s0
|
||||
vsubmi.f32 s0,s4,s0
|
||||
vmov.f32 r0,s0
|
||||
bx r14
|
||||
|
||||
9: @ we looked up the zeroth table entry; we could generate slightly more accurate results here
|
||||
@ now (s0,s1) = (x,y)
|
||||
vdiv.f32 s0,s1,s0 @ ε
|
||||
@ result is now ±(r12±(0+atn(s0))
|
||||
mov r2,r12 @ Q29; in fact r12 is only ±π/2 or ±π so can probably simplify this
|
||||
cmp r0,#0 @ outer negation
|
||||
it mi
|
||||
rsbmi r2,r2,#0
|
||||
cmp r2,#0
|
||||
bpl 1f
|
||||
rsbs r2,r2,#0
|
||||
clz r3,r2
|
||||
lsls r2,r2,r3
|
||||
beq 3f
|
||||
rsb r3,#0x180
|
||||
b 2f
|
||||
1:
|
||||
clz r3,r2
|
||||
lsls r2,r2,r3
|
||||
beq 3f
|
||||
rsb r3,#0x80
|
||||
2:
|
||||
lsrs r2,r2,#8 @ rounding bit to carry
|
||||
adc r2,r2,r3,lsl#23 @ with rounding
|
||||
3:
|
||||
vmul.f32 s2,s0,s0 @ ε²
|
||||
vldr.f32 s3,onethird
|
||||
vmul.f32 s2,s2,s0 @ ε³
|
||||
teq r0,r1
|
||||
vmul.f32 s2,s2,s3 @ ε³/3
|
||||
vmov.f32 s4,r2
|
||||
vsub.f32 s0,s0,s2 @ ~atn(ε)
|
||||
ite pl
|
||||
vaddpl.f32 s0,s4,s0
|
||||
vsubmi.f32 s0,s4,s0
|
||||
vmov.f32 r0,s0
|
||||
tst r0,#0x7f800000 @ about to return a denormal?
|
||||
it ne
|
||||
bxne r14
|
||||
and r0,r0,#0x80000000 @ make it zero
|
||||
bx r14
|
||||
|
||||
piover2: .word 0x3243f6a9 @ Q29
|
||||
pi: .word 0x6487ed51 @ Q29
|
||||
onethird: .float 0.33333333
|
||||
|
||||
trigtab3:
|
||||
// φ Q29 tan φ SP
|
||||
.word 0x00000000,0x00000000
|
||||
.word 0x00ffee23,0x3d0001bb @ φ=0.03124148 : tan φ=3fa000375fffff9d 50.4b
|
||||
.word 0x01fe88dc,0x3d7f992a @ φ=0.06232112 : tan φ=3faff3253fffea1f 44.5b
|
||||
.word 0x02fe0a70,0x3dc01203 @ φ=0.09351084 : tan φ=3fb8024060002522 42.8b
|
||||
.word 0x03fad228,0x3e000368 @ φ=0.12436779 : tan φ=3fc0006cfffffc90 45.2b
|
||||
.word 0x04f5ab70,0x3e1ffdea @ φ=0.15498897 : tan φ=3fc3ffbd400014d5 42.6b
|
||||
.word 0x05ed56f8,0x3e3fdddc @ φ=0.18522213 : tan φ=3fc7fbbb80000beb 43.4b
|
||||
.word 0x06e4cfa0,0x3e601425 @ φ=0.21543103 : tan φ=3fcc02849fffe817 42.4b
|
||||
.word 0x07d8d3e0,0x3e80215d @ φ=0.24521822 : tan φ=3fd0042b9ffff89f 43.1b
|
||||
.word 0x08c60460,0x3e9000a5 @ φ=0.27417201 : tan φ=3fd20014a000182b 41.4b
|
||||
.word 0x09b26770,0x3ea01492 @ φ=0.30302784 : tan φ=3fd402923ffff932 43.2b
|
||||
.word 0x0a996d50,0x3eb01377 @ φ=0.33122888 : tan φ=3fd6026ee0001062 42.0b
|
||||
.word 0x0b7a6d10,0x3ebff4a0 @ φ=0.35869458 : tan φ=3fd7fe93ffff8c38 39.1b
|
||||
.word 0x0c593ce0,0x3ed0019f @ φ=0.38589329 : tan φ=3fda0033e0001354 41.7b
|
||||
.word 0x0d33ebd0,0x3ee01bbc @ φ=0.41258803 : tan φ=3fdc0377800162a1 37.5b
|
||||
.word 0x0e087ab0,0x3ef01fbd @ φ=0.43853506 : tan φ=3fde03f79fffddf2 40.9b
|
||||
.word 0x0ed56180,0x3effef98 @ φ=0.46354747 : tan φ=3fdffdf30000767d 39.1b
|
||||
.word 0x0fa1de80,0x3f080ebf @ φ=0.48850942 : tan φ=3fe101d7dfffb9fc 38.9b
|
||||
.word 0x10639d00,0x3f0fec31 @ φ=0.51215982 : tan φ=3fe1fd862000aad5 37.6b
|
||||
.word 0x112690e0,0x3f180cfd @ φ=0.53595775 : tan φ=3fe3019fa00069ea 38.3b
|
||||
.word 0x11e014c0,0x3f200065 @ φ=0.55860364 : tan φ=3fe4000ca00022e5 39.9b
|
||||
.word 0x129651e0,0x3f2808be @ φ=0.58084959 : tan φ=3fe50117c00015a7 40.6b
|
||||
.word 0x1346d400,0x3f300a7d @ φ=0.60239601 : tan φ=3fe6014f9fffa020 38.4b
|
||||
.word 0x13efc7c0,0x3f37ee2f @ φ=0.62302005 : tan φ=3fe6fdc5dfff98d7 38.3b
|
||||
.word 0x14988960,0x3f400c32 @ φ=0.64362019 : tan φ=3fe801863fffff81 46.0b
|
||||
.word 0x1537a8c0,0x3f47ef42 @ φ=0.66304433 : tan φ=3fe8fde8400062a4 38.4b
|
||||
.word 0x15d4cc60,0x3f4ff630 @ φ=0.68222636 : tan φ=3fe9fec5ffff76e2 37.9b
|
||||
.word 0x166ef280,0x3f581534 @ φ=0.70104337 : tan φ=3feb02a680004e91 38.7b
|
||||
.word 0x16ff75c0,0x3f5fef1e @ φ=0.71868408 : tan φ=3febfde3c0001404 40.7b
|
||||
.word 0x179116a0,0x3f68184d @ φ=0.73646098 : tan φ=3fed03099ffed6e5 36.8b
|
||||
.word 0x181b5aa0,0x3f701722 @ φ=0.75333911 : tan φ=3fee02e43fffd351 39.5b
|
||||
.word 0x18a10560,0x3f781071 @ φ=0.76965588 : tan φ=3fef020e20005c05 38.5b
|
||||
.word 0x19214060,0x3f7ff451 @ φ=0.78530902 : tan φ=3feffe8a1fffe11b 40.1b
|
||||
|
||||
#endif
|
318
lib/main/pico-sdk/rp2_common/pico_float/float_single_hazard3.S
Normal file
318
lib/main/pico-sdk/rp2_common/pico_float/float_single_hazard3.S
Normal file
|
@ -0,0 +1,318 @@
|
|||
/*
|
||||
* Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#include "pico/asm_helper.S"
|
||||
#include "hardware/hazard3.h"
|
||||
|
||||
// This file reimplements some common single-precision soft float routines
|
||||
// from libgcc. It targets the RV32IMBZbkb dialect (plus optionally Xh3bextm)
|
||||
// and is tuned for Hazard3 execution timings.
|
||||
|
||||
// Subnormal values are always flushed to zero on both input and output.
|
||||
// Rounding is always to nearest (even on tie).
|
||||
|
||||
pico_default_asm_setup
|
||||
|
||||
.macro float_section name
|
||||
#if PICO_FLOAT_IN_RAM
|
||||
.section RAM_SECTION_NAME(\name), "ax"
|
||||
#else
|
||||
.section SECTION_NAME(\name), "ax"
|
||||
#endif
|
||||
.endm
|
||||
|
||||
float_section __addsf3
|
||||
.global __subsf3
|
||||
.p2align 2
|
||||
__subsf3:
|
||||
binvi a1, a1, 31
|
||||
.global __addsf3
|
||||
__addsf3:
|
||||
// Unpack exponent:
|
||||
h3.bextmi a2, a0, 23, 8
|
||||
h3.bextmi a3, a1, 23, 8
|
||||
// Flush-to-zero => 0 + y = y applies, including nan, with the sole
|
||||
// exception of y being subnormal (which also needs to be flushed)
|
||||
beqz a2, __addsf_return_y_flushed
|
||||
// Don't have to handle this case for x + 0 = 0 because we already know x
|
||||
// is nonzero
|
||||
beqz a3, __addsf_return_x
|
||||
// Unpack significand, plus 3 extra zeroes for working space:
|
||||
slli a4, a0, 9
|
||||
slli a5, a1, 9
|
||||
// check nan/inf on input
|
||||
li t0, 255
|
||||
beq a2, t0, __addsf_x_nan_inf
|
||||
beq a3, t0, __addsf_y_nan_inf
|
||||
// (finish unpacking significand)
|
||||
srli a4, a4, 6
|
||||
srli a5, a5, 6
|
||||
|
||||
// If we're still on the straight path then we are adding two normal
|
||||
// values. Add implicit one (1.xx...xx000)
|
||||
bseti a4, a4, 23 + 3
|
||||
bseti a5, a5, 23 + 3
|
||||
// Negate if sign bit is set
|
||||
bgez a0, 1f
|
||||
neg a4, a4
|
||||
1:
|
||||
// (tuck this 16-bit here to avoid alignment penalty)
|
||||
li t1, 25
|
||||
bgez a1, 1f
|
||||
neg a5, a5
|
||||
1:
|
||||
|
||||
bltu a2, a3, __addsf_ye_gt_xe
|
||||
|
||||
// The main body is repeated twice with different register assignments.
|
||||
// lhs is the more-significant addend:
|
||||
.macro addsf_core packed_lhs, packed_rhs, sig_lhs, sig_rhs, exp_lhs, exp_rhs, rhs_is_x
|
||||
sub \packed_rhs, \exp_lhs, \exp_rhs
|
||||
// If there is a large exponent difference then there is no effect on lhs
|
||||
.if \rhs_is_x
|
||||
bgeu \packed_rhs, t1, __addsf_return_y
|
||||
.else
|
||||
bgeu \packed_rhs, t1, __addsf_return_x
|
||||
.endif
|
||||
// Shift rhs down to correct relative significance
|
||||
sra \packed_lhs, \sig_rhs, \packed_rhs
|
||||
// Set sticky bit if ones were shifted out
|
||||
sll \packed_rhs, \packed_lhs, \packed_rhs
|
||||
sltu \packed_rhs, \packed_rhs, \sig_rhs
|
||||
or \packed_lhs, \packed_lhs, \packed_rhs
|
||||
// Add significands
|
||||
add \sig_lhs, \sig_lhs, \packed_lhs
|
||||
// Detect exact cancellation (may be beyond max normalisation shift; also
|
||||
// IEEE 754 requires +0 for exact cancellation, no matter input signs)
|
||||
beqz \sig_lhs, __addsf_return_0
|
||||
// Convert two's complement back to sign + magnitude
|
||||
srai \exp_rhs, \sig_lhs, 31
|
||||
xor \sig_lhs, \sig_lhs, \exp_rhs
|
||||
sub \sig_lhs, \sig_lhs, \exp_rhs
|
||||
// Renormalise significand: bit 31 is now implicit one
|
||||
clz \packed_lhs, \sig_lhs
|
||||
sll \sig_lhs, \sig_lhs, \packed_lhs
|
||||
// Adjust exponent
|
||||
addi \packed_lhs, \packed_lhs, -5
|
||||
sub \exp_lhs, \exp_lhs, \packed_lhs
|
||||
|
||||
// Round to nearest, even on tie (bias upward if above odd number)
|
||||
bexti \packed_lhs, \sig_lhs, 8
|
||||
addi \sig_lhs, \sig_lhs, 127
|
||||
add \sig_lhs, \sig_lhs, \packed_lhs
|
||||
// Exponent may increase by one due to rounding up from all-ones; this is
|
||||
// detected by clearing of implicit one (there is a carry-out too)
|
||||
bgez \sig_lhs, 3f
|
||||
4:
|
||||
// Detect underflow/overflow
|
||||
bgeu \exp_lhs, t0, 1f
|
||||
|
||||
// Pack and return
|
||||
packh \exp_lhs, \exp_lhs, \exp_rhs
|
||||
slli \exp_lhs, \exp_lhs, 23
|
||||
slli \sig_lhs, \sig_lhs, 1
|
||||
srli \sig_lhs, \sig_lhs, 9
|
||||
add a0, \sig_lhs, \exp_lhs
|
||||
ret
|
||||
1:
|
||||
bgez \exp_lhs, 2f
|
||||
// Signed zero on underflow
|
||||
slli a0, \exp_rhs, 31
|
||||
ret
|
||||
2:
|
||||
// Signed infinity on overflow
|
||||
packh a0, t0, \exp_rhs
|
||||
slli a0, a0, 23
|
||||
ret
|
||||
3:
|
||||
// Exponent increase due to rounding (uncommon)
|
||||
srli \sig_lhs, \sig_lhs, 1
|
||||
addi \exp_lhs, \exp_lhs, 1
|
||||
j 4b
|
||||
.endm
|
||||
|
||||
__addsf_xe_gte_ye:
|
||||
addsf_core a0, a1, a4, a5, a2, a3, 0
|
||||
.p2align 2
|
||||
__addsf_ye_gt_xe:
|
||||
addsf_core a1, a0, a5, a4, a3, a2, 1
|
||||
|
||||
__addsf_x_nan_inf:
|
||||
// When at least one operand is nan, we must propagate at least one of
|
||||
// those nan payloads (sign of nan result is unspecified, which we take
|
||||
// advantage of by implementing x - y as x + -y). Check x nan vs inf:
|
||||
bnez a4, __addsf_return_x
|
||||
__addsf_x_inf:
|
||||
// If x is +-inf, need to distinguish the following cases:
|
||||
bne a3, t0, __addsf_return_x // y is neither inf nor nan -> return x (propagate inf)
|
||||
bnez a5, __addsf_return_y // y is nan: -> return y (propagate nan)
|
||||
xor a5, a0, a1
|
||||
srli a5, a5, 31
|
||||
beqz a5, __addsf_return_x // y is inf of same sign -> return either x or y (x is faster)
|
||||
li a0, -1 // y is inf of different sign -> return nan
|
||||
ret
|
||||
|
||||
__addsf_y_nan_inf:
|
||||
// Mirror of __addsf_x_nan_inf
|
||||
bnez a5, __addsf_return_y
|
||||
__addsf_y_inf:
|
||||
bne a2, t0, __addsf_return_y
|
||||
bnez a4, __addsf_return_x
|
||||
xor a4, a0, a1
|
||||
srli a4, a4, 31
|
||||
beqz a4, __addsf_return_x
|
||||
li a0, -1
|
||||
ret
|
||||
|
||||
__addsf_return_y_flushed:
|
||||
bnez a3, 1f
|
||||
srli a1, a1, 23
|
||||
slli a1, a1, 23
|
||||
1:
|
||||
__addsf_return_y:
|
||||
mv a0, a1
|
||||
__addsf_return_x:
|
||||
ret
|
||||
__addsf_return_0:
|
||||
li a0, 0
|
||||
ret
|
||||
|
||||
|
||||
float_section __mulsf3
|
||||
.global __mulsf3
|
||||
.p2align 2
|
||||
__mulsf3:
|
||||
// Force y to be positive (by possibly negating x) *before* unpacking.
|
||||
// This allows many special cases to be handled without repacking.
|
||||
bgez a1, 1f
|
||||
binvi a0, a0, 31
|
||||
1:
|
||||
// Unpack exponent:
|
||||
h3.bextmi a2, a0, 23, 8
|
||||
h3.bextmi a3, a1, 23, 8
|
||||
// Check special cases
|
||||
li t0, 255
|
||||
beqz a2, __mulsf_x_0
|
||||
beqz a3, __mulsf_y_0
|
||||
beq a2, t0, __mulsf_x_nan_inf
|
||||
beq a3, t0, __mulsf_y_nan_inf
|
||||
|
||||
// Finish unpacking sign
|
||||
srai a6, a0, 31
|
||||
// Unpack significand (with implicit one in MSB)
|
||||
slli a4, a0, 8
|
||||
slli a5, a1, 8
|
||||
bseti a4, a4, 31
|
||||
bseti a5, a5, 31
|
||||
// Get full 64-bit multiply result in a4:a1 (one cycle each half)
|
||||
// Going from Q1.23 to Q2.46 (both left-justified)
|
||||
mul a1, a4, a5
|
||||
mulhu a4, a4, a5
|
||||
// Normalise (shift left by either 0 or 1) -- bit 8 is the LSB of the
|
||||
// final significand (ignoring rounding)
|
||||
clz a0, a4
|
||||
sll a4, a4, a0
|
||||
sub a2, a2, a0
|
||||
// After normalising we can calculate the final exponent, since rounding
|
||||
// cannot increase the exponent for multiplication (unlike addition)
|
||||
add a2, a2, a3
|
||||
// Subtract redundant bias term (127), add 1 for normalisation correction
|
||||
addi a2, a2, -126
|
||||
blez a2, __mulsf_underflow
|
||||
bge a2, t0, __mulsf_overflow
|
||||
|
||||
// Gather sticky bits from low fraction:
|
||||
snez a1, a1
|
||||
or a4, a4, a1
|
||||
// Round to nearest, even on tie (aka bias upward if odd)
|
||||
bexti a1, a4, 8
|
||||
add a4, a4, a1
|
||||
addi a4, a4, 127
|
||||
// Pack it and ship it
|
||||
packh a2, a2, a6
|
||||
slli a2, a2, 23
|
||||
slli a4, a4, 1
|
||||
srli a4, a4, 9
|
||||
add a0, a4, a2
|
||||
ret
|
||||
|
||||
__mulsf_underflow:
|
||||
// Signed zero
|
||||
slli a0, a6, 31
|
||||
ret
|
||||
__mulsf_overflow:
|
||||
// Signed inf
|
||||
packh a0, t0, a6
|
||||
slli a0, a0, 23
|
||||
ret
|
||||
|
||||
__mulsf_x_0:
|
||||
// 0 times nan -> propagate nan
|
||||
// 0 times inf -> generate nan
|
||||
// 0 times others -> 0 (need to flush significand too as we are FTZ)
|
||||
bne a3, t0, __mulsf_return_flushed_x
|
||||
slli a5, a1, 9
|
||||
beqz a5, 1f
|
||||
// Propagate nan from y
|
||||
__mulsf_return_y:
|
||||
mv a0, a1
|
||||
ret
|
||||
1:
|
||||
// Generate new nan
|
||||
li a0, -1
|
||||
ret
|
||||
|
||||
__mulsf_y_0:
|
||||
// Mirror image of x_0 except we still return x for signed 0, since the
|
||||
// signs were already resolved.
|
||||
bne a2, t0, __mulsf_return_flushed_x
|
||||
slli a1, a0, 9
|
||||
bnez a1, 1f
|
||||
li a0, -1
|
||||
1:
|
||||
ret
|
||||
|
||||
__mulsf_return_flushed_x:
|
||||
// If we don't support subnormals we at least need to flush to a canonical
|
||||
// zero. This is just a sign bit in bit 31.
|
||||
srli a0, a0, 31
|
||||
slli a0, a0, 31
|
||||
__mulsf_return_x:
|
||||
ret
|
||||
|
||||
__mulsf_x_nan_inf:
|
||||
// We know that y is not zero and is positive. So...
|
||||
// x is nan -> return x
|
||||
// else y is nan -> return y
|
||||
// else y is inf -> return x
|
||||
// else y is normal -> return x
|
||||
// (the order of the first two clauses is actually our free choice)
|
||||
slli a4, a0, 9
|
||||
bnez a4, __mulsf_return_x
|
||||
bne a3, t0, __mulsf_return_x
|
||||
slli a5, a1, 9
|
||||
bnez a5, __mulsf_return_y
|
||||
ret // return x
|
||||
|
||||
__mulsf_y_nan_inf:
|
||||
// We know that x is not zero, nan, nor inf. That just leaves normals.
|
||||
// y is nan -> return y
|
||||
// y is inf -> return inf * sgn(x) (since we already merged the signs)
|
||||
slli a5, a1, 9
|
||||
bnez a5, __mulsf_return_y
|
||||
srai a0, a0, 31
|
||||
packh a0, t0, a0
|
||||
slli a0, a0, 23
|
||||
ret
|
||||
|
||||
|
||||
// This is a hack to improve soft float performance for the routines we don't
|
||||
// implement (e.g. libm) in libraries built against a non-Zbb ISA dialect:
|
||||
float_section __clz2si
|
||||
.global __clz2si
|
||||
__clz2si:
|
||||
clz a0, a0
|
||||
ret
|
|
@ -0,0 +1,346 @@
|
|||
/*
|
||||
* Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#include "pico/asm_helper.S"
|
||||
|
||||
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
|
||||
|
||||
#ifndef PICO_FLOAT_IN_RAM
|
||||
#define PICO_FLOAT_IN_RAM 0
|
||||
#endif
|
||||
|
||||
pico_default_asm_setup
|
||||
|
||||
.macro float_section name
|
||||
// todo separate flag for shims?
|
||||
#if PICO_FLOAT_IN_RAM
|
||||
.section RAM_SECTION_NAME(\name), "ax"
|
||||
#else
|
||||
.section SECTION_NAME(\name), "ax"
|
||||
#endif
|
||||
.endm
|
||||
|
||||
float_section float_table_shim_on_use_helper
|
||||
regular_func float_table_shim_on_use_helper
|
||||
push {r0-r2, lr}
|
||||
mov r0, ip
|
||||
#ifndef NDEBUG
|
||||
// sanity check to make sure we weren't called by non (shimmable_) table_tail_call macro
|
||||
cmp r0, #0
|
||||
bne 1f
|
||||
bkpt #0
|
||||
#endif
|
||||
1:
|
||||
ldrh r1, [r0]
|
||||
lsrs r2, r1, #8
|
||||
adds r0, #2
|
||||
cmp r2, #0xdf
|
||||
bne 1b
|
||||
uxtb r1, r1 // r1 holds table offset
|
||||
lsrs r2, r0, #2
|
||||
bcc 1f
|
||||
// unaligned
|
||||
ldrh r2, [r0, #0]
|
||||
ldrh r0, [r0, #2]
|
||||
lsls r0, #16
|
||||
orrs r0, r2
|
||||
b 2f
|
||||
1:
|
||||
ldr r0, [r0]
|
||||
2:
|
||||
ldr r2, =sf_table
|
||||
str r0, [r2, r1]
|
||||
str r0, [sp, #12]
|
||||
pop {r0-r2, pc}
|
||||
|
||||
float_section 642float_shims
|
||||
|
||||
@ convert uint64 to float, rounding
|
||||
regular_func uint642float_shim
|
||||
movs r2,#0 @ fall through
|
||||
|
||||
@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
|
||||
regular_func ufix642float_shim
|
||||
push {r4,r5,r14}
|
||||
cmp r1,#0
|
||||
bpl 3f @ positive? we can use signed code
|
||||
lsls r5,r1,#31 @ contribution to sticky bits
|
||||
orrs r5,r0
|
||||
lsrs r0,r1,#1
|
||||
subs r2,#1
|
||||
b 4f
|
||||
|
||||
@ convert int64 to float, rounding
|
||||
regular_func int642float_shim
|
||||
movs r2,#0 @ fall through
|
||||
|
||||
@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
|
||||
regular_func fix642float_shim
|
||||
push {r4,r5,r14}
|
||||
3:
|
||||
movs r5,r0
|
||||
orrs r5,r1
|
||||
beq ret_pop45 @ zero? return +0
|
||||
asrs r5,r1,#31 @ sign bits
|
||||
2:
|
||||
asrs r4,r1,#24 @ try shifting 7 bits at a time
|
||||
cmp r4,r5
|
||||
bne 1f @ next shift will overflow?
|
||||
lsls r1,#7
|
||||
lsrs r4,r0,#25
|
||||
orrs r1,r4
|
||||
lsls r0,#7
|
||||
adds r2,#7
|
||||
b 2b
|
||||
1:
|
||||
movs r5,r0
|
||||
movs r0,r1
|
||||
4:
|
||||
negs r2,r2
|
||||
adds r2,#32+29
|
||||
|
||||
// bl packx
|
||||
ldr r1, =0x29ef // packx
|
||||
blx r1
|
||||
ret_pop45:
|
||||
pop {r4,r5,r15}
|
||||
|
||||
float_section fatan2_shim
|
||||
regular_func fatan2_shim
|
||||
push {r4,r5,r14}
|
||||
|
||||
ldr r4, =0x29c1 // unpackx
|
||||
mov ip, r4
|
||||
@ unpack arguments and shift one down to have common exponent
|
||||
blx ip
|
||||
mov r4,r0
|
||||
mov r0,r1
|
||||
mov r1,r4
|
||||
mov r4,r2
|
||||
mov r2,r3
|
||||
mov r3,r4
|
||||
blx ip
|
||||
lsls r0,r0,#5 @ Q28
|
||||
lsls r1,r1,#5 @ Q28
|
||||
adds r4,r2,r3 @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise
|
||||
asrs r4,#9
|
||||
adds r4,#1
|
||||
bmi 2f @ force y to 0 proper, so result will be zero
|
||||
subs r4,r2,r3 @ calculate shift
|
||||
bge 1f @ ex>=ey?
|
||||
negs r4,r4 @ make shift positive
|
||||
asrs r0,r4
|
||||
cmp r4,#28
|
||||
blo 3f
|
||||
asrs r0,#31
|
||||
b 3f
|
||||
1:
|
||||
asrs r1,r4
|
||||
cmp r4,#28
|
||||
blo 3f
|
||||
2:
|
||||
@ here |x|>>|y| or both x and y are ±0
|
||||
cmp r0,#0
|
||||
bge 4f @ x positive, return signed 0
|
||||
ldr r3, =0x2cfc @ &pi_q29, circular coefficients
|
||||
ldr r0,[r3] @ x negative, return +/- pi
|
||||
asrs r1,#31
|
||||
eors r0,r1
|
||||
b 7f
|
||||
4:
|
||||
asrs r0,r1,#31
|
||||
b 7f
|
||||
3:
|
||||
movs r2,#0 @ initial angle
|
||||
ldr r3, =0x2cfc @ &pi_q29, circular coefficients
|
||||
cmp r0,#0 @ x negative
|
||||
bge 5f
|
||||
negs r0,r0 @ rotate to 1st/4th quadrants
|
||||
negs r1,r1
|
||||
ldr r2,[r3] @ pi Q29
|
||||
5:
|
||||
movs r4,#1 @ m=1
|
||||
ldr r5, =0x2b97 @ cordic_vec
|
||||
blx r5 @ also produces magnitude (with scaling factor 1.646760119), which is discarded
|
||||
mov r0,r2 @ result here is -pi/2..3pi/2 Q29
|
||||
@ asrs r2,#29
|
||||
@ subs r0,r2
|
||||
ldr r3, =0x2cfc @ &pi_q29, circular coefficients
|
||||
ldr r2,[r3] @ pi Q29
|
||||
adds r4,r0,r2 @ attempt to fix -3pi/2..-pi case
|
||||
bcs 6f @ -pi/2..0? leave result as is
|
||||
subs r4,r0,r2 @ <pi? leave as is
|
||||
bmi 6f
|
||||
subs r0,r4,r2 @ >pi: take off 2pi
|
||||
6:
|
||||
subs r0,#1 @ fiddle factor so atan2(0,1)==0
|
||||
7:
|
||||
movs r2,#0 @ exponent for pack
|
||||
ldr r3, =0x2b19
|
||||
bx r3
|
||||
|
||||
float_section float232_shims
|
||||
|
||||
regular_func float2int_shim
|
||||
movs r1,#0 @ fall through
|
||||
regular_func float2fix_shim
|
||||
// check for -0 or -denormal upfront
|
||||
asrs r2, r0, #23
|
||||
adds r2, #128
|
||||
adds r2, #128
|
||||
beq 1f
|
||||
// call original
|
||||
ldr r2, =0x2acd
|
||||
bx r2
|
||||
1:
|
||||
movs r0, #0
|
||||
bx lr
|
||||
|
||||
float_section float264_shims
|
||||
|
||||
regular_func float2int64_shim
|
||||
movs r1,#0 @ and fall through
|
||||
regular_func float2fix64_shim
|
||||
push {r14}
|
||||
bl f2fix
|
||||
b d2f64_a
|
||||
|
||||
regular_func float2uint64_shim
|
||||
movs r1,#0 @ and fall through
|
||||
regular_func float2ufix64_shim
|
||||
asrs r3,r0,#23 @ negative? return 0
|
||||
bmi ret_dzero
|
||||
@ and fall through
|
||||
|
||||
@ convert float in r0 to signed fixed point in r0:r1:r3, r1 places after point, rounding towards -Inf
|
||||
@ result clamped so that r3 can only be 0 or -1
|
||||
@ trashes r12
|
||||
.thumb_func
|
||||
f2fix:
|
||||
push {r4,r14}
|
||||
mov r12,r1
|
||||
asrs r3,r0,#31
|
||||
lsls r0,#1
|
||||
lsrs r2,r0,#24
|
||||
beq 1f @ zero?
|
||||
cmp r2,#0xff @ Inf?
|
||||
beq 2f
|
||||
subs r1,r2,#1
|
||||
subs r2,#0x7f @ remove exponent bias
|
||||
lsls r1,#24
|
||||
subs r0,r1 @ insert implied 1
|
||||
eors r0,r3
|
||||
subs r0,r3 @ top two's complement
|
||||
asrs r1,r0,#4 @ convert to double format
|
||||
lsls r0,#28
|
||||
ldr r4, =d2fix_a
|
||||
bx r4
|
||||
1:
|
||||
movs r0,#0
|
||||
movs r1,r0
|
||||
movs r3,r0
|
||||
pop {r4,r15}
|
||||
2:
|
||||
mvns r0,r3 @ return max/min value
|
||||
mvns r1,r3
|
||||
pop {r4,r15}
|
||||
|
||||
ret_dzero:
|
||||
movs r0,#0
|
||||
movs r1,#0
|
||||
bx r14
|
||||
|
||||
float_section d2fix_a_float
|
||||
|
||||
.weak d2fix_a // weak because it exists in float shims too
|
||||
.thumb_func
|
||||
d2fix_a:
|
||||
@ here
|
||||
@ r0:r1 two's complement mantissa
|
||||
@ r2 unbaised exponent
|
||||
@ r3 mantissa sign extension bits
|
||||
add r2,r12 @ exponent plus offset for required binary point position
|
||||
subs r2,#52 @ required shift
|
||||
bmi 1f @ shift down?
|
||||
@ here a shift up by r2 places
|
||||
cmp r2,#12 @ will clamp?
|
||||
bge 2f
|
||||
movs r4,r0
|
||||
lsls r1,r2
|
||||
lsls r0,r2
|
||||
negs r2,r2
|
||||
adds r2,#32 @ complementary shift
|
||||
lsrs r4,r2
|
||||
orrs r1,r4
|
||||
pop {r4,r15}
|
||||
2:
|
||||
mvns r0,r3
|
||||
mvns r1,r3 @ overflow: clamp to extreme fixed-point values
|
||||
pop {r4,r15}
|
||||
1:
|
||||
@ here a shift down by -r2 places
|
||||
adds r2,#32
|
||||
bmi 1f @ long shift?
|
||||
mov r4,r1
|
||||
lsls r4,r2
|
||||
negs r2,r2
|
||||
adds r2,#32 @ complementary shift
|
||||
asrs r1,r2
|
||||
lsrs r0,r2
|
||||
orrs r0,r4
|
||||
pop {r4,r15}
|
||||
1:
|
||||
@ here a long shift down
|
||||
movs r0,r1
|
||||
asrs r1,#31 @ shift down 32 places
|
||||
adds r2,#32
|
||||
bmi 1f @ very long shift?
|
||||
negs r2,r2
|
||||
adds r2,#32
|
||||
asrs r0,r2
|
||||
pop {r4,r15}
|
||||
1:
|
||||
movs r0,r3 @ result very near zero: use sign extension bits
|
||||
movs r1,r3
|
||||
pop {r4,r15}
|
||||
d2f64_a:
|
||||
asrs r2,r1,#31
|
||||
cmp r2,r3
|
||||
bne 1f @ sign extension bits fail to match sign of result?
|
||||
pop {r15}
|
||||
1:
|
||||
mvns r0,r3
|
||||
movs r1,#1
|
||||
lsls r1,#31
|
||||
eors r1,r1,r0 @ generate extreme fixed-point values
|
||||
pop {r15}
|
||||
|
||||
float_section float2double_shim
|
||||
regular_func float2double_shim
|
||||
lsrs r3,r0,#31 @ sign bit
|
||||
lsls r3,#31
|
||||
lsls r1,r0,#1
|
||||
lsrs r2,r1,#24 @ exponent
|
||||
beq 1f @ zero?
|
||||
cmp r2,#0xff @ Inf?
|
||||
beq 2f
|
||||
lsrs r1,#4 @ exponent and top 20 bits of mantissa
|
||||
ldr r2,=(0x3ff-0x7f)<<20 @ difference in exponent offsets
|
||||
adds r1,r2
|
||||
orrs r1,r3
|
||||
lsls r0,#29 @ bottom 3 bits of mantissa
|
||||
bx r14
|
||||
1:
|
||||
movs r1,r3 @ return signed zero
|
||||
3:
|
||||
movs r0,#0
|
||||
bx r14
|
||||
2:
|
||||
ldr r1,=0x7ff00000 @ return signed infinity
|
||||
adds r1,r3
|
||||
b 3b
|
||||
|
||||
#endif
|
90
lib/main/pico-sdk/rp2_common/pico_float/include/pico/float.h
Normal file
90
lib/main/pico-sdk/rp2_common/pico_float/include/pico/float.h
Normal file
|
@ -0,0 +1,90 @@
|
|||
/*
|
||||
* Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#ifndef _PICO_FLOAT_H
|
||||
#define _PICO_FLOAT_H
|
||||
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include "pico.h"
|
||||
#include "pico/bootrom/sf_table.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/** \file float.h
|
||||
* \defgroup pico_float pico_float
|
||||
*
|
||||
* \brief Optimized single-precision floating point functions
|
||||
*
|
||||
* (Replacement) optimized implementations are provided for the following compiler built-ins
|
||||
* and math library functions on Arm:
|
||||
*
|
||||
* - __aeabi_fadd, __aeabi_fdiv, __aeabi_fmul, __aeabi_frsub, __aeabi_fsub, __aeabi_cfcmpeq, __aeabi_cfrcmple, __aeabi_cfcmple, __aeabi_fcmpeq, __aeabi_fcmplt, __aeabi_fcmple, __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_fcmpun, __aeabi_i2f, __aeabi_l2f, __aeabi_ui2f, __aeabi_ul2f, __aeabi_f2iz, __aeabi_f2lz, __aeabi_f2uiz, __aeabi_f2ulz, __aeabi_f2d, sqrtf, cosf, sinf, tanf, atan2f, expf, logf
|
||||
* - ldexpf, copysignf, truncf, floorf, ceilf, roundf, asinf, acosf, atanf, sinhf, coshf, tanhf, asinhf, acoshf, atanhf, exp2f, log2f, exp10f, log10f, powf, hypotf, cbrtf, fmodf, dremf, remainderf, remquof, expm1f, log1pf, fmaf
|
||||
* - powintf, sincosf (GNU extensions)
|
||||
*
|
||||
* The following additional optimized functions are also provided:
|
||||
*
|
||||
* - int2float, uint2float, int642float, uint642float, fix2float, ufix2float, fix642float, ufix642float
|
||||
* - float2fix, float2ufix, float2fix64, float2ufix64, float2int, float2uint, float2int64, float2uint64, float2int_z, float2int64_z, float2uint_z, float2uint64_z
|
||||
* - exp10f, sincosf, powintf
|
||||
*
|
||||
* On RP2350 (Arm) the following additional functions are available; the _fast methods are faster but do not round correctly
|
||||
*
|
||||
* - float2fix64_z, fdiv_fast, fsqrt_fast,
|
||||
*
|
||||
* On RP2350 RISC-V, only a small number of compiler runtime functions are overridden with faster implementations:
|
||||
*
|
||||
* - __addsf3, __subsf3, __mulsf3
|
||||
*/
|
||||
|
||||
// None of these functions are available on RISC-V:
|
||||
#if !defined(__riscv) || PICO_COMBINED_DOCS
|
||||
|
||||
float int2float(int32_t f);
|
||||
float uint2float(uint32_t f);
|
||||
float int642float(int64_t f);
|
||||
float uint642float(uint64_t f);
|
||||
float fix2float(int32_t m, int e);
|
||||
float ufix2float(uint32_t m, int e);
|
||||
float fix642float(int64_t m, int e);
|
||||
float ufix642float(uint64_t m, int e);
|
||||
|
||||
// These methods round towards -Infinity.
|
||||
int32_t float2fix(float f, int e);
|
||||
uint32_t float2ufix(float f, int e);
|
||||
int64_t float2fix64(float f, int e);
|
||||
uint64_t float2ufix64(float f, int e);
|
||||
int32_t float2int(float f);
|
||||
uint32_t float2uint(float f);
|
||||
int64_t float2int64(float f);
|
||||
uint64_t float2uint64(float f);
|
||||
|
||||
// These methods round towards 0.
|
||||
int32_t float2int_z(float f);
|
||||
int64_t float2int64_z(float f);
|
||||
int32_t float2uint_z(float f);
|
||||
int64_t float2uint64_z(float f);
|
||||
|
||||
float exp10f(float x);
|
||||
void sincosf(float x, float *sinx, float *cosx);
|
||||
float powintf(float x, int y);
|
||||
|
||||
#if !PICO_RP2040 || PICO_COMBINED_DOCS
|
||||
int64_t float2fix64_z(float f, int e);
|
||||
float fdiv_fast(float n, float d);
|
||||
float fsqrt_fast(float f);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
Loading…
Add table
Add a link
Reference in a new issue