Adding RP2350 SDK and target framework (#13988)

* Adding RP2350 SDK and target framework * Spacing * Removing board definitions
2025-07-20 06:45:16 +03:00 · 2024-10-23 10:02:48 +11:00 · 2024-10-23 10:02:48 +11:00 · 2dd6f95aad
commit 2dd6f95aad
parent 462cb05930
576 changed files with 435012 additions and 0 deletions
--- a/lib/main/pico-sdk/rp2_common/pico_double/double_aeabi_dcp.S
+++ b/lib/main/pico-sdk/rp2_common/pico_double/double_aeabi_dcp.S
@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include "pico/asm_helper.S"
+
+#if !HAS_DOUBLE_COPROCESSOR
+#error attempt to compile double_aeabi_rp2350 when there is no DCP
+#else
+
+#include "hardware/dcp_instr.inc.S"
+#include "hardware/dcp_canned.inc.S"
+
+pico_default_asm_setup
+
+.macro double_section name
+#if PICO_DOUBLE_IN_RAM
+.section RAM_SECTION_NAME(\name), "ax"
+#else
+.section SECTION_NAME(\name), "ax"
+#endif
+.endm
+
+.macro double_wrapper_section func
+double_section WRAPPER_FUNC_NAME(\func)
+.endm
+
+// ============== STATE SAVE AND RESTORE ===============
+
+.macro saving_func type func
+  // Note we are usually 32-bit aligned already at this point, as most of the
+  // function bodies contain exactly two 16-bit instructions: bmi and bx lr.
+  // We want the PCMP word-aligned.
+.p2align 2
+  // When the engaged flag is set, branch back here to invoke save routine and
+  // hook lr with the restore routine, then fall back through to the entry
+  // point. The engaged flag will be clear when checked a second time.
+1:
+  push {lr}              // 16-bit instruction
+  bl generic_save_state  // 32-bit instruction
+  b 1f                   // 16-bit instruction
+  // This is the actual entry point:
+\type\()_func \func
+  PCMP apsr_nzcv
+  bmi 1b
+1:
+.endm
+
+.macro saving_func_return
+  bx lr
+.endm
+
+double_section __rp2350_dcp_engaged_state_save_restore
+.thumb_func
+generic_save_state:
+  sub sp, #24
+  push {r0, r1}
+  // do save here
+  PXMD r0, r1
+  strd r0, r1, [sp, #8 + 0]
+  PYMD r0, r1
+  strd r0, r1, [sp, #8 + 8]
+  REFD r0, r1
+  strd r0, r1, [sp, #8 + 16]
+  pop {r0, r1}
+  blx lr
+  // <- wrapped function returns here
+  // fall through into restore:
+.thumb_func
+generic_restore_state:
+  // do restore here
+  pop {r12, r14}
+  WXMD r12, r14
+  pop {r12, r14}
+  WYMD r12, r14
+  pop {r12, r14}
+  WEFD r12, r14
+  pop {pc}
+
+// ============== ARITHMETIC FUNCTIONS ===============
+
+double_wrapper_section __aeabi_dadd
+saving_func wrapper __aeabi_dadd
+  dcp_dadd_m r0,r1,r0,r1,r2,r3
+  saving_func_return
+
+double_wrapper_section __aeabi_dsub
+saving_func wrapper __aeabi_dsub
+  dcp_dsub_m r0,r1,r0,r1,r2,r3
+  saving_func_return
+
+double_wrapper_section __aeabi_drsub
+saving_func wrapper __aeabi_drsub
+  dcp_dsub_m r0,r1,r2,r3,r0,r1
+  saving_func_return
+
+double_wrapper_section __aeabi_dmul
+saving_func wrapper __aeabi_dmul
+
+ // todo optimize this based on final decision on saving_func_entry
+  push {r4,r14}
+  dcp_dmul_m r0,r1,r0,r1,r2,r3,r0,r1,r2,r3,r4,r12,r14
+ // todo optimize this based on final decision on saving_func_entry
+  pop {r4,lr}
+  saving_func_return
+
+double_section ddiv_fast
+saving_func regular ddiv_fast
+  dcp_ddiv_fast_m r0,r1,r0,r1,r2,r3,r0,r1,r2,r3,r12
+  saving_func_return
+
+double_wrapper_section __aeabi_ddiv
+saving_func wrapper __aeabi_ddiv
+@ with correct rounding
+  dcp_ddiv_m r0,r1,r0,r1,r2,r3,r0,r1,r2,r3,r12
+  saving_func_return
+
+double_section sqrt_fast
+saving_func regular sqrt_fast
+  dcp_dsqrt_fast_m r0,r1,r0,r1,r0,r1,r2,r3,r12
+  saving_func_return
+
+double_wrapper_section sqrt
+saving_func wrapper sqrt
+@ with correct rounding
+  dcp_dsqrt_m r0,r1,r0,r1,r0,r1,r2,r3,r12
+  saving_func_return
+
+// todo not a real thing
+double_wrapper_section __aeabi_dclassify
+saving_func wrapper __aeabi_dclassify
+@ with correct rounding
+  dcp_dclassify_m apsr_nzcv,r0,r1
+  saving_func_return
+
+// ============== CONVERSION FUNCTIONS ===============
+
+double_wrapper_section __aeabi_d2f
+saving_func wrapper __aeabi_d2f
+@ with rounding
+  dcp_double2float_m r0,r0,r1
+  saving_func_return
+
+double_wrapper_section __aeabi_i2d
+saving_func wrapper __aeabi_i2d
+  dcp_int2double_m r0,r1,r0
+  saving_func_return
+
+double_wrapper_section __aeabi_ui2d
+saving_func wrapper __aeabi_ui2d
+  dcp_uint2double_m r0,r1,r0
+  saving_func_return
+
+double_wrapper_section __aeabi_d2iz
+saving_func wrapper __aeabi_d2iz
+@ with truncation towards 0
+  dcp_double2int_m r0,r0,r1
+  saving_func_return
+
+double_wrapper_section __aeabi_d2uiz
+saving_func wrapper __aeabi_d2uiz
+@ with truncation towards 0
+  dcp_double2uint_m r0,r0,r1
+  saving_func_return
+
+// todo not a real thing
+double_wrapper_section __aeabi_d2i_r
+saving_func wrapper __aeabi_d2i_r
+@ with rounding
+  dcp_double2int_r_m r0,r0,r1
+  saving_func_return
+
+// todo not a real thing
+double_wrapper_section __aeabi_d2ui_r
+saving_func wrapper __aeabi_d2ui_r
+@ with rounding
+  dcp_double2uint_r_m r0,r0,r1
+  saving_func_return
+
+// ============== COMPARISON FUNCTIONS ===============
+
+double_wrapper_section __aeabi_dcmpun
+saving_func wrapper __aeabi_dcmpun
+  dcp_dcmp_m r0,r0,r1,r2,r3
+  // extract unordered bit
+  ubfx r0, r0, #28, #1
+  saving_func_return
+
+double_wrapper_section __aeabi_dcmp
+
+saving_func wrapper __aeabi_cdrcmple
+  dcp_dcmp_m apsr_nzcv,r2,r3,r0,r1 // with arguments reversed
+  bvs cmp_nan
+  saving_func_return
+
+// these next two can be the same function in the absence of exceptions
+saving_func wrapper __aeabi_cdcmple
+//wrapper_func __aeabi_dcmp
+  dcp_dcmp_m apsr_nzcv,r0,r1,r2,r3
+  bvs cmp_nan
+  saving_func_return
+
+// It is not clear from the ABI documentation whether cdcmpeq must set the C flag
+// in the same way as cdcmple. If not, we could save the "bvs" below; but we
+// err on the side of caution.
+saving_func wrapper __aeabi_cdcmpeq
+//wrapper_func __aeabi_dcmp
+  dcp_dcmp_m apsr_nzcv,r0,r1,r2,r3
+  bvs cmp_nan
+  saving_func_return
+
+// If the result of a flag-setting comparison is "unordered" then we need to set C and clear Z.
+// We could conceivably just do lsrs r12,r14,#1, or even cmp r14,r14,lsr#1 as (a) r14 here is a
+// return address and r14b0=1 for Thumb mode; (b) we are unlikely to be returning to address 0.
+cmp_nan:
+  movs r12, #3 // r12 does not need to be preserved by the flag-setting comparisons
+  lsrs r12, #1 // set C, clear Z
+  saving_func_return
+
+// int FUNC_NAME(__aeabi_dcmpeq)(double, double)         result (1, 0) denotes (=, ?<>) [2], use for C == and !=
+double_wrapper_section __aeabi_dcmpeq
+saving_func wrapper __aeabi_dcmpeq
+  dcp_dcmp_m r0,r0,r1,r2,r3
+  // extract Z
+  ubfx r0, r0, #30, #1
+  saving_func_return
+
+// int FUNC_NAME(__aeabi_dcmplt)(double, double)         result (1, 0) denotes (<, ?>=) [2], use for C <
+double_wrapper_section __aeabi_dcmplt
+saving_func wrapper __aeabi_dcmplt
+  dcp_dcmp_m apsr_nzcv,r2,r3,r0,r1
+  ite hi
+  movhi r0,#1
+  movls r0,#0
+  saving_func_return
+
+// int FUNC_NAME(__aeabi_dcmple)(double, double)         result (1, 0) denotes (<=, ?>) [2], use for C <=
+double_wrapper_section __aeabi_dcmple
+saving_func wrapper __aeabi_dcmple
+  dcp_dcmp_m apsr_nzcv,r2,r3,r0,r1
+  ite hs
+  movhs r0,#1
+  movlo r0,#0
+  saving_func_return
+
+// int FUNC_NAME(__aeabi_dcmpge)(double, double)         result (1, 0) denotes (>=, ?<) [2], use for C >=
+double_wrapper_section __aeabi_dcmpge
+saving_func wrapper __aeabi_dcmpge
+  dcp_dcmp_m apsr_nzcv,r0,r1,r2,r3
+  ite hs
+  movhs r0,#1
+  movlo r0,#0
+  saving_func_return
+
+// int FUNC_NAME(__aeabi_dcmpgt)(double, double)         result (1, 0) denotes (>, ?<=) [2], use for C >
+double_wrapper_section __aeabi_dcmpgt
+saving_func wrapper __aeabi_dcmpgt
+  dcp_dcmp_m apsr_nzcv,r0,r1,r2,r3
+  ite hi
+  movhi r0,#1
+  movls r0,#0
+  saving_func_return
+
+#endif
--- a/lib/main/pico-sdk/rp2_common/pico_double/double_aeabi_rp2040.S
+++ b/lib/main/pico-sdk/rp2_common/pico_double/double_aeabi_rp2040.S
@ -0,0 +1,847 @@
+/*
+ * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#if PICO_RP2040
+#include "pico/asm_helper.S"
+#include "pico/runtime_init.h"
+#include "pico/bootrom/sf_table.h"
+#include "hardware/divider_helper.S"
+
+pico_default_asm_setup
+
+PICO_RUNTIME_INIT_FUNC_RUNTIME(__aeabi_double_init, PICO_RUNTIME_INIT_AEABI_DOUBLE)
+
+.macro double_section name
+#if PICO_DOUBLE_IN_RAM
+.section RAM_SECTION_NAME(\name), "ax"
+#else
+.section SECTION_NAME(\name), "ax"
+#endif
+.endm
+
+.macro _double_wrapper_func x
+    wrapper_func \x
+.endm
+
+.macro wrapper_func_d1 x
+   _double_wrapper_func \x
+#if PICO_DOUBLE_PROPAGATE_NANS
+    mov ip, lr
+    bl __check_nan_d1
+    mov lr, ip
+#endif
+.endm
+
+.macro wrapper_func_d2 x
+    _double_wrapper_func \x
+#if PICO_DOUBLE_PROPAGATE_NANS
+    mov ip, lr
+    bl __check_nan_d2
+    mov lr, ip
+#endif
+.endm
+
+.section .text
+
+#if PICO_DOUBLE_PROPAGATE_NANS
+.thumb_func
+__check_nan_d1:
+   movs r3, #1
+   lsls r3, #21
+   lsls r2, r1, #1
+   adds r2, r3
+   bhi 1f
+   bx lr
+1:
+   bx ip
+
+.thumb_func
+__check_nan_d2:
+   push {r0, r2}
+   movs r2, #1
+   lsls r2, #21
+   lsls r0, r1, #1
+   adds r0, r2
+   bhi 1f
+   lsls r0, r3, #1
+   adds r0, r2
+   bhi 2f
+   pop {r0, r2}
+   bx lr
+2:
+   pop {r0, r2}
+   mov r0, r2
+   mov r1, r3
+   bx ip
+1:
+   pop {r0, r2}
+   bx ip
+#endif
+
+.macro table_tail_call SF_TABLE_OFFSET
+    push {r3, r4}
+#if PICO_DOUBLE_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
+#ifndef NDEBUG
+    movs r3, #0
+    mov ip, r3
+#endif
+#endif
+    ldr r3, =sd_table
+    ldr r3, [r3, #\SF_TABLE_OFFSET]
+    str r3, [sp, #4]
+    pop {r3, pc}
+.endm
+
+.macro shimmable_table_tail_call SF_TABLE_OFFSET shim
+    push {r3, r4}
+    ldr r3, =sd_table
+    ldr r3, [r3, #\SF_TABLE_OFFSET]
+#if PICO_DOUBLE_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
+    mov ip, pc
+#endif
+    str r3, [sp, #4]
+    pop {r3, pc}
+#if PICO_DOUBLE_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
+.byte \SF_TABLE_OFFSET, 0xdf
+.word \shim
+#endif
+.endm
+
+.macro double_wrapper_section func
+double_section WRAPPER_FUNC_NAME(\func)
+.endm
+
+double_section push_r8_r11
+regular_func push_r8_r11
+ mov r4,r8
+ mov r5,r9
+ mov r6,r10
+ mov r7,r11
+ push {r4-r7}
+ bx r14
+
+double_section pop_r8_r11
+regular_func pop_r8_r11
+ pop {r4-r7}
+ mov r8,r4
+ mov r9,r5
+ mov r10,r6
+ mov r11,r7
+ bx r14
+
+// note generally each function is in a separate section unless there is fall thru or branching between them
+// note fadd, fsub, fmul, fdiv are so tiny and just defer to rom so are lumped together so they can share constant pool
+
+// note functions are word aligned except where they are an odd number of linear instructions
+
+// double FUNC_NAME(__aeabi_dadd)(double, double)         double-precision addition
+double_wrapper_section __aeabi_darithmetic
+// double FUNC_NAME(__aeabi_drsub)(double x, double y)    double-precision reverse subtraction, y - x
+
+// frsub first because it is the only one that needs alignment
+.align 2
+wrapper_func __aeabi_drsub
+    eors r0, r1
+    eors r1, r0
+    eors r0, r1
+    // fall thru
+
+// double FUNC_NAME(__aeabi_dsub)(double x, double y)     double-precision subtraction, x - y
+wrapper_func_d2 __aeabi_dsub
+#if PICO_DOUBLE_PROPAGATE_NANS
+    // we want to return nan for inf-inf or -inf - -inf, but without too much upfront cost
+    mov ip, r0
+    mov r0, r1
+    eors r0, r3
+    bmi 1f // different signs
+    mov r0, ip
+    push {r0-r3, lr}
+    bl 2f
+    b ddiv_dsub_nan_helper
+1:
+    mov r0, ip
+2:
+#endif
+   shimmable_table_tail_call SF_TABLE_FSUB dsub_shim
+
+wrapper_func_d2 __aeabi_dadd
+   shimmable_table_tail_call SF_TABLE_FADD dadd_shim
+
+// double FUNC_NAME(__aeabi_ddiv)(double n, double d)     double-precision division, n / d
+wrapper_func_d2 __aeabi_ddiv
+#if PICO_DOUBLE_PROPAGATE_NANS
+    push {r0-r3, lr}
+    bl 1f
+    b ddiv_dsub_nan_helper
+1:
+#endif
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
+    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
+    mov ip, r2
+    ldr r2, =(SIO_BASE)
+    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
+    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
+    bcs ddiv_save_state
+    mov r2, ip
+#else
+    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
+    push {r4, lr}
+    mrs r4, PRIMASK
+    cpsid i
+    bl ddiv_shim_call
+    msr PRIMASK, r4
+    pop {r4, pc}
+#endif
+ddiv_shim_call:
+    shimmable_table_tail_call SF_TABLE_FDIV ddiv_shim
+
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
+ddiv_save_state:
+    ldr r2, =(SIO_BASE)
+    save_div_state_and_lr
+    mov r2, ip
+    bl ddiv_shim_call
+    ldr r2, =(SIO_BASE)
+    restore_div_state_and_return
+#endif
+
+ddiv_dsub_nan_helper:
+#if PICO_DOUBLE_PROPAGATE_NANS
+    // check for infinite op infinite (or rather check for infinite result with both
+    // operands being infinite)
+    lsls r2, r1, #1
+    asrs r2, r2, #21
+    adds r2, #1
+    beq 2f
+    add sp, #16
+    pop {pc}
+2:
+    ldr r2, [sp, #4]
+    ldr r3, [sp, #12]
+    lsls r2, #1
+    asrs r2, r2, #21
+    lsls r3, #1
+    asrs r3, r3, #24
+    ands r2, r3
+    adds r2, #1
+    bne 3f
+    // infinite to nan
+    movs r2, #1
+    lsls r2, #19
+    orrs r1, r2
+3:
+    add sp, #16
+    pop {pc}
+#endif
+
+// double FUNC_NAME(__aeabi_dmul)(double, double)         double-precision multiplication
+wrapper_func_d2 __aeabi_dmul
+#if PICO_DOUBLE_PROPAGATE_NANS
+    push {r0-r3, lr}
+    bl 1f
+
+    // check for multiplication of infinite by zero (or rather check for infinite result with either
+    // operand 0)
+    lsls r3, r1, #1
+    asrs r3, r3, #21
+    adds r3, #1
+    beq 2f
+    add sp, #16
+    pop {pc}
+2:
+    ldr r2, [sp, #4]
+    ldr r3, [sp, #12]
+    ands r2, r3
+    bne 3f
+    // infinite to nan
+    movs r2, #1
+    lsls r2, #19
+    orrs r1, r2
+3:
+    add sp, #16
+    pop {pc}
+1:
+#endif
+   shimmable_table_tail_call SF_TABLE_FMUL dmul_shim
+
+// void FUNC_NAME(__aeabi_cdrcmple)(double, double)         reversed 3-way (<, =, ?>) compare [1], result in PSR ZC flags
+double_wrapper_section __aeabi_cdcmple
+
+wrapper_func __aeabi_cdrcmple
+ push {r0-r7,r14}
+    eors r0, r2
+    eors r2, r0
+    eors r0, r2
+    eors r1, r3
+    eors r3, r1
+    eors r1, r3
+    b __aeabi_dfcmple_guts
+
+// NOTE these share an implementation as we have no excepting NaNs.
+// void FUNC_NAME(__aeabi_cdcmple)(double, double)         3-way (<, =, ?>) compare [1], result in PSR ZC flags
+// void FUNC_NAME(__aeabi_cdcmpeq)(double, double)         non-excepting equality comparison [1], result in PSR ZC flags
+@ compare r0:r1 against r2:r3, returning -1/0/1 for <, =, >
+@ also set flags accordingly
+.align 2
+wrapper_func __aeabi_cdcmple
+wrapper_func __aeabi_cdcmpeq
+ push {r0-r7,r14}
+__aeabi_dfcmple_guts:
+ ldr r7,=0x7ff                @ flush NaNs and denormals
+ lsls r4,r1,#1
+ lsrs r4,#21
+ beq 1f
+ cmp r4,r7
+ bne 2f
+ lsls r4, r1, #12
+ bhi 7f
+1:
+ movs r0,#0
+ lsrs r1,#20
+ lsls r1,#20
+2:
+ lsls r4,r3,#1
+ lsrs r4,#21
+ beq 1f
+ cmp r4,r7
+ bne 2f
+ lsls r4, r3, #12
+ bhi 7f
+1:
+ movs r2,#0
+ lsrs r3,#20
+ lsls r3,#20
+2:
+ movs r6,#1
+ eors r3,r1
+ bmi 4f                        @ opposite signs? then can proceed on basis of sign of x
+ eors r3,r1                    @ restore r3
+ bpl 2f
+ cmp r3,r1
+ bne 7f
+1:
+ cmp r2,r0
+7:
+ pop {r0-r7,r15}
+2:
+ cmp r1,r3
+ bne 7b
+1:
+ cmp r0,r2
+ pop {r0-r7,r15}
+4:
+ orrs r3,r1                    @ make -0==+0
+ adds r3,r3
+ orrs r3,r0
+ orrs r3,r2
+ beq 7b
+ mvns r1, r1     @ carry inverse of r1 sign
+ adds r1, r1
+ pop {r0-r7,r15}
+
+
+// int FUNC_NAME(__aeabi_dcmpeq)(double, double)         result (1, 0) denotes (=, ?<>) [2], use for C == and !=
+double_wrapper_section __aeabi_dcmpeq
+.align 2
+wrapper_func __aeabi_dcmpeq
+    push {lr}
+    bl __aeabi_cdcmpeq
+    beq 1f
+    movs r0, #0
+    pop {pc}
+1:
+    movs r0, #1
+    pop {pc}
+
+// int FUNC_NAME(__aeabi_dcmplt)(double, double)         result (1, 0) denotes (<, ?>=) [2], use for C <
+double_wrapper_section __aeabi_dcmplt
+.align 2
+wrapper_func __aeabi_dcmplt
+    push {lr}
+    bl __aeabi_cdcmple
+    sbcs r0, r0
+    pop {pc}
+
+// int FUNC_NAME(__aeabi_dcmple)(double, double)         result (1, 0) denotes (<=, ?>) [2], use for C <=
+double_wrapper_section __aeabi_dcmple
+.align 2
+wrapper_func __aeabi_dcmple
+    push {lr}
+    bl __aeabi_cdcmple
+    bls 1f
+    movs r0, #0
+    pop {pc}
+1:
+    movs r0, #1
+    pop {pc}
+
+// int FUNC_NAME(__aeabi_dcmpge)(double, double)         result (1, 0) denotes (>=, ?<) [2], use for C >=
+double_wrapper_section __aeabi_dcmpge
+.align 2
+wrapper_func __aeabi_dcmpge
+    push {lr}
+    // because of NaNs it is better to reverse the args than the result
+    bl __aeabi_cdrcmple
+    bls 1f
+    movs r0, #0
+    pop {pc}
+1:
+    movs r0, #1
+    pop {pc}
+
+// int FUNC_NAME(__aeabi_dcmpgt)(double, double)         result (1, 0) denotes (>, ?<=) [2], use for C >
+double_wrapper_section __aeabi_dcmpgt
+wrapper_func __aeabi_dcmpgt
+    push {lr}
+    // because of NaNs it is better to reverse the args than the result
+    bl __aeabi_cdrcmple
+    sbcs r0, r0
+    pop {pc}
+
+// int FUNC_NAME(__aeabi_dcmpun)(double, double)         result (1, 0) denotes (?, <=>) [2], use for C99 isunordered()
+double_wrapper_section __aeabi_dcmpun
+wrapper_func __aeabi_dcmpun
+   movs r0, #1
+   lsls r0, #21
+   lsls r2, r1, #1
+   adds r2, r0
+   bhi 1f
+   lsls r2, r3, #1
+   adds r2, r0
+   bhi 1f
+   movs r0, #0
+   bx lr
+1:
+   movs r0, #1
+   bx lr
+
+// double FUNC_NAME(__aeabi_ui2d)(unsigned)             unsigned to double (double precision) conversion
+double_wrapper_section __aeabi_ui2d
+    shimmable_table_tail_call SF_TABLE_UINT2FLOAT uint2double_shim
+
+double_wrapper_section __aeabi_i2d
+
+wrapper_func __aeabi_ui2d
+    movs r1, #0
+    cmp r0, #0
+    bne 2f
+1:
+    bx lr
+// double FUNC_NAME(__aeabi_i2d)(int)                     integer to double (double precision) conversion
+wrapper_func __aeabi_i2d
+    asrs r1, r0, #31
+    eors r0, r1
+    subs r0, r1
+    beq 1b
+    lsls r1, #31
+2:
+    push {r0, r1, r4, lr}
+    ldr r3, =sf_clz_func
+    ldr r3, [r3]
+    blx r3
+    pop {r2, r3}
+    adds r4, r0, #1
+    lsls r2, r4
+    lsls r0, r2, #20
+    lsrs r2, #12
+    ldr r1,=1055
+    subs r1, r4
+    lsls r1, #20
+    orrs r1, r3
+    orrs r1, r2
+    pop {r4, pc}
+
+// int FUNC_NAME(__aeabi_d2iz)(double)                     double (double precision) to integer C-style conversion [3]
+double_wrapper_section __aeabi_d2iz
+wrapper_func __aeabi_d2iz
+regular_func double2int_z
+    push {r4, lr}
+    lsls r4, r1, #1
+    lsrs r2, r4, #21
+    movs r3, #0x80
+    adds r2, r3
+    lsls r3, #3
+    subs r2, r3
+    lsls r3, #21
+    cmp r2, #126
+    ble 1f
+    subs r2, #158
+    bge 2f
+    asrs r4, r1, #31
+    lsls r1, #12
+    lsrs r1, #1
+    orrs r1, r3
+    negs r2, r2
+    lsrs r1, r2
+    lsls r4, #1
+    adds r4, #1
+    adds r2, #21
+    cmp r2, #32
+    bge 3f
+    lsrs r0, r2
+    orrs r0, r1
+    muls r0, r4
+    pop {r4, pc}
+1:
+    movs r0, #0
+    pop {r4, pc}
+3:
+    mov r0, r1
+    muls r0, r4
+    pop {r4, pc}
+2:
+    // overflow
+    lsrs r0, r1, #31
+    adds r0, r3
+    subs r0, #1
+    pop {r4, pc}
+
+double_section double2int
+regular_func double2int
+    shimmable_table_tail_call SF_TABLE_FLOAT2INT double2int_shim
+
+// unsigned FUNC_NAME(__aeabi_d2uiz)(double)             double (double precision) to unsigned C-style conversion [3]
+double_wrapper_section __aeabi_d2uiz
+wrapper_func __aeabi_d2uiz
+regular_func double2uint
+    shimmable_table_tail_call SF_TABLE_FLOAT2UINT double2uint_shim
+
+double_section fix2double
+regular_func fix2double
+    shimmable_table_tail_call SF_TABLE_FIX2FLOAT fix2double_shim
+
+double_section ufix2double
+regular_func ufix2double
+    shimmable_table_tail_call SF_TABLE_UFIX2FLOAT ufix2double_shim
+
+double_section fix642double
+regular_func fix642double
+    shimmable_table_tail_call SF_TABLE_FIX642FLOAT fix642double_shim
+
+double_section ufix2double
+regular_func ufix642double
+    shimmable_table_tail_call SF_TABLE_UFIX642FLOAT ufix642double_shim
+
+// double FUNC_NAME(__aeabi_l2d)(long long)             long long to double (double precision) conversion
+double_wrapper_section __aeabi_l2d
+wrapper_func __aeabi_l2d
+    shimmable_table_tail_call SF_TABLE_INT642FLOAT int642double_shim
+
+// double FUNC_NAME(__aeabi_l2f)(long long)             long long to double (double precision) conversion
+double_wrapper_section __aeabi_ul2d
+wrapper_func __aeabi_ul2d
+    shimmable_table_tail_call SF_TABLE_UINT642FLOAT uint642double_shim
+
+// long long FUNC_NAME(__aeabi_d2lz)(double)             double (double precision) to long long C-style conversion [3]
+double_wrapper_section __aeabi_d2lz
+wrapper_func __aeabi_d2lz
+regular_func double2int64_z
+    cmn r1, r1
+    bcc double2int64
+    push {lr}
+    lsls r1, #1
+    lsrs r1, #1
+    movs r2, #0
+    bl double2ufix64
+    cmp r1, #0
+    bmi 1f
+    movs r2, #0
+    negs r0, r0
+    sbcs r2, r1
+    mov r1, r2
+    pop {pc}
+1:
+    movs r1, #128
+    lsls r1, #24
+    movs r0, #0
+    pop {pc}
+
+double_section double2int64
+regular_func double2int64
+    shimmable_table_tail_call SF_TABLE_FLOAT2INT64 double2int64_shim
+
+// unsigned long long FUNC_NAME(__aeabi_d2ulz)(double)     double to unsigned long long C-style conversion [3]
+double_wrapper_section __aeabi_d2ulz
+wrapper_func __aeabi_d2ulz
+    shimmable_table_tail_call SF_TABLE_FLOAT2UINT64 double2uint64_shim
+
+double_section double2fix64
+regular_func double2fix64
+    shimmable_table_tail_call SF_TABLE_FLOAT2FIX64 double2fix64_shim
+
+double_section double2ufix64
+regular_func double2ufix64
+    shimmable_table_tail_call SF_TABLE_FLOAT2UFIX64 double2ufix64_shim
+
+double_section double2fix
+regular_func double2fix
+    shimmable_table_tail_call SF_TABLE_FLOAT2FIX double2fix_shim
+
+double_section double2ufix
+regular_func double2ufix
+    shimmable_table_tail_call SF_TABLE_FLOAT2UFIX double2ufix_shim
+
+double_wrapper_section __aeabi_d2f
+1:
+#if PICO_DOUBLE_PROPAGATE_NANS
+    // copy sign bit and 23 NAN id bits into sign bit and significant id bits, also set high id bit
+
+    lsrs r0, #30
+    lsls r2, r1, #12
+    lsrs r2, #9
+    asrs r1, #22
+    lsls r1, #22
+    orrs r0, r1
+    orrs r0, r2
+    bx lr
+#endif
+wrapper_func __aeabi_d2f
+#if PICO_DOUBLE_PROPAGATE_NANS
+    movs r3, #1
+    lsls r3, #21
+    lsls r2, r1, #1
+    adds r2, r3
+    bhi 1b
+#endif
+    // note double->float in double table at same index as float->double in double table
+    shimmable_table_tail_call SF_TABLE_FLOAT2DOUBLE double2float_shim
+
+double_wrapper_section srqt
+wrapper_func_d1 sqrt
+    shimmable_table_tail_call SF_TABLE_FSQRT dsqrt_shim
+
+double_wrapper_section sincostan_remainder
+regular_func sincostan_remainder
+    ldr r2, =0x54442D18 // 2 * M_PI
+    ldr r3, =0x401921FB
+    push {lr}
+    // note remainder only uses the divider thru integer divider functions
+    // which save and restore themselves
+    bl remainder
+    pop {pc}
+
+double_wrapper_section cos
+#don't use _d1 as we're doing a range check anyway and infinites/nans are bigger than 1024
+wrapper_func cos
+    // rom version only works for -1024 < angle < 1024
+    lsls r2, r1, #2
+    bcc 1f
+    lsrs r2, #22
+    cmp r2, #9
+    bge 2f
+1:
+    shimmable_table_tail_call SF_TABLE_FCOS dcos_shim
+2:
+#if PICO_DOUBLE_PROPAGATE_NANS
+    lsls r2, r1, #1
+    asrs r2, #21
+    adds r2, #1
+    bne 3f
+    // infinite to nan
+    movs r2, #1
+    lsls r2, #19
+    orrs r1, r2
+    bx lr
+3:
+#endif
+    push {lr}
+    bl sincostan_remainder
+    pop {r2}
+    mov lr, r2
+    b 1b
+
+double_wrapper_section sin
+#don't use _d1 as we're doing a range check anyway and infinites/nans are bigger than 1024
+wrapper_func sin
+    // rom version only works for -1024 < angle < 1024
+    lsls r2, r1, #2
+    bcc 1f
+    lsrs r2, #22
+    cmp r2, #9
+    bge 2f
+1:
+    shimmable_table_tail_call SF_TABLE_FSIN dsin_shim
+2:
+#if PICO_DOUBLE_PROPAGATE_NANS
+    lsls r2, r1, #1
+    asrs r2, #21
+    adds r2, #1
+    bne 3f
+    // infinite to nan
+    movs r2, #1
+    lsls r2, #19
+    orrs r1, r2
+    bx lr
+3:
+#endif
+    push {lr}
+    bl sincostan_remainder
+    pop {r2}
+    mov lr, r2
+    b 1b
+
+double_wrapper_section sincos
+    // out of line remainder code for abs(angle)>=1024
+2:
+#if PICO_DOUBLE_PROPAGATE_NANS
+    lsls r2, r1, #1
+    asrs r2, #21
+    adds r2, #1
+    bne 3f
+    // infinite to nan
+    movs r2, #1
+    lsls r2, #19
+    orrs r1, r2
+    pop {r4-r5}
+    stmia r4!, {r0, r1}
+    stmia r5!, {r0, r1}
+    pop {r4, r5, pc}
+3:
+#endif
+    push {lr}
+    bl sincostan_remainder
+    pop {r2}
+    mov lr, r2
+    b 1f // continue with sincos
+
+wrapper_func sincos
+    push {r2-r5, lr}
+    // rom version only works for -1024 < angle < 1024
+    lsls r2, r1, #2
+    bcc 1f
+    lsrs r2, #22
+    cmp r2, #9
+    bge 2b
+1:
+    bl 2f // call the shim
+    pop {r4-r5}
+    stmia r4!, {r0, r1}
+    stmia r5!, {r2, r3}
+    pop {r4, r5, pc}
+
+2:
+    shimmable_table_tail_call SF_TABLE_V3_FSINCOS sincos_shim_bootstrap
+
+.thumb_func
+sincos_shim_bootstrap:
+    push {r2, r3, r4}
+    movs r3, #0x13
+    ldrb r3, [r3]
+#if PICO_DOUBLE_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
+    cmp r3, #1
+    bne 1f
+    ldr r3, =dsincos_shim
+    b 2f
+#endif
+1:
+    ldr r3, =dsincos_shim_v2
+2:
+    ldr r2, =sd_table
+    str r3, [r2, #SF_TABLE_V3_FSINCOS]
+    str r3, [sp, #8]
+    pop {r2, r3, pc}
+.thumb_func
+dsincos_shim_v2:
+     push {r4-r7,r14}
+     bl push_r8_r11
+     bl v2_rom_dsincos_internal
+     mov r12,r0                    @ save ε
+     bl v2_rom_dcos_finish
+     push {r0,r1}
+     mov r0,r12
+     bl v2_rom_dsin_finish
+     pop {r2,r3}
+     bl pop_r8_r11
+     pop {r4-r7,r15}
+.thumb_func
+v2_rom_dsincos_internal:
+    push {r0, lr}
+    ldr r0, =0x3855
+    str r0, [sp, #4]
+    pop {r0, pc}
+.thumb_func
+v2_rom_dcos_finish:
+    push {r0, r1}
+    ldr r0, =0x389d
+    str r0, [sp, #4]
+    pop {r0, pc}
+.thumb_func
+v2_rom_dsin_finish:
+    push {r0, r1}
+    ldr r0, =0x38d9
+    str r0, [sp, #4]
+    pop {r0, pc}
+
+double_wrapper_section tan
+#don't use _d1 as we're doing a range check anyway and infinites/nans are bigger than 1024
+wrapper_func tan
+    // rom version only works for -1024 < angle < 1024
+    lsls r2, r1, #2
+    bcc dtan_in_range
+    lsrs r2, #22
+    cmp r2, #9
+    bge dtan_angle_out_of_range
+dtan_in_range:
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
+    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
+    mov ip, r2
+    ldr r2, =(SIO_BASE)
+    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
+    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
+    bcs dtan_save_state
+    mov r2, ip
+#else
+    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
+    push {r4, lr}
+    mrs r4, PRIMASK
+    cpsid i
+    bl dtan_shim_call
+    msr PRIMASK, r4
+    pop {r4, pc}
+#endif
+dtan_shim_call:
+    shimmable_table_tail_call SF_TABLE_FTAN dtan_shim
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
+dtan_save_state:
+    ldr r2, =(SIO_BASE)
+    save_div_state_and_lr
+    mov r2, ip
+    bl dtan_shim_call
+    ldr r2, =(SIO_BASE)
+    restore_div_state_and_return
+#endif
+dtan_angle_out_of_range:
+#if PICO_DOUBLE_PROPAGATE_NANS
+    lsls r2, r1, #1
+    asrs r2, #21
+    adds r2, #1
+    bne 3f
+    // infinite to nan
+    movs r2, #1
+    lsls r2, #19
+    orrs r1, r2
+    bx lr
+3:
+#endif
+    push {lr}
+    bl sincostan_remainder
+    pop {r2}
+    mov lr, r2
+    b dtan_in_range
+
+double_wrapper_section atan2
+wrapper_func_d2 atan2
+    shimmable_table_tail_call SF_TABLE_FATAN2 datan2_shim
+
+double_wrapper_section exp
+wrapper_func_d1 exp
+    shimmable_table_tail_call SF_TABLE_FEXP dexp_shim
+
+double_wrapper_section log
+wrapper_func_d1 log
+    shimmable_table_tail_call SF_TABLE_FLN dln_shim
+
+#endif
--- a/lib/main/pico-sdk/rp2_common/pico_double/double_conv_m33.S
+++ b/lib/main/pico-sdk/rp2_common/pico_double/double_conv_m33.S
@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include "pico/asm_helper.S"
+#if HAS_DOUBLE_COPROCESSOR
+
+pico_default_asm_setup
+
+.macro double_section name
+#if PICO_DOUBLE_IN_RAM
+.section RAM_SECTION_NAME(\name), "ax"
+#else
+.section SECTION_NAME(\name), "ax"
+#endif
+.endm
+
+.macro double_wrapper_section func
+double_section WRAPPER_FUNC_NAME(\func)
+.endm
+
+double_wrapper_section conv_tod
+
+@ convert int64 to double, rounding
+wrapper_func __aeabi_l2d
+regular_func int642double
+ movs r2,#0       @ fall through
+@ convert unsigned 64-bit fix to double, rounding; number of r0:r1 bits after point in r2
+regular_func fix642double
+ cmp r1,#0
+ bge 10f @ positive? can use unsigned code
+ rsbs r0,#0
+ sbc r1,r1,r1,lsl#1 @ make positive
+ cbz r1,7f @ high word is zero?
+ clz r3,r1
+ subs r3,#11
+ bmi 2f
+ rsbs r12,r3,#32
+ lsrs r12,r0,r12
+ lsls r0,r3
+ lsls r1,r3
+ orrs r1,r1,r12
+ add r2,r2,r3
+ rsbs r2,#0
+ add r2,#0x3ff+19+32
+ add r1,r1,r2,lsl#20 @ insert exponent
+ orr r1,#0x80000000
+ mov r3,0x7fe
+ cmp r2,r3
+ it lo @ over/underflow?
+ bxlo r14
+ b 3f
+7:
+ mov r1,r2
+ b fix2double_neg
+2:
+ add r3,#33
+ lsls r12,r0,r3 @ rounding bit in carry, sticky bits in Z
+ sub r3,#1
+ lsl r12,r1,r3
+ rsb r3,#32
+ lsr r0,r3
+ lsr r1,r3
+ orr r0,r0,r12
+@ push {r14}
+@ bl dumpreg
+@ pop {r14}
+ sub r2,r3,r2
+ add r2,#0x3ff+19+32
+ beq 4f @ potential rounding tie?
+ adcs r0,r0,#0
+5:
+ adc r1,r1,r2,lsl#20 @ insert exponent, add rounding
+ orr r1,#0x80000000
+ mov r3,0x7fe
+ cmp r2,r3
+ it lo
+ bxlo r14
+@ over/underflow?
+3:
+ mov r1,#0
+ it ge
+ movtge r1,#0x7ff0 @ overflow
+ mov r0,#0
+ bx r14
+1:
+ movs r1,#0
+ bx r14
+4:
+ bcc 5b @ not a rounding tie after all
+ adcs r0,r0,#0
+ bic r0,r0,#1 @ force to even
+ b 5b
+
+@ convert uint64 to double, rounding
+wrapper_func __aeabi_ul2d
+regular_func uint642double
+ movs r2,#0       @ fall through
+@ convert unsigned 64-bit fix to double, rounding; number of r0:r1 bits after point in r2
+regular_func ufix642double
+10:
+ cbz r1,7f @ high word zero?
+ clz r3,r1
+ subs r3,#11
+ bmi 2f
+ rsbs r12,r3,#32
+ lsrs r12,r0,r12
+ lsls r0,r3
+ lsls r1,r3
+ orrs r1,r1,r12
+ add r2,r2,r3
+ rsbs r2,#0
+ add r2,#0x3ff+19+32
+ add r1,r1,r2,lsl#20 @ insert exponent
+ mov r3,0x7fe
+ cmp r2,r3
+ it lo @ over/underflow?
+ bxlo r14
+ b 3f
+7:
+ mov r1,r2
+ b ufix2double
+2:
+ add r3,#33
+ lsls r12,r0,r3 @ rounding bit in carry, sticky bits in Z
+ sub r3,#1
+ lsl r12,r1,r3
+ rsb r3,#32
+ lsr r0,r3
+ lsr r1,r3
+ orr r0,r0,r12
+@ push {r14}
+@ bl dumpreg
+@ pop {r14}
+ sub r2,r3,r2
+ add r2,#0x3ff+19+32
+ beq 4f @ potential rounding tie?
+ adcs r0,r0,#0
+5:
+ adc r1,r1,r2,lsl#20 @ insert exponent, add rounding
+ mov r3,0x7fe
+ cmp r2,r3
+ it lo
+ bxlo r14
+@ over/underflow?
+3:
+ mov r1,#0
+ it ge
+ movtge r1,#0x7ff0 @ overflow
+ mov r0,#0
+ bx r14
+1:
+ movs r1,#0
+ bx r14
+4:
+ bcc 5b @ not a rounding tie after all
+ adcs r0,r0,#0
+ bic r0,r0,#1 @ force to even
+ b 5b
+
+regular_func fix2double
+ cmp r0,#0
+ bge ufix2double @ positive? can use unsigned code
+ rsbs r0,#0 @ make positive
+fix2double_neg:
+ clz r3,r0
+ subs r3,#11
+ bmi 2f
+ lsls r0,r3
+ add r2,r1,r3
+ rsbs r2,#0
+ add r2,#0x3ff+19
+ add r1,r0,r2,lsl#20 @ insert exponent
+ orr r1,#0x80000000
+ mov r0,#0
+ mov r3,0x7fe
+ cmp r2,r3
+ it lo @ over/underflow?
+ bxlo r14
+ b 3f
+2:
+ rsb r3,#0
+ lsrs r12,r0,r3
+ rsb r2,r3,#32
+ lsls r0,r0,r2
+@ push {r14}
+@ bl dumpreg
+@ pop {r14}
+ sub r2,r3,r1
+ add r2,#0x3ff+19
+ add r1,r12,r2,lsl#20 @ insert exponent
+ orr r1,#0x80000000
+ mov r3,0x7fe
+ cmp r2,r3
+ it lo
+ bxlo r14
+@ over/underflow?
+3:
+ mov r1,#0x80000000
+ it ge
+ movtge r1,#0xfff0 @ overflow
+ mov r0,#0
+ bx r14
+1:
+ movs r1,#0
+ bx r14
+
+regular_func ufix2double
+ cbz r0,1f @ zero? return it
+ clz r3,r0
+ subs r3,#11
+ bmi 2f
+ lsls r0,r3
+ add r2,r1,r3
+ rsbs r2,#0
+ add r2,#0x3ff+19
+ add r1,r0,r2,lsl#20 @ insert exponent
+ mov r0,#0
+ mov r3,0x7fe
+ cmp r2,r3
+ it lo @ over/underflow?
+ bxlo r14
+ b 3f
+2:
+ rsbs r3,#0
+ lsrs r12,r0,r3
+ rsb r2,r3,#32
+ lsls r0,r0,r2
+@ push {r14}
+@ bl dumpreg
+@ pop {r14}
+ sub r2,r3,r1
+ add r2,#0x3ff+19
+ add r1,r12,r2,lsl#20 @ insert exponent
+ mov r3,0x7fe
+ cmp r2,r3
+ it lo
+ bxlo r14
+@ over/underflow?
+3:
+ mov r1,#0
+ it ge
+ movtge r1,#0x7ff0 @ overflow
+ mov r0,#0
+ bx r14
+1:
+ movs r1,#0
+ bx r14
+
+double_wrapper_section conv_dtoi64
+
+@ convert double to signed int64, rounding towards 0, clamping
+wrapper_func __aeabi_d2lz
+regular_func double2int64_z
+ movs r2,#0      @ fall through
+@ convert double in r0:r1 to signed fixed point in r0:r1, clamping
+regular_func double2fix64_z
+ sub r2,#0x3ff+52-1 @ remove exponent bias, compensate for mantissa length
+ asrs r12,r1,#20 @ sign and exponent
+ sub r3,r12,#1
+ sub r1,r1,r3,lsl#20 @ install implied 1, clear exponent
+ lsls r3,#21
+@ push {r14}
+@ bl dumpreg
+@ pop {r14}
+ cmp r3,#0xffc00000
+ bhs 1f @ 0, ∞/NaN?
+ adds r2,r2,r3,lsr#21 @ offset exponent by fix precision; r1 is now required left shift
+ bmi 4f @ actually a right shift?
+ cmp r2,#11 @ overflow?
+ bge 5f
+ lsls r1,r2
+ rsbs r3,r2,#32
+ lsrs r3,r0,r3
+ orrs r1,r1,r3
+ lsls r0,r2
+ cmp r12,#0
+ it ge
+ bxge r14
+ rsbs r0,#0
+ sbc r1,r1,r1,lsl#1
+ bx r14
+4:
+ adds r2,#32
+ ble 6f @ result fits in low word?
+ lsl r3,r1,r2
+ rsbs r2,#32
+ lsrs r1,r2
+ lsrs r0,r2
+ orrs r0,r0,r3
+ cmp r12,#0
+ it ge
+ bxge r14
+ rsbs r0,#0
+ sbc r1,r1,r1,lsl#1
+ bx r14
+6:
+ rsbs r2,#0
+ usat r2,#5,r2 @ underflow to 0
+ lsrs r0,r1,r2
+ movs r1,#0
+ cmp r12,#0
+ it ge
+ bxge r14
+ rsbs r0,#0
+ sbc r1,r1,r1,lsl#1
+ bx r14
+1:
+ beq 3f @ ±∞/±NaN?
+2:
+ movs r0,#0 @ ±0: return 0
+ movs r1,#0
+ bx r14
+3:
+ orrs r1,r0,r1,lsl#12 @ mantissa field
+ it ne @ NaN?
+ movne r12,#0 @ treat NaNs as +∞
+@ here original argument was ±Inf or we have under/overflow
+5:
+ mvn r1,#0x80000000
+ add r1,r1,r12,lsr#31 @ so -Inf → 0x80000000, +Inf → 0x7fffffff
+ mvn r0,r12,asr#31
+ bx r14
+
+double_wrapper_section conv_dtoui64
+
+@ convert double to unsigned int64, rounding towards -Inf, clamping
+wrapper_func __aeabi_d2ulz
+regular_func double2uint64
+regular_func double2uint64_z
+ movs r2,#0      @ fall through
+@ convert double in r0:r1 to unsigned fixed point in r0:r1, clamping
+regular_func double2ufix64
+regular_func double2ufix64_z
+ subw r2,r2,#0x3ff+52-1 @ remove exponent bias, compensate for mantissa length
+ asrs r3,r1,#20 @ sign and exponent
+ sub r3,#1
+ sub r1,r1,r3,lsl#20 @ install implied 1, clear exponent and sign
+ bmi 7f @ argument negative?
+ movw r12,#0x7fe
+ cmp r3,r12
+ bhs 1f @ 0, ∞/NaN?
+ adds r2,r3 @ offset exponent by fix precision; r2 is now required left shift
+ bmi 2f @ actually a right shift?
+ cmp r2,#12 @ overflow?
+ bge 4f
+ lsls r1,r2
+ rsbs r3,r2,#32
+ lsrs r3,r0,r3
+ lsls r0,r2
+ orrs r1,r1,r3
+ bx r14
+2:
+ adds r2,#32
+ ble 5f @ result fits in low word?
+ lsl r3,r1,r2
+ rsbs r2,#32
+ lsrs r1,r2
+ lsrs r0,r2
+ orrs r0,r0,r3
+ bx r14
+5:
+ rsbs r2,#0
+ usat r2,#5,r2 @ underflow to 0
+ lsrs r0,r1,r2
+ movs r1,#0
+ bx r14
+1:
+ bhi 3f @ 0? return 0
+4:
+@ here overflow has occurred
+ mvn r0,#0
+ mvn r1,#0
+ bx r14
+7:
+ cmp r3,#0xfffffffe
+ bne 3f @ -0? return 0
+ orrs r2,r0,r1,lsl#12 @ mantissa field
+ bne 4b
+3:
+ movs r0,#0
+ movs r1,#0
+ bx r14
+
+#endif
--- a/lib/main/pico-sdk/rp2_common/pico_double/double_fma_dcp.S
+++ b/lib/main/pico-sdk/rp2_common/pico_double/double_fma_dcp.S
@ -0,0 +1,608 @@
+/*
+ * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include "pico/asm_helper.S"
+
+#if !HAS_DOUBLE_COPROCESSOR
+#error attempt to compile double_fma_rp2350 when there is no DCP
+#else
+
+#include "hardware/dcp_instr.inc.S"
+#include "hardware/dcp_canned.inc.S"
+
+pico_default_asm_setup
+
+// factor out save/restore (there is a copy in float code)
+
+.macro double_section name
+#if PICO_DOUBLE_IN_RAM
+.section RAM_SECTION_NAME(\name), "ax"
+#else
+.section SECTION_NAME(\name), "ax"
+#endif
+.endm
+
+.macro double_wrapper_section func
+double_section WRAPPER_FUNC_NAME(\func)
+.endm
+
+// ============== STATE SAVE AND RESTORE ===============
+
+.macro saving_func_return
+  bx lr
+.endm
+
+double_section __rp2350_dcp_engaged_state_save_restore_copy
+.thumb_func
+__dcp_save_state:
+  sub sp, #24
+  push {r0, r1}
+  // do save here
+  PXMD r0, r1
+  strd r0, r1, [sp, #8 + 0]
+  PYMD r0, r1
+  strd r0, r1, [sp, #8 + 8]
+  REFD r0, r1
+  strd r0, r1, [sp, #8 + 16]
+  pop {r0, r1}
+  blx lr
+  // <- wrapped function returns here
+  // fall through into restore:
+.thumb_func
+__dcp_restore_state:
+  // do restore here
+  pop {r12, r14}
+  WXMD r12, r14
+  pop {r12, r14}
+  WYMD r12, r14
+  pop {r12, r14}
+  WEFD r12, r14
+  pop {pc}
+
+double_wrapper_section __dfma
+@ cf saving_func macro: but here we need to record the SP before the state save possibly changes it
+1:
+ push {lr}              // 16-bit instruction
+ bl __dcp_save_state    // 32-bit instruction
+ b 1f                   // 16-bit instruction
+
+@ compute mn+a with full intermediate precision
+@ r0:r1 m
+@ r2:r3 n
+@ [r13,#0] a
+wrapper_func fma
+ mov r12,sp                  @ save the SP
+ PCMP apsr_nzcv              @ test the engaged flag
+ bmi 1b
+1:
+ push {r4-r8,r14}
+ ldrd r4,r5,[r12,#0]         @ fetch a using original SP
+ ubfx r7,r1,#20,#11          @ r7=em
+ ubfx r8,r3,#20,#11          @ r8=en
+ add r8,r7,r8                @ em+en
+
+ eors r6,r1,r3               @ get sign of mn
+ eors r6,r6,r5               @ set N if mn has opposite sign to a, i.e. if the operation is essentially a subtraction
+ WXUP r4,r5                  @ write a to coprocessor to get its classification
+ PEFD r14,r12                @ r14=fa
+ WXUP r0,r1                  @ write m and n to coprocessor to get their classifications
+ WYUP r2,r3
+ PEFD r6,r12                 @ r6=fm, r12=fn, r14=fa
+ orr r14,r14,r6
+ orr r14,r14,r12             @ OR of all the classification flags, so we can check if any are zero/Inf/NaN
+
+ RXMS r3,r6,0                @ we will almost always need the full product so compute it here (cf dmul macro)
+ RYMS r7,r12,0
+ umull r0,r1,r3,r7
+ mov r2,#0                   @ seems to be no 16-bit instruction which zeros a register without affecting the flags
+ umlal r1,r2,r3,r12
+ umlal r1,r2,r6,r7
+ mov r3,#0
+ umlal r2,r3,r6,r12          @ r0:r1:r2:r3: full product mn Q124 1≤mn<4
+
+ bmi 50f                     @ mn has opposite sign to a so operation is essentially a subtraction
+
+@ ======================== ADDITION PATH ========================
+
+ tst r14,#0x70000000         @ were any of the arguments zero/inf/NaN?
+ bne 90f                     @ then use mla path which gives the correct result in all these cases
+ ubfx r14,r5,#20,#11         @ r14=ea
+@ here all operands are finite and non-zero
+@ r0:r1:r2:r3: full product mn Q124 1≤mn<4
+@ r4:r5 a IEEE packed
+@ r8: em+en [biased +0x3ff*2]
+@ r14: ea [biased +0x3ff]
+ subw r7,r8,#0x3fd
+ subs r7,r7,r14              @ em+en-ea+2 (debiased)
+ blt 80f                     @ branch if |a| is big compared to |mn|, more precisely if ea-(em+en)≥3 so e.g. if ea=0 (hence 1≤a<2) then em+en≤-3 and mn<4.2¯³=1/2
+
+@ ======================== ADDITION PATH, RESULT HAS COMPARABLE MAGNITUDE TO mn  ========================
+
+@ here |mn| is big compared to |a|; e.g. if em+en=0 (so 1≤mn<4) then ea≤2 and a<8
+ movs r8,#1
+ bfi r5,r8,#20,#12           @ insert implied 1 in a
+ rsbs r7,r7,#74              @ shift up ≤74 (can be negative) that will be required for a (Q52) to align with mn (Q124, ending in 20 zeros)
+@ now add (shifted) a into mn, preserving flags
+ and r8,r7,#0x1f             @ k=shift mod 32
+ mov r12,#1
+ lsl r12,r12,r8              @ 2^k
+ umull r5,r6,r5,r12          @ shift up high word: r4:r5:r6 is now a_lo + 2^k a_hi
+ sub r12,#1                  @ 2^k-1
+ umlal r4,r5,r4,r12          @ shift up low word, adding in: r4:r5:r6 is now (a_lo + 2^k a_hi) + (2^k-1) a_lo = 2^k (a_lo + a_hi) = a shifted up by k
+ bmi 91f                     @ use flags: will a be shifted down?
+ cmp r7,#64                  @ shift up by two more words?
+ bge 92f
+ cmp r7,#32                  @ shift up by one more word?
+ bge 93f
+ adds r0,r0,r4               @ no more word shifts
+ adcs r1,r1,r5
+ adcs r2,r2,r6
+ adcs r3,r3,#0               @ r0:r1:r2:r3: mn + a (cf dmul macro)
+ WXMS r0,r1                  @ write sticky bits
+ WXMO r2,r3                  @ write sticky+result bits
+ NRDD                        @ as dmul macro tail: exponent computed in coprocessor is correct
+ RDDM r0,r1
+// todo optimize this based on final decision on saving_func_entry
+  pop {r4-r8,lr}
+  saving_func_return
+
+93:
+ adds r1,r1,r4
+ adcs r2,r2,r5
+ adcs r3,r3,r6               @ r0:r1:r2:r3: mn + (a<<32)
+ WXMS r0,r1                  @ write sticky bits
+ WXMO r2,r3                  @ write sticky+result bits
+ NRDD
+ RDDM r0,r1
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+92:
+ adds r2,r2,r4
+ adcs r3,r3,r5               @ r0:r1:r2:r3: mn + (a<<64); note this cannot overflow as total shift was at most 74 (see above)
+ WXMS r0,r1                  @ write sticky bits
+ WXMO r2,r3                  @ write sticky+result bits
+ NRDD
+ RDDM r0,r1
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+91:                          @ case where a (Q52) is shifted down relative to mn (Q124); the mod 32 part of the shift of a has already been done
+@ r0:r1:r2:r3: mn
+@ r4:r5:r6: a
+@ r7: alignment shift required (negative)
+ cmn r7,#32                  @ shift down one word?
+ bge 94f
+ cmn r7,#64                  @ shift down two words?
+ bge 95f
+@ here a is shifted entirely below the bottom of m
+ orr r0,r0,#1                @ a is non-zero so ensure we set the sticky bit
+ WXMS r0,r1                  @ write sticky bits
+ WXMO r2,r3                  @ write sticky+result bits
+ NRDD
+ RDDM r0,r1
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+94:
+ adds r0,r0,r5               @ one word shift down
+ adcs r1,r1,r6
+ adcs r2,r2,#0
+ adcs r3,r3,#0
+ orr r0,r0,r4                @ contribution from a to sticky bits
+ WXMS r0,r1                  @ write sticky bits
+ WXMO r2,r3                  @ write sticky+result bits
+ NRDD
+ RDDM r0,r1
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+95:
+ adds r0,r0,r6               @ two word shift down
+ adcs r1,r1,#0
+ adcs r2,r2,#0
+ adcs r3,r3,#0
+ orr r0,r0,r4                @ contribution from a to sticky bits
+ orr r0,r0,r5
+ WXMS r0,r1                  @ write sticky bits
+ WXMO r2,r3                  @ write sticky+result bits
+ NRDD
+ RDDM r0,r1
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+@ ======================== ADDITION PATH, RESULT HAS COMPARABLE MAGNITUDE TO a  ========================
+
+80:
+@ here |mn|<~|a|
+@ r0:r1:r2:r3: mn Q124
+@ r4:r5 a IEEE packed
+@ r7: -(shift down required to align mn with a), guaranteed negative
+@ r8: em+en [biased +0x3ff*2]
+@ r14: ea [biased +0x3ff]
+ tst r3,#0x20000000
+ bne 1f @ 2≤mn<4?
+ adds r2,r2,r2               @ normalise so mn is 2..4 Q124; note that the contents of r0 and r1 are always destined for the sticky bit in this path
+ adcs r3,r3,r3
+ subs r7,r7,#1               @ correction to alignment shift
+1:
+@ now we construct an IEEE packed value in r2:r3 such that adding it to r4:r5 gives the correct final result
+@ observe that the exponent of this constructed value will be at least two less than that of a (by the "|a| is big compared to |mn|" test above)
+@ so the alignment shift in the final addition will be by at least two places; thus we can use bit 0 of the constructed
+@ value as a sticky bit, and we still have one bit in hand for rounding
+ subs r7,r7,#2               @ now r7 < -2
+ orr r0,r0,r2,lsl#23         @ shift r2:r3 down 9 places, ORing excess into sticky bits
+ lsrs r2,r2,#9
+ orr r2,r2,r3,lsl#23
+ lsrs r3,r3,#9
+ orrs r0,r0,r1
+ it ne
+ orrne r2,r2,#1              @ sticky bit from bottom 64 bits of mn as shifted
+@ r2:r3 mn 2..4 Q51, i.e. 1..2 Q52
+@ r2b0 holds sticky bit; note that for alignment with a in r4:r5, r2:r3 will be shifted down at least one place
+
+ lsrs r6,r5,#31              @ get sign of a (which in this path is the same as the sign of mn, and of the result)
+ orr r3,r3,r6,lsl#31         @ set sign in mn
+
+ adds r14,r7,r14             @ get exponent for mn relative to a; note this can go negative
+ add r3,r3,r14,lsl#20        @ note that "implied" 1 is present in r3, giving an offset of 1 in the exponent
+ bmi 1f                      @ negative? then we have just constructed a denormal (or less) and the addition will give an incorrect result
+ dcp_dadd_m r0,r1,r2,r3,r4,r5
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+1:
+@ compare with similar code in subtraction path: here we cannot underflow
+ cmn r7,#64                  @ if the alignment shift for mn is very large then the result is just a
+ ble 82f
+ add r3,r3,#0x40000000       @ ea cannot be very large (as adding r7 made it negative), so safe to add 1024 to exponents of both a and mn
+ add r5,r5,#0x40000000
+ dcp_dadd_m r0,r1,r2,r3,r4,r5
+ sub r1,r1,#0x40000000       @ fix exponent
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+90:
+@ dcp_dmul_m tail then dadd ("mla path")
+ WXMS r0,r1                  @ write sticky bits
+ WXMO r2,r3                  @ write sticky+result bits
+ NRDD
+ RDDM r0,r1
+ dcp_dadd_m r0,r1,r0,r1,r4,r5
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+82:                          @ |mn| is very small compared to |a|, so result is a
+ RDDM r0,r1                  @ clear the engaged flag
+ movs r0,r4
+ movs r1,r5
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+@ ======================== SUBTRACTION PATH ========================
+
+50:
+ tst r14,#0x70000000         @ were any of the arguments zero/inf/NaN?
+ bne 90b @ then use mla path which gives the correct result in all these cases
+ ubfx r14,r5,#20,#11         @ r14=ea
+@ now all operands are finite and non-zero
+@ r0:r1:r2:r3: full product mn Q124 1≤mn<4
+@ r4:r5 a IEEE packed (including sign bit; sign of mn is opposite as we are in the subtraction path)
+@ r8: em+en [+0x3ff*2]
+@ r14: ea [+0x3ff]
+ subw r8,r8,#0x3fc           @ em+en+3
+ subs r7,r8,r14              @ em+en-ea+3 (debiased)
+ blt 80f                     @ branch if |a| is big compared to |mn|, more precisely if ea-(em+en)≥4 so e.g. if ea=0 then em+en≤-4 and mn<4.2^-4=1/4
+ beq 94f                     @ branch if ea-(em+en)=3 e.g. if ea=0 then em+en=-3 and 1/8=2^-3≤mn<4.2^-3=1/2
+@ in this branch, if e.g. em+en=0 (so 1≤mn<4) then ea≤2 and a<8
+ rsbs r7,r7,#75              @ 75-(em+en-ea+3) = 72-(em+en-ea), shift up 0..74 that will be required for a (Q52) to align with mn (Q124, ending in 20 zeros)
+ mvn r14,r5,lsr#31           @ save complement of sign of a
+@ subtract (shifted) a from mn
+ and r6,r7,#0x1f             @ k=shift mod 32
+ mov r12,#1
+ bfi r5,r12,#20,#12          @ insert implied 1 in a
+ lsl r12,r12,r6              @ 2^k
+ umull r5,r6,r5,r12
+ sub r12,#1
+ umlal r4,r5,r4,r12          @ shift a up by shift amount mod 32 (see comment in addition path)
+@ r4:r5:r6: a shifted up by k=shift mod 32
+ bmi 91f                     @ will a be shifted down?
+ cmp r7,#64                  @ shift up by two more words?
+ bge 92f
+ cmp r7,#32                  @ shift up by one more word?
+ bge 93f
+ subs r0,r0,r4               @ no more word shifts; this cannot go negative or have bad cancellation
+ sbcs r1,r1,r5
+ sbcs r2,r2,r6
+ sbcs r3,r3,#0               @ r0:r1:r2:r3: mn - a (cf dmul macro)
+ WXMS r0,r1                  @ write sticky bits
+ WXMO r2,r3                  @ write sticky+result bits
+ NRDD                        @ as dmul macro tail: exponent and sign computed in coprocessor is correct
+ RDDM r0,r1
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+94:
+@ here if ea-(em+en)=3 e.g. if ea=0 then em+en=-3 and 1/8=2^-3≤mn<4.2^-3=1/2
+@ r0:r1:r2:r3: full product mn Q124 1≤mn<4
+@ r4:r5 a IEEE packed (including sign bit; sign of mn is opposite as we are in the subtraction path)
+ lsls r5,r5,#11              @ convert a to mantissa Q63 in r4:r5
+ orrs r5,r5,r4,lsr#21
+ lsls r4,r4,#11
+ orrs r5,r5,0x80000000       @ implied 1
+ movs r6,#0
+ subs r0,r6,r0               @ compute |a|-|mn|
+ sbcs r6,r6,r1
+ sbcs r4,r4,r2
+ sbcs r5,r5,r3
+ WXMS r0,r6                  @ write sticky bits
+ WXMO r4,r5                  @ write sticky+result bits
+ NRDD
+ RDDM r0,r1
+ eor r1,r1,0x80000000        @ sign of result is opposite to that of product as yielded by coprocessor
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+93:
+ subs r1,r1,r4               @ shifting a up by one word: this cannot go negative or have bad cancellation
+ sbcs r2,r2,r5
+ sbcs r3,r3,r6
+ WXMS r0,r1                  @ write sticky bits
+ WXMO r2,r3                  @ write sticky+result bits
+ NRDD
+ RDDM r0,r1
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+92:
+ subs r2,r2,r4               @ shifting a up by two words: this /can/ go negative or have bad cancellation
+ sbcs r3,r3,r5
+ cmp r3,#0x01000000          @ check we have at least 57 bits of product so that dmul tail will round correctly (this test is slightly conservative - 55 needed?)
+ blt 1f                      @ also trap case where result is negative
+ WXMS r0,r1                  @ write sticky bits
+ WXMO r2,r3                  @ write sticky+result bits
+ NRDD
+ RDDM r0,r1
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+@ heavy cancellation case
+@ r0:r1:r2:r3: result Q124, signed
+@ r8: em+en+3
+@ r14b0: save complement of sign of a
+1:
+ sub r8,r8,#1                @ em+en+2
+ RDDM r6,r7                  @ clear engaged flag
+ blo 2f                      @ if result is negative...
+ movs r6,#0                  @ ... negate it...
+ subs r0,r6,r0
+ sbcs r1,r6,r1
+ sbcs r2,r6,r2
+ sbcs r3,r6,r3
+ eor r14,r14,#1              @ ... and flip saved sign
+2:                           @ now normalise result
+ orrs r6,r2,r3               @ shift up by 64 possible?
+ bne 7f
+ movs r3,r1                  @ do it
+ movs r2,r0
+ movs r1,#0
+ movs r0,#0
+ sub r8,r8,#64               @ fix exponent
+7:
+ cmp r3,#0                   @ shift up by 32 possible?
+ bne 8f
+ movs r3,r2                  @ do it
+ movs r2,r1
+ movs r1,r0
+ movs r0,#0
+ sub r8,r8,#32
+8:
+ cmp r3,#0                   @ is result zero? return it
+ beq 9f
+ clz r6,r3                   @ k=amount of final shift
+ subs r8,r8,r6               @ final exponent
+ movs r7,#1
+ lsls r7,r7,r6               @ r7=2^k
+ muls r3,r3,r7
+ subs r7,r7,#1               @ 2^k-1
+ umlal r2,r3,r2,r7
+ umlal r1,r2,r1,r7
+ umlal r0,r1,r0,r7           @ r0:r1:r2:r3: normalised result
+ orrs r0,r0,r1               @ any sticky bits below top 64?
+ it ne
+ orrne r2,r2,#1              @ or into sticky bit
+ lsrs r0,r2,#11              @ align to mantissa position for IEEE format
+ lsrs r1,r3,#11
+ orr r0,r0,r3,lsl#21
+ lsls r2,r2,#22              @ rounding bit in C, sticky bit in ~Z
+ bcc 10f                     @ no rounding?
+ beq 11f                     @ rounding tie?
+ adcs r0,r0,#0               @ round up (C is set)
+ adcs r1,r1,#0
+ adds r8,r8,r1,lsr#20        @ candidate for exponent field
+ ble 12f                     @ underflow? overflow cannot occur here as the result is smaller in magnitude than a
+ bfi r1,r8,#20,#11           @ insert exponent
+ orr r1,r1,r14,lsl#31        @ or in sign
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+11:
+ adcs r0,r0,#0               @ round up as above
+ adcs r1,r1,#0
+ bic r0,r0,#1                @ to even
+ adds r8,r8,r1,lsr#20        @ candidate for exponent field
+ ble 12f                     @ underflow?
+ bfi r1,r8,#20,#11           @ insert exponent
+ orr r1,r1,r14,lsl#31        @ or in sign
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+10:
+ adds r8,r8,r1,lsr#20        @ candidate for exponent field
+ ble 12f                     @ underflow?
+ bfi r1,r8,#20,#11           @ insert exponent
+ orr r1,r1,r14,lsl#31        @ or in sign
+9:
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+12:
+ mov r1,r14,lsl#31           @ underflow: return signed zero
+ movs r0,#0
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+91:                          @ case where a (Q52) is shifted down relative to mn (Q124); the mod 32 part of the shift of a has already been done
+@ r0:r1:r2:r3: mn
+@ r4:r5:r6: a
+@ r7: alignment shift required (negative)
+ cmn r7,#32                  @ shift down one word?
+ bge 94f
+ cmn r7,#64                  @ shift down two words?
+ bge 95f
+@ here a is shifted entirely below the bottom of m
+ subs r0,r0,#1               @ subtract an epsilon (a is non-zero)
+ sbcs r1,r1,#0
+ sbcs r2,r2,#0
+ sbcs r3,r3,#0
+ orr r0,r0,#1                @ ensure the sticky bit is set (a is non-zero)
+ WXMS r0,r1                  @ write sticky bits
+ WXMO r2,r3                  @ write sticky+result bits
+ NRDD
+ RDDM r0,r1
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+94:
+ rsbs r4,r4,#0               @ one word shift down
+ sbcs r0,r0,r5
+ sbcs r1,r1,r6
+ sbcs r2,r2,#0
+ sbcs r3,r3,#0
+ orr r0,r0,r4                @ sticky bits
+ WXMS r0,r1                  @ write sticky bits
+ WXMO r2,r3                  @ write sticky+result bits
+ NRDD
+ RDDM r0,r1
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+95:
+ movs r7,#0                  @ two words shift down
+ subs r4,r7,r4
+ sbcs r5,r7,r5
+ sbcs r0,r0,r6
+ sbcs r1,r1,r7
+ sbcs r2,r2,r7
+ sbcs r3,r3,r7
+ orrs r0,r0,r4               @ sticky bits
+ orrs r0,r0,r5
+ WXMS r0,r1                  @ write sticky bits
+ WXMO r2,r3                  @ write sticky+result bits
+ NRDD
+ RDDM r0,r1
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+80:
+@ here |a| is big compared to |mn|, more precisely ea-(em+en)≥4 so e.g. if ea=0 then em+en≤-4 and mn<4.2^-4=1/4
+@ r0:r1:r2:r3: mn Q124
+@ r4:r5: a IEEE packed
+@ r7<0, em+en-ea+3 (debiased)
+@ r14: ea [+0x3ff]
+ lsrs r6,r3,#29
+ bne 1f                      @ 2≤mn<4?
+ adds r2,r2,r2               @ shift up one place
+ adcs r3,r3,r3
+ subs r7,r7,#1               @ fix exponent
+1:                           @ now r2:r3 is mn Q61, sticky bits in r0:r1
+ subs r7,r7,#3
+@ r7=emn-ea <-3
+ orr r0,r0,r2,lsl#23         @ gather sticky bits
+ lsrs r2,r2,#9               @ adjust mn to Q52 ready to create packed IEEE version of mn
+ orr r2,r2,r3,lsl#23
+ lsrs r3,r3,#9
+ orrs r0,r0,r1               @ or of all sticky bits
+ it ne
+ orrne r2,r2,#1              @ sticky bit from bottom 64 bits of mn
+
+ mvn r6,r5,lsr#31            @ complement of sign of a
+ orr r3,r3,r6,lsl#31         @ fix sign of mn so we do a subtraction
+
+ adds r14,r7,r14             @ this can go negative; r14 is now at most ea[+0x3ff]-4
+ add r3,r3,r14,lsl#20
+@ the exponent field in r2:r3 (mn) is now at most ea[+0x3ff]-3
+@ that means that in the dadd operation that follows, mn will be shifted down at least three places to align with a,
+@ and a post-normalisation shift up of at most one place will be needed
+@ therefore in the worst case r2b2 affects b0 of the result; r2b1 affects the rounding of the result; and r2b0 can be used as a sticky bit
+ bmi 1f                      @ did exponent go negative?
+
+ dcp_dadd_m r0,r1,r2,r3,r4,r5
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+1:
+ cmn r7,#64                  @ is mn being shifted well below the bottom of a?
+ ble 82b                     @ then result is just a
+ add r3,r3,#0x40000000       @ otherwise offset exponents by +1024
+ add r5,r5,#0x40000000
+ dcp_dadd_m r0,r1,r2,r3,r4,r5
+ ubfx r2,r1,#20,#11          @ get exponent
+ cmp r2,#0x400               @ too small?
+ itte ls
+ andls r1,r1,0x80000000      @ flush to signed zero
+ movls r0,#0
+ subhi r1,r1,#0x40000000     @ else fix exponent of result
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4-r8,lr}
+ saving_func_return
+
+
+double_wrapper_section __dmla
+@ cf saving_func macro: but here we need to record the SP before the state save possibly changes it
+1:
+ push {lr}              // 16-bit instruction
+ bl __dcp_save_state    // 32-bit instruction
+ b 1f                   // 16-bit instruction
+
+@ r0:r1 m
+@ r2:r3 n
+@ [r13,#0] a
+regular_func mla
+ mov r12,sp                  @ save the SP
+ PCMP apsr_nzcv              @ test the engaged flag
+ bmi 1b
+1:
+ push {r4,r5,r14}
+ dcp_dmul_m r0,r1,r0,r1,r2,r3,r0,r1,r2,r3,r4,r5,r14
+ ldrd r2,r3,[r12,#0]        @ fetch a using original SP
+ dcp_dadd_m r0,r1,r0,r1,r2,r3
+// todo optimize this based on final decision on saving_func_entry
+ pop {r4,r5,r14}
+ saving_func_return
+
+#endif
--- a/lib/main/pico-sdk/rp2_common/pico_double/double_init_rom_rp2040.c
+++ b/lib/main/pico-sdk/rp2_common/pico_double/double_init_rom_rp2040.c
@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <string.h>
+#include "pico/bootrom.h"
+#include "pico/bootrom/sf_table.h"
+
+// NOTE THIS FUNCTION TABLE IS NOT PUBLIC OR NECESSARILY COMPLETE...
+// IT IS ***NOT*** SAFE TO CALL THESE FUNCTION POINTERS FROM ARBITRARY CODE
+uint32_t sd_table[SF_TABLE_V2_SIZE / 2];
+
+#if !(PICO_DOUBLE_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED)
+static __attribute__((noreturn)) void missing_double_func_shim(void) {
+    panic("missing double function");
+}
+#endif
+extern void double_table_shim_on_use_helper(void);
+
+void __attribute__((weak)) *sf_clz_func;
+
+void __aeabi_double_init(void) {
+    int rom_version = rp2040_rom_version();
+#if PICO_DOUBLE_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED
+    if (rom_version == 1) {
+
+        // this is a little tricky.. we only want to pull in a shim if the corresponding function
+        // is called. to that end we include a SVC instruction with the table offset as the call number
+        // followed by the shim function pointer inside the actual wrapper function. that way if the wrapper
+        // function is garbage collected, so is the shim function.
+        //
+        // double_table_shim_on_use_helper expects this SVC instruction in the calling code soon after the address
+        // pointed to by IP and patches the double_table entry with the real shim the first time the function is called.
+        for(uint i=0; i<SF_TABLE_V2_SIZE/4; i++) {
+            sd_table[i] = (uintptr_t)double_table_shim_on_use_helper;
+        }
+    }
+#else
+    if (rom_version == 1) {
+        // opting for soft failure for now - you'll get a panic at runtime if you call any of the missing methods
+        for(uint i=0;i<SF_TABLE_V2_SIZE/4;i++) {
+            sd_table[i] = (uintptr_t)missing_double_func_shim;
+        }
+    }
+#endif
+    if (rom_version >= 2) {
+        void *rom_table_double = rom_data_lookup(rom_table_code('S', 'D'));
+        void *rom_table_float = rom_data_lookup(rom_table_code('S', 'F'));
+        assert(*((uint8_t *)(((void *)rom_table_float)-2)) * 4 >= SF_TABLE_V2_SIZE);
+        (void)rom_table_float;
+        memcpy(&sd_table, rom_table_double, SF_TABLE_V2_SIZE);
+        if (rom_version == 2) {
+#ifndef NDEBUG
+            if (*(uint16_t *)0x3854 != 0xb500 || // this is dsincos(_internal)
+
+                *(uint16_t *)0x38d8 != 0x4649 || // this is dsin_finish
+                *(uint16_t *)0x389c != 0x4659  // this is dcos_finish
+                    ) {
+                panic(NULL);
+            }
+#endif
+        }
+    }
+    if (rom_version < 3) {
+        // we use the unused entry for SINCOS
+        sd_table[SF_TABLE_V3_FSINCOS / 4] = (uintptr_t) double_table_shim_on_use_helper;
+    }
+
+    sf_clz_func = rom_func_lookup(ROM_FUNC_CLZ32);
+}
--- a/lib/main/pico-sdk/rp2_common/pico_double/double_math.c
+++ b/lib/main/pico-sdk/rp2_common/pico_double/double_math.c
@ -0,0 +1,626 @@
+/*
+ * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <math.h>
+#include "pico/double.h"
+
+// opened a separate issue https://github.com/raspberrypi/pico-sdk/issues/166 to deal with these warnings if at all
+GCC_Pragma("GCC diagnostic push")
+GCC_Pragma("GCC diagnostic ignored \"-Wconversion\"")
+GCC_Pragma("GCC diagnostic ignored \"-Wsign-conversion\"")
+
+typedef uint64_t ui64;
+typedef uint32_t ui32;
+typedef int64_t i64;
+
+#define PINF ( HUGE_VAL)
+#define MINF (-HUGE_VAL)
+#define PZERO (+0.0)
+#define MZERO (-0.0)
+
+
+#define PI       3.14159265358979323846
+#define LOG2     0.69314718055994530941
+// Unfortunately in double precision ln(10) is very close to half-way between to representable numbers
+#define LOG10    2.30258509299404568401
+#define LOG2E    1.44269504088896340737
+#define LOG10E   0.43429448190325182765
+#define ONETHIRD 0.33333333333333333333
+
+#define PIf       3.14159265358979323846f
+#define LOG2f     0.69314718055994530941f
+#define LOG2Ef    1.44269504088896340737f
+#define LOG10Ef   0.43429448190325182765f
+#define ONETHIRDf 0.33333333333333333333f
+
+#define DUNPACK(x,e,m) e=((x)>>52)&0x7ff,m=((x)&0x000fffffffffffffULL)|0x0010000000000000ULL
+#define DUNPACKS(x,s,e,m) s=((x)>>63),DUNPACK((x),(e),(m))
+
+typedef union {
+    double d;
+    ui64 ix;
+} double_ui64;
+
+static inline double ui642double(ui64 ix) {
+    double_ui64 tmp;
+    tmp.ix = ix;
+    return tmp.d;
+}
+
+static inline ui64 double2ui64(double d) {
+    double_ui64 tmp;
+    tmp.d = d;
+    return tmp.ix;
+}
+
+#if PICO_DOUBLE_PROPAGATE_NANS
+static inline bool disnan(double x) {
+    ui64 ix= double2ui64(x);
+    // checks the top bit of the low 32 bit of the NAN, but it I think that is ok
+    return ((uint32_t)(ix >> 31)) > 0xffe00000u;
+}
+
+#define check_nan_d1(x) if (disnan((x))) return (x)
+#define check_nan_d2(x,y) if (disnan((x))) return (x); else if (disnan((y))) return (y);
+#else
+#define check_nan_d1(x) ((void)0)
+#define check_nan_d2(x,y) ((void)0)
+#endif
+
+static inline int dgetsignexp(double x) {
+    ui64 ix=double2ui64(x);
+    return (ix>>52)&0xfff;
+}
+
+static inline int dgetexp(double x) {
+    ui64 ix=double2ui64(x);
+    return (ix>>52)&0x7ff;
+}
+
+static inline double dldexp(double x,int de) {
+    ui64 ix=double2ui64(x),iy;
+    int e;
+    e=dgetexp(x);
+    if(e==0||e==0x7ff) return x;
+    e+=de;
+    if(e<=0) iy=ix&0x8000000000000000ULL; // signed zero for underflow
+    else if(e>=0x7ff) iy=(ix&0x8000000000000000ULL)|0x7ff0000000000000ULL; // signed infinity on overflow
+    else iy=ix+((ui64)de<<52);
+    return ui642double(iy);
+}
+
+double WRAPPER_FUNC(ldexp)(double x, int de) {
+    check_nan_d1(x);
+    return dldexp(x, de);
+}
+
+
+static inline double dcopysign(double x,double y) {
+    ui64 ix=double2ui64(x),iy=double2ui64(y);
+    ix=((ix&0x7fffffffffffffffULL)|(iy&0x8000000000000000ULL));
+    return ui642double(ix);
+}
+
+double WRAPPER_FUNC(copysign)(double x, double y) {
+    check_nan_d2(x,y);
+    return dcopysign(x, y);
+}
+static inline int diszero(double x)  { return dgetexp    (x)==0; }
+//static inline int dispzero(double x) { return dgetsignexp(x)==0; }
+//static inline int dismzero(double x) { return dgetsignexp(x)==0x800; }
+static inline int disinf(double x)   { return dgetexp    (x)==0x7ff; }
+static inline int dispinf(double x)  { return dgetsignexp(x)==0x7ff; }
+static inline int disminf(double x)  { return dgetsignexp(x)==0xfff; }
+
+static inline int disint(double x) {
+    ui64 ix=double2ui64(x),m;
+    int e=dgetexp(x);
+    if(e==0) return 1;       // 0 is an integer
+    e-=0x3ff;                // remove exponent bias
+    if(e<0) return 0;        // |x|<1
+    e=52-e;                  // bit position in mantissa with significance 1
+    if(e<=0) return 1;       // |x| large, so must be an integer
+    m=(1ULL<<e)-1;           // mask for bits of significance <1
+    if(ix&m) return 0;       // not an integer
+    return 1;
+}
+
+static inline int disoddint(double x) {
+    ui64 ix=double2ui64(x),m;
+    int e=dgetexp(x);
+    e-=0x3ff;                // remove exponent bias
+    if(e<0) return 0;        // |x|<1; 0 is not odd
+    e=52-e;                  // bit position in mantissa with significance 1
+    if(e<0) return 0;        // |x| large, so must be even
+    m=(1ULL<<e)-1;           // mask for bits of significance <1 (if any)
+    if(ix&m) return 0;       // not an integer
+    if(e==52) return 1;      // value is exactly 1
+    return (ix>>e)&1;
+}
+
+static inline int disstrictneg(double x) {
+    ui64 ix=double2ui64(x);
+    if(diszero(x)) return 0;
+    return ix>>63;
+}
+
+static inline int disneg(double x) {
+    ui64 ix=double2ui64(x);
+    return ix>>63;
+}
+
+static inline double dneg(double x) {
+    ui64 ix=double2ui64(x);
+    ix^=0x8000000000000000ULL;
+    return ui642double(ix);
+}
+
+static inline int dispo2(double x) {
+    ui64 ix=double2ui64(x);
+    if(diszero(x)) return 0;
+    if(disinf(x)) return 0;
+    ix&=0x000fffffffffffffULL;
+    return ix==0;
+}
+
+static inline double dnan_or(double x) {
+#if PICO_DOUBLE_PROPAGATE_NANS
+    return NAN;
+#else
+    return x;
+#endif
+}
+
+double WRAPPER_FUNC(trunc)(double x) {
+    check_nan_d1(x);
+    ui64 ix=double2ui64(x),m;
+    int e=dgetexp(x);
+    e-=0x3ff;                // remove exponent bias
+    if(e<0) {                // |x|<1
+        ix&=0x8000000000000000ULL;
+        return ui642double(ix);
+    }
+    e=52-e;                  // bit position in mantissa with significance 1
+    if(e<=0) return x;       // |x| large, so must be an integer
+    m=(1ULL<<e)-1;           // mask for bits of significance <1
+    ix&=~m;
+    return ui642double(ix);
+}
+
+double WRAPPER_FUNC(round)(double x) {
+    check_nan_d1(x);
+    ui64 ix=double2ui64(x),m;
+    int e=dgetexp(x);
+    e-=0x3ff;                // remove exponent bias
+    if(e<-1) {               // |x|<0.5
+        ix&=0x8000000000000000ULL;
+        return ui642double(ix);
+    }
+    if(e==-1) {              // 0.5<=|x|<1
+        ix&=0x8000000000000000ULL;
+        ix|=0x3ff0000000000000ULL;        // ±1
+        return ui642double(ix);
+    }
+    e=52-e;                  // bit position in mantissa with significance 1, <=52
+    if(e<=0) return x;       // |x| large, so must be an integer
+    m=1ULL<<(e-1);           // mask for bit of significance 0.5
+    ix+=m;
+    m=m+m-1;                 // mask for bits of significance <1
+    ix&=~m;
+    return ui642double(ix);
+}
+
+double WRAPPER_FUNC(floor)(double x) {
+    check_nan_d1(x);
+    ui64 ix=double2ui64(x),m;
+    int e=dgetexp(x);
+    if(e==0) {               // x==0
+        ix&=0x8000000000000000ULL;
+        return ui642double(ix);
+    }
+    e-=0x3ff;                // remove exponent bias
+    if(e<0) {                // |x|<1, not zero
+        if(disneg(x)) return -1;
+        return PZERO;
+    }
+    e=52-e;                  // bit position in mantissa with significance 1
+    if(e<=0) return x;       // |x| large, so must be an integer
+    m=(1ULL<<e)-1;           // mask for bit of significance <1
+    if(disneg(x)) ix+=m;     // add 1-ε to magnitude if negative
+    ix&=~m;                  // truncate
+    return ui642double(ix);
+}
+
+double WRAPPER_FUNC(ceil)(double x) {
+    check_nan_d1(x);
+    ui64 ix=double2ui64(x),m;
+    int e=dgetexp(x);
+    if(e==0) {               // x==0
+        ix&=0x8000000000000000ULL;
+        return ui642double(ix);
+    }
+    e-=0x3ff;                 // remove exponent bias
+    if(e<0) {                // |x|<1, not zero
+        if(disneg(x)) return MZERO;
+        return 1;
+    }
+    e=52-e;                  // bit position in mantissa with significance 1
+    if(e<=0) return x;       // |x| large, so must be an integer
+    m=(1ULL<<e)-1;           // mask for bit of significance <1
+    if(!disneg(x)) ix+=m;    // add 1-ε to magnitude if positive
+    ix&=~m;                  // truncate
+    return ui642double(ix);
+}
+
+double WRAPPER_FUNC(asin)(double x) {
+    check_nan_d1(x);
+    double u;
+    u=(1-x)*(1+x);
+    if(disstrictneg(u)) return dnan_or(PINF);
+    return atan2(x,sqrt(u));
+}
+
+double WRAPPER_FUNC(acos)(double x) {
+    check_nan_d1(x);
+    double u;
+    u=(1-x)*(1+x);
+    if(disstrictneg(u)) return dnan_or(PINF);
+    return atan2(sqrt(u),x);
+}
+
+double WRAPPER_FUNC(atan)(double x) {
+    check_nan_d1(x);
+    if(dispinf(x)) return  PI/2;
+    if(disminf(x)) return -PI/2;
+    return atan2(x,1);
+}
+
+double WRAPPER_FUNC(sinh)(double x) {
+    check_nan_d1(x);
+    return dldexp((exp(x)-exp(dneg(x))),-1);
+}
+
+double WRAPPER_FUNC(cosh)(double x) {
+    check_nan_d1(x);
+    return dldexp((exp(x)+exp(dneg(x))),-1);
+}
+
+double WRAPPER_FUNC(tanh)(double x) {
+    check_nan_d1(x);
+    double u;
+    int e;
+    e=dgetexp(x);
+    if(e>=5+0x3ff) {             // |x|>=32?
+        if(!disneg(x)) return  1;  // 1 << exp 2x; avoid generating infinities later
+        else           return -1;  // 1 >> exp 2x
+    }
+    u=exp(dldexp(x,1));
+    return (u-1)/(u+1);
+}
+
+double WRAPPER_FUNC(asinh)(double x) {
+    check_nan_d1(x);
+    int e;
+    e=dgetexp(x);
+    if(e>=32+0x3ff) {                                // |x|>=2^32?
+        if(!disneg(x)) return      log(     x )+LOG2;  // 1/x^2 << 1
+        else           return dneg(log(dneg(x))+LOG2); // 1/x^2 << 1
+    }
+    if(x>0) return      log(sqrt(x*x+1)+x);
+    else    return dneg(log(sqrt(x*x+1)-x));
+}
+
+double WRAPPER_FUNC(acosh)(double x) {
+    check_nan_d1(x);
+    int e;
+    if(disneg(x)) x=dneg(x);
+    e=dgetexp(x);
+    if(e>=32+0x3ff) return log(x)+LOG2;           // |x|>=2^32?
+    return log(sqrt((x-1)*(x+1))+x);
+}
+
+double WRAPPER_FUNC(atanh)(double x) {
+    check_nan_d1(x);
+    return dldexp(log((1+x)/(1-x)),-1);
+}
+
+double WRAPPER_FUNC(exp2)(double x) {
+    check_nan_d1(x);
+    int e;
+    // extra check for disminf as this catches -Nan, and x<=-4096 doesn't.
+    if (disminf(x) || x<=-4096) return 0;    // easily underflows
+    else if (x>=4096)           return PINF; // easily overflows
+    e=(int)round(x);
+    x-=e;
+    return dldexp(exp(x*LOG2),e);
+}
+double WRAPPER_FUNC(log2)(double x) { check_nan_d1(x); return log(x)*LOG2E;  }
+double WRAPPER_FUNC(exp10)(double x) { check_nan_d1(x); return pow(10,x); }
+double WRAPPER_FUNC(log10)(double x) { check_nan_d1(x); return log(x)*LOG10E; }
+
+// todo these are marked as lofi
+double WRAPPER_FUNC(expm1)(double x) { check_nan_d1(x); return exp(x)-1; }
+double WRAPPER_FUNC(log1p)(double x) { check_nan_d1(x); return log(1+x); }
+#if !HAS_DOUBLE_COPROCESSOR
+double WRAPPER_FUNC(fma)(double x,double y,double z) { check_nan_d1(x); return x*y+z; }
+#endif
+
+// general power, x>0, finite
+static double dpow_1(double x,double y) {
+    int a,b,c;
+    double t,rt,u,v,v0,v1,w,ry;
+    a=dgetexp(x)-0x3ff;
+    u=log2(dldexp(x,-a)); // now log_2 x = a+u
+    if(u>0.5) u-=1,a++;              // |u|<=~0.5
+    if(a==0) return exp2(u*y);
+    // here |log_2 x| >~0.5
+    if(y>= 4096) { // then easily over/underflows
+        if(a<0) return 0;
+        return PINF;
+    }
+    if(y<=-4096) { // then easily over/underflows
+        if(a<0) return PINF;
+        return 0;
+    }
+    ry=round(y);
+    v=y-ry;
+    v0=dldexp(round(ldexp(v,26)),-26);
+    v1=v-v0;
+    b=(int)ry; // guaranteed to fit in an int; y=b+v0+v1
+    // now the result is exp2( (a+u) * (b+v0+v1) )
+    c=a*b;      // integer
+    t=a*v0;
+    rt=round(t);
+    c+=(int)rt;
+    w=t-rt;
+    t=a*v1;
+    w+=t;
+    t=u*b;
+    rt=round(t);
+    c+=(int)rt;
+    w+=t-rt;
+    w+=u*v;
+    return dldexp(exp2(w),c);
+}
+
+static double dpow_int2(double x,int y) {
+    double u;
+    if(y==1) return x;
+    u=dpow_int2(x,y/2);
+    u*=u;
+    if(y&1) u*=x;
+    return u;
+}
+
+// for the case where x not zero or infinity, y small and not zero
+static inline double dpowint_1(double x,int y) {
+    if(y<0) x=1/x,y=-y;
+    return dpow_int2(x,y);
+}
+
+// for the case where x not zero or infinity
+static double dpowint_0(double x,int y) {
+    int e;
+    if(disneg(x)) {
+        if(disoddint(y)) return dneg(dpowint_0(dneg(x),y));
+        else             return      dpowint_0(dneg(x),y);
+    }
+    if(dispo2(x)) {
+        e=dgetexp(x)-0x3ff;
+        if(y>=2048) y= 2047;  // avoid overflow
+        if(y<-2048) y=-2048;
+        y*=e;
+        return dldexp(1,y);
+    }
+    if(y==0) return 1;
+    if(y>=-32&&y<=32) return dpowint_1(x,y);
+    return dpow_1(x,y);
+}
+
+double WRAPPER_FUNC(powint)(double x,int y) {
+    GCC_Like_Pragma("GCC diagnostic push")
+    GCC_Like_Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
+    if(x==1.0||y==0) return 1;
+    GCC_Like_Pragma("GCC diagnostic pop")
+    check_nan_d1(x);
+    if(diszero(x)) {
+        if(y>0) {
+            if(y&1) return x;
+            else    return 0;
+        }
+        if((y&1)) return dcopysign(PINF,x);
+        return PINF;
+    }
+    if(dispinf(x)) {
+        if(y<0) return 0;
+        else    return PINF;
+    }
+    if(disminf(x)) {
+        if(y>0) {
+            if((y&1)) return MINF;
+            else      return PINF;
+        }
+        if((y&1)) return MZERO;
+        else      return PZERO;
+    }
+    return dpowint_0(x,y);
+}
+
+// for the case where y is guaranteed a finite integer, x not zero or infinity
+static double dpow_0(double x,double y) {
+    int e,p;
+    if(disneg(x)) {
+        if(disoddint(y)) return dneg(dpow_0(dneg(x),y));
+        else             return      dpow_0(dneg(x),y);
+    }
+    p=(int)y;
+    if(dispo2(x)) {
+        e=dgetexp(x)-0x3ff;
+        if(p>=2048) p= 2047;  // avoid overflow
+        if(p<-2048) p=-2048;
+        p*=e;
+        return dldexp(1,p);
+    }
+    if(p==0) return 1;
+    if(p>=-32&&p<=32) return dpowint_1(x,p);
+    return dpow_1(x,y);
+}
+
+double WRAPPER_FUNC(pow)(double x,double y) {
+    GCC_Like_Pragma("GCC diagnostic push")
+    GCC_Like_Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
+
+    if(x==1.0||diszero(y)) return 1;
+    check_nan_d2(x, y);
+    if(x==-1.0&&disinf(y)) return 1;
+    GCC_Like_Pragma("GCC diagnostic pop")
+
+    if(diszero(x)) {
+        if(!disneg(y)) {
+            if(disoddint(y)) return x;
+            else             return 0;
+        }
+        if(disoddint(y)) return dcopysign(PINF,x);
+        return PINF;
+    }
+    if(dispinf(x)) {
+        if(disneg(y)) return 0;
+        else          return PINF;
+    }
+    if(disminf(x)) {
+        if(!disneg(y)) {
+            if(disoddint(y)) return MINF;
+            else             return PINF;
+        }
+        if(disoddint(y)) return MZERO;
+        else             return PZERO;
+    }
+    if(dispinf(y)) {
+        if(dgetexp(x)<0x3ff) return PZERO;
+        else                 return PINF;
+    }
+    if(disminf(y)) {
+        if(dgetexp(x)<0x3ff) return PINF;
+        else                 return PZERO;
+    }
+    if(disint(y)) return dpow_0(x,y);
+    if(disneg(x)) return PINF;
+    return dpow_1(x,y);
+}
+
+double WRAPPER_FUNC(hypot)(double x,double y) {
+    check_nan_d2(x, y);
+    int ex,ey;
+    ex=dgetexp(x); ey=dgetexp(y);
+    if(ex>=0x3ff+400||ey>=0x3ff+400) { // overflow, or nearly so
+        x=dldexp(x,-600),y=dldexp(y,-600);
+        return dldexp(sqrt(x*x+y*y), 600);
+    }
+    else if(ex<=0x3ff-400&&ey<=0x3ff-400) { // underflow, or nearly so
+        x=dldexp(x, 600),y=dldexp(y, 600);
+        return dldexp(sqrt(x*x+y*y),-600);
+    }
+    return sqrt(x*x+y*y);
+}
+
+double WRAPPER_FUNC(cbrt)(double x) {
+    check_nan_d1(x);
+    int e;
+    if(disneg(x)) return dneg(cbrt(dneg(x)));
+    if(diszero(x)) return dcopysign(PZERO,x);
+    e=dgetexp(x)-0x3ff;
+    e=(e*0x5555+0x8000)>>16;  // ~e/3, rounded
+    x=dldexp(x,-e*3);
+    x=exp(log(x)*ONETHIRD);
+    return dldexp(x,e);
+}
+
+// reduces mx*2^e modulo my, returning bottom bits of quotient at *pquo
+// 2^52<=|mx|,my<2^53, e>=0; 0<=result<my
+static i64 drem_0(i64 mx,i64 my,int e,int*pquo) {
+    int quo=0,q,r=0,s;
+    if(e>0) {
+        r=0xffffffffU/(ui32)(my>>36);  // reciprocal estimate Q16
+    }
+    while(e>0) {
+        s=e; if(s>12) s=12;    // gain up to 12 bits on each iteration
+        q=(mx>>38)*r;          // Q30
+        q=((q>>(29-s))+1)>>1;  // Q(s), rounded
+        mx=(mx<<s)-my*q;
+        quo=(quo<<s)+q;
+        e-=s;
+    }
+    if(mx>=my) mx-=my,quo++; // when e==0 mx can be nearly as big as 2my
+    if(mx>=my) mx-=my,quo++;
+    if(mx<0) mx+=my,quo--;
+    if(mx<0) mx+=my,quo--;
+    if(pquo) *pquo=quo;
+    return mx;
+}
+
+double WRAPPER_FUNC(fmod)(double x,double y) {
+    check_nan_d2(x, y);
+    ui64 ix=double2ui64(x),iy=double2ui64(y);
+    int sx,ex,ey;
+    i64 mx,my;
+    DUNPACKS(ix,sx,ex,mx);
+    DUNPACK(iy,ey,my);
+    if(ex==0x7ff) return dnan_or(PINF);
+    if(ey==0) return PINF;
+    if(ex==0) {
+        if(!disneg(x)) return PZERO;
+        return MZERO;
+    }
+    if(ex<ey) return x;  // |x|<|y|, including case x=±0
+    mx=drem_0(mx,my,ex-ey,0);
+    if(sx) mx=-mx;
+    return fix642double(mx,0x3ff-ey+52);
+}
+
+double WRAPPER_FUNC(remquo)(double x,double y,int*quo) {
+    check_nan_d2(x, y);
+    ui64 ix=double2ui64(x),iy=double2ui64(y);
+    int sx,sy,ex,ey,q;
+    i64 mx,my;
+    DUNPACKS(ix,sx,ex,mx);
+    DUNPACKS(iy,sy,ey,my);
+    if(quo) *quo=0;
+    if(ex==0x7ff) return PINF;
+    if(ey==0)     return PINF;
+    if(ex==0)     return PZERO;
+    if(ey==0x7ff) return x;
+    if(ex<ey-1)   return x;  // |x|<|y|/2
+    if(ex==ey-1) {
+        if(mx<=my) return x;   // |x|<=|y|/2, even quotient
+        // here |y|/2<|x|<|y|
+        if(!sx) { // x>|y|/2
+            mx-=my+my;
+            ey--;
+            q=1;
+        } else { // x<-|y|/2
+            mx=my+my-mx;
+            ey--;
+            q=-1;
+        }
+    }
+    else {
+        if(sx) mx=-mx;
+        mx=drem_0(mx,my,ex-ey,&q);
+        if(mx+mx>my || (mx+mx==my&&(q&1)) ) { // |x|>|y|/2, or equality and an odd quotient?
+            mx-=my;
+            q++;
+        }
+    }
+    if(sy) q=-q;
+    if(quo) *quo=q;
+    return fix642double(mx,0x3ff-ey+52);
+}
+
+double WRAPPER_FUNC(drem)(double x,double y) { check_nan_d2(x, y); return remquo(x,y,0); }
+
+double WRAPPER_FUNC(remainder)(double x,double y) { check_nan_d2(x, y); return remquo(x,y,0); }
+
+GCC_Pragma("GCC diagnostic pop") // conversion
--- a/lib/main/pico-sdk/rp2_common/pico_double/double_none.S
+++ b/lib/main/pico-sdk/rp2_common/pico_double/double_none.S
@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include "pico/asm_helper.S"
+#include "pico/bootrom/sf_table.h"
+
+pico_default_asm_setup
+
+    wrapper_func __aeabi_dadd
+    wrapper_func __aeabi_ddiv
+    wrapper_func __aeabi_dmul
+    wrapper_func __aeabi_drsub
+    wrapper_func __aeabi_dsub
+    wrapper_func __aeabi_cdcmpeq
+    wrapper_func __aeabi_cdrcmple
+    wrapper_func __aeabi_cdcmple
+    wrapper_func __aeabi_dcmpeq
+    wrapper_func __aeabi_dcmplt
+    wrapper_func __aeabi_dcmple
+    wrapper_func __aeabi_dcmpge
+    wrapper_func __aeabi_dcmpgt
+    wrapper_func __aeabi_dcmpun
+    wrapper_func __aeabi_i2d
+    wrapper_func __aeabi_l2d
+    wrapper_func __aeabi_ui2d
+    wrapper_func __aeabi_ul2d
+    wrapper_func __aeabi_d2iz
+    wrapper_func __aeabi_d2lz
+    wrapper_func __aeabi_d2uiz
+    wrapper_func __aeabi_d2ulz
+    wrapper_func __aeabi_d2f
+    wrapper_func sqrt
+    wrapper_func cos
+    wrapper_func sin
+    wrapper_func tan
+    wrapper_func atan2
+    wrapper_func exp
+    wrapper_func log
+
+    wrapper_func ldexp
+    wrapper_func copysign
+    wrapper_func trunc
+    wrapper_func floor
+    wrapper_func ceil
+    wrapper_func round
+    wrapper_func sincos
+    wrapper_func asin
+    wrapper_func acos
+    wrapper_func atan
+    wrapper_func sinh
+    wrapper_func cosh
+    wrapper_func tanh
+    wrapper_func asinh
+    wrapper_func acosh
+    wrapper_func atanh
+    wrapper_func exp2
+    wrapper_func log2
+    wrapper_func exp10
+    wrapper_func log10
+    wrapper_func pow
+    wrapper_func powint
+    wrapper_func hypot
+    wrapper_func cbrt
+    wrapper_func fmod
+    wrapper_func drem
+    wrapper_func remainder
+    wrapper_func remquo
+    wrapper_func expm1
+    wrapper_func log1p
+    wrapper_func fma
+
+#ifdef __riscv
+    la a0, str
+    j  panic
+#else
+    push {lr}       // keep stack trace sane
+    ldr r0, =str
+    bl panic
+#endif
+
+str:
+    .asciz "double support is disabled"
--- a/lib/main/pico-sdk/rp2_common/pico_double/double_sci_m33.S
+++ b/lib/main/pico-sdk/rp2_common/pico_double/double_sci_m33.S
--- a/lib/main/pico-sdk/rp2_common/pico_double/double_v1_rom_shim_rp2040.S
+++ b/lib/main/pico-sdk/rp2_common/pico_double/double_v1_rom_shim_rp2040.S
--- a/lib/main/pico-sdk/rp2_common/pico_double/include/pico/double.h
+++ b/lib/main/pico-sdk/rp2_common/pico_double/include/pico/double.h
@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef _PICO_DOUBLE_H
+#define _PICO_DOUBLE_H
+
+#include <math.h>
+#include "pico.h"
+#include "pico/bootrom/sf_table.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** \file double.h
+*  \defgroup pico_double pico_double
+*
+* \brief Optimized double-precision floating point functions
+*
+* (Replacement) optimized implementations are provided of the following compiler built-ins
+* and math library functions:
+*
+* - __aeabi_dadd, __aeabi_ddiv, __aeabi_dmul, __aeabi_drsub, __aeabi_dsub, __aeabi_cdcmpeq, __aeabi_cdrcmple, __aeabi_cdcmple, __aeabi_dcmpeq, __aeabi_dcmplt, __aeabi_dcmple, __aeabi_dcmpge, __aeabi_dcmpgt, __aeabi_dcmpun, __aeabi_i2d, __aeabi_l2d, __aeabi_ui2d, __aeabi_ul2d, __aeabi_d2iz, __aeabi_d2lz, __aeabi_d2uiz, __aeabi_d2ulz, __aeabi_d2f
+* - sqrt, cos, sin, tan, atan2, exp, log, ldexp, copysign, trunc, floor, ceil, round, asin, acos, atan, sinh, cosh, tanh, asinh, acosh, atanh, exp2, log2, exp10, log10, pow,, hypot, cbrt, fmod, drem, remainder, remquo, expm1, log1p, fma
+* - powint, sincos (GNU extensions)
+*
+* The following additional optimized functions are also provided:
+*
+* - int2double, uint2double, int642double, uint642double, fix2double, ufix2double, fix642double, ufix642double
+* - double2fix, double2ufix, double2fix64, double2ufix64, double2int, double2uint, double2int64, double2uint64, double2int_z, double2int64_z,
+* - exp10, sincos, powint
+*
+* On RP2350 the following additional functions are available; the _fast methods are faster but do not round correctly"
+*
+* - ddiv_fast, sqrt_fast
+*/
+
+double int2double(int32_t i);
+double uint2double(uint32_t u);
+double int642double(int64_t i);
+double uint642double(uint64_t u);
+double fix2double(int32_t m, int e);
+double ufix2double(uint32_t m, int e);
+double fix642double(int64_t m, int e);
+double ufix642double(uint64_t m, int e);
+
+// These methods round towards -Infinity.
+int32_t double2fix(double d, int e);
+uint32_t double2ufix(double d, int e);
+int64_t double2fix64(double d, int e);
+uint64_t double2ufix64(double d, int e);
+int32_t double2int(double d);
+uint32_t double2uint(double d);
+int64_t double2int64(double d);
+uint64_t double2uint64(double d);
+
+// These methods round towards 0.
+int32_t double2int_z(double d);
+int64_t double2int64_z(double d);
+
+double exp10(double x);
+void sincos(double x, double *sinx, double *cosx);
+double powint(double x, int y);
+
+#if !PICO_RP2040
+double ddiv_fast(double n, double d);
+double sqrt_fast(double d);
+double mla(double x, double y, double z); // note this is not fused
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+