1
0
Fork 0
mirror of https://github.com/betaflight/betaflight.git synced 2025-07-23 16:25:31 +03:00

AT32F435/7 Libraries (#12158) (#12263)

Source: https://github.com/ArteryTek/AT32F435_437_Firmware_Library
Version: 2.1.1
This commit is contained in:
J Blackman 2023-01-31 08:05:32 +11:00 committed by GitHub
parent 5e16ddb01b
commit 8900a831e5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
559 changed files with 289319 additions and 0 deletions

View file

@ -0,0 +1,19 @@
cmake_minimum_required (VERSION 3.6)
project(CMSISDSPStatistics)
include(configLib)
include(configDsp)
file(GLOB SRC "./*_*.c")
add_library(CMSISDSPStatistics STATIC ${SRC})
configLib(CMSISDSPStatistics ${ROOT})
configDsp(CMSISDSPStatistics ${ROOT})
### Includes
target_include_directories(CMSISDSPStatistics PUBLIC "${DSP}/Include")

View file

@ -0,0 +1,60 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: StatisticsFunctions.c
* Description: Combination of all statistics function source files.
*
* $Date: 16. March 2020
* $Revision: V1.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_entropy_f32.c"
#include "arm_entropy_f64.c"
#include "arm_kullback_leibler_f32.c"
#include "arm_kullback_leibler_f64.c"
#include "arm_logsumexp_dot_prod_f32.c"
#include "arm_logsumexp_f32.c"
#include "arm_max_f32.c"
#include "arm_max_q15.c"
#include "arm_max_q31.c"
#include "arm_max_q7.c"
#include "arm_max_no_idx_f32.c"
#include "arm_mean_f32.c"
#include "arm_mean_q15.c"
#include "arm_mean_q31.c"
#include "arm_mean_q7.c"
#include "arm_min_f32.c"
#include "arm_min_q15.c"
#include "arm_min_q31.c"
#include "arm_min_q7.c"
#include "arm_power_f32.c"
#include "arm_power_q15.c"
#include "arm_power_q31.c"
#include "arm_power_q7.c"
#include "arm_rms_f32.c"
#include "arm_rms_q15.c"
#include "arm_rms_q31.c"
#include "arm_std_f32.c"
#include "arm_std_q15.c"
#include "arm_std_q31.c"
#include "arm_var_f32.c"
#include "arm_var_q15.c"
#include "arm_var_q31.c"

View file

@ -0,0 +1,172 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_logsumexp_f32.c
* Description: LogSumExp
*
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
#include <limits.h>
#include <math.h>
/**
* @addtogroup groupStats
* @{
*/
/**
* @brief Entropy
*
* @param[in] pSrcA Array of input values.
* @param[in] blockSize Number of samples in the input array.
* @return Entropy -Sum(p ln p)
*
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#include "arm_vec_math.h"
float32_t arm_entropy_f32(const float32_t * pSrcA,uint32_t blockSize)
{
uint32_t blkCnt;
float32_t accum=0.0f,p;
blkCnt = blockSize;
f32x4_t vSum = vdupq_n_f32(0.0f);
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
f32x4_t vecIn = vld1q(pSrcA);
vSum = vaddq_f32(vSum, vmulq(vecIn, vlogq_f32(vecIn)));
/*
* Decrement the blockSize loop counter
* Advance vector source and destination pointers
*/
pSrcA += 4;
blkCnt --;
}
accum = vecAddAcrossF32Mve(vSum);
/* Tail */
blkCnt = blockSize & 0x3;
while(blkCnt > 0)
{
p = *pSrcA++;
accum += p * logf(p);
blkCnt--;
}
return (-accum);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "NEMath.h"
float32_t arm_entropy_f32(const float32_t * pSrcA,uint32_t blockSize)
{
const float32_t *pIn;
uint32_t blkCnt;
float32_t accum, p;
float32x4_t accumV;
float32x2_t accumV2;
float32x4_t tmpV, tmpV2;
pIn = pSrcA;
accum = 0.0f;
accumV = vdupq_n_f32(0.0f);
blkCnt = blockSize >> 2;
while(blkCnt > 0)
{
tmpV = vld1q_f32(pIn);
pIn += 4;
tmpV2 = vlogq_f32(tmpV);
accumV = vmlaq_f32(accumV, tmpV, tmpV2);
blkCnt--;
}
accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV));
accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
blkCnt = blockSize & 3;
while(blkCnt > 0)
{
p = *pIn++;
accum += p * logf(p);
blkCnt--;
}
return(-accum);
}
#else
float32_t arm_entropy_f32(const float32_t * pSrcA,uint32_t blockSize)
{
const float32_t *pIn;
uint32_t blkCnt;
float32_t accum, p;
pIn = pSrcA;
blkCnt = blockSize;
accum = 0.0f;
while(blkCnt > 0)
{
p = *pIn++;
accum += p * logf(p);
blkCnt--;
}
return(-accum);
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of groupStats group
*/

View file

@ -0,0 +1,71 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_logsumexp_f64.c
* Description: LogSumExp
*
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
#include <limits.h>
#include <math.h>
/**
* @addtogroup groupStats
* @{
*/
/**
* @brief Entropy
*
* @param[in] pSrcA Array of input values.
* @param[in] blockSize Number of samples in the input array.
* @return Entropy -Sum(p ln p)
*
*/
float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize)
{
const float64_t *pIn;
uint32_t blkCnt;
float64_t accum, p;
pIn = pSrcA;
blkCnt = blockSize;
accum = 0.0f;
while(blkCnt > 0)
{
p = *pIn++;
accum += p * log(p);
blkCnt--;
}
return(-accum);
}
/**
* @} end of groupStats group
*/

View file

@ -0,0 +1,191 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_logsumexp_f32.c
* Description: LogSumExp
*
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
#include <limits.h>
#include <math.h>
/**
* @addtogroup groupStats
* @{
*/
/**
* @brief Kullback-Leibler
*
* Distribution A may contain 0 with Neon version.
* Result will be right but some exception flags will be set.
*
* Distribution B must not contain 0 probability.
*
* @param[in] *pSrcA points to an array of input values for probaility distribution A.
* @param[in] *pSrcB points to an array of input values for probaility distribution B.
* @param[in] blockSize number of samples in the input array.
* @return Kullback-Leibler divergence D(A || B)
*
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#include "arm_vec_math.h"
float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)
{
uint32_t blkCnt;
float32_t accum, pA,pB;
blkCnt = blockSize;
accum = 0.0f;
f32x4_t vSum = vdupq_n_f32(0.0f);
blkCnt = blockSize >> 2;
while(blkCnt > 0)
{
f32x4_t vecA = vld1q(pSrcA);
f32x4_t vecB = vld1q(pSrcB);
f32x4_t vRatio;
vRatio = vdiv_f32(vecB, vecA);
vSum = vaddq_f32(vSum, vmulq(vecA, vlogq_f32(vRatio)));
/*
* Decrement the blockSize loop counter
* Advance vector source and destination pointers
*/
pSrcA += 4;
pSrcB += 4;
blkCnt --;
}
accum = vecAddAcrossF32Mve(vSum);
blkCnt = blockSize & 3;
while(blkCnt > 0)
{
pA = *pSrcA++;
pB = *pSrcB++;
accum += pA * logf(pB / pA);
blkCnt--;
}
return(-accum);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "NEMath.h"
float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)
{
const float32_t *pInA, *pInB;
uint32_t blkCnt;
float32_t accum, pA,pB;
float32x4_t accumV;
float32x2_t accumV2;
float32x4_t tmpVA, tmpVB,tmpV;
pInA = pSrcA;
pInB = pSrcB;
accum = 0.0f;
accumV = vdupq_n_f32(0.0f);
blkCnt = blockSize >> 2;
while(blkCnt > 0)
{
tmpVA = vld1q_f32(pInA);
pInA += 4;
tmpVB = vld1q_f32(pInB);
pInB += 4;
tmpV = vinvq_f32(tmpVA);
tmpVB = vmulq_f32(tmpVB, tmpV);
tmpVB = vlogq_f32(tmpVB);
accumV = vmlaq_f32(accumV, tmpVA, tmpVB);
blkCnt--;
}
accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV));
accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
blkCnt = blockSize & 3;
while(blkCnt > 0)
{
pA = *pInA++;
pB = *pInB++;
accum += pA * logf(pB/pA);
blkCnt--;
}
return(-accum);
}
#else
float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)
{
const float32_t *pInA, *pInB;
uint32_t blkCnt;
float32_t accum, pA,pB;
pInA = pSrcA;
pInB = pSrcB;
blkCnt = blockSize;
accum = 0.0f;
while(blkCnt > 0)
{
pA = *pInA++;
pB = *pInB++;
accum += pA * logf(pB / pA);
blkCnt--;
}
return(-accum);
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of groupStats group
*/

View file

@ -0,0 +1,73 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_logsumexp_f64.c
* Description: LogSumExp
*
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
#include <limits.h>
#include <math.h>
/**
* @addtogroup groupStats
* @{
*/
/**
* @brief Kullback-Leibler
*
* @param[in] *pSrcA points to an array of input values for probaility distribution A.
* @param[in] *pSrcB points to an array of input values for probaility distribution B.
* @param[in] blockSize number of samples in the input array.
* @return Kullback-Leibler divergence D(A || B)
*
*/
float64_t arm_kullback_leibler_f64(const float64_t * pSrcA, const float64_t * pSrcB, uint32_t blockSize)
{
const float64_t *pInA, *pInB;
uint32_t blkCnt;
float64_t accum, pA,pB;
pInA = pSrcA;
pInB = pSrcB;
blkCnt = blockSize;
accum = 0.0f;
while(blkCnt > 0)
{
pA = *pInA++;
pB = *pInB++;
accum += pA * log(pB / pA);
blkCnt--;
}
return(-accum);
}
/**
* @} end of groupStats group
*/

View file

@ -0,0 +1,66 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_logsumexp_f32.c
* Description: LogSumExp
*
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
#include <limits.h>
#include <math.h>
/**
* @addtogroup groupStats
* @{
*/
/**
* @brief Dot product with log arithmetic
*
* Vectors are containing the log of the samples
*
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[in] *pTmpBuffer temporary buffer of length blockSize
* @return The log of the dot product.
*
*/
float32_t arm_logsumexp_dot_prod_f32(const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t blockSize,
float32_t *pTmpBuffer)
{
float32_t result;
arm_add_f32((float32_t*)pSrcA, (float32_t*)pSrcB, pTmpBuffer, blockSize);
result = arm_logsumexp_f32(pTmpBuffer, blockSize);
return(result);
}
/**
* @} end of groupStats group
*/

View file

@ -0,0 +1,275 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_logsumexp_f32.c
* Description: LogSumExp
*
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
#include <limits.h>
#include <math.h>
/**
* @addtogroup groupStats
* @{
*/
/**
* @brief Computation of the LogSumExp
*
* In probabilistic computations, the dynamic of the probability values can be very
* wide because they come from gaussian functions.
* To avoid underflow and overflow issues, the values are represented by their log.
* In this representation, multiplying the original exp values is easy : their logs are added.
* But adding the original exp values is requiring some special handling and it is the
* goal of the LogSumExp function.
*
* If the values are x1...xn, the function is computing:
*
* ln(exp(x1) + ... + exp(xn)) and the computation is done in such a way that
* rounding issues are minimised.
*
* The max xm of the values is extracted and the function is computing:
* xm + ln(exp(x1 - xm) + ... + exp(xn - xm))
*
* @param[in] *in Pointer to an array of input values.
* @param[in] blockSize Number of samples in the input array.
* @return LogSumExp
*
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#include "arm_vec_math.h"
float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize)
{
float32_t maxVal;
const float32_t *pIn;
int32_t blkCnt;
float32_t accum=0.0f;
float32_t tmp;
arm_max_no_idx_f32((float32_t *) in, blockSize, &maxVal);
blkCnt = blockSize;
pIn = in;
f32x4_t vSum = vdupq_n_f32(0.0f);
blkCnt = blockSize >> 2;
while(blkCnt > 0)
{
f32x4_t vecIn = vld1q(pIn);
f32x4_t vecExp;
vecExp = vexpq_f32(vsubq_n_f32(vecIn, maxVal));
vSum = vaddq_f32(vSum, vecExp);
/*
* Decrement the blockSize loop counter
* Advance vector source and destination pointers
*/
pIn += 4;
blkCnt --;
}
/* sum + log */
accum = vecAddAcrossF32Mve(vSum);
blkCnt = blockSize & 0x3;
while(blkCnt > 0)
{
tmp = *pIn++;
accum += expf(tmp - maxVal);
blkCnt--;
}
accum = maxVal + log(accum);
return (accum);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "NEMath.h"
float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize)
{
float32_t maxVal;
float32_t tmp;
float32x4_t tmpV, tmpVb;
float32x4_t maxValV;
uint32x4_t idxV;
float32x4_t accumV;
float32x2_t accumV2;
const float32_t *pIn;
uint32_t blkCnt;
float32_t accum;
pIn = in;
blkCnt = blockSize;
if (blockSize <= 3)
{
maxVal = *pIn++;
blkCnt--;
while(blkCnt > 0)
{
tmp = *pIn++;
if (tmp > maxVal)
{
maxVal = tmp;
}
blkCnt--;
}
}
else
{
maxValV = vld1q_f32(pIn);
pIn += 4;
blkCnt = (blockSize - 4) >> 2;
while(blkCnt > 0)
{
tmpVb = vld1q_f32(pIn);
pIn += 4;
idxV = vcgtq_f32(tmpVb, maxValV);
maxValV = vbslq_f32(idxV, tmpVb, maxValV );
blkCnt--;
}
accumV2 = vpmax_f32(vget_low_f32(maxValV),vget_high_f32(maxValV));
accumV2 = vpmax_f32(accumV2,accumV2);
maxVal = vget_lane_f32(accumV2, 0) ;
blkCnt = (blockSize - 4) & 3;
while(blkCnt > 0)
{
tmp = *pIn++;
if (tmp > maxVal)
{
maxVal = tmp;
}
blkCnt--;
}
}
maxValV = vdupq_n_f32(maxVal);
pIn = in;
accum = 0;
accumV = vdupq_n_f32(0.0f);
blkCnt = blockSize >> 2;
while(blkCnt > 0)
{
tmpV = vld1q_f32(pIn);
pIn += 4;
tmpV = vsubq_f32(tmpV, maxValV);
tmpV = vexpq_f32(tmpV);
accumV = vaddq_f32(accumV, tmpV);
blkCnt--;
}
accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV));
accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
blkCnt = blockSize & 0x3;
while(blkCnt > 0)
{
tmp = *pIn++;
accum += expf(tmp - maxVal);
blkCnt--;
}
accum = maxVal + logf(accum);
return(accum);
}
#else
float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize)
{
float32_t maxVal;
float32_t tmp;
const float32_t *pIn;
uint32_t blkCnt;
float32_t accum;
pIn = in;
blkCnt = blockSize;
maxVal = *pIn++;
blkCnt--;
while(blkCnt > 0)
{
tmp = *pIn++;
if (tmp > maxVal)
{
maxVal = tmp;
}
blkCnt--;
}
blkCnt = blockSize;
pIn = in;
accum = 0;
while(blkCnt > 0)
{
tmp = *pIn++;
accum += expf(tmp - maxVal);
blkCnt--;
}
accum = maxVal + logf(accum);
return(accum);
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of groupStats group
*/

View file

@ -0,0 +1,365 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_max_f32.c
* Description: Maximum value of a floating-point vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
#include <limits.h>
#endif
/**
@ingroup groupStats
*/
/**
@defgroup Max Maximum
Computes the maximum value of an array of data.
The function returns both the maximum value and its position within the array.
There are separate functions for floating-point, Q31, Q15, and Q7 data types.
*/
/**
@addtogroup Max
@{
*/
/**
@brief Maximum value of a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult maximum value returned here
@param[out] pIndex index of maximum value returned here
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_max_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
uint32_t blkCnt;
f32x4_t vecSrc;
f32x4_t curExtremValVec = vdupq_n_f32(F32_MIN);
float32_t maxValue = F32_MIN;
uint32_t idx = blockSize;
uint32x4_t indexVec;
uint32x4_t curExtremIdxVec;
uint32_t curIdx = 0;
mve_pred16_t p0;
float32_t tmp;
indexVec = vidupq_wb_u32(&curIdx, 1);
curExtremIdxVec = vdupq_n_u32(0);
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecSrc = vldrwq_f32(pSrc);
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpgeq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
indexVec = vidupq_wb_u32(&curIdx, 1);
pSrc += 4;
/* Decrement the loop counter */
blkCnt--;
}
/*
* Get max value across the vector
*/
maxValue = vmaxnmvq(maxValue, curExtremValVec);
/*
* set index for lower values to max possible index
*/
p0 = vcmpgeq(curExtremValVec, maxValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
/*
* Get min index which is thus for a max value
*/
idx = vminvq(idx, indexVec);
/* Tail */
blkCnt = blockSize & 0x3;
while (blkCnt > 0U)
{
/* Initialize tmp to the next consecutive values one by one */
tmp = *pSrc++;
/* compare for the maximum value */
if (maxValue < tmp)
{
/* Update the maximum value and it's index */
maxValue = tmp;
idx = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/*
* Save result
*/
*pIndex = idx;
*pResult = maxValue;
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_max_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
float32_t maxVal1, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* loop counter */
float32x4_t outV, srcV;
float32x2_t outV2;
uint32x4_t idxV;
uint32x4_t maxIdx;
static const uint32_t indexInit[4]={4,5,6,7};
static const uint32_t countVInit[4]={0,1,2,3};
uint32x4_t index;
uint32x4_t delta;
uint32x4_t countV;
uint32x2_t countV2;
maxIdx = vdupq_n_u32(ULONG_MAX);
delta = vdupq_n_u32(4);
index = vld1q_u32(indexInit);
countV = vld1q_u32(countVInit);
/* Initialise the index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparison */
if (blockSize <= 3)
{
out = *pSrc++;
blkCnt = blockSize - 1;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
{
/* Update the maximum value and it's index */
out = maxVal1;
outIndex = blockSize - blkCnt;
}
/* Decrement the loop counter */
blkCnt--;
}
}
else
{
outV = vld1q_f32(pSrc);
pSrc += 4;
/* Compute 4 outputs at a time */
blkCnt = (blockSize - 4 ) >> 2U;
while (blkCnt > 0U)
{
srcV = vld1q_f32(pSrc);
pSrc += 4;
idxV = vcgtq_f32(srcV, outV);
outV = vbslq_f32(idxV, srcV, outV );
countV = vbslq_u32(idxV, index,countV );
index = vaddq_u32(index,delta);
/* Decrement the loop counter */
blkCnt--;
}
outV2 = vpmax_f32(vget_low_f32(outV),vget_high_f32(outV));
outV2 = vpmax_f32(outV2,outV2);
out = vget_lane_f32(outV2, 0);
idxV = vceqq_f32(outV, vdupq_n_f32(out));
countV = vbslq_u32(idxV, countV,maxIdx);
countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV));
countV2 = vpmin_u32(countV2,countV2);
outIndex = vget_lane_u32(countV2,0);
/* if (blockSize - 1U) is not multiple of 4 */
blkCnt = (blockSize - 4 ) % 4U;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
/* compare for the maximum value */
if (out < maxVal1)
{
/* Update the maximum value and it's index */
out = maxVal1;
outIndex = blockSize - blkCnt ;
}
/* Decrement the loop counter */
blkCnt--;
}
}
/* Store the maximum value and it's index into destination pointers */
*pResult = out;
*pIndex = outIndex;
}
#else
void arm_max_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
float32_t maxVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
uint32_t index; /* index of maximum value */
#endif
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Initialise index of maximum value. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize maxVal to next consecutive values one by one */
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal)
{
/* Update the maximum value and it's index */
out = maxVal;
outIndex = index + 1U;
}
maxVal = *pSrc++;
if (out < maxVal)
{
out = maxVal;
outIndex = index + 2U;
}
maxVal = *pSrc++;
if (out < maxVal)
{
out = maxVal;
outIndex = index + 3U;
}
maxVal = *pSrc++;
if (out < maxVal)
{
out = maxVal;
outIndex = index + 4U;
}
index += 4U;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal)
{
/* Update the maximum value and it's index */
out = maxVal;
outIndex = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/* Store the maximum value and it's index into destination pointers */
*pResult = out;
*pIndex = outIndex;
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of Max group
*/

View file

@ -0,0 +1,138 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_max_no_idx_f32.c
* Description: Maximum value of a floating-point vector without returning the index
*
* $Date: 16. October 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
#include <limits.h>
#endif
/**
@ingroup groupStats
*/
/**
@addtogroup Max
@{
*/
/**
@brief Maximum value of a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult maximum value returned here
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_max_no_idx_f32(
const float32_t *pSrc,
uint32_t blockSize,
float32_t *pResult)
{
f32x4_t vecSrc;
f32x4_t curExtremValVec = vdupq_n_f32(F32_MIN);
float32_t maxValue = F32_MIN;
float32_t newVal;
uint32_t blkCnt;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecSrc = vldrwq_f32(pSrc);
/*
* update per-lane max.
*/
curExtremValVec = vmaxnmq(vecSrc, curExtremValVec);
/*
* Decrement the blockSize loop counter
* Advance vector source and destination pointers
*/
pSrc += 4;
blkCnt --;
}
/*
* Get max value across the vector
*/
maxValue = vmaxnmvq(maxValue, curExtremValVec);
blkCnt = blockSize & 3;
while (blkCnt > 0U)
{
newVal = *pSrc++;
/* compare for the maximum value */
if (maxValue < newVal)
{
/* Update the maximum value and it's index */
maxValue = newVal;
}
blkCnt --;
}
*pResult = maxValue;
}
#else
void arm_max_no_idx_f32(
const float32_t *pSrc,
uint32_t blockSize,
float32_t *pResult)
{
float32_t maxValue = F32_MIN;
float32_t newVal;
while (blockSize > 0U)
{
newVal = *pSrc++;
/* compare for the maximum value */
if (maxValue < newVal)
{
/* Update the maximum value and it's index */
maxValue = newVal;
}
blockSize --;
}
*pResult = maxValue;
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of Max group
*/

View file

@ -0,0 +1,230 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_max_q15.c
* Description: Maximum value of a Q15 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup Max
@{
*/
/**
@brief Maximum value of a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult maximum value returned here
@param[out] pIndex index of maximum value returned here
@return none
*/
#if defined(ARM_MATH_MVEI)
#include "arm_helium_utils.h"
void arm_max_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult,
uint32_t * pIndex)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
q15x8_t curExtremValVec = vdupq_n_s16(Q15_MIN);
q15_t maxValue = Q15_MIN, temp;
uint32_t idx = blockSize;
uint16x8_t indexVec;
uint16x8_t curExtremIdxVec;
mve_pred16_t p0;
indexVec = vidupq_u16((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u16(0);
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrc = vldrhq_s16(pSrc);
pSrc += 8;
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpgeq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
indexVec = indexVec + 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* Get max value across the vector
*/
maxValue = vmaxvq(maxValue, curExtremValVec);
/*
* set index for lower values to max possible index
*/
p0 = vcmpgeq(curExtremValVec, maxValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
/*
* Get min index which is thus for a max value
*/
idx = vminvq(idx, indexVec);
/* Tail */
blkCnt = blockSize & 0x7;
while (blkCnt > 0U)
{
/* Initialize temp to the next consecutive values one by one */
temp = *pSrc++;
/* compare for the maximum value */
if (maxValue < temp)
{
/* Update the maximum value and it's index */
maxValue = temp;
idx = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/*
* Save result
*/
*pIndex = idx;
*pResult = maxValue;
}
#else
void arm_max_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult,
uint32_t * pIndex)
{
q15_t maxVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
uint32_t index; /* index of maximum value */
#endif
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
#if defined (ARM_MATH_LOOPUNROLL)
/* Initialise index of maximum value. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize maxVal to next consecutive values one by one */
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal)
{
/* Update the maximum value and it's index */
out = maxVal;
outIndex = index + 1U;
}
maxVal = *pSrc++;
if (out < maxVal)
{
out = maxVal;
outIndex = index + 2U;
}
maxVal = *pSrc++;
if (out < maxVal)
{
out = maxVal;
outIndex = index + 3U;
}
maxVal = *pSrc++;
if (out < maxVal)
{
out = maxVal;
outIndex = index + 4U;
}
index += 4U;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal)
{
/* Update the maximum value and it's index */
out = maxVal;
outIndex = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/* Store the maximum value and it's index into destination pointers */
*pResult = out;
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of Max group
*/

View file

@ -0,0 +1,234 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_max_q31.c
* Description: Maximum value of a Q31 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup Max
@{
*/
/**
@brief Maximum value of a Q31 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult maximum value returned here
@param[out] pIndex index of maximum value returned here
@return none
*/
#if defined(ARM_MATH_MVEI)
#include "arm_helium_utils.h"
void arm_max_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult,
uint32_t * pIndex)
{
uint32_t blkCnt; /* loop counters */
q31x4_t vecSrc;
q31x4_t curExtremValVec = vdupq_n_s32( Q31_MIN);
q31_t maxValue = Q31_MIN;
q31_t temp;
uint32_t idx = blockSize;
uint32x4_t indexVec;
uint32x4_t curExtremIdxVec;
mve_pred16_t p0;
indexVec = vidupq_u32((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u32(0);
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecSrc = vldrwq_s32(pSrc);
pSrc += 4;
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpgeq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
indexVec = indexVec + 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* Get max value across the vector
*/
maxValue = vmaxvq(maxValue, curExtremValVec);
/*
* set index for lower values to max possible index
*/
p0 = vcmpgeq(curExtremValVec, maxValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
/*
* Get min index which is thus for a max value
*/
idx = vminvq(idx, indexVec);
/* Tail */
blkCnt = blockSize & 0x3;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
temp = *pSrc++;
/* compare for the maximum value */
if (maxValue < temp)
{
/* Update the maximum value and it's index */
maxValue = temp;
idx = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/*
* Save result
*/
*pIndex = idx;
*pResult = maxValue;
}
#else
void arm_max_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult,
uint32_t * pIndex)
{
q31_t maxVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
uint32_t index; /* index of maximum value */
#endif
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
#if defined (ARM_MATH_LOOPUNROLL)
/* Initialise index of maximum value. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize maxVal to next consecutive values one by one */
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal)
{
/* Update the maximum value and it's index */
out = maxVal;
outIndex = index + 1U;
}
maxVal = *pSrc++;
if (out < maxVal)
{
out = maxVal;
outIndex = index + 2U;
}
maxVal = *pSrc++;
if (out < maxVal)
{
out = maxVal;
outIndex = index + 3U;
}
maxVal = *pSrc++;
if (out < maxVal)
{
out = maxVal;
outIndex = index + 4U;
}
index += 4U;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal)
{
/* Update the maximum value and it's index */
out = maxVal;
outIndex = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/* Store the maximum value and it's index into destination pointers */
*pResult = out;
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of Max group
*/

View file

@ -0,0 +1,288 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_max_q7.c
* Description: Maximum value of a Q7 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup Max
@{
*/
/**
@brief Maximum value of a Q7 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult maximum value returned here
@param[out] pIndex index of maximum value returned here
@return none
*/
#if defined(ARM_MATH_MVEI)
#include "arm_helium_utils.h"
static void arm_small_blk_max_q7(
const q7_t * pSrc,
uint8_t blockSize,
q7_t * pResult,
uint32_t * pIndex)
{
uint32_t blkCnt; /* loop counters */
q7x16_t vecSrc;
q7x16_t curExtremValVec = vdupq_n_s8( Q7_MIN);
q7_t maxValue = Q7_MIN, temp;
uint32_t idx = blockSize;
uint8x16_t indexVec;
uint8x16_t curExtremIdxVec;
mve_pred16_t p0;
indexVec = vidupq_u8((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u8(0);
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
vecSrc = vldrbq_s8(pSrc);
pSrc += 16;
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpgeq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
indexVec = indexVec + 16;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* Get max value across the vector
*/
maxValue = vmaxvq(maxValue, curExtremValVec);
/*
* set index for lower values to max possible index
*/
p0 = vcmpgeq(curExtremValVec, maxValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u8(blockSize), p0);
/*
* Get min index which is thus for a max value
*/
idx = vminvq(idx, indexVec);
/*
* tail
*/
blkCnt = blockSize & 0xF;
while (blkCnt > 0U)
{
/* Initialize temp to the next consecutive values one by one */
temp = *pSrc++;
/* compare for the maximum value */
if (maxValue < temp)
{
/* Update the maximum value and it's index */
maxValue = temp;
idx = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/*
* Save result
*/
*pIndex = idx;
*pResult = maxValue;
}
void arm_max_q7(
const q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult,
uint32_t * pIndex)
{
int32_t totalSize = blockSize;
if (totalSize <= UINT8_MAX)
{
arm_small_blk_max_q7(pSrc, blockSize, pResult, pIndex);
}
else
{
uint32_t curIdx = 0;
q7_t curBlkExtr = Q7_MIN;
uint32_t curBlkPos = 0;
uint32_t curBlkIdx = 0;
/*
* process blocks of 255 elts
*/
while (totalSize >= UINT8_MAX)
{
const q7_t *curSrc = pSrc;
arm_small_blk_max_q7(curSrc, UINT8_MAX, pResult, pIndex);
if (*pResult > curBlkExtr)
{
/*
* update partial extrema
*/
curBlkExtr = *pResult;
curBlkPos = *pIndex;
curBlkIdx = curIdx;
}
curIdx++;
pSrc += UINT8_MAX;
totalSize -= UINT8_MAX;
}
/*
* remainder
*/
arm_small_blk_max_q7(pSrc, totalSize, pResult, pIndex);
if (*pResult > curBlkExtr)
{
curBlkExtr = *pResult;
curBlkPos = *pIndex;
curBlkIdx = curIdx;
}
*pIndex = curBlkIdx * UINT8_MAX + curBlkPos;
*pResult = curBlkExtr;
}
}
#else
void arm_max_q7(
const q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult,
uint32_t * pIndex)
{
q7_t maxVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
uint32_t index; /* index of maximum value */
#endif
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
#if defined (ARM_MATH_LOOPUNROLL)
/* Initialise index of maximum value. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize maxVal to next consecutive values one by one */
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal)
{
/* Update the maximum value and it's index */
out = maxVal;
outIndex = index + 1U;
}
maxVal = *pSrc++;
if (out < maxVal)
{
out = maxVal;
outIndex = index + 2U;
}
maxVal = *pSrc++;
if (out < maxVal)
{
out = maxVal;
outIndex = index + 3U;
}
maxVal = *pSrc++;
if (out < maxVal)
{
out = maxVal;
outIndex = index + 4U;
}
index += 4U;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal = *pSrc++;
/* compare for the maximum value */
if (out < maxVal)
{
/* Update the maximum value and it's index */
out = maxVal;
outIndex = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/* Store the maximum value and it's index into destination pointers */
*pResult = out;
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of Max group
*/

View file

@ -0,0 +1,210 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mean_f32.c
* Description: Mean value of a floating-point vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@defgroup mean Mean
Calculates the mean of the input vector. Mean is defined as the average of the elements in the vector.
The underlying algorithm is used:
<pre>
Result = (pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]) / blockSize;
</pre>
There are separate functions for floating-point, Q31, Q15, and Q7 data types.
*/
/**
@addtogroup mean
@{
*/
/**
@brief Mean value of a floating-point vector.
@param[in] pSrc points to the input vector.
@param[in] blockSize number of samples in input vector.
@param[out] pResult mean value returned here.
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mean_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
uint32_t blkCnt; /* loop counters */
f32x4_t vecSrc;
f32x4_t sumVec = vdupq_n_f32(0.0f);
float32_t sum = 0.0f;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecSrc = vldrwq_f32(pSrc);
sumVec = vaddq_f32(sumVec, vecSrc);
blkCnt --;
pSrc += 4;
}
sum = vecAddAcrossF32Mve(sumVec);
/* Tail */
blkCnt = blockSize & 0x3;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement loop counter */
blkCnt--;
}
*pResult = sum / (float32_t) blockSize;
}
#else
#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_mean_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
float32_t sum = 0.0f; /* Temporary result storage */
float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
float32x2_t sumV2;
uint32_t blkCnt; /* Loop counter */
float32x4_t inV;
blkCnt = blockSize >> 2U;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
inV = vld1q_f32(pSrc);
sumV = vaddq_f32(sumV, inV);
pSrc += 4;
/* Decrement the loop counter */
blkCnt--;
}
sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize & 3;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement the loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
/* Store the result to the destination */
*pResult = sum / (float32_t) blockSize;
}
#else
void arm_mean_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
float32_t sum = 0.0f; /* Temporary result storage */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
sum += *pSrc++;
sum += *pSrc++;
sum += *pSrc++;
/* Decrement the loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
/* Store result to destination */
*pResult = (sum / blockSize);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of mean group
*/

View file

@ -0,0 +1,156 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mean_q15.c
* Description: Mean value of a Q15 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup mean
@{
*/
/**
@brief Mean value of a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult mean value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 32-bit internal accumulator.
The input is represented in 1.15 format and is accumulated in a 32-bit
accumulator in 17.15 format.
There is no risk of internal overflow with this approach, and the
full precision of intermediate result is preserved.
Finally, the accumulator is truncated to yield a result of 1.15 format.
*/
#if defined(ARM_MATH_MVEI)
void arm_mean_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
q31_t sum = 0L;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
vecSrc = vldrhq_s16(pSrc);
/*
* sum lanes
*/
sum = vaddvaq(sum, vecSrc);
blkCnt--;
pSrc += 8;
}
/* Tail */
blkCnt = blockSize & 0x7;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
/* Store the result to the destination */
*pResult = (q15_t) (sum / (int32_t) blockSize);
}
#else
void arm_mean_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
q31_t sum = 0; /* Temporary result storage */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t in;
#endif
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
in = read_q15x2_ia ((q15_t **) &pSrc);
sum += ((in << 16U) >> 16U);
sum += (in >> 16U);
in = read_q15x2_ia ((q15_t **) &pSrc);
sum += ((in << 16U) >> 16U);
sum += (in >> 16U);
/* Decrement the loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
/* Store result to destination */
*pResult = (q15_t) (sum / (int32_t) blockSize);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of mean group
*/

View file

@ -0,0 +1,149 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mean_q31.c
* Description: Mean value of a Q31 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup mean
@{
*/
/**
@brief Mean value of a Q31 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult mean value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator.
The input is represented in 1.31 format and is accumulated in a 64-bit
accumulator in 33.31 format.
There is no risk of internal overflow with this approach, and the
full precision of intermediate result is preserved.
Finally, the accumulator is truncated to yield a result of 1.31 format.
*/
#if defined(ARM_MATH_MVEI)
void arm_mean_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
{
uint32_t blkCnt; /* loop counters */
q31x4_t vecSrc;
q63_t sum = 0LL;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecSrc = vldrwq_s32(pSrc);
/*
* sum lanes
*/
sum = vaddlvaq(sum, vecSrc);
blkCnt --;
pSrc += 4;
}
/* Tail */
blkCnt = blockSize & 0x3;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
blkCnt --;
}
*pResult = arm_div_q63_to_q31(sum, blockSize);
}
#else
void arm_mean_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary result storage */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
sum += *pSrc++;
sum += *pSrc++;
sum += *pSrc++;
/* Decrement the loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
/* Store result to destination */
*pResult = (q31_t) (sum / blockSize);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of mean group
*/

View file

@ -0,0 +1,153 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mean_q7.c
* Description: Mean value of a Q7 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup mean
@{
*/
/**
@brief Mean value of a Q7 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult mean value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 32-bit internal accumulator.
The input is represented in 1.7 format and is accumulated in a 32-bit
accumulator in 25.7 format.
There is no risk of internal overflow with this approach, and the
full precision of intermediate result is preserved.
Finally, the accumulator is truncated to yield a result of 1.7 format.
*/
#if defined(ARM_MATH_MVEI)
void arm_mean_q7(
const q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult)
{
uint32_t blkCnt; /* loop counters */
q7x16_t vecSrc;
q31_t sum = 0L;
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
vecSrc = vldrbq_s8(pSrc);
/*
* sum lanes
*/
sum = vaddvaq(sum, vecSrc);
blkCnt--;
pSrc += 16;
}
blkCnt = blockSize & 0xF;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
/* Store the result to the destination */
*pResult = (q7_t) (sum / (int32_t) blockSize);
}
#else
void arm_mean_q7(
const q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
q31_t sum = 0; /* Temporary result storage */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t in;
#endif
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
in = read_q7x4_ia ((q7_t **) &pSrc);
sum += ((in << 24U) >> 24U);
sum += ((in << 16U) >> 24U);
sum += ((in << 8U) >> 24U);
sum += (in >> 24U);
/* Decrement the loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
/* Store result to destination */
*pResult = (q7_t) (sum / (int32_t) blockSize);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of mean group
*/

View file

@ -0,0 +1,363 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_min_f32.c
* Description: Minimum value of a floating-point vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
#include <limits.h>
#endif
/**
@ingroup groupStats
*/
/**
@defgroup Min Minimum
Computes the minimum value of an array of data.
The function returns both the minimum value and its position within the array.
There are separate functions for floating-point, Q31, Q15, and Q7 data types.
*/
/**
@addtogroup Min
@{
*/
/**
@brief Minimum value of a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult minimum value returned here
@param[out] pIndex index of minimum value returned here
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_min_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
uint32_t blkCnt; /* loop counters */
f32x4_t vecSrc;
float32_t const *pSrcVec;
f32x4_t curExtremValVec = vdupq_n_f32(F32_MAX);
float32_t minValue = F32_MAX;
uint32_t idx = blockSize;
uint32x4_t indexVec;
uint32x4_t curExtremIdxVec;
float32_t tmp;
mve_pred16_t p0;
indexVec = vidupq_u32((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u32(0);
pSrcVec = (float32_t const *) pSrc;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecSrc = vldrwq_f32(pSrcVec);
pSrcVec += 4;
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpleq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
indexVec = indexVec + 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* Get min value across the vector
*/
minValue = vminnmvq(minValue, curExtremValVec);
/*
* set index for lower values to max possible index
*/
p0 = vcmpleq(curExtremValVec, minValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
/*
* Get min index which is thus for a max value
*/
idx = vminvq(idx, indexVec);
/*
* tail
*/
blkCnt = blockSize & 0x3;
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
tmp = *pSrc++;
/* compare for the minimum value */
if (minValue > tmp)
{
/* Update the minimum value and it's index */
minValue = tmp;
idx = blockSize - blkCnt;
}
blkCnt--;
}
/*
* Save result
*/
*pIndex = idx;
*pResult = minValue;
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_min_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
float32_t maxVal1, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* loop counter */
float32x4_t outV, srcV;
float32x2_t outV2;
uint32x4_t idxV;
static const uint32_t indexInit[4]={4,5,6,7};
static const uint32_t countVInit[4]={0,1,2,3};
uint32x4_t maxIdx;
uint32x4_t index;
uint32x4_t delta;
uint32x4_t countV;
uint32x2_t countV2;
maxIdx = vdupq_n_u32(ULONG_MAX);
delta = vdupq_n_u32(4);
index = vld1q_u32(indexInit);
countV = vld1q_u32(countVInit);
/* Initialise the index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparison */
if (blockSize <= 3)
{
out = *pSrc++;
blkCnt = blockSize - 1;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
/* compare for the maximum value */
if (out > maxVal1)
{
/* Update the maximum value and it's index */
out = maxVal1;
outIndex = blockSize - blkCnt;
}
/* Decrement the loop counter */
blkCnt--;
}
}
else
{
outV = vld1q_f32(pSrc);
pSrc += 4;
/* Compute 4 outputs at a time */
blkCnt = (blockSize - 4 ) >> 2U;
while (blkCnt > 0U)
{
srcV = vld1q_f32(pSrc);
pSrc += 4;
idxV = vcltq_f32(srcV, outV);
outV = vbslq_f32(idxV, srcV, outV );
countV = vbslq_u32(idxV, index,countV );
index = vaddq_u32(index,delta);
/* Decrement the loop counter */
blkCnt--;
}
outV2 = vpmin_f32(vget_low_f32(outV),vget_high_f32(outV));
outV2 = vpmin_f32(outV2,outV2);
out = vget_lane_f32(outV2,0);
idxV = vceqq_f32(outV, vdupq_n_f32(out));
countV = vbslq_u32(idxV, countV,maxIdx);
countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV));
countV2 = vpmin_u32(countV2,countV2);
outIndex = vget_lane_u32(countV2,0);
/* if (blockSize - 1U) is not multiple of 4 */
blkCnt = (blockSize - 4 ) % 4U;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
maxVal1 = *pSrc++;
/* compare for the maximum value */
if (out > maxVal1)
{
/* Update the maximum value and it's index */
out = maxVal1;
outIndex = blockSize - blkCnt ;
}
/* Decrement the loop counter */
blkCnt--;
}
}
/* Store the maximum value and it's index into destination pointers */
*pResult = out;
*pIndex = outIndex;
}
#else
void arm_min_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
float32_t minVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
uint32_t index; /* index of maximum value */
#endif
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Initialise index of maximum value. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize minVal to next consecutive values one by one */
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal)
{
/* Update the minimum value and it's index */
out = minVal;
outIndex = index + 1U;
}
minVal = *pSrc++;
if (out > minVal)
{
out = minVal;
outIndex = index + 2U;
}
minVal = *pSrc++;
if (out > minVal)
{
out = minVal;
outIndex = index + 3U;
}
minVal = *pSrc++;
if (out > minVal)
{
out = minVal;
outIndex = index + 4U;
}
index += 4U;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal)
{
/* Update the minimum value and it's index */
out = minVal;
outIndex = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/* Store the minimum value and it's index into destination pointers */
*pResult = out;
*pIndex = outIndex;
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of Min group
*/

View file

@ -0,0 +1,234 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_min_q15.c
* Description: Minimum value of a Q15 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup Min
@{
*/
/**
@brief Minimum value of a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult minimum value returned here
@param[out] pIndex index of minimum value returned here
@return none
*/
#if defined(ARM_MATH_MVEI)
#include "arm_helium_utils.h"
void arm_min_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult,
uint32_t * pIndex)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
q15x8_t curExtremValVec = vdupq_n_s16(Q15_MAX);
q15_t minValue = Q15_MAX,temp;
uint32_t idx = blockSize;
uint16x8_t indexVec;
uint16x8_t curExtremIdxVec;
mve_pred16_t p0;
indexVec = vidupq_u16((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u16(0);
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrc = vldrhq_s16(pSrc);
pSrc += 8;
/*
* Get current min per lane and current index per lane
* when a min is selected
*/
p0 = vcmpleq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
indexVec = indexVec + 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* Get min value across the vector
*/
minValue = vminvq(minValue, curExtremValVec);
/*
* set index for lower values to min possible index
*/
p0 = vcmpleq(curExtremValVec, minValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
/*
* Get min index which is thus for a min value
*/
idx = vminvq(idx, indexVec);
/*
* tail
*/
blkCnt = blockSize & 7;
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
temp = *pSrc++;
/* compare for the minimum value */
if (minValue > temp)
{
/* Update the minimum value and it's index */
minValue = temp;
idx = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/*
* Save result
*/
*pIndex = idx;
*pResult = minValue;
}
#else
void arm_min_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult,
uint32_t * pIndex)
{
q15_t minVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
uint32_t index; /* index of maximum value */
#endif
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
#if defined (ARM_MATH_LOOPUNROLL)
/* Initialise index of maximum value. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize minVal to next consecutive values one by one */
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal)
{
/* Update the minimum value and it's index */
out = minVal;
outIndex = index + 1U;
}
minVal = *pSrc++;
if (out > minVal)
{
out = minVal;
outIndex = index + 2U;
}
minVal = *pSrc++;
if (out > minVal)
{
out = minVal;
outIndex = index + 3U;
}
minVal = *pSrc++;
if (out > minVal)
{
out = minVal;
outIndex = index + 4U;
}
index += 4U;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal)
{
/* Update the minimum value and it's index */
out = minVal;
outIndex = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/* Store the minimum value and it's index into destination pointers */
*pResult = out;
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of Min group
*/

View file

@ -0,0 +1,233 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_min_q31.c
* Description: Minimum value of a Q31 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup Min
@{
*/
/**
@brief Minimum value of a Q31 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult minimum value returned here
@param[out] pIndex index of minimum value returned here
@return none
*/
#if defined(ARM_MATH_MVEI)
#include "arm_helium_utils.h"
void arm_min_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult,
uint32_t * pIndex)
{
uint32_t blkCnt; /* loop counters */
q31x4_t vecSrc;
q31x4_t curExtremValVec = vdupq_n_s32(Q31_MAX);
q31_t minValue = Q31_MAX, temp;
uint32_t idx = blockSize;
uint32x4_t indexVec;
uint32x4_t curExtremIdxVec;
mve_pred16_t p0;
indexVec = vidupq_u32((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u32(0);
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecSrc = vldrwq_s32(pSrc);
pSrc += 4;
/*
* Get current min per lane and current index per lane
* when a min is selected
*/
p0 = vcmpleq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
indexVec = indexVec + 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* Get min value across the vector
*/
minValue = vminvq(minValue, curExtremValVec);
/*
* set index for lower values to min possible index
*/
p0 = vcmpleq(curExtremValVec, minValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
/*
* Get min index which is thus for a min value
*/
idx = vminvq(idx, indexVec);
/* Tail */
blkCnt = blockSize & 0x3;
while (blkCnt > 0U)
{
/* Initialize temp to the next consecutive values one by one */
temp = *pSrc++;
/* compare for the minimum value */
if (minValue > temp)
{
/* Update the minimum value and it's index */
minValue = temp;
idx = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/*
* Save result
*/
*pIndex = idx;
*pResult = minValue;
}
#else
void arm_min_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult,
uint32_t * pIndex)
{
q31_t minVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
uint32_t index; /* index of maximum value */
#endif
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
#if defined (ARM_MATH_LOOPUNROLL)
/* Initialise index of maximum value. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize minVal to next consecutive values one by one */
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal)
{
/* Update the minimum value and it's index */
out = minVal;
outIndex = index + 1U;
}
minVal = *pSrc++;
if (out > minVal)
{
out = minVal;
outIndex = index + 2U;
}
minVal = *pSrc++;
if (out > minVal)
{
out = minVal;
outIndex = index + 3U;
}
minVal = *pSrc++;
if (out > minVal)
{
out = minVal;
outIndex = index + 4U;
}
index += 4U;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal)
{
/* Update the minimum value and it's index */
out = minVal;
outIndex = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/* Store the minimum value and it's index into destination pointers */
*pResult = out;
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of Min group
*/

View file

@ -0,0 +1,284 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_min_q7.c
* Description: Minimum value of a Q7 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup Min
@{
*/
/**
@brief Minimum value of a Q7 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult minimum value returned here
@param[out] pIndex index of minimum value returned here
@return none
*/
#if defined(ARM_MATH_MVEI)
#include "arm_helium_utils.h"
static void arm_small_blk_min_q7(
const q7_t * pSrc,
uint8_t blockSize,
q7_t * pResult,
uint32_t * pIndex)
{
uint32_t blkCnt; /* loop counters */
q7x16_t vecSrc;
q7x16_t curExtremValVec = vdupq_n_s8(Q7_MAX);
q7_t minValue = Q7_MAX,temp;
uint32_t idx = blockSize;
uint8x16_t indexVec;
uint8x16_t curExtremIdxVec;
mve_pred16_t p0;
indexVec = vidupq_u8((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u8(0);
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
vecSrc = vldrbq_s8(pSrc);
pSrc += 16;
/*
* Get current min per lane and current index per lane
* when a min is selected
*/
p0 = vcmpleq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
indexVec = indexVec + 16;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* Get min value across the vector
*/
minValue = vminvq(minValue, curExtremValVec);
/*
* set index for lower values to min possible index
*/
p0 = vcmpleq(curExtremValVec, minValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u8(blockSize), p0);
/*
* Get min index which is thus for a min value
*/
idx = vminvq(idx, indexVec);
blkCnt = blockSize & 0xF;
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
temp = *pSrc++;
/* compare for the minimum value */
if (minValue > temp)
{
/* Update the minimum value and it's index */
minValue = temp;
idx = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/*
* Save result
*/
*pIndex = idx;
*pResult = minValue;
}
void arm_min_q7(
const q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult,
uint32_t * pIndex)
{
int32_t totalSize = blockSize;
if (totalSize <= UINT8_MAX)
{
arm_small_blk_min_q7(pSrc, blockSize, pResult, pIndex);
}
else
{
uint32_t curIdx = 0;
q7_t curBlkExtr = Q7_MAX;
uint32_t curBlkPos = 0;
uint32_t curBlkIdx = 0;
/*
* process blocks of 255 elts
*/
while (totalSize >= UINT8_MAX)
{
const q7_t *curSrc = pSrc;
arm_small_blk_min_q7(curSrc, UINT8_MAX, pResult, pIndex);
if (*pResult < curBlkExtr)
{
/*
* update partial extrema
*/
curBlkExtr = *pResult;
curBlkPos = *pIndex;
curBlkIdx = curIdx;
}
curIdx++;
pSrc += UINT8_MAX;
totalSize -= UINT8_MAX;
}
/*
* remainder
*/
arm_small_blk_min_q7(pSrc, totalSize, pResult, pIndex);
if (*pResult < curBlkExtr)
{
curBlkExtr = *pResult;
curBlkPos = *pIndex;
curBlkIdx = curIdx;
}
*pIndex = curBlkIdx * UINT8_MAX + curBlkPos;
*pResult = curBlkExtr;
}
}
#else
void arm_min_q7(
const q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult,
uint32_t * pIndex)
{
q7_t minVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt, outIndex; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
uint32_t index; /* index of maximum value */
#endif
/* Initialise index value to zero. */
outIndex = 0U;
/* Load first input value that act as reference value for comparision */
out = *pSrc++;
#if defined (ARM_MATH_LOOPUNROLL)
/* Initialise index of maximum value. */
index = 0U;
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = (blockSize - 1U) >> 2U;
while (blkCnt > 0U)
{
/* Initialize minVal to next consecutive values one by one */
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal)
{
/* Update the minimum value and it's index */
out = minVal;
outIndex = index + 1U;
}
minVal = *pSrc++;
if (out > minVal)
{
out = minVal;
outIndex = index + 2U;
}
minVal = *pSrc++;
if (out > minVal)
{
out = minVal;
outIndex = index + 3U;
}
minVal = *pSrc++;
if (out > minVal)
{
out = minVal;
outIndex = index + 4U;
}
index += 4U;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize - 1U) % 4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
minVal = *pSrc++;
/* compare for the minimum value */
if (out > minVal)
{
/* Update the minimum value and it's index */
out = minVal;
outIndex = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
}
/* Store the minimum value and it's index into destination pointers */
*pResult = out;
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of Min group
*/

View file

@ -0,0 +1,225 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_power_f32.c
* Description: Sum of the squares of the elements of a floating-point vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@defgroup power Power
Calculates the sum of the squares of the elements in the input vector.
The underlying algorithm is used:
<pre>
Result = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + pSrc[2] * pSrc[2] + ... + pSrc[blockSize-1] * pSrc[blockSize-1];
</pre>
There are separate functions for floating point, Q31, Q15, and Q7 data types.
*/
/**
@addtogroup power
@{
*/
/**
@brief Sum of the squares of the elements of a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult sum of the squares value returned here
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_power_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
uint32_t blkCnt; /* loop counters */
f32x4_t vecSrc;
f32x4_t sumVec = vdupq_n_f32(0.0f);
float32_t sum = 0.0f;
float32_t in;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecSrc = vldrwq_f32(pSrc);
/*
* sum lanes
*/
sumVec = vfmaq(sumVec, vecSrc, vecSrc);
blkCnt --;
pSrc += 4;
}
sum = vecAddAcrossF32Mve(sumVec);
/*
* tail
*/
blkCnt = blockSize & 0x3;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += in * in;
/* Decrement loop counter */
blkCnt--;
}
*pResult = sum;
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_power_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
float32_t sum = 0.0f; /* accumulator */
float32_t in; /* Temporary variable to store input value */
uint32_t blkCnt; /* loop counter */
float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
float32x2_t sumV2;
float32x4_t inV;
blkCnt = blockSize >> 2U;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and then store the result in a temporary variable, sum. */
inV = vld1q_f32(pSrc);
sumV = vmlaq_f32(sumV, inV, inV);
pSrc += 4;
/* Decrement the loop counter */
blkCnt--;
}
sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* compute power and then store the result in a temporary variable, sum. */
in = *pSrc++;
sum += in * in;
/* Decrement the loop counter */
blkCnt--;
}
/* Store the result to the destination */
*pResult = sum;
}
#else
void arm_power_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
float32_t sum = 0.0f; /* Temporary result storage */
float32_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += in * in;
in = *pSrc++;
sum += in * in;
in = *pSrc++;
sum += in * in;
in = *pSrc++;
sum += in * in;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += in * in;
/* Decrement loop counter */
blkCnt--;
}
/* Store result to destination */
*pResult = sum;
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of power group
*/

View file

@ -0,0 +1,177 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_power_q15.c
* Description: Sum of the squares of the elements of a Q15 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup power
@{
*/
/**
@brief Sum of the squares of the elements of a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult sum of the squares value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator.
The input is represented in 1.15 format.
Intermediate multiplication yields a 2.30 format, and this
result is added without saturation to a 64-bit accumulator in 34.30 format.
With 33 guard bits in the accumulator, there is no risk of overflow, and the
full precision of the intermediate multiplication is preserved.
Finally, the return result is in 34.30 format.
*/
#if defined(ARM_MATH_MVEI)
void arm_power_q15(
const q15_t * pSrc,
uint32_t blockSize,
q63_t * pResult)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
q63_t sum = 0LL;
q15_t in;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
vecSrc = vldrhq_s16(pSrc);
/*
* sum lanes
*/
sum = vmlaldavaq(sum, vecSrc, vecSrc);
blkCnt --;
pSrc += 8;
}
/*
* tail
*/
blkCnt = blockSize & 0x7;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q31_t) in * in);
/* Decrement loop counter */
blkCnt--;
}
*pResult = sum;
}
#else
void arm_power_q15(
const q15_t * pSrc,
uint32_t blockSize,
q63_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary result storage */
q15_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t in32; /* Temporary variable to store packed input value */
#endif
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
#if defined (ARM_MATH_DSP)
in32 = read_q15x2_ia ((q15_t **) &pSrc);
sum = __SMLALD(in32, in32, sum);
in32 = read_q15x2_ia ((q15_t **) &pSrc);
sum = __SMLALD(in32, in32, sum);
#else
in = *pSrc++;
sum += ((q31_t) in * in);
in = *pSrc++;
sum += ((q31_t) in * in);
in = *pSrc++;
sum += ((q31_t) in * in);
in = *pSrc++;
sum += ((q31_t) in * in);
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q31_t) in * in);
/* Decrement loop counter */
blkCnt--;
}
/* Store result in 34.30 format */
*pResult = sum;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of power group
*/

View file

@ -0,0 +1,165 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_power_q31.c
* Description: Sum of the squares of the elements of a Q31 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup power
@{
*/
/**
@brief Sum of the squares of the elements of a Q31 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult sum of the squares value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator.
The input is represented in 1.31 format.
Intermediate multiplication yields a 2.62 format, and this
result is truncated to 2.48 format by discarding the lower 14 bits.
The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
With 15 guard bits in the accumulator, there is no risk of overflow, and the
full precision of the intermediate multiplication is preserved.
Finally, the return result is in 16.48 format.
*/
#if defined(ARM_MATH_MVEI)
void arm_power_q31(
const q31_t * pSrc,
uint32_t blockSize,
q63_t * pResult)
{
uint32_t blkCnt; /* loop counters */
q31x4_t vecSrc;
q63_t sum = 0LL;
q31_t in;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecSrc = vldrwq_s32(pSrc);
/*
* sum lanes
*/
sum = vrmlaldavhaq(sum, vecSrc, vecSrc);
blkCnt --;
pSrc += 4;
}
/*
* tail
*/
blkCnt = blockSize & 0x3;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q63_t) in * in) >> 8;
/* Decrement loop counter */
blkCnt--;
}
*pResult = asrl(sum, 6);
}
#else
void arm_power_q31(
const q31_t * pSrc,
uint32_t blockSize,
q63_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary result storage */
q31_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power then shift intermediate results by 14 bits to maintain 16.48 format and store result in a temporary variable sum, providing 15 guard bits. */
in = *pSrc++;
sum += ((q63_t) in * in) >> 14U;
in = *pSrc++;
sum += ((q63_t) in * in) >> 14U;
in = *pSrc++;
sum += ((q63_t) in * in) >> 14U;
in = *pSrc++;
sum += ((q63_t) in * in) >> 14U;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q63_t) in * in) >> 14U;
/* Decrement loop counter */
blkCnt--;
}
/* Store results in 16.48 format */
*pResult = sum;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of power group
*/

View file

@ -0,0 +1,180 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_power_q7.c
* Description: Sum of the squares of the elements of a Q7 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup power
@{
*/
/**
@brief Sum of the squares of the elements of a Q7 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult sum of the squares value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 32-bit internal accumulator.
The input is represented in 1.7 format.
Intermediate multiplication yields a 2.14 format, and this
result is added without saturation to an accumulator in 18.14 format.
With 17 guard bits in the accumulator, there is no risk of overflow, and the
full precision of the intermediate multiplication is preserved.
Finally, the return result is in 18.14 format.
*/
#if defined(ARM_MATH_MVEI)
void arm_power_q7(
const q7_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
{
uint32_t blkCnt; /* loop counters */
q7x16_t vecSrc;
q31_t sum = 0LL;
q7_t in;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4U;
while (blkCnt > 0U)
{
vecSrc = vldrbq_s8(pSrc);
/*
* sum lanes
*/
sum = vmladavaq(sum, vecSrc, vecSrc);
blkCnt--;
pSrc += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q15_t) in * in);
/* Decrement loop counter */
blkCnt--;
}
*pResult = sum;
}
#else
void arm_power_q7(
const q7_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
q31_t sum = 0; /* Temporary result storage */
q7_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t in32; /* Temporary variable to store packed input value */
q31_t in1, in2; /* Temporary variables to store input value */
#endif
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
#if defined (ARM_MATH_DSP)
in32 = read_q7x4_ia ((q7_t **) &pSrc);
in1 = __SXTB16(__ROR(in32, 8));
in2 = __SXTB16(in32);
/* calculate power and accumulate to accumulator */
sum = __SMLAD(in1, in1, sum);
sum = __SMLAD(in2, in2, sum);
#else
in = *pSrc++;
sum += ((q15_t) in * in);
in = *pSrc++;
sum += ((q15_t) in * in);
in = *pSrc++;
sum += ((q15_t) in * in);
in = *pSrc++;
sum += ((q15_t) in * in);
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += ((q15_t) in * in);
/* Decrement loop counter */
blkCnt--;
}
/* Store result in 18.14 format */
*pResult = sum;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of power group
*/

View file

@ -0,0 +1,192 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_rms_f32.c
* Description: Root mean square value of the elements of a floating-point vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@defgroup RMS Root mean square (RMS)
Calculates the Root Mean Square of the elements in the input vector.
The underlying algorithm is used:
<pre>
Result = sqrt(((pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]) / blockSize));
</pre>
There are separate functions for floating point, Q31, and Q15 data types.
*/
/**
@addtogroup RMS
@{
*/
/**
@brief Root Mean Square of the elements of a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult root mean square value returned here
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_rms_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
float32_t pow = 0.0f;
arm_power_f32(pSrc, blockSize, &pow);
/* Compute Rms and store the result in the destination */
arm_sqrt_f32(pow / (float32_t) blockSize, pResult);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_rms_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
float32_t sum = 0.0f; /* accumulator */
float32_t in; /* Temporary variable to store input value */
uint32_t blkCnt; /* loop counter */
float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
float32x2_t sumV2;
float32x4_t inV;
blkCnt = blockSize >> 2U;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and then store the result in a temporary variable, sum. */
inV = vld1q_f32(pSrc);
sumV = vmlaq_f32(sumV, inV, inV);
pSrc += 4;
/* Decrement the loop counter */
blkCnt--;
}
sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* compute power and then store the result in a temporary variable, sum. */
in = *pSrc++;
sum += in * in;
/* Decrement the loop counter */
blkCnt--;
}
/* Compute Rms and store the result in the destination */
arm_sqrt_f32(sum / (float32_t) blockSize, pResult);
}
#else
void arm_rms_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
float32_t sum = 0.0f; /* Temporary result storage */
float32_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
in = *pSrc++;
/* Compute sum of squares and store result in a temporary variable, sum. */
sum += in * in;
in = *pSrc++;
sum += in * in;
in = *pSrc++;
sum += in * in;
in = *pSrc++;
sum += in * in;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
in = *pSrc++;
/* Compute sum of squares and store result in a temporary variable. */
sum += ( in * in);
/* Decrement loop counter */
blkCnt--;
}
/* Compute Rms and store result in destination */
arm_sqrt_f32(sum / (float32_t) blockSize, pResult);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of RMS group
*/

View file

@ -0,0 +1,149 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_rms_q15.c
* Description: Root Mean Square of the elements of a Q15 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup RMS
@{
*/
/**
@brief Root Mean Square of the elements of a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult root mean square value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator.
The input is represented in 1.15 format.
Intermediate multiplication yields a 2.30 format, and this
result is added without saturation to a 64-bit accumulator in 34.30 format.
With 33 guard bits in the accumulator, there is no risk of overflow, and the
full precision of the intermediate multiplication is preserved.
Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
15 bits, and then saturated to yield a result in 1.15 format.
*/
#if defined(ARM_MATH_MVEI)
void arm_rms_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult)
{
q63_t pow = 0.0f;
q15_t normalizedPower;
arm_power_q15(pSrc, blockSize, &pow);
normalizedPower=__SSAT((pow / (q63_t) blockSize) >> 15,16);
arm_sqrt_q15(normalizedPower, pResult);
}
#else
void arm_rms_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary result storage */
q15_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t in32; /* Temporary variable to store input value */
#endif
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute sum of squares and store result in a temporary variable. */
#if defined (ARM_MATH_DSP)
in32 = read_q15x2_ia ((q15_t **) &pSrc);
sum = __SMLALD(in32, in32, sum);
in32 = read_q15x2_ia ((q15_t **) &pSrc);
sum = __SMLALD(in32, in32, sum);
#else
in = *pSrc++;
sum += ((q31_t) in * in);
in = *pSrc++;
sum += ((q31_t) in * in);
in = *pSrc++;
sum += ((q31_t) in * in);
in = *pSrc++;
sum += ((q31_t) in * in);
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
in = *pSrc++;
/* Compute sum of squares and store result in a temporary variable. */
sum += ((q31_t) in * in);
/* Decrement loop counter */
blkCnt--;
}
/* Truncating and saturating the accumulator to 1.15 format */
/* Store result in destination */
arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of RMS group
*/

View file

@ -0,0 +1,141 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_rms_q31.c
* Description: Root Mean Square of the elements of a Q31 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup RMS
@{
*/
/**
@brief Root Mean Square of the elements of a Q31 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult root mean square value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The input is represented in 1.31 format, and intermediate multiplication
yields a 2.62 format.
The accumulator maintains full precision of the intermediate multiplication results,
but provides only a single guard bit.
There is no saturation on intermediate additions.
If the accumulator overflows, it wraps around and distorts the result.
In order to avoid overflows completely, the input signal must be scaled down by
log2(blockSize) bits, as a total of blockSize additions are performed internally.
Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value.
*/
#if defined(ARM_MATH_MVEI)
void arm_rms_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
{
q63_t pow = 0.0f;
q31_t normalizedPower;
arm_power_q31(pSrc, blockSize, &pow);
normalizedPower=clip_q63_to_q31((pow / (q63_t) blockSize) >> 17);
arm_sqrt_q31(normalizedPower, pResult);
}
#else
void arm_rms_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
uint64_t sum = 0; /* Temporary result storage (can get never negative. changed type from q63 to uint64 */
q31_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
in = *pSrc++;
/* Compute sum of squares and store result in a temporary variable, sum. */
sum += ((q63_t) in * in);
in = *pSrc++;
sum += ((q63_t) in * in);
in = *pSrc++;
sum += ((q63_t) in * in);
in = *pSrc++;
sum += ((q63_t) in * in);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
in = *pSrc++;
/* Compute sum of squares and store result in a temporary variable. */
sum += ((q63_t) in * in);
/* Decrement loop counter */
blkCnt--;
}
/* Convert data in 2.62 to 1.31 by 31 right shifts and saturate */
/* Compute Rms and store result in destination vector */
arm_sqrt_q31(clip_q63_to_q31((sum / (q63_t) blockSize) >> 31), pResult);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of RMS group
*/

View file

@ -0,0 +1,83 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_std_f32.c
* Description: Standard deviation of the elements of a floating-point vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@defgroup STD Standard deviation
Calculates the standard deviation of the elements in the input vector.
The float implementation is relying on arm_var_f32 which is using a two-pass algorithm
to avoid problem of numerical instabilities and cancellation errors.
Fixed point versions are using the standard textbook algorithm since the fixed point
numerical behavior is different from the float one.
Algorithm for fixed point versions is summarized below:
<pre>
Result = sqrt((sumOfSquares - sum<sup>2</sup> / blockSize) / (blockSize - 1))
sumOfSquares = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]
sum = pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]
</pre>
There are separate functions for floating point, Q31, and Q15 data types.
*/
/**
@addtogroup STD
@{
*/
/**
@brief Standard deviation of the elements of a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult standard deviation value returned here
@return none
*/
void arm_std_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
float32_t var;
arm_var_f32(pSrc,blockSize,&var);
arm_sqrt_f32(var, pResult);
}
/**
@} end of STD group
*/

View file

@ -0,0 +1,173 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_std_q15.c
* Description: Standard deviation of an array of Q15 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup STD
@{
*/
/**
@brief Standard deviation of the elements of a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult standard deviation value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator.
The input is represented in 1.15 format.
Intermediate multiplication yields a 2.30 format, and this
result is added without saturation to a 64-bit accumulator in 34.30 format.
With 33 guard bits in the accumulator, there is no risk of overflow, and the
full precision of the intermediate multiplication is preserved.
Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
15 bits, and then saturated to yield a result in 1.15 format.
*/
#if defined(ARM_MATH_MVEI)
void arm_std_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult)
{
q15_t var=0;
arm_var_q15(pSrc, blockSize, &var);
arm_sqrt_q15(var,pResult);
}
#else
void arm_std_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
q31_t sum = 0; /* Accumulator */
q31_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
q63_t sumOfSquares = 0; /* Sum of squares */
q15_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t in32; /* Temporary variable to store input value */
#endif
if (blockSize <= 1U)
{
*pResult = 0;
return;
}
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
/* Compute sum and store result in a temporary variable, sum. */
#if defined (ARM_MATH_DSP)
in32 = read_q15x2_ia ((q15_t **) &pSrc);
sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
sum += ((in32 << 16U) >> 16U);
sum += (in32 >> 16U);
in32 = read_q15x2_ia ((q15_t **) &pSrc);
sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
sum += ((in32 << 16U) >> 16U);
sum += (in32 >> 16U);
#else
in = *pSrc++;
sumOfSquares += (in * in);
sum += in;
in = *pSrc++;
sumOfSquares += (in * in);
sum += in;
in = *pSrc++;
sumOfSquares += (in * in);
sum += in;
in = *pSrc++;
sumOfSquares += (in * in);
sum += in;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
in = *pSrc++;
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += (in * in);
/* Compute sum and store result in a temporary variable, sum. */
sum += in;
/* Decrement loop counter */
blkCnt--;
}
/* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
squareOfMean = (q31_t) ((q63_t) sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
/* mean of squares minus the square of mean. */
/* Compute standard deviation and store result in destination */
arm_sqrt_q15(__SSAT((meanOfSquares - squareOfMean) >> 15U, 16U), pResult);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of STD group
*/

View file

@ -0,0 +1,159 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_std_q31.c
* Description: Standard deviation of the elements of a Q31 vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup STD
@{
*/
/**
@brief Standard deviation of the elements of a Q31 vector.
@param[in] pSrc points to the input vector.
@param[in] blockSize number of samples in input vector.
@param[out] pResult standard deviation value returned here.
@return none
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The input is represented in 1.31 format, which is then downshifted by 8 bits
which yields 1.23, and intermediate multiplication yields a 2.46 format.
The accumulator maintains full precision of the intermediate multiplication results,
but provides only a 16 guard bits.
There is no saturation on intermediate additions.
If the accumulator overflows it wraps around and distorts the result.
In order to avoid overflows completely the input signal must be scaled down by
log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
After division, internal variables should be Q18.46
Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
*/
#if defined(ARM_MATH_MVEI)
void arm_std_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
{
q31_t var=0;
arm_var_q31(pSrc, blockSize, &var);
arm_sqrt_q31(var, pResult);
}
#else
void arm_std_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Accumulator */
q63_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
q63_t sumOfSquares = 0; /* Sum of squares */
q31_t in; /* Temporary variable to store input value */
if (blockSize <= 1U)
{
*pResult = 0;
return;
}
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
in = *pSrc++ >> 8U;
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += ((q63_t) (in) * (in));
/* Compute sum and store result in a temporary variable, sum. */
sum += in;
in = *pSrc++ >> 8U;
sumOfSquares += ((q63_t) (in) * (in));
sum += in;
in = *pSrc++ >> 8U;
sumOfSquares += ((q63_t) (in) * (in));
sum += in;
in = *pSrc++ >> 8U;
sumOfSquares += ((q63_t) (in) * (in));
sum += in;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
in = *pSrc++ >> 8U;
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += ((q63_t) (in) * (in));
/* Compute sum and store result in a temporary variable, sum. */
sum += in;
/* Decrement loop counter */
blkCnt--;
}
/* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
meanOfSquares = (sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
squareOfMean = ( sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
/* Compute standard deviation and store result in destination */
arm_sqrt_q31((meanOfSquares - squareOfMean) >> 15U, pResult);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of STD group
*/

View file

@ -0,0 +1,293 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_var_f32.c
* Description: Variance of the elements of a floating-point vector
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@defgroup variance Variance
Calculates the variance of the elements in the input vector.
The underlying algorithm used is the direct method sometimes referred to as the two-pass method:
<pre>
Result = sum(element - meanOfElements)^2) / numElement - 1
meanOfElements = ( pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] ) / blockSize
</pre>
There are separate functions for floating point, Q31, and Q15 data types.
*/
/**
@addtogroup variance
@{
*/
/**
@brief Variance of the elements of a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult variance value returned here
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_var_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
uint32_t blkCnt; /* loop counters */
f32x4_t vecSrc;
f32x4_t sumVec = vdupq_n_f32(0.0f);
float32_t fMean;
float32_t sum = 0.0f; /* accumulator */
float32_t in; /* Temporary variable to store input value */
if (blockSize <= 1U) {
*pResult = 0;
return;
}
arm_mean_f32(pSrc, blockSize, &fMean);
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecSrc = vldrwq_f32(pSrc);
/*
* sum lanes
*/
vecSrc = vsubq(vecSrc, fMean);
sumVec = vfmaq(sumVec, vecSrc, vecSrc);
blkCnt --;
pSrc += 4;
}
sum = vecAddAcrossF32Mve(sumVec);
/*
* tail
*/
blkCnt = blockSize & 0x3;
while (blkCnt > 0U)
{
in = *pSrc++ - fMean;
sum += in * in;
/* Decrement loop counter */
blkCnt--;
}
/* Variance */
*pResult = sum / (float32_t) (blockSize - 1);
}
#else
#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_var_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
float32_t mean;
float32_t sum = 0.0f; /* accumulator */
float32_t in; /* Temporary variable to store input value */
uint32_t blkCnt; /* loop counter */
float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
float32x2_t sumV2;
float32x4_t inV;
float32x4_t avg;
arm_mean_f32(pSrc,blockSize,&mean);
avg = vdupq_n_f32(mean);
blkCnt = blockSize >> 2U;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and then store the result in a temporary variable, sum. */
inV = vld1q_f32(pSrc);
inV = vsubq_f32(inV, avg);
sumV = vmlaq_f32(sumV, inV, inV);
pSrc += 4;
/* Decrement the loop counter */
blkCnt--;
}
sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
/* compute power and then store the result in a temporary variable, sum. */
in = *pSrc++;
in = in - mean;
sum += in * in;
/* Decrement the loop counter */
blkCnt--;
}
/* Variance */
*pResult = sum / (float32_t)(blockSize - 1.0f);
}
#else
void arm_var_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
float32_t sum = 0.0f; /* Temporary result storage */
float32_t fSum = 0.0f;
float32_t fMean, fValue;
const float32_t * pInput = pSrc;
if (blockSize <= 1U)
{
*pResult = 0;
return;
}
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pInput++;
sum += *pInput++;
sum += *pInput++;
sum += *pInput++;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pInput++;
/* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
fMean = sum / (float32_t) blockSize;
pInput = pSrc;
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
fValue = *pInput++ - fMean;
fSum += fValue * fValue;
fValue = *pInput++ - fMean;
fSum += fValue * fValue;
fValue = *pInput++ - fMean;
fSum += fValue * fValue;
fValue = *pInput++ - fMean;
fSum += fValue * fValue;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
fValue = *pInput++ - fMean;
fSum += fValue * fValue;
/* Decrement loop counter */
blkCnt--;
}
/* Variance */
*pResult = fSum / (float32_t)(blockSize - 1.0f);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of variance group
*/

View file

@ -0,0 +1,230 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_var_q15.c
* Description: Variance of an array of Q15 type
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup variance
@{
*/
/**
@brief Variance of the elements of a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult variance value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using a 64-bit internal accumulator.
The input is represented in 1.15 format.
Intermediate multiplication yields a 2.30 format, and this
result is added without saturation to a 64-bit accumulator in 34.30 format.
With 33 guard bits in the accumulator, there is no risk of overflow, and the
full precision of the intermediate multiplication is preserved.
Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
15 bits, and then saturated to yield a result in 1.15 format.
*/
#if defined(ARM_MATH_MVEI)
void arm_var_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
q63_t sumOfSquares = 0LL;
q63_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
q63_t sum = 0LL;
q15_t in;
if (blockSize <= 1U) {
*pResult = 0;
return;
}
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrc = vldrhq_s16(pSrc);
/* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
/* Compute Sum of squares of the input samples
* and then store the result in a temporary variable, sumOfSquares. */
sumOfSquares = vmlaldavaq(sumOfSquares, vecSrc, vecSrc);
sum = vaddvaq(sum, vecSrc);
blkCnt --;
pSrc += 8;
}
/* Tail */
blkCnt = blockSize & 7;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
in = *pSrc++;
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
#if defined (ARM_MATH_DSP)
sumOfSquares = __SMLALD(in, in, sumOfSquares);
#else
sumOfSquares += (in * in);
#endif /* #if defined (ARM_MATH_DSP) */
/* Compute sum and store result in a temporary variable, sum. */
sum += in;
/* Decrement loop counter */
blkCnt--;
}
/* Compute Mean of squares of the input samples
* and then store the result in a temporary variable, meanOfSquares. */
meanOfSquares = arm_div_q63_to_q31(sumOfSquares, (blockSize - 1U));
/* Compute square of mean */
squareOfMean = arm_div_q63_to_q31((q63_t)sum * sum, (q31_t)(blockSize * (blockSize - 1U)));
/* mean of the squares minus the square of the mean. */
*pResult = (meanOfSquares - squareOfMean) >> 15;
}
#else
void arm_var_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
q31_t sum = 0; /* Accumulator */
q31_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
q63_t sumOfSquares = 0; /* Sum of squares */
q15_t in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t in32; /* Temporary variable to store input value */
#endif
if (blockSize <= 1U)
{
*pResult = 0;
return;
}
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
/* Compute sum and store result in a temporary variable, sum. */
#if defined (ARM_MATH_DSP)
in32 = read_q15x2_ia ((q15_t **) &pSrc);
sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
sum += ((in32 << 16U) >> 16U);
sum += (in32 >> 16U);
in32 = read_q15x2_ia ((q15_t **) &pSrc);
sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
sum += ((in32 << 16U) >> 16U);
sum += (in32 >> 16U);
#else
in = *pSrc++;
sumOfSquares += (in * in);
sum += in;
in = *pSrc++;
sumOfSquares += (in * in);
sum += in;
in = *pSrc++;
sumOfSquares += (in * in);
sum += in;
in = *pSrc++;
sumOfSquares += (in * in);
sum += in;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
in = *pSrc++;
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
#if defined (ARM_MATH_DSP)
sumOfSquares = __SMLALD(in, in, sumOfSquares);
#else
sumOfSquares += (in * in);
#endif /* #if defined (ARM_MATH_DSP) */
/* Compute sum and store result in a temporary variable, sum. */
sum += in;
/* Decrement loop counter */
blkCnt--;
}
/* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
squareOfMean = (q31_t) ((q63_t) sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
/* mean of squares minus the square of mean. */
*pResult = (meanOfSquares - squareOfMean) >> 15U;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of variance group
*/

View file

@ -0,0 +1,214 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_var_q31.c
* Description: Variance of an array of Q31 type
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupStats
*/
/**
@addtogroup variance
@{
*/
/**
@brief Variance of the elements of a Q31 vector.
@param[in] pSrc points to the input vector
@param[in] blockSize number of samples in input vector
@param[out] pResult variance value returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The input is represented in 1.31 format, which is then downshifted by 8 bits
which yields 1.23, and intermediate multiplication yields a 2.46 format.
The accumulator maintains full precision of the intermediate multiplication results,
and as a consequence has only 16 guard bits.
There is no saturation on intermediate additions.
If the accumulator overflows it wraps around and distorts the result.
In order to avoid overflows completely the input signal must be scaled down by
log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
After division, internal variables should be Q18.46
Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
*/
#if defined(ARM_MATH_MVEI)
void arm_var_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
{
uint32_t blkCnt; /* loop counters */
q31x4_t vecSrc;
q63_t sumOfSquares = 0LL;
q63_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
q63_t sum = 0LL;
q31_t in;
if (blockSize <= 1U) {
*pResult = 0;
return;
}
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecSrc = vldrwq_s32(pSrc);
/* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
/* Compute Sum of squares of the input samples
* and then store the result in a temporary variable, sumOfSquares. */
/* downscale */
vecSrc = vshrq(vecSrc, 8);
sumOfSquares = vmlaldavaq(sumOfSquares, vecSrc, vecSrc);
sum = vaddlvaq(sum, vecSrc);
blkCnt --;
pSrc += 4;
}
/*
* tail
*/
blkCnt = blockSize & 0x3;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
in = *pSrc++ >> 8U;
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += ((q63_t) (in) * (in));
/* Compute sum and store result in a temporary variable, sum. */
sum += in;
/* Decrement loop counter */
blkCnt--;
}
/* Compute Mean of squares of the input samples
* and then store the result in a temporary variable, meanOfSquares. */
meanOfSquares = sumOfSquares / (q63_t) (blockSize - 1U);
/* Compute square of mean */
squareOfMean = sum * sum / (q63_t) (blockSize * (blockSize - 1U));
/* Compute standard deviation and then store the result to the destination */
*pResult = asrl(meanOfSquares - squareOfMean, 15U);
}
#else
void arm_var_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary result storage */
q63_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
q63_t sumOfSquares = 0; /* Sum of squares */
q31_t in; /* Temporary variable to store input value */
if (blockSize <= 1U)
{
*pResult = 0;
return;
}
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
in = *pSrc++ >> 8U;
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += ((q63_t) (in) * (in));
/* Compute sum and store result in a temporary variable, sum. */
sum += in;
in = *pSrc++ >> 8U;
sumOfSquares += ((q63_t) (in) * (in));
sum += in;
in = *pSrc++ >> 8U;
sumOfSquares += ((q63_t) (in) * (in));
sum += in;
in = *pSrc++ >> 8U;
sumOfSquares += ((q63_t) (in) * (in));
sum += in;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* C = A[0] + A[1] + ... + A[blockSize-1] */
in = *pSrc++ >> 8U;
/* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
sumOfSquares += ((q63_t) (in) * (in));
/* Compute sum and store result in a temporary variable, sum. */
sum += in;
/* Decrement loop counter */
blkCnt--;
}
/* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
meanOfSquares = (sumOfSquares / (q63_t)(blockSize - 1U));
/* Compute square of mean */
squareOfMean = ( sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
/* Compute variance and store result in destination */
*pResult = (meanOfSquares - squareOfMean) >> 15U;
}
#endif
/**
@} end of variance group
*/