1
0
Fork 0
mirror of https://github.com/betaflight/betaflight.git synced 2025-07-19 14:25:20 +03:00

AT32F435/7 Libraries (#12158) (#12263)

Source: https://github.com/ArteryTek/AT32F435_437_Firmware_Library
Version: 2.1.1
This commit is contained in:
J Blackman 2023-01-31 08:05:32 +11:00 committed by GitHub
parent 5e16ddb01b
commit 8900a831e5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
559 changed files with 289319 additions and 0 deletions

View file

@ -0,0 +1,19 @@
cmake_minimum_required (VERSION 3.6)
project(CMSISDSPMatrix)
include(configLib)
include(configDsp)
file(GLOB SRC "./*_*.c")
add_library(CMSISDSPMatrix STATIC ${SRC})
configLib(CMSISDSPMatrix ${ROOT})
configDsp(CMSISDSPMatrix ${ROOT})
### Includes
target_include_directories(CMSISDSPMatrix PUBLIC "${DSP}/Include")

View file

@ -0,0 +1,53 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: MatrixFunctions.c
* Description: Combination of all matrix function source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_mat_add_f32.c"
#include "arm_mat_add_q15.c"
#include "arm_mat_add_q31.c"
#include "arm_mat_cmplx_mult_f32.c"
#include "arm_mat_cmplx_mult_q15.c"
#include "arm_mat_cmplx_mult_q31.c"
#include "arm_mat_init_f32.c"
#include "arm_mat_init_q15.c"
#include "arm_mat_init_q31.c"
#include "arm_mat_inverse_f32.c"
#include "arm_mat_inverse_f64.c"
#include "arm_mat_mult_f32.c"
#include "arm_mat_mult_fast_q15.c"
#include "arm_mat_mult_fast_q31.c"
#include "arm_mat_mult_q15.c"
#include "arm_mat_mult_q31.c"
#include "arm_mat_scale_f32.c"
#include "arm_mat_scale_q15.c"
#include "arm_mat_scale_q31.c"
#include "arm_mat_sub_f32.c"
#include "arm_mat_sub_q15.c"
#include "arm_mat_sub_q31.c"
#include "arm_mat_trans_f32.c"
#include "arm_mat_trans_q15.c"
#include "arm_mat_trans_q31.c"

View file

@ -0,0 +1,304 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_add_f32.c
* Description: Floating-point matrix addition
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixAdd Matrix Addition
Adds two matrices.
\image html MatrixAddition.gif "Addition of two 3 x 3 matrices"
The functions check to make sure that
<code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
number of rows and columns.
*/
/**
@addtogroup MatrixAdd
@{
*/
/**
@brief Floating-point matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
arm_status status;
uint32_t numSamples; /* total number of elements in the matrix */
float32_t *pDataA, *pDataB, *pDataDst;
f32x4_t vecA, vecB, vecDst;
float32_t const *pSrcAVec;
float32_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (float32_t const *) pDataA;
pSrcBVec = (float32_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
vecDst = vaddq(vecA, vecB);
vst1q(pDataDst, vecDst);
pDataDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
vecDst = vaddq_m(vecDst, vecA, vecB, p0);
vstrwq_p(pDataDst, vecDst, p0);
}
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
return (status);
}
#else
#if defined(ARM_MATH_NEON)
/*
Neon version is assuming the matrix is small enough.
So no blocking is used for taking into account cache effects.
For big matrix, there exist better libraries for Neon.
*/
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
{
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2U;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q_f32(pIn1);
vec2 = vld1q_f32(pIn2);
res = vaddq_f32(vec1, vec2);
vst1q_f32(pOut, res);
/* update pointers to process next samples */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
*pOut++ = (*pIn1++) + (*pIn2++);
/* Decrement the loop counter */
blkCnt--;
}
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and store result in destination buffer. */
*pOut++ = *pInA++ + *pInB++;
*pOut++ = *pInA++ + *pInB++;
*pOut++ = *pInA++ + *pInB++;
*pOut++ = *pInA++ + *pInB++;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and store result in destination buffer. */
*pOut++ = *pInA++ + *pInB++;
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixAdd group
*/

View file

@ -0,0 +1,227 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_add_q15.c
* Description: Q15 matrix addition
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixAdd
@{
*/
/**
@brief Q15 matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI)
arm_status arm_mat_add_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
uint32_t numSamples; /* total number of elements in the matrix */
q15_t *pDataA, *pDataB, *pDataDst;
q15x8_t vecA, vecB, vecDst;
q15_t const *pSrcAVec;
q15_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (q15_t const *) pDataA;
pSrcBVec = (q15_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec); pSrcAVec += 8;
vecB = vld1q(pSrcBVec); pSrcBVec += 8;
vecDst = vqaddq(vecA, vecB);
vst1q(pDataDst, vecDst); pDataDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numSamples & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcAVec); pSrcAVec += 8;
vecB = vld1q(pSrcBVec); pSrcBVec += 8;
vecDst = vqaddq_m(vecDst, vecA, vecB, p0);
vstrhq_p(pDataDst, vecDst, p0);
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_add_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
write_q15x2_ia (&pOut, __QADD16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
write_q15x2_ia (&pOut, __QADD16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
#else
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pOut++ = (q15_t) __QADD16(*pInA++, *pInB++);
#else
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixAdd group
*/

View file

@ -0,0 +1,216 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_add_q31.c
* Description: Q31 matrix addition
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixAdd
@{
*/
/**
@brief Q31 matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
#if defined(ARM_MATH_MVEI)
arm_status arm_mat_add_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
arm_status status; /* status of matrix addition */
uint32_t numSamples; /* total number of elements in the matrix */
q31_t *pDataA, *pDataB, *pDataDst;
q31x4_t vecA, vecB, vecDst;
q31_t const *pSrcAVec;
q31_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (q31_t const *) pDataA;
pSrcBVec = (q31_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
vecDst = vqaddq(vecA, vecB);
vst1q(pDataDst, vecDst);
pDataDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
vecDst = vqaddq_m(vecDst, vecA, vecB, p0);
vstrwq_p(pDataDst, vecDst, p0);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_add_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
*pOut++ = __QADD(*pInA++, *pInB++);
*pOut++ = __QADD(*pInA++, *pInB++);
*pOut++ = __QADD(*pInA++, *pInB++);
*pOut++ = __QADD(*pInA++, *pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
*pOut++ = __QADD(*pInA++, *pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixAdd group
*/

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,595 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mat_mult_q15.c
* Description: Q15 complex matrix multiplication
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup CmplxMatrixMult
@{
*/
/**
@brief Q15 Complex matrix multiplication.
@param[in] pSrcA points to first input complex matrix structure
@param[in] pSrcB points to second input complex matrix structure
@param[out] pDst points to output complex matrix structure
@param[in] pScratch points to an array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Conditions for optimum performance
Input, output and state buffers should be aligned by 32-bit
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator. The inputs to the
multiplications are in 1.15 format and multiplications yield a 2.30 result.
The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
This approach provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
*/
#if defined(ARM_MATH_MVEI)
#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
arm_status arm_mat_cmplx_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pScratch)
{
q15_t const *pInA = (q15_t const *) pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t const *pInB = (q15_t const *) pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t const *pInB2;
q15_t *px; /* Temporary output data matrix pointer */
uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
uint32_t blkCnt; /* loop counters */
uint16x8_t vecOffs, vecColBOffs;
arm_status status; /* Status of matrix multiplication */
(void)pScratch;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
vecColBOffs[0] = 0;
vecColBOffs[1] = 1;
vecColBOffs[2] = numColsB * CMPLX_DIM;
vecColBOffs[3] = (numColsB * CMPLX_DIM) + 1;
vecColBOffs[4] = 2 * numColsB * CMPLX_DIM;
vecColBOffs[5] = 2 * (numColsB * CMPLX_DIM) + 1;
vecColBOffs[6] = 3 * numColsB * CMPLX_DIM;
vecColBOffs[7] = 3 * (numColsB * CMPLX_DIM) + 1;
/*
* Reset the variables for the usage in the following multiplication process
*/
i = 0;
row = numRowsA;
px = pDst->pData;
/*
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
*/
/*
* row loop
*/
while (row > 0u)
{
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB >> 1;
j = 0;
/*
* column loop
*/
while (col > 0u)
{
q15_t const *pSrcAVec;
//, *pSrcBVec, *pSrcB2Vec;
q15x8_t vecA, vecB, vecB2;
q63_t acc0, acc1, acc2, acc3;
/*
* Initiate the pointer pIn1 to point to the starting address of the column being processed
*/
pInA = pSrcA->pData + i;
pInB = pSrcB->pData + j;
pInB2 = pInB + CMPLX_DIM;
j += 2 * CMPLX_DIM;
/*
* Decrement the column loop counter
*/
col--;
/*
* Initiate the pointers
* - current Matrix A rows
* - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
*/
pSrcAVec = (q15_t const *) pInA;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
vecOffs = vecColBOffs;
blkCnt = (numColsA * CMPLX_DIM) >> 3;
while (blkCnt > 0U)
{
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
acc0 = vmlsldavaq(acc0, vecA, vecB);
acc1 = vmlaldavaxq(acc1, vecA, vecB);
vecB2 = vldrhq_gather_shifted_offset(pInB2, vecOffs);
/*
* move Matrix B read offsets, 4 rows down
*/
vecOffs = vaddq(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
acc2 = vmlsldavaq(acc2, vecA, vecB2);
acc3 = vmlaldavaxq(acc3, vecA, vecB2);
blkCnt--;
}
/*
* tail
*/
blkCnt = (numColsA * CMPLX_DIM) & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
vecA = vldrhq_z_s16(pSrcAVec, p0);
acc0 = vmlsldavaq(acc0, vecA, vecB);
acc1 = vmlaldavaxq(acc1, vecA, vecB);
vecB2 = vldrhq_gather_shifted_offset(pInB2, vecOffs);
/*
* move Matrix B read offsets, 4 rows down
*/
vecOffs = vaddq(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
acc2 = vmlsldavaq(acc2, vecA, vecB2);
acc3 = vmlaldavaxq(acc3, vecA, vecB2);
}
/*
* Convert to 1.15, Store the results (1 x 2 block) in the destination buffer
*/
*px++ = (q15_t)MVE_ASRL_SAT16(acc0, 15);
*px++ = (q15_t)MVE_ASRL_SAT16(acc1, 15);
*px++ = (q15_t)MVE_ASRL_SAT16(acc2, 15);
*px++ = (q15_t)MVE_ASRL_SAT16(acc3, 15);
}
col = numColsB & 1;
/*
* column loop
*/
while (col > 0u)
{
q15_t const *pSrcAVec;
//, *pSrcBVec, *pSrcB2Vec;
q15x8_t vecA, vecB;
q63_t acc0, acc1;
/*
* Initiate the pointer pIn1 to point to the starting address of the column being processed
*/
pInA = pSrcA->pData + i;
pInB = pSrcB->pData + j;
j += CMPLX_DIM;
/*
* Decrement the column loop counter
*/
col--;
/*
* Initiate the pointers
* - current Matrix A rows
* - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
*/
pSrcAVec = (q15_t const *) pInA;
acc0 = 0LL;
acc1 = 0LL;
vecOffs = vecColBOffs;
blkCnt = (numColsA * CMPLX_DIM) >> 3;
while (blkCnt > 0U)
{
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
acc0 = vmlsldavaq(acc0, vecA, vecB);
acc1 = vmlaldavaxq(acc1, vecA, vecB);
/*
* move Matrix B read offsets, 4 rows down
*/
vecOffs = vaddq(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
blkCnt--;
}
/*
* tail
*/
blkCnt = (numColsA * CMPLX_DIM) & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
vecA = vldrhq_z_s16(pSrcAVec, p0);
acc0 = vmlsldavaq(acc0, vecA, vecB);
acc1 = vmlaldavaxq(acc1, vecA, vecB);
}
/*
* Convert to 1.15, Store the results (1 x 2 block) in the destination buffer
*/
*px++ = (q15_t)MVE_ASRL_SAT16(acc0, 15);
*px++ = (q15_t)MVE_ASRL_SAT16(acc1, 15);
}
i = i + numColsA * CMPLX_DIM;
/*
* Decrement the row loop counter
*/
row--;
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_cmplx_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pScratch)
{
q15_t *pSrcBT = pScratch; /* input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
q63_t sumReal, sumImag; /* accumulator */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#if defined (ARM_MATH_DSP)
q31_t prod1, prod2;
q31_t pSourceA, pSourceB;
#else
q15_t a, b, c, d;
#endif /* #if defined (ARM_MATH_DSP) */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and exchange the columns with row elements */
col = numColsB >> 2;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Decrement column loop counter */
col--;
}
/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
col = numColsB % 0x4U;
#else
/* Initialize blkCnt with number of samples */
col = numColsB;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Decrement column loop counter */
col--;
}
i = i + 2U;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
/* column loop */
do
{
/* Set variable sum, that acts as accumulator, to zero */
sumReal = 0;
sumImag = 0;
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i * 2;
/* Apply loop unrolling and compute 2 MACs simultaneously. */
colCnt = numColsA >> 1U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia ((q15_t **) &pInA);
pSourceB = read_q15x2_ia ((q15_t **) &pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia ((q15_t **) &pInA);
pSourceB = read_q15x2_ia ((q15_t **) &pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#else /* #if defined (ARM_MATH_DSP) */
/* read real and imag values from pSrcA buffer */
a = *pInA;
b = *(pInA + 1U);
/* read real and imag values from pSrcB buffer */
c = *pInB;
d = *(pInB + 1U);
/* Multiply and Accumlates */
sumReal += (q31_t) a *c;
sumImag += (q31_t) a *d;
sumReal -= (q31_t) b *d;
sumImag += (q31_t) b *c;
/* read next real and imag values from pSrcA buffer */
a = *(pInA + 2U);
b = *(pInA + 3U);
/* read next real and imag values from pSrcB buffer */
c = *(pInB + 2U);
d = *(pInB + 3U);
/* update pointer */
pInA += 4U;
/* Multiply and Accumlates */
sumReal += (q31_t) a * c;
sumImag += (q31_t) a * d;
sumReal -= (q31_t) b * d;
sumImag += (q31_t) b * c;
/* update pointer */
pInB += 4U;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
colCnt--;
}
/* process odd column samples */
if ((numColsA & 0x1U) > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia ((q15_t **) &pInA);
pSourceB = read_q15x2_ia ((q15_t **) &pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#else /* #if defined (ARM_MATH_DSP) */
/* read real and imag values from pSrcA and pSrcB buffer */
a = *pInA++;
b = *pInA++;
c = *pInB++;
d = *pInB++;
/* Multiply and Accumlates */
sumReal += (q31_t) a * c;
sumImag += (q31_t) a * d;
sumReal -= (q31_t) b * d;
sumImag += (q31_t) b * c;
#endif /* #if defined (ARM_MATH_DSP) */
}
/* Saturate and store result in destination buffer */
*px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
*px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
/* Decrement column loop counter */
col--;
} while (col > 0U);
i = i + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixMult group
*/

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,76 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_init_f32.c
* Description: Floating-point matrix initialization
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixInit Matrix Initialization
Initializes the underlying matrix data structure.
The functions set the <code>numRows</code>,
<code>numCols</code>, and <code>pData</code> fields
of the matrix data structure.
*/
/**
@addtogroup MatrixInit
@{
*/
/**
@brief Floating-point matrix initialization.
@param[in,out] S points to an instance of the floating-point matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_f32(
arm_matrix_instance_f32 * S,
uint16_t nRows,
uint16_t nColumns,
float32_t * pData)
{
/* Assign Number of Rows */
S->numRows = nRows;
/* Assign Number of Columns */
S->numCols = nColumns;
/* Assign Data pointer */
S->pData = pData;
}
/**
@} end of MatrixInit group
*/

View file

@ -0,0 +1,67 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_init_q15.c
* Description: Q15 matrix initialization
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInit
@{
*/
/**
@brief Q15 matrix initialization.
@param[in,out] S points to an instance of the floating-point matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_q15(
arm_matrix_instance_q15 * S,
uint16_t nRows,
uint16_t nColumns,
q15_t * pData)
{
/* Assign Number of Rows */
S->numRows = nRows;
/* Assign Number of Columns */
S->numCols = nColumns;
/* Assign Data pointer */
S->pData = pData;
}
/**
@} end of MatrixInit group
*/

View file

@ -0,0 +1,72 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_init_q31.c
* Description: Q31 matrix initialization
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixInit Matrix Initialization
*/
/**
@addtogroup MatrixInit
@{
*/
/**
@brief Q31 matrix initialization.
@param[in,out] S points to an instance of the Q31 matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_q31(
arm_matrix_instance_q31 * S,
uint16_t nRows,
uint16_t nColumns,
q31_t * pData)
{
/* Assign Number of Rows */
S->numRows = nRows;
/* Assign Number of Columns */
S->numCols = nColumns;
/* Assign Data pointer */
S->pData = pData;
}
/**
@} end of MatrixInit group
*/

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,652 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_inverse_f64.c
* Description: Floating-point matrix inverse
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
@brief Floating-point (64 bit) matrix inverse.
@param[in] pSrc points to input matrix structure. The source matrix is modified by the function.
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
*/
arm_status arm_mat_inverse_f64(
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
{
float64_t *pIn = pSrc->pData; /* input data matrix pointer */
float64_t *pOut = pDst->pData; /* output data matrix pointer */
float64_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
float64_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
float64_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
#if defined (ARM_MATH_DSP)
float64_t Xchg, in = 0.0, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 |
* | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
/* Loop over the number of rows */
rowCnt = numRows;
/* Making the destination matrix as identity matrix */
while (rowCnt > 0U)
{
/* Writing all zeroes in lower triangle of the destination matrix */
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0;
j--;
}
/* Decrement loop counter */
rowCnt--;
}
/* Loop over the number of columns of the input matrix.
All the elements in each column are processed by the row operations */
loopCnt = numCols;
/* Index modifier to navigate through the columns */
l = 0U;
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
* If it is zero then interchange the row with non zero row below.
* If there is no non zero element to replace in the rows below,
* then the matrix is Singular. */
/* Working pointer for the input matrix that points
* to the pivot element of the particular row */
pInT1 = pIn + (l * numCols);
/* Working pointer for the destination matrix that points
* to the pivot element of the particular row */
pOutT1 = pOut + (l * numCols);
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Destination pointer modifier */
k = 1U;
/* Check if the pivot element is zero */
if (*pInT1 == 0.0)
{
/* Loop over the number rows present below */
for (i = (l + 1U); i < numRows; i++)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * k);
/* Check if there is a non zero pivot element to
* replace in the rows below */
if (*pInT2 != 0.0)
{
/* Loop over number of columns
* to the right of the pilot element */
j = numCols - l;
while (j > 0U)
{
/* Exchange the row elements of the input matrix */
Xchg = *pInT2;
*pInT2++ = *pInT1;
*pInT1++ = Xchg;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols;
while (j > 0U)
{
/* Exchange the row elements of the destination matrix */
Xchg = *pOutT2;
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
/* Decrement loop counter */
j--;
}
/* Flag to indicate whether exchange is done or not */
flag = 1U;
/* Break after exchange is done */
break;
}
/* Update the destination pointer modifier */
k++;
/* Decrement loop counter */
i--;
}
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0))
{
return ARM_MATH_SINGULAR;
}
/* Points to the pivot row of input and destination matrices */
pPivotRowIn = pIn + (l * numCols);
pPivotRowDst = pOut + (l * numCols);
/* Temporary pointers to the pivot row pointers */
pInT1 = pPivotRowIn;
pInT2 = pPivotRowDst;
/* Pivot element of the row */
in = *pPivotRowIn;
/* Loop over number of columns
* to the right of the pilot element */
j = (numCols - l);
while (j > 0U)
{
/* Divide each element of the row of the input matrix
* by the pivot element */
in1 = *pInT1;
*pInT1++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols;
while (j > 0U)
{
/* Divide each element of the row of the destination matrix
* by the pivot element */
in1 = *pInT2;
*pInT2++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Replace the rows with the sum of that row and a multiple of row i
* so that each new element in column i above row i is zero.*/
/* Temporary pointers for input and destination matrices */
pInT1 = pIn;
pInT2 = pOut;
/* index used to check for pivot element */
i = 0U;
/* Loop over number of rows */
/* to be replaced by the sum of that row and a multiple of row i */
k = numRows;
while (k > 0U)
{
/* Check for the pivot element */
if (i == l)
{
/* If the processing element is the pivot element,
only the columns to the right are to be processed */
pInT1 += numCols - l;
pInT2 += numCols;
}
else
{
/* Element of the reference row */
in = *pInT1;
/* Working pointers for input and destination pivot rows */
pPRT_in = pPivotRowIn;
pPRT_pDst = pPivotRowDst;
/* Loop over the number of columns to the right of the pivot element,
to replace the elements in the input matrix */
j = (numCols - l);
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
in1 = *pInT1;
*pInT1++ = in1 - (in * *pPRT_in++);
/* Decrement the loop counter */
j--;
}
/* Loop over the number of columns to
replace the elements in the destination matrix */
j = numCols;
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
in1 = *pInT2;
*pInT2++ = in1 - (in * *pPRT_pDst++);
/* Decrement loop counter */
j--;
}
}
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
/* Decrement loop counter */
k--;
/* Increment pivot index */
i++;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
#else
float64_t Xchg, in = 0.0; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _ _ _ _ _
* | | a11 a12 | | | 1 0 | | | X11 X12 |
* | | | | | | | = | |
* |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
* Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
/* Loop over the number of rows */
rowCnt = numRows;
/* Making the destination matrix as identity matrix */
while (rowCnt > 0U)
{
/* Writing all zeroes in lower triangle of the destination matrix */
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0;
j--;
}
/* Decrement loop counter */
rowCnt--;
}
/* Loop over the number of columns of the input matrix.
All the elements in each column are processed by the row operations */
loopCnt = numCols;
/* Index modifier to navigate through the columns */
l = 0U;
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
* If it is zero then interchange the row with non zero row below.
* If there is no non zero element to replace in the rows below,
* then the matrix is Singular. */
/* Working pointer for the input matrix that points
* to the pivot element of the particular row */
pInT1 = pIn + (l * numCols);
/* Working pointer for the destination matrix that points
* to the pivot element of the particular row */
pOutT1 = pOut + (l * numCols);
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Destination pointer modifier */
k = 1U;
/* Check if the pivot element is zero */
if (*pInT1 == 0.0)
{
/* Loop over the number rows present below */
for (i = (l + 1U); i < numRows; i++)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * k);
/* Check if there is a non zero pivot element to
* replace in the rows below */
if (*pInT2 != 0.0)
{
/* Loop over number of columns
* to the right of the pilot element */
for (j = 0U; j < (numCols - l); j++)
{
/* Exchange the row elements of the input matrix */
Xchg = *pInT2;
*pInT2++ = *pInT1;
*pInT1++ = Xchg;
}
for (j = 0U; j < numCols; j++)
{
Xchg = *pOutT2;
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
}
/* Flag to indicate whether exchange is done or not */
flag = 1U;
/* Break after exchange is done */
break;
}
/* Update the destination pointer modifier */
k++;
}
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0))
{
return ARM_MATH_SINGULAR;
}
/* Points to the pivot row of input and destination matrices */
pPivotRowIn = pIn + (l * numCols);
pPivotRowDst = pOut + (l * numCols);
/* Temporary pointers to the pivot row pointers */
pInT1 = pPivotRowIn;
pOutT1 = pPivotRowDst;
/* Pivot element of the row */
in = *(pIn + (l * numCols));
/* Loop over number of columns
* to the right of the pilot element */
for (j = 0U; j < (numCols - l); j++)
{
/* Divide each element of the row of the input matrix
* by the pivot element */
*pInT1 = *pInT1 / in;
pInT1++;
}
for (j = 0U; j < numCols; j++)
{
/* Divide each element of the row of the destination matrix
* by the pivot element */
*pOutT1 = *pOutT1 / in;
pOutT1++;
}
/* Replace the rows with the sum of that row and a multiple of row i
* so that each new element in column i above row i is zero.*/
/* Temporary pointers for input and destination matrices */
pInT1 = pIn;
pOutT1 = pOut;
for (i = 0U; i < numRows; i++)
{
/* Check for the pivot element */
if (i == l)
{
/* If the processing element is the pivot element,
only the columns to the right are to be processed */
pInT1 += numCols - l;
pOutT1 += numCols;
}
else
{
/* Element of the reference row */
in = *pInT1;
/* Working pointers for input and destination pivot rows */
pPRT_in = pPivotRowIn;
pPRT_pDst = pPivotRowDst;
/* Loop over the number of columns to the right of the pivot element,
to replace the elements in the input matrix */
for (j = 0U; j < (numCols - l); j++)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
*pInT1 = *pInT1 - (in * *pPRT_in++);
pInT1++;
}
/* Loop over the number of columns to
replace the elements in the destination matrix */
for (j = 0U; j < numCols; j++)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
*pOutT1 = *pOutT1 - (in * *pPRT_pDst++);
pOutT1++;
}
}
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
if ((flag != 1U) && (in == 0.0))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if (pIn[i] != 0.0)
break;
}
if (i == numRows * numCols)
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
/**
@} end of MatrixInv group
*/

View file

@ -0,0 +1,983 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_f32.c
* Description: Floating-point matrix multiplication
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMatrix
*/
/**
* @defgroup MatrixMult Matrix Multiplication
*
* Multiplies two matrices.
*
* \image html MatrixMultiplication.gif "Multiplication of two 3 x 3 matrices"
* Matrix multiplication is only defined if the number of columns of the
* first matrix equals the number of rows of the second matrix.
* Multiplying an <code>M x N</code> matrix with an <code>N x P</code> matrix results
* in an <code>M x P</code> matrix.
* When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
* <code>pSrcA</code> and <code>pSrcB</code> are equal; and (2) that the size of the output
* matrix equals the outer dimensions of <code>pSrcA</code> and <code>pSrcB</code>.
*/
/**
* @addtogroup MatrixMult
* @{
*/
/**
* @brief Floating-point matrix multiplication.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#define MATRIX_DIM3 3
#define MATRIX_DIM4 4
__STATIC_INLINE arm_status arm_mat_mult_f32_2x2_mve(
const arm_matrix_instance_f32 *pSrcA,
const arm_matrix_instance_f32 *pSrcB,
arm_matrix_instance_f32 *pDst)
{
/* {a00, a00, a10, a10} */
static const uint32_t offsetA0[4] = { 0, 0, 2, 2 };
/* {b00, b01, b00, b01} */
static const uint32_t offsetB0[4] = { 0, 1, 0, 1 };
/* {a01, a01, a11, a11} */
static const uint32_t offsetA1[4] = { 1, 1, 3, 3 };
/* {b10, b11, b10, b11} */
static const uint32_t offsetB1[4] = { 2, 3, 2, 3 };
uint32x4_t vecOffsA, vecOffsB;
f32x4_t vecInA, vecInB, vecDst;
vecOffsA = vldrwq_u32((uint32_t const *) offsetA0);
vecOffsB = vldrwq_u32((uint32_t const *) offsetB0);
vecInA = vldrwq_gather_shifted_offset((float32_t const *) pSrcA->pData, vecOffsA);
vecInB = vldrwq_gather_shifted_offset((float32_t const *) pSrcB->pData, vecOffsB);
vecDst = vmulq(vecInA, vecInB);
vecOffsA = vldrwq_u32((uint32_t const *) offsetA1);
vecOffsB = vldrwq_u32((uint32_t const *) offsetB1);
vecInA = vldrwq_gather_shifted_offset((float32_t const *) pSrcA->pData, vecOffsA);
vecInB = vldrwq_gather_shifted_offset((float32_t const *) pSrcB->pData, vecOffsB);
vecDst = vfmaq(vecDst, vecInA, vecInB);
vstrwq_f32(pDst->pData, vecDst);
return (ARM_MATH_SUCCESS);
}
/*
* A = {{a00, a01, a02},
* {a10, a11, a12},
* {a20, a21, a22}}
* B = {{b00, b01, b02},
* {b10, b11, b12},
* {b20, b21, b22}}
*
* Dst = {{a00 b00 + a01 b10 + a02 b20, a00 b01 + a01 b11 + a02 b21, a00 b02 + a01 b12 + a02 b22},
* {a10 b00 + a11 b10 + a12 b20, a10 b01 + a11 b11 + a12 b21, a10 b02 + a11 b12 + a12 b22},
* {a20 b00 + a21 b10 + a22 b20, a20 b01 + a21 b11 + a22 b21, a20 b02 + a21 b12 + a22 b22}}
*/
__STATIC_INLINE arm_status arm_mat_mult_f32_3x3_mve(
const arm_matrix_instance_f32 *pSrcA,
const arm_matrix_instance_f32 *pSrcB,
arm_matrix_instance_f32 *pDst)
{
float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *pInA0, *pInA1, *pInA2;
f32x4_t vecMac0, vecMac1, vecMac2;
f32x4_t vecInB;
float32_t const *pSrBVec;
pSrBVec = (float32_t const *) pInB;
pInA0 = pInA;
pInA1 = pInA0 + MATRIX_DIM3;
pInA2 = pInA1 + MATRIX_DIM3;
/* enable predication to disable last (4th) vector element */
mve_pred16_t p0 = vctp32q(MATRIX_DIM3);
/*
* load {b0,0, b0,1, b0,2, 0}
*/
vecInB = vldrwq_z_f32(pSrBVec, p0);
pSrBVec += MATRIX_DIM3;
vecMac0 = vmulq(vecInB, *pInA0++);
vecMac1 = vmulq(vecInB, *pInA1++);
vecMac2 = vmulq(vecInB, *pInA2++);
/*
* load {b1,0, b1,1, b1,2, 0}
*/
vecInB = vldrwq_z_f32(pSrBVec, p0);
pSrBVec += MATRIX_DIM3;
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
/*
* load {b2,0, b2,1 , b2,2, 0}
*/
vecInB = vldrwq_z_f32(pSrBVec, p0);
pSrBVec += MATRIX_DIM3;
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
/* partial vector stores */
vstrwq_p_f32(pOut, vecMac0, p0);
pOut += MATRIX_DIM3;
vstrwq_p_f32(pOut, vecMac1, p0);
pOut += MATRIX_DIM3;
vstrwq_p_f32(pOut, vecMac2, p0);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_mult_f32_4x4_mve(
const arm_matrix_instance_f32 *pSrcA,
const arm_matrix_instance_f32 *pSrcB,
arm_matrix_instance_f32 *pDst)
{
float32_t const *pSrBVec;
float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *pInA0, *pInA1, *pInA2, *pInA3;
f32x4_t vecMac0, vecMac1, vecMac2, vecMac3;
f32x4_t vecInB;
pSrBVec = (float32_t const *) pInB;
pInA0 = pInA;
pInA1 = pInA0 + MATRIX_DIM4;
pInA2 = pInA1 + MATRIX_DIM4;
pInA3 = pInA2 + MATRIX_DIM4;
/*
* load {b0,0, b0,1, b0,2, b0,3}
*/
vecInB = vld1q(pSrBVec);
pSrBVec += MATRIX_DIM4;
vecMac0 = vmulq(vecInB, *pInA0++);
vecMac1 = vmulq(vecInB, *pInA1++);
vecMac2 = vmulq(vecInB, *pInA2++);
vecMac3 = vmulq(vecInB, *pInA3++);
/*
* load {b1,0, b1,1, b1,2, b1,3}
*/
vecInB = vld1q(pSrBVec);
pSrBVec += MATRIX_DIM4;
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
/*
* load {b2,0, b2,1, b2,2, b2,3}
*/
vecInB = vld1q(pSrBVec);
pSrBVec += MATRIX_DIM4;
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
/*
* load {b3,0, b3,1, b3,2, b3,3}
*/
vecInB = vld1q(pSrBVec);
pSrBVec += MATRIX_DIM4;
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
vst1q(pOut, vecMac0);
pOut += MATRIX_DIM4;
vst1q(pOut, vecMac1);
pOut += MATRIX_DIM4;
vst1q(pOut, vecMac2);
pOut += MATRIX_DIM4;
vst1q(pOut, vecMac3);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
int numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
int numColsB = pSrcB->numCols; /* number of columns of input matrix B */
int numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t blkCnt; /* loop counters */
uint32_t i;
arm_status status;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* small squared matrix specialized routines */
if(numRowsA == numColsB && numColsB == numColsA) {
if (numRowsA == 1)
{
pOut[0] = pInA[0] * pInB[0];
return(ARM_MATH_SUCCESS);
}
else if(numRowsA == 2)
return arm_mat_mult_f32_2x2_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 3)
return arm_mat_mult_f32_3x3_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 4)
return arm_mat_mult_f32_4x4_mve(pSrcA, pSrcB, pDst);
}
/* main loop process 4 rows */
i = numRowsA >> 2;
while (i > 0U)
{
float32_t *pInA0, *pInA1, *pInA2, *pInA3;
float32_t *pInB0;
float32_t *pOut0, *pOut1, *pOut2, *pOut3;
f32x4_t vecMac0, vecMac1, vecMac2, vecMac3;
f32x4_t vecInB;
/* pointers to 4 consecutive output rows */
pOut0 = pOut;
pOut1 = pOut0 + numColsB;
pOut2 = pOut1 + numColsB;
pOut3 = pOut2 + numColsB;
pInB0 = pInB;
uint32_t k = numColsB >> 2;
while (k > 0U)
{
/* pointers to 4 consecutive Matrix A rows */
pInA0 = pInA;
pInA1 = pInA0 + numColsA;
pInA2 = pInA1 + numColsA;
pInA3 = pInA2 + numColsA;
vecMac0 = vdupq_n_f32(0.0f);
vecMac1 = vdupq_n_f32(0.0f);
vecMac2 = vdupq_n_f32(0.0f);
vecMac3 = vdupq_n_f32(0.0f);
blkCnt = numColsA;
while (blkCnt > 0U)
{
/*
* load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
*/
vecInB = *(f32x4_t *)pInB0; /* vldrwq_f32(pInB0, 0); */
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
pInB0 = pInB0 + numColsB;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* Store the results (4 x 4 block) in the destination buffer */
vst1q(pOut0, vecMac0);
pOut0 += 4;
vst1q(pOut1, vecMac1);
pOut1 += 4;
vst1q(pOut2, vecMac2);
pOut2 += 4;
vst1q(pOut3, vecMac3);
pOut3 += 4;
/*
* rewind
*/
pInB0 -= (numColsB * numColsA) - 4;
k--;
}
int colBLeft = numColsB & 3;
if (colBLeft)
{
pInA0 = pInA;
pInA1 = pInA0 + numColsA;
pInA2 = pInA1 + numColsA;
pInA3 = pInA2 + numColsA;
mve_pred16_t p0 = vctp32q(colBLeft);
vecMac0 = vdupq_n_f32(0.0f);
vecMac1 = vdupq_n_f32(0.0f);
vecMac2 = vdupq_n_f32(0.0f);
vecMac3 = vdupq_n_f32(0.0f);
blkCnt = numColsA;
while (blkCnt > 0U)
{
/*
* load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
*/
vecInB = vldrwq_z_f32(pInB0, p0);
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
pInB0 = pInB0 + numColsB;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* Store the results (4 x colBLeft block) in the destination buffer */
vstrwq_p_f32(pOut0, vecMac0, p0);
vstrwq_p_f32(pOut1, vecMac1, p0);
vstrwq_p_f32(pOut2, vecMac2, p0);
vstrwq_p_f32(pOut3, vecMac3, p0);
}
/* move to next rows */
pInA += 4 * numColsA;
pOut += 4 * numColsB;
i--;
}
/*
* non multiple of 4 rows for Matrix A
* process single row
*/
if (numRowsA & 3)
{
i = numRowsA & 3;
while (i > 0U)
{
float32_t *pInA0;
float32_t *pInB0;
float32_t *pOut0;
f32x4_t vecInB;
f32x4_t vecMac0;
pOut0 = pOut;
pInB0 = pInB;
uint32_t k = numColsB >> 2;
while (k > 0U)
{
pInA0 = pInA;
vecMac0 = vdupq_n_f32(0.0f);
blkCnt = numColsA;
while (blkCnt > 0U)
{
/*
* load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
*/
vecInB = *(f32x4_t *)pInB0; /* vldrwq_f32(pInB0, 0); */
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
pInB0 = pInB0 + numColsB;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* Store the results (1 x 4 block) in the destination buffer */
vst1q(pOut0, vecMac0);
pOut0 += 4;
/*
* rewind
*/
pInB0 -= (numColsB * numColsA) - 4;
k--;
}
int colBLeft = numColsB & 3;
if (colBLeft)
{
pInA0 = pInA;
mve_pred16_t p0 = vctp32q(colBLeft);
vecMac0 = vdupq_n_f32(0.0f);
blkCnt = numColsA;
while (blkCnt > 0U)
{
/*
* load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
*/
vecInB = vldrwq_z_f32(pInB0, p0);
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
pInB0 = pInB0 + numColsB;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* Store the results (1 x colBLeft block) in the destination buffer */
vstrwq_p_f32(pOut0, vecMac0, p0);
}
/* move to next row */
pInA += 1 * numColsA;
pOut += 1 * numColsB;
i--;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON)
#define GROUPOFROWS 8
arm_status arm_mat_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
float32_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t col, i = 0U, j, row = numRowsA, rowCnt, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
float32x4_t a0V, a1V, a2V, a3V, a4V, a5V, a6V, a7V;
float32x4_t acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,temp;
float32x2_t accum = vdup_n_f32(0);
float32_t *pIn1B = pSrcA->pData;
float32_t *pIn1C = pSrcA->pData;
float32_t *pIn1D = pSrcA->pData;
float32_t *pIn1E = pSrcA->pData;
float32_t *pIn1F = pSrcA->pData;
float32_t *pIn1G = pSrcA->pData;
float32_t *pIn1H = pSrcA->pData;
float32_t *pxB,*pxC, *pxD, *pxE, *pxF, *pxG, *pxH; /* Temporary output data matrix pointer */
float32_t sum0,sum1, sum2,sum3, sum4, sum5 , sum6, sum7;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* Row loop */
rowCnt = row >> 3;
while(rowCnt > 0)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + GROUPOFROWS*i;
pxB = px + numColsB;
pxC = px + 2*numColsB;
pxD = px + 3*numColsB;
pxE = px + 4*numColsB;
pxF = px + 5*numColsB;
pxG = px + 6*numColsB;
pxH = px + 7*numColsB;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* Column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum0 = 0.0f;
sum1 = 0.0f;
sum2 = 0.0f;
sum3 = 0.0f;
sum4 = 0.0f;
sum5 = 0.0f;
sum6 = 0.0f;
sum7 = 0.0f;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
pIn1B = pIn1 + numColsA;
pIn1C = pIn1 + 2*numColsA;
pIn1D = pIn1 + 3*numColsA;
pIn1E = pIn1 + 4*numColsA;
pIn1F = pIn1 + 5*numColsA;
pIn1G = pIn1 + 6*numColsA;
pIn1H = pIn1 + 7*numColsA;
acc0 = vdupq_n_f32(0.0);
acc1 = vdupq_n_f32(0.0);
acc2 = vdupq_n_f32(0.0);
acc3 = vdupq_n_f32(0.0);
acc4 = vdupq_n_f32(0.0);
acc5 = vdupq_n_f32(0.0);
acc6 = vdupq_n_f32(0.0);
acc7 = vdupq_n_f32(0.0);
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* Matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a0V = vld1q_f32(pIn1);
a1V = vld1q_f32(pIn1B);
a2V = vld1q_f32(pIn1C);
a3V = vld1q_f32(pIn1D);
a4V = vld1q_f32(pIn1E);
a5V = vld1q_f32(pIn1F);
a6V = vld1q_f32(pIn1G);
a7V = vld1q_f32(pIn1H);
pIn1 += 4;
pIn1B += 4;
pIn1C += 4;
pIn1D += 4;
pIn1E += 4;
pIn1F += 4;
pIn1G += 4;
pIn1H += 4;
temp = vsetq_lane_f32(*pIn2,temp,0);
pIn2 += numColsB;
temp = vsetq_lane_f32(*pIn2,temp,1);
pIn2 += numColsB;
temp = vsetq_lane_f32(*pIn2,temp,2);
pIn2 += numColsB;
temp = vsetq_lane_f32(*pIn2,temp,3);
pIn2 += numColsB;
acc0 = vmlaq_f32(acc0,a0V,temp);
acc1 = vmlaq_f32(acc1,a1V,temp);
acc2 = vmlaq_f32(acc2,a2V,temp);
acc3 = vmlaq_f32(acc3,a3V,temp);
acc4 = vmlaq_f32(acc4,a4V,temp);
acc5 = vmlaq_f32(acc5,a5V,temp);
acc6 = vmlaq_f32(acc6,a6V,temp);
acc7 = vmlaq_f32(acc7,a7V,temp);
/* Decrement the loop count */
colCnt--;
}
accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
sum0 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
sum1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2));
sum2 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3));
sum3 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(acc4), vget_high_f32(acc4));
sum4 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(acc5), vget_high_f32(acc5));
sum5 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(acc6), vget_high_f32(acc6));
sum6 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(acc7), vget_high_f32(acc7));
sum7 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA & 3;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
sum0 += *pIn1++ * (*pIn2);
sum1 += *pIn1B++ * (*pIn2);
sum2 += *pIn1C++ * (*pIn2);
sum3 += *pIn1D++ * (*pIn2);
sum4 += *pIn1E++ * (*pIn2);
sum5 += *pIn1F++ * (*pIn2);
sum6 += *pIn1G++ * (*pIn2);
sum7 += *pIn1H++ * (*pIn2);
pIn2 += numColsB;
/* Decrement the loop counter */
colCnt--;
}
/* Store the result in the destination buffer */
*px++ = sum0;
*pxB++ = sum1;
*pxC++ = sum2;
*pxD++ = sum3;
*pxE++ = sum4;
*pxF++ = sum5;
*pxG++ = sum6;
*pxH++ = sum7;
/* Update the pointer pIn2 to point to the starting address of the next column */
j++;
pIn2 = pSrcB->pData + j;
/* Decrement the column loop counter */
col--;
} while (col > 0U);
/* Update the pointer pInA to point to the starting address of the next row */
i = i + numColsB;
pInA = pInA + GROUPOFROWS*numColsA;
/* Decrement the row loop counter */
rowCnt--;
}
/*
i was the index of a group of rows computed by previous loop.
Now i is the index of a row since below code is computing row per row
and no more group of row per group of rows.
*/
i = GROUPOFROWS*i;
rowCnt = row & 7;
while(rowCnt > 0)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* Column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0.0f;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
acc0 = vdupq_n_f32(0.0);
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* Matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a0V = vld1q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
pIn1 += 4;
temp = vsetq_lane_f32(*pIn2,temp,0);
pIn2 += numColsB;
temp = vsetq_lane_f32(*pIn2,temp,1);
pIn2 += numColsB;
temp = vsetq_lane_f32(*pIn2,temp,2);
pIn2 += numColsB;
temp = vsetq_lane_f32(*pIn2,temp,3);
pIn2 += numColsB;
acc0 = vmlaq_f32(acc0,a0V,temp);
/* Decrement the loop count */
colCnt--;
}
accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
sum += *pIn1++ * (*pIn2);
pIn2 += numColsB;
/* Decrement the loop counter */
colCnt--;
}
/* Store the result in the destination buffer */
*px++ = sum;
/* Update the pointer pIn2 to point to the starting address of the next column */
j++;
pIn2 = pSrcB->pData + j;
/* Decrement the column loop counter */
col--;
} while (col > 0U);
/* Update the pointer pInA to point to the starting address of the next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement the row loop counter */
rowCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
float32_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
float32_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
float32_t *pOut = pDst->pData; /* Output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
float32_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of row being processed */
px = pOut + i;
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0.0f;
/* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 MACs at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining MACs */
colCnt = numColsA % 0x4U;
#else
/* Initialize cntCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Store result in destination buffer */
*px++ = sum;
/* Decrement column loop counter */
col--;
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixMult group
*/

View file

@ -0,0 +1,483 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_fast_q15.c
* Description: Q15 matrix multiplication (fast variant)
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q15 matrix multiplication (fast variant).
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@param[in] pState points to the array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The difference between the function \ref arm_mat_mult_q15() and this fast variant is that
the fast variant use a 32-bit rather than a 64-bit accumulator.
The result of each 1.15 x 1.15 multiplication is truncated to
2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
format. Finally, the accumulator is saturated and converted to a 1.15 result.
@par
The fast version has the same overflow behavior as the standard version but provides
less precision since it discards the low 16 bits of each multiplication result.
In order to avoid overflows completely the input signals must be scaled down.
Scale down one of the input matrices by log2(numColsA) bits to avoid overflows,
as a total of numColsA additions are computed internally for each output element.
@remark
Refer to \ref arm_mat_mult_q15() for a slower implementation of this function
which uses 64-bit accumulation to provide higher precision.
*/
arm_status arm_mat_mult_fast_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState)
{
q31_t sum; /* Accumulator */
q15_t *pSrcBT = pState; /* Input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* Number of rows of input matrix A */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#if defined (ARM_MATH_DSP)
q31_t in; /* Temporary variable to hold the input value */
q31_t inA1, inB1, inA2, inB2;
q31_t sum2, sum3, sum4;
q15_t *pInA2, *pInB2, *px2;
uint32_t j = 0;
#else
q15_t in; /* Temporary variable to hold the input value */
q15_t inA1, inB1, inA2, inB2;
#endif /* #if defined (ARM_MATH_DSP) */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
/* Apply loop unrolling and exchange columns with row elements */
col = numColsB >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
#if defined (ARM_MATH_DSP)
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pInB);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
in = read_q15x2_ia ((q15_t **) &pInB);
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
#else /* #if defined (ARM_MATH_DSP) */
/* Read one element from row */
in = *pInB++;
/* Store one element in destination */
*px = in;
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
in = *pInB++;
*px = in;
px += numRowsB;
in = *pInB++;
*px = in;
px += numRowsB;
in = *pInB++;
*px = in;
px += numRowsB;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement column loop counter */
col--;
}
/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
col = numColsB % 0x4U;
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pInB++;
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
#if defined (ARM_MATH_DSP)
/* Process two rows from matrix A at a time and output two rows at a time */
row = row >> 1U;
px2 = px + numColsB;
#endif
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
while (row > 0U)
{
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
#if defined (ARM_MATH_DSP)
/* Process two (transposed) columns from matrix B at a time */
col = col >> 1U;
j = 0;
#endif
/* column loop */
while (col > 0U)
{
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i;
#if defined (ARM_MATH_DSP)
sum2 = 0;
sum3 = 0;
sum4 = 0;
pInB = pSrcBT + j;
pInA2 = pInA + numColsA;
pInB2 = pInB + numRowsB;
/* Read in two elements at once - alows dual MAC instruction */
colCnt = numColsA >> 1U;
#else
colCnt = numColsA >> 2U;
#endif
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = read_q15x2_ia ((q15_t **) &pInA);
inB1 = read_q15x2_ia ((q15_t **) &pInB);
inA2 = read_q15x2_ia ((q15_t **) &pInA2);
inB2 = read_q15x2_ia ((q15_t **) &pInB2);
/* Multiply and Accumlates */
sum = __SMLAD(inA1, inB1, sum);
sum2 = __SMLAD(inA1, inB2, sum2);
sum3 = __SMLAD(inA2, inB1, sum3);
sum4 = __SMLAD(inA2, inB2, sum4);
#else
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = *pInA++;
inB1 = *pInB++;
/* Multiply and Accumlates */
sum += inA1 * inB1;
inA2 = *pInA++;
inB2 = *pInB++;
sum += inA2 * inB2;
inA1 = *pInA++;
inB1 = *pInB++;
sum += inA1 * inB1;
inA2 = *pInA++;
inB2 = *pInB++;
sum += inA2 * inB2;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
colCnt--;
}
/* process odd column samples */
#if defined (ARM_MATH_DSP)
if (numColsA & 1U) {
inA1 = *pInA++;
inB1 = *pInB++;
inA2 = *pInA2++;
inB2 = *pInB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
}
#else
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
sum += (q31_t) *pInA++ * *pInB++;
/* Decrement loop counter */
colCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Saturate and store result in destination buffer */
*px++ = (q15_t) (sum >> 15);
#if defined (ARM_MATH_DSP)
*px++ = (q15_t) (sum2 >> 15);
*px2++ = (q15_t) (sum3 >> 15);
*px2++ = (q15_t) (sum4 >> 15);
j += numRowsB * 2;
#endif
/* Decrement column loop counter */
col--;
}
i = i + numColsA;
#if defined (ARM_MATH_DSP)
i = i + numColsA;
px = px2 + (numColsB & 1U);
px2 = px + numColsB;
#endif
/* Decrement row loop counter */
row--;
}
/* Compute any remaining odd row/column below */
#if defined (ARM_MATH_DSP)
/* Compute remaining output column */
if (numColsB & 1U) {
/* Avoid redundant computation of last element */
row = numRowsA & (~0x1);
/* Point to remaining unfilled column in output matrix */
px = pDst->pData + numColsB-1;
pInA = pSrcA->pData;
/* row loop */
while (row > 0)
{
/* point to last column in matrix B */
pInB = pSrcBT + numRowsB * (numColsB-1);
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = read_q15x2_ia ((q15_t **) &pInA);
inA2 = read_q15x2_ia ((q15_t **) &pInA);
inB1 = read_q15x2_ia ((q15_t **) &pInB);
inB2 = read_q15x2_ia ((q15_t **) &pInB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
/* Decrement loop counter */
colCnt--;
}
colCnt = numColsA & 3U;
while (colCnt > 0U) {
sum += (q31_t) (*pInA++) * (*pInB++);
colCnt--;
}
/* Store result in destination buffer */
*px = (q15_t) (sum >> 15);
px += numColsB;
/* Decrement row loop counter */
row--;
}
}
/* Compute remaining output row */
if (numRowsA & 1U) {
/* point to last row in output matrix */
px = pDst->pData + (numColsB) * (numRowsA-1);
pInB = pSrcBT;
col = numColsB;
i = 0U;
/* col loop */
while (col > 0)
{
/* point to last row in matrix A */
pInA = pSrcA->pData + (numRowsA-1) * numColsA;
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = read_q15x2_ia ((q15_t **) &pInA);
inA2 = read_q15x2_ia ((q15_t **) &pInA);
inB1 = read_q15x2_ia ((q15_t **) &pInB);
inB2 = read_q15x2_ia ((q15_t **) &pInB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
/* Decrement loop counter */
colCnt--;
}
colCnt = numColsA % 4U;
while (colCnt > 0U) {
sum += (q31_t) (*pInA++) * (*pInB++);
colCnt--;
}
/* Store result in destination buffer */
*px++ = (q15_t) (sum >> 15);
/* Decrement column loop counter */
col--;
}
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixMult group
*/

View file

@ -0,0 +1,374 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_fast_q31.c
* Description: Q31 matrix multiplication (fast variant)
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q31 matrix multiplication (fast variant).
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The difference between the function \ref arm_mat_mult_q31() and this fast variant is that
the fast variant use a 32-bit rather than a 64-bit accumulator.
The result of each 1.31 x 1.31 multiplication is truncated to
2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
format. Finally, the accumulator is saturated and converted to a 1.31 result.
@par
The fast version has the same overflow behavior as the standard version but provides
less precision since it discards the low 32 bits of each multiplication result.
In order to avoid overflows completely the input signals must be scaled down.
Scale down one of the input matrices by log2(numColsA) bits to avoid overflows,
as a total of numColsA additions are computed internally for each output element.
@remark
Refer to \ref arm_mat_mult_q31() for a slower implementation of this function
which uses 64-bit accumulation to provide higher precision.
*/
arm_status arm_mat_mult_fast_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA2;
q31_t *px; /* Temporary output data matrix pointer */
q31_t *px2;
q31_t sum1, sum2, sum3, sum4; /* Accumulator */
q31_t inA1, inA2, inB1, inB2;
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
px = pDst->pData;
row = row >> 1U;
px2 = px + numColsB;
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
while (row > 0U)
{
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pInB = pSrcB->pData;
j = 0U;
col = col >> 1U;
/* column loop */
while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
sum1 = 0;
sum2 = 0;
sum3 = 0;
sum4 = 0;
/* Initiate data pointers */
pInA = pSrcA->pData + i;
pInB = pSrcB->pData + j;
pInA2 = pInA + numColsA;
colCnt = numColsA;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
inA1 = *pInA++;
inB1 = pInB[0];
inA2 = *pInA2++;
inB2 = pInB[1];
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum2 = __SMMLA(inA1, inB2, sum2);
sum3 = __SMMLA(inA2, inB1, sum3);
sum4 = __SMMLA(inA2, inB2, sum4);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum2 = (q31_t) ((((q63_t) sum2 << 32) + ((q63_t) inA1 * inB2)) >> 32);
sum3 = (q31_t) ((((q63_t) sum3 << 32) + ((q63_t) inA2 * inB1)) >> 32);
sum4 = (q31_t) ((((q63_t) sum4 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
/* Decrement loop counter */
colCnt--;
}
/* Convert the result from 2.30 to 1.31 format and store in destination buffer */
*px++ = sum1 << 1;
*px++ = sum2 << 1;
*px2++ = sum3 << 1;
*px2++ = sum4 << 1;
j += 2;
/* Decrement column loop counter */
col--;
}
i = i + (numColsA << 1U);
px = px2 + (numColsB & 1U);
px2 = px + numColsB;
/* Decrement row loop counter */
row--;
}
/* Compute any remaining odd row/column below */
/* Compute remaining output column */
if (numColsB & 1U) {
/* Avoid redundant computation of last element */
row = numRowsA & (~1U);
/* Point to remaining unfilled column in output matrix */
px = pDst->pData + numColsB-1;
pInA = pSrcA->pData;
/* row loop */
while (row > 0)
{
/* point to last column in matrix B */
pInB = pSrcB->pData + numColsB-1;
/* Set variable sum1, that acts as accumulator, to zero */
sum1 = 0;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 columns at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining column */
colCnt = numColsA % 4U;
#else
/* Initialize colCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U) {
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
colCnt--;
}
/* Convert the result from 2.30 to 1.31 format and store in destination buffer */
*px = sum1 << 1;
px += numColsB;
/* Decrement row loop counter */
row--;
}
}
/* Compute remaining output row */
if (numRowsA & 1U) {
/* point to last row in output matrix */
px = pDst->pData + (numColsB) * (numRowsA-1);
col = numColsB;
i = 0U;
/* col loop */
while (col > 0)
{
/* point to last row in matrix A */
pInA = pSrcA->pData + (numRowsA-1) * numColsA;
pInB = pSrcB->pData + i;
/* Set variable sum1, that acts as accumulator, to zero */
sum1 = 0;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 columns at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = *pInA++;
inA2 = *pInA++;
inB1 = *pInB;
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum1 = __SMMLA(inA2, inB2, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
inA1 = *pInA++;
inA2 = *pInA++;
inB1 = *pInB;
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum1 = __SMMLA(inA2, inB2, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining column */
colCnt = numColsA % 4U;
#else
/* Initialize colCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U) {
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
colCnt--;
}
/* Saturate and store the result in the destination buffer */
*px++ = sum1 << 1;
i++;
/* Decrement col loop counter */
col--;
}
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
/**
@} end of MatrixMult group
*/

View file

@ -0,0 +1,892 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_q15.c
* Description: Q15 matrix multiplication
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q15 matrix multiplication.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@param[in] pState points to the array for storing intermediate results (Unused)
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator. The inputs to the
multiplications are in 1.15 format and multiplications yield a 2.30 result.
The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
This approach provides 33 guard bits and there is no risk of overflow.
The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits
and then saturated to 1.15 format.
@par
Refer to \ref arm_mat_mult_fast_q15() for a faster but less precise version of this function.
*/
#if defined(ARM_MATH_MVEI)
#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
#define MATRIX_DIM2 2
#define MATRIX_DIM3 3
#define MATRIX_DIM4 4
__STATIC_INLINE arm_status arm_mat_mult_q15_2x2_mve(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs;
q15_t *pInA0 = pInA;
q15_t *pInA1 = pInA0 + MATRIX_DIM2;
q63_t acc0, acc1;
q15x8_t vecB, vecA0, vecA1;
mve_pred16_t p0 = vctp16q(MATRIX_DIM2);
vecColBOffs = vidupq_u16((uint32_t)0, 2); /* MATRIX_DIM2 */
pInB = pSrcB->pData;
vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
vecA0 = vldrhq_s16(pInA0);
vecA1 = vldrhq_s16(pInA1);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
pOut[0 * MATRIX_DIM2] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM2] = (q15_t) __SSAT(acc1, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
pOut[0 * MATRIX_DIM2] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM2] = (q15_t) __SSAT(acc1, 16);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_mult_q15_3x3_mve(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs;
q15_t *pInA0 = pInA;
q15_t *pInA1 = pInA0 + MATRIX_DIM3;
q15_t *pInA2 = pInA1 + MATRIX_DIM3;
q63_t acc0, acc1, acc2;
q15x8_t vecB, vecA0, vecA1, vecA2;
mve_pred16_t p0 = vctp16q(MATRIX_DIM3);
vecColBOffs = vidupq_u16((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM3;
pInB = pSrcB->pData;
vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
vecA0 = vldrhq_s16(pInA0);
vecA1 = vldrhq_s16(pInA1);
vecA2 = vldrhq_s16(pInA2);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_mult_q15_4x4_mve(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs;
q15_t *pInA0 = pInA;
q15_t *pInA1 = pInA0 + MATRIX_DIM4;
q15_t *pInA2 = pInA1 + MATRIX_DIM4;
q15_t *pInA3 = pInA2 + MATRIX_DIM4;
q63_t acc0, acc1, acc2, acc3;
q15x8_t vecB, vecA0, vecA1, vecA2, vecA3;
mve_pred16_t p0 = vctp16q(MATRIX_DIM4);
vecColBOffs = vidupq_u16((uint32_t)0, 4);
pInB = pSrcB->pData;
vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
vecA0 = vldrhq_s16(pInA0);
vecA1 = vldrhq_s16(pInA1);
vecA2 = vldrhq_s16(pInA2);
vecA3 = vldrhq_s16(pInA3);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc3 = vmlaldavq(vecA3, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
acc3 = asrl(acc3, 15);
pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc3 = vmlaldavq(vecA3, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
acc3 = asrl(acc3, 15);
pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc3 = vmlaldavq(vecA3, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
acc3 = asrl(acc3, 15);
pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc3 = vmlaldavq(vecA3, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
acc3 = asrl(acc3, 15);
pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState)
{
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
uint16x8_t vecOffs, vecColBOffs;
uint32_t blkCnt,rowCnt; /* loop counters */
arm_status status; /* Status of matrix multiplication */
(void)pState;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
{
/* small squared matrix specialized routines */
if(numRowsA == numColsB && numColsB == numColsA) {
if (numRowsA == 1)
{
q63_t sum;
sum = pInA[0] * pInB[0];
pOut[0] = (q15_t) __SSAT((sum >> 15), 16);
return (ARM_MATH_SUCCESS);
}
else if(numRowsA == 2)
return arm_mat_mult_q15_2x2_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 3)
return arm_mat_mult_q15_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_mult_q15_4x4_mve(pSrcA, pSrcB, pDst);
}
vecColBOffs = vidupq_u16((uint32_t)0, 1);
vecColBOffs = vecColBOffs * (uint16_t) (numColsB);
/*
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
*/
/*
* row loop
*/
rowCnt = row >> 2;
while (rowCnt > 0U)
{
/*
* Output pointer is set to starting address of the row being processed
*/
px = pOut + i;
i = i + 4 * numColsB;
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB;
/*
* For every row wise process, the pInB pointer is set
* to the starting address of the pSrcB data
*/
pInB = pSrcB->pData;
/*
* column loop
*/
while (col > 0U)
{
/*
* generate 4 columns elements
*/
/*
* Matrix A columns number of MAC operations are to be performed
*/
colCnt = numColsA;
q15_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
q15_t *pInA0 = pInA;
q15_t *pInA1 = pInA0 + numColsA;
q15_t *pInA2 = pInA1 + numColsA;
q15_t *pInA3 = pInA2 + numColsA;
q63_t acc0, acc1, acc2, acc3;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
pSrcA0Vec = (q15_t const *) pInA0;
pSrcA1Vec = (q15_t const *) pInA1;
pSrcA2Vec = (q15_t const *) pInA2;
pSrcA3Vec = (q15_t const *) pInA3;
vecOffs = vecColBOffs;
blkCnt = (numColsA) >> 3;
while (blkCnt > 0U)
{
q15x8_t vecB, vecA;
vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
vecOffs = vecOffs + (uint16_t) (numColsB * 8);
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 8;
acc0 = vmlaldavaq(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 8;
acc1 = vmlaldavaq(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 8;
acc2 = vmlaldavaq(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 8;
acc3 = vmlaldavaq(acc3, vecA, vecB);
blkCnt--;
}
/*
* tail
*/
blkCnt = numColsA & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
q15x8_t vecB, vecA;
vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
vecOffs = vecOffs + (uint16_t) (numColsB * 8);
vecA = vld1q(pSrcA0Vec);
acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
vecA = vld1q(pSrcA1Vec);
acc1 = vmlaldavaq_p(acc1, vecA, vecB, p0);
vecA = vld1q(pSrcA2Vec);
acc2 = vmlaldavaq_p(acc2, vecA, vecB, p0);
vecA = vld1q(pSrcA3Vec);
acc3 = vmlaldavaq_p(acc3, vecA, vecB, p0);
}
px[0] = (q15_t)MVE_ASRL_SAT16(acc0, 15);
px[1 * numColsB] = (q15_t)MVE_ASRL_SAT16(acc1, 15);
px[2 * numColsB] = (q15_t)MVE_ASRL_SAT16(acc2, 15);
px[3 * numColsB] = (q15_t)MVE_ASRL_SAT16(acc3, 15);
px++;
/*
* Decrement the column loop counter
*/
col--;
/*
* Update the pointer pInB to point to the starting address of the next column
*/
pInB = pSrcB->pData + (numColsB - col);
}
/*
* Update the pointer pInA to point to the starting address of the next row
*/
pInA += (numColsA * 4);
/*
* Decrement the row loop counter
*/
rowCnt --;
}
rowCnt = row & 3;
while (rowCnt > 0U)
{
/*
* Output pointer is set to starting address of the row being processed
*/
px = pOut + i;
i = i + numColsB;
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB;
/*
* For every row wise process, the pInB pointer is set
* to the starting address of the pSrcB data
*/
pInB = pSrcB->pData;
/*
* column loop
*/
while (col > 0U)
{
/*
* generate 4 columns elements
*/
/*
* Matrix A columns number of MAC operations are to be performed
*/
colCnt = numColsA;
q15_t const *pSrcA0Vec;
q15_t *pInA0 = pInA;
q63_t acc0;
acc0 = 0LL;
pSrcA0Vec = (q15_t const *) pInA0;
vecOffs = vecColBOffs;
blkCnt = (numColsA) >> 3;
while (blkCnt > 0U)
{
q15x8_t vecB, vecA;
vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
vecOffs = vecOffs + (uint16_t) (numColsB * 8);
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 8;
acc0 = vmlaldavaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
*/
blkCnt = numColsA & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
q15x8_t vecB, vecA;
vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
vecOffs = vecOffs + (uint16_t) (numColsB * 8);
vecA = vld1q(pSrcA0Vec);
acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
}
px[0] = (q15_t)MVE_ASRL_SAT16(acc0, 15);
px++;
/*
* Decrement the column loop counter
*/
col--;
/*
* Update the pointer pInB to point to the starting address of the next column
*/
pInB = pSrcB->pData + (numColsB - col);
}
/*
* Update the pointer pInA to point to the starting address of the next row
*/
pInA += (numColsA );
rowCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState)
{
q63_t sum; /* Accumulator */
#if defined (ARM_MATH_DSP) /* != CM0 */
q15_t *pSrcBT = pState; /* Input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* Number of rows of input matrix A */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
q31_t in; /* Temporary variable to hold the input value */
q31_t inA1, inB1, inA2, inB2;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
/* Apply loop unrolling and exchange columns with row elements */
col = numColsB >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pInB);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pInB);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
/* Decrement column loop counter */
col--;
}
/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
col = numColsB % 0x4U;
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pInB++;
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
/* column loop */
do
{
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i;
/* Apply loop unrolling and compute 2 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = read_q15x2_ia ((q15_t **) &pInA);
inB1 = read_q15x2_ia ((q15_t **) &pInB);
inA2 = read_q15x2_ia ((q15_t **) &pInA);
inB2 = read_q15x2_ia ((q15_t **) &pInB);
/* Multiply and Accumlates */
sum = __SMLALD(inA1, inB1, sum);
sum = __SMLALD(inA2, inB2, sum);
/* Decrement loop counter */
colCnt--;
}
/* process remaining column samples */
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
sum += *pInA++ * *pInB++;
/* Decrement loop counter */
colCnt--;
}
/* Saturate and store result in destination buffer */
*px = (q15_t) (__SSAT((sum >> 15), 16));
px++;
/* Decrement column loop counter */
col--;
} while (col > 0U);
i = i + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
#else /* #if defined (ARM_MATH_DSP) */
q15_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
q15_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *pOut = pDst->pData; /* Output data matrix pointer */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
(void)pState;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate pointer pIn1 to point to starting address of pSrcA */
pIn1 = pInA;
/* Matrix A columns number of MAC operations are to be performed */
colCnt = numColsA;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform multiply-accumulates */
sum += (q31_t) * pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Convert result from 34.30 to 1.15 format and store saturated value in destination buffer */
/* Saturate and store result in destination buffer */
*px++ = (q15_t) __SSAT((sum >> 15), 16);
/* Decrement column loop counter */
col--;
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update pointer pSrcA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixMult group
*/

View file

@ -0,0 +1,763 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_q31.c
* Description: Q31 matrix multiplication
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q31 matrix multiplication.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The accumulator has a 2.62 format and maintains full precision of the intermediate
multiplication results but provides only a single guard bit. There is no saturation
on intermediate additions. Thus, if the accumulator overflows it wraps around and
distorts the result. The input signals should be scaled down to avoid intermediate
overflows. The input is thus scaled down by log2(numColsA) bits
to avoid overflows, as a total of numColsA additions are performed internally.
The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
@remark
Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function.
*/
#if defined(ARM_MATH_MVEI)
#define MATRIX_DIM2 2
#define MATRIX_DIM3 3
#define MATRIX_DIM4 4
__STATIC_INLINE arm_status arm_mat_mult_q31_2x2_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs;
q31_t *pInA0 = pInA;
q31_t *pInA1 = pInA0 + MATRIX_DIM2;
q63_t acc0, acc1;
q31x4_t vecB, vecA0, vecA1;
/* enable predication to disable half of vector elements */
mve_pred16_t p0 = vctp32q(MATRIX_DIM2);
vecColBOffs = vidupq_u32((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM2;
pInB = pSrcB->pData;
/* load 1st B column (partial load) */
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
/* load A rows */
vecA0 = vldrwq_s32(pInA0);
vecA1 = vldrwq_s32(pInA1);
acc0 = vrmlaldavhq(vecA0, vecB);
acc1 = vrmlaldavhq(vecA1, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
pOut[0 * MATRIX_DIM2] = (q31_t) acc0;
pOut[1 * MATRIX_DIM2] = (q31_t) acc1;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
acc0 = vrmlaldavhq(vecA0, vecB);
acc1 = vrmlaldavhq(vecA1, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
pOut[0 * MATRIX_DIM2] = (q31_t) acc0;
pOut[1 * MATRIX_DIM2] = (q31_t) acc1;
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_mult_q31_3x3_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs;
q31_t *pInA0 = pInA;
q31_t *pInA1 = pInA0 + MATRIX_DIM3;
q31_t *pInA2 = pInA1 + MATRIX_DIM3;
q63_t acc0, acc1, acc2;
q31x4_t vecB, vecA;
/* enable predication to disable last (4th) vector element */
mve_pred16_t p0 = vctp32q(MATRIX_DIM3);
vecColBOffs = vidupq_u32((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM3;
pInB = pSrcB->pData;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_mult_q31_4x4_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs;
q31_t *pInA0 = pInA;
q31_t *pInA1 = pInA0 + MATRIX_DIM4;
q31_t *pInA2 = pInA1 + MATRIX_DIM4;
q31_t *pInA3 = pInA2 + MATRIX_DIM4;
q63_t acc0, acc1, acc2, acc3;
q31x4_t vecB, vecA;
vecColBOffs = vidupq_u32((uint32_t)0, 4);
pInB = pSrcB->pData;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t const *pInB = (q31_t const *)pSrcB->pData; /* input data matrix pointer B */
q31_t const *pInA = (q31_t const *)pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
uint32x4_t vecOffs, vecColBOffs;
uint32_t blkCnt, rowCnt; /* loop counters */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* small squared matrix specialized routines */
if(numRowsA == numColsB && numColsB == numColsA) {
if (numRowsA == 1)
{
q63_t sum = (q63_t) *pInA * *pInB;
pOut[0] = (q31_t)(sum >> 31);
return (ARM_MATH_SUCCESS);
}
else if(numRowsA == 2)
return arm_mat_mult_q31_2x2_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 3)
return arm_mat_mult_q31_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_mult_q31_4x4_mve(pSrcA, pSrcB, pDst);
}
vecColBOffs = vidupq_u32((uint32_t)0, 1);
vecColBOffs = vecColBOffs * (uint32_t) (numColsB);
/*
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
*/
/*
* row loop
*/
rowCnt = row >> 2;
while (rowCnt > 0U)
{
/*
* Output pointer is set to starting address of the row being processed
*/
px = pOut + i;
i = i + 4 * numColsB;
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB;
/*
* For every row wise process, the pInB pointer is set
* to the starting address of the pSrcB data
*/
pInB = (q31_t const *)pSrcB->pData;
/*
* column loop
*/
while (col > 0U)
{
/*
* generate 4 columns elements
*/
/*
* Matrix A columns number of MAC operations are to be performed
*/
colCnt = numColsA;
q31_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
q31_t const *pInA0 = pInA;
q31_t const *pInA1 = pInA0 + numColsA;
q31_t const *pInA2 = pInA1 + numColsA;
q31_t const *pInA3 = pInA2 + numColsA;
q63_t acc0, acc1, acc2, acc3;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
pSrcA0Vec = (q31_t const *) pInA0;
pSrcA1Vec = (q31_t const *) pInA1;
pSrcA2Vec = (q31_t const *) pInA2;
pSrcA3Vec = (q31_t const *) pInA3;
vecOffs = vecColBOffs;
/* process 1 x 4 block output */
blkCnt = numColsA >> 2;
while (blkCnt > 0U)
{
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
/* move Matrix B read offsets, 4 rows down */
vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4;
acc1 = vrmlaldavhaq(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4;
acc2 = vrmlaldavhaq(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4;
acc3 = vrmlaldavhaq(acc3, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numColsA & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
//vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4;
acc1 = vrmlaldavhaq(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4;
acc2 = vrmlaldavhaq(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4;
acc3 = vrmlaldavhaq(acc3, vecA, vecB);
}
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
px[0] = (q31_t) acc0;
px[1 * numColsB] = (q31_t) acc1;
px[2 * numColsB] = (q31_t) acc2;
px[3 * numColsB] = (q31_t) acc3;
px++;
/*
* Decrement the column loop counter
*/
col--;
/*
* Update the pointer pInB to point to the starting address of the next column
*/
pInB = (q31_t const *)pSrcB->pData + (numColsB - col);
}
/*
* Update the pointer pInA to point to the starting address of the next row
*/
pInA += (numColsA * 4);
/*
* Decrement the row loop counter
*/
rowCnt --;
}
rowCnt = row & 3;
while (rowCnt > 0U)
{
/*
* Output pointer is set to starting address of the row being processed
*/
px = pOut + i;
i = i + numColsB;
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB;
/*
* For every row wise process, the pInB pointer is set
* to the starting address of the pSrcB data
*/
pInB = (q31_t const *)pSrcB->pData;
/*
* column loop
*/
while (col > 0U)
{
/*
* generate 4 columns elements
*/
/*
* Matrix A columns number of MAC operations are to be performed
*/
colCnt = numColsA;
q31_t const *pSrcA0Vec;
q31_t const *pInA0 = pInA;
q63_t acc0;
acc0 = 0LL;
pSrcA0Vec = (q31_t const *) pInA0;
vecOffs = vecColBOffs;
/* process 1 x 4 block output */
blkCnt = numColsA >> 2;
while (blkCnt > 0U)
{
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
/* move Matrix B read offsets, 4 rows down */
vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numColsA & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
//vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
}
acc0 = asrl(acc0, 23);
px[0] = (q31_t) acc0;
px++;
/*
* Decrement the column loop counter
*/
col--;
/*
* Update the pointer pInB to point to the starting address of the next column
*/
pInB = (q31_t const *)pSrcB->pData + (numColsB - col);
}
/*
* Update the pointer pInA to point to the starting address of the next row
*/
pInA += numColsA;
/*
* Decrement the row loop counter
*/
rowCnt--;
}
/*
* set status as ARM_MATH_SUCCESS
*/
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pOut = pDst->pData; /* Output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
q63_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of row being processed */
px = pOut + i;
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 MACs at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining MACs */
colCnt = numColsA % 0x4U;
#else
/* Initialize cntCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Convert result from 2.62 to 1.31 format and store in destination buffer */
*px++ = (q31_t) (sum >> 31);
/* Decrement column loop counter */
col--;
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixMult group
*/

View file

@ -0,0 +1,287 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_scale_f32.c
* Description: Multiplies a floating-point matrix by a scalar
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixScale Matrix Scale
Multiplies a matrix by a scalar. This is accomplished by multiplying each element in the
matrix by the scalar. For example:
\image html MatrixScale.gif "Matrix Scaling of a 3 x 3 matrix"
The function checks to make sure that the input and output matrices are of the same size.
In the fixed-point Q15 and Q31 functions, <code>scale</code> is represented by
a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
The shift allows the gain of the scaling operation to exceed 1.0.
The overall scale factor applied to the fixed-point data is
<pre>
scale = scaleFract * 2^shift.
</pre>
*/
/**
@addtogroup MatrixScale
@{
*/
/**
@brief Floating-point matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scale scale factor to be applied
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
arm_matrix_instance_f32 * pDst)
{
arm_status status; /* status of matrix scaling */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
f32x4_t vecIn, vecOut;
float32_t const *pInVec;
pInVec = (float32_t const *) pIn;
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/*
* C(m,n) = A(m,n) * scale
* Scaling and results are stored in the destination buffer.
*/
vecIn = vld1q(pInVec);
pInVec += 4;
vecOut = vecIn * scale;
vst1q(pOut, vecOut);
pOut += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecIn = vld1q(pInVec);
vecOut = vecIn * scale;
vstrwq_p(pOut, vecOut, p0);
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON_EXPERIMENTAL)
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix scaling */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
float32x4_t vec1;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
blkCnt = numSamples >> 2;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scaling and results are stored in the destination buffer. */
vec1 = vld1q_f32(pIn);
res = vmulq_f32(vec1, vdupq_n_f32(scale));
vst1q_f32(pOut, res);
/* update pointers to process next sampels */
pIn += 4U;
pOut += 4U;
/* Decrement the numSamples loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* The results are stored in the destination buffer. */
*pOut++ = (*pIn++) * scale;
/* Decrement the loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* Input data matrix pointer */
float32_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counters */
arm_status status; /* Status of matrix scaling */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scale and store result in destination buffer. */
*pOut++ = (*pIn++) * scale;
*pOut++ = (*pIn++) * scale;
*pOut++ = (*pIn++) * scale;
*pOut++ = (*pIn++) * scale;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scale and store result in destination buffer. */
*pOut++ = (*pIn++) * scale;
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixScale group
*/

View file

@ -0,0 +1,249 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_scale_q15.c
* Description: Multiplies a Q15 matrix by a scalar
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixScale
@{
*/
/**
@brief Q15 matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scaleFract fractional portion of the scale factor
@param[in] shift number of bits to shift the result by
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
*/
#if defined(ARM_MATH_MVEI)
arm_status arm_mat_scale_q15(
const arm_matrix_instance_q15 * pSrc,
q15_t scaleFract,
int32_t shift,
arm_matrix_instance_q15 * pDst)
{
arm_status status; /* Status of matrix scaling */
q15_t *pIn = pSrc->pData; /* input data matrix pointer */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
q15x8_t vecIn, vecOut;
q15_t const *pInVec;
int32_t totShift = shift + 1; /* shift to apply after scaling */
pInVec = (q15_t const *) pIn;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/*
* C(m,n) = A(m,n) * scale
* Scaling and results are stored in the destination buffer.
*/
vecIn = vld1q(pInVec); pInVec += 8;
/* multiply input with scaler value */
vecOut = vmulhq(vecIn, vdupq_n_s16(scaleFract));
/* apply shifting */
vecOut = vqshlq_r(vecOut, totShift);
vst1q(pOut, vecOut); pOut += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numSamples & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecIn = vld1q(pInVec); pInVec += 8;
vecOut = vmulhq(vecIn, vdupq_n_s16(scaleFract));
vecOut = vqshlq_r(vecOut, totShift);
vstrhq_p(pOut, vecOut, p0);
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_scale_q15(
const arm_matrix_instance_q15 * pSrc,
q15_t scaleFract,
int32_t shift,
arm_matrix_instance_q15 * pDst)
{
q15_t *pIn = pSrc->pData; /* Input data matrix pointer */
q15_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counter */
arm_status status; /* Status of matrix scaling */
int32_t kShift = 15 - shift; /* Total shift to apply after scaling */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t inA1, inA2;
q31_t out1, out2, out3, out4; /* Temporary output variables */
q15_t in1, in2, in3, in4; /* Temporary input variables */
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
#if defined (ARM_MATH_DSP)
/* read 2 times 2 samples at a time from source */
inA1 = read_q15x2_ia ((q15_t **) &pIn);
inA2 = read_q15x2_ia ((q15_t **) &pIn);
/* Scale inputs and store result in temporary variables
* in single cycle by packing the outputs */
out1 = (q31_t) ((q15_t) (inA1 >> 16) * scaleFract);
out2 = (q31_t) ((q15_t) (inA1 ) * scaleFract);
out3 = (q31_t) ((q15_t) (inA2 >> 16) * scaleFract);
out4 = (q31_t) ((q15_t) (inA2 ) * scaleFract);
/* apply shifting */
out1 = out1 >> kShift;
out2 = out2 >> kShift;
out3 = out3 >> kShift;
out4 = out4 >> kShift;
/* saturate the output */
in1 = (q15_t) (__SSAT(out1, 16));
in2 = (q15_t) (__SSAT(out2, 16));
in3 = (q15_t) (__SSAT(out3, 16));
in4 = (q15_t) (__SSAT(out4, 16));
/* store result to destination */
write_q15x2_ia (&pOut, __PKHBT(in2, in1, 16));
write_q15x2_ia (&pOut, __PKHBT(in4, in3, 16));
#else
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and store result in destination buffer. */
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixScale group
*/

View file

@ -0,0 +1,242 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_scale_q31.c
* Description: Multiplies a Q31 matrix by a scalar
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixScale
@{
*/
/**
@brief Q31 matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scaleFract fractional portion of the scale factor
@param[in] shift number of bits to shift the result by
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
These are multiplied to yield a 2.62 intermediate result which is shifted with saturation to 1.31 format.
*/
#if defined(ARM_MATH_MVEI)
arm_status arm_mat_scale_q31(
const arm_matrix_instance_q31 * pSrc,
q31_t scaleFract,
int32_t shift,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn = pSrc->pData; /* input data matrix pointer */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
q31x4_t vecIn, vecOut;
q31_t const *pInVec;
int32_t totShift = shift + 1; /* shift to apply after scaling */
arm_status status; /* Status of matrix scaling */
pInVec = (q31_t const *) pIn;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/*
* C(m,n) = A(m,n) * scale
* Scaling and results are stored in the destination buffer.
*/
vecIn = vld1q(pInVec);
pInVec += 4;
/* multiply input with scaler value */
vecOut = vmulhq(vecIn, vdupq_n_s32(scaleFract));
/* apply shifting */
vecOut = vqshlq_r(vecOut, totShift);
vst1q(pOut, vecOut);
pOut += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecIn = vld1q(pInVec);
pInVec += 4;
vecOut = vmulhq(vecIn, vdupq_n_s32(scaleFract));
vecOut = vqshlq_r(vecOut, totShift);
vstrwq_p(pOut, vecOut, p0);
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_scale_q31(
const arm_matrix_instance_q31 * pSrc,
q31_t scaleFract,
int32_t shift,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn = pSrc->pData; /* Input data matrix pointer */
q31_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counter */
arm_status status; /* Status of matrix scaling */
int32_t kShift = shift + 1; /* Shift to apply after scaling */
q31_t in, out; /* Temporary variabels */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and store result in destination buffer. */
in = *pIn++; /* read four inputs from source */
in = ((q63_t) in * scaleFract) >> 32; /* multiply input with scaler value */
out = in << kShift; /* apply shifting */
if (in != (out >> kShift)) /* saturate the results. */
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out; /* Store result destination */
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and store result in destination buffer. */
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixScale group
*/

View file

@ -0,0 +1,298 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_sub_f32.c
* Description: Floating-point matrix subtraction
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixSub Matrix Subtraction
Subtract two matrices.
\image html MatrixSubtraction.gif "Subraction of two 3 x 3 matrices"
The functions check to make sure that
<code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
number of rows and columns.
*/
/**
@addtogroup MatrixSub
@{
*/
/**
@brief Floating-point matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_sub_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
arm_status status; /* status of matrix subtraction */
uint32_t numSamples; /* total number of elements in the matrix */
float32_t *pDataA, *pDataB, *pDataDst;
f32x4_t vecA, vecB, vecDst;
float32_t const *pSrcAVec;
float32_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (float32_t const *) pDataA;
pSrcBVec = (float32_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* sub and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
vecDst = vsubq(vecA, vecB);
vst1q(pDataDst, vecDst);
pDataDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
vecDst = vsubq_m(vecDst, vecA, vecB, p0);
vstrwq_p(pDataDst, vecDst, p0);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON)
arm_status arm_mat_sub_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2U;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and then store the results in the destination buffer. */
/* Read values from source A */
vec1 = vld1q_f32(pIn1);
vec2 = vld1q_f32(pIn2);
res = vsubq_f32(vec1, vec2);
vst1q_f32(pOut, res);
/* Update pointers to process next samples */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and then store the results in the destination buffer. */
*pOut++ = (*pIn1++) - (*pIn2++);
/* Decrement the loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_sub_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
*pOut++ = (*pInA++) - (*pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
*pOut++ = (*pInA++) - (*pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixSub group
*/

View file

@ -0,0 +1,219 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_sub_q15.c
* Description: Q15 Matrix subtraction
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixSub
@{
*/
/**
@brief Q15 matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI)
arm_status arm_mat_sub_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
uint32_t numSamples; /* total number of elements in the matrix */
q15_t *pDataA, *pDataB, *pDataDst;
q15x8_t vecA, vecB, vecDst;
q15_t const *pSrcAVec;
q15_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (q15_t const *) pDataA;
pSrcBVec = (q15_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* sub and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec); pSrcAVec += 8;
vecB = vld1q(pSrcBVec); pSrcBVec += 8;
vecDst = vqsubq(vecA, vecB);
vst1q(pDataDst, vecDst); pDataDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcAVec); pSrcAVec += 8;
vecB = vld1q(pSrcBVec); pSrcBVec += 8;
vecDst = vqsubq_m(vecDst, vecA, vecB, p0);
vstrhq_p(pDataDst, vecDst, p0);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_sub_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract, Saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
write_q15x2_ia (&pOut, __QSUB16(read_q15x2_ia ((q15_t **) &pInA), read_q15x2_ia ((q15_t **) &pInB)));
write_q15x2_ia (&pOut, __QSUB16(read_q15x2_ia ((q15_t **) &pInA), read_q15x2_ia ((q15_t **) &pInB)));
#else
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pOut++ = (q15_t) __QSUB16(*pInA++, *pInB++);
#else
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixSub group
*/

View file

@ -0,0 +1,218 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_sub_q31.c
* Description: Q31 matrix subtraction
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixSub
@{
*/
/**
@brief Q31 matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
#if defined(ARM_MATH_MVEI)
arm_status arm_mat_sub_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
uint32_t numSamples; /* total number of elements in the matrix */
q31_t *pDataA, *pDataB, *pDataDst;
q31x4_t vecA, vecB, vecDst;
q31_t const *pSrcAVec;
q31_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (q31_t const *) pDataA;
pSrcBVec = (q31_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* sub and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
vecDst = vqsubq(vecA, vecB);
vst1q(pDataDst, vecDst);
pDataDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
vecDst = vqsubq_m(vecDst, vecA, vecB, p0);
vstrwq_p(pDataDst, vecDst, p0);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_sub_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract, saturate and then store the results in the destination buffer. */
*pOut++ = __QSUB(*pInA++, *pInB++);
*pOut++ = __QSUB(*pInA++, *pInB++);
*pOut++ = __QSUB(*pInA++, *pInB++);
*pOut++ = __QSUB(*pInA++, *pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract, saturate and store result in destination buffer. */
*pOut++ = __QSUB(*pInA++, *pInB++);
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixSub group
*/

View file

@ -0,0 +1,325 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_trans_f32.c
* Description: Floating-point matrix transpose
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixTrans Matrix Transpose
Tranposes a matrix.
Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
\image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
*/
/**
@addtogroup MatrixTrans
@{
*/
/**
@brief Floating-point matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_trans_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
if (pDst->numRows == pDst->numCols)
{
if (pDst->numCols == 2)
return arm_mat_trans_32bit_2x2_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
if (pDst->numCols == 3)
return arm_mat_trans_32bit_3x3_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
if (pDst->numCols == 4)
return arm_mat_trans_32bit_4x4_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
}
arm_mat_trans_32bit_generic_mve(pSrc->numRows, pSrc->numCols, (uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON)
arm_status arm_mat_trans_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nColumns = pSrc->numCols; /* number of columns */
uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* Row loop */
rowCnt = row >> 2;
while (rowCnt > 0U)
{
float32x4_t row0V,row1V,row2V,row3V;
float32x4x2_t ra0,ra1,rb0,rb1;
blkCnt = nColumns >> 2;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) /* Column loop */
{
row0V = vld1q_f32(pIn);
row1V = vld1q_f32(pIn + 1 * nColumns);
row2V = vld1q_f32(pIn + 2 * nColumns);
row3V = vld1q_f32(pIn + 3 * nColumns);
pIn += 4;
ra0 = vzipq_f32(row0V,row2V);
ra1 = vzipq_f32(row1V,row3V);
rb0 = vzipq_f32(ra0.val[0],ra1.val[0]);
rb1 = vzipq_f32(ra0.val[1],ra1.val[1]);
vst1q_f32(px,rb0.val[0]);
px += nRows;
vst1q_f32(px,rb0.val[1]);
px += nRows;
vst1q_f32(px,rb1.val[0]);
px += nRows;
vst1q_f32(px,rb1.val[1]);
px += nRows;
/* Decrement the column loop counter */
blkCnt--;
}
/* Perform matrix transpose for last 3 samples here. */
blkCnt = nColumns % 0x4U;
while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
*px++ = *pIn;
*px++ = *(pIn + 1 * nColumns);
*px++ = *(pIn + 2 * nColumns);
*px++ = *(pIn + 3 * nColumns);
px += (nRows - 4);
pIn++;
/* Decrement the column loop counter */
blkCnt--;
}
i += 4;
pIn += 3 * nColumns;
/* Decrement the row loop counter */
rowCnt--;
} /* Row loop end */
rowCnt = row & 3;
while (rowCnt > 0U)
{
blkCnt = nColumns ;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
px += nRows;
/* Decrement the column loop counter */
blkCnt--;
}
i++;
rowCnt -- ;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_trans_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint32_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* Pointer px is set to starting address of column being processed */
px = pOut + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
/* Decrement column loop counter */
col--;
}
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Initialize col with number of samples */
col = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixTrans group
*/

View file

@ -0,0 +1,348 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_trans_q15.c
* Description: Q15 matrix transpose
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixTrans
@{
*/
/**
@brief Q15 matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEI)
__STATIC_INLINE arm_status arm_mat_trans_16bit_2x2(uint16_t * pDataSrc, uint16_t * pDataDest)
{
pDataDest[0] = pDataSrc[0];
pDataDest[3] = pDataSrc[3];
pDataDest[2] = pDataSrc[1];
pDataDest[1] = pDataSrc[2];
return (ARM_MATH_SUCCESS);
}
static arm_status arm_mat_trans_16bit_3x3_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
{
static const uint16_t stridesTr33[8] = { 0, 3, 6, 1, 4, 7, 2, 5 };
uint16x8_t vecOffs1;
uint16x8_t vecIn1;
/*
*
* | 0 1 2 | | 0 3 6 | 8 x 16 flattened version | 0 3 6 1 4 7 2 5 |
* | 3 4 5 | => | 1 4 7 | => | 8 . . . . . . . |
* | 6 7 8 | | 2 5 8 | (row major)
*
*/
vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr33);
vecIn1 = vldrhq_u16((uint16_t const *) pDataSrc);
vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
pDataDest[8] = pDataSrc[8];
return (ARM_MATH_SUCCESS);
}
static arm_status arm_mat_trans_16bit_4x4_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
{
static const uint16_t stridesTr44_1[8] = { 0, 4, 8, 12, 1, 5, 9, 13 };
static const uint16_t stridesTr44_2[8] = { 2, 6, 10, 14, 3, 7, 11, 15 };
uint16x8_t vecOffs1, vecOffs2;
uint16x8_t vecIn1, vecIn2;
uint16_t const * pDataSrcVec = (uint16_t const *) pDataSrc;
/*
* 4x4 Matrix transposition
*
* | 0 1 2 3 | | 0 4 8 12 | 8 x 16 flattened version
* | 4 5 6 7 | => | 1 5 9 13 | => [0 4 8 12 1 5 9 13]
* | 8 9 10 11 | | 2 6 10 14 | [2 6 10 14 3 7 11 15]
* | 12 13 14 15 | | 3 7 11 15 |
*/
vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr44_1);
vecOffs2 = vldrhq_u16((uint16_t const *) stridesTr44_2);
vecIn1 = vldrhq_u16(pDataSrcVec);
pDataSrcVec += 8;
vecIn2 = vldrhq_u16(pDataSrcVec);
vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs2, vecIn2);
return (ARM_MATH_SUCCESS);
}
static arm_status arm_mat_trans_16bit_generic(
uint16_t srcRows,
uint16_t srcCols,
uint16_t * pDataSrc,
uint16_t * pDataDest)
{
uint16x8_t vecOffs;
uint32_t i;
uint32_t blkCnt;
uint16_t const *pDataC;
uint16_t *pDataDestR;
uint16x8_t vecIn;
vecOffs = vidupq_u16((uint32_t)0, 1);
vecOffs = vecOffs * srcCols;
i = srcCols;
while(i > 0U)
{
pDataC = (uint16_t const *) pDataSrc;
pDataDestR = pDataDest;
blkCnt = srcRows >> 3;
while (blkCnt > 0U)
{
vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
vstrhq_u16(pDataDestR, vecIn);
pDataDestR += 8;
pDataC = pDataC + srcCols * 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = srcRows & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
vstrhq_p_u16(pDataDestR, vecIn, p0);
}
pDataSrc += 1;
pDataDest += srcRows;
i--;
}
return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_trans_q15(
const arm_matrix_instance_q15 * pSrc,
arm_matrix_instance_q15 * pDst)
{
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
if (pDst->numRows == pDst->numCols)
{
if (pDst->numCols == 1)
{
pDst->pData[0] = pSrc->pData[0];
return(ARM_MATH_SUCCESS);
}
if (pDst->numCols == 2)
return arm_mat_trans_16bit_2x2((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
if (pDst->numCols == 3)
return arm_mat_trans_16bit_3x3_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
if (pDst->numCols == 4)
return arm_mat_trans_16bit_4x4_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
}
arm_mat_trans_16bit_generic(pSrc->numRows, pSrc->numCols, (uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_trans_q15(
const arm_matrix_instance_q15 * pSrc,
arm_matrix_instance_q15 * pDst)
{
q15_t *pIn = pSrc->pData; /* input data matrix pointer */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint32_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t in; /* variable to hold temporary output */
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* Pointer pOut is set to starting address of column being processed */
pOut = pDst->pData + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pIn);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) in;
#else
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*pOut = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pIn);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) in;
#else
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*pOut = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Decrement column loop counter */
col--;
}
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Initialize col with number of samples */
col = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store input element in destination */
*pOut = *pIn++;
/* Update pointer pOut to point to next row of transposed matrix */
pOut += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixTrans group
*/

View file

@ -0,0 +1,191 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_trans_q31.c
* Description: Q31 matrix transpose
*
* $Date: 18. March 2019
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixTrans
@{
*/
/**
@brief Q31 matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEI)
#include "arm_helium_utils.h"
arm_status arm_mat_trans_q31(
const arm_matrix_instance_q31 * pSrc,
arm_matrix_instance_q31 * pDst)
{
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
if (pDst->numRows == pDst->numCols)
{
if (pDst->numCols == 2)
return arm_mat_trans_32bit_2x2_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
if (pDst->numCols == 3)
return arm_mat_trans_32bit_3x3_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
if (pDst->numCols == 4)
return arm_mat_trans_32bit_4x4_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
}
arm_mat_trans_32bit_generic_mve(pSrc->numRows, pSrc->numCols, (uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_trans_q31(
const arm_matrix_instance_q31 * pSrc,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn = pSrc->pData; /* input data matrix pointer */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint32_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* Pointer px is set to starting address of column being processed */
px = pOut + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
/* Decrement column loop counter */
col--;
}
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Initialize col with number of samples */
col = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixTrans group
*/