diff --git a/CMSIS/DSP/Include/arm_common_tables.h b/CMSIS/DSP/Include/arm_common_tables.h
index 721b18dd2d4399d748cff75b0f07725ab401fbf0..91d2be0a297d807c9345dc5d51465e4e5e0cf208 100644
--- a/CMSIS/DSP/Include/arm_common_tables.h
+++ b/CMSIS/DSP/Include/arm_common_tables.h
@@ -3,13 +3,13 @@
  * Title:        arm_common_tables.h
  * Description:  Extern declaration for common tables
  *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * @version  V1.9.0
+ * @date     23 April 2021
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -29,7 +29,13 @@
 #ifndef _ARM_COMMON_TABLES_H
 #define _ARM_COMMON_TABLES_H
 
-#include "arm_math.h"
+#include "arm_math_types.h"
+#include "dsp/fast_math_functions.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
 
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
   /* Double Precision Float CFFT twiddles */
@@ -110,6 +116,8 @@
     #define twiddleCoef twiddleCoef_4096
   #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
 
+  /* Q31 */
+
   #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_16)
     extern const q31_t twiddleCoef_16_q31[24];
   #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
@@ -490,13 +498,13 @@
     extern const q15_t sinTable_q15[FAST_MATH_TABLE_SIZE + 1];
   #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
 
-  #if defined(ARM_MATH_MVEI)
+  #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
      #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
        extern const q31_t sqrtTable_Q31[256];
      #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
   #endif
 
-  #if defined(ARM_MATH_MVEI)
+  #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
      #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
        extern const q15_t sqrtTable_Q15[256];
      #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
@@ -509,9 +517,13 @@
        extern const float32_t __logf_lut_f32[8];
 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
-#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
+#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
 extern const unsigned char hwLUT[256];
 #endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
 
+#ifdef   __cplusplus
+}
+#endif
+
 #endif /*  ARM_COMMON_TABLES_H */
 
diff --git a/CMSIS/DSP/Include/arm_common_tables_f16.h b/CMSIS/DSP/Include/arm_common_tables_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..f40c1a4ea8983dfdfcd8c9847f32e9a19b1a0ddd
--- /dev/null
+++ b/CMSIS/DSP/Include/arm_common_tables_f16.h
@@ -0,0 +1,132 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_common_tables_f16.h
+ * Description:  Extern declaration for common tables
+ *
+ * @version  V1.9.0
+ * @date     23 April 2021
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_COMMON_TABLES_F16_H
+#define _ARM_COMMON_TABLES_F16_H
+
+#include "arm_math_types_f16.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+  /* F16 */
+  #if !defined(__CC_ARM) && defined(ARM_FLOAT16_SUPPORTED)
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_16)
+    extern const float16_t twiddleCoefF16_16[32];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_32)
+    extern const float16_t twiddleCoefF16_32[64];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_64)
+    extern const float16_t twiddleCoefF16_64[128];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_128)
+    extern const float16_t twiddleCoefF16_128[256];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_256)
+    extern const float16_t twiddleCoefF16_256[512];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_512)
+    extern const float16_t twiddleCoefF16_512[1024];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_1024)
+    extern const float16_t twiddleCoefF16_1024[2048];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_2048)
+    extern const float16_t twiddleCoefF16_2048[4096];
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_4096)
+    extern const float16_t twiddleCoefF16_4096[8192];
+    #define twiddleCoefF16 twiddleCoefF16_4096
+  #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+  
+ 
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_32)
+  extern const float16_t twiddleCoefF16_rfft_32[32];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_64)
+  extern const float16_t twiddleCoefF16_rfft_64[64];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_128)
+  extern const float16_t twiddleCoefF16_rfft_128[128];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_256)
+  extern const float16_t twiddleCoefF16_rfft_256[256];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_512)
+  extern const float16_t twiddleCoefF16_rfft_512[512];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_1024)
+  extern const float16_t twiddleCoefF16_rfft_1024[1024];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_2048)
+  extern const float16_t twiddleCoefF16_rfft_2048[2048];
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_4096)
+  extern const float16_t twiddleCoefF16_rfft_4096[4096];
+  #endif
+
+  #endif /* !defined(__CC_ARM) && defined(ARM_FLOAT16_SUPPORTED) */
+    
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
+
+#if !defined(__CC_ARM) && defined(ARM_FLOAT16_SUPPORTED)
+
+#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
+       extern const float16_t exp_tab_f16[8];
+       extern const float16_t __logf_lut_f16[8];
+#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */
+#endif 
+       
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /*  _ARM_COMMON_TABLES_F16_H */
+
+  
diff --git a/CMSIS/DSP/Include/arm_const_structs.h b/CMSIS/DSP/Include/arm_const_structs.h
index 83984c40cd01bd809157afb4cd786459ed97a8eb..15e7726f7393783a962d40481fdaf7d2d37508fc 100644
--- a/CMSIS/DSP/Include/arm_const_structs.h
+++ b/CMSIS/DSP/Include/arm_const_structs.h
@@ -4,13 +4,13 @@
  * Description:  Constant structs that are initialized for user convenience.
  *               For example, some can be given as arguments to the arm_cfft_f32() function.
  *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * @version  V1.9.0
+ * @date     23 April 2021
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -30,9 +30,14 @@
 #ifndef _ARM_CONST_STRUCTS_H
 #define _ARM_CONST_STRUCTS_H
 
-#include "arm_math.h"
+#include "arm_math_types.h"
 #include "arm_common_tables.h"
+#include "dsp/transform_functions.h"
 
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
    extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len16;
    extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len32;
    extern const arm_cfft_instance_f64 arm_cfft_sR_f64_len64;
@@ -73,4 +78,9 @@
    extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len2048;
    extern const arm_cfft_instance_q15 arm_cfft_sR_q15_len4096;
 
+#ifdef   __cplusplus
+}
+#endif
+
 #endif
+
diff --git a/CMSIS/DSP/Include/arm_const_structs_f16.h b/CMSIS/DSP/Include/arm_const_structs_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..584941e6b70bb14e146b4e20cdf824ff30c628de
--- /dev/null
+++ b/CMSIS/DSP/Include/arm_const_structs_f16.h
@@ -0,0 +1,77 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_const_structs_f16.h
+ * Description:  Constant structs that are initialized for user convenience.
+ *               For example, some can be given as arguments to the arm_cfft_f16() function.
+ *
+ * @version  V1.9.0
+ * @date     23 April 2021
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_CONST_STRUCTS_F16_H
+#define _ARM_CONST_STRUCTS_F16_H
+
+#include "arm_math_types_f16.h"
+#include "arm_common_tables.h"
+#include "arm_common_tables_f16.h"
+#include "dsp/transform_functions_f16.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if !defined(__CC_ARM) && defined(ARM_FLOAT16_SUPPORTED)
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
+   extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len16;
+   #endif
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
+   extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len32;
+    #endif
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
+   extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len64;
+    #endif
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
+   extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len128;
+    #endif
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
+   extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len256;
+    #endif
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
+   extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len512;
+    #endif
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
+   extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len1024;
+    #endif
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
+   extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len2048;
+    #endif
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
+   extern const arm_cfft_instance_f16 arm_cfft_sR_f16_len4096;
+  #endif
+#endif
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif
\ No newline at end of file
diff --git a/CMSIS/DSP/Include/arm_helium_utils.h b/CMSIS/DSP/Include/arm_helium_utils.h
index 7609d329f088181dd1442a95da1f552eb8602233..54a9db59bafdc3b8af7d0e855dc260873a59c807 100644
--- a/CMSIS/DSP/Include/arm_helium_utils.h
+++ b/CMSIS/DSP/Include/arm_helium_utils.h
@@ -3,13 +3,13 @@
  * Title:        arm_helium_utils.h
  * Description:  Utility functions for Helium development
  *
- * $Date:        09. September 2019
- * $Revision:    V.1.5.1
+ * @version  V1.9.0
+ * @date     23 April 2021
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -29,12 +29,17 @@
 #ifndef _ARM_UTILS_HELIUM_H_
 #define _ARM_UTILS_HELIUM_H_
 
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
 /***************************************
 
 Definitions available for MVEF and MVEI
 
 ***************************************/
-#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)
+#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI))  && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #define INACTIVELANE            0 /* inactive lane content */
 
@@ -46,7 +51,7 @@ Definitions available for MVEF and MVEI
 Definitions available for MVEF only
 
 ***************************************/
-#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF)
+#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF))  && !defined(ARM_MATH_AUTOVECTORIZE)
 
 __STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in)
 {
@@ -58,8 +63,13 @@ __STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in)
     return acc;
 }
 
+
+
+
 /* newton initial guess */
 #define INVSQRT_MAGIC_F32           0x5f3759df
+#define INV_NEWTON_INIT_F32         0x7EF127EA
+
 
 #define INVSQRT_NEWTON_MVE_F32(invSqrt, xHalf, xStart)\
 {                                                     \
@@ -75,16 +85,108 @@ __STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in)
 }
 #endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) */
 
+
 /***************************************
 
-Definitions available for MVEI only
+Definitions available for f16 datatype with HW acceleration only
 
 ***************************************/
-#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)
+#if defined(ARM_FLOAT16_SUPPORTED)
+#if defined (ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* Horizontal add: returns the sum of the eight f16 lanes of `in`. */
+__STATIC_FORCEINLINE float16_t vecAddAcrossF16Mve(float16x8_t in)
+{
+    float16x8_t tmpVec;
+    _Float16 acc;
 
+    tmpVec = (float16x8_t) vrev32q_s16((int16x8_t) in);   /* swap the two 16-bit lanes of each 32-bit word */
+    in = vaddq_f16(tmpVec, in);                            /* every lane now holds a pair sum */
+    tmpVec = (float16x8_t) vrev64q_s32((int32x4_t) in);   /* swap the two 32-bit words of each 64-bit half */
+    in = vaddq_f16(tmpVec, in);                            /* every lane now holds the sum of its group of four */
+    acc = (_Float16)vgetq_lane_f16(in, 0) + (_Float16)vgetq_lane_f16(in, 4);   /* low group + high group */
 
-#include "arm_common_tables.h"
+    return acc;
+}
+/* Sum the four complex f16 values held in vecIn; the total ends up in f16 lanes 4 and 5 of the result. */
+__STATIC_FORCEINLINE float16x8_t __mve_cmplx_sum_intra_vec_f16(
+    float16x8_t   vecIn)
+{
+    float16x8_t   vecTmp, vecOut;
+    uint32_t    tmp = 0U;   /* carry-in of vshlcq_s32: use a defined value, not an indeterminate one */
+
+    vecTmp = (float16x8_t) vrev64q_s32((int32x4_t) vecIn);
+    // TO TRACK : using canonical addition leads to inefficient code generation for f16
+    // vecTmp = vecTmp + vecAccCpx0;
+    /*
+     * Compute
+     *  re0+re1 | im0+im1 | re0+re1 | im0+im1
+     *  re2+re3 | im2+im3 | re2+re3 | im2+im3
+     */
+    vecTmp = vaddq_f16(vecTmp, vecIn);
+    vecOut = vecTmp;
+    /*
+     * shift the whole vector left by 32 bits, tmp (zero) is inserted in bottom
+     */
+    vecOut = vreinterpretq_f16_s32(vshlcq_s32(vreinterpretq_s32_f16(vecOut)   , &tmp, 32));
+    /*
+     * Compute:
+     *    DONTCARE     |    DONTCARE     | re0+re1+re0+re1 |im0+im1+im0+im1
+     * re0+re1+re2+re3 | im0+im1+im2+im3 | re2+re3+re2+re3 |im2+im3+im2+im3
+     */
+    vecOut = vaddq_f16(vecOut, vecTmp);
+    /*
+     * Cmplx sum is in f16 lanes 4 & 5 (0-based)
+     * return full vector
+     */
+    return vecOut;
+}
 
+/* Complex sum of vec: Re receives lane 4 and Im lane 5 of the reduced vector. Expands to a brace block that declares a local named vecOut. */
+#define mve_cmplx_sum_intra_r_i_f16(vec, Re, Im)                \
+{                                                               \
+    float16x8_t   vecOut = __mve_cmplx_sum_intra_vec_f16(vec);    \
+    Re = vgetq_lane(vecOut, 4);                                 \
+    Im = vgetq_lane(vecOut, 5);                                 \
+}
+/* Reduces the complex f16 values of vecIn to one complex sum and writes it as two consecutive f16 values at pOut. */
+__STATIC_FORCEINLINE void mve_cmplx_sum_intra_vec_f16(
+    float16x8_t   vecIn,
+    float16_t  *pOut)
+{
+    float16x8_t   vecOut = __mve_cmplx_sum_intra_vec_f16(vecIn);
+    /*
+     * Cmplx sum is in f16 lanes 4 & 5 (0-based), i.e. 32-bit lane 2
+     * use 32-bit extraction to copy both halves with one store
+     */
+    *(float32_t *) pOut = ((float32x4_t) vecOut)[2];   /* NOTE(review): 32-bit store through a float16_t pointer - confirm alignment/aliasing assumptions */
+}
+
+
+#define INVSQRT_MAGIC_F16           0x59ba      /*  ( 0x1ba = 0x3759df >> 13) */
+
+/* One Newton step: invSqrt = xStart * (1.5 - xHalf * xStart * xStart). Spelled out step by step because the canonical version of INVSQRT_NEWTON_MVE_F16 leads to bad performance */
+#define INVSQRT_NEWTON_MVE_F16(invSqrt, xHalf, xStart)                  \
+{                                                                       \
+    float16x8_t tmp;                                                      \
+                                                                        \
+    /* tmp = xhalf * x * x */                                           \
+    tmp = vmulq(xStart, xStart);                                        \
+    tmp = vmulq(tmp, xHalf);                                            \
+    /* (1.5f - xhalf * x * x) */                                        \
+    tmp = vsubq(vdupq_n_f16((float16_t)1.5), tmp);                      \
+    /* x = x*(1.5f-xhalf*x*x); */                                       \
+    invSqrt = vmulq(tmp, xStart);                                       \
+}
+
+#endif
+#endif 
+
+/***************************************
+
+Definitions available for MVEI and MVEF only
+
+***************************************/
+#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI))  && !defined(ARM_MATH_AUTOVECTORIZE)
 /* Following functions are used to transpose matrix in f32 and q31 cases */
 __STATIC_INLINE arm_status arm_mat_trans_32bit_2x2_mve(
     uint32_t * pDataSrc,
@@ -208,6 +310,305 @@ __STATIC_INLINE arm_status arm_mat_trans_32bit_generic_mve(
     return (ARM_MATH_SUCCESS);
 }
 
+/**
+ * Transpose a complex matrix of 32-bit elements using MVE gather loads.
+ * pDataSrc holds srcRows x srcCols interleaved complex values (CMPLX_DIM words
+ * each); pDataDest receives the srcCols x srcRows transpose.
+ * dstRows/dstCols are only compared when ARM_MATH_MATRIX_CHECK is defined, in
+ * which case a mismatch returns ARM_MATH_SIZE_MISMATCH.
+ * Returns ARM_MATH_SUCCESS otherwise; a matrix with srcCols == 0 is a no-op.
+ */
+__STATIC_INLINE arm_status arm_mat_cmplx_trans_32bit(
+    uint16_t    srcRows,
+    uint16_t    srcCols,
+    uint32_t   *pDataSrc,
+    uint16_t    dstRows,
+    uint16_t    dstCols,
+    uint32_t   *pDataDest)
+{
+    uint32_t        i;
+    uint32_t const *pDataC;
+    uint32_t       *pDataRow;
+    uint32_t       *pDataDestR, *pDataDestRow;
+    uint32x4_t      vecOffsRef, vecOffsCur;
+    uint32_t        blkCnt;
+    uint32x4_t      vecIn;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+    /* Check for matrix mismatch condition */
+    if ((srcRows != dstCols) || (srcCols != dstRows))
+    {
+        /* Set status as ARM_MATH_SIZE_MISMATCH */
+        return ARM_MATH_SIZE_MISMATCH;
+    }
+#else
+    (void)dstRows;
+    (void)dstCols;
+#endif
+
+    /* 2x2, 3x3 and 4x4 specialization to be added */
+
+    vecOffsRef[0] = 0;                      /* current row, first word  */
+    vecOffsRef[1] = 1;                      /* current row, second word */
+    vecOffsRef[2] = srcCols << 1;           /* next row, first word     */
+    vecOffsRef[3] = (srcCols << 1) + 1;     /* next row, second word    */
+
+    pDataRow = pDataSrc;
+    pDataDestRow = pDataDest;
+    for (i = srcCols; i > 0U; i--)          /* one pass per source column; skipped when srcCols == 0 */
+    {
+        pDataC = (uint32_t const *) pDataRow;
+        pDataDestR = pDataDestRow;
+        vecOffsCur = vecOffsRef;
+
+        blkCnt = (srcRows * CMPLX_DIM) >> 2;
+        while (blkCnt > 0U)
+        {
+            vecIn = vldrwq_gather_shifted_offset(pDataC, vecOffsCur);
+            vstrwq(pDataDestR, vecIn);
+            pDataDestR += 4;
+            vecOffsCur = vaddq(vecOffsCur, (srcCols << 2));    /* advance by two source rows */
+            /* Decrement the blockSize loop counter */
+            blkCnt--;
+        }
+        /*
+         * tail: the store is predicated to blkCnt lanes, the gather load is not
+         * (will be merged thru tail predication)
+         */
+        blkCnt = (srcRows * CMPLX_DIM) & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+            vecIn = vldrwq_gather_shifted_offset(pDataC, vecOffsCur);
+            vstrwq_p(pDataDestR, vecIn, p0);
+        }
+
+        pDataRow += CMPLX_DIM;
+        pDataDestRow += (srcRows * CMPLX_DIM);
+    }
+
+    return (ARM_MATH_SUCCESS);
+}
+/* Transpose a 2x2 matrix of 16-bit elements. Not usable in place: pDataDest[2] is written before pDataSrc[2] is read. */
+__STATIC_INLINE arm_status arm_mat_trans_16bit_2x2(uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+    pDataDest[0] = pDataSrc[0];    /* diagonal elements keep their position */
+    pDataDest[3] = pDataSrc[3];
+    pDataDest[2] = pDataSrc[1];    /* off-diagonal elements are exchanged */
+    pDataDest[1] = pDataSrc[2];
+
+    return (ARM_MATH_SUCCESS);
+}
+/* Transpose a 3x3 matrix of 16-bit elements: elements 0..7 go through one vector load and one scatter store, element 8 is copied separately. */
+__STATIC_INLINE arm_status arm_mat_trans_16bit_3x3_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+    static const uint16_t stridesTr33[8] = { 0, 3, 6, 1, 4, 7, 2, 5 };   /* destination index of source elements 0..7 */
+    uint16x8_t    vecOffs1;
+    uint16x8_t    vecIn1;
+    /*
+     *
+     *  | 0   1   2 |       | 0   3   6 |  8 x 16 flattened version | 0   3   6   1   4   7   2   5 |
+     *  | 3   4   5 | =>    | 1   4   7 |            =>             | 8   .   .   .   .   .   .   . |
+     *  | 6   7   8 |       | 2   5   8 |       (row major)
+     *
+     */
+    vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr33);
+    vecIn1 = vldrhq_u16((uint16_t const *) pDataSrc);
+
+    vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
+
+    pDataDest[8] = pDataSrc[8];    /* last element stays in place and does not fit in the 8-lane vector */
+
+    return (ARM_MATH_SUCCESS);
+}
+/* Transpose a 4x4 matrix of 16-bit elements: the 16 source elements are read with two
+ * contiguous 8-lane loads and scattered to the indices listed in stridesTr44_1 / stridesTr44_2. */
+__STATIC_INLINE arm_status arm_mat_trans_16bit_4x4_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+    static const uint16_t stridesTr44_1[8] = { 0, 4, 8, 12, 1, 5, 9, 13 };     /* destination index of source elements 0..7  */
+    static const uint16_t stridesTr44_2[8] = { 2, 6, 10, 14, 3, 7, 11, 15 };   /* destination index of source elements 8..15 */
+    uint16x8_t    vecOffs1, vecOffs2;
+    uint16x8_t    vecIn1, vecIn2;
+    uint16_t const * pDataSrcVec = (uint16_t const *) pDataSrc;
+
+    /*
+     * 4x4 Matrix transposition
+     *
+     * | 0   1   2   3  |       | 0   4   8   12 |   8 x 16 flattened version
+     * | 4   5   6   7  |  =>   | 1   5   9   13 |   =>      [0   4   8   12  1   5   9   13]
+     * | 8   9   10  11 |       | 2   6   10  14 |           [2   6   10  14  3   7   11  15]
+     * | 12  13  14  15 |       | 3   7   11  15 |
+     */
+
+    vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr44_1);
+    vecOffs2 = vldrhq_u16((uint16_t const *) stridesTr44_2);
+    vecIn1 = vldrhq_u16(pDataSrcVec);
+    pDataSrcVec += 8;
+    vecIn2 = vldrhq_u16(pDataSrcVec);
+
+    vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
+    vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs2, vecIn2);
+
+
+    return (ARM_MATH_SUCCESS);
+}
+/* Transpose a srcRows x srcCols matrix of 16-bit elements: each source column is
+ * gathered with a stride of srcCols elements and stored as one contiguous
+ * destination row of srcRows elements. Always returns ARM_MATH_SUCCESS. */
+__STATIC_INLINE arm_status arm_mat_trans_16bit_generic(
+    uint16_t    srcRows,
+    uint16_t    srcCols,
+    uint16_t  * pDataSrc,
+    uint16_t  * pDataDest)
+{
+    uint16x8_t    vecOffs;
+    uint32_t        i;
+    uint32_t        blkCnt;
+    uint16_t const *pDataC;
+    uint16_t       *pDataDestR;
+    uint16x8_t    vecIn;
+
+    vecOffs = vidupq_u16((uint32_t)0, 1);
+    vecOffs = vecOffs * srcCols;    /* element offsets 0, srcCols, ..., 7 * srcCols (16-bit lanes) */
+
+    i = srcCols;
+    while(i > 0U)
+    {
+        pDataC = (uint16_t const *) pDataSrc;
+        pDataDestR = pDataDest;
+
+        blkCnt = srcRows >> 3;
+        while (blkCnt > 0U)
+        {
+            vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
+            vstrhq_u16(pDataDestR, vecIn); 
+            pDataDestR += 8;
+            pDataC = pDataC + srcCols * 8;    /* move down eight source rows */
+            /*
+             * Decrement the blockSize loop counter
+             */
+            blkCnt--;
+        }
+
+        /*
+         * tail: the store is predicated to blkCnt lanes, the gather load is not
+         */
+        blkCnt = srcRows & 7;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp16q(blkCnt);
+            vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
+            vstrhq_p_u16(pDataDestR, vecIn, p0);
+        }
+        pDataSrc += 1;          /* next source column */
+        pDataDest += srcRows;   /* next destination row */
+        i--;
+    }
+
+    return (ARM_MATH_SUCCESS);
+}
+
+
+/**
+ * Transpose a complex matrix of 16-bit elements using MVE gather loads.
+ * pDataSrc holds srcRows x srcCols interleaved complex values (CMPLX_DIM
+ * halfwords each); pDataDest receives the srcCols x srcRows transpose.
+ * dstRows/dstCols are only compared when ARM_MATH_MATRIX_CHECK is defined
+ * (mismatch returns ARM_MATH_SIZE_MISMATCH). srcCols == 0 is a no-op.
+ * NOTE(review): gather offsets are 16-bit lanes - confirm matrix size limits.
+ */
+__STATIC_INLINE arm_status arm_mat_cmplx_trans_16bit(
+    uint16_t    srcRows,
+    uint16_t    srcCols,
+    uint16_t   *pDataSrc,
+    uint16_t    dstRows,
+    uint16_t    dstCols,
+    uint16_t   *pDataDest)
+{
+    static const uint16_t loadCmplxCol[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
+    int             i;
+    uint16x8_t    vecOffsRef, vecOffsCur;
+    uint16_t const *pDataC;
+    uint16_t       *pDataRow;
+    uint16_t       *pDataDestR, *pDataDestRow;
+    uint32_t        blkCnt;
+    uint16x8_t    vecIn;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+    /* Check for matrix mismatch condition */
+    if ((srcRows != dstCols) || (srcCols != dstRows))
+    {
+        /* Set status as ARM_MATH_SIZE_MISMATCH */
+        return ARM_MATH_SIZE_MISMATCH;
+    }
+#else
+    (void)dstRows;
+    (void)dstCols;
+#endif
+
+    /*
+     * 2x2, 3x3 and 4x4 specialization to be added
+     */
+
+
+    /*
+     * build  [0, 1, 2xcol, 2xcol+1, 4xcol, 4xcol+1, 6xcol, 6xcol+1]
+     */
+    vecOffsRef = vldrhq_u16((uint16_t const *) loadCmplxCol);
+    vecOffsRef = vmulq(vecOffsRef, (uint16_t) (srcCols * CMPLX_DIM))
+                    + viwdupq_u16((uint32_t)0, (uint16_t) 2, 1);
+
+    pDataRow = pDataSrc;
+    pDataDestRow = pDataDest;
+    for (i = srcCols; i > 0; i--)           /* one pass per source column; skipped when srcCols == 0 */
+    {
+        pDataC = (uint16_t const *) pDataRow;
+        pDataDestR = pDataDestRow;
+        vecOffsCur = vecOffsRef;
+
+        blkCnt = (srcRows * CMPLX_DIM) >> 3;
+        while (blkCnt > 0U)
+        {
+            vecIn = vldrhq_gather_shifted_offset(pDataC, vecOffsCur);
+            vstrhq(pDataDestR, vecIn);
+            pDataDestR+= 8; // VEC_LANES_U16
+            vecOffsCur = vaddq(vecOffsCur, (srcCols << 3));    /* advance by four source rows */
+            /* Decrement the blockSize loop counter */
+            blkCnt--;
+        }
+        /*
+         * tail: the store is predicated to blkCnt lanes, the gather load is not
+         * (will be merged thru tail predication)
+         */
+        blkCnt = (srcRows * CMPLX_DIM) & 0x7;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp16q(blkCnt);
+            vecIn = vldrhq_gather_shifted_offset(pDataC, vecOffsCur);
+            vstrhq_p(pDataDestR, vecIn, p0);
+        }
+
+        pDataRow += CMPLX_DIM;
+        pDataDestRow += (srcRows * CMPLX_DIM);
+    }
+
+    return (ARM_MATH_SUCCESS);
+}
+#endif /* MVEF and MVEI */
+
+/***************************************
+
+Definitions available for MVEI only
+
+***************************************/
+#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI))  && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_common_tables.h"
+/* Apply sqrshrl / sqrshrl_sat48 to acc with a shift of -(32 - shift) and keep bits [63:32] of the result. */
+#define MVE_ASRL_SAT16(acc, shift)          ((sqrshrl_sat48((acc), -(32-(shift))) >> 32) & 0xffffffff)
+#define MVE_ASRL_SAT32(acc, shift)          ((sqrshrl((acc), -(32-(shift))) >> 32) & 0xffffffff)
+
+
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
 __STATIC_INLINE q31x4_t FAST_VSQRT_Q31(q31x4_t vecIn)
 {
@@ -219,7 +620,7 @@ __STATIC_INLINE q31x4_t FAST_VSQRT_Q31(q31x4_t vecIn)
 
 
     vecSignBits = vclsq(vecIn);
-    vecSignBits = vbicq(vecSignBits, 1);
+    vecSignBits = vbicq_n_s32(vecSignBits, 1);
     /*
      * in = in << no_of_sign_bits;
      */
@@ -230,11 +631,11 @@ __STATIC_INLINE q31x4_t FAST_VSQRT_Q31(q31x4_t vecIn)
     vecIdx = vecNrm >> 24;
     vecIdx = vecIdx << 1;
 
-    vecTmp0 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, vecIdx);
+    vecTmp0 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, (uint32x4_t)vecIdx);
 
     vecIdx = vecIdx + 1;
 
-    vecTmp1 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, vecIdx);
+    vecTmp1 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, (uint32x4_t)vecIdx);
 
     vecTmp1 = vqrdmulhq(vecTmp1, vecNrm);
     vecTmp0 = vecTmp0 - vecTmp1;
@@ -286,7 +687,7 @@ __STATIC_INLINE q15x8_t FAST_VSQRT_Q15(q15x8_t vecIn)
     vecDst = vuninitializedq_s16();
 
     vecSignBits = vclsq(vecIn);
-    vecSignBits = vbicq(vecSignBits, 1);
+    vecSignBits = vbicq_n_s16(vecSignBits, 1);
     /*
      * in = in << no_of_sign_bits;
      */
@@ -295,11 +696,11 @@ __STATIC_INLINE q15x8_t FAST_VSQRT_Q15(q15x8_t vecIn)
     vecIdx = vecNrm >> 8;
     vecIdx = vecIdx << 1;
 
-    vecTmp0 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, vecIdx);
+    vecTmp0 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, (uint16x8_t)vecIdx);
 
     vecIdx = vecIdx + 1;
 
-    vecTmp1 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, vecIdx);
+    vecTmp1 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, (uint16x8_t)vecIdx);
 
     vecTmp1 = vqrdmulhq(vecTmp1, vecNrm);
     vecTmp0 = vecTmp0 - vecTmp1;
@@ -345,4 +746,8 @@ __STATIC_INLINE q15x8_t FAST_VSQRT_Q15(q15x8_t vecIn)
 
 #endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI) */
 
+#ifdef   __cplusplus
+}
+#endif
+
 #endif
diff --git a/CMSIS/DSP/Include/arm_math.h b/CMSIS/DSP/Include/arm_math.h
index 6a0bdf61f72feddfbcb11b160623eba4cb8856c7..79ce541ea2701edd20c39ed24176e79fd333b5be 100644
--- a/CMSIS/DSP/Include/arm_math.h
+++ b/CMSIS/DSP/Include/arm_math.h
@@ -1,11 +1,12 @@
 /******************************************************************************
  * @file     arm_math.h
  * @brief    Public header file for CMSIS DSP Library
- * @version  V1.7.0
- * @date     18. March 2019
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
  ******************************************************************************/
 /*
- * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,8 +26,7 @@
 /**
    \mainpage CMSIS DSP Software Library
    *
-   * Introduction
-   * ------------
+   * \section intro Introduction
    *
    * This user manual describes the CMSIS DSP software library,
    * a suite of common signal processing functions for use on Cortex-M and Cortex-A processor 
@@ -46,67 +46,39 @@
    * - Support Vector Machine functions (SVM)
    * - Bayes classifier functions
    * - Distance functions
+   * - Quaternion functions
    *
    * The library has generally separate functions for operating on 8-bit integers, 16-bit integers,
    * 32-bit integer and 32-bit floating-point values.
    *
-   * Using the Library
-   * ------------
+   * The library provides vectorized versions of most algorithms for Helium
+   * and of most f32 algorithms for Neon.
    *
-   * The library installer contains prebuilt versions of the libraries in the <code>Lib</code> folder.
+   * When using a vectorized version, provide some padding (3 words) after the end of
+   * a buffer, because the vectorized code may read slightly past the end
+   * of the buffer. You don't have to modify your buffers; just ensure that the
+   * end of buffer + padding is not outside of a memory region.
    *
-   * Here is the list of pre-built libraries :
-   * - arm_cortexM7lfdp_math.lib (Cortex-M7, Little endian, Double Precision Floating Point Unit)
-   * - arm_cortexM7bfdp_math.lib (Cortex-M7, Big endian, Double Precision Floating Point Unit)
-   * - arm_cortexM7lfsp_math.lib (Cortex-M7, Little endian, Single Precision Floating Point Unit)
-   * - arm_cortexM7bfsp_math.lib (Cortex-M7, Big endian and Single Precision Floating Point Unit on)
-   * - arm_cortexM7l_math.lib (Cortex-M7, Little endian)
-   * - arm_cortexM7b_math.lib (Cortex-M7, Big endian)
-   * - arm_cortexM4lf_math.lib (Cortex-M4, Little endian, Floating Point Unit)
-   * - arm_cortexM4bf_math.lib (Cortex-M4, Big endian, Floating Point Unit)
-   * - arm_cortexM4l_math.lib (Cortex-M4, Little endian)
-   * - arm_cortexM4b_math.lib (Cortex-M4, Big endian)
-   * - arm_cortexM3l_math.lib (Cortex-M3, Little endian)
-   * - arm_cortexM3b_math.lib (Cortex-M3, Big endian)
-   * - arm_cortexM0l_math.lib (Cortex-M0 / Cortex-M0+, Little endian)
-   * - arm_cortexM0b_math.lib (Cortex-M0 / Cortex-M0+, Big endian)
-   * - arm_ARMv8MBLl_math.lib (Armv8-M Baseline, Little endian)
-   * - arm_ARMv8MMLl_math.lib (Armv8-M Mainline, Little endian)
-   * - arm_ARMv8MMLlfsp_math.lib (Armv8-M Mainline, Little endian, Single Precision Floating Point Unit)
-   * - arm_ARMv8MMLld_math.lib (Armv8-M Mainline, Little endian, DSP instructions)
-   * - arm_ARMv8MMLldfsp_math.lib (Armv8-M Mainline, Little endian, DSP instructions, Single Precision Floating Point Unit)
+   * \section using Using the Library
    *
-   * The library functions are declared in the public file <code>arm_math.h</code> which is placed in the <code>Include</code> folder.
-   * Simply include this file and link the appropriate library in the application and begin calling the library functions. The Library supports single
-   * public header file <code> arm_math.h</code> for Cortex-M cores with little endian and big endian. Same header file will be used for floating point unit(FPU) variants.
+   * The library is released in source form. It is strongly advised to compile the library using -Ofast to
+   * get the best performance.
    *
+   * The library functions are declared in the public file <code>arm_math.h</code> which is placed in the <code>Include</code> folder.
+   * Simply include this file. If you don't want to include everything, you can also rely
+   * on the headers in the Include/dsp folder and use only what you need.
    *
-   * Examples
-   * --------
+   * \section example Examples
    *
    * The library ships with a number of examples which demonstrate how to use the library functions.
    *
-   * Toolchain Support
-   * ------------
+   * \section toolchain Toolchain Support
    *
    * The library is now tested on Fast Models building with cmake.
-   * Core M0, M7, A5 are tested.
-   * 
-   * 
-   *
-   * Building the Library
-   * ------------
-   *
-   * The library installer contains a project file to rebuild libraries on MDK toolchain in the <code>CMSIS\\DSP\\Projects\\ARM</code> folder.
-   * - arm_cortexM_math.uvprojx
+   * Core M0, M4, M7, M33, M55, A32 are tested.
    *
    *
-   * The libraries can be built by opening the arm_cortexM_math.uvprojx project in MDK-ARM, selecting a specific target, and defining the optional preprocessor macros detailed above.
-   *
-   * There is also a work in progress cmake build. The README file is giving more details.
-   *
-   * Preprocessor Macros
-   * ------------
+   * \section preprocessor Preprocessor Macros
    *
    * Each library project have different preprocessor macros.
    *
@@ -140,7 +112,13 @@
    *
    * - ARM_MATH_HELIUM:
    *
-   * It implies the flags ARM_MATH_MVEF and ARM_MATH_MVEI and ARM_MATH_FLOAT16.
+   * It implies the flags ARM_MATH_MVEF and ARM_MATH_MVEI and ARM_MATH_MVE_FLOAT16.
+   *
+   * - ARM_MATH_HELIUM_EXPERIMENTAL:
+   *
+   * Only taken into account when ARM_MATH_MVEF, ARM_MATH_MVEI or ARM_MATH_MVE_FLOAT16 is defined.
+   * Enables some vector versions which may have worse performance than scalar
+   * depending on the core / compiler configuration.
    *
    * - ARM_MATH_MVEF:
    *
@@ -151,8811 +129,99 @@
    *
    * Select Helium versions of the int and fixed point algorithms.
    *
-   * - ARM_MATH_FLOAT16:
+   * - ARM_MATH_MVE_FLOAT16:
    *
-   * Float16 implementations of some algorithms (Requires MVE extension).
+   * MVE Float16 implementations of some algorithms (Requires MVE extension).
+   *
+   * - DISABLEFLOAT16:
+   *
+   * Disable float16 algorithms when __fp16 is not supported for a
+   * specific compiler / core configuration.
+   * This only applies to the scalar versions. When the vector architecture
+   * supports f16, it cannot be disabled.
+   *
+   * - ARM_MATH_AUTOVECTORIZE:
+   *
+   * With Helium or Neon, disable the use of vectorized code with C intrinsics
+   * and use pure C instead. The vectorization is then done by the compiler.
    *
    * <hr>
-   * CMSIS-DSP in ARM::CMSIS Pack
-   * -----------------------------
+   * \section pack CMSIS-DSP in ARM::CMSIS Pack
    *
    * The following files relevant to CMSIS-DSP are present in the <b>ARM::CMSIS</b> Pack directories:
    * |File/Folder                      |Content                                                                 |
    * |---------------------------------|------------------------------------------------------------------------|
    * |\b CMSIS\\Documentation\\DSP     | This documentation                                                     |
-   * |\b CMSIS\\DSP\\DSP_Lib_TestSuite | DSP_Lib test suite                                                     |
    * |\b CMSIS\\DSP\\Examples          | Example projects demonstrating the usage of the library functions      |
-   * |\b CMSIS\\DSP\\Include           | DSP_Lib include files                                                  |
+   * |\b CMSIS\\DSP\\Include           | DSP_Lib include files for using and building the lib                   |
+   * |\b CMSIS\\DSP\\PrivateInclude    | DSP_Lib private include files for building the lib                     |
    * |\b CMSIS\\DSP\\Lib               | DSP_Lib binaries                                                       |
-   * |\b CMSIS\\DSP\\Projects          | Projects to rebuild DSP_Lib binaries                                   |
    * |\b CMSIS\\DSP\\Source            | DSP_Lib source files                                                   |
    *
    * <hr>
-   * Revision History of CMSIS-DSP
-   * ------------
+   * \section rev Revision History of CMSIS-DSP
    * Please refer to \ref ChangeLog_pg.
    */
 
 
-/**
- * @defgroup groupMath Basic Math Functions
- */
-
-/**
- * @defgroup groupFastMath Fast Math Functions
- * This set of functions provides a fast approximation to sine, cosine, and square root.
- * As compared to most of the other functions in the CMSIS math library, the fast math functions
- * operate on individual values and not arrays.
- * There are separate functions for Q15, Q31, and floating-point data.
- *
- */
 
-/**
- * @defgroup groupCmplxMath Complex Math Functions
- * This set of functions operates on complex data vectors.
- * The data in the complex arrays is stored in an interleaved fashion
- * (real, imag, real, imag, ...).
- * In the API functions, the number of samples in a complex array refers
- * to the number of complex values; the array contains twice this number of
- * real values.
- */
 
-/**
- * @defgroup groupFilters Filtering Functions
- */
 
-/**
- * @defgroup groupMatrix Matrix Functions
- *
- * This set of functions provides basic matrix math operations.
- * The functions operate on matrix data structures.  For example,
- * the type
- * definition for the floating-point matrix structure is shown
- * below:
- * <pre>
- *     typedef struct
- *     {
- *       uint16_t numRows;     // number of rows of the matrix.
- *       uint16_t numCols;     // number of columns of the matrix.
- *       float32_t *pData;     // points to the data of the matrix.
- *     } arm_matrix_instance_f32;
- * </pre>
- * There are similar definitions for Q15 and Q31 data types.
- *
- * The structure specifies the size of the matrix and then points to
- * an array of data.  The array is of size <code>numRows X numCols</code>
- * and the values are arranged in row order.  That is, the
- * matrix element (i, j) is stored at:
- * <pre>
- *     pData[i*numCols + j]
- * </pre>
- *
- * \par Init Functions
- * There is an associated initialization function for each type of matrix
- * data structure.
- * The initialization function sets the values of the internal structure fields.
- * Refer to \ref arm_mat_init_f32(), \ref arm_mat_init_q31() and \ref arm_mat_init_q15()
- * for floating-point, Q31 and Q15 types,  respectively.
- *
- * \par
- * Use of the initialization function is optional. However, if initialization function is used
- * then the instance structure cannot be placed into a const data section.
- * To place the instance structure in a const data
- * section, manually initialize the data structure.  For example:
- * <pre>
- * <code>arm_matrix_instance_f32 S = {nRows, nColumns, pData};</code>
- * <code>arm_matrix_instance_q31 S = {nRows, nColumns, pData};</code>
- * <code>arm_matrix_instance_q15 S = {nRows, nColumns, pData};</code>
- * </pre>
- * where <code>nRows</code> specifies the number of rows, <code>nColumns</code>
- * specifies the number of columns, and <code>pData</code> points to the
- * data array.
- *
- * \par Size Checking
- * By default all of the matrix functions perform size checking on the input and
- * output matrices. For example, the matrix addition function verifies that the
- * two input matrices and the output matrix all have the same number of rows and
- * columns. If the size check fails the functions return:
- * <pre>
- *     ARM_MATH_SIZE_MISMATCH
- * </pre>
- * Otherwise the functions return
- * <pre>
- *     ARM_MATH_SUCCESS
- * </pre>
- * There is some overhead associated with this matrix size checking.
- * The matrix size checking is enabled via the \#define
- * <pre>
- *     ARM_MATH_MATRIX_CHECK
- * </pre>
- * within the library project settings.  By default this macro is defined
- * and size checking is enabled. By changing the project settings and
- * undefining this macro size checking is eliminated and the functions
- * run a bit faster. With size checking disabled the functions always
- * return <code>ARM_MATH_SUCCESS</code>.
- */
 
-/**
- * @defgroup groupTransforms Transform Functions
- */
 
-/**
- * @defgroup groupController Controller Functions
- */
 
-/**
- * @defgroup groupStats Statistics Functions
- */
 
-/**
- * @defgroup groupSupport Support Functions
- */
 
-/**
- * @defgroup groupInterpolation Interpolation Functions
- * These functions perform 1- and 2-dimensional interpolation of data.
- * Linear interpolation is used for 1-dimensional data and
- * bilinear interpolation is used for 2-dimensional data.
- */
 
 /**
  * @defgroup groupExamples Examples
  */
 
-/**
- * @defgroup groupSVM SVM Functions
- * This set of functions is implementing SVM classification on 2 classes.
- * The training must be done from scikit-learn. The parameters can be easily
- * generated from the scikit-learn object. Some examples are given in
- * DSP/Testing/PatternGeneration/SVM.py
- *
- * If more than 2 classes are needed, the functions in this folder 
- * will have to be used, as building blocks, to do multi-class classification.
- *
- * No multi-class classification is provided in this SVM folder.
- * 
- */
-
 
-/**
- * @defgroup groupBayes Bayesian estimators
- *
- * Implement the naive gaussian Bayes estimator.
- * The training must be done from scikit-learn.
- *
- * The parameters can be easily
- * generated from the scikit-learn object. Some examples are given in
- * DSP/Testing/PatternGeneration/Bayes.py
- */
 
-/**
- * @defgroup groupDistance Distance functions
- *
- * Distance functions for use with clustering algorithms.
- * There are distance functions for float vectors and boolean vectors.
- *
- */
 
 
 #ifndef _ARM_MATH_H
 #define _ARM_MATH_H
 
-#ifdef   __cplusplus
-extern "C"
-{
-#endif
-
-/* Compiler specific diagnostic adjustment */
-#if   defined ( __CC_ARM )
-
-#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
-
-#elif defined ( __GNUC__ )
-  #pragma GCC diagnostic push
-  #pragma GCC diagnostic ignored "-Wsign-conversion"
-  #pragma GCC diagnostic ignored "-Wconversion"
-  #pragma GCC diagnostic ignored "-Wunused-parameter"
-
-#elif defined ( __ICCARM__ )
-
-#elif defined ( __TI_ARM__ )
-
-#elif defined ( __CSMC__ )
-
-#elif defined ( __TASKING__ )
-
-#elif defined ( _MSC_VER )
-
-#else
-  #error Unknown compiler
-#endif
-
-
-/* Included for instrinsics definitions */
-#if defined (_MSC_VER ) 
-#include <stdint.h>
-#define __STATIC_FORCEINLINE static __forceinline
-#define __STATIC_INLINE static __inline
-#define __ALIGNED(x) __declspec(align(x))
-
-#elif defined (__GNUC_PYTHON__)
-#include <stdint.h>
-#define  __ALIGNED(x) __attribute__((aligned(x)))
-#define __STATIC_FORCEINLINE static inline __attribute__((always_inline))
-#define __STATIC_INLINE static inline
-#pragma GCC diagnostic ignored "-Wunused-function"
-#pragma GCC diagnostic ignored "-Wattributes"
-
-#else
-#include "cmsis_compiler.h"
-#endif
-
-
-
-#include <string.h>
-#include <math.h>
-#include <float.h>
-#include <limits.h>
-
-
-#define F64_MAX   ((float64_t)DBL_MAX)
-#define F32_MAX   ((float32_t)FLT_MAX)
-
-#if defined(ARM_MATH_FLOAT16)
-#define F16_MAX   ((float16_t)FLT_MAX)
-#endif
-
-#define F64_MIN   (-DBL_MAX)
-#define F32_MIN   (-FLT_MAX)
-
-#if defined(ARM_MATH_FLOAT16)
-#define F16_MIN   (-(float16_t)FLT_MAX)
-#endif
-
-#define F64_ABSMAX   ((float64_t)DBL_MAX)
-#define F32_ABSMAX   ((float32_t)FLT_MAX)
-
-#if defined(ARM_MATH_FLOAT16)
-#define F16_ABSMAX   ((float16_t)FLT_MAX)
-#endif
-
-#define F64_ABSMIN   ((float64_t)0.0)
-#define F32_ABSMIN   ((float32_t)0.0)
-
-#if defined(ARM_MATH_FLOAT16)
-#define F16_ABSMIN   ((float16_t)0.0)
-#endif
-
-#define Q31_MAX   ((q31_t)(0x7FFFFFFFL))
-#define Q15_MAX   ((q15_t)(0x7FFF))
-#define Q7_MAX    ((q7_t)(0x7F))
-#define Q31_MIN   ((q31_t)(0x80000000L))
-#define Q15_MIN   ((q15_t)(0x8000))
-#define Q7_MIN    ((q7_t)(0x80))
-
-#define Q31_ABSMAX   ((q31_t)(0x7FFFFFFFL))
-#define Q15_ABSMAX   ((q15_t)(0x7FFF))
-#define Q7_ABSMAX    ((q7_t)(0x7F))
-#define Q31_ABSMIN   ((q31_t)0)
-#define Q15_ABSMIN   ((q15_t)0)
-#define Q7_ABSMIN    ((q7_t)0)
-
-/* evaluate ARM DSP feature */
-#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
-  #define ARM_MATH_DSP                   1
-#endif
-
-#if defined(ARM_MATH_NEON)
-#include <arm_neon.h>
-#endif
-
-#if defined (ARM_MATH_HELIUM)
-  #define ARM_MATH_MVEF
-  #define ARM_MATH_FLOAT16
-#endif
-
-#if defined (ARM_MATH_MVEF)
-  #define ARM_MATH_MVEI
-  #define ARM_MATH_FLOAT16
-#endif
-
-#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)
-#include <arm_mve.h>
-#endif
-
-
-  /**
-   * @brief Macros required for reciprocal calculation in Normalized LMS
-   */
-
-#define DELTA_Q31          ((q31_t)(0x100))
-#define DELTA_Q15          ((q15_t)0x5)
-#define INDEX_MASK         0x0000003F
-#ifndef PI
-  #define PI               3.14159265358979f
-#endif
-
-  /**
-   * @brief Macros required for SINE and COSINE Fast math approximations
-   */
-
-#define FAST_MATH_TABLE_SIZE  512
-#define FAST_MATH_Q31_SHIFT   (32 - 10)
-#define FAST_MATH_Q15_SHIFT   (16 - 10)
-#define CONTROLLER_Q31_SHIFT  (32 - 9)
-#define TABLE_SPACING_Q31     0x400000
-#define TABLE_SPACING_Q15     0x80
-
-  /**
-   * @brief Macros required for SINE and COSINE Controller functions
-   */
-  /* 1.31(q31) Fixed value of 2/360 */
-  /* -1 to +1 is divided into 360 values so total spacing is (2/360) */
-#define INPUT_SPACING         0xB60B61
-
-  /**
-   * @brief Macros for complex numbers
-   */
-
-  /* Dimension C vector space */
-  #define CMPLX_DIM 2
-
-  /**
-   * @brief Error status returned by some functions in the library.
-   */
-
-  typedef enum
-  {
-    ARM_MATH_SUCCESS        =  0,        /**< No error */
-    ARM_MATH_ARGUMENT_ERROR = -1,        /**< One or more arguments are incorrect */
-    ARM_MATH_LENGTH_ERROR   = -2,        /**< Length of data buffer is incorrect */
-    ARM_MATH_SIZE_MISMATCH  = -3,        /**< Size of matrices is not compatible with the operation */
-    ARM_MATH_NANINF         = -4,        /**< Not-a-number (NaN) or infinity is generated */
-    ARM_MATH_SINGULAR       = -5,        /**< Input matrix is singular and cannot be inverted */
-    ARM_MATH_TEST_FAILURE   = -6         /**< Test Failed */
-  } arm_status;
-
-  /**
-   * @brief 8-bit fractional data type in 1.7 format.
-   */
-  typedef int8_t q7_t;
-
-  /**
-   * @brief 16-bit fractional data type in 1.15 format.
-   */
-  typedef int16_t q15_t;
-
-  /**
-   * @brief 32-bit fractional data type in 1.31 format.
-   */
-  typedef int32_t q31_t;
-
-  /**
-   * @brief 64-bit fractional data type in 1.63 format.
-   */
-  typedef int64_t q63_t;
-
-  /**
-   * @brief 32-bit floating-point type definition.
-   */
-  typedef float float32_t;
-
-  /**
-   * @brief 64-bit floating-point type definition.
-   */
-  typedef double float64_t;
-
-  /**
-   * @brief vector types
-   */
-#if defined(ARM_MATH_NEON) || defined (ARM_MATH_MVEI)
-  /**
-   * @brief 64-bit fractional 128-bit vector data type in 1.63 format
-   */
-  typedef int64x2_t q63x2_t;
-
-  /**
-   * @brief 32-bit fractional 128-bit vector data type in 1.31 format.
-   */
-  typedef int32x4_t q31x4_t;
-
-  /**
-   * @brief 16-bit fractional 128-bit vector data type with 16-bit alignement in 1.15 format.
-   */
-  typedef __ALIGNED(2) int16x8_t q15x8_t;
-
- /**
-   * @brief 8-bit fractional 128-bit vector data type with 8-bit alignement in 1.7 format.
-   */
-  typedef __ALIGNED(1) int8x16_t q7x16_t;
-
-    /**
-   * @brief 32-bit fractional 128-bit vector pair data type in 1.31 format.
-   */
-  typedef int32x4x2_t q31x4x2_t;
-
-  /**
-   * @brief 32-bit fractional 128-bit vector quadruplet data type in 1.31 format.
-   */
-  typedef int32x4x4_t q31x4x4_t;
-
-  /**
-   * @brief 16-bit fractional 128-bit vector pair data type in 1.15 format.
-   */
-  typedef int16x8x2_t q15x8x2_t;
-
-  /**
-   * @brief 16-bit fractional 128-bit vector quadruplet data type in 1.15 format.
-   */
-  typedef int16x8x4_t q15x8x4_t;
-
-  /**
-   * @brief 8-bit fractional 128-bit vector pair data type in 1.7 format.
-   */
-  typedef int8x16x2_t q7x16x2_t;
-
-  /**
-   * @brief 8-bit fractional 128-bit vector quadruplet data type in 1.7 format.
-   */
-   typedef int8x16x4_t q7x16x4_t;
-
-  /**
-   * @brief 32-bit fractional data type in 9.23 format.
-   */
-  typedef int32_t q23_t;
-
-  /**
-   * @brief 32-bit fractional 128-bit vector data type in 9.23 format.
-   */
-  typedef int32x4_t q23x4_t;
-
-  /**
-   * @brief 64-bit status 128-bit vector data type.
-   */
-  typedef int64x2_t status64x2_t;
-
-  /**
-   * @brief 32-bit status 128-bit vector data type.
-   */
-  typedef int32x4_t status32x4_t;
-
-  /**
-   * @brief 16-bit status 128-bit vector data type.
-   */
-  typedef int16x8_t status16x8_t;
-
-  /**
-   * @brief 8-bit status 128-bit vector data type.
-   */
-  typedef int8x16_t status8x16_t;
-
-
-#endif
-
-#if defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF) /* floating point vector*/
-  /**
-   * @brief 32-bit floating-point 128-bit vector type
-   */
-  typedef float32x4_t f32x4_t;
-
-#if defined(ARM_MATH_FLOAT16)
-  /**
-   * @brief 16-bit floating-point 128-bit vector data type
-   */
-  typedef __ALIGNED(2) float16x8_t f16x8_t;
-#endif
-
-  /**
-   * @brief 32-bit floating-point 128-bit vector pair data type
-   */
-  typedef float32x4x2_t f32x4x2_t;
-
-  /**
-   * @brief 32-bit floating-point 128-bit vector quadruplet data type
-   */
-  typedef float32x4x4_t f32x4x4_t;
-
-#if defined(ARM_MATH_FLOAT16)
-  /**
-   * @brief 16-bit floating-point 128-bit vector pair data type
-   */
-  typedef float16x8x2_t f16x8x2_t;
-
-  /**
-   * @brief 16-bit floating-point 128-bit vector quadruplet data type
-   */
-  typedef float16x8x4_t f16x8x4_t;
-#endif
-
-  /**
-   * @brief 32-bit ubiquitous 128-bit vector data type
-   */
-  typedef union _any32x4_t
-  {
-      float32x4_t     f;
-      int32x4_t       i;
-  } any32x4_t;
-
-#if defined(ARM_MATH_FLOAT16)
-  /**
-   * @brief 16-bit ubiquitous 128-bit vector data type
-   */
-  typedef union _any16x8_t
-  {
-      float16x8_t     f;
-      int16x8_t       i;
-  } any16x8_t;
-#endif
-
-#endif
-
-#if defined(ARM_MATH_NEON)
-  /**
-   * @brief 32-bit fractional 64-bit vector data type in 1.31 format.
-   */
-  typedef int32x2_t  q31x2_t;
-
-  /**
-   * @brief 16-bit fractional 64-bit vector data type in 1.15 format.
-   */
-  typedef  __ALIGNED(2) int16x4_t q15x4_t;
-
-  /**
-   * @brief 8-bit fractional 64-bit vector data type in 1.7 format.
-   */
-  typedef  __ALIGNED(1) int8x8_t q7x8_t;
-
-  /**
-   * @brief 32-bit float 64-bit vector data type.
-   */
-  typedef float32x2_t  f32x2_t;
 
-#if defined(ARM_MATH_FLOAT16)
-  /**
-   * @brief 16-bit float 64-bit vector data type.
-   */
-  typedef  __ALIGNED(2) float16x4_t f16x4_t;
-#endif 
-
-  /**
-   * @brief 32-bit floating-point 128-bit vector triplet data type
-   */
-  typedef float32x4x3_t f32x4x3_t;
-
-#if defined(ARM_MATH_FLOAT16)
-  /**
-   * @brief 16-bit floating-point 128-bit vector triplet data type
-   */
-  typedef float16x8x3_t f16x8x3_t;
-#endif
-
-  /**
-   * @brief 32-bit fractional 128-bit vector triplet data type in 1.31 format
-   */
-  typedef int32x4x3_t q31x4x3_t;
-
-  /**
-   * @brief 16-bit fractional 128-bit vector triplet data type in 1.15 format
-   */
-  typedef int16x8x3_t q15x8x3_t;
-
-  /**
-   * @brief 8-bit fractional 128-bit vector triplet data type in 1.7 format
-   */
-  typedef int8x16x3_t q7x16x3_t;
-
-  /**
-   * @brief 32-bit floating-point 64-bit vector pair data type
-   */
-  typedef float32x2x2_t f32x2x2_t;
-
-  /**
-   * @brief 32-bit floating-point 64-bit vector triplet data type
-   */
-  typedef float32x2x3_t f32x2x3_t;
-
-  /**
-   * @brief 32-bit floating-point 64-bit vector quadruplet data type
-   */
-  typedef float32x2x4_t f32x2x4_t;
-
-#if defined(ARM_MATH_FLOAT16)
-  /**
-   * @brief 16-bit floating-point 64-bit vector pair data type
-   */
-  typedef float16x4x2_t f16x4x2_t;
-
-  /**
-   * @brief 16-bit floating-point 64-bit vector triplet data type
-   */
-  typedef float16x4x3_t f16x4x3_t;
-
-  /**
-   * @brief 16-bit floating-point 64-bit vector quadruplet data type
-   */
-  typedef float16x4x4_t f16x4x4_t;
-#endif 
-
-  /**
-   * @brief 32-bit fractional 64-bit vector pair data type in 1.31 format
-   */
-  typedef int32x2x2_t q31x2x2_t;
-
-  /**
-   * @brief 32-bit fractional 64-bit vector triplet data type in 1.31 format
-   */
-  typedef int32x2x3_t q31x2x3_t;
-
-  /**
-   * @brief 32-bit fractional 64-bit vector quadruplet data type in 1.31 format
-   */
-  typedef int32x4x3_t q31x2x4_t;
-
-  /**
-   * @brief 16-bit fractional 64-bit vector pair data type in 1.15 format
-   */
-  typedef int16x4x2_t q15x4x2_t;
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
 
-  /**
-   * @brief 16-bit fractional 64-bit vector triplet data type in 1.15 format
-   */
-  typedef int16x4x2_t q15x4x3_t;
-
-  /**
-   * @brief 16-bit fractional 64-bit vector quadruplet data type in 1.15 format
-   */
-  typedef int16x4x3_t q15x4x4_t;
-
-  /**
-   * @brief 8-bit fractional 64-bit vector pair data type in 1.7 format
-   */
-  typedef int8x8x2_t q7x8x2_t;
-
-  /**
-   * @brief 8-bit fractional 64-bit vector triplet data type in 1.7 format
-   */
-  typedef int8x8x3_t q7x8x3_t;
-
-  /**
-   * @brief 8-bit fractional 64-bit vector quadruplet data type in 1.7 format
-   */
-  typedef int8x8x4_t q7x8x4_t;
-
-  /**
-   * @brief 32-bit ubiquitous 64-bit vector data type
-   */
-  typedef union _any32x2_t
-  {
-      float32x2_t     f;
-      int32x2_t       i;
-  } any32x2_t;
-
-#if defined(ARM_MATH_FLOAT16)
-  /**
-   * @brief 16-bit ubiquitous 64-bit vector data type
-   */
-  typedef union _any16x4_t
-  {
-      float16x4_t     f;
-      int16x4_t       i;
-  } any16x4_t;
-#endif 
-
-  /**
-   * @brief 32-bit status 64-bit vector data type.
-   */
-  typedef int32x4_t status32x2_t;
-
-  /**
-   * @brief 16-bit status 64-bit vector data type.
-   */
-  typedef int16x8_t status16x4_t;
-
-  /**
-   * @brief 8-bit status 64-bit vector data type.
-   */
-  typedef int8x16_t status8x8_t;
+#include "dsp/none.h"
+#include "dsp/utils.h"
 
-#endif
+#include "dsp/basic_math_functions.h"  
+#include "dsp/interpolation_functions.h"
+#include "dsp/bayes_functions.h"
+#include "dsp/matrix_functions.h"
+#include "dsp/complex_math_functions.h"
+#include "dsp/statistics_functions.h"
+#include "dsp/controller_functions.h"
+#include "dsp/support_functions.h"
+#include "dsp/distance_functions.h"
+#include "dsp/svm_functions.h"
+#include "dsp/fast_math_functions.h"
+#include "dsp/transform_functions.h"
+#include "dsp/filtering_functions.h"
+#include "dsp/quaternion_math_functions.h"
 
 
 
-/**
-  @brief definition to read/write two 16 bit values.
-  @deprecated
- */
-#if   defined ( __CC_ARM )
-  #define __SIMD32_TYPE int32_t __packed
-#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
-  #define __SIMD32_TYPE int32_t
-#elif defined ( __GNUC__ )
-  #define __SIMD32_TYPE int32_t
-#elif defined ( __ICCARM__ )
-  #define __SIMD32_TYPE int32_t __packed
-#elif defined ( __TI_ARM__ )
-  #define __SIMD32_TYPE int32_t
-#elif defined ( __CSMC__ )
-  #define __SIMD32_TYPE int32_t
-#elif defined ( __TASKING__ )
-  #define __SIMD32_TYPE __un(aligned) int32_t
-#elif defined(_MSC_VER )
-  #define __SIMD32_TYPE int32_t
-#else
-  #error Unknown compiler
+#ifdef   __cplusplus
+extern "C"
+{
 #endif
 
-#define __SIMD32(addr)        (*(__SIMD32_TYPE **) & (addr))
-#define __SIMD32_CONST(addr)  ( (__SIMD32_TYPE * )   (addr))
-#define _SIMD32_OFFSET(addr)  (*(__SIMD32_TYPE * )   (addr))
-#define __SIMD64(addr)        (*(      int64_t **) & (addr))
 
-#define STEP(x) (x) <= 0 ? 0 : 1
-#define SQ(x) ((x) * (x))
 
-/* SIMD replacement */
 
+//#define TABLE_SPACING_Q31     0x400000
+//#define TABLE_SPACING_Q15     0x80
 
-/**
-  @brief         Read 2 Q15 from Q15 pointer.
-  @param[in]     pQ15      points to input value
-  @return        Q31 value
- */
-__STATIC_FORCEINLINE q31_t read_q15x2 (
-  q15_t * pQ15)
-{
-  q31_t val;
 
-#ifdef __ARM_FEATURE_UNALIGNED
-  memcpy (&val, pQ15, 4);
-#else
-  val = (pQ15[1] << 16) | (pQ15[0] & 0x0FFFF) ;
-#endif
 
-  return (val);
-}
 
-/**
-  @brief         Read 2 Q15 from Q15 pointer and increment pointer afterwards.
-  @param[in]     pQ15      points to input value
-  @return        Q31 value
- */
-__STATIC_FORCEINLINE q31_t read_q15x2_ia (
-  q15_t ** pQ15)
-{
-  q31_t val;
-
-#ifdef __ARM_FEATURE_UNALIGNED
-  memcpy (&val, *pQ15, 4);
-#else
-  val = ((*pQ15)[1] << 16) | ((*pQ15)[0] & 0x0FFFF);
-#endif
-
- *pQ15 += 2;
- return (val);
-}
-
-/**
-  @brief         Read 2 Q15 from Q15 pointer and decrement pointer afterwards.
-  @param[in]     pQ15      points to input value
-  @return        Q31 value
- */
-__STATIC_FORCEINLINE q31_t read_q15x2_da (
-  q15_t ** pQ15)
-{
-  q31_t val;
-
-#ifdef __ARM_FEATURE_UNALIGNED
-  memcpy (&val, *pQ15, 4);
-#else
-  val = ((*pQ15)[1] << 16) | ((*pQ15)[0] & 0x0FFFF);
-#endif
-
-  *pQ15 -= 2;
-  return (val);
-}
-
-/**
-  @brief         Write 2 Q15 to Q15 pointer and increment pointer afterwards.
-  @param[in]     pQ15      points to input value
-  @param[in]     value     Q31 value
-  @return        none
- */
-__STATIC_FORCEINLINE void write_q15x2_ia (
-  q15_t ** pQ15,
-  q31_t    value)
-{
-  q31_t val = value;
-#ifdef __ARM_FEATURE_UNALIGNED
-  memcpy (*pQ15, &val, 4);
-#else
-  (*pQ15)[0] = (val & 0x0FFFF);
-  (*pQ15)[1] = (val >> 16) & 0x0FFFF;
-#endif
-
- *pQ15 += 2;
-}
-
-/**
-  @brief         Write 2 Q15 to Q15 pointer.
-  @param[in]     pQ15      points to input value
-  @param[in]     value     Q31 value
-  @return        none
- */
-__STATIC_FORCEINLINE void write_q15x2 (
-  q15_t * pQ15,
-  q31_t   value)
-{
-  q31_t val = value;
-
-#ifdef __ARM_FEATURE_UNALIGNED
-  memcpy (pQ15, &val, 4);
-#else
-  pQ15[0] = val & 0x0FFFF;
-  pQ15[1] = val >> 16;
-#endif
-}
-
-
-/**
-  @brief         Read 4 Q7 from Q7 pointer and increment pointer afterwards.
-  @param[in]     pQ7       points to input value
-  @return        Q31 value
- */
-__STATIC_FORCEINLINE q31_t read_q7x4_ia (
-  q7_t ** pQ7)
-{
-  q31_t val;
-
-
-#ifdef __ARM_FEATURE_UNALIGNED
-  memcpy (&val, *pQ7, 4);
-#else
-  val =(((*pQ7)[3] & 0x0FF) << 24)  | (((*pQ7)[2] & 0x0FF) << 16)  | (((*pQ7)[1] & 0x0FF) << 8)  | ((*pQ7)[0] & 0x0FF);
-#endif 
-
-  *pQ7 += 4;
-
-  return (val);
-}
-
-/**
-  @brief         Read 4 Q7 from Q7 pointer and decrement pointer afterwards.
-  @param[in]     pQ7       points to input value
-  @return        Q31 value
- */
-__STATIC_FORCEINLINE q31_t read_q7x4_da (
-  q7_t ** pQ7)
-{
-  q31_t val;
-#ifdef __ARM_FEATURE_UNALIGNED
-  memcpy (&val, *pQ7, 4);
-#else
-  val = ((((*pQ7)[3]) & 0x0FF) << 24) | ((((*pQ7)[2]) & 0x0FF) << 16)   | ((((*pQ7)[1]) & 0x0FF) << 8)  | ((*pQ7)[0] & 0x0FF);
-#endif 
-  *pQ7 -= 4;
-
-  return (val);
-}
-
-/**
-  @brief         Write 4 Q7 to Q7 pointer and increment pointer afterwards.
-  @param[in]     pQ7       points to input value
-  @param[in]     value     Q31 value
-  @return        none
- */
-__STATIC_FORCEINLINE void write_q7x4_ia (
-  q7_t ** pQ7,
-  q31_t   value)
-{
-  q31_t val = value;
-#ifdef __ARM_FEATURE_UNALIGNED
-  memcpy (*pQ7, &val, 4);
-#else
-  (*pQ7)[0] = val & 0x0FF;
-  (*pQ7)[1] = (val >> 8) & 0x0FF;
-  (*pQ7)[2] = (val >> 16) & 0x0FF;
-  (*pQ7)[3] = (val >> 24) & 0x0FF;
-
-#endif
-  *pQ7 += 4;
-}
-
-/*
-
-Normally those kind of definitions are in a compiler file
-in Core or Core_A.
-
-But for MSVC compiler it is a bit special. The goal is very specific
-to CMSIS-DSP and only to allow the use of this library from other
-systems like Python or Matlab.
-
-MSVC is not going to be used to cross-compile to ARM. So, having a MSVC
-compiler file in Core or Core_A would not make sense.
-
-*/
-#if defined ( _MSC_VER ) || defined(__GNUC_PYTHON__)
-    __STATIC_FORCEINLINE uint8_t __CLZ(uint32_t data)
-    {
-      if (data == 0U) { return 32U; }
-
-      uint32_t count = 0U;
-      uint32_t mask = 0x80000000U;
-
-      while ((data & mask) == 0U)
-      {
-        count += 1U;
-        mask = mask >> 1U;
-      }
-      return count;
-    }
-
-  __STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
-  {
-    if ((sat >= 1U) && (sat <= 32U))
-    {
-      const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
-      const int32_t min = -1 - max ;
-      if (val > max)
-      {
-        return max;
-      }
-      else if (val < min)
-      {
-        return min;
-      }
-    }
-    return val;
-  }
-
-  __STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
-  {
-    if (sat <= 31U)
-    {
-      const uint32_t max = ((1U << sat) - 1U);
-      if (val > (int32_t)max)
-      {
-        return max;
-      }
-      else if (val < 0)
-      {
-        return 0U;
-      }
-    }
-    return (uint32_t)val;
-  }
-#endif
-
-#ifndef ARM_MATH_DSP
-  /**
-   * @brief definition to pack two 16 bit values.
-   */
-  #define __PKHBT(ARG1, ARG2, ARG3) ( (((int32_t)(ARG1) <<    0) & (int32_t)0x0000FFFF) | \
-                                      (((int32_t)(ARG2) << ARG3) & (int32_t)0xFFFF0000)  )
-  #define __PKHTB(ARG1, ARG2, ARG3) ( (((int32_t)(ARG1) <<    0) & (int32_t)0xFFFF0000) | \
-                                      (((int32_t)(ARG2) >> ARG3) & (int32_t)0x0000FFFF)  )
-#endif
-
-   /**
-   * @brief definition to pack four 8 bit values.
-   */
-#ifndef ARM_MATH_BIG_ENDIAN
-  #define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v0) <<  0) & (int32_t)0x000000FF) | \
-                                  (((int32_t)(v1) <<  8) & (int32_t)0x0000FF00) | \
-                                  (((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | \
-                                  (((int32_t)(v3) << 24) & (int32_t)0xFF000000)  )
-#else
-  #define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v3) <<  0) & (int32_t)0x000000FF) | \
-                                  (((int32_t)(v2) <<  8) & (int32_t)0x0000FF00) | \
-                                  (((int32_t)(v1) << 16) & (int32_t)0x00FF0000) | \
-                                  (((int32_t)(v0) << 24) & (int32_t)0xFF000000)  )
-#endif
-
-
-  /**
-   * @brief Clips Q63 to Q31 values.
-   */
-  __STATIC_FORCEINLINE q31_t clip_q63_to_q31(
-  q63_t x)
-  {
-    return ((q31_t) (x >> 32) != ((q31_t) x >> 31)) ?
-      ((0x7FFFFFFF ^ ((q31_t) (x >> 63)))) : (q31_t) x;
-  }
-
-  /**
-   * @brief Clips Q63 to Q15 values.
-   */
-  __STATIC_FORCEINLINE q15_t clip_q63_to_q15(
-  q63_t x)
-  {
-    return ((q31_t) (x >> 32) != ((q31_t) x >> 31)) ?
-      ((0x7FFF ^ ((q15_t) (x >> 63)))) : (q15_t) (x >> 15);
-  }
-
-  /**
-   * @brief Clips Q31 to Q7 values.
-   */
-  __STATIC_FORCEINLINE q7_t clip_q31_to_q7(
-  q31_t x)
-  {
-    return ((q31_t) (x >> 24) != ((q31_t) x >> 23)) ?
-      ((0x7F ^ ((q7_t) (x >> 31)))) : (q7_t) x;
-  }
-
-  /**
-   * @brief Clips Q31 to Q15 values.
-   */
-  __STATIC_FORCEINLINE q15_t clip_q31_to_q15(
-  q31_t x)
-  {
-    return ((q31_t) (x >> 16) != ((q31_t) x >> 15)) ?
-      ((0x7FFF ^ ((q15_t) (x >> 31)))) : (q15_t) x;
-  }
-
-  /**
-   * @brief Multiplies 32 X 64 and returns 32 bit result in 2.30 format.
-   */
-  __STATIC_FORCEINLINE q63_t mult32x64(
-  q63_t x,
-  q31_t y)
-  {
-    return ((((q63_t) (x & 0x00000000FFFFFFFF) * y) >> 32) +
-            (((q63_t) (x >> 32)                * y)      )  );
-  }
-
-  /**
-   * @brief Function to Calculates 1/in (reciprocal) value of Q31 Data type.
-   */
-  __STATIC_FORCEINLINE uint32_t arm_recip_q31(
-        q31_t in,
-        q31_t * dst,
-  const q31_t * pRecipTable)
-  {
-    q31_t out;
-    uint32_t tempVal;
-    uint32_t index, i;
-    uint32_t signBits;
-
-    if (in > 0)
-    {
-      signBits = ((uint32_t) (__CLZ( in) - 1));
-    }
-    else
-    {
-      signBits = ((uint32_t) (__CLZ(-in) - 1));
-    }
-
-    /* Convert input sample to 1.31 format */
-    in = (in << signBits);
-
-    /* calculation of index for initial approximated Val */
-    index = (uint32_t)(in >> 24);
-    index = (index & INDEX_MASK);
-
-    /* 1.31 with exp 1 */
-    out = pRecipTable[index];
-
-    /* calculation of reciprocal value */
-    /* running approximation for two iterations */
-    for (i = 0U; i < 2U; i++)
-    {
-      tempVal = (uint32_t) (((q63_t) in * out) >> 31);
-      tempVal = 0x7FFFFFFFu - tempVal;
-      /*      1.31 with exp 1 */
-      /* out = (q31_t) (((q63_t) out * tempVal) >> 30); */
-      out = clip_q63_to_q31(((q63_t) out * tempVal) >> 30);
-    }
-
-    /* write output */
-    *dst = out;
-
-    /* return num of signbits of out = 1/in value */
-    return (signBits + 1U);
-  }
-
-
-  /**
-   * @brief Function to Calculates 1/in (reciprocal) value of Q15 Data type.
-   */
-  __STATIC_FORCEINLINE uint32_t arm_recip_q15(
-        q15_t in,
-        q15_t * dst,
-  const q15_t * pRecipTable)
-  {
-    q15_t out = 0;
-    uint32_t tempVal = 0;
-    uint32_t index = 0, i = 0;
-    uint32_t signBits = 0;
-
-    if (in > 0)
-    {
-      signBits = ((uint32_t)(__CLZ( in) - 17));
-    }
-    else
-    {
-      signBits = ((uint32_t)(__CLZ(-in) - 17));
-    }
-
-    /* Convert input sample to 1.15 format */
-    in = (in << signBits);
-
-    /* calculation of index for initial approximated Val */
-    index = (uint32_t)(in >>  8);
-    index = (index & INDEX_MASK);
-
-    /*      1.15 with exp 1  */
-    out = pRecipTable[index];
-
-    /* calculation of reciprocal value */
-    /* running approximation for two iterations */
-    for (i = 0U; i < 2U; i++)
-    {
-      tempVal = (uint32_t) (((q31_t) in * out) >> 15);
-      tempVal = 0x7FFFu - tempVal;
-      /*      1.15 with exp 1 */
-      out = (q15_t) (((q31_t) out * tempVal) >> 14);
-      /* out = clip_q31_to_q15(((q31_t) out * tempVal) >> 14); */
-    }
-
-    /* write output */
-    *dst = out;
-
-    /* return num of signbits of out = 1/in value */
-    return (signBits + 1);
-  }
-
-/**
- * @brief Integer exponentiation
- * @param[in]    x           value
- * @param[in]    nb          integer exponent >= 1
- * @return x^nb
- *
- */
-__STATIC_INLINE float32_t arm_exponent_f32(float32_t x, int32_t nb)
-{
-    float32_t r = x;
-    nb --;
-    while(nb > 0)
-    {
-        r = r * x;
-        nb--;
-    }
-    return(r);
-}
-
-/**
- * @brief  64-bit to 32-bit unsigned normalization
- * @param[in]  in           is input unsigned long long value
- * @param[out] normalized   is the 32-bit normalized value
- * @param[out] norm         is norm scale
- */
-__STATIC_INLINE  void arm_norm_64_to_32u(uint64_t in, int32_t * normalized, int32_t *norm)
-{
-    int32_t     n1;
-    int32_t     hi = (int32_t) (in >> 32);
-    int32_t     lo = (int32_t) ((in << 32) >> 32);
-
-    n1 = __CLZ(hi) - 32;
-    if (!n1)
-    {
-        /*
-         * input fits in 32-bit
-         */
-        n1 = __CLZ(lo);
-        if (!n1)
-        {
-            /*
-             * MSB set, need to scale down by 1
-             */
-            *norm = -1;
-            *normalized = (((uint32_t) lo) >> 1);
-        } else
-        {
-            if (n1 == 32)
-            {
-                /*
-                 * input is zero
-                 */
-                *norm = 0;
-                *normalized = 0;
-            } else
-            {
-                /*
-                 * 32-bit normalization
-                 */
-                *norm = n1 - 1;
-                *normalized = lo << *norm;
-            }
-        }
-    } else
-    {
-        /*
-         * input fits in 64-bit
-         */
-        n1 = 1 - n1;
-        *norm = -n1;
-        /*
-         * 64 bit normalization
-         */
-        *normalized = (((uint32_t) lo) >> n1) | (hi << (32 - n1));
-    }
-}
-
-__STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
-{
-    q31_t   result;
-    uint64_t   absNum;
-    int32_t   normalized;
-    int32_t   norm;
-
-    /*
-     * if sum fits in 32bits
-     * avoid costly 64-bit division
-     */
-    absNum = num > 0 ? num : -num;
-    arm_norm_64_to_32u(absNum, &normalized, &norm);
-    if (norm > 0)
-        /*
-         * 32-bit division
-         */
-        result = (q31_t) num / den;
-    else
-        /*
-         * 64-bit division
-         */
-        result = (q31_t) (num / den);
-
-    return result;
-}
-
-
-/*
- * @brief C custom defined intrinsic functions
- */
-#if !defined (ARM_MATH_DSP)
-
-  /*
-   * @brief C custom defined QADD8
-   */
-  __STATIC_FORCEINLINE uint32_t __QADD8(
-  uint32_t x,
-  uint32_t y)
-  {
-    q31_t r, s, t, u;
-
-    r = __SSAT(((((q31_t)x << 24) >> 24) + (((q31_t)y << 24) >> 24)), 8) & (int32_t)0x000000FF;
-    s = __SSAT(((((q31_t)x << 16) >> 24) + (((q31_t)y << 16) >> 24)), 8) & (int32_t)0x000000FF;
-    t = __SSAT(((((q31_t)x <<  8) >> 24) + (((q31_t)y <<  8) >> 24)), 8) & (int32_t)0x000000FF;
-    u = __SSAT(((((q31_t)x      ) >> 24) + (((q31_t)y      ) >> 24)), 8) & (int32_t)0x000000FF;
-
-    return ((uint32_t)((u << 24) | (t << 16) | (s <<  8) | (r      )));
-  }
-
-
-  /*
-   * @brief C custom defined QSUB8
-   */
-  __STATIC_FORCEINLINE uint32_t __QSUB8(
-  uint32_t x,
-  uint32_t y)
-  {
-    q31_t r, s, t, u;
-
-    r = __SSAT(((((q31_t)x << 24) >> 24) - (((q31_t)y << 24) >> 24)), 8) & (int32_t)0x000000FF;
-    s = __SSAT(((((q31_t)x << 16) >> 24) - (((q31_t)y << 16) >> 24)), 8) & (int32_t)0x000000FF;
-    t = __SSAT(((((q31_t)x <<  8) >> 24) - (((q31_t)y <<  8) >> 24)), 8) & (int32_t)0x000000FF;
-    u = __SSAT(((((q31_t)x      ) >> 24) - (((q31_t)y      ) >> 24)), 8) & (int32_t)0x000000FF;
-
-    return ((uint32_t)((u << 24) | (t << 16) | (s <<  8) | (r      )));
-  }
-
-
-  /*
-   * @brief C custom defined QADD16
-   */
-  __STATIC_FORCEINLINE uint32_t __QADD16(
-  uint32_t x,
-  uint32_t y)
-  {
-/*  q31_t r,     s;  without initialisation 'arm_offset_q15 test' fails  but 'intrinsic' tests pass! for armCC */
-    q31_t r = 0, s = 0;
-
-    r = __SSAT(((((q31_t)x << 16) >> 16) + (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
-    s = __SSAT(((((q31_t)x      ) >> 16) + (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
-
-    return ((uint32_t)((s << 16) | (r      )));
-  }
-
-
-  /*
-   * @brief C custom defined SHADD16
-   */
-  __STATIC_FORCEINLINE uint32_t __SHADD16(
-  uint32_t x,
-  uint32_t y)
-  {
-    q31_t r, s;
-
-    r = (((((q31_t)x << 16) >> 16) + (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
-    s = (((((q31_t)x      ) >> 16) + (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
-
-    return ((uint32_t)((s << 16) | (r      )));
-  }
-
-
-  /*
-   * @brief C custom defined QSUB16
-   */
-  __STATIC_FORCEINLINE uint32_t __QSUB16(
-  uint32_t x,
-  uint32_t y)
-  {
-    q31_t r, s;
-
-    r = __SSAT(((((q31_t)x << 16) >> 16) - (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
-    s = __SSAT(((((q31_t)x      ) >> 16) - (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
-
-    return ((uint32_t)((s << 16) | (r      )));
-  }
-
-
-  /*
-   * @brief C custom defined SHSUB16
-   */
-  __STATIC_FORCEINLINE uint32_t __SHSUB16(
-  uint32_t x,
-  uint32_t y)
-  {
-    q31_t r, s;
-
-    r = (((((q31_t)x << 16) >> 16) - (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
-    s = (((((q31_t)x      ) >> 16) - (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
-
-    return ((uint32_t)((s << 16) | (r      )));
-  }
-
-
-  /*
-   * @brief C custom defined QASX
-   */
-  __STATIC_FORCEINLINE uint32_t __QASX(
-  uint32_t x,
-  uint32_t y)
-  {
-    q31_t r, s;
-
-    r = __SSAT(((((q31_t)x << 16) >> 16) - (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
-    s = __SSAT(((((q31_t)x      ) >> 16) + (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
-
-    return ((uint32_t)((s << 16) | (r      )));
-  }
-
-
-  /*
-   * @brief C custom defined SHASX
-   */
-  __STATIC_FORCEINLINE uint32_t __SHASX(
-  uint32_t x,
-  uint32_t y)
-  {
-    q31_t r, s;
-
-    r = (((((q31_t)x << 16) >> 16) - (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
-    s = (((((q31_t)x      ) >> 16) + (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
-
-    return ((uint32_t)((s << 16) | (r      )));
-  }
-
-
-  /*
-   * @brief C custom defined QSAX
-   */
-  __STATIC_FORCEINLINE uint32_t __QSAX(
-  uint32_t x,
-  uint32_t y)
-  {
-    q31_t r, s;
-
-    r = __SSAT(((((q31_t)x << 16) >> 16) + (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
-    s = __SSAT(((((q31_t)x      ) >> 16) - (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
-
-    return ((uint32_t)((s << 16) | (r      )));
-  }
-
-
-  /*
-   * @brief C custom defined SHSAX
-   */
-  __STATIC_FORCEINLINE uint32_t __SHSAX(
-  uint32_t x,
-  uint32_t y)
-  {
-    q31_t r, s;
-
-    r = (((((q31_t)x << 16) >> 16) + (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
-    s = (((((q31_t)x      ) >> 16) - (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
-
-    return ((uint32_t)((s << 16) | (r      )));
-  }
-
-
-  /*
-   * @brief C custom defined SMUSDX
-   */
-  __STATIC_FORCEINLINE uint32_t __SMUSDX(
-  uint32_t x,
-  uint32_t y)
-  {
-    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) -
-                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16))   ));
-  }
-
-  /*
-   * @brief C custom defined SMUADX
-   */
-  __STATIC_FORCEINLINE uint32_t __SMUADX(
-  uint32_t x,
-  uint32_t y)
-  {
-    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) +
-                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16))   ));
-  }
-
-
-  /*
-   * @brief C custom defined QADD
-   */
-  __STATIC_FORCEINLINE int32_t __QADD(
-  int32_t x,
-  int32_t y)
-  {
-    return ((int32_t)(clip_q63_to_q31((q63_t)x + (q31_t)y)));
-  }
-
-
-  /*
-   * @brief C custom defined QSUB
-   */
-  __STATIC_FORCEINLINE int32_t __QSUB(
-  int32_t x,
-  int32_t y)
-  {
-    return ((int32_t)(clip_q63_to_q31((q63_t)x - (q31_t)y)));
-  }
-
-
-  /*
-   * @brief C custom defined SMLAD
-   */
-  __STATIC_FORCEINLINE uint32_t __SMLAD(
-  uint32_t x,
-  uint32_t y,
-  uint32_t sum)
-  {
-    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
-                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16)) +
-                       ( ((q31_t)sum    )                                  )   ));
-  }
-
-
-  /*
-   * @brief C custom defined SMLADX
-   */
-  __STATIC_FORCEINLINE uint32_t __SMLADX(
-  uint32_t x,
-  uint32_t y,
-  uint32_t sum)
-  {
-    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) +
-                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16)) +
-                       ( ((q31_t)sum    )                                  )   ));
-  }
-
-
-  /*
-   * @brief C custom defined SMLSDX
-   */
-  __STATIC_FORCEINLINE uint32_t __SMLSDX(
-  uint32_t x,
-  uint32_t y,
-  uint32_t sum)
-  {
-    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) -
-                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16)) +
-                       ( ((q31_t)sum    )                                  )   ));
-  }
-
-
-  /*
-   * @brief C custom defined SMLALD
-   */
-  __STATIC_FORCEINLINE uint64_t __SMLALD(
-  uint32_t x,
-  uint32_t y,
-  uint64_t sum)
-  {
-/*  return (sum + ((q15_t) (x >> 16) * (q15_t) (y >> 16)) + ((q15_t) x * (q15_t) y)); */
-    return ((uint64_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
-                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16)) +
-                       ( ((q63_t)sum    )                                  )   ));
-  }
-
-
-  /*
-   * @brief C custom defined SMLALDX
-   */
-  __STATIC_FORCEINLINE uint64_t __SMLALDX(
-  uint32_t x,
-  uint32_t y,
-  uint64_t sum)
-  {
-/*  return (sum + ((q15_t) (x >> 16) * (q15_t) y)) + ((q15_t) x * (q15_t) (y >> 16)); */
-    return ((uint64_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) +
-                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16)) +
-                       ( ((q63_t)sum    )                                  )   ));
-  }
-
-
-  /*
-   * @brief C custom defined SMUAD
-   */
-  __STATIC_FORCEINLINE uint32_t __SMUAD(
-  uint32_t x,
-  uint32_t y)
-  {
-    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
-                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16))   ));
-  }
-
-
-  /*
-   * @brief C custom defined SMUSD
-   */
-  __STATIC_FORCEINLINE uint32_t __SMUSD(
-  uint32_t x,
-  uint32_t y)
-  {
-    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) -
-                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16))   ));
-  }
-
-
-  /*
-   * @brief C custom defined SXTB16
-   */
-  __STATIC_FORCEINLINE uint32_t __SXTB16(
-  uint32_t x)
-  {
-    return ((uint32_t)(((((q31_t)x << 24) >> 24) & (q31_t)0x0000FFFF) |
-                       ((((q31_t)x <<  8) >>  8) & (q31_t)0xFFFF0000)  ));
-  }
-
-  /*
-   * @brief C custom defined SMMLA
-   */
-  __STATIC_FORCEINLINE int32_t __SMMLA(
-  int32_t x,
-  int32_t y,
-  int32_t sum)
-  {
-    return (sum + (int32_t) (((int64_t) x * y) >> 32));
-  }
-
-#endif /* !defined (ARM_MATH_DSP) */
-
-
-  /**
-   * @brief Instance structure for the Q7 FIR filter.
-   */
-  typedef struct
-  {
-          uint16_t numTaps;        /**< number of filter coefficients in the filter. */
-          q7_t *pState;            /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
-    const q7_t *pCoeffs;           /**< points to the coefficient array. The array is of length numTaps.*/
-  } arm_fir_instance_q7;
-
-  /**
-   * @brief Instance structure for the Q15 FIR filter.
-   */
-  typedef struct
-  {
-          uint16_t numTaps;         /**< number of filter coefficients in the filter. */
-          q15_t *pState;            /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
-    const q15_t *pCoeffs;           /**< points to the coefficient array. The array is of length numTaps.*/
-  } arm_fir_instance_q15;
-
-  /**
-   * @brief Instance structure for the Q31 FIR filter.
-   */
-  typedef struct
-  {
-          uint16_t numTaps;         /**< number of filter coefficients in the filter. */
-          q31_t *pState;            /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
-    const q31_t *pCoeffs;           /**< points to the coefficient array. The array is of length numTaps. */
-  } arm_fir_instance_q31;
-
-  /**
-   * @brief Instance structure for the floating-point FIR filter.
-   */
-  typedef struct
-  {
-          uint16_t numTaps;     /**< number of filter coefficients in the filter. */
-          float32_t *pState;    /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
-    const float32_t *pCoeffs;   /**< points to the coefficient array. The array is of length numTaps. */
-  } arm_fir_instance_f32;
-
-  /**
-   * @brief Processing function for the Q7 FIR filter.
-   * @param[in]  S          points to an instance of the Q7 FIR filter structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_fir_q7(
-  const arm_fir_instance_q7 * S,
-  const q7_t * pSrc,
-        q7_t * pDst,
-        uint32_t blockSize);
-
-  /**
-   * @brief  Initialization function for the Q7 FIR filter.
-   * @param[in,out] S          points to an instance of the Q7 FIR structure.
-   * @param[in]     numTaps    Number of filter coefficients in the filter.
-   * @param[in]     pCoeffs    points to the filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     blockSize  number of samples that are processed.
-   */
-  void arm_fir_init_q7(
-        arm_fir_instance_q7 * S,
-        uint16_t numTaps,
-  const q7_t * pCoeffs,
-        q7_t * pState,
-        uint32_t blockSize);
-
-  /**
-   * @brief Processing function for the Q15 FIR filter.
-   * @param[in]  S          points to an instance of the Q15 FIR structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_fir_q15(
-  const arm_fir_instance_q15 * S,
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-  /**
-   * @brief Processing function for the fast Q15 FIR filter (fast version).
-   * @param[in]  S          points to an instance of the Q15 FIR filter structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_fir_fast_q15(
-  const arm_fir_instance_q15 * S,
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-  /**
-   * @brief  Initialization function for the Q15 FIR filter.
-   * @param[in,out] S          points to an instance of the Q15 FIR filter structure.
-   * @param[in]     numTaps    Number of filter coefficients in the filter. Must be even and greater than or equal to 4.
-   * @param[in]     pCoeffs    points to the filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     blockSize  number of samples that are processed at a time.
-   * @return     The function returns either
-   * <code>ARM_MATH_SUCCESS</code> if initialization was successful or
-   * <code>ARM_MATH_ARGUMENT_ERROR</code> if <code>numTaps</code> is not a supported value.
-   */
-  arm_status arm_fir_init_q15(
-        arm_fir_instance_q15 * S,
-        uint16_t numTaps,
-  const q15_t * pCoeffs,
-        q15_t * pState,
-        uint32_t blockSize);
-
-  /**
-   * @brief Processing function for the Q31 FIR filter.
-   * @param[in]  S          points to an instance of the Q31 FIR filter structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_fir_q31(
-  const arm_fir_instance_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-  /**
-   * @brief Processing function for the fast Q31 FIR filter (fast version).
-   * @param[in]  S          points to an instance of the Q31 FIR filter structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_fir_fast_q31(
-  const arm_fir_instance_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-  /**
-   * @brief  Initialization function for the Q31 FIR filter.
-   * @param[in,out] S          points to an instance of the Q31 FIR structure.
-   * @param[in]     numTaps    Number of filter coefficients in the filter.
-   * @param[in]     pCoeffs    points to the filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     blockSize  number of samples that are processed at a time.
-   */
-  void arm_fir_init_q31(
-        arm_fir_instance_q31 * S,
-        uint16_t numTaps,
-  const q31_t * pCoeffs,
-        q31_t * pState,
-        uint32_t blockSize);
-
-  /**
-   * @brief Processing function for the floating-point FIR filter.
-   * @param[in]  S          points to an instance of the floating-point FIR structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_fir_f32(
-  const arm_fir_instance_f32 * S,
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-  /**
-   * @brief  Initialization function for the floating-point FIR filter.
-   * @param[in,out] S          points to an instance of the floating-point FIR filter structure.
-   * @param[in]     numTaps    Number of filter coefficients in the filter.
-   * @param[in]     pCoeffs    points to the filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     blockSize  number of samples that are processed at a time.
-   */
-  void arm_fir_init_f32(
-        arm_fir_instance_f32 * S,
-        uint16_t numTaps,
-  const float32_t * pCoeffs,
-        float32_t * pState,
-        uint32_t blockSize);
-
-  /**
-   * @brief Instance structure for the Q15 Biquad cascade filter.
-   */
-  typedef struct
-  {
-          int8_t numStages;        /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
-          q15_t *pState;           /**< Points to the array of state coefficients.  The array is of length 4*numStages. */
-    const q15_t *pCoeffs;          /**< Points to the array of coefficients.  The array is of length 5*numStages. */
-          int8_t postShift;        /**< Additional shift, in bits, applied to each output sample. */
-  } arm_biquad_casd_df1_inst_q15;
-
-  /**
-   * @brief Instance structure for the Q31 Biquad cascade filter.
-   */
-  typedef struct
-  {
-          uint32_t numStages;      /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
-          q31_t *pState;           /**< Points to the array of state coefficients.  The array is of length 4*numStages. */
-    const q31_t *pCoeffs;          /**< Points to the array of coefficients.  The array is of length 5*numStages. */
-          uint8_t postShift;       /**< Additional shift, in bits, applied to each output sample. */
-  } arm_biquad_casd_df1_inst_q31;
-
-  /**
-   * @brief Instance structure for the floating-point Biquad cascade filter.
-   */
-  typedef struct
-  {
-          uint32_t numStages;      /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
-          float32_t *pState;       /**< Points to the array of state coefficients.  The array is of length 4*numStages. */
-    const float32_t *pCoeffs;      /**< Points to the array of coefficients.  The array is of length 5*numStages. */
-  } arm_biquad_casd_df1_inst_f32;
-
-#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
-  /**
-   * @brief Instance structure for the modified Biquad coefs required by vectorized code.
-   */
-  typedef struct
-  {
-      float32_t coeffs[8][4]; /**< Points to the array of modified coefficients.  The array is of length 32. There is one per stage */
-  } arm_biquad_mod_coef_f32;
-#endif 
-
-  /**
-   * @brief Processing function for the Q15 Biquad cascade filter.
-   * @param[in]  S          points to an instance of the Q15 Biquad cascade structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_biquad_cascade_df1_q15(
-  const arm_biquad_casd_df1_inst_q15 * S,
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-  /**
-   * @brief  Initialization function for the Q15 Biquad cascade filter.
-   * @param[in,out] S          points to an instance of the Q15 Biquad cascade structure.
-   * @param[in]     numStages  number of 2nd order stages in the filter.
-   * @param[in]     pCoeffs    points to the filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     postShift  Shift to be applied to the output. Varies according to the coefficients format
-   */
-  void arm_biquad_cascade_df1_init_q15(
-        arm_biquad_casd_df1_inst_q15 * S,
-        uint8_t numStages,
-  const q15_t * pCoeffs,
-        q15_t * pState,
-        int8_t postShift);
-
-  /**
-   * @brief Fast but less precise processing function for the Q15 Biquad cascade filter for Cortex-M3 and Cortex-M4.
-   * @param[in]  S          points to an instance of the Q15 Biquad cascade structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_biquad_cascade_df1_fast_q15(
-  const arm_biquad_casd_df1_inst_q15 * S,
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-  /**
-   * @brief Processing function for the Q31 Biquad cascade filter
-   * @param[in]  S          points to an instance of the Q31 Biquad cascade structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_biquad_cascade_df1_q31(
-  const arm_biquad_casd_df1_inst_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-  /**
-   * @brief Fast but less precise processing function for the Q31 Biquad cascade filter for Cortex-M3 and Cortex-M4.
-   * @param[in]  S          points to an instance of the Q31 Biquad cascade structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_biquad_cascade_df1_fast_q31(
-  const arm_biquad_casd_df1_inst_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-  /**
-   * @brief  Initialization function for the Q31 Biquad cascade filter.
-   * @param[in,out] S          points to an instance of the Q31 Biquad cascade structure.
-   * @param[in]     numStages  number of 2nd order stages in the filter.
-   * @param[in]     pCoeffs    points to the filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     postShift  Shift to be applied to the output. Varies according to the coefficients format
-   */
-  void arm_biquad_cascade_df1_init_q31(
-        arm_biquad_casd_df1_inst_q31 * S,
-        uint8_t numStages,
-  const q31_t * pCoeffs,
-        q31_t * pState,
-        int8_t postShift);
-
-  /**
-   * @brief Processing function for the floating-point Biquad cascade filter.
-   * @param[in]  S          points to an instance of the floating-point Biquad cascade structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_biquad_cascade_df1_f32(
-  const arm_biquad_casd_df1_inst_f32 * S,
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-  /**
-   * @brief  Initialization function for the floating-point Biquad cascade filter.
-   * @param[in,out] S          points to an instance of the floating-point Biquad cascade structure.
-   * @param[in]     numStages  number of 2nd order stages in the filter.
-   * @param[in]     pCoeffs    points to the filter coefficients.
-   * @param[in]     pCoeffsMod points to the modified filter coefficients (only MVE version).
-   * @param[in]     pState     points to the state buffer.
-   */
-#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
-  void arm_biquad_cascade_df1_mve_init_f32(
-      arm_biquad_casd_df1_inst_f32 * S,
-      uint8_t numStages,
-      const float32_t * pCoeffs, 
-      arm_biquad_mod_coef_f32 * pCoeffsMod, 
-      float32_t * pState);
-#endif
-  
-  void arm_biquad_cascade_df1_init_f32(
-        arm_biquad_casd_df1_inst_f32 * S,
-        uint8_t numStages,
-  const float32_t * pCoeffs,
-        float32_t * pState);
-
-
-  /**
-   * @brief         Compute the logical bitwise AND of two fixed-point vectors.
-   * @param[in]     pSrcA      points to input vector A
-   * @param[in]     pSrcB      points to input vector B
-   * @param[out]    pDst       points to output vector
-   * @param[in]     blockSize  number of samples in each vector
-   * @return        none
-   */
-  void arm_and_u16(
-    const uint16_t * pSrcA,
-    const uint16_t * pSrcB,
-          uint16_t * pDst,
-          uint32_t blockSize);
-
-  /**
-   * @brief         Compute the logical bitwise AND of two fixed-point vectors.
-   * @param[in]     pSrcA      points to input vector A
-   * @param[in]     pSrcB      points to input vector B
-   * @param[out]    pDst       points to output vector
-   * @param[in]     blockSize  number of samples in each vector
-   * @return        none
-   */
-  void arm_and_u32(
-    const uint32_t * pSrcA,
-    const uint32_t * pSrcB,
-          uint32_t * pDst,
-          uint32_t blockSize);
-
-  /**
-   * @brief         Compute the logical bitwise AND of two fixed-point vectors.
-   * @param[in]     pSrcA      points to input vector A
-   * @param[in]     pSrcB      points to input vector B
-   * @param[out]    pDst       points to output vector
-   * @param[in]     blockSize  number of samples in each vector
-   * @return        none
-   */
-  void arm_and_u8(
-    const uint8_t * pSrcA,
-    const uint8_t * pSrcB,
-          uint8_t * pDst,
-          uint32_t blockSize);
-
-  /**
-   * @brief         Compute the logical bitwise OR of two fixed-point vectors.
-   * @param[in]     pSrcA      points to input vector A
-   * @param[in]     pSrcB      points to input vector B
-   * @param[out]    pDst       points to output vector
-   * @param[in]     blockSize  number of samples in each vector
-   * @return        none
-   */
-  void arm_or_u16(
-    const uint16_t * pSrcA,
-    const uint16_t * pSrcB,
-          uint16_t * pDst,
-          uint32_t blockSize);
-
-  /**
-   * @brief         Compute the logical bitwise OR of two fixed-point vectors.
-   * @param[in]     pSrcA      points to input vector A
-   * @param[in]     pSrcB      points to input vector B
-   * @param[out]    pDst       points to output vector
-   * @param[in]     blockSize  number of samples in each vector
-   * @return        none
-   */
-  void arm_or_u32(
-    const uint32_t * pSrcA,
-    const uint32_t * pSrcB,
-          uint32_t * pDst,
-          uint32_t blockSize);
-
-  /**
-   * @brief         Compute the logical bitwise OR of two fixed-point vectors.
-   * @param[in]     pSrcA      points to input vector A
-   * @param[in]     pSrcB      points to input vector B
-   * @param[out]    pDst       points to output vector
-   * @param[in]     blockSize  number of samples in each vector
-   * @return        none
-   */
-  void arm_or_u8(
-    const uint8_t * pSrcA,
-    const uint8_t * pSrcB,
-          uint8_t * pDst,
-          uint32_t blockSize);
-
-  /**
-   * @brief         Compute the logical bitwise NOT of a fixed-point vector.
-   * @param[in]     pSrc       points to input vector 
-   * @param[out]    pDst       points to output vector
-   * @param[in]     blockSize  number of samples in each vector
-   * @return        none
-   */
-  void arm_not_u16(
-    const uint16_t * pSrc,
-          uint16_t * pDst,
-          uint32_t blockSize);
-
-  /**
-   * @brief         Compute the logical bitwise NOT of a fixed-point vector.
-   * @param[in]     pSrc       points to input vector 
-   * @param[out]    pDst       points to output vector
-   * @param[in]     blockSize  number of samples in each vector
-   * @return        none
-   */
-  void arm_not_u32(
-    const uint32_t * pSrc,
-          uint32_t * pDst,
-          uint32_t blockSize);
-
-  /**
-   * @brief         Compute the logical bitwise NOT of a fixed-point vector.
-   * @param[in]     pSrc       points to input vector 
-   * @param[out]    pDst       points to output vector
-   * @param[in]     blockSize  number of samples in each vector
-   * @return        none
-   */
-  void arm_not_u8(
-    const uint8_t * pSrc,
-          uint8_t * pDst,
-          uint32_t blockSize);
-
-/**
-   * @brief         Compute the logical bitwise XOR of two fixed-point vectors.
-   * @param[in]     pSrcA      points to input vector A
-   * @param[in]     pSrcB      points to input vector B
-   * @param[out]    pDst       points to output vector
-   * @param[in]     blockSize  number of samples in each vector
-   * @return        none
-   */
-  void arm_xor_u16(
-    const uint16_t * pSrcA,
-    const uint16_t * pSrcB,
-          uint16_t * pDst,
-          uint32_t blockSize);
-
-  /**
-   * @brief         Compute the logical bitwise XOR of two fixed-point vectors.
-   * @param[in]     pSrcA      points to input vector A
-   * @param[in]     pSrcB      points to input vector B
-   * @param[out]    pDst       points to output vector
-   * @param[in]     blockSize  number of samples in each vector
-   * @return        none
-   */
-  void arm_xor_u32(
-    const uint32_t * pSrcA,
-    const uint32_t * pSrcB,
-          uint32_t * pDst,
-          uint32_t blockSize);
-
-  /**
-   * @brief         Compute the logical bitwise XOR of two fixed-point vectors.
-   * @param[in]     pSrcA      points to input vector A
-   * @param[in]     pSrcB      points to input vector B
-   * @param[out]    pDst       points to output vector
-   * @param[in]     blockSize  number of samples in each vector
-   * @return        none
-   */
-  void arm_xor_u8(
-    const uint8_t * pSrcA,
-    const uint8_t * pSrcB,
-          uint8_t * pDst,
-    uint32_t blockSize);
-
-  /**
-   * @brief Struct for specifying sorting algorithm
-   */
-  typedef enum
-  {
-    ARM_SORT_BITONIC   = 0,
-             /**< Bitonic sort   */
-    ARM_SORT_BUBBLE    = 1,
-             /**< Bubble sort    */
-    ARM_SORT_HEAP      = 2,
-             /**< Heap sort      */
-    ARM_SORT_INSERTION = 3,
-             /**< Insertion sort */
-    ARM_SORT_QUICK     = 4,
-             /**< Quick sort     */
-    ARM_SORT_SELECTION = 5
-             /**< Selection sort */
-  } arm_sort_alg;
-
-  /**
-   * @brief Struct for specifying sorting algorithm
-   */
-  typedef enum
-  {
-    ARM_SORT_DESCENDING = 0,
-             /**< Descending order (9 to 0) */
-    ARM_SORT_ASCENDING = 1
-             /**< Ascending order (0 to 9) */
-  } arm_sort_dir;
-
-  /**
-   * @brief Instance structure for the sorting algorithms.
-   */
-  typedef struct            
-  {
-    arm_sort_alg alg;        /**< Sorting algorithm selected */
-    arm_sort_dir dir;        /**< Sorting order (direction)  */
-  } arm_sort_instance_f32;  
-
-  /**
-   * @param[in]  S          points to an instance of the sorting structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_sort_f32(
-    const arm_sort_instance_f32 * S, 
-          float32_t * pSrc, 
-          float32_t * pDst, 
-          uint32_t blockSize);
-
-  /**
-   * @param[in,out]  S            points to an instance of the sorting structure.
-   * @param[in]      alg          Selected algorithm.
-   * @param[in]      dir          Sorting order.
-   */
-  void arm_sort_init_f32(
-    arm_sort_instance_f32 * S, 
-    arm_sort_alg alg, 
-    arm_sort_dir dir); 
-
-  /**
-   * @brief Instance structure for the sorting algorithms.
-   */
-  typedef struct            
-  {
-    arm_sort_dir dir;        /**< Sorting order (direction)  */
-    float32_t * buffer;      /**< Working buffer */
-  } arm_merge_sort_instance_f32;  
-
-  /**
-   * @param[in]      S          points to an instance of the sorting structure.
-   * @param[in,out]  pSrc       points to the block of input data.
-   * @param[out]     pDst       points to the block of output data
-   * @param[in]      blockSize  number of samples to process.
-   */
-  void arm_merge_sort_f32(
-    const arm_merge_sort_instance_f32 * S,
-          float32_t *pSrc,
-          float32_t *pDst,
-          uint32_t blockSize);
-
-  /**
-   * @param[in,out]  S            points to an instance of the sorting structure.
-   * @param[in]      dir          Sorting order.
-   * @param[in]      buffer       Working buffer.
-   */
-  void arm_merge_sort_init_f32(
-    arm_merge_sort_instance_f32 * S,
-    arm_sort_dir dir,
-    float32_t * buffer);
-
-  /**
-   * @brief Struct for specifying cubic spline type
-   */
-  typedef enum
-  {
-    ARM_SPLINE_NATURAL = 0,           /**< Natural spline */
-    ARM_SPLINE_PARABOLIC_RUNOUT = 1   /**< Parabolic runout spline */
-  } arm_spline_type;
-
-  /**
-   * @brief Instance structure for the floating-point cubic spline interpolation.
-   */
-  typedef struct
-  {
-    arm_spline_type type;      /**< Type (boundary conditions) */
-    const float32_t * x;       /**< x values */
-    const float32_t * y;       /**< y values */
-    uint32_t n_x;              /**< Number of known data points */
-    float32_t * coeffs;        /**< Coefficients buffer (b,c, and d) */
-  } arm_spline_instance_f32;
-
-  /**
-   * @brief Processing function for the floating-point cubic spline interpolation.
-   * @param[in]  S          points to an instance of the floating-point spline structure.
-   * @param[in]  xq         points to the x values ot the interpolated data points.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples of output data.
-   */
-  void arm_spline_f32(
-        arm_spline_instance_f32 * S, 
-  const float32_t * xq,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-  /**
-   * @brief Initialization function for the floating-point cubic spline interpolation.
-   * @param[in,out] S        points to an instance of the floating-point spline structure.
-   * @param[in]     type     type of cubic spline interpolation (boundary conditions)
-   * @param[in]     x        points to the x values of the known data points.
-   * @param[in]     y        points to the y values of the known data points.
-   * @param[in]     n        number of known data points.
-   * @param[in]     coeffs   coefficients array for b, c, and d
-   * @param[in]     tempBuffer   buffer array for internal computations
-   */
-  void arm_spline_init_f32(
-          arm_spline_instance_f32 * S,
-          arm_spline_type type,
-    const float32_t * x,
-    const float32_t * y,
-          uint32_t n, 
-          float32_t * coeffs,
-          float32_t * tempBuffer);
-
-  /**
-   * @brief Instance structure for the floating-point matrix structure.
-   */
-  typedef struct
-  {
-    uint16_t numRows;     /**< number of rows of the matrix.     */
-    uint16_t numCols;     /**< number of columns of the matrix.  */
-    float32_t *pData;     /**< points to the data of the matrix. */
-  } arm_matrix_instance_f32;
- 
- /**
-   * @brief Instance structure for the floating-point matrix structure.
-   */
-  typedef struct
-  {
-    uint16_t numRows;     /**< number of rows of the matrix.     */
-    uint16_t numCols;     /**< number of columns of the matrix.  */
-    float64_t *pData;     /**< points to the data of the matrix. */
-  } arm_matrix_instance_f64;
-
-  /**
-   * @brief Instance structure for the Q15 matrix structure.
-   */
-  typedef struct
-  {
-    uint16_t numRows;     /**< number of rows of the matrix.     */
-    uint16_t numCols;     /**< number of columns of the matrix.  */
-    q15_t *pData;         /**< points to the data of the matrix. */
-  } arm_matrix_instance_q15;
-
-  /**
-   * @brief Instance structure for the Q31 matrix structure.
-   */
-  typedef struct
-  {
-    uint16_t numRows;     /**< number of rows of the matrix.     */
-    uint16_t numCols;     /**< number of columns of the matrix.  */
-    q31_t *pData;         /**< points to the data of the matrix. */
-  } arm_matrix_instance_q31;
-
-  /**
-   * @brief Floating-point matrix addition.
-   * @param[in]  pSrcA  points to the first input matrix structure
-   * @param[in]  pSrcB  points to the second input matrix structure
-   * @param[out] pDst   points to output matrix structure
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_add_f32(
-  const arm_matrix_instance_f32 * pSrcA,
-  const arm_matrix_instance_f32 * pSrcB,
-        arm_matrix_instance_f32 * pDst);
-
-  /**
-   * @brief Q15 matrix addition.
-   * @param[in]   pSrcA  points to the first input matrix structure
-   * @param[in]   pSrcB  points to the second input matrix structure
-   * @param[out]  pDst   points to output matrix structure
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_add_q15(
-  const arm_matrix_instance_q15 * pSrcA,
-  const arm_matrix_instance_q15 * pSrcB,
-        arm_matrix_instance_q15 * pDst);
-
-  /**
-   * @brief Q31 matrix addition.
-   * @param[in]  pSrcA  points to the first input matrix structure
-   * @param[in]  pSrcB  points to the second input matrix structure
-   * @param[out] pDst   points to output matrix structure
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_add_q31(
-  const arm_matrix_instance_q31 * pSrcA,
-  const arm_matrix_instance_q31 * pSrcB,
-        arm_matrix_instance_q31 * pDst);
-
-  /**
-   * @brief Floating-point, complex, matrix multiplication.
-   * @param[in]  pSrcA  points to the first input matrix structure
-   * @param[in]  pSrcB  points to the second input matrix structure
-   * @param[out] pDst   points to output matrix structure
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_cmplx_mult_f32(
-  const arm_matrix_instance_f32 * pSrcA,
-  const arm_matrix_instance_f32 * pSrcB,
-        arm_matrix_instance_f32 * pDst);
-
-  /**
-   * @brief Q15, complex,  matrix multiplication.
-   * @param[in]  pSrcA  points to the first input matrix structure
-   * @param[in]  pSrcB  points to the second input matrix structure
-   * @param[out] pDst   points to output matrix structure
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_cmplx_mult_q15(
-  const arm_matrix_instance_q15 * pSrcA,
-  const arm_matrix_instance_q15 * pSrcB,
-        arm_matrix_instance_q15 * pDst,
-        q15_t * pScratch);
-
-  /**
-   * @brief Q31, complex, matrix multiplication.
-   * @param[in]  pSrcA  points to the first input matrix structure
-   * @param[in]  pSrcB  points to the second input matrix structure
-   * @param[out] pDst   points to output matrix structure
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_cmplx_mult_q31(
-  const arm_matrix_instance_q31 * pSrcA,
-  const arm_matrix_instance_q31 * pSrcB,
-        arm_matrix_instance_q31 * pDst);
-
-  /**
-   * @brief Floating-point matrix transpose.
-   * @param[in]  pSrc  points to the input matrix
-   * @param[out] pDst  points to the output matrix
-   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
-   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_trans_f32(
-  const arm_matrix_instance_f32 * pSrc,
-        arm_matrix_instance_f32 * pDst);
-
-  /**
-   * @brief Q15 matrix transpose.
-   * @param[in]  pSrc  points to the input matrix
-   * @param[out] pDst  points to the output matrix
-   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
-   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_trans_q15(
-  const arm_matrix_instance_q15 * pSrc,
-        arm_matrix_instance_q15 * pDst);
-
-  /**
-   * @brief Q31 matrix transpose.
-   * @param[in]  pSrc  points to the input matrix
-   * @param[out] pDst  points to the output matrix
-   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
-   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_trans_q31(
-  const arm_matrix_instance_q31 * pSrc,
-        arm_matrix_instance_q31 * pDst);
-
-  /**
-   * @brief Floating-point matrix multiplication
-   * @param[in]  pSrcA  points to the first input matrix structure
-   * @param[in]  pSrcB  points to the second input matrix structure
-   * @param[out] pDst   points to output matrix structure
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_mult_f32(
-  const arm_matrix_instance_f32 * pSrcA,
-  const arm_matrix_instance_f32 * pSrcB,
-        arm_matrix_instance_f32 * pDst);
-
-  /**
-   * @brief Q15 matrix multiplication
-   * @param[in]  pSrcA   points to the first input matrix structure
-   * @param[in]  pSrcB   points to the second input matrix structure
-   * @param[out] pDst    points to output matrix structure
-   * @param[in]  pState  points to the array for storing intermediate results
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_mult_q15(
-  const arm_matrix_instance_q15 * pSrcA,
-  const arm_matrix_instance_q15 * pSrcB,
-        arm_matrix_instance_q15 * pDst,
-        q15_t * pState);
-
-  /**
-   * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
-   * @param[in]  pSrcA   points to the first input matrix structure
-   * @param[in]  pSrcB   points to the second input matrix structure
-   * @param[out] pDst    points to output matrix structure
-   * @param[in]  pState  points to the array for storing intermediate results
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_mult_fast_q15(
-  const arm_matrix_instance_q15 * pSrcA,
-  const arm_matrix_instance_q15 * pSrcB,
-        arm_matrix_instance_q15 * pDst,
-        q15_t * pState);
-
-  /**
-   * @brief Q31 matrix multiplication
-   * @param[in]  pSrcA  points to the first input matrix structure
-   * @param[in]  pSrcB  points to the second input matrix structure
-   * @param[out] pDst   points to output matrix structure
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_mult_q31(
-  const arm_matrix_instance_q31 * pSrcA,
-  const arm_matrix_instance_q31 * pSrcB,
-        arm_matrix_instance_q31 * pDst);
-
-  /**
-   * @brief Q31 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
-   * @param[in]  pSrcA  points to the first input matrix structure
-   * @param[in]  pSrcB  points to the second input matrix structure
-   * @param[out] pDst   points to output matrix structure
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_mult_fast_q31(
-  const arm_matrix_instance_q31 * pSrcA,
-  const arm_matrix_instance_q31 * pSrcB,
-        arm_matrix_instance_q31 * pDst);
-
-  /**
-   * @brief Floating-point matrix subtraction
-   * @param[in]  pSrcA  points to the first input matrix structure
-   * @param[in]  pSrcB  points to the second input matrix structure
-   * @param[out] pDst   points to output matrix structure
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_sub_f32(
-  const arm_matrix_instance_f32 * pSrcA,
-  const arm_matrix_instance_f32 * pSrcB,
-        arm_matrix_instance_f32 * pDst);
-
-  /**
-   * @brief Q15 matrix subtraction
-   * @param[in]  pSrcA  points to the first input matrix structure
-   * @param[in]  pSrcB  points to the second input matrix structure
-   * @param[out] pDst   points to output matrix structure
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_sub_q15(
-  const arm_matrix_instance_q15 * pSrcA,
-  const arm_matrix_instance_q15 * pSrcB,
-        arm_matrix_instance_q15 * pDst);
-
-  /**
-   * @brief Q31 matrix subtraction
-   * @param[in]  pSrcA  points to the first input matrix structure
-   * @param[in]  pSrcB  points to the second input matrix structure
-   * @param[out] pDst   points to output matrix structure
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_sub_q31(
-  const arm_matrix_instance_q31 * pSrcA,
-  const arm_matrix_instance_q31 * pSrcB,
-        arm_matrix_instance_q31 * pDst);
-
-  /**
-   * @brief Floating-point matrix scaling.
-   * @param[in]  pSrc   points to the input matrix
-   * @param[in]  scale  scale factor
-   * @param[out] pDst   points to the output matrix
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_scale_f32(
-  const arm_matrix_instance_f32 * pSrc,
-        float32_t scale,
-        arm_matrix_instance_f32 * pDst);
-
-  /**
-   * @brief Q15 matrix scaling.
-   * @param[in]  pSrc        points to input matrix
-   * @param[in]  scaleFract  fractional portion of the scale factor
-   * @param[in]  shift       number of bits to shift the result by
-   * @param[out] pDst        points to output matrix
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_scale_q15(
-  const arm_matrix_instance_q15 * pSrc,
-        q15_t scaleFract,
-        int32_t shift,
-        arm_matrix_instance_q15 * pDst);
-
-  /**
-   * @brief Q31 matrix scaling.
-   * @param[in]  pSrc        points to input matrix
-   * @param[in]  scaleFract  fractional portion of the scale factor
-   * @param[in]  shift       number of bits to shift the result by
-   * @param[out] pDst        points to output matrix structure
-   * @return     The function returns either
-   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
-   */
-arm_status arm_mat_scale_q31(
-  const arm_matrix_instance_q31 * pSrc,
-        q31_t scaleFract,
-        int32_t shift,
-        arm_matrix_instance_q31 * pDst);
-
-  /**
-   * @brief  Q31 matrix initialization.
-   * @param[in,out] S         points to an instance of the floating-point matrix structure.
-   * @param[in]     nRows     number of rows in the matrix.
-   * @param[in]     nColumns  number of columns in the matrix.
-   * @param[in]     pData     points to the matrix data array.
-   */
-void arm_mat_init_q31(
-        arm_matrix_instance_q31 * S,
-        uint16_t nRows,
-        uint16_t nColumns,
-        q31_t * pData);
-
-  /**
-   * @brief  Q15 matrix initialization.
-   * @param[in,out] S         points to an instance of the floating-point matrix structure.
-   * @param[in]     nRows     number of rows in the matrix.
-   * @param[in]     nColumns  number of columns in the matrix.
-   * @param[in]     pData     points to the matrix data array.
-   */
-void arm_mat_init_q15(
-        arm_matrix_instance_q15 * S,
-        uint16_t nRows,
-        uint16_t nColumns,
-        q15_t * pData);
-
-  /**
-   * @brief  Floating-point matrix initialization.
-   * @param[in,out] S         points to an instance of the floating-point matrix structure.
-   * @param[in]     nRows     number of rows in the matrix.
-   * @param[in]     nColumns  number of columns in the matrix.
-   * @param[in]     pData     points to the matrix data array.
-   */
-void arm_mat_init_f32(
-        arm_matrix_instance_f32 * S,
-        uint16_t nRows,
-        uint16_t nColumns,
-        float32_t * pData);
-
-
-  /**
-   * @brief Instance structure for the Q15 PID Control.
-   */
-  typedef struct
-  {
-          q15_t A0;           /**< The derived gain, A0 = Kp + Ki + Kd . */
-#if !defined (ARM_MATH_DSP)
-          q15_t A1;
-          q15_t A2;
-#else
-          q31_t A1;           /**< The derived gain A1 = -Kp - 2Kd | Kd.*/
-#endif
-          q15_t state[3];     /**< The state array of length 3. */
-          q15_t Kp;           /**< The proportional gain. */
-          q15_t Ki;           /**< The integral gain. */
-          q15_t Kd;           /**< The derivative gain. */
-  } arm_pid_instance_q15;
-
-  /**
-   * @brief Instance structure for the Q31 PID Control.
-   */
-  typedef struct
-  {
-          q31_t A0;            /**< The derived gain, A0 = Kp + Ki + Kd . */
-          q31_t A1;            /**< The derived gain, A1 = -Kp - 2Kd. */
-          q31_t A2;            /**< The derived gain, A2 = Kd . */
-          q31_t state[3];      /**< The state array of length 3. */
-          q31_t Kp;            /**< The proportional gain. */
-          q31_t Ki;            /**< The integral gain. */
-          q31_t Kd;            /**< The derivative gain. */
-  } arm_pid_instance_q31;
-
-  /**
-   * @brief Instance structure for the floating-point PID Control.
-   */
-  typedef struct
-  {
-          float32_t A0;          /**< The derived gain, A0 = Kp + Ki + Kd . */
-          float32_t A1;          /**< The derived gain, A1 = -Kp - 2Kd. */
-          float32_t A2;          /**< The derived gain, A2 = Kd . */
-          float32_t state[3];    /**< The state array of length 3. */
-          float32_t Kp;          /**< The proportional gain. */
-          float32_t Ki;          /**< The integral gain. */
-          float32_t Kd;          /**< The derivative gain. */
-  } arm_pid_instance_f32;
-
-
-
-  /**
-   * @brief  Initialization function for the floating-point PID Control.
-   * @param[in,out] S               points to an instance of the PID structure.
-   * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state 1 = reset the state.
-   */
-  void arm_pid_init_f32(
-        arm_pid_instance_f32 * S,
-        int32_t resetStateFlag);
-
-
-  /**
-   * @brief  Reset function for the floating-point PID Control.
-   * @param[in,out] S  is an instance of the floating-point PID Control structure
-   */
-  void arm_pid_reset_f32(
-        arm_pid_instance_f32 * S);
-
-
-  /**
-   * @brief  Initialization function for the Q31 PID Control.
-   * @param[in,out] S               points to an instance of the Q15 PID structure.
-   * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state 1 = reset the state.
-   */
-  void arm_pid_init_q31(
-        arm_pid_instance_q31 * S,
-        int32_t resetStateFlag);
-
-
-  /**
-   * @brief  Reset function for the Q31 PID Control.
-   * @param[in,out] S   points to an instance of the Q31 PID Control structure
-   */
-
-  void arm_pid_reset_q31(
-        arm_pid_instance_q31 * S);
-
-
-  /**
-   * @brief  Initialization function for the Q15 PID Control.
-   * @param[in,out] S               points to an instance of the Q15 PID structure.
-   * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state 1 = reset the state.
-   */
-  void arm_pid_init_q15(
-        arm_pid_instance_q15 * S,
-        int32_t resetStateFlag);
-
-
-  /**
-   * @brief  Reset function for the Q15 PID Control.
-   * @param[in,out] S  points to an instance of the q15 PID Control structure
-   */
-  void arm_pid_reset_q15(
-        arm_pid_instance_q15 * S);
-
-
-  /**
-   * @brief Instance structure for the floating-point Linear Interpolate function.
-   */
-  typedef struct
-  {
-          uint32_t nValues;           /**< nValues */
-          float32_t x1;               /**< x1 */
-          float32_t xSpacing;         /**< xSpacing */
-          float32_t *pYData;          /**< pointer to the table of Y values */
-  } arm_linear_interp_instance_f32;
-
-  /**
-   * @brief Instance structure for the floating-point bilinear interpolation function.
-   */
-  typedef struct
-  {
-          uint16_t numRows;   /**< number of rows in the data table. */
-          uint16_t numCols;   /**< number of columns in the data table. */
-          float32_t *pData;   /**< points to the data table. */
-  } arm_bilinear_interp_instance_f32;
-
-   /**
-   * @brief Instance structure for the Q31 bilinear interpolation function.
-   */
-  typedef struct
-  {
-          uint16_t numRows;   /**< number of rows in the data table. */
-          uint16_t numCols;   /**< number of columns in the data table. */
-          q31_t *pData;       /**< points to the data table. */
-  } arm_bilinear_interp_instance_q31;
-
-   /**
-   * @brief Instance structure for the Q15 bilinear interpolation function.
-   */
-  typedef struct
-  {
-          uint16_t numRows;   /**< number of rows in the data table. */
-          uint16_t numCols;   /**< number of columns in the data table. */
-          q15_t *pData;       /**< points to the data table. */
-  } arm_bilinear_interp_instance_q15;
-
-   /**
-   * @brief Instance structure for the Q15 bilinear interpolation function.
-   */
-  typedef struct
-  {
-          uint16_t numRows;   /**< number of rows in the data table. */
-          uint16_t numCols;   /**< number of columns in the data table. */
-          q7_t *pData;        /**< points to the data table. */
-  } arm_bilinear_interp_instance_q7;
-
-
-  /**
-   * @brief Q7 vector multiplication.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_mult_q7(
-  const q7_t * pSrcA,
-  const q7_t * pSrcB,
-        q7_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Q15 vector multiplication.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_mult_q15(
-  const q15_t * pSrcA,
-  const q15_t * pSrcB,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Q31 vector multiplication.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_mult_q31(
-  const q31_t * pSrcA,
-  const q31_t * pSrcB,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Floating-point vector multiplication.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_mult_f32(
-  const float32_t * pSrcA,
-  const float32_t * pSrcB,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Instance structure for the Q15 CFFT/CIFFT function.
-   */
-  typedef struct
-  {
-          uint16_t fftLen;                 /**< length of the FFT. */
-          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
-          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
-    const q15_t *pTwiddle;                 /**< points to the Sin twiddle factor table. */
-    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
-          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
-          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
-  } arm_cfft_radix2_instance_q15;
-
-/* Deprecated */
-  arm_status arm_cfft_radix2_init_q15(
-        arm_cfft_radix2_instance_q15 * S,
-        uint16_t fftLen,
-        uint8_t ifftFlag,
-        uint8_t bitReverseFlag);
-
-/* Deprecated */
-  void arm_cfft_radix2_q15(
-  const arm_cfft_radix2_instance_q15 * S,
-        q15_t * pSrc);
-
-
-  /**
-   * @brief Instance structure for the Q15 CFFT/CIFFT function.
-   */
-  typedef struct
-  {
-          uint16_t fftLen;                 /**< length of the FFT. */
-          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
-          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
-    const q15_t *pTwiddle;                 /**< points to the twiddle factor table. */
-    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
-          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
-          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
-  } arm_cfft_radix4_instance_q15;
-
-/* Deprecated */
-  arm_status arm_cfft_radix4_init_q15(
-        arm_cfft_radix4_instance_q15 * S,
-        uint16_t fftLen,
-        uint8_t ifftFlag,
-        uint8_t bitReverseFlag);
-
-/* Deprecated */
-  void arm_cfft_radix4_q15(
-  const arm_cfft_radix4_instance_q15 * S,
-        q15_t * pSrc);
-
-  /**
-   * @brief Instance structure for the Radix-2 Q31 CFFT/CIFFT function.
-   */
-  typedef struct
-  {
-          uint16_t fftLen;                 /**< length of the FFT. */
-          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
-          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
-    const q31_t *pTwiddle;                 /**< points to the Twiddle factor table. */
-    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
-          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
-          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
-  } arm_cfft_radix2_instance_q31;
-
-/* Deprecated */
-  arm_status arm_cfft_radix2_init_q31(
-        arm_cfft_radix2_instance_q31 * S,
-        uint16_t fftLen,
-        uint8_t ifftFlag,
-        uint8_t bitReverseFlag);
-
-/* Deprecated */
-  void arm_cfft_radix2_q31(
-  const arm_cfft_radix2_instance_q31 * S,
-        q31_t * pSrc);
-
-  /**
-   * @brief Instance structure for the Q31 CFFT/CIFFT function.
-   */
-  typedef struct
-  {
-          uint16_t fftLen;                 /**< length of the FFT. */
-          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
-          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
-    const q31_t *pTwiddle;                 /**< points to the twiddle factor table. */
-    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
-          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
-          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
-  } arm_cfft_radix4_instance_q31;
-
-/* Deprecated */
-  void arm_cfft_radix4_q31(
-  const arm_cfft_radix4_instance_q31 * S,
-        q31_t * pSrc);
-
-/* Deprecated */
-  arm_status arm_cfft_radix4_init_q31(
-        arm_cfft_radix4_instance_q31 * S,
-        uint16_t fftLen,
-        uint8_t ifftFlag,
-        uint8_t bitReverseFlag);
-
-  /**
-   * @brief Instance structure for the floating-point CFFT/CIFFT function.
-   */
-  typedef struct
-  {
-          uint16_t fftLen;                   /**< length of the FFT. */
-          uint8_t ifftFlag;                  /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
-          uint8_t bitReverseFlag;            /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
-    const float32_t *pTwiddle;               /**< points to the Twiddle factor table. */
-    const uint16_t *pBitRevTable;            /**< points to the bit reversal table. */
-          uint16_t twidCoefModifier;         /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
-          uint16_t bitRevFactor;             /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
-          float32_t onebyfftLen;             /**< value of 1/fftLen. */
-  } arm_cfft_radix2_instance_f32;
-
-/* Deprecated */
-  arm_status arm_cfft_radix2_init_f32(
-        arm_cfft_radix2_instance_f32 * S,
-        uint16_t fftLen,
-        uint8_t ifftFlag,
-        uint8_t bitReverseFlag);
-
-/* Deprecated */
-  void arm_cfft_radix2_f32(
-  const arm_cfft_radix2_instance_f32 * S,
-        float32_t * pSrc);
-
-  /**
-   * @brief Instance structure for the floating-point CFFT/CIFFT function.
-   */
-  typedef struct
-  {
-          uint16_t fftLen;                   /**< length of the FFT. */
-          uint8_t ifftFlag;                  /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
-          uint8_t bitReverseFlag;            /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
-    const float32_t *pTwiddle;               /**< points to the Twiddle factor table. */
-    const uint16_t *pBitRevTable;            /**< points to the bit reversal table. */
-          uint16_t twidCoefModifier;         /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
-          uint16_t bitRevFactor;             /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
-          float32_t onebyfftLen;             /**< value of 1/fftLen. */
-  } arm_cfft_radix4_instance_f32;
-
-/* Deprecated */
-  arm_status arm_cfft_radix4_init_f32(
-        arm_cfft_radix4_instance_f32 * S,
-        uint16_t fftLen,
-        uint8_t ifftFlag,
-        uint8_t bitReverseFlag);
-
-/* Deprecated */
-  void arm_cfft_radix4_f32(
-  const arm_cfft_radix4_instance_f32 * S,
-        float32_t * pSrc);
-
-  /**
-   * @brief Instance structure for the fixed-point CFFT/CIFFT function.
-   */
-  typedef struct
-  {
-          uint16_t fftLen;                   /**< length of the FFT. */
-    const q15_t *pTwiddle;             /**< points to the Twiddle factor table. */
-    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
-          uint16_t bitRevLength;             /**< bit reversal table length. */
-#if defined(ARM_MATH_MVEI)
-   const uint32_t *rearranged_twiddle_tab_stride1_arr;        /**< Per stage reordered twiddle pointer (offset 1) */                                                       \
-   const uint32_t *rearranged_twiddle_tab_stride2_arr;        /**< Per stage reordered twiddle pointer (offset 2) */                                                       \
-   const uint32_t *rearranged_twiddle_tab_stride3_arr;        /**< Per stage reordered twiddle pointer (offset 3) */                                                       \
-   const q15_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */                                                                   \
-   const q15_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */                                                                   \
-   const q15_t *rearranged_twiddle_stride3;
-#endif
-  } arm_cfft_instance_q15;
-
-arm_status arm_cfft_init_q15(
-  arm_cfft_instance_q15 * S,
-  uint16_t fftLen);
-
-void arm_cfft_q15(
-    const arm_cfft_instance_q15 * S,
-          q15_t * p1,
-          uint8_t ifftFlag,
-          uint8_t bitReverseFlag);
-
-  /**
-   * @brief Instance structure for the fixed-point CFFT/CIFFT function.
-   */
-  typedef struct
-  {
-          uint16_t fftLen;                   /**< length of the FFT. */
-    const q31_t *pTwiddle;             /**< points to the Twiddle factor table. */
-    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
-          uint16_t bitRevLength;             /**< bit reversal table length. */
-#if defined(ARM_MATH_MVEI)
-   const uint32_t *rearranged_twiddle_tab_stride1_arr;        /**< Per stage reordered twiddle pointer (offset 1) */                                                       \
-   const uint32_t *rearranged_twiddle_tab_stride2_arr;        /**< Per stage reordered twiddle pointer (offset 2) */                                                       \
-   const uint32_t *rearranged_twiddle_tab_stride3_arr;        /**< Per stage reordered twiddle pointer (offset 3) */                                                       \
-   const q31_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */                                                                   \
-   const q31_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */                                                                   \
-   const q31_t *rearranged_twiddle_stride3;
-#endif
-  } arm_cfft_instance_q31;
-
-arm_status arm_cfft_init_q31(
-  arm_cfft_instance_q31 * S,
-  uint16_t fftLen);
-
-void arm_cfft_q31(
-    const arm_cfft_instance_q31 * S,
-          q31_t * p1,
-          uint8_t ifftFlag,
-          uint8_t bitReverseFlag);
-
-  /**
-   * @brief Instance structure for the floating-point CFFT/CIFFT function.
-   */
-  typedef struct
-  {
-          uint16_t fftLen;                   /**< length of the FFT. */
-    const float32_t *pTwiddle;         /**< points to the Twiddle factor table. */
-    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
-          uint16_t bitRevLength;             /**< bit reversal table length. */
-#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
-   const uint32_t *rearranged_twiddle_tab_stride1_arr;        /**< Per stage reordered twiddle pointer (offset 1) */                                                       \
-   const uint32_t *rearranged_twiddle_tab_stride2_arr;        /**< Per stage reordered twiddle pointer (offset 2) */                                                       \
-   const uint32_t *rearranged_twiddle_tab_stride3_arr;        /**< Per stage reordered twiddle pointer (offset 3) */                                                       \
-   const float32_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */                                                                   \
-   const float32_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */                                                                   \
-   const float32_t *rearranged_twiddle_stride3;
-#endif
-  } arm_cfft_instance_f32;
-
-
-  arm_status arm_cfft_init_f32(
-  arm_cfft_instance_f32 * S,
-  uint16_t fftLen);
-
-  void arm_cfft_f32(
-  const arm_cfft_instance_f32 * S,
-        float32_t * p1,
-        uint8_t ifftFlag,
-        uint8_t bitReverseFlag);
-
-
-  /**
-   * @brief Instance structure for the Double Precision Floating-point CFFT/CIFFT function.
-   */
-  typedef struct
-  {
-          uint16_t fftLen;                   /**< length of the FFT. */
-    const float64_t *pTwiddle;         /**< points to the Twiddle factor table. */
-    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
-          uint16_t bitRevLength;             /**< bit reversal table length. */
-  } arm_cfft_instance_f64;
-
-  void arm_cfft_f64(
-  const arm_cfft_instance_f64 * S,
-        float64_t * p1,
-        uint8_t ifftFlag,
-        uint8_t bitReverseFlag);
-
-  /**
-   * @brief Instance structure for the Q15 RFFT/RIFFT function.
-   */
-  typedef struct
-  {
-          uint32_t fftLenReal;                      /**< length of the real FFT. */
-          uint8_t ifftFlagR;                        /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
-          uint8_t bitReverseFlagR;                  /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
-          uint32_t twidCoefRModifier;               /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
-    const q15_t *pTwiddleAReal;                     /**< points to the real twiddle factor table. */
-    const q15_t *pTwiddleBReal;                     /**< points to the imag twiddle factor table. */
-#if defined(ARM_MATH_MVEI)
-    arm_cfft_instance_q15 cfftInst;
-#else
-    const arm_cfft_instance_q15 *pCfft;       /**< points to the complex FFT instance. */
-#endif
-  } arm_rfft_instance_q15;
-
-  arm_status arm_rfft_init_q15(
-        arm_rfft_instance_q15 * S,
-        uint32_t fftLenReal,
-        uint32_t ifftFlagR,
-        uint32_t bitReverseFlag);
-
-  void arm_rfft_q15(
-  const arm_rfft_instance_q15 * S,
-        q15_t * pSrc,
-        q15_t * pDst);
-
-  /**
-   * @brief Instance structure for the Q31 RFFT/RIFFT function.
-   */
-  typedef struct
-  {
-          uint32_t fftLenReal;                        /**< length of the real FFT. */
-          uint8_t ifftFlagR;                          /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
-          uint8_t bitReverseFlagR;                    /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
-          uint32_t twidCoefRModifier;                 /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
-    const q31_t *pTwiddleAReal;                       /**< points to the real twiddle factor table. */
-    const q31_t *pTwiddleBReal;                       /**< points to the imag twiddle factor table. */
-#if defined(ARM_MATH_MVEI)
-    arm_cfft_instance_q31 cfftInst;
-#else
-    const arm_cfft_instance_q31 *pCfft;         /**< points to the complex FFT instance. */
-#endif
-  } arm_rfft_instance_q31;
-
-  arm_status arm_rfft_init_q31(
-        arm_rfft_instance_q31 * S,
-        uint32_t fftLenReal,
-        uint32_t ifftFlagR,
-        uint32_t bitReverseFlag);
-
-  void arm_rfft_q31(
-  const arm_rfft_instance_q31 * S,
-        q31_t * pSrc,
-        q31_t * pDst);
-
-  /**
-   * @brief Instance structure for the floating-point RFFT/RIFFT function.
-   */
-  typedef struct
-  {
-          uint32_t fftLenReal;                        /**< length of the real FFT. */
-          uint16_t fftLenBy2;                         /**< length of the complex FFT. */
-          uint8_t ifftFlagR;                          /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
-          uint8_t bitReverseFlagR;                    /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
-          uint32_t twidCoefRModifier;                     /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
-    const float32_t *pTwiddleAReal;                   /**< points to the real twiddle factor table. */
-    const float32_t *pTwiddleBReal;                   /**< points to the imag twiddle factor table. */
-          arm_cfft_radix4_instance_f32 *pCfft;        /**< points to the complex FFT instance. */
-  } arm_rfft_instance_f32;
-
-  arm_status arm_rfft_init_f32(
-        arm_rfft_instance_f32 * S,
-        arm_cfft_radix4_instance_f32 * S_CFFT,
-        uint32_t fftLenReal,
-        uint32_t ifftFlagR,
-        uint32_t bitReverseFlag);
-
-  void arm_rfft_f32(
-  const arm_rfft_instance_f32 * S,
-        float32_t * pSrc,
-        float32_t * pDst);
-
-  /**
-   * @brief Instance structure for the Double Precision Floating-point RFFT/RIFFT function.
-   */
-typedef struct
-  {
-          arm_cfft_instance_f64 Sint;      /**< Internal CFFT structure. */
-          uint16_t fftLenRFFT;             /**< length of the real sequence */
-    const float64_t * pTwiddleRFFT;        /**< Twiddle factors real stage  */
-  } arm_rfft_fast_instance_f64 ;
-
-arm_status arm_rfft_fast_init_f64 (
-         arm_rfft_fast_instance_f64 * S,
-         uint16_t fftLen);
-
-
-void arm_rfft_fast_f64(
-    arm_rfft_fast_instance_f64 * S,
-    float64_t * p, float64_t * pOut,
-    uint8_t ifftFlag);
-
-
-  /**
-   * @brief Instance structure for the floating-point RFFT/RIFFT function.
-   */
-typedef struct
-  {
-          arm_cfft_instance_f32 Sint;      /**< Internal CFFT structure. */
-          uint16_t fftLenRFFT;             /**< length of the real sequence */
-    const float32_t * pTwiddleRFFT;        /**< Twiddle factors real stage  */
-  } arm_rfft_fast_instance_f32 ;
-
-arm_status arm_rfft_fast_init_f32 (
-         arm_rfft_fast_instance_f32 * S,
-         uint16_t fftLen);
-
-
-  void arm_rfft_fast_f32(
-        const arm_rfft_fast_instance_f32 * S,
-        float32_t * p, float32_t * pOut,
-        uint8_t ifftFlag);
-
-  /**
-   * @brief Instance structure for the floating-point DCT4/IDCT4 function.
-   */
-  typedef struct
-  {
-          uint16_t N;                          /**< length of the DCT4. */
-          uint16_t Nby2;                       /**< half of the length of the DCT4. */
-          float32_t normalize;                 /**< normalizing factor. */
-    const float32_t *pTwiddle;                 /**< points to the twiddle factor table. */
-    const float32_t *pCosFactor;               /**< points to the cosFactor table. */
-          arm_rfft_instance_f32 *pRfft;        /**< points to the real FFT instance. */
-          arm_cfft_radix4_instance_f32 *pCfft; /**< points to the complex FFT instance. */
-  } arm_dct4_instance_f32;
-
-
-  /**
-   * @brief  Initialization function for the floating-point DCT4/IDCT4.
-   * @param[in,out] S          points to an instance of floating-point DCT4/IDCT4 structure.
-   * @param[in]     S_RFFT     points to an instance of floating-point RFFT/RIFFT structure.
-   * @param[in]     S_CFFT     points to an instance of floating-point CFFT/CIFFT structure.
-   * @param[in]     N          length of the DCT4.
-   * @param[in]     Nby2       half of the length of the DCT4.
-   * @param[in]     normalize  normalizing factor.
-   * @return      arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if <code>fftLenReal</code> is not a supported transform length.
-   */
-  arm_status arm_dct4_init_f32(
-        arm_dct4_instance_f32 * S,
-        arm_rfft_instance_f32 * S_RFFT,
-        arm_cfft_radix4_instance_f32 * S_CFFT,
-        uint16_t N,
-        uint16_t Nby2,
-        float32_t normalize);
-
-
-  /**
-   * @brief Processing function for the floating-point DCT4/IDCT4.
-   * @param[in]     S              points to an instance of the floating-point DCT4/IDCT4 structure.
-   * @param[in]     pState         points to state buffer.
-   * @param[in,out] pInlineBuffer  points to the in-place input and output buffer.
-   */
-  void arm_dct4_f32(
-  const arm_dct4_instance_f32 * S,
-        float32_t * pState,
-        float32_t * pInlineBuffer);
-
-
-  /**
-   * @brief Instance structure for the Q31 DCT4/IDCT4 function.
-   */
-  typedef struct
-  {
-          uint16_t N;                          /**< length of the DCT4. */
-          uint16_t Nby2;                       /**< half of the length of the DCT4. */
-          q31_t normalize;                     /**< normalizing factor. */
-    const q31_t *pTwiddle;                     /**< points to the twiddle factor table. */
-    const q31_t *pCosFactor;                   /**< points to the cosFactor table. */
-          arm_rfft_instance_q31 *pRfft;        /**< points to the real FFT instance. */
-          arm_cfft_radix4_instance_q31 *pCfft; /**< points to the complex FFT instance. */
-  } arm_dct4_instance_q31;
-
-
-  /**
-   * @brief  Initialization function for the Q31 DCT4/IDCT4.
-   * @param[in,out] S          points to an instance of Q31 DCT4/IDCT4 structure.
-   * @param[in]     S_RFFT     points to an instance of Q31 RFFT/RIFFT structure
-   * @param[in]     S_CFFT     points to an instance of Q31 CFFT/CIFFT structure
-   * @param[in]     N          length of the DCT4.
-   * @param[in]     Nby2       half of the length of the DCT4.
-   * @param[in]     normalize  normalizing factor.
-   * @return      arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if <code>N</code> is not a supported transform length.
-   */
-  arm_status arm_dct4_init_q31(
-        arm_dct4_instance_q31 * S,
-        arm_rfft_instance_q31 * S_RFFT,
-        arm_cfft_radix4_instance_q31 * S_CFFT,
-        uint16_t N,
-        uint16_t Nby2,
-        q31_t normalize);
-
-
-  /**
-   * @brief Processing function for the Q31 DCT4/IDCT4.
-   * @param[in]     S              points to an instance of the Q31 DCT4 structure.
-   * @param[in]     pState         points to state buffer.
-   * @param[in,out] pInlineBuffer  points to the in-place input and output buffer.
-   */
-  void arm_dct4_q31(
-  const arm_dct4_instance_q31 * S,
-        q31_t * pState,
-        q31_t * pInlineBuffer);
-
-
-  /**
-   * @brief Instance structure for the Q15 DCT4/IDCT4 function.
-   */
-  typedef struct
-  {
-          uint16_t N;                          /**< length of the DCT4. */
-          uint16_t Nby2;                       /**< half of the length of the DCT4. */
-          q15_t normalize;                     /**< normalizing factor. */
-    const q15_t *pTwiddle;                     /**< points to the twiddle factor table. */
-    const q15_t *pCosFactor;                   /**< points to the cosFactor table. */
-          arm_rfft_instance_q15 *pRfft;        /**< points to the real FFT instance. */
-          arm_cfft_radix4_instance_q15 *pCfft; /**< points to the complex FFT instance. */
-  } arm_dct4_instance_q15;
-
-
-  /**
-   * @brief  Initialization function for the Q15 DCT4/IDCT4.
-   * @param[in,out] S          points to an instance of Q15 DCT4/IDCT4 structure.
-   * @param[in]     S_RFFT     points to an instance of Q15 RFFT/RIFFT structure.
-   * @param[in]     S_CFFT     points to an instance of Q15 CFFT/CIFFT structure.
-   * @param[in]     N          length of the DCT4.
-   * @param[in]     Nby2       half of the length of the DCT4.
-   * @param[in]     normalize  normalizing factor.
-   * @return      arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if <code>N</code> is not a supported transform length.
-   */
-  arm_status arm_dct4_init_q15(
-        arm_dct4_instance_q15 * S,
-        arm_rfft_instance_q15 * S_RFFT,
-        arm_cfft_radix4_instance_q15 * S_CFFT,
-        uint16_t N,
-        uint16_t Nby2,
-        q15_t normalize);
-
-
-  /**
-   * @brief Processing function for the Q15 DCT4/IDCT4.
-   * @param[in]     S              points to an instance of the Q15 DCT4 structure.
-   * @param[in]     pState         points to state buffer.
-   * @param[in,out] pInlineBuffer  points to the in-place input and output buffer.
-   */
-  void arm_dct4_q15(
-  const arm_dct4_instance_q15 * S,
-        q15_t * pState,
-        q15_t * pInlineBuffer);
-
-
-  /**
-   * @brief Floating-point vector addition.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_add_f32(
-  const float32_t * pSrcA,
-  const float32_t * pSrcB,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Q7 vector addition.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_add_q7(
-  const q7_t * pSrcA,
-  const q7_t * pSrcB,
-        q7_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Q15 vector addition.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_add_q15(
-  const q15_t * pSrcA,
-  const q15_t * pSrcB,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Q31 vector addition.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_add_q31(
-  const q31_t * pSrcA,
-  const q31_t * pSrcB,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Floating-point vector subtraction.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_sub_f32(
-  const float32_t * pSrcA,
-  const float32_t * pSrcB,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Q7 vector subtraction.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_sub_q7(
-  const q7_t * pSrcA,
-  const q7_t * pSrcB,
-        q7_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Q15 vector subtraction.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_sub_q15(
-  const q15_t * pSrcA,
-  const q15_t * pSrcB,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Q31 vector subtraction.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_sub_q31(
-  const q31_t * pSrcA,
-  const q31_t * pSrcB,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Multiplies a floating-point vector by a scalar.
-   * @param[in]  pSrc       points to the input vector
-   * @param[in]  scale      scale factor to be applied
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in the vector
-   */
-  void arm_scale_f32(
-  const float32_t * pSrc,
-        float32_t scale,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Multiplies a Q7 vector by a scalar.
-   * @param[in]  pSrc        points to the input vector
-   * @param[in]  scaleFract  fractional portion of the scale value
-   * @param[in]  shift       number of bits to shift the result by
-   * @param[out] pDst        points to the output vector
-   * @param[in]  blockSize   number of samples in the vector
-   */
-  void arm_scale_q7(
-  const q7_t * pSrc,
-        q7_t scaleFract,
-        int8_t shift,
-        q7_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Multiplies a Q15 vector by a scalar.
-   * @param[in]  pSrc        points to the input vector
-   * @param[in]  scaleFract  fractional portion of the scale value
-   * @param[in]  shift       number of bits to shift the result by
-   * @param[out] pDst        points to the output vector
-   * @param[in]  blockSize   number of samples in the vector
-   */
-  void arm_scale_q15(
-  const q15_t * pSrc,
-        q15_t scaleFract,
-        int8_t shift,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Multiplies a Q31 vector by a scalar.
-   * @param[in]  pSrc        points to the input vector
-   * @param[in]  scaleFract  fractional portion of the scale value
-   * @param[in]  shift       number of bits to shift the result by
-   * @param[out] pDst        points to the output vector
-   * @param[in]  blockSize   number of samples in the vector
-   */
-  void arm_scale_q31(
-  const q31_t * pSrc,
-        q31_t scaleFract,
-        int8_t shift,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Q7 vector absolute value.
-   * @param[in]  pSrc       points to the input buffer
-   * @param[out] pDst       points to the output buffer
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_abs_q7(
-  const q7_t * pSrc,
-        q7_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Floating-point vector absolute value.
-   * @param[in]  pSrc       points to the input buffer
-   * @param[out] pDst       points to the output buffer
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_abs_f32(
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Q15 vector absolute value.
-   * @param[in]  pSrc       points to the input buffer
-   * @param[out] pDst       points to the output buffer
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_abs_q15(
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Q31 vector absolute value.
-   * @param[in]  pSrc       points to the input buffer
-   * @param[out] pDst       points to the output buffer
-   * @param[in]  blockSize  number of samples in each vector
-   */
-  void arm_abs_q31(
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Dot product of floating-point vectors.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[in]  blockSize  number of samples in each vector
-   * @param[out] result     output result returned here
-   */
-  void arm_dot_prod_f32(
-  const float32_t * pSrcA,
-  const float32_t * pSrcB,
-        uint32_t blockSize,
-        float32_t * result);
-
-
-  /**
-   * @brief Dot product of Q7 vectors.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[in]  blockSize  number of samples in each vector
-   * @param[out] result     output result returned here
-   */
-  void arm_dot_prod_q7(
-  const q7_t * pSrcA,
-  const q7_t * pSrcB,
-        uint32_t blockSize,
-        q31_t * result);
-
-
-  /**
-   * @brief Dot product of Q15 vectors.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[in]  blockSize  number of samples in each vector
-   * @param[out] result     output result returned here
-   */
-  void arm_dot_prod_q15(
-  const q15_t * pSrcA,
-  const q15_t * pSrcB,
-        uint32_t blockSize,
-        q63_t * result);
-
-
-  /**
-   * @brief Dot product of Q31 vectors.
-   * @param[in]  pSrcA      points to the first input vector
-   * @param[in]  pSrcB      points to the second input vector
-   * @param[in]  blockSize  number of samples in each vector
-   * @param[out] result     output result returned here
-   */
-  void arm_dot_prod_q31(
-  const q31_t * pSrcA,
-  const q31_t * pSrcB,
-        uint32_t blockSize,
-        q63_t * result);
-
-
-  /**
-   * @brief  Shifts the elements of a Q7 vector a specified number of bits.
-   * @param[in]  pSrc       points to the input vector
-   * @param[in]  shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in the vector
-   */
-  void arm_shift_q7(
-  const q7_t * pSrc,
-        int8_t shiftBits,
-        q7_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Shifts the elements of a Q15 vector a specified number of bits.
-   * @param[in]  pSrc       points to the input vector
-   * @param[in]  shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in the vector
-   */
-  void arm_shift_q15(
-  const q15_t * pSrc,
-        int8_t shiftBits,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Shifts the elements of a Q31 vector a specified number of bits.
-   * @param[in]  pSrc       points to the input vector
-   * @param[in]  shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in the vector
-   */
-  void arm_shift_q31(
-  const q31_t * pSrc,
-        int8_t shiftBits,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Adds a constant offset to a floating-point vector.
-   * @param[in]  pSrc       points to the input vector
-   * @param[in]  offset     is the offset to be added
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in the vector
-   */
-  void arm_offset_f32(
-  const float32_t * pSrc,
-        float32_t offset,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Adds a constant offset to a Q7 vector.
-   * @param[in]  pSrc       points to the input vector
-   * @param[in]  offset     is the offset to be added
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in the vector
-   */
-  void arm_offset_q7(
-  const q7_t * pSrc,
-        q7_t offset,
-        q7_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Adds a constant offset to a Q15 vector.
-   * @param[in]  pSrc       points to the input vector
-   * @param[in]  offset     is the offset to be added
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in the vector
-   */
-  void arm_offset_q15(
-  const q15_t * pSrc,
-        q15_t offset,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Adds a constant offset to a Q31 vector.
-   * @param[in]  pSrc       points to the input vector
-   * @param[in]  offset     is the offset to be added
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in the vector
-   */
-  void arm_offset_q31(
-  const q31_t * pSrc,
-        q31_t offset,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Negates the elements of a floating-point vector.
-   * @param[in]  pSrc       points to the input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in the vector
-   */
-  void arm_negate_f32(
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Negates the elements of a Q7 vector.
-   * @param[in]  pSrc       points to the input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in the vector
-   */
-  void arm_negate_q7(
-  const q7_t * pSrc,
-        q7_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Negates the elements of a Q15 vector.
-   * @param[in]  pSrc       points to the input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in the vector
-   */
-  void arm_negate_q15(
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Negates the elements of a Q31 vector.
-   * @param[in]  pSrc       points to the input vector
-   * @param[out] pDst       points to the output vector
-   * @param[in]  blockSize  number of samples in the vector
-   */
-  void arm_negate_q31(
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Copies the elements of a floating-point vector.
-   * @param[in]  pSrc       input pointer
-   * @param[out] pDst       output pointer
-   * @param[in]  blockSize  number of samples to process
-   */
-  void arm_copy_f32(
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Copies the elements of a Q7 vector.
-   * @param[in]  pSrc       input pointer
-   * @param[out] pDst       output pointer
-   * @param[in]  blockSize  number of samples to process
-   */
-  void arm_copy_q7(
-  const q7_t * pSrc,
-        q7_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Copies the elements of a Q15 vector.
-   * @param[in]  pSrc       input pointer
-   * @param[out] pDst       output pointer
-   * @param[in]  blockSize  number of samples to process
-   */
-  void arm_copy_q15(
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Copies the elements of a Q31 vector.
-   * @param[in]  pSrc       input pointer
-   * @param[out] pDst       output pointer
-   * @param[in]  blockSize  number of samples to process
-   */
-  void arm_copy_q31(
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Fills a constant value into a floating-point vector.
-   * @param[in]  value      input value to be filled
-   * @param[out] pDst       output pointer
-   * @param[in]  blockSize  number of samples to process
-   */
-  void arm_fill_f32(
-        float32_t value,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Fills a constant value into a Q7 vector.
-   * @param[in]  value      input value to be filled
-   * @param[out] pDst       output pointer
-   * @param[in]  blockSize  number of samples to process
-   */
-  void arm_fill_q7(
-        q7_t value,
-        q7_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Fills a constant value into a Q15 vector.
-   * @param[in]  value      input value to be filled
-   * @param[out] pDst       output pointer
-   * @param[in]  blockSize  number of samples to process
-   */
-  void arm_fill_q15(
-        q15_t value,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Fills a constant value into a Q31 vector.
-   * @param[in]  value      input value to be filled
-   * @param[out] pDst       output pointer
-   * @param[in]  blockSize  number of samples to process
-   */
-  void arm_fill_q31(
-        q31_t value,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-/**
- * @brief Convolution of floating-point sequences.
- * @param[in]  pSrcA    points to the first input sequence.
- * @param[in]  srcALen  length of the first input sequence.
- * @param[in]  pSrcB    points to the second input sequence.
- * @param[in]  srcBLen  length of the second input sequence.
- * @param[out] pDst     points to the location where the output result is written.  Length srcALen+srcBLen-1.
- */
-  void arm_conv_f32(
-  const float32_t * pSrcA,
-        uint32_t srcALen,
-  const float32_t * pSrcB,
-        uint32_t srcBLen,
-        float32_t * pDst);
-
-
-  /**
-   * @brief Convolution of Q15 sequences.
-   * @param[in]  pSrcA      points to the first input sequence.
-   * @param[in]  srcALen    length of the first input sequence.
-   * @param[in]  pSrcB      points to the second input sequence.
-   * @param[in]  srcBLen    length of the second input sequence.
-   * @param[out] pDst       points to the block of output data  Length srcALen+srcBLen-1.
-   * @param[in]  pScratch1  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
-   * @param[in]  pScratch2  points to scratch buffer of size min(srcALen, srcBLen).
-   */
-  void arm_conv_opt_q15(
-  const q15_t * pSrcA,
-        uint32_t srcALen,
-  const q15_t * pSrcB,
-        uint32_t srcBLen,
-        q15_t * pDst,
-        q15_t * pScratch1,
-        q15_t * pScratch2);
-
-
-/**
- * @brief Convolution of Q15 sequences.
- * @param[in]  pSrcA    points to the first input sequence.
- * @param[in]  srcALen  length of the first input sequence.
- * @param[in]  pSrcB    points to the second input sequence.
- * @param[in]  srcBLen  length of the second input sequence.
- * @param[out] pDst     points to the location where the output result is written.  Length srcALen+srcBLen-1.
- */
-  void arm_conv_q15(
-  const q15_t * pSrcA,
-        uint32_t srcALen,
-  const q15_t * pSrcB,
-        uint32_t srcBLen,
-        q15_t * pDst);
-
-
-  /**
-   * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4
-   * @param[in]  pSrcA    points to the first input sequence.
-   * @param[in]  srcALen  length of the first input sequence.
-   * @param[in]  pSrcB    points to the second input sequence.
-   * @param[in]  srcBLen  length of the second input sequence.
-   * @param[out] pDst     points to the block of output data  Length srcALen+srcBLen-1.
-   */
-  void arm_conv_fast_q15(
-  const q15_t * pSrcA,
-        uint32_t srcALen,
-  const q15_t * pSrcB,
-        uint32_t srcBLen,
-        q15_t * pDst);
-
-
-  /**
-   * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4
-   * @param[in]  pSrcA      points to the first input sequence.
-   * @param[in]  srcALen    length of the first input sequence.
-   * @param[in]  pSrcB      points to the second input sequence.
-   * @param[in]  srcBLen    length of the second input sequence.
-   * @param[out] pDst       points to the block of output data  Length srcALen+srcBLen-1.
-   * @param[in]  pScratch1  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
-   * @param[in]  pScratch2  points to scratch buffer of size min(srcALen, srcBLen).
-   */
-  void arm_conv_fast_opt_q15(
-  const q15_t * pSrcA,
-        uint32_t srcALen,
-  const q15_t * pSrcB,
-        uint32_t srcBLen,
-        q15_t * pDst,
-        q15_t * pScratch1,
-        q15_t * pScratch2);
-
-
-  /**
-   * @brief Convolution of Q31 sequences.
-   * @param[in]  pSrcA    points to the first input sequence.
-   * @param[in]  srcALen  length of the first input sequence.
-   * @param[in]  pSrcB    points to the second input sequence.
-   * @param[in]  srcBLen  length of the second input sequence.
-   * @param[out] pDst     points to the block of output data  Length srcALen+srcBLen-1.
-   */
-  void arm_conv_q31(
-  const q31_t * pSrcA,
-        uint32_t srcALen,
-  const q31_t * pSrcB,
-        uint32_t srcBLen,
-        q31_t * pDst);
-
-
-  /**
-   * @brief Convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4
-   * @param[in]  pSrcA    points to the first input sequence.
-   * @param[in]  srcALen  length of the first input sequence.
-   * @param[in]  pSrcB    points to the second input sequence.
-   * @param[in]  srcBLen  length of the second input sequence.
-   * @param[out] pDst     points to the block of output data  Length srcALen+srcBLen-1.
-   */
-  void arm_conv_fast_q31(
-  const q31_t * pSrcA,
-        uint32_t srcALen,
-  const q31_t * pSrcB,
-        uint32_t srcBLen,
-        q31_t * pDst);
-
-
-    /**
-   * @brief Convolution of Q7 sequences.
-   * @param[in]  pSrcA      points to the first input sequence.
-   * @param[in]  srcALen    length of the first input sequence.
-   * @param[in]  pSrcB      points to the second input sequence.
-   * @param[in]  srcBLen    length of the second input sequence.
-   * @param[out] pDst       points to the block of output data  Length srcALen+srcBLen-1.
-   * @param[in]  pScratch1  points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
-   * @param[in]  pScratch2  points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
-   */
-  void arm_conv_opt_q7(
-  const q7_t * pSrcA,
-        uint32_t srcALen,
-  const q7_t * pSrcB,
-        uint32_t srcBLen,
-        q7_t * pDst,
-        q15_t * pScratch1,
-        q15_t * pScratch2);
-
-
-  /**
-   * @brief Convolution of Q7 sequences.
-   * @param[in]  pSrcA    points to the first input sequence.
-   * @param[in]  srcALen  length of the first input sequence.
-   * @param[in]  pSrcB    points to the second input sequence.
-   * @param[in]  srcBLen  length of the second input sequence.
-   * @param[out] pDst     points to the block of output data  Length srcALen+srcBLen-1.
-   */
-  void arm_conv_q7(
-  const q7_t * pSrcA,
-        uint32_t srcALen,
-  const q7_t * pSrcB,
-        uint32_t srcBLen,
-        q7_t * pDst);
-
-
-  /**
-   * @brief Partial convolution of floating-point sequences.
-   * @param[in]  pSrcA       points to the first input sequence.
-   * @param[in]  srcALen     length of the first input sequence.
-   * @param[in]  pSrcB       points to the second input sequence.
-   * @param[in]  srcBLen     length of the second input sequence.
-   * @param[out] pDst        points to the block of output data
-   * @param[in]  firstIndex  is the first output sample to start with.
-   * @param[in]  numPoints   is the number of output points to be computed.
-   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
-   */
-  arm_status arm_conv_partial_f32(
-  const float32_t * pSrcA,
-        uint32_t srcALen,
-  const float32_t * pSrcB,
-        uint32_t srcBLen,
-        float32_t * pDst,
-        uint32_t firstIndex,
-        uint32_t numPoints);
-
-
-  /**
-   * @brief Partial convolution of Q15 sequences.
-   * @param[in]  pSrcA       points to the first input sequence.
-   * @param[in]  srcALen     length of the first input sequence.
-   * @param[in]  pSrcB       points to the second input sequence.
-   * @param[in]  srcBLen     length of the second input sequence.
-   * @param[out] pDst        points to the block of output data
-   * @param[in]  firstIndex  is the first output sample to start with.
-   * @param[in]  numPoints   is the number of output points to be computed.
-   * @param[in]  pScratch1   points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
-   * @param[in]  pScratch2   points to scratch buffer of size min(srcALen, srcBLen).
-   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
-   */
-  arm_status arm_conv_partial_opt_q15(
-  const q15_t * pSrcA,
-        uint32_t srcALen,
-  const q15_t * pSrcB,
-        uint32_t srcBLen,
-        q15_t * pDst,
-        uint32_t firstIndex,
-        uint32_t numPoints,
-        q15_t * pScratch1,
-        q15_t * pScratch2);
-
-
-  /**
-   * @brief Partial convolution of Q15 sequences.
-   * @param[in]  pSrcA       points to the first input sequence.
-   * @param[in]  srcALen     length of the first input sequence.
-   * @param[in]  pSrcB       points to the second input sequence.
-   * @param[in]  srcBLen     length of the second input sequence.
-   * @param[out] pDst        points to the block of output data
-   * @param[in]  firstIndex  is the first output sample to start with.
-   * @param[in]  numPoints   is the number of output points to be computed.
-   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
-   */
-  arm_status arm_conv_partial_q15(
-  const q15_t * pSrcA,
-        uint32_t srcALen,
-  const q15_t * pSrcB,
-        uint32_t srcBLen,
-        q15_t * pDst,
-        uint32_t firstIndex,
-        uint32_t numPoints);
-
-
-  /**
-   * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4
-   * @param[in]  pSrcA       points to the first input sequence.
-   * @param[in]  srcALen     length of the first input sequence.
-   * @param[in]  pSrcB       points to the second input sequence.
-   * @param[in]  srcBLen     length of the second input sequence.
-   * @param[out] pDst        points to the block of output data
-   * @param[in]  firstIndex  is the first output sample to start with.
-   * @param[in]  numPoints   is the number of output points to be computed.
-   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
-   */
-  arm_status arm_conv_partial_fast_q15(
-  const q15_t * pSrcA,
-        uint32_t srcALen,
-  const q15_t * pSrcB,
-        uint32_t srcBLen,
-        q15_t * pDst,
-        uint32_t firstIndex,
-        uint32_t numPoints);
-
-
-  /**
-   * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4
-   * @param[in]  pSrcA       points to the first input sequence.
-   * @param[in]  srcALen     length of the first input sequence.
-   * @param[in]  pSrcB       points to the second input sequence.
-   * @param[in]  srcBLen     length of the second input sequence.
-   * @param[out] pDst        points to the block of output data
-   * @param[in]  firstIndex  is the first output sample to start with.
-   * @param[in]  numPoints   is the number of output points to be computed.
-   * @param[in]  pScratch1   points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
-   * @param[in]  pScratch2   points to scratch buffer of size min(srcALen, srcBLen).
-   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
-   */
-  arm_status arm_conv_partial_fast_opt_q15(
-  const q15_t * pSrcA,
-        uint32_t srcALen,
-  const q15_t * pSrcB,
-        uint32_t srcBLen,
-        q15_t * pDst,
-        uint32_t firstIndex,
-        uint32_t numPoints,
-        q15_t * pScratch1,
-        q15_t * pScratch2);
-
-
-  /**
-   * @brief Partial convolution of Q31 sequences.
-   * @param[in]  pSrcA       points to the first input sequence.
-   * @param[in]  srcALen     length of the first input sequence.
-   * @param[in]  pSrcB       points to the second input sequence.
-   * @param[in]  srcBLen     length of the second input sequence.
-   * @param[out] pDst        points to the block of output data
-   * @param[in]  firstIndex  is the first output sample to start with.
-   * @param[in]  numPoints   is the number of output points to be computed.
-   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
-   */
-  arm_status arm_conv_partial_q31(
-  const q31_t * pSrcA,
-        uint32_t srcALen,
-  const q31_t * pSrcB,
-        uint32_t srcBLen,
-        q31_t * pDst,
-        uint32_t firstIndex,
-        uint32_t numPoints);
-
-
-  /**
-   * @brief Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4
-   * @param[in]  pSrcA       points to the first input sequence.
-   * @param[in]  srcALen     length of the first input sequence.
-   * @param[in]  pSrcB       points to the second input sequence.
-   * @param[in]  srcBLen     length of the second input sequence.
-   * @param[out] pDst        points to the block of output data
-   * @param[in]  firstIndex  is the first output sample to start with.
-   * @param[in]  numPoints   is the number of output points to be computed.
-   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
-   */
-  arm_status arm_conv_partial_fast_q31(
-  const q31_t * pSrcA,
-        uint32_t srcALen,
-  const q31_t * pSrcB,
-        uint32_t srcBLen,
-        q31_t * pDst,
-        uint32_t firstIndex,
-        uint32_t numPoints);
-
-
-  /**
-   * @brief Partial convolution of Q7 sequences
-   * @param[in]  pSrcA       points to the first input sequence.
-   * @param[in]  srcALen     length of the first input sequence.
-   * @param[in]  pSrcB       points to the second input sequence.
-   * @param[in]  srcBLen     length of the second input sequence.
-   * @param[out] pDst        points to the block of output data
-   * @param[in]  firstIndex  is the first output sample to start with.
-   * @param[in]  numPoints   is the number of output points to be computed.
-   * @param[in]  pScratch1   points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
-   * @param[in]  pScratch2   points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
-   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
-   */
-  arm_status arm_conv_partial_opt_q7(
-  const q7_t * pSrcA,
-        uint32_t srcALen,
-  const q7_t * pSrcB,
-        uint32_t srcBLen,
-        q7_t * pDst,
-        uint32_t firstIndex,
-        uint32_t numPoints,
-        q15_t * pScratch1,
-        q15_t * pScratch2);
-
-
-/**
-   * @brief Partial convolution of Q7 sequences.
-   * @param[in]  pSrcA       points to the first input sequence.
-   * @param[in]  srcALen     length of the first input sequence.
-   * @param[in]  pSrcB       points to the second input sequence.
-   * @param[in]  srcBLen     length of the second input sequence.
-   * @param[out] pDst        points to the block of output data
-   * @param[in]  firstIndex  is the first output sample to start with.
-   * @param[in]  numPoints   is the number of output points to be computed.
-   * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
-   */
-  arm_status arm_conv_partial_q7(
-  const q7_t * pSrcA,
-        uint32_t srcALen,
-  const q7_t * pSrcB,
-        uint32_t srcBLen,
-        q7_t * pDst,
-        uint32_t firstIndex,
-        uint32_t numPoints);
-
-
-  /**
-   * @brief Instance structure for the Q15 FIR decimator.
-   */
-  typedef struct
-  {
-          uint8_t M;                  /**< decimation factor. */
-          uint16_t numTaps;           /**< number of coefficients in the filter. */
-    const q15_t *pCoeffs;             /**< points to the coefficient array. The array is of length numTaps.*/
-          q15_t *pState;              /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
-  } arm_fir_decimate_instance_q15;
-
-  /**
-   * @brief Instance structure for the Q31 FIR decimator.
-   */
-  typedef struct
-  {
-          uint8_t M;                  /**< decimation factor. */
-          uint16_t numTaps;           /**< number of coefficients in the filter. */
-    const q31_t *pCoeffs;             /**< points to the coefficient array. The array is of length numTaps.*/
-          q31_t *pState;              /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
-  } arm_fir_decimate_instance_q31;
-
-/**
-  @brief Instance structure for floating-point FIR decimator.
- */
-typedef struct
-  {
-          uint8_t M;                  /**< decimation factor. */
-          uint16_t numTaps;           /**< number of coefficients in the filter. */
-    const float32_t *pCoeffs;         /**< points to the coefficient array. The array is of length numTaps.*/
-          float32_t *pState;          /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
-  } arm_fir_decimate_instance_f32;
-
-
-/**
-  @brief         Processing function for floating-point FIR decimator.
-  @param[in]     S         points to an instance of the floating-point FIR decimator structure
-  @param[in]     pSrc      points to the block of input data
-  @param[out]    pDst      points to the block of output data
-  @param[in]     blockSize number of samples to process
- */
-void arm_fir_decimate_f32(
-  const arm_fir_decimate_instance_f32 * S,
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-/**
-  @brief         Initialization function for the floating-point FIR decimator.
-  @param[in,out] S          points to an instance of the floating-point FIR decimator structure
-  @param[in]     numTaps    number of coefficients in the filter
-  @param[in]     M          decimation factor
-  @param[in]     pCoeffs    points to the filter coefficients
-  @param[in]     pState     points to the state buffer
-  @param[in]     blockSize  number of input samples to process per call
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS      : Operation successful
-                   - \ref ARM_MATH_LENGTH_ERROR : <code>blockSize</code> is not a multiple of <code>M</code>
- */
-arm_status arm_fir_decimate_init_f32(
-        arm_fir_decimate_instance_f32 * S,
-        uint16_t numTaps,
-        uint8_t M,
-  const float32_t * pCoeffs,
-        float32_t * pState,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Processing function for the Q15 FIR decimator.
-   * @param[in]  S          points to an instance of the Q15 FIR decimator structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data
-   * @param[in]  blockSize  number of input samples to process per call.
-   */
-  void arm_fir_decimate_q15(
-  const arm_fir_decimate_instance_q15 * S,
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
-   * @param[in]  S          points to an instance of the Q15 FIR decimator structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data
-   * @param[in]  blockSize  number of input samples to process per call.
-   */
-  void arm_fir_decimate_fast_q15(
-  const arm_fir_decimate_instance_q15 * S,
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Initialization function for the Q15 FIR decimator.
-   * @param[in,out] S          points to an instance of the Q15 FIR decimator structure.
-   * @param[in]     numTaps    number of coefficients in the filter.
-   * @param[in]     M          decimation factor.
-   * @param[in]     pCoeffs    points to the filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     blockSize  number of input samples to process per call.
-   * @return    The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if
-   * <code>blockSize</code> is not a multiple of <code>M</code>.
-   */
-  arm_status arm_fir_decimate_init_q15(
-        arm_fir_decimate_instance_q15 * S,
-        uint16_t numTaps,
-        uint8_t M,
-  const q15_t * pCoeffs,
-        q15_t * pState,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Processing function for the Q31 FIR decimator.
-   * @param[in]  S     points to an instance of the Q31 FIR decimator structure.
-   * @param[in]  pSrc  points to the block of input data.
-   * @param[out] pDst  points to the block of output data
-   * @param[in] blockSize number of input samples to process per call.
-   */
-  void arm_fir_decimate_q31(
-  const arm_fir_decimate_instance_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-  /**
-   * @brief Processing function for the Q31 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
-   * @param[in]  S          points to an instance of the Q31 FIR decimator structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data
-   * @param[in]  blockSize  number of input samples to process per call.
-   */
-  void arm_fir_decimate_fast_q31(
-  const arm_fir_decimate_instance_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Initialization function for the Q31 FIR decimator.
-   * @param[in,out] S          points to an instance of the Q31 FIR decimator structure.
-   * @param[in]     numTaps    number of coefficients in the filter.
-   * @param[in]     M          decimation factor.
-   * @param[in]     pCoeffs    points to the filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     blockSize  number of input samples to process per call.
-   * @return    The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if
-   * <code>blockSize</code> is not a multiple of <code>M</code>.
-   */
-  arm_status arm_fir_decimate_init_q31(
-        arm_fir_decimate_instance_q31 * S,
-        uint16_t numTaps,
-        uint8_t M,
-  const q31_t * pCoeffs,
-        q31_t * pState,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Instance structure for the Q15 FIR interpolator.
-   */
-  typedef struct
-  {
-        uint8_t L;                      /**< upsample factor. */
-        uint16_t phaseLength;           /**< length of each polyphase filter component. */
-  const q15_t *pCoeffs;                 /**< points to the coefficient array. The array is of length L*phaseLength. */
-        q15_t *pState;                  /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */
-  } arm_fir_interpolate_instance_q15;
-
-  /**
-   * @brief Instance structure for the Q31 FIR interpolator.
-   */
-  typedef struct
-  {
-        uint8_t L;                      /**< upsample factor. */
-        uint16_t phaseLength;           /**< length of each polyphase filter component. */
-  const q31_t *pCoeffs;                 /**< points to the coefficient array. The array is of length L*phaseLength. */
-        q31_t *pState;                  /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */
-  } arm_fir_interpolate_instance_q31;
-
-  /**
-   * @brief Instance structure for the floating-point FIR interpolator.
-   */
-  typedef struct
-  {
-        uint8_t L;                     /**< upsample factor. */
-        uint16_t phaseLength;          /**< length of each polyphase filter component. */
-  const float32_t *pCoeffs;            /**< points to the coefficient array. The array is of length L*phaseLength. */
-        float32_t *pState;             /**< points to the state variable array. The array is of length phaseLength+numTaps-1. */
-  } arm_fir_interpolate_instance_f32;
-
-
-  /**
-   * @brief Processing function for the Q15 FIR interpolator.
-   * @param[in]  S          points to an instance of the Q15 FIR interpolator structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of input samples to process per call.
-   */
-  void arm_fir_interpolate_q15(
-  const arm_fir_interpolate_instance_q15 * S,
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Initialization function for the Q15 FIR interpolator.
-   * @param[in,out] S          points to an instance of the Q15 FIR interpolator structure.
-   * @param[in]     L          upsample factor.
-   * @param[in]     numTaps    number of filter coefficients in the filter.
-   * @param[in]     pCoeffs    points to the filter coefficient buffer.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     blockSize  number of input samples to process per call.
-   * @return        The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if
-   * the filter length <code>numTaps</code> is not a multiple of the interpolation factor <code>L</code>.
-   */
-  arm_status arm_fir_interpolate_init_q15(
-        arm_fir_interpolate_instance_q15 * S,
-        uint8_t L,
-        uint16_t numTaps,
-  const q15_t * pCoeffs,
-        q15_t * pState,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Processing function for the Q31 FIR interpolator.
-   * @param[in]  S          points to an instance of the Q15 FIR interpolator structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of input samples to process per call.
-   */
-  void arm_fir_interpolate_q31(
-  const arm_fir_interpolate_instance_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Initialization function for the Q31 FIR interpolator.
-   * @param[in,out] S          points to an instance of the Q31 FIR interpolator structure.
-   * @param[in]     L          upsample factor.
-   * @param[in]     numTaps    number of filter coefficients in the filter.
-   * @param[in]     pCoeffs    points to the filter coefficient buffer.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     blockSize  number of input samples to process per call.
-   * @return        The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if
-   * the filter length <code>numTaps</code> is not a multiple of the interpolation factor <code>L</code>.
-   */
-  arm_status arm_fir_interpolate_init_q31(
-        arm_fir_interpolate_instance_q31 * S,
-        uint8_t L,
-        uint16_t numTaps,
-  const q31_t * pCoeffs,
-        q31_t * pState,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Processing function for the floating-point FIR interpolator.
-   * @param[in]  S          points to an instance of the floating-point FIR interpolator structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of input samples to process per call.
-   */
-  void arm_fir_interpolate_f32(
-  const arm_fir_interpolate_instance_f32 * S,
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Initialization function for the floating-point FIR interpolator.
-   * @param[in,out] S          points to an instance of the floating-point FIR interpolator structure.
-   * @param[in]     L          upsample factor.
-   * @param[in]     numTaps    number of filter coefficients in the filter.
-   * @param[in]     pCoeffs    points to the filter coefficient buffer.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     blockSize  number of input samples to process per call.
-   * @return        The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_LENGTH_ERROR if
-   * the filter length <code>numTaps</code> is not a multiple of the interpolation factor <code>L</code>.
-   */
-  arm_status arm_fir_interpolate_init_f32(
-        arm_fir_interpolate_instance_f32 * S,
-        uint8_t L,
-        uint16_t numTaps,
-  const float32_t * pCoeffs,
-        float32_t * pState,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Instance structure for the high precision Q31 Biquad cascade filter.
-   */
-  typedef struct
-  {
-          uint8_t numStages;       /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
-          q63_t *pState;           /**< points to the array of state coefficients.  The array is of length 4*numStages. */
-    const q31_t *pCoeffs;          /**< points to the array of coefficients.  The array is of length 5*numStages. */
-          uint8_t postShift;       /**< additional shift, in bits, applied to each output sample. */
-  } arm_biquad_cas_df1_32x64_ins_q31;
-
-
-  /**
-   * @param[in]  S          points to an instance of the high precision Q31 Biquad cascade filter structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_biquad_cas_df1_32x64_q31(
-  const arm_biquad_cas_df1_32x64_ins_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @param[in,out] S          points to an instance of the high precision Q31 Biquad cascade filter structure.
-   * @param[in]     numStages  number of 2nd order stages in the filter.
-   * @param[in]     pCoeffs    points to the filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     postShift  shift to be applied to the output. Varies according to the coefficients format
-   */
-  void arm_biquad_cas_df1_32x64_init_q31(
-        arm_biquad_cas_df1_32x64_ins_q31 * S,
-        uint8_t numStages,
-  const q31_t * pCoeffs,
-        q63_t * pState,
-        uint8_t postShift);
-
-
-  /**
-   * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter.
-   */
-  typedef struct
-  {
-          uint8_t numStages;         /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
-          float32_t *pState;         /**< points to the array of state coefficients.  The array is of length 2*numStages. */
-    const float32_t *pCoeffs;        /**< points to the array of coefficients.  The array is of length 5*numStages. */
-  } arm_biquad_cascade_df2T_instance_f32;
-
-  /**
-   * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter.
-   */
-  typedef struct
-  {
-          uint8_t numStages;         /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
-          float32_t *pState;         /**< points to the array of state coefficients.  The array is of length 4*numStages. */
-    const float32_t *pCoeffs;        /**< points to the array of coefficients.  The array is of length 5*numStages. */
-  } arm_biquad_cascade_stereo_df2T_instance_f32;
-
-  /**
-   * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter.
-   */
-  typedef struct
-  {
-          uint8_t numStages;         /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
-          float64_t *pState;         /**< points to the array of state coefficients.  The array is of length 2*numStages. */
-    const float64_t *pCoeffs;        /**< points to the array of coefficients.  The array is of length 5*numStages. */
-  } arm_biquad_cascade_df2T_instance_f64;
-
-
-  /**
-   * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
-   * @param[in]  S          points to an instance of the filter data structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_biquad_cascade_df2T_f32(
-  const arm_biquad_cascade_df2T_instance_f32 * S,
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. 2 channels
-   * @param[in]  S          points to an instance of the filter data structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_biquad_cascade_stereo_df2T_f32(
-  const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
-   * @param[in]  S          points to an instance of the filter data structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_biquad_cascade_df2T_f64(
-  const arm_biquad_cascade_df2T_instance_f64 * S,
-  const float64_t * pSrc,
-        float64_t * pDst,
-        uint32_t blockSize);
-
-
-#if defined(ARM_MATH_NEON) 
-void arm_biquad_cascade_df2T_compute_coefs_f32(
-  arm_biquad_cascade_df2T_instance_f32 * S,
-  uint8_t numStages,
-  float32_t * pCoeffs);
-#endif
-  /**
-   * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.
-   * @param[in,out] S          points to an instance of the filter data structure.
-   * @param[in]     numStages  number of 2nd order stages in the filter.
-   * @param[in]     pCoeffs    points to the filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   */
-  void arm_biquad_cascade_df2T_init_f32(
-        arm_biquad_cascade_df2T_instance_f32 * S,
-        uint8_t numStages,
-  const float32_t * pCoeffs,
-        float32_t * pState);
-
-
-  /**
-   * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.
-   * @param[in,out] S          points to an instance of the filter data structure.
-   * @param[in]     numStages  number of 2nd order stages in the filter.
-   * @param[in]     pCoeffs    points to the filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   */
-  void arm_biquad_cascade_stereo_df2T_init_f32(
-        arm_biquad_cascade_stereo_df2T_instance_f32 * S,
-        uint8_t numStages,
-  const float32_t * pCoeffs,
-        float32_t * pState);
-
-
-  /**
-   * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.
-   * @param[in,out] S          points to an instance of the filter data structure.
-   * @param[in]     numStages  number of 2nd order stages in the filter.
-   * @param[in]     pCoeffs    points to the filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   */
-  void arm_biquad_cascade_df2T_init_f64(
-        arm_biquad_cascade_df2T_instance_f64 * S,
-        uint8_t numStages,
-        const float64_t * pCoeffs,
-        float64_t * pState);
-
-
-  /**
-   * @brief Instance structure for the Q15 FIR lattice filter.
-   */
-  typedef struct
-  {
-          uint16_t numStages;                  /**< number of filter stages. */
-          q15_t *pState;                       /**< points to the state variable array. The array is of length numStages. */
-    const q15_t *pCoeffs;                      /**< points to the coefficient array. The array is of length numStages. */
-  } arm_fir_lattice_instance_q15;
-
-  /**
-   * @brief Instance structure for the Q31 FIR lattice filter.
-   */
-  typedef struct
-  {
-          uint16_t numStages;                  /**< number of filter stages. */
-          q31_t *pState;                       /**< points to the state variable array. The array is of length numStages. */
-    const q31_t *pCoeffs;                      /**< points to the coefficient array. The array is of length numStages. */
-  } arm_fir_lattice_instance_q31;
-
-  /**
-   * @brief Instance structure for the floating-point FIR lattice filter.
-   */
-  typedef struct
-  {
-          uint16_t numStages;                  /**< number of filter stages. */
-          float32_t *pState;                   /**< points to the state variable array. The array is of length numStages. */
-    const float32_t *pCoeffs;                  /**< points to the coefficient array. The array is of length numStages. */
-  } arm_fir_lattice_instance_f32;
-
-
-  /**
-   * @brief Initialization function for the Q15 FIR lattice filter.
-   * @param[in] S          points to an instance of the Q15 FIR lattice structure.
-   * @param[in] numStages  number of filter stages.
-   * @param[in] pCoeffs    points to the coefficient buffer.  The array is of length numStages.
-   * @param[in] pState     points to the state buffer.  The array is of length numStages.
-   */
-  void arm_fir_lattice_init_q15(
-        arm_fir_lattice_instance_q15 * S,
-        uint16_t numStages,
-  const q15_t * pCoeffs,
-        q15_t * pState);
-
-
-  /**
-   * @brief Processing function for the Q15 FIR lattice filter.
-   * @param[in]  S          points to an instance of the Q15 FIR lattice structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_fir_lattice_q15(
-  const arm_fir_lattice_instance_q15 * S,
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Initialization function for the Q31 FIR lattice filter.
-   * @param[in] S          points to an instance of the Q31 FIR lattice structure.
-   * @param[in] numStages  number of filter stages.
-   * @param[in] pCoeffs    points to the coefficient buffer.  The array is of length numStages.
-   * @param[in] pState     points to the state buffer.   The array is of length numStages.
-   */
-  void arm_fir_lattice_init_q31(
-        arm_fir_lattice_instance_q31 * S,
-        uint16_t numStages,
-  const q31_t * pCoeffs,
-        q31_t * pState);
-
-
-  /**
-   * @brief Processing function for the Q31 FIR lattice filter.
-   * @param[in]  S          points to an instance of the Q31 FIR lattice structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_fir_lattice_q31(
-  const arm_fir_lattice_instance_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-/**
- * @brief Initialization function for the floating-point FIR lattice filter.
- * @param[in] S          points to an instance of the floating-point FIR lattice structure.
- * @param[in] numStages  number of filter stages.
- * @param[in] pCoeffs    points to the coefficient buffer.  The array is of length numStages.
- * @param[in] pState     points to the state buffer.  The array is of length numStages.
- */
-  void arm_fir_lattice_init_f32(
-        arm_fir_lattice_instance_f32 * S,
-        uint16_t numStages,
-  const float32_t * pCoeffs,
-        float32_t * pState);
-
-
-  /**
-   * @brief Processing function for the floating-point FIR lattice filter.
-   * @param[in]  S          points to an instance of the floating-point FIR lattice structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_fir_lattice_f32(
-  const arm_fir_lattice_instance_f32 * S,
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Instance structure for the Q15 IIR lattice filter.
-   */
-  typedef struct
-  {
-          uint16_t numStages;                  /**< number of stages in the filter. */
-          q15_t *pState;                       /**< points to the state variable array. The array is of length numStages+blockSize. */
-          q15_t *pkCoeffs;                     /**< points to the reflection coefficient array. The array is of length numStages. */
-          q15_t *pvCoeffs;                     /**< points to the ladder coefficient array. The array is of length numStages+1. */
-  } arm_iir_lattice_instance_q15;
-
-  /**
-   * @brief Instance structure for the Q31 IIR lattice filter.
-   */
-  typedef struct
-  {
-          uint16_t numStages;                  /**< number of stages in the filter. */
-          q31_t *pState;                       /**< points to the state variable array. The array is of length numStages+blockSize. */
-          q31_t *pkCoeffs;                     /**< points to the reflection coefficient array. The array is of length numStages. */
-          q31_t *pvCoeffs;                     /**< points to the ladder coefficient array. The array is of length numStages+1. */
-  } arm_iir_lattice_instance_q31;
-
-  /**
-   * @brief Instance structure for the floating-point IIR lattice filter.
-   */
-  typedef struct
-  {
-          uint16_t numStages;                  /**< number of stages in the filter. */
-          float32_t *pState;                   /**< points to the state variable array. The array is of length numStages+blockSize. */
-          float32_t *pkCoeffs;                 /**< points to the reflection coefficient array. The array is of length numStages. */
-          float32_t *pvCoeffs;                 /**< points to the ladder coefficient array. The array is of length numStages+1. */
-  } arm_iir_lattice_instance_f32;
-
-
-  /**
-   * @brief Processing function for the floating-point IIR lattice filter.
-   * @param[in]  S          points to an instance of the floating-point IIR lattice structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_iir_lattice_f32(
-  const arm_iir_lattice_instance_f32 * S,
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Initialization function for the floating-point IIR lattice filter.
-   * @param[in] S          points to an instance of the floating-point IIR lattice structure.
-   * @param[in] numStages  number of stages in the filter.
-   * @param[in] pkCoeffs   points to the reflection coefficient buffer.  The array is of length numStages.
-   * @param[in] pvCoeffs   points to the ladder coefficient buffer.  The array is of length numStages+1.
-   * @param[in] pState     points to the state buffer.  The array is of length numStages+blockSize-1.
-   * @param[in] blockSize  number of samples to process.
-   */
-  void arm_iir_lattice_init_f32(
-        arm_iir_lattice_instance_f32 * S,
-        uint16_t numStages,
-        float32_t * pkCoeffs,
-        float32_t * pvCoeffs,
-        float32_t * pState,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Processing function for the Q31 IIR lattice filter.
-   * @param[in]  S          points to an instance of the Q31 IIR lattice structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_iir_lattice_q31(
-  const arm_iir_lattice_instance_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Initialization function for the Q31 IIR lattice filter.
-   * @param[in] S          points to an instance of the Q31 IIR lattice structure.
-   * @param[in] numStages  number of stages in the filter.
-   * @param[in] pkCoeffs   points to the reflection coefficient buffer.  The array is of length numStages.
-   * @param[in] pvCoeffs   points to the ladder coefficient buffer.  The array is of length numStages+1.
-   * @param[in] pState     points to the state buffer.  The array is of length numStages+blockSize.
-   * @param[in] blockSize  number of samples to process.
-   */
-  void arm_iir_lattice_init_q31(
-        arm_iir_lattice_instance_q31 * S,
-        uint16_t numStages,
-        q31_t * pkCoeffs,
-        q31_t * pvCoeffs,
-        q31_t * pState,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Processing function for the Q15 IIR lattice filter.
-   * @param[in]  S          points to an instance of the Q15 IIR lattice structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_iir_lattice_q15(
-  const arm_iir_lattice_instance_q15 * S,
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-/**
- * @brief Initialization function for the Q15 IIR lattice filter.
- * @param[in] S          points to an instance of the fixed-point Q15 IIR lattice structure.
- * @param[in] numStages  number of stages in the filter.
- * @param[in] pkCoeffs   points to reflection coefficient buffer.  The array is of length numStages.
- * @param[in] pvCoeffs   points to ladder coefficient buffer.  The array is of length numStages+1.
- * @param[in] pState     points to state buffer.  The array is of length numStages+blockSize.
- * @param[in] blockSize  number of samples to process per call.
- */
-  void arm_iir_lattice_init_q15(
-        arm_iir_lattice_instance_q15 * S,
-        uint16_t numStages,
-        q15_t * pkCoeffs,
-        q15_t * pvCoeffs,
-        q15_t * pState,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Instance structure for the floating-point LMS filter.
-   */
-  typedef struct
-  {
-          uint16_t numTaps;    /**< number of coefficients in the filter. */
-          float32_t *pState;   /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
-          float32_t *pCoeffs;  /**< points to the coefficient array. The array is of length numTaps. */
-          float32_t mu;        /**< step size that controls filter coefficient updates. */
-  } arm_lms_instance_f32;
-
-
-  /**
-   * @brief Processing function for floating-point LMS filter.
-   * @param[in]  S          points to an instance of the floating-point LMS filter structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[in]  pRef       points to the block of reference data.
-   * @param[out] pOut       points to the block of output data.
-   * @param[out] pErr       points to the block of error data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_lms_f32(
-  const arm_lms_instance_f32 * S,
-  const float32_t * pSrc,
-        float32_t * pRef,
-        float32_t * pOut,
-        float32_t * pErr,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Initialization function for floating-point LMS filter.
-   * @param[in] S          points to an instance of the floating-point LMS filter structure.
-   * @param[in] numTaps    number of filter coefficients.
-   * @param[in] pCoeffs    points to the coefficient buffer.
-   * @param[in] pState     points to state buffer.
-   * @param[in] mu         step size that controls filter coefficient updates.
-   * @param[in] blockSize  number of samples to process.
-   */
-  void arm_lms_init_f32(
-        arm_lms_instance_f32 * S,
-        uint16_t numTaps,
-        float32_t * pCoeffs,
-        float32_t * pState,
-        float32_t mu,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Instance structure for the Q15 LMS filter.
-   */
-  typedef struct
-  {
-          uint16_t numTaps;    /**< number of coefficients in the filter. */
-          q15_t *pState;       /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
-          q15_t *pCoeffs;      /**< points to the coefficient array. The array is of length numTaps. */
-          q15_t mu;            /**< step size that controls filter coefficient updates. */
-          uint32_t postShift;  /**< bit shift applied to coefficients. */
-  } arm_lms_instance_q15;
-
-
-  /**
-   * @brief Initialization function for the Q15 LMS filter.
-   * @param[in] S          points to an instance of the Q15 LMS filter structure.
-   * @param[in] numTaps    number of filter coefficients.
-   * @param[in] pCoeffs    points to the coefficient buffer.
-   * @param[in] pState     points to the state buffer.
-   * @param[in] mu         step size that controls filter coefficient updates.
-   * @param[in] blockSize  number of samples to process.
-   * @param[in] postShift  bit shift applied to coefficients.
-   */
-  void arm_lms_init_q15(
-        arm_lms_instance_q15 * S,
-        uint16_t numTaps,
-        q15_t * pCoeffs,
-        q15_t * pState,
-        q15_t mu,
-        uint32_t blockSize,
-        uint32_t postShift);
-
-
-  /**
-   * @brief Processing function for Q15 LMS filter.
-   * @param[in]  S          points to an instance of the Q15 LMS filter structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[in]  pRef       points to the block of reference data.
-   * @param[out] pOut       points to the block of output data.
-   * @param[out] pErr       points to the block of error data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_lms_q15(
-  const arm_lms_instance_q15 * S,
-  const q15_t * pSrc,
-        q15_t * pRef,
-        q15_t * pOut,
-        q15_t * pErr,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Instance structure for the Q31 LMS filter.
-   */
-  typedef struct
-  {
-          uint16_t numTaps;    /**< number of coefficients in the filter. */
-          q31_t *pState;       /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
-          q31_t *pCoeffs;      /**< points to the coefficient array. The array is of length numTaps. */
-          q31_t mu;            /**< step size that controls filter coefficient updates. */
-          uint32_t postShift;  /**< bit shift applied to coefficients. */
-  } arm_lms_instance_q31;
-
-
-  /**
-   * @brief Processing function for Q31 LMS filter.
-   * @param[in]  S          points to an instance of the Q15 LMS filter structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[in]  pRef       points to the block of reference data.
-   * @param[out] pOut       points to the block of output data.
-   * @param[out] pErr       points to the block of error data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_lms_q31(
-  const arm_lms_instance_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pRef,
-        q31_t * pOut,
-        q31_t * pErr,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Initialization function for Q31 LMS filter.
-   * @param[in] S          points to an instance of the Q31 LMS filter structure.
-   * @param[in] numTaps    number of filter coefficients.
-   * @param[in] pCoeffs    points to coefficient buffer.
-   * @param[in] pState     points to state buffer.
-   * @param[in] mu         step size that controls filter coefficient updates.
-   * @param[in] blockSize  number of samples to process.
-   * @param[in] postShift  bit shift applied to coefficients.
-   */
-  void arm_lms_init_q31(
-        arm_lms_instance_q31 * S,
-        uint16_t numTaps,
-        q31_t * pCoeffs,
-        q31_t * pState,
-        q31_t mu,
-        uint32_t blockSize,
-        uint32_t postShift);
-
-
-  /**
-   * @brief Instance structure for the floating-point normalized LMS filter.
-   */
-  typedef struct
-  {
-          uint16_t numTaps;     /**< number of coefficients in the filter. */
-          float32_t *pState;    /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
-          float32_t *pCoeffs;   /**< points to the coefficient array. The array is of length numTaps. */
-          float32_t mu;         /**< step size that control filter coefficient updates. */
-          float32_t energy;     /**< saves previous frame energy. */
-          float32_t x0;         /**< saves previous input sample. */
-  } arm_lms_norm_instance_f32;
-
-
-  /**
-   * @brief Processing function for floating-point normalized LMS filter.
-   * @param[in]  S          points to an instance of the floating-point normalized LMS filter structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[in]  pRef       points to the block of reference data.
-   * @param[out] pOut       points to the block of output data.
-   * @param[out] pErr       points to the block of error data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_lms_norm_f32(
-        arm_lms_norm_instance_f32 * S,
-  const float32_t * pSrc,
-        float32_t * pRef,
-        float32_t * pOut,
-        float32_t * pErr,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Initialization function for floating-point normalized LMS filter.
-   * @param[in] S          points to an instance of the floating-point LMS filter structure.
-   * @param[in] numTaps    number of filter coefficients.
-   * @param[in] pCoeffs    points to coefficient buffer.
-   * @param[in] pState     points to state buffer.
-   * @param[in] mu         step size that controls filter coefficient updates.
-   * @param[in] blockSize  number of samples to process.
-   */
-  void arm_lms_norm_init_f32(
-        arm_lms_norm_instance_f32 * S,
-        uint16_t numTaps,
-        float32_t * pCoeffs,
-        float32_t * pState,
-        float32_t mu,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Instance structure for the Q31 normalized LMS filter.
-   */
-  typedef struct
-  {
-          uint16_t numTaps;     /**< number of coefficients in the filter. */
-          q31_t *pState;        /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
-          q31_t *pCoeffs;       /**< points to the coefficient array. The array is of length numTaps. */
-          q31_t mu;             /**< step size that controls filter coefficient updates. */
-          uint8_t postShift;    /**< bit shift applied to coefficients. */
-    const q31_t *recipTable;    /**< points to the reciprocal initial value table. */
-          q31_t energy;         /**< saves previous frame energy. */
-          q31_t x0;             /**< saves previous input sample. */
-  } arm_lms_norm_instance_q31;
-
-
-  /**
-   * @brief Processing function for Q31 normalized LMS filter.
-   * @param[in]  S          points to an instance of the Q31 normalized LMS filter structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[in]  pRef       points to the block of reference data.
-   * @param[out] pOut       points to the block of output data.
-   * @param[out] pErr       points to the block of error data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_lms_norm_q31(
-        arm_lms_norm_instance_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pRef,
-        q31_t * pOut,
-        q31_t * pErr,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Initialization function for Q31 normalized LMS filter.
-   * @param[in] S          points to an instance of the Q31 normalized LMS filter structure.
-   * @param[in] numTaps    number of filter coefficients.
-   * @param[in] pCoeffs    points to coefficient buffer.
-   * @param[in] pState     points to state buffer.
-   * @param[in] mu         step size that controls filter coefficient updates.
-   * @param[in] blockSize  number of samples to process.
-   * @param[in] postShift  bit shift applied to coefficients.
-   */
-  void arm_lms_norm_init_q31(
-        arm_lms_norm_instance_q31 * S,
-        uint16_t numTaps,
-        q31_t * pCoeffs,
-        q31_t * pState,
-        q31_t mu,
-        uint32_t blockSize,
-        uint8_t postShift);
-
-
-  /**
-   * @brief Instance structure for the Q15 normalized LMS filter.
-   */
-  typedef struct
-  {
-          uint16_t numTaps;     /**< Number of coefficients in the filter. */
-          q15_t *pState;        /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
-          q15_t *pCoeffs;       /**< points to the coefficient array. The array is of length numTaps. */
-          q15_t mu;             /**< step size that controls filter coefficient updates. */
-          uint8_t postShift;    /**< bit shift applied to coefficients. */
-    const q15_t *recipTable;    /**< Points to the reciprocal initial value table. */
-          q15_t energy;         /**< saves previous frame energy. */
-          q15_t x0;             /**< saves previous input sample. */
-  } arm_lms_norm_instance_q15;
-
-
-  /**
-   * @brief Processing function for Q15 normalized LMS filter.
-   * @param[in]  S          points to an instance of the Q15 normalized LMS filter structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[in]  pRef       points to the block of reference data.
-   * @param[out] pOut       points to the block of output data.
-   * @param[out] pErr       points to the block of error data.
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_lms_norm_q15(
-        arm_lms_norm_instance_q15 * S,
-  const q15_t * pSrc,
-        q15_t * pRef,
-        q15_t * pOut,
-        q15_t * pErr,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Initialization function for Q15 normalized LMS filter.
-   * @param[in] S          points to an instance of the Q15 normalized LMS filter structure.
-   * @param[in] numTaps    number of filter coefficients.
-   * @param[in] pCoeffs    points to coefficient buffer.
-   * @param[in] pState     points to state buffer.
-   * @param[in] mu         step size that controls filter coefficient updates.
-   * @param[in] blockSize  number of samples to process.
-   * @param[in] postShift  bit shift applied to coefficients.
-   */
-  void arm_lms_norm_init_q15(
-        arm_lms_norm_instance_q15 * S,
-        uint16_t numTaps,
-        q15_t * pCoeffs,
-        q15_t * pState,
-        q15_t mu,
-        uint32_t blockSize,
-        uint8_t postShift);
-
-
-  /**
-   * @brief Correlation of floating-point sequences.
-   * @param[in]  pSrcA    points to the first input sequence.
-   * @param[in]  srcALen  length of the first input sequence.
-   * @param[in]  pSrcB    points to the second input sequence.
-   * @param[in]  srcBLen  length of the second input sequence.
-   * @param[out] pDst     points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
-   */
-  void arm_correlate_f32(
-  const float32_t * pSrcA,
-        uint32_t srcALen,
-  const float32_t * pSrcB,
-        uint32_t srcBLen,
-        float32_t * pDst);
-
-
-/**
- @brief Correlation of Q15 sequences
- @param[in]  pSrcA     points to the first input sequence
- @param[in]  srcALen   length of the first input sequence
- @param[in]  pSrcB     points to the second input sequence
- @param[in]  srcBLen   length of the second input sequence
- @param[out] pDst      points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
- @param[in]  pScratch  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
-*/
-void arm_correlate_opt_q15(
-  const q15_t * pSrcA,
-        uint32_t srcALen,
-  const q15_t * pSrcB,
-        uint32_t srcBLen,
-        q15_t * pDst,
-        q15_t * pScratch);
-
-
-/**
-  @brief Correlation of Q15 sequences.
-  @param[in]  pSrcA    points to the first input sequence
-  @param[in]  srcALen  length of the first input sequence
-  @param[in]  pSrcB    points to the second input sequence
-  @param[in]  srcBLen  length of the second input sequence
-  @param[out] pDst     points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
- */
-  void arm_correlate_q15(
-  const q15_t * pSrcA,
-        uint32_t srcALen,
-  const q15_t * pSrcB,
-        uint32_t srcBLen,
-        q15_t * pDst);
-
-
-/**
-  @brief         Correlation of Q15 sequences (fast version).
-  @param[in]     pSrcA      points to the first input sequence
-  @param[in]     srcALen    length of the first input sequence
-  @param[in]     pSrcB      points to the second input sequence
-  @param[in]     srcBLen    length of the second input sequence
-  @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
-  @return        none
- */
-void arm_correlate_fast_q15(
-  const q15_t * pSrcA,
-        uint32_t srcALen,
-  const q15_t * pSrcB,
-        uint32_t srcBLen,
-        q15_t * pDst);
-
-
-/**
-  @brief Correlation of Q15 sequences (fast version).
-  @param[in]  pSrcA     points to the first input sequence.
-  @param[in]  srcALen   length of the first input sequence.
-  @param[in]  pSrcB     points to the second input sequence.
-  @param[in]  srcBLen   length of the second input sequence.
-  @param[out] pDst      points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
-  @param[in]  pScratch  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
- */
-void arm_correlate_fast_opt_q15(
-  const q15_t * pSrcA,
-        uint32_t srcALen,
-  const q15_t * pSrcB,
-        uint32_t srcBLen,
-        q15_t * pDst,
-        q15_t * pScratch);
-
-
-  /**
-   * @brief Correlation of Q31 sequences.
-   * @param[in]  pSrcA    points to the first input sequence.
-   * @param[in]  srcALen  length of the first input sequence.
-   * @param[in]  pSrcB    points to the second input sequence.
-   * @param[in]  srcBLen  length of the second input sequence.
-   * @param[out] pDst     points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
-   */
-  void arm_correlate_q31(
-  const q31_t * pSrcA,
-        uint32_t srcALen,
-  const q31_t * pSrcB,
-        uint32_t srcBLen,
-        q31_t * pDst);
-
-
-/**
-  @brief Correlation of Q31 sequences (fast version).
-  @param[in]  pSrcA    points to the first input sequence
-  @param[in]  srcALen  length of the first input sequence
-  @param[in]  pSrcB    points to the second input sequence
-  @param[in]  srcBLen  length of the second input sequence
-  @param[out] pDst     points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
- */
-void arm_correlate_fast_q31(
-  const q31_t * pSrcA,
-        uint32_t srcALen,
-  const q31_t * pSrcB,
-        uint32_t srcBLen,
-        q31_t * pDst);
-
-
- /**
-   * @brief Correlation of Q7 sequences.
-   * @param[in]  pSrcA      points to the first input sequence.
-   * @param[in]  srcALen    length of the first input sequence.
-   * @param[in]  pSrcB      points to the second input sequence.
-   * @param[in]  srcBLen    length of the second input sequence.
-   * @param[out] pDst       points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
-   * @param[in]  pScratch1  points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
-   * @param[in]  pScratch2  points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
-   */
-  void arm_correlate_opt_q7(
-  const q7_t * pSrcA,
-        uint32_t srcALen,
-  const q7_t * pSrcB,
-        uint32_t srcBLen,
-        q7_t * pDst,
-        q15_t * pScratch1,
-        q15_t * pScratch2);
-
-
-  /**
-   * @brief Correlation of Q7 sequences.
-   * @param[in]  pSrcA    points to the first input sequence.
-   * @param[in]  srcALen  length of the first input sequence.
-   * @param[in]  pSrcB    points to the second input sequence.
-   * @param[in]  srcBLen  length of the second input sequence.
-   * @param[out] pDst     points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
-   */
-  void arm_correlate_q7(
-  const q7_t * pSrcA,
-        uint32_t srcALen,
-  const q7_t * pSrcB,
-        uint32_t srcBLen,
-        q7_t * pDst);
-
-
-  /**
-   * @brief Instance structure for the floating-point sparse FIR filter.
-   */
-  typedef struct
-  {
-          uint16_t numTaps;             /**< number of coefficients in the filter. */
-          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
-          float32_t *pState;            /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
-    const float32_t *pCoeffs;           /**< points to the coefficient array. The array is of length numTaps.*/
-          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
-          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
-  } arm_fir_sparse_instance_f32;
-
-  /**
-   * @brief Instance structure for the Q31 sparse FIR filter.
-   */
-  typedef struct
-  {
-          uint16_t numTaps;             /**< number of coefficients in the filter. */
-          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
-          q31_t *pState;                /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
-    const q31_t *pCoeffs;               /**< points to the coefficient array. The array is of length numTaps.*/
-          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
-          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
-  } arm_fir_sparse_instance_q31;
-
-  /**
-   * @brief Instance structure for the Q15 sparse FIR filter.
-   */
-  typedef struct
-  {
-          uint16_t numTaps;             /**< number of coefficients in the filter. */
-          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
-          q15_t *pState;                /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
-    const q15_t *pCoeffs;               /**< points to the coefficient array. The array is of length numTaps.*/
-          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
-          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
-  } arm_fir_sparse_instance_q15;
-
-  /**
-   * @brief Instance structure for the Q7 sparse FIR filter.
-   */
-  typedef struct
-  {
-          uint16_t numTaps;             /**< number of coefficients in the filter. */
-          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
-          q7_t *pState;                 /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
-    const q7_t *pCoeffs;                /**< points to the coefficient array. The array is of length numTaps.*/
-          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
-          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
-  } arm_fir_sparse_instance_q7;
-
-
-  /**
-   * @brief Processing function for the floating-point sparse FIR filter.
-   * @param[in]  S           points to an instance of the floating-point sparse FIR structure.
-   * @param[in]  pSrc        points to the block of input data.
-   * @param[out] pDst        points to the block of output data
-   * @param[in]  pScratchIn  points to a temporary buffer of size blockSize.
-   * @param[in]  blockSize   number of input samples to process per call.
-   */
-  void arm_fir_sparse_f32(
-        arm_fir_sparse_instance_f32 * S,
-  const float32_t * pSrc,
-        float32_t * pDst,
-        float32_t * pScratchIn,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Initialization function for the floating-point sparse FIR filter.
-   * @param[in,out] S          points to an instance of the floating-point sparse FIR structure.
-   * @param[in]     numTaps    number of nonzero coefficients in the filter.
-   * @param[in]     pCoeffs    points to the array of filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     pTapDelay  points to the array of offset times.
-   * @param[in]     maxDelay   maximum offset time supported.
-   * @param[in]     blockSize  number of samples that will be processed per block.
-   */
-  void arm_fir_sparse_init_f32(
-        arm_fir_sparse_instance_f32 * S,
-        uint16_t numTaps,
-  const float32_t * pCoeffs,
-        float32_t * pState,
-        int32_t * pTapDelay,
-        uint16_t maxDelay,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Processing function for the Q31 sparse FIR filter.
-   * @param[in]  S           points to an instance of the Q31 sparse FIR structure.
-   * @param[in]  pSrc        points to the block of input data.
-   * @param[out] pDst        points to the block of output data
-   * @param[in]  pScratchIn  points to a temporary buffer of size blockSize.
-   * @param[in]  blockSize   number of input samples to process per call.
-   */
-  void arm_fir_sparse_q31(
-        arm_fir_sparse_instance_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pDst,
-        q31_t * pScratchIn,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Initialization function for the Q31 sparse FIR filter.
-   * @param[in,out] S          points to an instance of the Q31 sparse FIR structure.
-   * @param[in]     numTaps    number of nonzero coefficients in the filter.
-   * @param[in]     pCoeffs    points to the array of filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     pTapDelay  points to the array of offset times.
-   * @param[in]     maxDelay   maximum offset time supported.
-   * @param[in]     blockSize  number of samples that will be processed per block.
-   */
-  void arm_fir_sparse_init_q31(
-        arm_fir_sparse_instance_q31 * S,
-        uint16_t numTaps,
-  const q31_t * pCoeffs,
-        q31_t * pState,
-        int32_t * pTapDelay,
-        uint16_t maxDelay,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Processing function for the Q15 sparse FIR filter.
-   * @param[in]  S            points to an instance of the Q15 sparse FIR structure.
-   * @param[in]  pSrc         points to the block of input data.
-   * @param[out] pDst         points to the block of output data
-   * @param[in]  pScratchIn   points to a temporary buffer of size blockSize.
-   * @param[in]  pScratchOut  points to a temporary buffer of size blockSize.
-   * @param[in]  blockSize    number of input samples to process per call.
-   */
-  void arm_fir_sparse_q15(
-        arm_fir_sparse_instance_q15 * S,
-  const q15_t * pSrc,
-        q15_t * pDst,
-        q15_t * pScratchIn,
-        q31_t * pScratchOut,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Initialization function for the Q15 sparse FIR filter.
-   * @param[in,out] S          points to an instance of the Q15 sparse FIR structure.
-   * @param[in]     numTaps    number of nonzero coefficients in the filter.
-   * @param[in]     pCoeffs    points to the array of filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     pTapDelay  points to the array of offset times.
-   * @param[in]     maxDelay   maximum offset time supported.
-   * @param[in]     blockSize  number of samples that will be processed per block.
-   */
-  void arm_fir_sparse_init_q15(
-        arm_fir_sparse_instance_q15 * S,
-        uint16_t numTaps,
-  const q15_t * pCoeffs,
-        q15_t * pState,
-        int32_t * pTapDelay,
-        uint16_t maxDelay,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Processing function for the Q7 sparse FIR filter.
-   * @param[in]  S            points to an instance of the Q7 sparse FIR structure.
-   * @param[in]  pSrc         points to the block of input data.
-   * @param[out] pDst         points to the block of output data
-   * @param[in]  pScratchIn   points to a temporary buffer of size blockSize.
-   * @param[in]  pScratchOut  points to a temporary buffer of size blockSize.
-   * @param[in]  blockSize    number of input samples to process per call.
-   */
-  void arm_fir_sparse_q7(
-        arm_fir_sparse_instance_q7 * S,
-  const q7_t * pSrc,
-        q7_t * pDst,
-        q7_t * pScratchIn,
-        q31_t * pScratchOut,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Initialization function for the Q7 sparse FIR filter.
-   * @param[in,out] S          points to an instance of the Q7 sparse FIR structure.
-   * @param[in]     numTaps    number of nonzero coefficients in the filter.
-   * @param[in]     pCoeffs    points to the array of filter coefficients.
-   * @param[in]     pState     points to the state buffer.
-   * @param[in]     pTapDelay  points to the array of offset times.
-   * @param[in]     maxDelay   maximum offset time supported.
-   * @param[in]     blockSize  number of samples that will be processed per block.
-   */
-  void arm_fir_sparse_init_q7(
-        arm_fir_sparse_instance_q7 * S,
-        uint16_t numTaps,
-  const q7_t * pCoeffs,
-        q7_t * pState,
-        int32_t * pTapDelay,
-        uint16_t maxDelay,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Floating-point sin_cos function.
-   * @param[in]  theta   input value in degrees
-   * @param[out] pSinVal  points to the processed sine output.
-   * @param[out] pCosVal  points to the processed cos output.
-   */
-  void arm_sin_cos_f32(
-        float32_t theta,
-        float32_t * pSinVal,
-        float32_t * pCosVal);
-
-
-  /**
-   * @brief  Q31 sin_cos function.
-   * @param[in]  theta    scaled input value in degrees
-   * @param[out] pSinVal  points to the processed sine output.
-   * @param[out] pCosVal  points to the processed cosine output.
-   */
-  void arm_sin_cos_q31(
-        q31_t theta,
-        q31_t * pSinVal,
-        q31_t * pCosVal);
-
-
-  /**
-   * @brief  Floating-point complex conjugate.
-   * @param[in]  pSrc        points to the input vector
-   * @param[out] pDst        points to the output vector
-   * @param[in]  numSamples  number of complex samples in each vector
-   */
-  void arm_cmplx_conj_f32(
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t numSamples);
-
-  /**
-   * @brief  Q31 complex conjugate.
-   * @param[in]  pSrc        points to the input vector
-   * @param[out] pDst        points to the output vector
-   * @param[in]  numSamples  number of complex samples in each vector
-   */
-  void arm_cmplx_conj_q31(
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t numSamples);
-
-
-  /**
-   * @brief  Q15 complex conjugate.
-   * @param[in]  pSrc        points to the input vector
-   * @param[out] pDst        points to the output vector
-   * @param[in]  numSamples  number of complex samples in each vector
-   */
-  void arm_cmplx_conj_q15(
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t numSamples);
-
-
-  /**
-   * @brief  Floating-point complex magnitude squared
-   * @param[in]  pSrc        points to the complex input vector
-   * @param[out] pDst        points to the real output vector
-   * @param[in]  numSamples  number of complex samples in the input vector
-   */
-  void arm_cmplx_mag_squared_f32(
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t numSamples);
-
-
-  /**
-   * @brief  Q31 complex magnitude squared
-   * @param[in]  pSrc        points to the complex input vector
-   * @param[out] pDst        points to the real output vector
-   * @param[in]  numSamples  number of complex samples in the input vector
-   */
-  void arm_cmplx_mag_squared_q31(
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t numSamples);
-
-
-  /**
-   * @brief  Q15 complex magnitude squared
-   * @param[in]  pSrc        points to the complex input vector
-   * @param[out] pDst        points to the real output vector
-   * @param[in]  numSamples  number of complex samples in the input vector
-   */
-  void arm_cmplx_mag_squared_q15(
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t numSamples);
-
-
- /**
-   * @ingroup groupController
-   */
-
-  /**
-   * @defgroup PID PID Motor Control
-   *
-   * A Proportional Integral Derivative (PID) controller is a generic feedback control
-   * loop mechanism widely used in industrial control systems.
-   * A PID controller is the most commonly used type of feedback controller.
-   *
-   * This set of functions implements (PID) controllers
-   * for Q15, Q31, and floating-point data types.  The functions operate on a single sample
-   * of data and each call to the function returns a single processed value.
-   * <code>S</code> points to an instance of the PID control data structure.  <code>in</code>
-   * is the input sample value. The functions return the output value.
-   *
-   * \par Algorithm:
-   * <pre>
-   *    y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2]
-   *    A0 = Kp + Ki + Kd
-   *    A1 = (-Kp ) - (2 * Kd )
-   *    A2 = Kd
-   * </pre>
-   *
-   * \par
-   * where \c Kp is proportional constant, \c Ki is Integral constant and \c Kd is Derivative constant
-   *
-   * \par
-   * \image html PID.gif "Proportional Integral Derivative Controller"
-   *
-   * \par
-   * The PID controller calculates an "error" value as the difference between
-   * the measured output and the reference input.
-   * The controller attempts to minimize the error by adjusting the process control inputs.
-   * The proportional value determines the reaction to the current error,
-   * the integral value determines the reaction based on the sum of recent errors,
-   * and the derivative value determines the reaction based on the rate at which the error has been changing.
-   *
-   * \par Instance Structure
-   * The Gains A0, A1, A2 and state variables for a PID controller are stored together in an instance data structure.
-   * A separate instance structure must be defined for each PID Controller.
-   * There are separate instance structure declarations for each of the 3 supported data types.
-   *
-   * \par Reset Functions
-   * There is also an associated reset function for each data type which clears the state array.
-   *
-   * \par Initialization Functions
-   * There is also an associated initialization function for each data type.
-   * The initialization function performs the following operations:
-   * - Initializes the Gains A0, A1, A2 from Kp,Ki, Kd gains.
-   * - Zeros out the values in the state buffer.
-   *
-   * \par
-   * Instance structure cannot be placed into a const data section and it is recommended to use the initialization function.
-   *
-   * \par Fixed-Point Behavior
-   * Care must be taken when using the fixed-point versions of the PID Controller functions.
-   * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
-   * Refer to the function specific documentation below for usage guidelines.
-   */
-
-  /**
-   * @addtogroup PID
-   * @{
-   */
-
-  /**
-   * @brief         Process function for the floating-point PID Control.
-   * @param[in,out] S   is an instance of the floating-point PID Control structure
-   * @param[in]     in  input sample to process
-   * @return        processed output sample.
-   */
-  __STATIC_FORCEINLINE float32_t arm_pid_f32(
-  arm_pid_instance_f32 * S,
-  float32_t in)
-  {
-    float32_t out;
-
-    /* y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2]  */
-    out = (S->A0 * in) +
-      (S->A1 * S->state[0]) + (S->A2 * S->state[1]) + (S->state[2]);
-
-    /* Update state */
-    S->state[1] = S->state[0];
-    S->state[0] = in;
-    S->state[2] = out;
-
-    /* return to application */
-    return (out);
-
-  }
-
-/**
-  @brief         Process function for the Q31 PID Control.
-  @param[in,out] S  points to an instance of the Q31 PID Control structure
-  @param[in]     in  input sample to process
-  @return        processed output sample.
-
-  \par Scaling and Overflow Behavior
-         The function is implemented using an internal 64-bit accumulator.
-         The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
-         Thus, if the accumulator result overflows it wraps around rather than clip.
-         In order to avoid overflows completely the input signal must be scaled down by 2 bits as there are four additions.
-         After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
- */
-__STATIC_FORCEINLINE q31_t arm_pid_q31(
-  arm_pid_instance_q31 * S,
-  q31_t in)
-  {
-    q63_t acc;
-    q31_t out;
-
-    /* acc = A0 * x[n]  */
-    acc = (q63_t) S->A0 * in;
-
-    /* acc += A1 * x[n-1] */
-    acc += (q63_t) S->A1 * S->state[0];
-
-    /* acc += A2 * x[n-2]  */
-    acc += (q63_t) S->A2 * S->state[1];
-
-    /* convert output to 1.31 format to add y[n-1] */
-    out = (q31_t) (acc >> 31U);
-
-    /* out += y[n-1] */
-    out += S->state[2];
-
-    /* Update state */
-    S->state[1] = S->state[0];
-    S->state[0] = in;
-    S->state[2] = out;
-
-    /* return to application */
-    return (out);
-  }
-
-
-/**
-  @brief         Process function for the Q15 PID Control.
-  @param[in,out] S   points to an instance of the Q15 PID Control structure
-  @param[in]     in  input sample to process
-  @return        processed output sample.
-
-  \par Scaling and Overflow Behavior
-         The function is implemented using a 64-bit internal accumulator.
-         Both Gains and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
-         The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
-         There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
-         After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
-         Lastly, the accumulator is saturated to yield a result in 1.15 format.
- */
-__STATIC_FORCEINLINE q15_t arm_pid_q15(
-  arm_pid_instance_q15 * S,
-  q15_t in)
-  {
-    q63_t acc;
-    q15_t out;
-
-#if defined (ARM_MATH_DSP)
-    /* Implementation of PID controller */
-
-    /* acc = A0 * x[n]  */
-    acc = (q31_t) __SMUAD((uint32_t)S->A0, (uint32_t)in);
-
-    /* acc += A1 * x[n-1] + A2 * x[n-2]  */
-    acc = (q63_t)__SMLALD((uint32_t)S->A1, (uint32_t)read_q15x2 (S->state), (uint64_t)acc);
-#else
-    /* acc = A0 * x[n]  */
-    acc = ((q31_t) S->A0) * in;
-
-    /* acc += A1 * x[n-1] + A2 * x[n-2]  */
-    acc += (q31_t) S->A1 * S->state[0];
-    acc += (q31_t) S->A2 * S->state[1];
-#endif
-
-    /* acc += y[n-1] */
-    acc += (q31_t) S->state[2] << 15;
-
-    /* saturate the output */
-    out = (q15_t) (__SSAT((q31_t)(acc >> 15), 16));
-
-    /* Update state */
-    S->state[1] = S->state[0];
-    S->state[0] = in;
-    S->state[2] = out;
-
-    /* return to application */
-    return (out);
-  }
-
-  /**
-   * @} end of PID group
-   */
-
-
-  /**
-   * @brief Floating-point matrix inverse.
-   * @param[in]  src   points to the instance of the input floating-point matrix structure.
-   * @param[out] dst   points to the instance of the output floating-point matrix structure.
-   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
-   * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR.
-   */
-  arm_status arm_mat_inverse_f32(
-  const arm_matrix_instance_f32 * src,
-  arm_matrix_instance_f32 * dst);
-
-
-  /**
-   * @brief Floating-point matrix inverse.
-   * @param[in]  src   points to the instance of the input floating-point matrix structure.
-   * @param[out] dst   points to the instance of the output floating-point matrix structure.
-   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
-   * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR.
-   */
-  arm_status arm_mat_inverse_f64(
-  const arm_matrix_instance_f64 * src,
-  arm_matrix_instance_f64 * dst);
-
-
-
-  /**
-   * @ingroup groupController
-   */
-
-  /**
-   * @defgroup clarke Vector Clarke Transform
-   * Forward Clarke transform converts the instantaneous stator phases into a two-coordinate time invariant vector.
-   * Generally the Clarke transform uses three-phase currents <code>Ia, Ib and Ic</code> to calculate currents
-   * in the two-phase orthogonal stator axis <code>Ialpha</code> and <code>Ibeta</code>.
-   * When <code>Ialpha</code> is superposed with <code>Ia</code> as shown in the figure below
-   * \image html clarke.gif Stator current space vector and its components in (a,b).
-   * and <code>Ia + Ib + Ic = 0</code>, in this condition <code>Ialpha</code> and <code>Ibeta</code>
-   * can be calculated using only <code>Ia</code> and <code>Ib</code>.
-   *
-   * The function operates on a single sample of data and each call to the function returns the processed output.
-   * The library provides separate functions for Q31 and floating-point data types.
-   * \par Algorithm
-   * \image html clarkeFormula.gif
-   * where <code>Ia</code> and <code>Ib</code> are the instantaneous stator phases and
-   * <code>pIalpha</code> and <code>pIbeta</code> are the two coordinates of time invariant vector.
-   * \par Fixed-Point Behavior
-   * Care must be taken when using the Q31 version of the Clarke transform.
-   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
-   * Refer to the function specific documentation below for usage guidelines.
-   */
-
-  /**
-   * @addtogroup clarke
-   * @{
-   */
-
-  /**
-   *
-   * @brief  Floating-point Clarke transform
-   * @param[in]  Ia       input three-phase coordinate <code>a</code>
-   * @param[in]  Ib       input three-phase coordinate <code>b</code>
-   * @param[out] pIalpha  points to output two-phase orthogonal vector axis alpha
-   * @param[out] pIbeta   points to output two-phase orthogonal vector axis beta
-   * @return        none
-   */
-  __STATIC_FORCEINLINE void arm_clarke_f32(
-  float32_t Ia,
-  float32_t Ib,
-  float32_t * pIalpha,
-  float32_t * pIbeta)
-  {
-    /* Calculate pIalpha using the equation, pIalpha = Ia */
-    *pIalpha = Ia;
-
-    /* Calculate pIbeta using the equation, pIbeta = (1/sqrt(3)) * Ia + (2/sqrt(3)) * Ib */
-    *pIbeta = ((float32_t) 0.57735026919 * Ia + (float32_t) 1.15470053838 * Ib);
-  }
-
-
-/**
-  @brief  Clarke transform for Q31 version
-  @param[in]  Ia       input three-phase coordinate <code>a</code>
-  @param[in]  Ib       input three-phase coordinate <code>b</code>
-  @param[out] pIalpha  points to output two-phase orthogonal vector axis alpha
-  @param[out] pIbeta   points to output two-phase orthogonal vector axis beta
-  @return     none
-
-  \par Scaling and Overflow Behavior
-         The function is implemented using an internal 32-bit accumulator.
-         The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format.
-         There is saturation on the addition, hence there is no risk of overflow.
- */
-__STATIC_FORCEINLINE void arm_clarke_q31(
-  q31_t Ia,
-  q31_t Ib,
-  q31_t * pIalpha,
-  q31_t * pIbeta)
-  {
-    q31_t product1, product2;                    /* Temporary variables used to store intermediate results */
-
-    /* Calculating pIalpha from Ia by equation pIalpha = Ia */
-    *pIalpha = Ia;
-
-    /* Intermediate product is calculated by (1/(sqrt(3)) * Ia) */
-    product1 = (q31_t) (((q63_t) Ia * 0x24F34E8B) >> 30);
-
-    /* Intermediate product is calculated by (2/sqrt(3) * Ib) */
-    product2 = (q31_t) (((q63_t) Ib * 0x49E69D16) >> 30);
-
-    /* pIbeta is calculated by adding the intermediate products */
-    *pIbeta = __QADD(product1, product2);
-  }
-
-  /**
-   * @} end of clarke group
-   */
-
-
-  /**
-   * @ingroup groupController
-   */
-
-  /**
-   * @defgroup inv_clarke Vector Inverse Clarke Transform
-   * Inverse Clarke transform converts the two-coordinate time invariant vector into instantaneous stator phases.
-   *
-   * The function operates on a single sample of data and each call to the function returns the processed output.
-   * The library provides separate functions for Q31 and floating-point data types.
-   * \par Algorithm
-   * \image html clarkeInvFormula.gif
-   * where <code>pIa</code> and <code>pIb</code> are the instantaneous stator phases and
-   * <code>Ialpha</code> and <code>Ibeta</code> are the two coordinates of time invariant vector.
-   * \par Fixed-Point Behavior
-   * Care must be taken when using the Q31 version of the Clarke transform.
-   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
-   * Refer to the function specific documentation below for usage guidelines.
-   */
-
-  /**
-   * @addtogroup inv_clarke
-   * @{
-   */
-
-   /**
-   * @brief  Floating-point Inverse Clarke transform
-   * @param[in]  Ialpha  input two-phase orthogonal vector axis alpha
-   * @param[in]  Ibeta   input two-phase orthogonal vector axis beta
-   * @param[out] pIa     points to output three-phase coordinate <code>a</code>
-   * @param[out] pIb     points to output three-phase coordinate <code>b</code>
-   * @return     none
-   */
-  __STATIC_FORCEINLINE void arm_inv_clarke_f32(
-  float32_t Ialpha,
-  float32_t Ibeta,
-  float32_t * pIa,
-  float32_t * pIb)
-  {
-    /* Calculating pIa from Ialpha by equation pIa = Ialpha */
-    *pIa = Ialpha;
-
-    /* Calculating pIb from Ialpha and Ibeta by equation pIb = -(1/2) * Ialpha + (sqrt(3)/2) * Ibeta */
-    *pIb = -0.5f * Ialpha + 0.8660254039f * Ibeta;
-  }
-
-
-/**
-  @brief  Inverse Clarke transform for Q31 version
-  @param[in]  Ialpha  input two-phase orthogonal vector axis alpha
-  @param[in]  Ibeta   input two-phase orthogonal vector axis beta
-  @param[out] pIa     points to output three-phase coordinate <code>a</code>
-  @param[out] pIb     points to output three-phase coordinate <code>b</code>
-  @return     none
-
-  \par Scaling and Overflow Behavior
-         The function is implemented using an internal 32-bit accumulator.
-         The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format.
-         There is saturation on the subtraction, hence there is no risk of overflow.
- */
-__STATIC_FORCEINLINE void arm_inv_clarke_q31(
-  q31_t Ialpha,
-  q31_t Ibeta,
-  q31_t * pIa,
-  q31_t * pIb)
-  {
-    q31_t product1, product2;                    /* Temporary variables used to store intermediate results */
-
-    /* Calculating pIa from Ialpha by equation pIa = Ialpha */
-    *pIa = Ialpha;
-
-    /* Intermediate product is calculated by (1/(2*sqrt(3)) * Ia) */
-    product1 = (q31_t) (((q63_t) (Ialpha) * (0x40000000)) >> 31);
-
-    /* Intermediate product is calculated by (1/sqrt(3) * pIb) */
-    product2 = (q31_t) (((q63_t) (Ibeta) * (0x6ED9EBA1)) >> 31);
-
-    /* pIb is calculated by subtracting the products */
-    *pIb = __QSUB(product2, product1);
-  }
-
-  /**
-   * @} end of inv_clarke group
-   */
-
-
-
-  /**
-   * @ingroup groupController
-   */
-
-  /**
-   * @defgroup park Vector Park Transform
-   *
-   * Forward Park transform converts the input two-coordinate vector to flux and torque components.
-   * The Park transform can be used to realize the transformation of the <code>Ialpha</code> and the <code>Ibeta</code> currents
-   * from the stationary to the moving reference frame and control the spatial relationship between
-   * the stator vector current and rotor flux vector.
-   * If we consider the d axis aligned with the rotor flux, the diagram below shows the
-   * current vector and the relationship from the two reference frames:
-   * \image html park.gif "Stator current space vector and its component in (a,b) and in the d,q rotating reference frame"
-   *
-   * The function operates on a single sample of data and each call to the function returns the processed output.
-   * The library provides separate functions for Q31 and floating-point data types.
-   * \par Algorithm
-   * \image html parkFormula.gif
-   * where <code>Ialpha</code> and <code>Ibeta</code> are the stator vector components,
-   * <code>pId</code> and <code>pIq</code> are rotor vector components and <code>cosVal</code> and <code>sinVal</code> are the
-   * cosine and sine values of theta (rotor flux position).
-   * \par Fixed-Point Behavior
-   * Care must be taken when using the Q31 version of the Park transform.
-   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
-   * Refer to the function specific documentation below for usage guidelines.
-   */
-
-  /**
-   * @addtogroup park
-   * @{
-   */
-
-  /**
-   * @brief Floating-point Park transform
-   * @param[in]  Ialpha  input two-phase vector coordinate alpha
-   * @param[in]  Ibeta   input two-phase vector coordinate beta
-   * @param[out] pId     points to output   rotor reference frame d
-   * @param[out] pIq     points to output   rotor reference frame q
-   * @param[in]  sinVal  sine value of rotation angle theta
-   * @param[in]  cosVal  cosine value of rotation angle theta
-   * @return     none
-   *
-   * The function implements the forward Park transform.
-   *
-   */
-  __STATIC_FORCEINLINE void arm_park_f32(
-  float32_t Ialpha,
-  float32_t Ibeta,
-  float32_t * pId,
-  float32_t * pIq,
-  float32_t sinVal,
-  float32_t cosVal)
-  {
-    /* Calculate pId using the equation, pId = Ialpha * cosVal + Ibeta * sinVal */
-    *pId = Ialpha * cosVal + Ibeta * sinVal;
-
-    /* Calculate pIq using the equation, pIq = - Ialpha * sinVal + Ibeta * cosVal */
-    *pIq = -Ialpha * sinVal + Ibeta * cosVal;
-  }
-
-
-/**
-  @brief  Park transform for Q31 version
-  @param[in]  Ialpha  input two-phase vector coordinate alpha
-  @param[in]  Ibeta   input two-phase vector coordinate beta
-  @param[out] pId     points to output rotor reference frame d
-  @param[out] pIq     points to output rotor reference frame q
-  @param[in]  sinVal  sine value of rotation angle theta
-  @param[in]  cosVal  cosine value of rotation angle theta
-  @return     none
-
-  \par Scaling and Overflow Behavior
-         The function is implemented using an internal 32-bit accumulator.
-         The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format.
-         There is saturation on the addition and subtraction, hence there is no risk of overflow.
- */
-__STATIC_FORCEINLINE void arm_park_q31(
-  q31_t Ialpha,
-  q31_t Ibeta,
-  q31_t * pId,
-  q31_t * pIq,
-  q31_t sinVal,
-  q31_t cosVal)
-  {
-    q31_t product1, product2;                    /* Temporary variables used to store intermediate results */
-    q31_t product3, product4;                    /* Temporary variables used to store intermediate results */
-
-    /* Intermediate product is calculated by (Ialpha * cosVal) */
-    product1 = (q31_t) (((q63_t) (Ialpha) * (cosVal)) >> 31);
-
-    /* Intermediate product is calculated by (Ibeta * sinVal) */
-    product2 = (q31_t) (((q63_t) (Ibeta) * (sinVal)) >> 31);
-
-
-    /* Intermediate product is calculated by (Ialpha * sinVal) */
-    product3 = (q31_t) (((q63_t) (Ialpha) * (sinVal)) >> 31);
-
-    /* Intermediate product is calculated by (Ibeta * cosVal) */
-    product4 = (q31_t) (((q63_t) (Ibeta) * (cosVal)) >> 31);
-
-    /* Calculate pId by adding the two intermediate products 1 and 2 */
-    *pId = __QADD(product1, product2);
-
-    /* Calculate pIq by subtracting the two intermediate products 3 from 4 */
-    *pIq = __QSUB(product4, product3);
-  }
-
-  /**
-   * @} end of park group
-   */
-
-
-  /**
-   * @ingroup groupController
-   */
-
-  /**
-   * @defgroup inv_park Vector Inverse Park transform
-   * Inverse Park transform converts the input flux and torque components to two-coordinate vector.
-   *
-   * The function operates on a single sample of data and each call to the function returns the processed output.
-   * The library provides separate functions for Q31 and floating-point data types.
-   * \par Algorithm
-   * \image html parkInvFormula.gif
-   * where <code>pIalpha</code> and <code>pIbeta</code> are the stator vector components,
-   * <code>Id</code> and <code>Iq</code> are rotor vector components and <code>cosVal</code> and <code>sinVal</code> are the
-   * cosine and sine values of theta (rotor flux position).
-   * \par Fixed-Point Behavior
-   * Care must be taken when using the Q31 version of the Park transform.
-   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
-   * Refer to the function specific documentation below for usage guidelines.
-   */
-
-  /**
-   * @addtogroup inv_park
-   * @{
-   */
-
-   /**
-   * @brief  Floating-point Inverse Park transform
-   * @param[in]  Id       input coordinate of rotor reference frame d
-   * @param[in]  Iq       input coordinate of rotor reference frame q
-   * @param[out] pIalpha  points to output two-phase orthogonal vector axis alpha
-   * @param[out] pIbeta   points to output two-phase orthogonal vector axis beta
-   * @param[in]  sinVal   sine value of rotation angle theta
-   * @param[in]  cosVal   cosine value of rotation angle theta
-   * @return     none
-   */
-  __STATIC_FORCEINLINE void arm_inv_park_f32(
-  float32_t Id,
-  float32_t Iq,
-  float32_t * pIalpha,
-  float32_t * pIbeta,
-  float32_t sinVal,
-  float32_t cosVal)
-  {
-    /* Calculate pIalpha using the equation, pIalpha = Id * cosVal - Iq * sinVal */
-    *pIalpha = Id * cosVal - Iq * sinVal;
-
-    /* Calculate pIbeta using the equation, pIbeta = Id * sinVal + Iq * cosVal */
-    *pIbeta = Id * sinVal + Iq * cosVal;
-  }
-
-
-/**
-  @brief  Inverse Park transform for   Q31 version
-  @param[in]  Id       input coordinate of rotor reference frame d
-  @param[in]  Iq       input coordinate of rotor reference frame q
-  @param[out] pIalpha  points to output two-phase orthogonal vector axis alpha
-  @param[out] pIbeta   points to output two-phase orthogonal vector axis beta
-  @param[in]  sinVal   sine value of rotation angle theta
-  @param[in]  cosVal   cosine value of rotation angle theta
-  @return     none
-
-  @par Scaling and Overflow Behavior
-         The function is implemented using an internal 32-bit accumulator.
-         The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format.
-         There is saturation on the addition, hence there is no risk of overflow.
- */
-__STATIC_FORCEINLINE void arm_inv_park_q31(
-  q31_t Id,
-  q31_t Iq,
-  q31_t * pIalpha,
-  q31_t * pIbeta,
-  q31_t sinVal,
-  q31_t cosVal)
-  {
-    q31_t product1, product2;                    /* Temporary variables used to store intermediate results */
-    q31_t product3, product4;                    /* Temporary variables used to store intermediate results */
-
-    /* Intermediate product is calculated by (Id * cosVal) */
-    product1 = (q31_t) (((q63_t) (Id) * (cosVal)) >> 31);
-
-    /* Intermediate product is calculated by (Iq * sinVal) */
-    product2 = (q31_t) (((q63_t) (Iq) * (sinVal)) >> 31);
-
-
-    /* Intermediate product is calculated by (Id * sinVal) */
-    product3 = (q31_t) (((q63_t) (Id) * (sinVal)) >> 31);
-
-    /* Intermediate product is calculated by (Iq * cosVal) */
-    product4 = (q31_t) (((q63_t) (Iq) * (cosVal)) >> 31);
-
-    /* Calculate pIalpha by using the two intermediate products 1 and 2 */
-    *pIalpha = __QSUB(product1, product2);
-
-    /* Calculate pIbeta by using the two intermediate products 3 and 4 */
-    *pIbeta = __QADD(product4, product3);
-  }
-
-  /**
-   * @} end of Inverse park group
-   */
-
-
-  /**
-   * @ingroup groupInterpolation
-   */
-
-  /**
-   * @defgroup LinearInterpolate Linear Interpolation
-   *
-   * Linear interpolation is a method of curve fitting using linear polynomials.
-   * Linear interpolation works by effectively drawing a straight line between two neighboring samples and returning the appropriate point along that line
-   *
-   * \par
-   * \image html LinearInterp.gif "Linear interpolation"
-   *
-   * \par
-   * A  Linear Interpolate function calculates an output value(y), for the input(x)
-   * using linear interpolation of the input values x0, x1( nearest input values) and the output values y0 and y1(nearest output values)
-   *
-   * \par Algorithm:
-   * <pre>
-   *       y = y0 + (x - x0) * ((y1 - y0)/(x1-x0))
-   *       where x0, x1 are nearest values of input x
-   *             y0, y1 are nearest values to output y
-   * </pre>
-   *
-   * \par
-   * This set of functions implements Linear interpolation process
-   * for Q7, Q15, Q31, and floating-point data types.  The functions operate on a single
-   * sample of data and each call to the function returns a single processed value.
-   * <code>S</code> points to an instance of the Linear Interpolate function data structure.
-   * <code>x</code> is the input sample value. The functions returns the output value.
-   *
-   * \par
-   * if x is outside of the table boundary, Linear interpolation returns first value of the table
-   * if x is below input range and returns last value of table if x is above range.
-   */
-
-  /**
-   * @addtogroup LinearInterpolate
-   * @{
-   */
-
-  /**
-   * @brief  Process function for the floating-point Linear Interpolation Function.
-   * @param[in,out] S  is an instance of the floating-point Linear Interpolation structure
-   * @param[in]     x  input sample to process
-   * @return y processed output sample.
-   *
-   */
-  __STATIC_FORCEINLINE float32_t arm_linear_interp_f32(
-  arm_linear_interp_instance_f32 * S,
-  float32_t x)
-  {
-    float32_t y;
-    float32_t x0, x1;                            /* Nearest input values */
-    float32_t y0, y1;                            /* Nearest output values */
-    float32_t xSpacing = S->xSpacing;            /* spacing between input values */
-    int32_t i;                                   /* Index variable */
-    float32_t *pYData = S->pYData;               /* pointer to output table */
-
-    /* Calculation of index */
-    i = (int32_t) ((x - S->x1) / xSpacing);
-
-    if (i < 0)
-    {
-      /* Iniatilize output for below specified range as least output value of table */
-      y = pYData[0];
-    }
-    else if ((uint32_t)i >= (S->nValues - 1))
-    {
-      /* Iniatilize output for above specified range as last output value of table */
-      y = pYData[S->nValues - 1];
-    }
-    else
-    {
-      /* Calculation of nearest input values */
-      x0 = S->x1 +  i      * xSpacing;
-      x1 = S->x1 + (i + 1) * xSpacing;
-
-      /* Read of nearest output values */
-      y0 = pYData[i];
-      y1 = pYData[i + 1];
-
-      /* Calculation of output */
-      y = y0 + (x - x0) * ((y1 - y0) / (x1 - x0));
-
-    }
-
-    /* returns output value */
-    return (y);
-  }
-
-
-   /**
-   *
-   * @brief  Process function for the Q31 Linear Interpolation Function.
-   * @param[in] pYData   pointer to Q31 Linear Interpolation table
-   * @param[in] x        input sample to process
-   * @param[in] nValues  number of table values
-   * @return y processed output sample.
-   *
-   * \par
-   * Input sample <code>x</code> is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
-   * This function can support maximum of table size 2^12.
-   *
-   */
-  __STATIC_FORCEINLINE q31_t arm_linear_interp_q31(
-  q31_t * pYData,
-  q31_t x,
-  uint32_t nValues)
-  {
-    q31_t y;                                     /* output */
-    q31_t y0, y1;                                /* Nearest output values */
-    q31_t fract;                                 /* fractional part */
-    int32_t index;                               /* Index to read nearest output values */
-
-    /* Input is in 12.20 format */
-    /* 12 bits for the table index */
-    /* Index value calculation */
-    index = ((x & (q31_t)0xFFF00000) >> 20);
-
-    if (index >= (int32_t)(nValues - 1))
-    {
-      return (pYData[nValues - 1]);
-    }
-    else if (index < 0)
-    {
-      return (pYData[0]);
-    }
-    else
-    {
-      /* 20 bits for the fractional part */
-      /* shift left by 11 to keep fract in 1.31 format */
-      fract = (x & 0x000FFFFF) << 11;
-
-      /* Read two nearest output values from the index in 1.31(q31) format */
-      y0 = pYData[index];
-      y1 = pYData[index + 1];
-
-      /* Calculation of y0 * (1-fract) and y is in 2.30 format */
-      y = ((q31_t) ((q63_t) y0 * (0x7FFFFFFF - fract) >> 32));
-
-      /* Calculation of y0 * (1-fract) + y1 *fract and y is in 2.30 format */
-      y += ((q31_t) (((q63_t) y1 * fract) >> 32));
-
-      /* Convert y to 1.31 format */
-      return (y << 1U);
-    }
-  }
-
-
-  /**
-   *
-   * @brief  Process function for the Q15 Linear Interpolation Function.
-   * @param[in] pYData   pointer to Q15 Linear Interpolation table
-   * @param[in] x        input sample to process
-   * @param[in] nValues  number of table values
-   * @return y processed output sample.
-   *
-   * \par
-   * Input sample <code>x</code> is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
-   * This function can support maximum of table size 2^12.
-   *
-   */
-  __STATIC_FORCEINLINE q15_t arm_linear_interp_q15(
-  q15_t * pYData,
-  q31_t x,
-  uint32_t nValues)
-  {
-    q63_t y;                                     /* output */
-    q15_t y0, y1;                                /* Nearest output values */
-    q31_t fract;                                 /* fractional part */
-    int32_t index;                               /* Index to read nearest output values */
-
-    /* Input is in 12.20 format */
-    /* 12 bits for the table index */
-    /* Index value calculation */
-    index = ((x & (int32_t)0xFFF00000) >> 20);
-
-    if (index >= (int32_t)(nValues - 1))
-    {
-      return (pYData[nValues - 1]);
-    }
-    else if (index < 0)
-    {
-      return (pYData[0]);
-    }
-    else
-    {
-      /* 20 bits for the fractional part */
-      /* fract is in 12.20 format */
-      fract = (x & 0x000FFFFF);
-
-      /* Read two nearest output values from the index */
-      y0 = pYData[index];
-      y1 = pYData[index + 1];
-
-      /* Calculation of y0 * (1-fract) and y is in 13.35 format */
-      y = ((q63_t) y0 * (0xFFFFF - fract));
-
-      /* Calculation of (y0 * (1-fract) + y1 * fract) and y is in 13.35 format */
-      y += ((q63_t) y1 * (fract));
-
-      /* convert y to 1.15 format */
-      return (q15_t) (y >> 20);
-    }
-  }
-
-
-  /**
-   *
-   * @brief  Process function for the Q7 Linear Interpolation Function.
-   * @param[in] pYData   pointer to Q7 Linear Interpolation table
-   * @param[in] x        input sample to process
-   * @param[in] nValues  number of table values
-   * @return y processed output sample.
-   *
-   * \par
-   * Input sample <code>x</code> is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
-   * This function can support maximum of table size 2^12.
-   */
-  __STATIC_FORCEINLINE q7_t arm_linear_interp_q7(
-  q7_t * pYData,
-  q31_t x,
-  uint32_t nValues)
-  {
-    q31_t y;                                     /* output */
-    q7_t y0, y1;                                 /* Nearest output values */
-    q31_t fract;                                 /* fractional part */
-    uint32_t index;                              /* Index to read nearest output values */
-
-    /* Input is in 12.20 format */
-    /* 12 bits for the table index */
-    /* Index value calculation */
-    if (x < 0)
-    {
-      return (pYData[0]);
-    }
-    index = (x >> 20) & 0xfff;
-
-    if (index >= (nValues - 1))
-    {
-      return (pYData[nValues - 1]);
-    }
-    else
-    {
-      /* 20 bits for the fractional part */
-      /* fract is in 12.20 format */
-      fract = (x & 0x000FFFFF);
-
-      /* Read two nearest output values from the index and are in 1.7(q7) format */
-      y0 = pYData[index];
-      y1 = pYData[index + 1];
-
-      /* Calculation of y0 * (1-fract ) and y is in 13.27(q27) format */
-      y = ((y0 * (0xFFFFF - fract)));
-
-      /* Calculation of y1 * fract + y0 * (1-fract) and y is in 13.27(q27) format */
-      y += (y1 * fract);
-
-      /* convert y to 1.7(q7) format */
-      return (q7_t) (y >> 20);
-     }
-  }
-
-  /**
-   * @} end of LinearInterpolate group
-   */
-
-  /**
-   * @brief  Fast approximation to the trigonometric sine function for floating-point data.
-   * @param[in] x  input value in radians.
-   * @return  sin(x).
-   */
-  float32_t arm_sin_f32(
-  float32_t x);
-
-
-  /**
-   * @brief  Fast approximation to the trigonometric sine function for Q31 data.
-   * @param[in] x  Scaled input value in radians.
-   * @return  sin(x).
-   */
-  q31_t arm_sin_q31(
-  q31_t x);
-
-
-  /**
-   * @brief  Fast approximation to the trigonometric sine function for Q15 data.
-   * @param[in] x  Scaled input value in radians.
-   * @return  sin(x).
-   */
-  q15_t arm_sin_q15(
-  q15_t x);
-
-
-  /**
-   * @brief  Fast approximation to the trigonometric cosine function for floating-point data.
-   * @param[in] x  input value in radians.
-   * @return  cos(x).
-   */
-  float32_t arm_cos_f32(
-  float32_t x);
-
-
-  /**
-   * @brief Fast approximation to the trigonometric cosine function for Q31 data.
-   * @param[in] x  Scaled input value in radians.
-   * @return  cos(x).
-   */
-  q31_t arm_cos_q31(
-  q31_t x);
-
-
-  /**
-   * @brief  Fast approximation to the trigonometric cosine function for Q15 data.
-   * @param[in] x  Scaled input value in radians.
-   * @return  cos(x).
-   */
-  q15_t arm_cos_q15(
-  q15_t x);
-
-
-/**
-  @brief         Floating-point vector of log values.
-  @param[in]     pSrc       points to the input vector
-  @param[out]    pDst       points to the output vector
-  @param[in]     blockSize  number of samples in each vector
-  @return        none
- */
-  void arm_vlog_f32(
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-/**
-  @brief         Floating-point vector of exp values.
-  @param[in]     pSrc       points to the input vector
-  @param[out]    pDst       points to the output vector
-  @param[in]     blockSize  number of samples in each vector
-  @return        none
- */
-  void arm_vexp_f32(
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-  /**
-   * @ingroup groupFastMath
-   */
-
-
-  /**
-   * @defgroup SQRT Square Root
-   *
-   * Computes the square root of a number.
-   * There are separate functions for Q15, Q31, and floating-point data types.
-   * The square root function is computed using the Newton-Raphson algorithm.
-   * This is an iterative algorithm of the form:
-   * <pre>
-   *      x1 = x0 - f(x0)/f'(x0)
-   * </pre>
-   * where <code>x1</code> is the current estimate,
-   * <code>x0</code> is the previous estimate, and
-   * <code>f'(x0)</code> is the derivative of <code>f()</code> evaluated at <code>x0</code>.
-   * For the square root function, the algorithm reduces to:
-   * <pre>
-   *     x0 = in/2                         [initial guess]
-   *     x1 = 1/2 * ( x0 + in / x0)        [each iteration]
-   * </pre>
-   */
-
-
-  /**
-   * @addtogroup SQRT
-   * @{
-   */
-
-/**
-  @brief         Floating-point square root function.
-  @param[in]     in    input value
-  @param[out]    pOut  square root of input value
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : input value is positive
-                   - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
- */
-__STATIC_FORCEINLINE arm_status arm_sqrt_f32(
-  float32_t in,
-  float32_t * pOut)
-  {
-    if (in >= 0.0f)
-    {
-#if defined ( __CC_ARM )
-  #if defined __TARGET_FPU_VFP
-      *pOut = __sqrtf(in);
-  #else
-      *pOut = sqrtf(in);
-  #endif
-
-#elif defined ( __ICCARM__ )
-  #if defined __ARMVFP__
-      __ASM("VSQRT.F32 %0,%1" : "=t"(*pOut) : "t"(in));
-  #else
-      *pOut = sqrtf(in);
-  #endif
-
-#else
-      *pOut = sqrtf(in);
-#endif
-
-      return (ARM_MATH_SUCCESS);
-    }
-    else
-    {
-      *pOut = 0.0f;
-      return (ARM_MATH_ARGUMENT_ERROR);
-    }
-  }
-
-
-/**
-  @brief         Q31 square root function.
-  @param[in]     in    input value.  The range of the input value is [0 +1) or 0x00000000 to 0x7FFFFFFF
-  @param[out]    pOut  points to square root of input value
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : input value is positive
-                   - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
- */
-arm_status arm_sqrt_q31(
-  q31_t in,
-  q31_t * pOut);
-
-
-/**
-  @brief         Q15 square root function.
-  @param[in]     in    input value.  The range of the input value is [0 +1) or 0x0000 to 0x7FFF
-  @param[out]    pOut  points to square root of input value
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : input value is positive
-                   - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
- */
-arm_status arm_sqrt_q15(
-  q15_t in,
-  q15_t * pOut);
-
-  /**
-   * @brief  Vector Floating-point square root function.
-   * @param[in]  pIn   input vector.
-   * @param[out] pOut  vector of square roots of input elements.
-   * @param[in]  len   length of input vector.
-   * @return The function returns ARM_MATH_SUCCESS if input value is positive value or ARM_MATH_ARGUMENT_ERROR if
-   * <code>in</code> is negative value and returns zero output for negative values.
-   */
-  void arm_vsqrt_f32(
-  float32_t * pIn,
-  float32_t * pOut,
-  uint16_t len);
-
-  void arm_vsqrt_q31(
-  q31_t * pIn,
-  q31_t * pOut,
-  uint16_t len);
-
-  void arm_vsqrt_q15(
-  q15_t * pIn,
-  q15_t * pOut,
-  uint16_t len);
-
-  /**
-   * @} end of SQRT group
-   */
-
-
-  /**
-   * @brief floating-point Circular write function.
-   */
-  __STATIC_FORCEINLINE void arm_circularWrite_f32(
-  int32_t * circBuffer,
-  int32_t L,
-  uint16_t * writeOffset,
-  int32_t bufferInc,
-  const int32_t * src,
-  int32_t srcInc,
-  uint32_t blockSize)
-  {
-    uint32_t i = 0U;
-    int32_t wOffset;
-
-    /* Copy the value of Index pointer that points
-     * to the current location where the input samples to be copied */
-    wOffset = *writeOffset;
-
-    /* Loop over the blockSize */
-    i = blockSize;
-
-    while (i > 0U)
-    {
-      /* copy the input sample to the circular buffer */
-      circBuffer[wOffset] = *src;
-
-      /* Update the input pointer */
-      src += srcInc;
-
-      /* Circularly update wOffset.  Watch out for positive and negative value */
-      wOffset += bufferInc;
-      if (wOffset >= L)
-        wOffset -= L;
-
-      /* Decrement the loop counter */
-      i--;
-    }
-
-    /* Update the index pointer */
-    *writeOffset = (uint16_t)wOffset;
-  }
-
-
-
-  /**
-   * @brief floating-point Circular Read function.
-   */
-  __STATIC_FORCEINLINE void arm_circularRead_f32(
-  int32_t * circBuffer,
-  int32_t L,
-  int32_t * readOffset,
-  int32_t bufferInc,
-  int32_t * dst,
-  int32_t * dst_base,
-  int32_t dst_length,
-  int32_t dstInc,
-  uint32_t blockSize)
-  {
-    uint32_t i = 0U;
-    int32_t rOffset;
-    int32_t* dst_end;
-
-    /* Copy the value of Index pointer that points
-     * to the current location from where the input samples to be read */
-    rOffset = *readOffset;
-    dst_end = dst_base + dst_length;
-
-    /* Loop over the blockSize */
-    i = blockSize;
-
-    while (i > 0U)
-    {
-      /* copy the sample from the circular buffer to the destination buffer */
-      *dst = circBuffer[rOffset];
-
-      /* Update the input pointer */
-      dst += dstInc;
-
-      if (dst == dst_end)
-      {
-        dst = dst_base;
-      }
-
-      /* Circularly update rOffset.  Watch out for positive and negative value  */
-      rOffset += bufferInc;
-
-      if (rOffset >= L)
-      {
-        rOffset -= L;
-      }
-
-      /* Decrement the loop counter */
-      i--;
-    }
-
-    /* Update the index pointer */
-    *readOffset = rOffset;
-  }
-
-
-  /**
-   * @brief Q15 Circular write function.
-   */
-  __STATIC_FORCEINLINE void arm_circularWrite_q15(
-  q15_t * circBuffer,
-  int32_t L,
-  uint16_t * writeOffset,
-  int32_t bufferInc,
-  const q15_t * src,
-  int32_t srcInc,
-  uint32_t blockSize)
-  {
-    uint32_t i = 0U;
-    int32_t wOffset;
-
-    /* Copy the value of Index pointer that points
-     * to the current location where the input samples to be copied */
-    wOffset = *writeOffset;
-
-    /* Loop over the blockSize */
-    i = blockSize;
-
-    while (i > 0U)
-    {
-      /* copy the input sample to the circular buffer */
-      circBuffer[wOffset] = *src;
-
-      /* Update the input pointer */
-      src += srcInc;
-
-      /* Circularly update wOffset.  Watch out for positive and negative value */
-      wOffset += bufferInc;
-      if (wOffset >= L)
-        wOffset -= L;
-
-      /* Decrement the loop counter */
-      i--;
-    }
-
-    /* Update the index pointer */
-    *writeOffset = (uint16_t)wOffset;
-  }
-
-
-  /**
-   * @brief Q15 Circular Read function.
-   */
-  __STATIC_FORCEINLINE void arm_circularRead_q15(
-  q15_t * circBuffer,
-  int32_t L,
-  int32_t * readOffset,
-  int32_t bufferInc,
-  q15_t * dst,
-  q15_t * dst_base,
-  int32_t dst_length,
-  int32_t dstInc,
-  uint32_t blockSize)
-  {
-    uint32_t i = 0;
-    int32_t rOffset;
-    q15_t* dst_end;
-
-    /* Copy the value of Index pointer that points
-     * to the current location from where the input samples to be read */
-    rOffset = *readOffset;
-
-    dst_end = dst_base + dst_length;
-
-    /* Loop over the blockSize */
-    i = blockSize;
-
-    while (i > 0U)
-    {
-      /* copy the sample from the circular buffer to the destination buffer */
-      *dst = circBuffer[rOffset];
-
-      /* Update the input pointer */
-      dst += dstInc;
-
-      if (dst == dst_end)
-      {
-        dst = dst_base;
-      }
-
-      /* Circularly update wOffset.  Watch out for positive and negative value */
-      rOffset += bufferInc;
-
-      if (rOffset >= L)
-      {
-        rOffset -= L;
-      }
-
-      /* Decrement the loop counter */
-      i--;
-    }
-
-    /* Update the index pointer */
-    *readOffset = rOffset;
-  }
-
-
-  /**
-   * @brief Q7 Circular write function.
-   */
-  __STATIC_FORCEINLINE void arm_circularWrite_q7(
-  q7_t * circBuffer,
-  int32_t L,
-  uint16_t * writeOffset,
-  int32_t bufferInc,
-  const q7_t * src,
-  int32_t srcInc,
-  uint32_t blockSize)
-  {
-    uint32_t i = 0U;
-    int32_t wOffset;
-
-    /* Copy the value of Index pointer that points
-     * to the current location where the input samples to be copied */
-    wOffset = *writeOffset;
-
-    /* Loop over the blockSize */
-    i = blockSize;
-
-    while (i > 0U)
-    {
-      /* copy the input sample to the circular buffer */
-      circBuffer[wOffset] = *src;
-
-      /* Update the input pointer */
-      src += srcInc;
-
-      /* Circularly update wOffset.  Watch out for positive and negative value */
-      wOffset += bufferInc;
-      if (wOffset >= L)
-        wOffset -= L;
-
-      /* Decrement the loop counter */
-      i--;
-    }
-
-    /* Update the index pointer */
-    *writeOffset = (uint16_t)wOffset;
-  }
-
-
-  /**
-   * @brief Q7 Circular Read function.
-   */
-  __STATIC_FORCEINLINE void arm_circularRead_q7(
-  q7_t * circBuffer,
-  int32_t L,
-  int32_t * readOffset,
-  int32_t bufferInc,
-  q7_t * dst,
-  q7_t * dst_base,
-  int32_t dst_length,
-  int32_t dstInc,
-  uint32_t blockSize)
-  {
-    uint32_t i = 0;
-    int32_t rOffset;
-    q7_t* dst_end;
-
-    /* Copy the value of Index pointer that points
-     * to the current location from where the input samples to be read */
-    rOffset = *readOffset;
-
-    dst_end = dst_base + dst_length;
-
-    /* Loop over the blockSize */
-    i = blockSize;
-
-    while (i > 0U)
-    {
-      /* copy the sample from the circular buffer to the destination buffer */
-      *dst = circBuffer[rOffset];
-
-      /* Update the input pointer */
-      dst += dstInc;
-
-      if (dst == dst_end)
-      {
-        dst = dst_base;
-      }
-
-      /* Circularly update rOffset.  Watch out for positive and negative value */
-      rOffset += bufferInc;
-
-      if (rOffset >= L)
-      {
-        rOffset -= L;
-      }
-
-      /* Decrement the loop counter */
-      i--;
-    }
-
-    /* Update the index pointer */
-    *readOffset = rOffset;
-  }
-
-
-  /**
-   * @brief  Sum of the squares of the elements of a Q31 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_power_q31(
-  const q31_t * pSrc,
-        uint32_t blockSize,
-        q63_t * pResult);
-
-
-  /**
-   * @brief  Sum of the squares of the elements of a floating-point vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_power_f32(
-  const float32_t * pSrc,
-        uint32_t blockSize,
-        float32_t * pResult);
-
-
-  /**
-   * @brief  Sum of the squares of the elements of a Q15 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_power_q15(
-  const q15_t * pSrc,
-        uint32_t blockSize,
-        q63_t * pResult);
-
-
-  /**
-   * @brief  Sum of the squares of the elements of a Q7 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_power_q7(
-  const q7_t * pSrc,
-        uint32_t blockSize,
-        q31_t * pResult);
-
-
-  /**
-   * @brief  Mean value of a Q7 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_mean_q7(
-  const q7_t * pSrc,
-        uint32_t blockSize,
-        q7_t * pResult);
-
-
-  /**
-   * @brief  Mean value of a Q15 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_mean_q15(
-  const q15_t * pSrc,
-        uint32_t blockSize,
-        q15_t * pResult);
-
-
-  /**
-   * @brief  Mean value of a Q31 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_mean_q31(
-  const q31_t * pSrc,
-        uint32_t blockSize,
-        q31_t * pResult);
-
-
-  /**
-   * @brief  Mean value of a floating-point vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_mean_f32(
-  const float32_t * pSrc,
-        uint32_t blockSize,
-        float32_t * pResult);
-
-
-  /**
-   * @brief  Variance of the elements of a floating-point vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_var_f32(
-  const float32_t * pSrc,
-        uint32_t blockSize,
-        float32_t * pResult);
-
-
-  /**
-   * @brief  Variance of the elements of a Q31 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_var_q31(
-  const q31_t * pSrc,
-        uint32_t blockSize,
-        q31_t * pResult);
-
-
-  /**
-   * @brief  Variance of the elements of a Q15 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_var_q15(
-  const q15_t * pSrc,
-        uint32_t blockSize,
-        q15_t * pResult);
-
-
-  /**
-   * @brief  Root Mean Square of the elements of a floating-point vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_rms_f32(
-  const float32_t * pSrc,
-        uint32_t blockSize,
-        float32_t * pResult);
-
-
-  /**
-   * @brief  Root Mean Square of the elements of a Q31 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_rms_q31(
-  const q31_t * pSrc,
-        uint32_t blockSize,
-        q31_t * pResult);
-
-
-  /**
-   * @brief  Root Mean Square of the elements of a Q15 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_rms_q15(
-  const q15_t * pSrc,
-        uint32_t blockSize,
-        q15_t * pResult);
-
-
-  /**
-   * @brief  Standard deviation of the elements of a floating-point vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_std_f32(
-  const float32_t * pSrc,
-        uint32_t blockSize,
-        float32_t * pResult);
-
-
-  /**
-   * @brief  Standard deviation of the elements of a Q31 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_std_q31(
-  const q31_t * pSrc,
-        uint32_t blockSize,
-        q31_t * pResult);
-
-
-  /**
-   * @brief  Standard deviation of the elements of a Q15 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output value.
-   */
-  void arm_std_q15(
-  const q15_t * pSrc,
-        uint32_t blockSize,
-        q15_t * pResult);
-
-
-  /**
-   * @brief  Floating-point complex magnitude
-   * @param[in]  pSrc        points to the complex input vector
-   * @param[out] pDst        points to the real output vector
-   * @param[in]  numSamples  number of complex samples in the input vector
-   */
-  void arm_cmplx_mag_f32(
-  const float32_t * pSrc,
-        float32_t * pDst,
-        uint32_t numSamples);
-
-
-  /**
-   * @brief  Q31 complex magnitude
-   * @param[in]  pSrc        points to the complex input vector
-   * @param[out] pDst        points to the real output vector
-   * @param[in]  numSamples  number of complex samples in the input vector
-   */
-  void arm_cmplx_mag_q31(
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t numSamples);
-
-
-  /**
-   * @brief  Q15 complex magnitude
-   * @param[in]  pSrc        points to the complex input vector
-   * @param[out] pDst        points to the real output vector
-   * @param[in]  numSamples  number of complex samples in the input vector
-   */
-  void arm_cmplx_mag_q15(
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t numSamples);
-
-
-  /**
-   * @brief  Q15 complex dot product
-   * @param[in]  pSrcA       points to the first input vector
-   * @param[in]  pSrcB       points to the second input vector
-   * @param[in]  numSamples  number of complex samples in each vector
-   * @param[out] realResult  real part of the result returned here
-   * @param[out] imagResult  imaginary part of the result returned here
-   */
-  void arm_cmplx_dot_prod_q15(
-  const q15_t * pSrcA,
-  const q15_t * pSrcB,
-        uint32_t numSamples,
-        q31_t * realResult,
-        q31_t * imagResult);
-
-
-  /**
-   * @brief  Q31 complex dot product
-   * @param[in]  pSrcA       points to the first input vector
-   * @param[in]  pSrcB       points to the second input vector
-   * @param[in]  numSamples  number of complex samples in each vector
-   * @param[out] realResult  real part of the result returned here
-   * @param[out] imagResult  imaginary part of the result returned here
-   */
-  void arm_cmplx_dot_prod_q31(
-  const q31_t * pSrcA,
-  const q31_t * pSrcB,
-        uint32_t numSamples,
-        q63_t * realResult,
-        q63_t * imagResult);
-
-
-  /**
-   * @brief  Floating-point complex dot product
-   * @param[in]  pSrcA       points to the first input vector
-   * @param[in]  pSrcB       points to the second input vector
-   * @param[in]  numSamples  number of complex samples in each vector
-   * @param[out] realResult  real part of the result returned here
-   * @param[out] imagResult  imaginary part of the result returned here
-   */
-  void arm_cmplx_dot_prod_f32(
-  const float32_t * pSrcA,
-  const float32_t * pSrcB,
-        uint32_t numSamples,
-        float32_t * realResult,
-        float32_t * imagResult);
-
-
-  /**
-   * @brief  Q15 complex-by-real multiplication
-   * @param[in]  pSrcCmplx   points to the complex input vector
-   * @param[in]  pSrcReal    points to the real input vector
-   * @param[out] pCmplxDst   points to the complex output vector
-   * @param[in]  numSamples  number of samples in each vector
-   */
-  void arm_cmplx_mult_real_q15(
-  const q15_t * pSrcCmplx,
-  const q15_t * pSrcReal,
-        q15_t * pCmplxDst,
-        uint32_t numSamples);
-
-
-  /**
-   * @brief  Q31 complex-by-real multiplication
-   * @param[in]  pSrcCmplx   points to the complex input vector
-   * @param[in]  pSrcReal    points to the real input vector
-   * @param[out] pCmplxDst   points to the complex output vector
-   * @param[in]  numSamples  number of samples in each vector
-   */
-  void arm_cmplx_mult_real_q31(
-  const q31_t * pSrcCmplx,
-  const q31_t * pSrcReal,
-        q31_t * pCmplxDst,
-        uint32_t numSamples);
-
-
-  /**
-   * @brief  Floating-point complex-by-real multiplication
-   * @param[in]  pSrcCmplx   points to the complex input vector
-   * @param[in]  pSrcReal    points to the real input vector
-   * @param[out] pCmplxDst   points to the complex output vector
-   * @param[in]  numSamples  number of samples in each vector
-   */
-  void arm_cmplx_mult_real_f32(
-  const float32_t * pSrcCmplx,
-  const float32_t * pSrcReal,
-        float32_t * pCmplxDst,
-        uint32_t numSamples);
-
-
-  /**
-   * @brief  Minimum value of a Q7 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] result     is output pointer
-   * @param[in]  index      is the array index of the minimum value in the input buffer.
-   */
-  void arm_min_q7(
-  const q7_t * pSrc,
-        uint32_t blockSize,
-        q7_t * result,
-        uint32_t * index);
-
-
-  /**
-   * @brief  Minimum value of a Q15 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output pointer
-   * @param[in]  pIndex     is the array index of the minimum value in the input buffer.
-   */
-  void arm_min_q15(
-  const q15_t * pSrc,
-        uint32_t blockSize,
-        q15_t * pResult,
-        uint32_t * pIndex);
-
-
-  /**
-   * @brief  Minimum value of a Q31 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output pointer
-   * @param[out] pIndex     is the array index of the minimum value in the input buffer.
-   */
-  void arm_min_q31(
-  const q31_t * pSrc,
-        uint32_t blockSize,
-        q31_t * pResult,
-        uint32_t * pIndex);
-
-
-  /**
-   * @brief  Minimum value of a floating-point vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[in]  blockSize  is the number of samples to process
-   * @param[out] pResult    is output pointer
-   * @param[out] pIndex     is the array index of the minimum value in the input buffer.
-   */
-  void arm_min_f32(
-  const float32_t * pSrc,
-        uint32_t blockSize,
-        float32_t * pResult,
-        uint32_t * pIndex);
-
-
-/**
- * @brief Maximum value of a Q7 vector.
- * @param[in]  pSrc       points to the input buffer
- * @param[in]  blockSize  length of the input vector
- * @param[out] pResult    maximum value returned here
- * @param[out] pIndex     index of maximum value returned here
- */
-  void arm_max_q7(
-  const q7_t * pSrc,
-        uint32_t blockSize,
-        q7_t * pResult,
-        uint32_t * pIndex);
-
-
-/**
- * @brief Maximum value of a Q15 vector.
- * @param[in]  pSrc       points to the input buffer
- * @param[in]  blockSize  length of the input vector
- * @param[out] pResult    maximum value returned here
- * @param[out] pIndex     index of maximum value returned here
- */
-  void arm_max_q15(
-  const q15_t * pSrc,
-        uint32_t blockSize,
-        q15_t * pResult,
-        uint32_t * pIndex);
-
-
-/**
- * @brief Maximum value of a Q31 vector.
- * @param[in]  pSrc       points to the input buffer
- * @param[in]  blockSize  length of the input vector
- * @param[out] pResult    maximum value returned here
- * @param[out] pIndex     index of maximum value returned here
- */
-  void arm_max_q31(
-  const q31_t * pSrc,
-        uint32_t blockSize,
-        q31_t * pResult,
-        uint32_t * pIndex);
-
-
-/**
- * @brief Maximum value of a floating-point vector.
- * @param[in]  pSrc       points to the input buffer
- * @param[in]  blockSize  length of the input vector
- * @param[out] pResult    maximum value returned here
- * @param[out] pIndex     index of maximum value returned here
- */
-  void arm_max_f32(
-  const float32_t * pSrc,
-        uint32_t blockSize,
-        float32_t * pResult,
-        uint32_t * pIndex);
-
-  /**
-    @brief         Maximum value of a floating-point vector.
-    @param[in]     pSrc       points to the input vector
-    @param[in]     blockSize  number of samples in input vector
-    @param[out]    pResult    maximum value returned here
-    @return        none
-   */
-  void arm_max_no_idx_f32(
-      const float32_t *pSrc,
-      uint32_t   blockSize,
-      float32_t *pResult);
-
-  /**
-   * @brief  Q15 complex-by-complex multiplication
-   * @param[in]  pSrcA       points to the first input vector
-   * @param[in]  pSrcB       points to the second input vector
-   * @param[out] pDst        points to the output vector
-   * @param[in]  numSamples  number of complex samples in each vector
-   */
-  void arm_cmplx_mult_cmplx_q15(
-  const q15_t * pSrcA,
-  const q15_t * pSrcB,
-        q15_t * pDst,
-        uint32_t numSamples);
-
-
-  /**
-   * @brief  Q31 complex-by-complex multiplication
-   * @param[in]  pSrcA       points to the first input vector
-   * @param[in]  pSrcB       points to the second input vector
-   * @param[out] pDst        points to the output vector
-   * @param[in]  numSamples  number of complex samples in each vector
-   */
-  void arm_cmplx_mult_cmplx_q31(
-  const q31_t * pSrcA,
-  const q31_t * pSrcB,
-        q31_t * pDst,
-        uint32_t numSamples);
-
-
-  /**
-   * @brief  Floating-point complex-by-complex multiplication
-   * @param[in]  pSrcA       points to the first input vector
-   * @param[in]  pSrcB       points to the second input vector
-   * @param[out] pDst        points to the output vector
-   * @param[in]  numSamples  number of complex samples in each vector
-   */
-  void arm_cmplx_mult_cmplx_f32(
-  const float32_t * pSrcA,
-  const float32_t * pSrcB,
-        float32_t * pDst,
-        uint32_t numSamples);
-
-
-  /**
-   * @brief Converts the elements of the floating-point vector to Q31 vector.
-   * @param[in]  pSrc       points to the floating-point input vector
-   * @param[out] pDst       points to the Q31 output vector
-   * @param[in]  blockSize  length of the input vector
-   */
-  void arm_float_to_q31(
-  const float32_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Converts the elements of the floating-point vector to Q15 vector.
-   * @param[in]  pSrc       points to the floating-point input vector
-   * @param[out] pDst       points to the Q15 output vector
-   * @param[in]  blockSize  length of the input vector
-   */
-  void arm_float_to_q15(
-  const float32_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief Converts the elements of the floating-point vector to Q7 vector.
-   * @param[in]  pSrc       points to the floating-point input vector
-   * @param[out] pDst       points to the Q7 output vector
-   * @param[in]  blockSize  length of the input vector
-   */
-  void arm_float_to_q7(
-  const float32_t * pSrc,
-        q7_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Converts the elements of the Q31 vector to floating-point vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[out] pDst       is output pointer
-   * @param[in]  blockSize  is the number of samples to process
-   */
-  void arm_q31_to_float(
-  const q31_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Converts the elements of the Q31 vector to Q15 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[out] pDst       is output pointer
-   * @param[in]  blockSize  is the number of samples to process
-   */
-  void arm_q31_to_q15(
-  const q31_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Converts the elements of the Q31 vector to Q7 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[out] pDst       is output pointer
-   * @param[in]  blockSize  is the number of samples to process
-   */
-  void arm_q31_to_q7(
-  const q31_t * pSrc,
-        q7_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Converts the elements of the Q15 vector to floating-point vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[out] pDst       is output pointer
-   * @param[in]  blockSize  is the number of samples to process
-   */
-  void arm_q15_to_float(
-  const q15_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Converts the elements of the Q15 vector to Q31 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[out] pDst       is output pointer
-   * @param[in]  blockSize  is the number of samples to process
-   */
-  void arm_q15_to_q31(
-  const q15_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Converts the elements of the Q15 vector to Q7 vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[out] pDst       is output pointer
-   * @param[in]  blockSize  is the number of samples to process
-   */
-  void arm_q15_to_q7(
-  const q15_t * pSrc,
-        q7_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Converts the elements of the Q7 vector to floating-point vector.
-   * @param[in]  pSrc       is input pointer
-   * @param[out] pDst       is output pointer
-   * @param[in]  blockSize  is the number of samples to process
-   */
-  void arm_q7_to_float(
-  const q7_t * pSrc,
-        float32_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Converts the elements of the Q7 vector to Q31 vector.
-   * @param[in]  pSrc       input pointer
-   * @param[out] pDst       output pointer
-   * @param[in]  blockSize  number of samples to process
-   */
-  void arm_q7_to_q31(
-  const q7_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize);
-
-
-  /**
-   * @brief  Converts the elements of the Q7 vector to Q15 vector.
-   * @param[in]  pSrc       input pointer
-   * @param[out] pDst       output pointer
-   * @param[in]  blockSize  number of samples to process
-   */
-  void arm_q7_to_q15(
-  const q7_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize);
-
-/**
- * @brief Struct for specifying SVM Kernel
- */
-typedef enum
-{
-    ARM_ML_KERNEL_LINEAR = 0,
-             /**< Linear kernel */
-    ARM_ML_KERNEL_POLYNOMIAL = 1,
-             /**< Polynomial kernel */
-    ARM_ML_KERNEL_RBF = 2,
-             /**< Radial Basis Function kernel */
-    ARM_ML_KERNEL_SIGMOID = 3
-             /**< Sigmoid kernel */
-} arm_ml_kernel_type;
-
-
-/**
- * @brief Instance structure for linear SVM prediction function.
- */
-typedef struct
-{
-  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
-  uint32_t        vectorDimension;        /**< Dimension of vector space */
-  float32_t       intercept;              /**< Intercept */
-  const float32_t *dualCoefficients;      /**< Dual coefficients */
-  const float32_t *supportVectors;        /**< Support vectors */
-  const int32_t   *classes;               /**< The two SVM classes */
-} arm_svm_linear_instance_f32;
-
-
-/**
- * @brief Instance structure for polynomial SVM prediction function.
- */
-typedef struct
-{
-  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
-  uint32_t        vectorDimension;        /**< Dimension of vector space */
-  float32_t       intercept;              /**< Intercept */
-  const float32_t *dualCoefficients;      /**< Dual coefficients */
-  const float32_t *supportVectors;        /**< Support vectors */
-  const int32_t   *classes;               /**< The two SVM classes */
-  int32_t         degree;                 /**< Polynomial degree */
-  float32_t       coef0;                  /**< Polynomial constant */
-  float32_t       gamma;                  /**< Gamma factor */
-} arm_svm_polynomial_instance_f32;
-
-/**
- * @brief Instance structure for rbf SVM prediction function.
- */
-typedef struct
-{
-  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
-  uint32_t        vectorDimension;        /**< Dimension of vector space */
-  float32_t       intercept;              /**< Intercept */
-  const float32_t *dualCoefficients;      /**< Dual coefficients */
-  const float32_t *supportVectors;        /**< Support vectors */
-  const int32_t   *classes;               /**< The two SVM classes */
-  float32_t       gamma;                  /**< Gamma factor */
-} arm_svm_rbf_instance_f32;
-
-/**
- * @brief Instance structure for sigmoid SVM prediction function.
- */
-typedef struct
-{
-  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
-  uint32_t        vectorDimension;        /**< Dimension of vector space */
-  float32_t       intercept;              /**< Intercept */
-  const float32_t *dualCoefficients;      /**< Dual coefficients */
-  const float32_t *supportVectors;        /**< Support vectors */
-  const int32_t   *classes;               /**< The two SVM classes */
-  float32_t       coef0;                  /**< Independant constant */
-  float32_t       gamma;                  /**< Gamma factor */
-} arm_svm_sigmoid_instance_f32;
-
-/**
- * @brief        SVM linear instance init function
- * @param[in]    S                      Parameters for SVM functions
- * @param[in]    nbOfSupportVectors     Number of support vectors
- * @param[in]    vectorDimension        Dimension of vector space
- * @param[in]    intercept              Intercept
- * @param[in]    dualCoefficients       Array of dual coefficients
- * @param[in]    supportVectors         Array of support vectors
- * @param[in]    classes                Array of 2 classes ID
- * @return none.
- *
- */
-
-
-void arm_svm_linear_init_f32(arm_svm_linear_instance_f32 *S, 
-  uint32_t nbOfSupportVectors,
-  uint32_t vectorDimension,
-  float32_t intercept,
-  const float32_t *dualCoefficients,
-  const float32_t *supportVectors,
-  const int32_t  *classes);
-
-/**
- * @brief SVM linear prediction
- * @param[in]    S          Pointer to an instance of the linear SVM structure.
- * @param[in]    in         Pointer to input vector
- * @param[out]   pResult    Decision value
- * @return none.
- *
- */
-  
-void arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 *S, 
-   const float32_t * in, 
-   int32_t * pResult);
-
-
-/**
- * @brief        SVM polynomial instance init function
- * @param[in]    S                      points to an instance of the polynomial SVM structure.
- * @param[in]    nbOfSupportVectors     Number of support vectors
- * @param[in]    vectorDimension        Dimension of vector space
- * @param[in]    intercept              Intercept
- * @param[in]    dualCoefficients       Array of dual coefficients
- * @param[in]    supportVectors         Array of support vectors
- * @param[in]    classes                Array of 2 classes ID
- * @param[in]    degree                 Polynomial degree
- * @param[in]    coef0                  coeff0 (scikit-learn terminology)
- * @param[in]    gamma                  gamma (scikit-learn terminology)
- * @return none.
- *
- */
-
-
-void arm_svm_polynomial_init_f32(arm_svm_polynomial_instance_f32 *S, 
-  uint32_t nbOfSupportVectors,
-  uint32_t vectorDimension,
-  float32_t intercept,
-  const float32_t *dualCoefficients,
-  const float32_t *supportVectors,
-  const int32_t   *classes,
-  int32_t      degree,
-  float32_t coef0,
-  float32_t gamma
-  );
-
-/**
- * @brief SVM polynomial prediction
- * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
- * @param[in]    in         Pointer to input vector
- * @param[out]   pResult    Decision value
- * @return none.
- *
- */
-void arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 *S, 
-   const float32_t * in, 
-   int32_t * pResult);
-
-
-/**
- * @brief        SVM radial basis function instance init function
- * @param[in]    S                      points to an instance of the polynomial SVM structure.
- * @param[in]    nbOfSupportVectors     Number of support vectors
- * @param[in]    vectorDimension        Dimension of vector space
- * @param[in]    intercept              Intercept
- * @param[in]    dualCoefficients       Array of dual coefficients
- * @param[in]    supportVectors         Array of support vectors
- * @param[in]    classes                Array of 2 classes ID
- * @param[in]    gamma                  gamma (scikit-learn terminology)
- * @return none.
- *
- */
-
-void arm_svm_rbf_init_f32(arm_svm_rbf_instance_f32 *S, 
-  uint32_t nbOfSupportVectors,
-  uint32_t vectorDimension,
-  float32_t intercept,
-  const float32_t *dualCoefficients,
-  const float32_t *supportVectors,
-  const int32_t   *classes,
-  float32_t gamma
-  );
-
-/**
- * @brief SVM rbf prediction
- * @param[in]    S         Pointer to an instance of the rbf SVM structure.
- * @param[in]    in        Pointer to input vector
- * @param[out]   pResult   decision value
- * @return none.
- *
- */
-void arm_svm_rbf_predict_f32(const arm_svm_rbf_instance_f32 *S, 
-   const float32_t * in, 
-   int32_t * pResult);
-
-/**
- * @brief        SVM sigmoid instance init function
- * @param[in]    S                      points to an instance of the rbf SVM structure.
- * @param[in]    nbOfSupportVectors     Number of support vectors
- * @param[in]    vectorDimension        Dimension of vector space
- * @param[in]    intercept              Intercept
- * @param[in]    dualCoefficients       Array of dual coefficients
- * @param[in]    supportVectors         Array of support vectors
- * @param[in]    classes                Array of 2 classes ID
- * @param[in]    coef0                  coeff0 (scikit-learn terminology)
- * @param[in]    gamma                  gamma (scikit-learn terminology)
- * @return none.
- *
- */
-
-void arm_svm_sigmoid_init_f32(arm_svm_sigmoid_instance_f32 *S, 
-  uint32_t nbOfSupportVectors,
-  uint32_t vectorDimension,
-  float32_t intercept,
-  const float32_t *dualCoefficients,
-  const float32_t *supportVectors,
-  const int32_t   *classes,
-  float32_t coef0,
-  float32_t gamma
-  );
-
-/**
- * @brief SVM sigmoid prediction
- * @param[in]    S        Pointer to an instance of the rbf SVM structure.
- * @param[in]    in       Pointer to input vector
- * @param[out]   pResult  Decision value
- * @return none.
- *
- */
-void arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 *S, 
-   const float32_t * in, 
-   int32_t * pResult);
-
-
-
-/**
- * @brief Instance structure for Naive Gaussian Bayesian estimator.
- */
-typedef struct
-{
-  uint32_t vectorDimension;  /**< Dimension of vector space */
-  uint32_t numberOfClasses;  /**< Number of different classes  */
-  const float32_t *theta;          /**< Mean values for the Gaussians */
-  const float32_t *sigma;          /**< Variances for the Gaussians */
-  const float32_t *classPriors;    /**< Class prior probabilities */
-  float32_t epsilon;         /**< Additive value to variances */
-} arm_gaussian_naive_bayes_instance_f32;
-
-/**
- * @brief Naive Gaussian Bayesian Estimator
- *
- * @param[in]  S         points to a naive bayes instance structure
- * @param[in]  in        points to the elements of the input vector.
- * @param[in]  pBuffer   points to a buffer of length numberOfClasses
- * @return The predicted class
- *
- */
-
-
-uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S, 
-   const float32_t * in, 
-   float32_t *pBuffer);
-
-/**
- * @brief Computation of the LogSumExp
- *
- * In probabilistic computations, the dynamic of the probability values can be very
- * wide because they come from gaussian functions.
- * To avoid underflow and overflow issues, the values are represented by their log.
- * In this representation, multiplying the original exp values is easy : their logs are added.
- * But adding the original exp values is requiring some special handling and it is the
- * goal of the LogSumExp function.
- *
- * If the values are x1...xn, the function is computing:
- *
- * ln(exp(x1) + ... + exp(xn)) and the computation is done in such a way that
- * rounding issues are minimised.
- *
- * The max xm of the values is extracted and the function is computing:
- * xm + ln(exp(x1 - xm) + ... + exp(xn - xm))
- *
- * @param[in]  *in         Pointer to an array of input values.
- * @param[in]  blockSize   Number of samples in the input array.
- * @return LogSumExp
- *
- */
-
-
-float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize);
-
-/**
- * @brief Dot product with log arithmetic
- *
- * Vectors are containing the log of the samples
- *
- * @param[in]       pSrcA points to the first input vector
- * @param[in]       pSrcB points to the second input vector
- * @param[in]       blockSize number of samples in each vector
- * @param[in]       pTmpBuffer temporary buffer of length blockSize
- * @return The log of the dot product .
- *
- */
-
-
-float32_t arm_logsumexp_dot_prod_f32(const float32_t * pSrcA,
-  const float32_t * pSrcB,
-  uint32_t blockSize,
-  float32_t *pTmpBuffer);
-
-/**
- * @brief Entropy
- *
- * @param[in]  pSrcA        Array of input values.
- * @param[in]  blockSize    Number of samples in the input array.
- * @return     Entropy      -Sum(p ln p)
- *
- */
-
-
-float32_t arm_entropy_f32(const float32_t * pSrcA,uint32_t blockSize);
-
-
-/**
- * @brief Entropy
- *
- * @param[in]  pSrcA        Array of input values.
- * @param[in]  blockSize    Number of samples in the input array.
- * @return     Entropy      -Sum(p ln p)
- *
- */
-
-
-float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize);
-
-
-/**
- * @brief Kullback-Leibler
- *
- * @param[in]  pSrcA         Pointer to an array of input values for probability distribution A.
- * @param[in]  pSrcB         Pointer to an array of input values for probability distribution B.
- * @param[in]  blockSize     Number of samples in the input array.
- * @return Kullback-Leibler  Divergence D(A || B)
- *
- */
-float32_t arm_kullback_leibler_f32(const float32_t * pSrcA
-  ,const float32_t * pSrcB
-  ,uint32_t blockSize);
-
-
-/**
- * @brief Kullback-Leibler
- *
- * @param[in]  pSrcA         Pointer to an array of input values for probability distribution A.
- * @param[in]  pSrcB         Pointer to an array of input values for probability distribution B.
- * @param[in]  blockSize     Number of samples in the input array.
- * @return Kullback-Leibler  Divergence D(A || B)
- *
- */
-float64_t arm_kullback_leibler_f64(const float64_t * pSrcA, 
-                const float64_t * pSrcB, 
-                uint32_t blockSize);
-
-
-/**
- * @brief Weighted sum
- *
- *
- * @param[in]    *in           Array of input values.
- * @param[in]    *weigths      Weights
- * @param[in]    blockSize     Number of samples in the input array.
- * @return Weighted sum
- *
- */
-float32_t arm_weighted_sum_f32(const float32_t *in
-  , const float32_t *weigths
-  , uint32_t blockSize);
-
-
-/**
- * @brief Barycenter
- *
- *
- * @param[in]    in         List of vectors
- * @param[in]    weights    Weights of the vectors
- * @param[out]   out        Barycenter
- * @param[in]    nbVectors  Number of vectors
- * @param[in]    vecDim     Dimension of space (vector dimension)
- * @return       None
- *
- */
-void arm_barycenter_f32(const float32_t *in
-  , const float32_t *weights
-  , float32_t *out
-  , uint32_t nbVectors
-  , uint32_t vecDim);
-
-/**
- * @brief        Euclidean distance between two vectors
- * @param[in]    pA         First vector
- * @param[in]    pB         Second vector
- * @param[in]    blockSize  vector length
- * @return distance
- *
- */
-
-float32_t arm_euclidean_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
-
-/**
- * @brief        Bray-Curtis distance between two vectors
- * @param[in]    pA         First vector
- * @param[in]    pB         Second vector
- * @param[in]    blockSize  vector length
- * @return distance
- *
- */
-float32_t arm_braycurtis_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
-
-/**
- * @brief        Canberra distance between two vectors
- *
- * This function may divide by zero when samples pA[i] and pB[i] are both zero.
- * The result of the computation will be correct. So the division per zero may be
- * ignored.
- *
- * @param[in]    pA         First vector
- * @param[in]    pB         Second vector
- * @param[in]    blockSize  vector length
- * @return distance
- *
- */
-float32_t arm_canberra_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
-
-
-/**
- * @brief        Chebyshev distance between two vectors
- * @param[in]    pA         First vector
- * @param[in]    pB         Second vector
- * @param[in]    blockSize  vector length
- * @return distance
- *
- */
-float32_t arm_chebyshev_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
-
-
-/**
- * @brief        Cityblock (Manhattan) distance between two vectors
- * @param[in]    pA         First vector
- * @param[in]    pB         Second vector
- * @param[in]    blockSize  vector length
- * @return distance
- *
- */
-float32_t arm_cityblock_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
-
-/**
- * @brief        Correlation distance between two vectors
- *
- * The input vectors are modified in place !
- *
- * @param[in]    pA         First vector
- * @param[in]    pB         Second vector
- * @param[in]    blockSize  vector length
- * @return distance
- *
- */
-float32_t arm_correlation_distance_f32(float32_t *pA,float32_t *pB, uint32_t blockSize);
-
-/**
- * @brief        Cosine distance between two vectors
- *
- * @param[in]    pA         First vector
- * @param[in]    pB         Second vector
- * @param[in]    blockSize  vector length
- * @return distance
- *
- */
-
-float32_t arm_cosine_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
-
-/**
- * @brief        Jensen-Shannon distance between two vectors
- *
- * This function is assuming that elements of second vector are > 0
- * and 0 only when the corresponding element of first vector is 0.
- * Otherwise the result of the computation does not make sense
- * and for speed reasons, the cases returning NaN or Infinity are not
- * managed.
- *
- * When the function is computing x log (x / y) with x 0 and y 0,
- * it will compute the right value (0) but a division per zero will occur
- * and shoudl be ignored in client code.
- *
- * @param[in]    pA         First vector
- * @param[in]    pB         Second vector
- * @param[in]    blockSize  vector length
- * @return distance
- *
- */
-
-float32_t arm_jensenshannon_distance_f32(const float32_t *pA,const float32_t *pB,uint32_t blockSize);
-
-/**
- * @brief        Minkowski distance between two vectors
- *
- * @param[in]    pA         First vector
- * @param[in]    pB         Second vector
- * @param[in]    n          Norm order (>= 2)
- * @param[in]    blockSize  vector length
- * @return distance
- *
- */
-
-
-
-float32_t arm_minkowski_distance_f32(const float32_t *pA,const float32_t *pB, int32_t order, uint32_t blockSize);
-
-/**
- * @brief        Dice distance between two vectors
- *
- * @param[in]    pA              First vector of packed booleans
- * @param[in]    pB              Second vector of packed booleans
- * @param[in]    order           Distance order
- * @param[in]    blockSize       Number of samples
- * @return distance
- *
- */
-
-
-float32_t arm_dice_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
-
-/**
- * @brief        Hamming distance between two vectors
- *
- * @param[in]    pA              First vector of packed booleans
- * @param[in]    pB              Second vector of packed booleans
- * @param[in]    numberOfBools   Number of booleans
- * @return distance
- *
- */
-
-float32_t arm_hamming_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
-
-/**
- * @brief        Jaccard distance between two vectors
- *
- * @param[in]    pA              First vector of packed booleans
- * @param[in]    pB              Second vector of packed booleans
- * @param[in]    numberOfBools   Number of booleans
- * @return distance
- *
- */
-
-float32_t arm_jaccard_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
-
-/**
- * @brief        Kulsinski distance between two vectors
- *
- * @param[in]    pA              First vector of packed booleans
- * @param[in]    pB              Second vector of packed booleans
- * @param[in]    numberOfBools   Number of booleans
- * @return distance
- *
- */
-
-float32_t arm_kulsinski_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
-
-/**
- * @brief        Roger Stanimoto distance between two vectors
- *
- * @param[in]    pA              First vector of packed booleans
- * @param[in]    pB              Second vector of packed booleans
- * @param[in]    numberOfBools   Number of booleans
- * @return distance
- *
- */
-
-float32_t arm_rogerstanimoto_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
-
-/**
- * @brief        Russell-Rao distance between two vectors
- *
- * @param[in]    pA              First vector of packed booleans
- * @param[in]    pB              Second vector of packed booleans
- * @param[in]    numberOfBools   Number of booleans
- * @return distance
- *
- */
-
-float32_t arm_russellrao_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
-
-/**
- * @brief        Sokal-Michener distance between two vectors
- *
- * @param[in]    pA              First vector of packed booleans
- * @param[in]    pB              Second vector of packed booleans
- * @param[in]    numberOfBools   Number of booleans
- * @return distance
- *
- */
-
-float32_t arm_sokalmichener_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
-
-/**
- * @brief        Sokal-Sneath distance between two vectors
- *
- * @param[in]    pA              First vector of packed booleans
- * @param[in]    pB              Second vector of packed booleans
- * @param[in]    numberOfBools   Number of booleans
- * @return distance
- *
- */
-
-float32_t arm_sokalsneath_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
-
-/**
- * @brief        Yule distance between two vectors
- *
- * @param[in]    pA              First vector of packed booleans
- * @param[in]    pB              Second vector of packed booleans
- * @param[in]    numberOfBools   Number of booleans
- * @return distance
- *
- */
-
-float32_t arm_yule_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
-
-
-  /**
-   * @ingroup groupInterpolation
-   */
-
-  /**
-   * @defgroup BilinearInterpolate Bilinear Interpolation
-   *
-   * Bilinear interpolation is an extension of linear interpolation applied to a two dimensional grid.
-   * The underlying function <code>f(x, y)</code> is sampled on a regular grid and the interpolation process
-   * determines values between the grid points.
-   * Bilinear interpolation is equivalent to two step linear interpolation, first in the x-dimension and then in the y-dimension.
-   * Bilinear interpolation is often used in image processing to rescale images.
-   * The CMSIS DSP library provides bilinear interpolation functions for Q7, Q15, Q31, and floating-point data types.
-   *
-   * <b>Algorithm</b>
-   * \par
-   * The instance structure used by the bilinear interpolation functions describes a two dimensional data table.
-   * For floating-point, the instance structure is defined as:
-   * <pre>
-   *   typedef struct
-   *   {
-   *     uint16_t numRows;
-   *     uint16_t numCols;
-   *     float32_t *pData;
-   * } arm_bilinear_interp_instance_f32;
-   * </pre>
-   *
-   * \par
-   * where <code>numRows</code> specifies the number of rows in the table;
-   * <code>numCols</code> specifies the number of columns in the table;
-   * and <code>pData</code> points to an array of size <code>numRows*numCols</code> values.
-   * The data table <code>pTable</code> is organized in row order and the supplied data values fall on integer indexes.
-   * That is, table element (x,y) is located at <code>pTable[x + y*numCols]</code> where x and y are integers.
-   *
-   * \par
-   * Let <code>(x, y)</code> specify the desired interpolation point.  Then define:
-   * <pre>
-   *     XF = floor(x)
-   *     YF = floor(y)
-   * </pre>
-   * \par
-   * The interpolated output point is computed as:
-   * <pre>
-   *  f(x, y) = f(XF, YF) * (1-(x-XF)) * (1-(y-YF))
-   *           + f(XF+1, YF) * (x-XF)*(1-(y-YF))
-   *           + f(XF, YF+1) * (1-(x-XF))*(y-YF)
-   *           + f(XF+1, YF+1) * (x-XF)*(y-YF)
-   * </pre>
-   * Note that the coordinates (x, y) contain integer and fractional components.
-   * The integer components specify which portion of the table to use while the
-   * fractional components control the interpolation processor.
-   *
-   * \par
-   * if (x,y) are outside of the table boundary, Bilinear interpolation returns zero output.
-   */
-
-
-  /**
-   * @addtogroup BilinearInterpolate
-   * @{
-   */
-
-  /**
-  * @brief  Floating-point bilinear interpolation.
-  * @param[in,out] S  points to an instance of the interpolation structure.
-  * @param[in]     X  interpolation coordinate.
-  * @param[in]     Y  interpolation coordinate.
-  * @return out interpolated value.
-  */
-  __STATIC_FORCEINLINE float32_t arm_bilinear_interp_f32(
-  const arm_bilinear_interp_instance_f32 * S,
-  float32_t X,
-  float32_t Y)
-  {
-    float32_t out;
-    float32_t f00, f01, f10, f11;
-    float32_t *pData = S->pData;
-    int32_t xIndex, yIndex, index;
-    float32_t xdiff, ydiff;
-    float32_t b1, b2, b3, b4;
-
-    xIndex = (int32_t) X;
-    yIndex = (int32_t) Y;
-
-    /* Care taken for table outside boundary */
-    /* Returns zero output when values are outside table boundary */
-    if (xIndex < 0 || xIndex > (S->numCols - 2) || yIndex < 0 || yIndex > (S->numRows - 2))
-    {
-      return (0);
-    }
-
-    /* Calculation of index for two nearest points in X-direction */
-    index = (xIndex ) + (yIndex ) * S->numCols;
-
-
-    /* Read two nearest points in X-direction */
-    f00 = pData[index];
-    f01 = pData[index + 1];
-
-    /* Calculation of index for two nearest points in Y-direction */
-    index = (xIndex ) + (yIndex+1) * S->numCols;
-
-
-    /* Read two nearest points in Y-direction */
-    f10 = pData[index];
-    f11 = pData[index + 1];
-
-    /* Calculation of intermediate values */
-    b1 = f00;
-    b2 = f01 - f00;
-    b3 = f10 - f00;
-    b4 = f00 - f01 - f10 + f11;
-
-    /* Calculation of fractional part in X */
-    xdiff = X - xIndex;
-
-    /* Calculation of fractional part in Y */
-    ydiff = Y - yIndex;
-
-    /* Calculation of bi-linear interpolated output */
-    out = b1 + b2 * xdiff + b3 * ydiff + b4 * xdiff * ydiff;
-
-    /* return to application */
-    return (out);
-  }
-
-
-  /**
-  * @brief  Q31 bilinear interpolation.
-  * @param[in,out] S  points to an instance of the interpolation structure.
-  * @param[in]     X  interpolation coordinate in 12.20 format.
-  * @param[in]     Y  interpolation coordinate in 12.20 format.
-  * @return out interpolated value.
-  */
-  __STATIC_FORCEINLINE q31_t arm_bilinear_interp_q31(
-  arm_bilinear_interp_instance_q31 * S,
-  q31_t X,
-  q31_t Y)
-  {
-    q31_t out;                                   /* Temporary output */
-    q31_t acc = 0;                               /* output */
-    q31_t xfract, yfract;                        /* X, Y fractional parts */
-    q31_t x1, x2, y1, y2;                        /* Nearest output values */
-    int32_t rI, cI;                              /* Row and column indices */
-    q31_t *pYData = S->pData;                    /* pointer to output table values */
-    uint32_t nCols = S->numCols;                 /* num of rows */
-
-    /* Input is in 12.20 format */
-    /* 12 bits for the table index */
-    /* Index value calculation */
-    rI = ((X & (q31_t)0xFFF00000) >> 20);
-
-    /* Input is in 12.20 format */
-    /* 12 bits for the table index */
-    /* Index value calculation */
-    cI = ((Y & (q31_t)0xFFF00000) >> 20);
-
-    /* Care taken for table outside boundary */
-    /* Returns zero output when values are outside table boundary */
-    if (rI < 0 || rI > (S->numCols - 2) || cI < 0 || cI > (S->numRows - 2))
-    {
-      return (0);
-    }
-
-    /* 20 bits for the fractional part */
-    /* shift left xfract by 11 to keep 1.31 format */
-    xfract = (X & 0x000FFFFF) << 11U;
-
-    /* Read two nearest output values from the index */
-    x1 = pYData[(rI) + (int32_t)nCols * (cI)    ];
-    x2 = pYData[(rI) + (int32_t)nCols * (cI) + 1];
-
-    /* 20 bits for the fractional part */
-    /* shift left yfract by 11 to keep 1.31 format */
-    yfract = (Y & 0x000FFFFF) << 11U;
-
-    /* Read two nearest output values from the index */
-    y1 = pYData[(rI) + (int32_t)nCols * (cI + 1)    ];
-    y2 = pYData[(rI) + (int32_t)nCols * (cI + 1) + 1];
-
-    /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 3.29(q29) format */
-    out = ((q31_t) (((q63_t) x1  * (0x7FFFFFFF - xfract)) >> 32));
-    acc = ((q31_t) (((q63_t) out * (0x7FFFFFFF - yfract)) >> 32));
-
-    /* x2 * (xfract) * (1-yfract)  in 3.29(q29) and adding to acc */
-    out = ((q31_t) ((q63_t) x2 * (0x7FFFFFFF - yfract) >> 32));
-    acc += ((q31_t) ((q63_t) out * (xfract) >> 32));
-
-    /* y1 * (1 - xfract) * (yfract)  in 3.29(q29) and adding to acc */
-    out = ((q31_t) ((q63_t) y1 * (0x7FFFFFFF - xfract) >> 32));
-    acc += ((q31_t) ((q63_t) out * (yfract) >> 32));
-
-    /* y2 * (xfract) * (yfract)  in 3.29(q29) and adding to acc */
-    out = ((q31_t) ((q63_t) y2 * (xfract) >> 32));
-    acc += ((q31_t) ((q63_t) out * (yfract) >> 32));
-
-    /* Convert acc to 1.31(q31) format */
-    return ((q31_t)(acc << 2));
-  }
-
-
-  /**
-  * @brief  Q15 bilinear interpolation.
-  * @param[in,out] S  points to an instance of the interpolation structure.
-  * @param[in]     X  interpolation coordinate in 12.20 format.
-  * @param[in]     Y  interpolation coordinate in 12.20 format.
-  * @return out interpolated value.
-  */
-  __STATIC_FORCEINLINE q15_t arm_bilinear_interp_q15(
-  arm_bilinear_interp_instance_q15 * S,
-  q31_t X,
-  q31_t Y)
-  {
-    q63_t acc = 0;                               /* output */
-    q31_t out;                                   /* Temporary output */
-    q15_t x1, x2, y1, y2;                        /* Nearest output values */
-    q31_t xfract, yfract;                        /* X, Y fractional parts */
-    int32_t rI, cI;                              /* Row and column indices */
-    q15_t *pYData = S->pData;                    /* pointer to output table values */
-    uint32_t nCols = S->numCols;                 /* num of rows */
-
-    /* Input is in 12.20 format */
-    /* 12 bits for the table index */
-    /* Index value calculation */
-    rI = ((X & (q31_t)0xFFF00000) >> 20);
-
-    /* Input is in 12.20 format */
-    /* 12 bits for the table index */
-    /* Index value calculation */
-    cI = ((Y & (q31_t)0xFFF00000) >> 20);
-
-    /* Care taken for table outside boundary */
-    /* Returns zero output when values are outside table boundary */
-    if (rI < 0 || rI > (S->numCols - 2) || cI < 0 || cI > (S->numRows - 2))
-    {
-      return (0);
-    }
-
-    /* 20 bits for the fractional part */
-    /* xfract should be in 12.20 format */
-    xfract = (X & 0x000FFFFF);
-
-    /* Read two nearest output values from the index */
-    x1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI)    ];
-    x2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) + 1];
-
-    /* 20 bits for the fractional part */
-    /* yfract should be in 12.20 format */
-    yfract = (Y & 0x000FFFFF);
-
-    /* Read two nearest output values from the index */
-    y1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1)    ];
-    y2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) + 1];
-
-    /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 13.51 format */
-
-    /* x1 is in 1.15(q15), xfract in 12.20 format and out is in 13.35 format */
-    /* convert 13.35 to 13.31 by right shifting  and out is in 1.31 */
-    out = (q31_t) (((q63_t) x1 * (0x0FFFFF - xfract)) >> 4U);
-    acc = ((q63_t) out * (0x0FFFFF - yfract));
-
-    /* x2 * (xfract) * (1-yfract)  in 1.51 and adding to acc */
-    out = (q31_t) (((q63_t) x2 * (0x0FFFFF - yfract)) >> 4U);
-    acc += ((q63_t) out * (xfract));
-
-    /* y1 * (1 - xfract) * (yfract)  in 1.51 and adding to acc */
-    out = (q31_t) (((q63_t) y1 * (0x0FFFFF - xfract)) >> 4U);
-    acc += ((q63_t) out * (yfract));
-
-    /* y2 * (xfract) * (yfract)  in 1.51 and adding to acc */
-    out = (q31_t) (((q63_t) y2 * (xfract)) >> 4U);
-    acc += ((q63_t) out * (yfract));
-
-    /* acc is in 13.51 format and down shift acc by 36 times */
-    /* Convert out to 1.15 format */
-    return ((q15_t)(acc >> 36));
-  }
-
-
-  /**
-  * @brief  Q7 bilinear interpolation.
-  * @param[in,out] S  points to an instance of the interpolation structure.
-  * @param[in]     X  interpolation coordinate in 12.20 format.
-  * @param[in]     Y  interpolation coordinate in 12.20 format.
-  * @return out interpolated value.
-  */
-  __STATIC_FORCEINLINE q7_t arm_bilinear_interp_q7(
-  arm_bilinear_interp_instance_q7 * S,
-  q31_t X,
-  q31_t Y)
-  {
-    q63_t acc = 0;                               /* output */
-    q31_t out;                                   /* Temporary output */
-    q31_t xfract, yfract;                        /* X, Y fractional parts */
-    q7_t x1, x2, y1, y2;                         /* Nearest output values */
-    int32_t rI, cI;                              /* Row and column indices */
-    q7_t *pYData = S->pData;                     /* pointer to output table values */
-    uint32_t nCols = S->numCols;                 /* num of rows */
-
-    /* Input is in 12.20 format */
-    /* 12 bits for the table index */
-    /* Index value calculation */
-    rI = ((X & (q31_t)0xFFF00000) >> 20);
-
-    /* Input is in 12.20 format */
-    /* 12 bits for the table index */
-    /* Index value calculation */
-    cI = ((Y & (q31_t)0xFFF00000) >> 20);
-
-    /* Care taken for table outside boundary */
-    /* Returns zero output when values are outside table boundary */
-    if (rI < 0 || rI > (S->numCols - 2) || cI < 0 || cI > (S->numRows - 2))
-    {
-      return (0);
-    }
-
-    /* 20 bits for the fractional part */
-    /* xfract should be in 12.20 format */
-    xfract = (X & (q31_t)0x000FFFFF);
-
-    /* Read two nearest output values from the index */
-    x1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI)    ];
-    x2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) + 1];
-
-    /* 20 bits for the fractional part */
-    /* yfract should be in 12.20 format */
-    yfract = (Y & (q31_t)0x000FFFFF);
-
-    /* Read two nearest output values from the index */
-    y1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1)    ];
-    y2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) + 1];
-
-    /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 16.47 format */
-    out = ((x1 * (0xFFFFF - xfract)));
-    acc = (((q63_t) out * (0xFFFFF - yfract)));
-
-    /* x2 * (xfract) * (1-yfract)  in 2.22 and adding to acc */
-    out = ((x2 * (0xFFFFF - yfract)));
-    acc += (((q63_t) out * (xfract)));
-
-    /* y1 * (1 - xfract) * (yfract)  in 2.22 and adding to acc */
-    out = ((y1 * (0xFFFFF - xfract)));
-    acc += (((q63_t) out * (yfract)));
-
-    /* y2 * (xfract) * (yfract)  in 2.22 and adding to acc */
-    out = ((y2 * (yfract)));
-    acc += (((q63_t) out * (xfract)));
-
-    /* acc in 16.47 format and down shift by 40 to convert to 1.7 format */
-    return ((q7_t)(acc >> 40));
-  }
-
-  /**
-   * @} end of BilinearInterpolate group
-   */
-
-
-/* SMMLAR */
-#define multAcc_32x32_keep32_R(a, x, y) \
-    a = (q31_t) (((((q63_t) a) << 32) + ((q63_t) x * y) + 0x80000000LL ) >> 32)
-
-/* SMMLSR */
-#define multSub_32x32_keep32_R(a, x, y) \
-    a = (q31_t) (((((q63_t) a) << 32) - ((q63_t) x * y) + 0x80000000LL ) >> 32)
-
-/* SMMULR */
-#define mult_32x32_keep32_R(a, x, y) \
-    a = (q31_t) (((q63_t) x * y + 0x80000000LL ) >> 32)
-
-/* SMMLA */
-#define multAcc_32x32_keep32(a, x, y) \
-    a += (q31_t) (((q63_t) x * y) >> 32)
-
-/* SMMLS */
-#define multSub_32x32_keep32(a, x, y) \
-    a -= (q31_t) (((q63_t) x * y) >> 32)
-
-/* SMMUL */
-#define mult_32x32_keep32(a, x, y) \
-    a = (q31_t) (((q63_t) x * y ) >> 32)
-
-
-#if   defined ( __CC_ARM )
-  /* Enter low optimization region - place directly above function definition */
-  #if defined( __ARM_ARCH_7EM__ )
-    #define LOW_OPTIMIZATION_ENTER \
-       _Pragma ("push")         \
-       _Pragma ("O1")
-  #else
-    #define LOW_OPTIMIZATION_ENTER
-  #endif
-
-  /* Exit low optimization region - place directly after end of function definition */
-  #if defined ( __ARM_ARCH_7EM__ )
-    #define LOW_OPTIMIZATION_EXIT \
-       _Pragma ("pop")
-  #else
-    #define LOW_OPTIMIZATION_EXIT
-  #endif
-
-  /* Enter low optimization region - place directly above function definition */
-  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
-
-  /* Exit low optimization region - place directly after end of function definition */
-  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
-
-#elif defined (__ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
-  #define LOW_OPTIMIZATION_ENTER
-  #define LOW_OPTIMIZATION_EXIT
-  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
-  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
-
-#elif defined ( __GNUC__ )
-  #define LOW_OPTIMIZATION_ENTER \
-       __attribute__(( optimize("-O1") ))
-  #define LOW_OPTIMIZATION_EXIT
-  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
-  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
-
-#elif defined ( __ICCARM__ )
-  /* Enter low optimization region - place directly above function definition */
-  #if defined ( __ARM_ARCH_7EM__ )
-    #define LOW_OPTIMIZATION_ENTER \
-       _Pragma ("optimize=low")
-  #else
-    #define LOW_OPTIMIZATION_ENTER
-  #endif
-
-  /* Exit low optimization region - place directly after end of function definition */
-  #define LOW_OPTIMIZATION_EXIT
-
-  /* Enter low optimization region - place directly above function definition */
-  #if defined ( __ARM_ARCH_7EM__ )
-    #define IAR_ONLY_LOW_OPTIMIZATION_ENTER \
-       _Pragma ("optimize=low")
-  #else
-    #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
-  #endif
-
-  /* Exit low optimization region - place directly after end of function definition */
-  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
-
-#elif defined ( __TI_ARM__ )
-  #define LOW_OPTIMIZATION_ENTER
-  #define LOW_OPTIMIZATION_EXIT
-  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
-  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
-
-#elif defined ( __CSMC__ )
-  #define LOW_OPTIMIZATION_ENTER
-  #define LOW_OPTIMIZATION_EXIT
-  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
-  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
-
-#elif defined ( __TASKING__ )
-  #define LOW_OPTIMIZATION_ENTER
-  #define LOW_OPTIMIZATION_EXIT
-  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
-  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
-       
-#elif defined ( _MSC_VER ) || defined(__GNUC_PYTHON__)
-      #define LOW_OPTIMIZATION_ENTER
-      #define LOW_OPTIMIZATION_EXIT
-      #define IAR_ONLY_LOW_OPTIMIZATION_ENTER 
-      #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
-#endif
-
-
-
-/* Compiler specific diagnostic adjustment */
-#if   defined ( __CC_ARM )
-
-#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
-
-#elif defined ( __GNUC__ )
-#pragma GCC diagnostic pop
-
-#elif defined ( __ICCARM__ )
-
-#elif defined ( __TI_ARM__ )
-
-#elif defined ( __CSMC__ )
-
-#elif defined ( __TASKING__ )
-
-#elif defined ( _MSC_VER )
-
-#else
-  #error Unknown compiler
-#endif
 
 #ifdef   __cplusplus
 }
diff --git a/CMSIS/DSP/Include/arm_math_f16.h b/CMSIS/DSP/Include/arm_math_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..c046a127a8a4b01798ef2233f3c57df53fb1bf38
--- /dev/null
+++ b/CMSIS/DSP/Include/arm_math_f16.h
@@ -0,0 +1,59 @@
+/******************************************************************************
+ * @file     arm_math_f16.h
+ * @brief    Public header file for f16 functions of the CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_MATH_F16_H
+#define _ARM_MATH_F16_H
+
+#include "arm_math.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#include "arm_math_types_f16.h"
+#include "dsp/none.h"
+#include "dsp/utils.h"
+#include "dsp/basic_math_functions_f16.h"
+#include "dsp/interpolation_functions_f16.h"
+#include "dsp/bayes_functions_f16.h"
+#include "dsp/matrix_functions_f16.h"
+#include "dsp/complex_math_functions_f16.h"
+#include "dsp/statistics_functions_f16.h"
+#include "dsp/controller_functions_f16.h"
+#include "dsp/support_functions_f16.h"
+#include "dsp/distance_functions_f16.h"
+#include "dsp/svm_functions_f16.h"
+#include "dsp/fast_math_functions_f16.h"
+#include "dsp/transform_functions_f16.h"
+#include "dsp/filtering_functions_f16.h"
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* _ARM_MATH_F16_H */
+
+
diff --git a/CMSIS/DSP/Include/arm_math_memory.h b/CMSIS/DSP/Include/arm_math_memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..771bb7cd9383b5f28827f496ed93ddc7d9aa35ca
--- /dev/null
+++ b/CMSIS/DSP/Include/arm_math_memory.h
@@ -0,0 +1,241 @@
+/******************************************************************************
+ * @file     arm_math_memory.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_MATH_MEMORY_H_
+
+#define _ARM_MATH_MEMORY_H_
+
+#include "arm_math_types.h"
+
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+/**
+  @brief Compiler-specific 32-bit type used by the __SIMD32 macros to read/write two 16 bit values at once.
+  @deprecated
+ */
+#if   defined ( __CC_ARM )
+  #define __SIMD32_TYPE int32_t __packed
+#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+  #define __SIMD32_TYPE int32_t
+#elif defined ( __GNUC__ )
+  #define __SIMD32_TYPE int32_t
+#elif defined ( __ICCARM__ )
+  #define __SIMD32_TYPE int32_t __packed
+#elif defined ( __TI_ARM__ )
+  #define __SIMD32_TYPE int32_t
+#elif defined ( __CSMC__ )
+  #define __SIMD32_TYPE int32_t
+#elif defined ( __TASKING__ )
+  #define __SIMD32_TYPE __un(aligned) int32_t
+#elif defined(_MSC_VER )
+  #define __SIMD32_TYPE int32_t
+#else
+  #error Unknown compiler
+#endif
+/* Pointer-reinterpretation helpers built on __SIMD32_TYPE (and on int64_t for __SIMD64): */
+#define __SIMD32(addr)        (*(__SIMD32_TYPE **) & (addr))   /* view the pointer variable addr as a __SIMD32_TYPE * lvalue */
+#define __SIMD32_CONST(addr)  ( (__SIMD32_TYPE * )   (addr))   /* cast the value of addr to __SIMD32_TYPE * */
+#define _SIMD32_OFFSET(addr)  (*(__SIMD32_TYPE * )   (addr))   /* access the __SIMD32_TYPE object located at addr */
+#define __SIMD64(addr)        (*(      int64_t **) & (addr))   /* view the pointer variable addr as an int64_t * lvalue */
+
+
+/* SIMD replacement */
+
+
+/**
+  @brief         Read 2 Q15 from Q15 pointer and pack them into one 32-bit word.
+  @param[in]     pQ15      points to the first of the two Q15 values to read
+  @return        Q31 value holding both Q15 values (portable path: pQ15[0] in bits 15..0, pQ15[1] in bits 31..16)
+ */
+__STATIC_FORCEINLINE q31_t read_q15x2 (
+  q15_t * pQ15)
+{
+  q31_t val;
+
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (&val, pQ15, 4);
+#else
+  /* Combine as unsigned: left-shifting a negative q15_t is undefined behaviour in C. */
+  val = (q31_t) (((uint32_t) (uint16_t) pQ15[1] << 16) | (uint32_t) (uint16_t) pQ15[0]);
+#endif
+  return (val);
+}
+
+/**
+  @brief         Read 2 Q15 from Q15 pointer and increment pointer afterwards.
+  @param[in,out] pQ15      address of the read pointer; the pointer is advanced by 2 elements
+  @return        Q31 value holding both Q15 values (portable path: first value in bits 15..0, second in bits 31..16)
+ */
+__STATIC_FORCEINLINE q31_t read_q15x2_ia (
+  q15_t ** pQ15)
+{
+  q31_t val;
+
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (&val, *pQ15, 4);
+#else
+  /* Combine as unsigned: left-shifting a negative q15_t is undefined behaviour in C. */
+  val = (q31_t) (((uint32_t) (uint16_t) (*pQ15)[1] << 16) | (uint32_t) (uint16_t) (*pQ15)[0]);
+#endif
+  *pQ15 += 2;
+  return (val);
+}
+
+/**
+  @brief         Read 2 Q15 from Q15 pointer and decrement pointer afterwards.
+  @param[in,out] pQ15      address of the read pointer; the pointer is moved back by 2 elements
+  @return        Q31 value holding both Q15 values (portable path: first value in bits 15..0, second in bits 31..16)
+ */
+__STATIC_FORCEINLINE q31_t read_q15x2_da (
+  q15_t ** pQ15)
+{
+  q31_t val;
+
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (&val, *pQ15, 4);
+#else
+  /* Combine as unsigned: left-shifting a negative q15_t is undefined behaviour in C. */
+  val = (q31_t) (((uint32_t) (uint16_t) (*pQ15)[1] << 16) | (uint32_t) (uint16_t) (*pQ15)[0]);
+#endif
+  *pQ15 -= 2;
+  return (val);
+}
+
+/**
+  @brief         Write 2 Q15 to Q15 pointer and increment pointer afterwards.
+  @param[in,out] pQ15      address of the write pointer; the pointer is advanced by 2 elements
+  @param[in]     value     Q31 value carrying the two Q15 values to store
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q15x2_ia (
+  q15_t ** pQ15,
+  q31_t    value)
+{
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (*pQ15, &value, sizeof (q31_t));
+#else
+  /* low half-word goes to the first element, high half-word to the second */
+  (*pQ15)[0] = (value & 0x0FFFF);
+  (*pQ15)[1] = (value >> 16) & 0x0FFFF;
+#endif
+
+  *pQ15 += 2;
+}
+
+/**
+  @brief         Write 2 Q15 to Q15 pointer.
+  @param[out]    pQ15      points to the location receiving the two Q15 values
+  @param[in]     value     Q31 value carrying the two Q15 values to store
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q15x2 (
+  q15_t * pQ15,
+  q31_t   value)
+{
+  /* value is passed by copy, so its address can be handed to memcpy directly */
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (pQ15, &value, sizeof (q31_t));
+#else
+  /* low half-word goes to the first element, high half-word to the second */
+  pQ15[0] = value & 0x0FFFF;
+  pQ15[1] = value >> 16;
+#endif
+}
+
+
+/**
+  @brief         Read 4 Q7 from Q7 pointer and increment pointer afterwards.
+  @param[in,out] pQ7       address of the read pointer; the pointer is advanced by 4 elements
+  @return        Q31 value holding the four Q7 values (portable path: first value in bits 7..0, fourth in bits 31..24)
+ */
+__STATIC_FORCEINLINE q31_t read_q7x4_ia (
+  q7_t ** pQ7)
+{
+  q31_t val;
+
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (&val, *pQ7, 4);
+#else
+  /* Combine as unsigned: (x & 0xFF) << 24 overflows a signed int when bit 7 of x is set. */
+  val = (q31_t) (((uint32_t) (uint8_t) (*pQ7)[3] << 24) | ((uint32_t) (uint8_t) (*pQ7)[2] << 16) |
+                 ((uint32_t) (uint8_t) (*pQ7)[1] <<  8) |  (uint32_t) (uint8_t) (*pQ7)[0]);
+#endif
+  *pQ7 += 4;
+
+  return (val);
+}
+
+/**
+  @brief         Read 4 Q7 from Q7 pointer and decrement pointer afterwards.
+  @param[in,out] pQ7       address of the read pointer; the pointer is moved back by 4 elements
+  @return        Q31 value holding the four Q7 values (portable path: first value in bits 7..0, fourth in bits 31..24)
+ */
+__STATIC_FORCEINLINE q31_t read_q7x4_da (
+  q7_t ** pQ7)
+{
+  q31_t val;
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (&val, *pQ7, 4);
+#else /* combine as unsigned: (x & 0xFF) << 24 overflows a signed int when bit 7 of x is set */
+  val = (q31_t) (((uint32_t) (uint8_t) (*pQ7)[3] << 24) | ((uint32_t) (uint8_t) (*pQ7)[2] << 16) |
+                 ((uint32_t) (uint8_t) (*pQ7)[1] <<  8) |  (uint32_t) (uint8_t) (*pQ7)[0]);
+#endif
+  *pQ7 -= 4;
+  return (val);
+}
+
+/**
+  @brief         Write 4 Q7 to Q7 pointer and increment pointer afterwards.
+  @param[in,out] pQ7       address of the write pointer; the pointer is advanced by 4 elements
+  @param[in]     value     Q31 value carrying the four Q7 values to store
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q7x4_ia (
+  q7_t ** pQ7,
+  q31_t   value)
+{
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (*pQ7, &value, sizeof (q31_t));
+#else
+  /* store the four bytes of value, least significant byte first */
+  (*pQ7)[0] = value & 0x0FF;
+  (*pQ7)[1] = (value >> 8) & 0x0FF;
+  (*pQ7)[2] = (value >> 16) & 0x0FF;
+  (*pQ7)[3] = (value >> 24) & 0x0FF;
+#endif
+
+  *pQ7 += 4;
+}
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /*ifndef _ARM_MATH_MEMORY_H_ */
diff --git a/CMSIS/DSP/Include/arm_math_types.h b/CMSIS/DSP/Include/arm_math_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9f6ed247a82166f6bc3c947607594a3bc8bc924
--- /dev/null
+++ b/CMSIS/DSP/Include/arm_math_types.h
@@ -0,0 +1,592 @@
+/******************************************************************************
+ * @file     arm_math_types.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_MATH_TYPES_H_
+
+#define _ARM_MATH_TYPES_H_
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+/* Compiler specific diagnostic adjustment */
+#if   defined ( __CC_ARM )
+
+#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+
+#elif defined ( __GNUC__ )
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wsign-conversion"
+  #pragma GCC diagnostic ignored "-Wconversion"
+  #pragma GCC diagnostic ignored "-Wunused-parameter"
+
+#elif defined ( __ICCARM__ )
+
+#elif defined ( __TI_ARM__ )
+
+#elif defined ( __CSMC__ )
+
+#elif defined ( __TASKING__ )
+
+#elif defined ( _MSC_VER )
+
+#else
+  #error Unknown compiler
+#endif
+
+
+/* Included for intrinsics definitions */
+#if defined (_MSC_VER ) 
+#include <stdint.h>
+#define __STATIC_FORCEINLINE static __forceinline
+#define __STATIC_INLINE static __inline
+#define __ALIGNED(x) __declspec(align(x))
+
+#elif defined (__GNUC_PYTHON__)
+#include <stdint.h>
+#define  __ALIGNED(x) __attribute__((aligned(x)))
+#define __STATIC_FORCEINLINE static inline __attribute__((always_inline)) 
+#define __STATIC_INLINE static inline
+
+#else
+#include "cmsis_compiler.h"
+#endif
+
+
+
+#include <string.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+
+/* evaluate ARM DSP feature */
+#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
+  #define ARM_MATH_DSP                   1
+#endif
+
+#if defined(ARM_MATH_NEON)
+#include <arm_neon.h>
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+  #if !defined(ARM_MATH_NEON_FLOAT16)
+  #define ARM_MATH_NEON_FLOAT16
+  #endif
+#endif
+#endif
+
+#if !defined(ARM_MATH_AUTOVECTORIZE)
+
+#if __ARM_FEATURE_MVE
+  #if !defined(ARM_MATH_MVEI)
+    #define ARM_MATH_MVEI
+  #endif
+#endif
+
+#if (__ARM_FEATURE_MVE & 2)
+  #if !defined(ARM_MATH_MVEF)
+    #define ARM_MATH_MVEF
+  #endif
+  #if !defined(ARM_MATH_MVE_FLOAT16)
+       #define ARM_MATH_MVE_FLOAT16
+  #endif
+#endif
+
+#endif /*!defined(ARM_MATH_AUTOVECTORIZE)*/
+
+
+#if defined (ARM_MATH_HELIUM)
+  #if !defined(ARM_MATH_MVEF)
+    #define ARM_MATH_MVEF
+  #endif
+
+  #if !defined(ARM_MATH_MVEI)
+    #define ARM_MATH_MVEI
+  #endif
+
+  #if !defined(ARM_MATH_MVE_FLOAT16)
+       #define ARM_MATH_MVE_FLOAT16
+  #endif
+#endif
+
+
+
+#if   defined ( __CC_ARM )
+  /* Enter low optimization region - place directly above function definition */
+  #if defined( __ARM_ARCH_7EM__ )
+    #define LOW_OPTIMIZATION_ENTER \
+       _Pragma ("push")         \
+       _Pragma ("O1")
+  #else
+    #define LOW_OPTIMIZATION_ENTER
+  #endif
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #if defined ( __ARM_ARCH_7EM__ )
+    #define LOW_OPTIMIZATION_EXIT \
+       _Pragma ("pop")
+  #else
+    #define LOW_OPTIMIZATION_EXIT
+  #endif
+
+  /* Enter low optimization region - place directly above function definition */
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined (__ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __GNUC__ )
+  #define LOW_OPTIMIZATION_ENTER \
+       __attribute__(( optimize("-O1") ))
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __ICCARM__ )
+  /* Enter low optimization region - place directly above function definition */
+  #if defined ( __ARM_ARCH_7EM__ )
+    #define LOW_OPTIMIZATION_ENTER \
+       _Pragma ("optimize=low")
+  #else
+    #define LOW_OPTIMIZATION_ENTER
+  #endif
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #define LOW_OPTIMIZATION_EXIT
+
+  /* Enter low optimization region - place directly above function definition */
+  #if defined ( __ARM_ARCH_7EM__ )
+    #define IAR_ONLY_LOW_OPTIMIZATION_ENTER \
+       _Pragma ("optimize=low")
+  #else
+    #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #endif
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __TI_ARM__ )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __CSMC__ )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __TASKING__ )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+       
+#elif defined ( _MSC_VER ) || defined(__GNUC_PYTHON__)
+      #define LOW_OPTIMIZATION_ENTER
+      #define LOW_OPTIMIZATION_EXIT
+      #define IAR_ONLY_LOW_OPTIMIZATION_ENTER 
+      #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+#endif
+
+
+
+/* Compiler specific diagnostic adjustment */
+#if   defined ( __CC_ARM )
+
+#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+
+#elif defined ( __GNUC__ )
+#pragma GCC diagnostic pop
+
+#elif defined ( __ICCARM__ )
+
+#elif defined ( __TI_ARM__ )
+
+#elif defined ( __CSMC__ )
+
+#elif defined ( __TASKING__ )
+
+#elif defined ( _MSC_VER )
+
+#else
+  #error Unknown compiler
+#endif
+
+#ifdef   __cplusplus
+}
+#endif
+
+#if __ARM_FEATURE_MVE
+#include <arm_mve.h>
+#endif
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+ /**
+   * @brief 8-bit fractional data type in 1.7 format.
+   */
+  typedef int8_t q7_t;
+
+  /**
+   * @brief 16-bit fractional data type in 1.15 format.
+   */
+  typedef int16_t q15_t;
+
+  /**
+   * @brief 32-bit fractional data type in 1.31 format.
+   */
+  typedef int32_t q31_t;
+
+  /**
+   * @brief 64-bit fractional data type in 1.63 format.
+   */
+  typedef int64_t q63_t;
+
+  /**
+   * @brief 32-bit floating-point type definition.
+   */
+  typedef float float32_t;
+
+  /**
+   * @brief 64-bit floating-point type definition.
+   */
+  typedef double float64_t;
+
+  /**
+   * @brief vector types
+   */
+#if defined(ARM_MATH_NEON) || (defined (ARM_MATH_MVEI)  && !defined(ARM_MATH_AUTOVECTORIZE))
+  /**
+   * @brief 64-bit fractional 128-bit vector data type in 1.63 format
+   */
+  typedef int64x2_t q63x2_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector data type in 1.31 format.
+   */
+  typedef int32x4_t q31x4_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector data type with 16-bit alignment in 1.15 format.
+   */
+  typedef __ALIGNED(2) int16x8_t q15x8_t;
+
+ /**
+   * @brief 8-bit fractional 128-bit vector data type with 8-bit alignment in 1.7 format.
+   */
+  typedef __ALIGNED(1) int8x16_t q7x16_t;
+
+    /**
+   * @brief 32-bit fractional 128-bit vector pair data type in 1.31 format.
+   */
+  typedef int32x4x2_t q31x4x2_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector quadruplet data type in 1.31 format.
+   */
+  typedef int32x4x4_t q31x4x4_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector pair data type in 1.15 format.
+   */
+  typedef int16x8x2_t q15x8x2_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector quadruplet data type in 1.15 format.
+   */
+  typedef int16x8x4_t q15x8x4_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector pair data type in 1.7 format.
+   */
+  typedef int8x16x2_t q7x16x2_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector quadruplet data type in 1.7 format.
+   */
+   typedef int8x16x4_t q7x16x4_t;
+
+  /**
+   * @brief 32-bit fractional data type in 9.23 format.
+   */
+  typedef int32_t q23_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector data type in 9.23 format.
+   */
+  typedef int32x4_t q23x4_t;
+
+  /**
+   * @brief 64-bit status 128-bit vector data type.
+   */
+  typedef int64x2_t status64x2_t;
+
+  /**
+   * @brief 32-bit status 128-bit vector data type.
+   */
+  typedef int32x4_t status32x4_t;
+
+  /**
+   * @brief 16-bit status 128-bit vector data type.
+   */
+  typedef int16x8_t status16x8_t;
+
+  /**
+   * @brief 8-bit status 128-bit vector data type.
+   */
+  typedef int8x16_t status8x16_t;
+
+
+#endif
+
+#if defined(ARM_MATH_NEON) || (defined(ARM_MATH_MVEF)  && !defined(ARM_MATH_AUTOVECTORIZE)) /* floating point vector*/
+  /**
+   * @brief 32-bit floating-point 128-bit vector type
+   */
+  typedef float32x4_t f32x4_t;
+
+  /**
+   * @brief 32-bit floating-point 128-bit vector pair data type
+   */
+  typedef float32x4x2_t f32x4x2_t;
+
+  /**
+   * @brief 32-bit floating-point 128-bit vector quadruplet data type
+   */
+  typedef float32x4x4_t f32x4x4_t;
+
+  /**
+   * @brief 32-bit ubiquitous 128-bit vector data type
+   */
+  typedef union _any32x4_t
+  {
+      float32x4_t     f;
+      int32x4_t       i;
+  } any32x4_t;
+
+#endif
+
+#if defined(ARM_MATH_NEON)
+  /**
+   * @brief 32-bit fractional 64-bit vector data type in 1.31 format.
+   */
+  typedef int32x2_t  q31x2_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector data type in 1.15 format.
+   */
+  typedef  __ALIGNED(2) int16x4_t q15x4_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector data type in 1.7 format.
+   */
+  typedef  __ALIGNED(1) int8x8_t q7x8_t;
+
+  /**
+   * @brief 32-bit float 64-bit vector data type.
+   */
+  typedef float32x2_t  f32x2_t;
+
+  /**
+   * @brief 32-bit floating-point 128-bit vector triplet data type
+   */
+  typedef float32x4x3_t f32x4x3_t;
+
+
+  /**
+   * @brief 32-bit fractional 128-bit vector triplet data type in 1.31 format
+   */
+  typedef int32x4x3_t q31x4x3_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector triplet data type in 1.15 format
+   */
+  typedef int16x8x3_t q15x8x3_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector triplet data type in 1.7 format
+   */
+  typedef int8x16x3_t q7x16x3_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector pair data type
+   */
+  typedef float32x2x2_t f32x2x2_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector triplet data type
+   */
+  typedef float32x2x3_t f32x2x3_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector quadruplet data type
+   */
+  typedef float32x2x4_t f32x2x4_t;
+
+
+  /**
+   * @brief 32-bit fractional 64-bit vector pair data type in 1.31 format
+   */
+  typedef int32x2x2_t q31x2x2_t;
+
+  /**
+   * @brief 32-bit fractional 64-bit vector triplet data type in 1.31 format
+   */
+  typedef int32x2x3_t q31x2x3_t;
+
+  /**
+   * @brief 32-bit fractional 64-bit vector quadruplet data type in 1.31 format
+   */
+  typedef int32x2x4_t q31x2x4_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector pair data type in 1.15 format
+   */
+  typedef int16x4x2_t q15x4x2_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector triplet data type in 1.15 format
+   */
+  typedef int16x4x3_t q15x4x3_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector quadruplet data type in 1.15 format
+   */
+  typedef int16x4x4_t q15x4x4_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector pair data type in 1.7 format
+   */
+  typedef int8x8x2_t q7x8x2_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector triplet data type in 1.7 format
+   */
+  typedef int8x8x3_t q7x8x3_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector quadruplet data type in 1.7 format
+   */
+  typedef int8x8x4_t q7x8x4_t;
+
+  /**
+   * @brief 32-bit ubiquitous 64-bit vector data type
+   */
+  typedef union _any32x2_t
+  {
+      float32x2_t     f;
+      int32x2_t       i;
+  } any32x2_t;
+
+
+  /**
+   * @brief 32-bit status 64-bit vector data type.
+   */
+  typedef int32x2_t status32x2_t;
+
+  /**
+   * @brief 16-bit status 64-bit vector data type.
+   */
+  typedef int16x4_t status16x4_t;
+
+  /**
+   * @brief 8-bit status 64-bit vector data type.
+   */
+  typedef int8x8_t status8x8_t;
+
+#endif
+
+
+
+
+
+#define F64_MAX   ((float64_t)DBL_MAX)
+#define F32_MAX   ((float32_t)FLT_MAX)
+
+
+
+#define F64_MIN   (-DBL_MAX)
+#define F32_MIN   (-FLT_MAX)
+
+
+
+#define F64_ABSMAX   ((float64_t)DBL_MAX)
+#define F32_ABSMAX   ((float32_t)FLT_MAX)
+
+
+
+#define F64_ABSMIN   ((float64_t)0.0)
+#define F32_ABSMIN   ((float32_t)0.0)
+
+
+#define Q31_MAX   ((q31_t)(0x7FFFFFFFL))
+#define Q15_MAX   ((q15_t)(0x7FFF))
+#define Q7_MAX    ((q7_t)(0x7F))
+#define Q31_MIN   ((q31_t)(0x80000000L))
+#define Q15_MIN   ((q15_t)(0x8000))
+#define Q7_MIN    ((q7_t)(0x80))
+
+#define Q31_ABSMAX   ((q31_t)(0x7FFFFFFFL))
+#define Q15_ABSMAX   ((q15_t)(0x7FFF))
+#define Q7_ABSMAX    ((q7_t)(0x7F))
+#define Q31_ABSMIN   ((q31_t)0)
+#define Q15_ABSMIN   ((q15_t)0)
+#define Q7_ABSMIN    ((q7_t)0)
+
+  /* Dimension C vector space */
+  #define CMPLX_DIM 2
+
+  /**
+   * @brief Error status returned by some functions in the library.
+   */
+
+  typedef enum
+  {
+    ARM_MATH_SUCCESS                 =  0,        /**< No error */
+    ARM_MATH_ARGUMENT_ERROR          = -1,        /**< One or more arguments are incorrect */
+    ARM_MATH_LENGTH_ERROR            = -2,        /**< Length of data buffer is incorrect */
+    ARM_MATH_SIZE_MISMATCH           = -3,        /**< Size of matrices is not compatible with the operation */
+    ARM_MATH_NANINF                  = -4,        /**< Not-a-number (NaN) or infinity is generated */
+    ARM_MATH_SINGULAR                = -5,        /**< Input matrix is singular and cannot be inverted */
+    ARM_MATH_TEST_FAILURE            = -6,        /**< Test Failed */
+    ARM_MATH_DECOMPOSITION_FAILURE   = -7         /**< Decomposition Failed */
+  } arm_status;
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /*ifndef _ARM_MATH_TYPES_H_ */
diff --git a/CMSIS/DSP/Include/arm_math_types_f16.h b/CMSIS/DSP/Include/arm_math_types_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..baf8750f356ee081f0045667d55158186c7c51a5
--- /dev/null
+++ b/CMSIS/DSP/Include/arm_math_types_f16.h
@@ -0,0 +1,156 @@
+/******************************************************************************
+ * @file     arm_math_types_f16.h
+ * @brief    Public header file for f16 function of the CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_MATH_TYPES_F16_H
+#define _ARM_MATH_TYPES_F16_H
+
+#include "arm_math_types.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if !defined( __CC_ARM )
+
+/**
+ * @brief 16-bit floating-point type definition.
+ * This is already defined in arm_mve.h
+ *
+ * This is not fully supported on ARM AC5.
+ */
+
+/*
+
+Check if the type __fp16 is available.
+If it is not available, f16 version of the kernels
+won't be built.
+
+*/
+#if !(__ARM_FEATURE_MVE & 2)
+  #if !defined(DISABLEFLOAT16)
+    #if defined(__ARM_FP16_FORMAT_IEEE) || defined(__ARM_FP16_FORMAT_ALTERNATIVE)
+      typedef __fp16 float16_t;
+      #define ARM_FLOAT16_SUPPORTED
+    #endif
+  #endif
+#else
+  /* When MVE float16 vectors are available, this flag is always defined and can't be disabled */
+  #define ARM_FLOAT16_SUPPORTED
+#endif
+
+#if defined(ARM_MATH_NEON) || (defined(ARM_MATH_MVEF)  && !defined(ARM_MATH_AUTOVECTORIZE)) /* floating point vector*/
+  
+#if defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_NEON_FLOAT16)
+
+  /**
+   * @brief 16-bit floating-point 128-bit vector data type
+   */
+  typedef __ALIGNED(2) float16x8_t f16x8_t;
+
+  /**
+   * @brief 16-bit floating-point 128-bit vector pair data type
+   */
+  typedef float16x8x2_t f16x8x2_t;
+
+  /**
+   * @brief 16-bit floating-point 128-bit vector quadruplet data type
+   */
+  typedef float16x8x4_t f16x8x4_t;
+
+  /**
+   * @brief 16-bit ubiquitous 128-bit vector data type
+   */
+  typedef union _any16x8_t
+  {
+      float16x8_t     f;
+      int16x8_t       i;
+  } any16x8_t;
+#endif
+
+#endif
+
+#if defined(ARM_MATH_NEON)
+ 
+
+#if defined(ARM_MATH_NEON_FLOAT16)
+  /**
+   * @brief 16-bit float 64-bit vector data type.
+   */
+  typedef  __ALIGNED(2) float16x4_t f16x4_t;
+
+  /**
+   * @brief 16-bit floating-point 128-bit vector triplet data type
+   */
+  typedef float16x8x3_t f16x8x3_t;
+
+  /**
+   * @brief 16-bit floating-point 64-bit vector pair data type
+   */
+  typedef float16x4x2_t f16x4x2_t;
+
+  /**
+   * @brief 16-bit floating-point 64-bit vector triplet data type
+   */
+  typedef float16x4x3_t f16x4x3_t;
+
+  /**
+   * @brief 16-bit floating-point 64-bit vector quadruplet data type
+   */
+  typedef float16x4x4_t f16x4x4_t;
+
+  /**
+   * @brief 16-bit ubiquitous 64-bit vector data type
+   */
+  typedef union _any16x4_t
+  {
+      float16x4_t     f;
+      int16x4_t       i;
+  } any16x4_t;
+#endif 
+
+#endif
+
+
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+#define F16_MAX   ((float16_t)__FLT16_MAX__)
+#define F16_MIN   (-(float16_t)__FLT16_MAX__)
+
+#define F16_ABSMAX   ((float16_t)__FLT16_MAX__)
+#define F16_ABSMIN   ((float16_t)0.0f16)
+
+#define F16INFINITY ((float16_t)__builtin_inf())
+  
+#endif /* ARM_FLOAT16_SUPPORTED*/
+#endif /* !defined( __CC_ARM ) */
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* _ARM_MATH_TYPES_F16_H */
+
+
diff --git a/CMSIS/DSP/Include/arm_mve_tables.h b/CMSIS/DSP/Include/arm_mve_tables.h
index 4d2c135ac6e1b196da6bd9fd84d457c42dba53a2..fe41a443cafdb79ccbca6d7504fb308f2e074f9e 100644
--- a/CMSIS/DSP/Include/arm_mve_tables.h
+++ b/CMSIS/DSP/Include/arm_mve_tables.h
@@ -4,13 +4,13 @@
  * Description:  common tables like fft twiddle factors, Bitreverse, reciprocal etc
  *               used for MVE implementation only
  *
- * $Date:        08. January 2020
- * $Revision:    V1.7.0
+ * @version  V1.9.0
+ * @date     23 April 2021
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -30,9 +30,12 @@
  #ifndef _ARM_MVE_TABLES_H
  #define _ARM_MVE_TABLES_H
 
- #include "arm_math.h"
+#include "arm_math_types.h"
 
- 
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
 
 
  
@@ -98,7 +101,7 @@ extern float32_t rearranged_twiddle_stride3_4096_f32[2728];
 
 
 
-#if defined(ARM_MATH_MVEI) 
+#if defined(ARM_MATH_MVEI)  && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
 
@@ -159,7 +162,7 @@ extern q31_t rearranged_twiddle_stride3_4096_q31[2728];
 
 
 
-#if defined(ARM_MATH_MVEI) 
+#if defined(ARM_MATH_MVEI)  && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
 
@@ -220,16 +223,9 @@ extern q15_t rearranged_twiddle_stride3_4096_q15[2728];
 
 
 
-#if defined(ARM_MATH_MVEI) 
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-
-#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
-
-#endif /* defined(ARM_MATH_MVEI) */
-
-
+#ifdef   __cplusplus
+}
+#endif
 
 #endif /*_ARM_MVE_TABLES_H*/
 
diff --git a/CMSIS/DSP/Include/arm_mve_tables_f16.h b/CMSIS/DSP/Include/arm_mve_tables_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..c93aed1813bb8fa4c92a1a5f4598fc7a8ad4ee3b
--- /dev/null
+++ b/CMSIS/DSP/Include/arm_mve_tables_f16.h
@@ -0,0 +1,109 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mve_tables_f16.h
+ * Description:  common tables like fft twiddle factors, Bitreverse, reciprocal etc
+ *               used for MVE implementation only
+ *
+ * @version  V1.9.0
+ * @date     23 April 2021
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #ifndef _ARM_MVE_TABLES_F16_H
+ #define _ARM_MVE_TABLES_F16_H
+
+#include "arm_math_types_f16.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+
+ 
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_16) || defined(ARM_TABLE_TWIDDLECOEF_F16_32)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_16_f16[2];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_16_f16[2];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_16_f16[2];
+extern float16_t rearranged_twiddle_stride1_16_f16[8];
+extern float16_t rearranged_twiddle_stride2_16_f16[8];
+extern float16_t rearranged_twiddle_stride3_16_f16[8];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_64) || defined(ARM_TABLE_TWIDDLECOEF_F16_128)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_64_f16[3];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_64_f16[3];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_64_f16[3];
+extern float16_t rearranged_twiddle_stride1_64_f16[40];
+extern float16_t rearranged_twiddle_stride2_64_f16[40];
+extern float16_t rearranged_twiddle_stride3_64_f16[40];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_256) || defined(ARM_TABLE_TWIDDLECOEF_F16_512)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_256_f16[4];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_256_f16[4];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_256_f16[4];
+extern float16_t rearranged_twiddle_stride1_256_f16[168];
+extern float16_t rearranged_twiddle_stride2_256_f16[168];
+extern float16_t rearranged_twiddle_stride3_256_f16[168];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_1024) || defined(ARM_TABLE_TWIDDLECOEF_F16_2048)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_f16[5];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_f16[5];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_f16[5];
+extern float16_t rearranged_twiddle_stride1_1024_f16[680];
+extern float16_t rearranged_twiddle_stride2_1024_f16[680];
+extern float16_t rearranged_twiddle_stride3_1024_f16[680];
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_4096) || defined(ARM_TABLE_TWIDDLECOEF_F16_8192)
+
+extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_f16[6];
+extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_f16[6];
+extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_f16[6];
+extern float16_t rearranged_twiddle_stride1_4096_f16[2728];
+extern float16_t rearranged_twiddle_stride2_4096_f16[2728];
+extern float16_t rearranged_twiddle_stride3_4096_f16[2728];
+#endif
+
+
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /*_ARM_MVE_TABLES_F16_H*/
+
diff --git a/CMSIS/DSP/Include/arm_vec_math.h b/CMSIS/DSP/Include/arm_vec_math.h
index 0ce9464bcb0415ea31078a0ab46c2859748e94e0..029088f1fe8ed0db916d2375256f09bdd92a4188 100644
--- a/CMSIS/DSP/Include/arm_vec_math.h
+++ b/CMSIS/DSP/Include/arm_vec_math.h
@@ -1,11 +1,12 @@
 /******************************************************************************
  * @file     arm_vec_math.h
  * @brief    Public header file for CMSIS DSP Library
- * @version  V1.7.0
- * @date     15. October 2019
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
  ******************************************************************************/
 /*
- * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,7 +26,7 @@
 #ifndef _ARM_VEC_MATH_H
 #define _ARM_VEC_MATH_H
 
-#include "arm_math.h"
+#include "arm_math_types.h"
 #include "arm_common_tables.h"
 #include "arm_helium_utils.h"
 
@@ -295,7 +296,7 @@ __STATIC_INLINE f32x4_t vpowq_f32(
 
 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
 
-#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
+#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
 #endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
 
 #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
diff --git a/CMSIS/DSP/Include/arm_vec_math_f16.h b/CMSIS/DSP/Include/arm_vec_math_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..c79955ba11ac60aa83716328bfc8e97692ea4098
--- /dev/null
+++ b/CMSIS/DSP/Include/arm_vec_math_f16.h
@@ -0,0 +1,312 @@
+/******************************************************************************
+ * @file     arm_vec_math_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_VEC_MATH_F16_H
+#define _ARM_VEC_MATH_F16_H
+
+#include "arm_math_types_f16.h"
+#include "arm_common_tables_f16.h"
+#include "arm_helium_utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/* Natural logarithm of 2; vlogq_f16 multiplies the unbiased exponent by it. */
+static const float16_t __logf_rng_f16=0.693147180f16;
+
+/* Fast per-lane reciprocal approximation (bit-level seed refined by 3 Newton steps) */
+__STATIC_INLINE f16x8_t vrecip_medprec_f16(
+    f16x8_t x)
+{
+    q15x8_t         m;
+    f16x8_t         b;
+    any16x8_t       xinv;
+    f16x8_t         ax = vabsq(x);
+    /* operate on |x|; zero and negative lanes are patched up at the end */
+    xinv.f = ax;
+    /* m = bits of 1.0 (0x3c00) minus the exponent field of |x| (mask 0x7c00); it is added before and after the linear seed */
+    m = 0x03c00 - (xinv.i & 0x07c00);
+    xinv.i = xinv.i + m;
+    xinv.f = 1.41176471f16 - 0.47058824f16 * xinv.f;
+    xinv.i = xinv.i + m;
+    /* Newton step 1: xinv = xinv * (2 - xinv * |x|) */
+    b = 2.0f16 - xinv.f * ax;
+    xinv.f = xinv.f * b;
+    /* Newton step 2 */
+    b = 2.0f16 - xinv.f * ax;
+    xinv.f = xinv.f * b;
+    /* Newton step 3 */
+    b = 2.0f16 - xinv.f * ax;
+    xinv.f = xinv.f * b;
+    /* lanes where x == 0 are overwritten with F16INFINITY */
+    xinv.f = vdupq_m(xinv.f, F16INFINITY, vcmpeqq(x, 0.0f));
+    /*
+     * restore sign: negate the lanes where x < 0
+     */
+    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));
+
+    return xinv.f;
+}
+
+/* Fast per-lane reciprocal approximation (bit-level seed refined by 4 Newton steps) */
+__STATIC_INLINE f16x8_t vrecip_hiprec_f16(
+    f16x8_t x)
+{
+    q15x8_t         m;
+    f16x8_t         b;
+    any16x8_t       xinv;
+    f16x8_t         ax = vabsq(x);
+    /* operate on |x|; zero and negative lanes are patched up at the end */
+    xinv.f = ax;
+    /* m = bits of 1.0 (0x3c00) minus the exponent field of |x| (mask 0x7c00); it is added before and after the linear seed */
+    m = 0x03c00 - (xinv.i & 0x07c00);
+    xinv.i = xinv.i + m;
+    xinv.f = 1.41176471f16 - 0.47058824f16 * xinv.f;
+    xinv.i = xinv.i + m;
+    /* Newton step 1: xinv = xinv * (2 - xinv * |x|) */
+    b = 2.0f16 - xinv.f * ax;
+    xinv.f = xinv.f * b;
+    /* Newton step 2 */
+    b = 2.0f16 - xinv.f * ax;
+    xinv.f = xinv.f * b;
+    /* Newton step 3 */
+    b = 2.0f16 - xinv.f * ax;
+    xinv.f = xinv.f * b;
+    /* Newton step 4 (the extra step compared with vrecip_medprec_f16) */
+    b = 2.0f16 - xinv.f * ax;
+    xinv.f = xinv.f * b;
+    /* lanes where x == 0 are overwritten with F16INFINITY */
+    xinv.f = vdupq_m(xinv.f, F16INFINITY, vcmpeqq(x, 0.0f));
+    /*
+     * restore sign: negate the lanes where x < 0
+     */
+    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));
+
+    return xinv.f;
+}
+/* Per-lane division: num multiplied by vrecip_hiprec_f16(den) */
+__STATIC_INLINE f16x8_t vdiv_f16(
+    f16x8_t num, f16x8_t den)
+{
+    return vmulq(num, vrecip_hiprec_f16(den));
+}
+
+
+/**
+  @brief         Half-precision (f16) polynomial evaluation.
+  @param[in]     x              f16  vector input
+  @param[in]     coeffs         f16  coefficient table; entries 0 to 7 are read
+  @return        destination    f16  vector
+ */
+
+__STATIC_INLINE float16x8_t vtaylor_polyq_f16(
+        float16x8_t           x,
+        const float16_t * coeffs)
+{
+    float16x8_t         A = vfmasq(vdupq_n_f16(coeffs[4]), x, coeffs[0]);
+    float16x8_t         B = vfmasq(vdupq_n_f16(coeffs[6]), x, coeffs[2]);
+    float16x8_t         C = vfmasq(vdupq_n_f16(coeffs[5]), x, coeffs[1]);
+    float16x8_t         D = vfmasq(vdupq_n_f16(coeffs[7]), x, coeffs[3]);
+    float16x8_t         x2 = vmulq(x, x);
+    float16x8_t         x4 = vmulq(x2, x2);
+    float16x8_t         res = vfmaq(vfmaq_f16(A, B, x2), vfmaq_f16(C, D, x2), x4);
+
+    return res;
+}
+/* Splits x into exponent and mantissa: writes caller-scope vecExpUnBiased and vecTmpFlt1, and declares r and n in that scope. */
+#define VMANT_EXP_F16(x)  \
+    any16x8_t       r;    \
+    int16x8_t       n;    \
+                          \
+    r.f = x;              \
+    n = r.i >> 10;        \
+    n = n - 15;           \
+    r.i = r.i - (n << 10);\
+                          \
+    vecExpUnBiased = n;   \
+    vecTmpFlt1 = r.f;
+/* Per-lane natural logarithm: polynomial in the rescaled mantissa plus exponent * __logf_rng_f16; lanes equal to 0 give -F16INFINITY. */
+__STATIC_INLINE float16x8_t vlogq_f16(float16x8_t vecIn)
+{
+    q15x8_t             vecExpUnBiased;
+    float16x8_t         vecTmpFlt0, vecTmpFlt1;
+    float16x8_t         vecAcc0, vecAcc1, vecAcc2, vecAcc3;
+    float16x8_t         vecExpUnBiasedFlt;
+
+    /*
+     * extract exponent (into vecExpUnBiased) and rescaled mantissa (into vecTmpFlt1)
+     */
+    VMANT_EXP_F16(vecIn);
+    /* xx = r.f * r.f */
+    vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
+    /*
+     * a = (__logf_lut_f16[4] * r.f) + (__logf_lut_f16[0]);
+     */
+    vecAcc0 = vdupq_n_f16(__logf_lut_f16[0]);
+    vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f16[4]);
+    /*
+     * b = (__logf_lut_f16[6] * r.f) + (__logf_lut_f16[2]);
+     */
+    vecAcc1 = vdupq_n_f16(__logf_lut_f16[2]);
+    vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f16[6]);
+    /*
+     * c = (__logf_lut_f16[5] * r.f) + (__logf_lut_f16[1]);
+     */
+    vecAcc2 = vdupq_n_f16(__logf_lut_f16[1]);
+    vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f16[5]);
+    /*
+     * d = (__logf_lut_f16[7] * r.f) + (__logf_lut_f16[3]);
+     */
+    vecAcc3 = vdupq_n_f16(__logf_lut_f16[3]);
+    vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f16[7]);
+    /*
+     * a = a + b * xx;
+     */
+    vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
+    /*
+     * c = c + d * xx;
+     */
+    vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
+    /*
+     * xx = xx * xx;
+     */
+    vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
+    vecExpUnBiasedFlt = vcvtq_f16_s16(vecExpUnBiased);
+    /*
+     * r.f = a + c * xx;
+     */
+    vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
+    /*
+     * add exponent
+     * r.f = r.f + ((float16_t) m) * __logf_rng_f16;
+     */
+    vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f16);
+    // lanes where the input equals 0 are forced to -inf (log 0)
+    vecAcc0 = vdupq_m(vecAcc0, -F16INFINITY, vcmpeqq(vecIn, 0.0f));
+    return vecAcc0;
+}
+/* Per-lane exponential: range reduction, polynomial on exp_tab_f16, then exponent re-insertion through an integer add. */
+__STATIC_INLINE float16x8_t vexpq_f16(
+    float16x8_t x)
+{
+    // Perform range reduction [-log(2),log(2)]
+    int16x8_t       m = vcvtq_s16_f16(vmulq_n_f16(x, 1.4426950408f16));
+    float16x8_t     val = vfmsq_f16(x, vcvtq_f16_s16(m), vdupq_n_f16(0.6931471805f16));
+
+    // Polynomial Approximation
+    float16x8_t         poly = vtaylor_polyq_f16(val, exp_tab_f16);
+
+    // Reconstruct: saturating add of (m << 10) to the raw bits of poly
+    poly = (float16x8_t) (vqaddq_s16((int16x8_t) (poly), vqshlq_n_s16(m, 10)));
+    // Lanes with m < -14 are forced to 0
+    poly = vdupq_m(poly, 0.0f, vcmpltq_n_s16(m, -14));
+    return poly;
+}
+/* Per-lane integer power by repeated multiplication; when nb is 0 or 1 the loop never runs and x is returned unchanged. */
+__STATIC_INLINE float16x8_t arm_vec_exponent_f16(float16x8_t x, int16_t nb)
+{
+    float16x8_t         r = x;
+    nb--;
+    while (nb > 0) {
+        r = vmulq(r, x);
+        nb--;
+    }
+    return (r);
+}
+/* Per-lane power val^n, evaluated as vexpq_f16(n * vlogq_f16(val)). */
+__STATIC_INLINE f16x8_t vpowq_f16(
+    f16x8_t val,
+    f16x8_t n)
+{
+    return vexpq_f16(vmulq_f16(n, vlogq_f16(val)));
+}
+/* Magic constant: the raw f16 bit pattern is subtracted from it to obtain the initial reciprocal estimate. */
+#define INV_NEWTON_INIT_F16  0x7773
+/* Per-lane reciprocal 1/vecIn. Zero lanes yield F16INFINITY; negative lanes are computed on |vecIn| and then negated. */
+__STATIC_INLINE f16x8_t vrecip_f16(f16x8_t vecIn)
+{
+    f16x8_t     vecSx, vecW, vecTmp;
+    any16x8_t   v;
+
+    vecSx = vabsq(vecIn);
+    /* seed from |vecIn|: a set sign bit would corrupt the integer subtraction below; the sign is restored at the end */
+    v.f = vecSx;
+    v.i = vsubq(vdupq_n_s16(INV_NEWTON_INIT_F16), v.i);
+    /* w = |x| * estimate, close to 1 when the estimate is good */
+    vecW = vmulq(vecSx, v.f);
+
+    // v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w *(56 + w * (-28 + w * (8 - w)))))));
+    vecTmp = vsubq(vdupq_n_f16(8.0f), vecW);
+    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
+    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
+    vecTmp = vfmasq(vecW, vecTmp, -70.0f);
+    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
+    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
+    vecTmp = vfmasq(vecW, vecTmp, 8.0f);
+    v.f = vmulq(v.f,  vecTmp);
+    /* lanes where vecIn == 0 are overwritten with F16INFINITY */
+    v.f = vdupq_m(v.f, F16INFINITY, vcmpeqq(vecIn, 0.0f));
+    /*
+     * restore sign: negate the lanes where vecIn < 0
+     */
+    v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f));
+    return v.f;
+}
+/* Per-lane tanh = (exp(2x) - 1) / (exp(2x) + 1); x is clamped to +/-5 so exp(2x) stays below the f16 maximum (65504). */
+__STATIC_INLINE f16x8_t vtanhq_f16(
+    f16x8_t val)
+{
+    f16x8_t         x =
+        vminnmq_f16(vmaxnmq_f16(val, vdupq_n_f16(-5.f)), vdupq_n_f16(5.0f));
+    f16x8_t         exp2x = vexpq_f16(vmulq_n_f16(x, 2.f));
+    f16x8_t         num = vsubq_n_f16(exp2x, 1.f);
+    f16x8_t         den = vaddq_n_f16(exp2x, 1.f);
+    f16x8_t         tanh = vmulq_f16(num, vrecip_f16(den));
+    return tanh;
+}
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16)  && !defined(ARM_MATH_AUTOVECTORIZE)*/
+
+
+
+#endif /* ARM FLOAT16 SUPPORTED */
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* _ARM_VEC_MATH_F16_H */
+
+/**
+ *
+ * End of file.
+ */
diff --git a/CMSIS/DSP/Include/dsp/basic_math_functions.h b/CMSIS/DSP/Include/dsp/basic_math_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3481524ed4c9abc00870ade00fd27bd79112da8
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/basic_math_functions.h
@@ -0,0 +1,764 @@
+/******************************************************************************
+ * @file     basic_math_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _BASIC_MATH_FUNCTIONS_H_
+#define _BASIC_MATH_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @defgroup groupMath Basic Math Functions
+ */
+
+ /**
+   * @brief Q7 vector multiplication.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_mult_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q15 vector multiplication.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_mult_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q31 vector multiplication.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_mult_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Floating-point vector multiplication.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_mult_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+
+ /**
+   * @brief Floating-point vector addition.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_add_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+
+  /**
+   * @brief Q7 vector addition.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_add_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q15 vector addition.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_add_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q31 vector addition.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_add_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Floating-point vector subtraction.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_sub_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+
+  /**
+   * @brief Q7 vector subtraction.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_sub_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q15 vector subtraction.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_sub_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q31 vector subtraction.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_sub_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Multiplies a floating-point vector by a scalar.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  scale      scale factor to be applied
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_scale_f32(
+  const float32_t * pSrc,
+        float32_t scale,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+
+  /**
+   * @brief Multiplies a Q7 vector by a scalar.
+   * @param[in]  pSrc        points to the input vector
+   * @param[in]  scaleFract  fractional portion of the scale value
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to the output vector
+   * @param[in]  blockSize   number of samples in the vector
+   */
+  void arm_scale_q7(
+  const q7_t * pSrc,
+        q7_t scaleFract,
+        int8_t shift,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Multiplies a Q15 vector by a scalar.
+   * @param[in]  pSrc        points to the input vector
+   * @param[in]  scaleFract  fractional portion of the scale value
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to the output vector
+   * @param[in]  blockSize   number of samples in the vector
+   */
+  void arm_scale_q15(
+  const q15_t * pSrc,
+        q15_t scaleFract,
+        int8_t shift,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Multiplies a Q31 vector by a scalar.
+   * @param[in]  pSrc        points to the input vector
+   * @param[in]  scaleFract  fractional portion of the scale value
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to the output vector
+   * @param[in]  blockSize   number of samples in the vector
+   */
+  void arm_scale_q31(
+  const q31_t * pSrc,
+        q31_t scaleFract,
+        int8_t shift,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q7 vector absolute value.
+   * @param[in]  pSrc       points to the input buffer
+   * @param[out] pDst       points to the output buffer
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_abs_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Floating-point vector absolute value.
+   * @param[in]  pSrc       points to the input buffer
+   * @param[out] pDst       points to the output buffer
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_abs_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+
+
+  /**
+   * @brief Q15 vector absolute value.
+   * @param[in]  pSrc       points to the input buffer
+   * @param[out] pDst       points to the output buffer
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_abs_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Q31 vector absolute value.
+   * @param[in]  pSrc       points to the input buffer
+   * @param[out] pDst       points to the output buffer
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_abs_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Dot product of floating-point vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[in]  blockSize  number of samples in each vector
+   * @param[out] result     output result returned here
+   */
+  void arm_dot_prod_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        uint32_t blockSize,
+        float32_t * result);
+
+
+
+  /**
+   * @brief Dot product of Q7 vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[in]  blockSize  number of samples in each vector
+   * @param[out] result     output result returned here
+   */
+  void arm_dot_prod_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        uint32_t blockSize,
+        q31_t * result);
+
+
+  /**
+   * @brief Dot product of Q15 vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[in]  blockSize  number of samples in each vector
+   * @param[out] result     output result returned here
+   */
+  void arm_dot_prod_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        uint32_t blockSize,
+        q63_t * result);
+
+
+  /**
+   * @brief Dot product of Q31 vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[in]  blockSize  number of samples in each vector
+   * @param[out] result     output result returned here
+   */
+  void arm_dot_prod_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        uint32_t blockSize,
+        q63_t * result);
+
+
+  /**
+   * @brief  Shifts the elements of a Q7 vector a specified number of bits.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_shift_q7(
+  const q7_t * pSrc,
+        int8_t shiftBits,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Shifts the elements of a Q15 vector a specified number of bits.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_shift_q15(
+  const q15_t * pSrc,
+        int8_t shiftBits,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Shifts the elements of a Q31 vector a specified number of bits.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  shiftBits  number of bits to shift.  A positive value shifts left; a negative value shifts right.
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_shift_q31(
+  const q31_t * pSrc,
+        int8_t shiftBits,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Adds a constant offset to a floating-point vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  offset     is the offset to be added
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_offset_f32(
+  const float32_t * pSrc,
+        float32_t offset,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+
+  /**
+   * @brief  Adds a constant offset to a Q7 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  offset     is the offset to be added
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_offset_q7(
+  const q7_t * pSrc,
+        q7_t offset,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Adds a constant offset to a Q15 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  offset     is the offset to be added
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_offset_q15(
+  const q15_t * pSrc,
+        q15_t offset,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Adds a constant offset to a Q31 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  offset     is the offset to be added
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_offset_q31(
+  const q31_t * pSrc,
+        q31_t offset,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Negates the elements of a floating-point vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_negate_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Negates the elements of a Q7 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_negate_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Negates the elements of a Q15 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_negate_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Negates the elements of a Q31 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_negate_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+/**
+   * @brief         Compute the logical bitwise AND of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_and_u16(
+    const uint16_t * pSrcA,
+    const uint16_t * pSrcB,
+          uint16_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise AND of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_and_u32(
+    const uint32_t * pSrcA,
+    const uint32_t * pSrcB,
+          uint32_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise AND of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_and_u8(
+    const uint8_t * pSrcA,
+    const uint8_t * pSrcB,
+          uint8_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise OR of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_or_u16(
+    const uint16_t * pSrcA,
+    const uint16_t * pSrcB,
+          uint16_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise OR of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_or_u32(
+    const uint32_t * pSrcA,
+    const uint32_t * pSrcB,
+          uint32_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise OR of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_or_u8(
+    const uint8_t * pSrcA,
+    const uint8_t * pSrcB,
+          uint8_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise NOT of a fixed-point vector.
+   * @param[in]     pSrc       points to input vector 
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_not_u16(
+    const uint16_t * pSrc,
+          uint16_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise NOT of a fixed-point vector.
+   * @param[in]     pSrc       points to input vector 
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_not_u32(
+    const uint32_t * pSrc,
+          uint32_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise NOT of a fixed-point vector.
+   * @param[in]     pSrc       points to input vector 
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_not_u8(
+    const uint8_t * pSrc,
+          uint8_t * pDst,
+          uint32_t blockSize);
+
+/**
+   * @brief         Compute the logical bitwise XOR of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_xor_u16(
+    const uint16_t * pSrcA,
+    const uint16_t * pSrcB,
+          uint16_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise XOR of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_xor_u32(
+    const uint32_t * pSrcA,
+    const uint32_t * pSrcB,
+          uint32_t * pDst,
+          uint32_t blockSize);
+
+  /**
+   * @brief         Compute the logical bitwise XOR of two fixed-point vectors.
+   * @param[in]     pSrcA      points to input vector A
+   * @param[in]     pSrcB      points to input vector B
+   * @param[out]    pDst       points to output vector
+   * @param[in]     blockSize  number of samples in each vector
+   * @return        none
+   */
+  void arm_xor_u8(
+    const uint8_t * pSrcA,
+    const uint8_t * pSrcB,
+          uint8_t * pDst,
+    uint32_t blockSize);
+
+  /**
+  @brief         Elementwise floating-point clipping
+  @param[in]     pSrc          points to input values
+  @param[out]    pDst          points to output clipped values
+  @param[in]     low           lower bound
+  @param[in]     high          higher bound
+  @param[in]     numSamples    number of samples to clip
+  @return        none
+ */
+
+void arm_clip_f32(const float32_t * pSrc, 
+  float32_t * pDst, 
+  float32_t low, 
+  float32_t high, 
+  uint32_t numSamples);
+
+  /**
+  @brief         Elementwise fixed-point clipping
+  @param[in]     pSrc          points to input values
+  @param[out]    pDst          points to output clipped values
+  @param[in]     low           lower bound
+  @param[in]     high          higher bound
+  @param[in]     numSamples    number of samples to clip
+  @return        none
+ */
+
+void arm_clip_q31(const q31_t * pSrc, 
+  q31_t * pDst, 
+  q31_t low, 
+  q31_t high, 
+  uint32_t numSamples);
+
+  /**
+  @brief         Elementwise fixed-point clipping
+  @param[in]     pSrc          points to input values
+  @param[out]    pDst          points to output clipped values
+  @param[in]     low           lower bound
+  @param[in]     high          higher bound
+  @param[in]     numSamples    number of samples to clip
+  @return        none
+ */
+
+void arm_clip_q15(const q15_t * pSrc, 
+  q15_t * pDst, 
+  q15_t low, 
+  q15_t high, 
+  uint32_t numSamples);
+
+  /**
+  @brief         Elementwise fixed-point clipping
+  @param[in]     pSrc          points to input values
+  @param[out]    pDst          points to output clipped values
+  @param[in]     low           lower bound
+  @param[in]     high          higher bound
+  @param[in]     numSamples    number of samples to clip
+  @return        none
+ */
+
+void arm_clip_q7(const q7_t * pSrc, 
+  q7_t * pDst, 
+  q7_t low, 
+  q7_t high, 
+  uint32_t numSamples);
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _BASIC_MATH_FUNCTIONS_H_ */
diff --git a/CMSIS/DSP/Include/dsp/basic_math_functions_f16.h b/CMSIS/DSP/Include/dsp/basic_math_functions_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e4acb2759462e9267a187f15a3999ac4f4f536b
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/basic_math_functions_f16.h
@@ -0,0 +1,168 @@
+/******************************************************************************
+ * @file     basic_math_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _BASIC_MATH_FUNCTIONS_F16_H_
+#define _BASIC_MATH_FUNCTIONS_F16_H_
+
+#include "arm_math_types_f16.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#ifdef   __cplusplus
+extern "C"   /* opened after the includes so that no header is pulled into this linkage block */
+{
+#endif
+
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+  /**
+   * @brief Element-wise addition of two float16_t vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_add_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        float16_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Element-wise subtraction of two float16_t vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_sub_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        float16_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Multiplies a float16_t vector by a scalar.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  scale      scale factor to be applied
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_scale_f16(
+  const float16_t * pSrc,
+        float16_t scale,
+        float16_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Absolute value of each element of a float16_t vector.
+   * @param[in]  pSrc       points to the input buffer
+   * @param[out] pDst       points to the output buffer
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_abs_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Adds a constant offset to a float16_t vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  offset     the offset to be added
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_offset_f16(
+  const float16_t * pSrc,
+        float16_t offset,
+        float16_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Dot product of two float16_t vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[in]  blockSize  number of samples in each vector
+   * @param[out] result     points to the location where the result is returned
+   */
+  void arm_dot_prod_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        uint32_t blockSize,
+        float16_t * result);
+
+  /**
+   * @brief Multiplication of two float16_t vectors.
+   * @param[in]  pSrcA      points to the first input vector
+   * @param[in]  pSrcB      points to the second input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in each vector
+   */
+  void arm_mult_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        float16_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Negates the elements of a float16_t vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples in the vector
+   */
+  void arm_negate_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize);
+
+  /**
+  @brief         Elementwise float16_t floating-point clipping
+  @param[in]     pSrc          points to the input values
+  @param[out]    pDst          points to the clipped output values
+  @param[in]     low           lower bound of the clipping range
+  @param[in]     high          upper bound of the clipping range
+  @param[in]     numSamples    number of samples to clip
+  @return        none
+  */
+
+void arm_clip_f16(const float16_t * pSrc, 
+  float16_t * pDst, 
+  float16_t low, 
+  float16_t high, 
+  uint32_t numSamples);
+
+#endif /* defined(ARM_FLOAT16_SUPPORTED)*/
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _BASIC_MATH_FUNCTIONS_F16_H_ */
diff --git a/CMSIS/DSP/Include/dsp/bayes_functions.h b/CMSIS/DSP/Include/dsp/bayes_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..beca38ec619f77003fc0c1876dffc9e04400516c
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/bayes_functions.h
@@ -0,0 +1,89 @@
+/******************************************************************************
+ * @file     bayes_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _BAYES_FUNCTIONS_H_
+#define _BAYES_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#include "dsp/statistics_functions.h"
+
+/**
+ * @defgroup groupBayes Bayesian estimators
+ *
+ * Implements the naive Gaussian Bayes estimator.
+ * The training must be done with scikit-learn.
+ *
+ * The parameters can be easily
+ * generated from the scikit-learn object. Some examples are given in
+ * DSP/Testing/PatternGeneration/Bayes.py
+ */
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @brief Instance structure for the naive Gaussian Bayesian estimator (float32_t data).
+ */
+typedef struct
+{
+  uint32_t vectorDimension;  /**< Dimension of the vector space */
+  uint32_t numberOfClasses;  /**< Number of different classes */
+  const float32_t *theta;          /**< Mean values of the Gaussians */
+  const float32_t *sigma;          /**< Variances of the Gaussians */
+  const float32_t *classPriors;    /**< Class prior probabilities */
+  float32_t epsilon;         /**< Value added to the variances */
+} arm_gaussian_naive_bayes_instance_f32;
+
+/**
+ * @brief Naive Gaussian Bayesian Estimator
+ *
+ * @param[in]  S                       points to a naive Bayes instance structure
+ * @param[in]  in                      points to the elements of the input vector
+ * @param[out] pOutputProbabilities    points to a buffer of length numberOfClasses containing estimated probabilities
+ * @param[out] pBufferB                points to a temporary buffer of length numberOfClasses
+ * @return The predicted class
+ *
+ */
+
+
+uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S, 
+   const float32_t * in, 
+   float32_t *pOutputProbabilities,
+   float32_t *pBufferB);
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _BAYES_FUNCTIONS_H_ */
diff --git a/CMSIS/DSP/Include/dsp/bayes_functions_f16.h b/CMSIS/DSP/Include/dsp/bayes_functions_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2c9ad82e712e97d04901c4b86b4ef2144def60b
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/bayes_functions_f16.h
@@ -0,0 +1,80 @@
+/******************************************************************************
+ * @file     bayes_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _BAYES_FUNCTIONS_F16_H_
+#define _BAYES_FUNCTIONS_F16_H_
+
+#include "arm_math_types_f16.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#include "dsp/statistics_functions_f16.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+ * @brief Instance structure for the naive Gaussian Bayesian estimator (float16_t data).
+ */
+typedef struct
+{
+  uint32_t vectorDimension;  /**< Dimension of the vector space */
+  uint32_t numberOfClasses;  /**< Number of different classes */
+  const float16_t *theta;          /**< Mean values of the Gaussians */
+  const float16_t *sigma;          /**< Variances of the Gaussians */
+  const float16_t *classPriors;    /**< Class prior probabilities */
+  float16_t epsilon;         /**< Value added to the variances */
+} arm_gaussian_naive_bayes_instance_f16;
+
+/**
+ * @brief Naive Gaussian Bayesian Estimator
+ *
+ * @param[in]  S                       points to a naive Bayes instance structure
+ * @param[in]  in                      points to the elements of the input vector
+ * @param[out] pOutputProbabilities    points to a buffer of length numberOfClasses containing estimated probabilities
+ * @param[out] pBufferB                points to a temporary buffer of length numberOfClasses
+ * @return The predicted class
+ *
+ */
+
+
+uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_instance_f16 *S, 
+   const float16_t * in, 
+   float16_t *pOutputProbabilities,
+   float16_t *pBufferB);
+
+#endif /*defined(ARM_FLOAT16_SUPPORTED)*/
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _BAYES_FUNCTIONS_F16_H_ */
diff --git a/CMSIS/DSP/Include/dsp/complex_math_functions.h b/CMSIS/DSP/Include/dsp/complex_math_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bf3e17034f4e471d8bac3dee750154006cbd3d7
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/complex_math_functions.h
@@ -0,0 +1,295 @@
+/******************************************************************************
+ * @file     complex_math_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _COMPLEX_MATH_FUNCTIONS_H_
+#define _COMPLEX_MATH_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+#include "dsp/fast_math_functions.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @defgroup groupCmplxMath Complex Math Functions
+ * This set of functions operates on complex data vectors.
+ * The data in the complex arrays is stored in an interleaved fashion
+ * (real, imag, real, imag, ...).
+ * In the API functions, the number of samples in a complex array refers
+ * to the number of complex values; the array contains twice this number of
+ * real values.
+ */
+
+  /**
+   * @brief  Floating-point complex conjugate.
+   * @param[in]  pSrc        points to the input vector (interleaved real, imag)
+   * @param[out] pDst        points to the output vector (interleaved real, imag)
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void arm_cmplx_conj_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples);
+
+  /**
+   * @brief  Q31 complex conjugate.
+   * @param[in]  pSrc        points to the input vector (interleaved real, imag)
+   * @param[out] pDst        points to the output vector (interleaved real, imag)
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void arm_cmplx_conj_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q15 complex conjugate.
+   * @param[in]  pSrc        points to the input vector (interleaved real, imag)
+   * @param[out] pDst        points to the output vector (interleaved real, imag)
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void arm_cmplx_conj_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Floating-point complex magnitude squared.
+   * @param[in]  pSrc        points to the complex input vector (interleaved real, imag)
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void arm_cmplx_mag_squared_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q31 complex magnitude squared.
+   * @param[in]  pSrc        points to the complex input vector (interleaved real, imag)
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void arm_cmplx_mag_squared_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q15 complex magnitude squared.
+   * @param[in]  pSrc        points to the complex input vector (interleaved real, imag)
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void arm_cmplx_mag_squared_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Floating-point complex magnitude.
+   * @param[in]  pSrc        points to the complex input vector (interleaved real, imag)
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void arm_cmplx_mag_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q31 complex magnitude.
+   * @param[in]  pSrc        points to the complex input vector (interleaved real, imag)
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void arm_cmplx_mag_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q15 complex magnitude.
+   * @param[in]  pSrc        points to the complex input vector (interleaved real, imag)
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void arm_cmplx_mag_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q15 complex dot product.
+   * @param[in]  pSrcA       points to the first complex input vector (interleaved real, imag)
+   * @param[in]  pSrcB       points to the second complex input vector (interleaved real, imag)
+   * @param[in]  numSamples  number of complex samples in each vector
+   * @param[out] realResult  real part of the result is returned here as a q31_t value
+   * @param[out] imagResult  imaginary part of the result is returned here as a q31_t value
+   */
+  void arm_cmplx_dot_prod_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        uint32_t numSamples,
+        q31_t * realResult,
+        q31_t * imagResult);
+
+
+  /**
+   * @brief  Q31 complex dot product.
+   * @param[in]  pSrcA       points to the first complex input vector (interleaved real, imag)
+   * @param[in]  pSrcB       points to the second complex input vector (interleaved real, imag)
+   * @param[in]  numSamples  number of complex samples in each vector
+   * @param[out] realResult  real part of the result is returned here as a q63_t value
+   * @param[out] imagResult  imaginary part of the result is returned here as a q63_t value
+   */
+  void arm_cmplx_dot_prod_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        uint32_t numSamples,
+        q63_t * realResult,
+        q63_t * imagResult);
+
+
+  /**
+   * @brief  Floating-point complex dot product.
+   * @param[in]  pSrcA       points to the first complex input vector (interleaved real, imag)
+   * @param[in]  pSrcB       points to the second complex input vector (interleaved real, imag)
+   * @param[in]  numSamples  number of complex samples in each vector
+   * @param[out] realResult  real part of the result is returned here
+   * @param[out] imagResult  imaginary part of the result is returned here
+   */
+  void arm_cmplx_dot_prod_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        uint32_t numSamples,
+        float32_t * realResult,
+        float32_t * imagResult);
+
+
+  /**
+   * @brief  Q15 complex-by-real multiplication.
+   * @param[in]  pSrcCmplx   points to the complex input vector (interleaved real, imag)
+   * @param[in]  pSrcReal    points to the real input vector
+   * @param[out] pCmplxDst   points to the complex output vector (interleaved real, imag)
+   * @param[in]  numSamples  number of samples in each vector
+   */
+  void arm_cmplx_mult_real_q15(
+  const q15_t * pSrcCmplx,
+  const q15_t * pSrcReal,
+        q15_t * pCmplxDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q31 complex-by-real multiplication.
+   * @param[in]  pSrcCmplx   points to the complex input vector (interleaved real, imag)
+   * @param[in]  pSrcReal    points to the real input vector
+   * @param[out] pCmplxDst   points to the complex output vector (interleaved real, imag)
+   * @param[in]  numSamples  number of samples in each vector
+   */
+  void arm_cmplx_mult_real_q31(
+  const q31_t * pSrcCmplx,
+  const q31_t * pSrcReal,
+        q31_t * pCmplxDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Floating-point complex-by-real multiplication.
+   * @param[in]  pSrcCmplx   points to the complex input vector (interleaved real, imag)
+   * @param[in]  pSrcReal    points to the real input vector
+   * @param[out] pCmplxDst   points to the complex output vector (interleaved real, imag)
+   * @param[in]  numSamples  number of samples in each vector
+   */
+  void arm_cmplx_mult_real_f32(
+  const float32_t * pSrcCmplx,
+  const float32_t * pSrcReal,
+        float32_t * pCmplxDst,
+        uint32_t numSamples);
+
+  /**
+   * @brief  Q15 complex-by-complex multiplication.
+   * @param[in]  pSrcA       points to the first input vector (interleaved real, imag)
+   * @param[in]  pSrcB       points to the second input vector (interleaved real, imag)
+   * @param[out] pDst        points to the output vector (interleaved real, imag)
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void arm_cmplx_mult_cmplx_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Q31 complex-by-complex multiplication.
+   * @param[in]  pSrcA       points to the first input vector (interleaved real, imag)
+   * @param[in]  pSrcB       points to the second input vector (interleaved real, imag)
+   * @param[out] pDst        points to the output vector (interleaved real, imag)
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void arm_cmplx_mult_cmplx_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t numSamples);
+
+
+  /**
+   * @brief  Floating-point complex-by-complex multiplication.
+   * @param[in]  pSrcA       points to the first input vector (interleaved real, imag)
+   * @param[in]  pSrcB       points to the second input vector (interleaved real, imag)
+   * @param[out] pDst        points to the output vector (interleaved real, imag)
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void arm_cmplx_mult_cmplx_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t numSamples);
+
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _COMPLEX_MATH_FUNCTIONS_H_ */
diff --git a/CMSIS/DSP/Include/dsp/complex_math_functions_f16.h b/CMSIS/DSP/Include/dsp/complex_math_functions_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..da78559bf311a8954140bbb1caef5af18b7fdbe0
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/complex_math_functions_f16.h
@@ -0,0 +1,123 @@
+/******************************************************************************
+ * @file     complex_math_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _COMPLEX_MATH_FUNCTIONS_F16_H_
+#define _COMPLEX_MATH_FUNCTIONS_F16_H_
+
+#include "arm_math_types_f16.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+#include "dsp/fast_math_functions_f16.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+  /**
+   * @brief  Complex conjugate of a float16_t vector.
+   * @param[in]  pSrc        points to the input vector
+   * @param[out] pDst        points to the output vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void arm_cmplx_conj_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t numSamples);
+
+  /**
+   * @brief  Complex magnitude squared of a float16_t vector.
+   * @param[in]  pSrc        points to the complex input vector
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void arm_cmplx_mag_squared_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t numSamples);
+
+  /**
+   * @brief  Complex magnitude of a float16_t vector.
+   * @param[in]  pSrc        points to the complex input vector
+   * @param[out] pDst        points to the real output vector
+   * @param[in]  numSamples  number of complex samples in the input vector
+   */
+  void arm_cmplx_mag_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t numSamples);
+
+  /**
+   * @brief  Complex dot product of two float16_t vectors.
+   * @param[in]  pSrcA       points to the first input vector
+   * @param[in]  pSrcB       points to the second input vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   * @param[out] realResult  real part of the result is returned here
+   * @param[out] imagResult  imaginary part of the result is returned here
+   */
+  void arm_cmplx_dot_prod_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        uint32_t numSamples,
+        float16_t * realResult,
+        float16_t * imagResult);
+
+  /**
+   * @brief  Complex-by-real multiplication of float16_t vectors.
+   * @param[in]  pSrcCmplx   points to the complex input vector
+   * @param[in]  pSrcReal    points to the real input vector
+   * @param[out] pCmplxDst   points to the complex output vector
+   * @param[in]  numSamples  number of samples in each vector
+   */
+  void arm_cmplx_mult_real_f16(
+  const float16_t * pSrcCmplx,
+  const float16_t * pSrcReal,
+        float16_t * pCmplxDst,
+        uint32_t numSamples);
+
+  /**
+   * @brief  Complex-by-complex multiplication of float16_t vectors.
+   * @param[in]  pSrcA       points to the first input vector
+   * @param[in]  pSrcB       points to the second input vector
+   * @param[out] pDst        points to the output vector
+   * @param[in]  numSamples  number of complex samples in each vector
+   */
+  void arm_cmplx_mult_cmplx_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        float16_t * pDst,
+        uint32_t numSamples);
+
+#endif /*defined(ARM_FLOAT16_SUPPORTED)*/
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _COMPLEX_MATH_FUNCTIONS_F16_H_ */
diff --git a/CMSIS/DSP/Include/dsp/controller_functions.h b/CMSIS/DSP/Include/dsp/controller_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..53823dbe976fcfef41fd5fc02579b4f97a24f25d
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/controller_functions.h
@@ -0,0 +1,791 @@
+/******************************************************************************
+ * @file     controller_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _CONTROLLER_FUNCTIONS_H_
+#define _CONTROLLER_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * @brief Macros required for the sine and cosine controller functions
+   */
+
+#define CONTROLLER_Q31_SHIFT  (32 - 9)
+  /* INPUT_SPACING is the value 2/360 expressed in 1.31 (q31) fixed-point format. */
+  /* The range -1 to +1 is divided into 360 values, so the spacing between them is 2/360. */
+#define INPUT_SPACING         0xB60B61
+  
+/**
+ * @defgroup groupController Controller Functions
+ */
+
+
+ /**
+   * @ingroup groupController
+   */
+
+  /**
+   * @addtogroup SinCos
+   * @{
+   */
+
+  /**
+   * @brief  Floating-point sin_cos function.
+   * @param[in]  theta    input value in degrees
+   * @param[out] pSinVal  points to the processed sine output.
+   * @param[out] pCosVal  points to the processed cosine output.
+   */
+  void arm_sin_cos_f32(
+        float32_t theta,
+        float32_t * pSinVal,
+        float32_t * pCosVal);
+
+
+  /**
+   * @brief  Q31 sin_cos function (sine and cosine outputs for one input angle).
+   * @param[in]  theta    scaled input value in degrees
+   * @param[out] pSinVal  points to the processed sine output.
+   * @param[out] pCosVal  points to the processed cosine output.
+   */
+  void arm_sin_cos_q31(
+        q31_t theta,
+        q31_t * pSinVal,
+        q31_t * pCosVal);
+
+  /**
+   * @} end of SinCos group
+   */
+
+ /**
+   * @ingroup groupController
+   */
+
+/**
+   * @defgroup PID PID Motor Control
+   *
+   * A Proportional Integral Derivative (PID) controller is a generic feedback control
+   * loop mechanism widely used in industrial control systems.
+   * A PID controller is the most commonly used type of feedback controller.
+   *
+   * This set of functions implements PID controllers
+   * for Q15, Q31, and floating-point data types.  The functions operate on a single sample
+   * of data and each call to the function returns a single processed value.
+   * <code>S</code> points to an instance of the PID control data structure.  <code>in</code>
+   * is the input sample value. The functions return the output value.
+   *
+   * \par Algorithm:
+   * <pre>
+   *    y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2]
+   *    A0 = Kp + Ki + Kd
+   *    A1 = (-Kp ) - (2 * Kd )
+   *    A2 = Kd
+   * </pre>
+   *
+   * \par
+   * where \c Kp is the proportional constant, \c Ki is the integral constant and \c Kd is the derivative constant
+   *
+   * \par
+   * \image html PID.gif "Proportional Integral Derivative Controller"
+   *
+   * \par
+   * The PID controller calculates an "error" value as the difference between
+   * the measured output and the reference input.
+   * The controller attempts to minimize the error by adjusting the process control inputs.
+   * The proportional value determines the reaction to the current error,
+   * the integral value determines the reaction based on the sum of recent errors,
+   * and the derivative value determines the reaction based on the rate at which the error has been changing.
+   *
+   * \par Instance Structure
+   * The Gains A0, A1, A2 and state variables for a PID controller are stored together in an instance data structure.
+   * A separate instance structure must be defined for each PID Controller.
+   * There are separate instance structure declarations for each of the 3 supported data types.
+   *
+   * \par Reset Functions
+   * There is also an associated reset function for each data type which clears the state array.
+   *
+   * \par Initialization Functions
+   * There is also an associated initialization function for each data type.
+   * The initialization function performs the following operations:
+   * - Initializes the gains A0, A1, A2 from the Kp, Ki, Kd gains.
+   * - Zeros out the values in the state buffer.
+   *
+   * \par
+   * Instance structure cannot be placed into a const data section and it is recommended to use the initialization function.
+   *
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the fixed-point versions of the PID Controller functions.
+   * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+
+  /**
+   * @brief Instance structure for the Q15 PID Control.
+   */
+  typedef struct
+  {
+          q15_t A0;           /**< The derived gain, A0 = Kp + Ki + Kd. */
+#if !defined (ARM_MATH_DSP)
+          q15_t A1;           /**< The derived gain, A1 = -Kp - 2Kd. */
+          q15_t A2;           /**< The derived gain, A2 = Kd. */
+#else
+          q31_t A1;           /**< The derived gains -Kp - 2Kd and Kd held together in one q31_t (NOTE(review): exact packing is not visible here - confirm against the init function). */
+#endif
+          q15_t state[3];     /**< The state array of length 3. */
+          q15_t Kp;           /**< The proportional gain. */
+          q15_t Ki;           /**< The integral gain. */
+          q15_t Kd;           /**< The derivative gain. */
+  } arm_pid_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 PID Control (gains and state are all q31_t).
+   */
+  typedef struct
+  {
+          q31_t A0;            /**< The derived gain, A0 = Kp + Ki + Kd. */
+          q31_t A1;            /**< The derived gain, A1 = -Kp - 2Kd. */
+          q31_t A2;            /**< The derived gain, A2 = Kd. */
+          q31_t state[3];      /**< The state array of length 3: state[0] = x[n-1], state[1] = x[n-2], state[2] = y[n-1]. */
+          q31_t Kp;            /**< The proportional gain. */
+          q31_t Ki;            /**< The integral gain. */
+          q31_t Kd;            /**< The derivative gain. */
+  } arm_pid_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point PID Control (gains and state are all float32_t).
+   */
+  typedef struct
+  {
+          float32_t A0;          /**< The derived gain, A0 = Kp + Ki + Kd. */
+          float32_t A1;          /**< The derived gain, A1 = -Kp - 2Kd. */
+          float32_t A2;          /**< The derived gain, A2 = Kd. */
+          float32_t state[3];    /**< The state array of length 3: state[0] = x[n-1], state[1] = x[n-2], state[2] = y[n-1]. */
+          float32_t Kp;          /**< The proportional gain. */
+          float32_t Ki;          /**< The integral gain. */
+          float32_t Kd;          /**< The derivative gain. */
+  } arm_pid_instance_f32;
+
+
+
+  /**
+   * @brief  Initialization function for the floating-point PID Control.
+   * @param[in,out] S               points to an instance of the floating-point PID structure.
+   * @param[in]     resetStateFlag  flag to reset the state: 0 = no change in state, 1 = reset the state.
+   */
+  void arm_pid_init_f32(
+        arm_pid_instance_f32 * S,
+        int32_t resetStateFlag);
+
+
+  /**
+   * @brief  Reset function for the floating-point PID Control.
+   * @param[in,out] S  points to an instance of the floating-point PID Control structure
+   */
+  void arm_pid_reset_f32(
+        arm_pid_instance_f32 * S);
+
+
+  /**
+   * @brief  Initialization function for the Q31 PID Control.
+   * @param[in,out] S               points to an instance of the Q31 PID structure.
+   * @param[in]     resetStateFlag  flag to reset the state: 0 = no change in state, 1 = reset the state.
+   */
+  void arm_pid_init_q31(
+        arm_pid_instance_q31 * S,
+        int32_t resetStateFlag);
+
+
+  /**
+   * @brief  Reset function for the Q31 PID Control.
+   * @param[in,out] S  points to an instance of the Q31 PID Control structure
+   */
+
+  void arm_pid_reset_q31(
+        arm_pid_instance_q31 * S);
+
+
+  /**
+   * @brief  Initialization function for the Q15 PID Control.
+   * @param[in,out] S               points to an instance of the Q15 PID structure.
+   * @param[in]     resetStateFlag  flag to reset the state: 0 = no change in state, 1 = reset the state.
+   */
+  void arm_pid_init_q15(
+        arm_pid_instance_q15 * S,
+        int32_t resetStateFlag);
+
+
+  /**
+   * @brief  Reset function for the Q15 PID Control.
+   * @param[in,out] S  points to an instance of the Q15 PID Control structure
+   */
+  void arm_pid_reset_q15(
+        arm_pid_instance_q15 * S);
+
+
+
+  /**
+   * @addtogroup PID
+   * @{
+   */
+
+  /**
+   * @brief         Process function for the floating-point PID Control.
+   * @param[in,out] S   points to an instance of the floating-point PID Control structure; its state array is updated
+   * @param[in]     in  input sample to process
+   * @return        processed output sample.
+   */
+  __STATIC_FORCEINLINE float32_t arm_pid_f32(
+  arm_pid_instance_f32 * S,
+  float32_t in)
+  {
+    float32_t out;
+
+    /* y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2], with x[n-1], x[n-2], y[n-1] read from state[0..2] */
+    out = (S->A0 * in) +
+      (S->A1 * S->state[0]) + (S->A2 * S->state[1]) + (S->state[2]);
+
+    /* Update state for the next call: state[1] <- x[n-1], state[0] <- x[n], state[2] <- y[n] */
+    S->state[1] = S->state[0];
+    S->state[0] = in;
+    S->state[2] = out;
+
+    /* return to application */
+    return (out);
+
+  }
+
+/**
+  @brief         Process function for the Q31 PID Control.
+  @param[in,out] S  points to an instance of the Q31 PID Control structure; its state array is updated
+  @param[in]     in  input sample to process
+  @return        processed output sample.
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using an internal 64-bit accumulator.
+         The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+         Thus, if the accumulator result overflows it wraps around rather than clipping.
+         In order to avoid overflows completely the input signal must be scaled down by 2 bits as there are four additions.
+         After all multiply-accumulates are performed, the 2.62 accumulator is shifted right by 31 bits and truncated to 32 bits; y[n-1] is then added with a plain 32-bit addition, so the result is NOT saturated.
+ */
+__STATIC_FORCEINLINE q31_t arm_pid_q31(
+  arm_pid_instance_q31 * S,
+  q31_t in)
+  {
+    q63_t acc;
+    q31_t out;
+
+    /* acc = A0 * x[n]  (operand widened to 64 bits before the multiply) */
+    acc = (q63_t) S->A0 * in;
+
+    /* acc += A1 * x[n-1] */
+    acc += (q63_t) S->A1 * S->state[0];
+
+    /* acc += A2 * x[n-2]  */
+    acc += (q63_t) S->A2 * S->state[1];
+
+    /* convert output to 1.31 format to add y[n-1] (the cast truncates, it does not saturate) */
+    out = (q31_t) (acc >> 31U);
+
+    /* out += y[n-1]  (plain 32-bit addition, no saturation) */
+    out += S->state[2];
+
+    /* Update state for the next call: state[1] <- x[n-1], state[0] <- x[n], state[2] <- y[n] */
+    S->state[1] = S->state[0];
+    S->state[0] = in;
+    S->state[2] = out;
+
+    /* return to application */
+    return (out);
+  }
+
+
+/**
+  @brief         Process function for the Q15 PID Control.
+  @param[in,out] S   points to an instance of the Q15 PID Control structure; its state array is updated
+  @param[in]     in  input sample to process
+  @return        processed output sample.
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using a 64-bit internal accumulator.
+         Both Gains and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+         The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+         There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+         After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+         Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ */
+__STATIC_FORCEINLINE q15_t arm_pid_q15(
+  arm_pid_instance_q15 * S,
+  q15_t in)
+  {
+    q63_t acc;
+    q15_t out;
+
+#if defined (ARM_MATH_DSP)
+    /* DSP build: S->A1 is a 32-bit word carrying both A1 and A2 (see arm_pid_instance_q15) */
+
+    /* acc = A0 * x[n]  */
+    acc = (q31_t) __SMUAD((uint32_t)S->A0, (uint32_t)in);
+
+    /* acc += A1 * x[n-1] + A2 * x[n-2], using state[0] and state[1] read as one 32-bit pair */
+    acc = (q63_t)__SMLALD((uint32_t)S->A1, (uint32_t)read_q15x2 (S->state), (uint64_t)acc);
+#else
+    /* acc = A0 * x[n]  */
+    acc = ((q31_t) S->A0) * in;
+
+    /* acc += A1 * x[n-1] + A2 * x[n-2]  */
+    acc += (q31_t) S->A1 * S->state[0];
+    acc += (q31_t) S->A2 * S->state[1];
+#endif
+
+    /* acc += y[n-1] scaled by 2^15 to 2.30 format; multiply instead of "<< 15" because left-shifting a negative value is undefined in C */
+    acc += (q31_t) S->state[2] * 32768;
+
+    /* drop the low 15 bits and saturate the output to 16 bits */
+    out = (q15_t) (__SSAT((q31_t)(acc >> 15), 16));
+
+    /* Update state for the next call: state[1] <- x[n-1], state[0] <- x[n], state[2] <- y[n] */
+    S->state[1] = S->state[0];
+    S->state[0] = in;
+    S->state[2] = out;
+
+    /* return to application */
+    return (out);
+  }
+
+  /**
+   * @} end of PID group
+   */
+
+  /**
+   * @ingroup groupController
+   */
+
+  /**
+   * @defgroup park Vector Park Transform
+   *
+   * Forward Park transform converts the input two-coordinate vector to flux and torque components.
+   * The Park transform can be used to realize the transformation of the <code>Ialpha</code> and the <code>Ibeta</code> currents
+   * from the stationary to the moving reference frame and control the spatial relationship between
+   * the stator vector current and rotor flux vector.
+   * If we consider the d axis aligned with the rotor flux, the diagram below shows the
+   * current vector and the relationship from the two reference frames:
+   * \image html park.gif "Stator current space vector and its component in (a,b) and in the d,q rotating reference frame"
+   *
+   * The function operates on a single sample of data and each call to the function returns the processed output.
+   * The library provides separate functions for Q31 and floating-point data types.
+   * \par Algorithm
+   * \image html parkFormula.gif
+   * where <code>Ialpha</code> and <code>Ibeta</code> are the stator vector components,
+   * <code>pId</code> and <code>pIq</code> are rotor vector components and <code>cosVal</code> and <code>sinVal</code> are the
+   * cosine and sine values of theta (rotor flux position).
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the Q31 version of the Park transform.
+   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+  /**
+   * @addtogroup park
+   * @{
+   */
+
+  /**
+   * @brief Floating-point Park transform
+   * @param[in]  Ialpha  input two-phase vector coordinate alpha
+   * @param[in]  Ibeta   input two-phase vector coordinate beta
+   * @param[out] pId     points to output rotor reference frame d
+   * @param[out] pIq     points to output rotor reference frame q
+   * @param[in]  sinVal  sine value of rotation angle theta
+   * @param[in]  cosVal  cosine value of rotation angle theta
+   * @return     none
+   *
+   * The function implements the forward Park transform; both results are written through pId and pIq.
+   *
+   */
+  __STATIC_FORCEINLINE void arm_park_f32(
+  float32_t Ialpha,
+  float32_t Ibeta,
+  float32_t * pId,
+  float32_t * pIq,
+  float32_t sinVal,
+  float32_t cosVal)
+  {
+    /* d component: *pId = Ialpha * cosVal + Ibeta * sinVal */
+    *pId = Ialpha * cosVal + Ibeta * sinVal;
+
+    /* q component: *pIq = -Ialpha * sinVal + Ibeta * cosVal */
+    *pIq = -Ialpha * sinVal + Ibeta * cosVal;
+  }
+
+
+/**
+  @brief  Park transform for Q31 version
+  @param[in]  Ialpha  input two-phase vector coordinate alpha
+  @param[in]  Ibeta   input two-phase vector coordinate beta
+  @param[out] pId     points to output rotor reference frame d
+  @param[out] pIq     points to output rotor reference frame q
+  @param[in]  sinVal  sine value of rotation angle theta
+  @param[in]  cosVal  cosine value of rotation angle theta
+  @return     none
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using an internal 32-bit accumulator.
+         The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format.
+         There is saturation on the addition and subtraction, hence there is no risk of overflow.
+ */
+__STATIC_FORCEINLINE void arm_park_q31(
+  q31_t Ialpha,
+  q31_t Ibeta,
+  q31_t * pId,
+  q31_t * pIq,
+  q31_t sinVal,
+  q31_t cosVal)
+  {
+    q31_t product1, product2;                    /* Terms of pId: Ialpha * cosVal and Ibeta * sinVal */
+    q31_t product3, product4;                    /* Terms of pIq: Ialpha * sinVal and Ibeta * cosVal */
+
+    /* product1 = Ialpha * cosVal: 64-bit product shifted right by 31 bits back to 1.31 */
+    product1 = (q31_t) (((q63_t) (Ialpha) * (cosVal)) >> 31);
+
+    /* product2 = Ibeta * sinVal */
+    product2 = (q31_t) (((q63_t) (Ibeta) * (sinVal)) >> 31);
+
+
+    /* product3 = Ialpha * sinVal */
+    product3 = (q31_t) (((q63_t) (Ialpha) * (sinVal)) >> 31);
+
+    /* product4 = Ibeta * cosVal */
+    product4 = (q31_t) (((q63_t) (Ibeta) * (cosVal)) >> 31);
+
+    /* Calculate pId by adding intermediate products 1 and 2 */
+    *pId = __QADD(product1, product2);
+
+    /* Calculate pIq by subtracting intermediate product 3 from product 4 */
+    *pIq = __QSUB(product4, product3);
+  }
+
+  /**
+   * @} end of park group
+   */
+
+
+  /**
+   * @ingroup groupController
+   */
+
+  /**
+   * @defgroup inv_park Vector Inverse Park transform
+   * Inverse Park transform converts the input flux and torque components to two-coordinate vector.
+   *
+   * The function operates on a single sample of data and each call to the function returns the processed output.
+   * The library provides separate functions for Q31 and floating-point data types.
+   * \par Algorithm
+   * \image html parkInvFormula.gif
+   * where <code>pIalpha</code> and <code>pIbeta</code> are the stator vector components,
+   * <code>Id</code> and <code>Iq</code> are rotor vector components and <code>cosVal</code> and <code>sinVal</code> are the
+   * cosine and sine values of theta (rotor flux position).
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the Q31 version of the Inverse Park transform.
+   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+  /**
+   * @addtogroup inv_park
+   * @{
+   */
+
+   /**
+   * @brief  Floating-point Inverse Park transform (results are written through pIalpha and pIbeta)
+   * @param[in]  Id       input coordinate of rotor reference frame d
+   * @param[in]  Iq       input coordinate of rotor reference frame q
+   * @param[out] pIalpha  points to output two-phase orthogonal vector axis alpha
+   * @param[out] pIbeta   points to output two-phase orthogonal vector axis beta
+   * @param[in]  sinVal   sine value of rotation angle theta
+   * @param[in]  cosVal   cosine value of rotation angle theta
+   * @return     none
+   */
+  __STATIC_FORCEINLINE void arm_inv_park_f32(
+  float32_t Id,
+  float32_t Iq,
+  float32_t * pIalpha,
+  float32_t * pIbeta,
+  float32_t sinVal,
+  float32_t cosVal)
+  {
+    /* alpha component: *pIalpha = Id * cosVal - Iq * sinVal */
+    *pIalpha = Id * cosVal - Iq * sinVal;
+
+    /* beta component: *pIbeta = Id * sinVal + Iq * cosVal */
+    *pIbeta = Id * sinVal + Iq * cosVal;
+  }
+
+
+/**
+  @brief  Inverse Park transform for Q31 version
+  @param[in]  Id       input coordinate of rotor reference frame d
+  @param[in]  Iq       input coordinate of rotor reference frame q
+  @param[out] pIalpha  points to output two-phase orthogonal vector axis alpha
+  @param[out] pIbeta   points to output two-phase orthogonal vector axis beta
+  @param[in]  sinVal   sine value of rotation angle theta
+  @param[in]  cosVal   cosine value of rotation angle theta
+  @return     none
+
+  @par Scaling and Overflow Behavior
+         The function is implemented using an internal 32-bit accumulator.
+         The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format.
+         There is saturation on the addition and subtraction, hence there is no risk of overflow.
+ */
+__STATIC_FORCEINLINE void arm_inv_park_q31(
+  q31_t Id,
+  q31_t Iq,
+  q31_t * pIalpha,
+  q31_t * pIbeta,
+  q31_t sinVal,
+  q31_t cosVal)
+  {
+    q31_t product1, product2;                    /* Terms of pIalpha: Id * cosVal and Iq * sinVal */
+    q31_t product3, product4;                    /* Terms of pIbeta: Id * sinVal and Iq * cosVal */
+
+    /* product1 = Id * cosVal: 64-bit product shifted right by 31 bits back to 1.31 */
+    product1 = (q31_t) (((q63_t) (Id) * (cosVal)) >> 31);
+
+    /* product2 = Iq * sinVal */
+    product2 = (q31_t) (((q63_t) (Iq) * (sinVal)) >> 31);
+
+
+    /* product3 = Id * sinVal */
+    product3 = (q31_t) (((q63_t) (Id) * (sinVal)) >> 31);
+
+    /* product4 = Iq * cosVal */
+    product4 = (q31_t) (((q63_t) (Iq) * (cosVal)) >> 31);
+
+    /* Calculate pIalpha by subtracting intermediate product 2 from product 1 */
+    *pIalpha = __QSUB(product1, product2);
+
+    /* Calculate pIbeta by adding intermediate products 3 and 4 */
+    *pIbeta = __QADD(product4, product3);
+  }
+
+  /**
+   * @} end of Inverse park group
+   */
+
+/**
+   * @ingroup groupController
+   */
+
+  /**
+   * @defgroup clarke Vector Clarke Transform
+   * Forward Clarke transform converts the instantaneous stator phases into a two-coordinate time invariant vector.
+   * Generally the Clarke transform uses three-phase currents <code>Ia, Ib and Ic</code> to calculate currents
+   * in the two-phase orthogonal stator axis <code>Ialpha</code> and <code>Ibeta</code>.
+   * When <code>Ialpha</code> is superposed with <code>Ia</code> as shown in the figure below
+   * \image html clarke.gif "Stator current space vector and its components in (a,b)"
+   * and <code>Ia + Ib + Ic = 0</code>, in this condition <code>Ialpha</code> and <code>Ibeta</code>
+   * can be calculated using only <code>Ia</code> and <code>Ib</code>.
+   *
+   * The function operates on a single sample of data and each call to the function returns the processed output.
+   * The library provides separate functions for Q31 and floating-point data types.
+   * \par Algorithm
+   * \image html clarkeFormula.gif
+   * where <code>Ia</code> and <code>Ib</code> are the instantaneous stator phases and
+   * <code>pIalpha</code> and <code>pIbeta</code> are the two coordinates of time invariant vector.
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the Q31 version of the Clarke transform.
+   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+  /**
+   * @addtogroup clarke
+   * @{
+   */
+
+  /**
+   *
+   * @brief  Floating-point Clarke transform (results are written through pIalpha and pIbeta)
+   * @param[in]  Ia       input three-phase coordinate <code>a</code>
+   * @param[in]  Ib       input three-phase coordinate <code>b</code>
+   * @param[out] pIalpha  points to output two-phase orthogonal vector axis alpha
+   * @param[out] pIbeta   points to output two-phase orthogonal vector axis beta
+   * @return        none
+   */
+  __STATIC_FORCEINLINE void arm_clarke_f32(
+  float32_t Ia,
+  float32_t Ib,
+  float32_t * pIalpha,
+  float32_t * pIbeta)
+  {
+    /* alpha component is the phase-a input unchanged: *pIalpha = Ia */
+    *pIalpha = Ia;
+
+    /* beta component: *pIbeta = (1/sqrt(3)) * Ia + (2/sqrt(3)) * Ib, with both factors as decimal literals */
+    *pIbeta = (0.57735026919f * Ia + 1.15470053838f * Ib);
+  }
+
+
+/**
+  @brief  Clarke transform for Q31 version
+  @param[in]  Ia       input three-phase coordinate <code>a</code>
+  @param[in]  Ib       input three-phase coordinate <code>b</code>
+  @param[out] pIalpha  points to output two-phase orthogonal vector axis alpha
+  @param[out] pIbeta   points to output two-phase orthogonal vector axis beta
+  @return     none
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using an internal 32-bit accumulator.
+         The constants 1/sqrt(3) and 2/sqrt(3) are held in 2.30 format, so each 64-bit product is shifted right by 30 bits (not 31) and truncated to 32 bits.
+         There is saturation on the addition, hence there is no risk of overflow.
+ */
+__STATIC_FORCEINLINE void arm_clarke_q31(
+  q31_t Ia,
+  q31_t Ib,
+  q31_t * pIalpha,
+  q31_t * pIbeta)
+  {
+    q31_t product1, product2;                    /* The two terms of pIbeta */
+
+    /* Calculating pIalpha from Ia by equation pIalpha = Ia */
+    *pIalpha = Ia;
+
+    /* product1 = (1/sqrt(3)) * Ia; 0x24F34E8B is 1/sqrt(3) in 2.30 format, hence the shift by 30 */
+    product1 = (q31_t) (((q63_t) Ia * 0x24F34E8B) >> 30);
+
+    /* product2 = (2/sqrt(3)) * Ib; 0x49E69D16 is 2/sqrt(3) in 2.30 format */
+    product2 = (q31_t) (((q63_t) Ib * 0x49E69D16) >> 30);
+
+    /* pIbeta is calculated by adding the intermediate products */
+    *pIbeta = __QADD(product1, product2);
+  }
+
+  /**
+   * @} end of clarke group
+   */
+
+
+  /**
+   * @ingroup groupController
+   */
+
+  /**
+   * @defgroup inv_clarke Vector Inverse Clarke Transform
+   * Inverse Clarke transform converts the two-coordinate time invariant vector into instantaneous stator phases.
+   *
+   * The function operates on a single sample of data and each call to the function returns the processed output.
+   * The library provides separate functions for Q31 and floating-point data types.
+   * \par Algorithm
+   * \image html clarkeInvFormula.gif
+   * where <code>pIa</code> and <code>pIb</code> are the instantaneous stator phases and
+   * <code>Ialpha</code> and <code>Ibeta</code> are the two coordinates of time invariant vector.
+   * \par Fixed-Point Behavior
+   * Care must be taken when using the Q31 version of the Inverse Clarke transform.
+   * In particular, the overflow and saturation behavior of the accumulator used must be considered.
+   * Refer to the function specific documentation below for usage guidelines.
+   */
+
+  /**
+   * @addtogroup inv_clarke
+   * @{
+   */
+
+   /**
+   * @brief  Floating-point Inverse Clarke transform (results are written through pIa and pIb)
+   * @param[in]  Ialpha  input two-phase orthogonal vector axis alpha
+   * @param[in]  Ibeta   input two-phase orthogonal vector axis beta
+   * @param[out] pIa     points to output three-phase coordinate <code>a</code>
+   * @param[out] pIb     points to output three-phase coordinate <code>b</code>
+   * @return     none
+   */
+  __STATIC_FORCEINLINE void arm_inv_clarke_f32(
+  float32_t Ialpha,
+  float32_t Ibeta,
+  float32_t * pIa,
+  float32_t * pIb)
+  {
+    /* phase a is the alpha input unchanged: *pIa = Ialpha */
+    *pIa = Ialpha;
+
+    /* phase b: *pIb = -(1/2) * Ialpha + (sqrt(3)/2) * Ibeta, with sqrt(3)/2 as a decimal literal */
+    *pIb = -0.5f * Ialpha + 0.8660254039f * Ibeta;
+  }
+
+
+/**
+  @brief  Inverse Clarke transform for Q31 version
+  @param[in]  Ialpha  input two-phase orthogonal vector axis alpha
+  @param[in]  Ibeta   input two-phase orthogonal vector axis beta
+  @param[out] pIa     points to output three-phase coordinate <code>a</code>
+  @param[out] pIb     points to output three-phase coordinate <code>b</code>
+  @return     none
+
+  \par Scaling and Overflow Behavior
+         The function is implemented using an internal 32-bit accumulator.
+         The accumulator maintains 1.31 format by truncating lower 31 bits of the intermediate multiplication in 2.62 format.
+         There is saturation on the subtraction, hence there is no risk of overflow.
+ */
+__STATIC_FORCEINLINE void arm_inv_clarke_q31(
+  q31_t Ialpha,
+  q31_t Ibeta,
+  q31_t * pIa,
+  q31_t * pIb)
+  {
+    q31_t product1, product2;                    /* The two terms of pIb */
+
+    /* Calculating pIa from Ialpha by equation pIa = Ialpha */
+    *pIa = Ialpha;
+
+    /* product1 = (1/2) * Ialpha; 0x40000000 is 0.5 in 1.31 format */
+    product1 = (q31_t) (((q63_t) (Ialpha) * (0x40000000)) >> 31);
+
+    /* product2 = (sqrt(3)/2) * Ibeta; 0x6ED9EBA1 is sqrt(3)/2 in 1.31 format */
+    product2 = (q31_t) (((q63_t) (Ibeta) * (0x6ED9EBA1)) >> 31);
+
+    /* pIb = product2 - product1, i.e. -(1/2) * Ialpha + (sqrt(3)/2) * Ibeta */
+    *pIb = __QSUB(product2, product1);
+  }
+
+  /**
+   * @} end of inv_clarke group
+   */
+
+
+
+  
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _CONTROLLER_FUNCTIONS_H_ */
diff --git a/CMSIS/DSP/Include/dsp/controller_functions_f16.h b/CMSIS/DSP/Include/dsp/controller_functions_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0bdd789713950a0aa54f000fc474fec1c9775b0
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/controller_functions_f16.h
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * @file     controller_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _CONTROLLER_FUNCTIONS_F16_H_
+#define _CONTROLLER_FUNCTIONS_F16_H_
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+#endif /*defined(ARM_FLOAT16_SUPPORTED)*/
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _CONTROLLER_FUNCTIONS_F16_H_ */
diff --git a/CMSIS/DSP/Include/dsp/distance_functions.h b/CMSIS/DSP/Include/dsp/distance_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..0af3c6f8b6e0139179b43c81ef36f883ce3c6b30
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/distance_functions.h
@@ -0,0 +1,297 @@
+/******************************************************************************
+ * @file     distance_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _DISTANCE_FUNCTIONS_H_
+#define _DISTANCE_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#include "dsp/statistics_functions.h"
+#include "dsp/basic_math_functions.h"
+#include "dsp/fast_math_functions.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+
+/**
+ * @defgroup groupDistance Distance functions
+ *
+ * Distance functions for use with clustering algorithms.
+ * There are distance functions for float vectors and boolean vectors.
+ *
+ */
+
+/* Workaround for the "6.14 bug": weak declaration of __powisf2 when __ARMCC_VERSION is in [6100100, 6150001) */
+#if defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100) && (__ARMCC_VERSION < 6150001)
+ 
+__attribute__((weak)) float __powisf2(float a, int b);
+
+#endif 
+
+/**
+ * @brief        Euclidean distance between two floating-point vectors
+ * @param[in]    pA         First vector (not modified)
+ * @param[in]    pB         Second vector (not modified)
+ * @param[in]    blockSize  Number of elements in each vector
+ * @return Euclidean distance between pA and pB
+ *
+ */
+
+float32_t arm_euclidean_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Bray-Curtis distance between two floating-point vectors
+ * @param[in]    pA         First vector (not modified)
+ * @param[in]    pB         Second vector (not modified)
+ * @param[in]    blockSize  Number of elements in each vector
+ * @return Bray-Curtis distance between pA and pB
+ *
+ */
+float32_t arm_braycurtis_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Canberra distance between two floating-point vectors
+ *
+ * This function may divide by zero when samples pA[i] and pB[i] are both zero.
+ * The result of the computation will be correct, so the division by zero may be
+ * ignored.
+ *
+ * @param[in]    pA         First vector (not modified)
+ * @param[in]    pB         Second vector (not modified)
+ * @param[in]    blockSize  Number of elements in each vector
+ * @return Canberra distance between pA and pB
+ *
+ */
+float32_t arm_canberra_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+
+/**
+ * @brief        Chebyshev distance between two floating-point vectors
+ * @param[in]    pA         First vector (not modified)
+ * @param[in]    pB         Second vector (not modified)
+ * @param[in]    blockSize  Number of elements in each vector
+ * @return Chebyshev distance between pA and pB
+ *
+ */
+float32_t arm_chebyshev_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+
+/**
+ * @brief        Cityblock (Manhattan) distance between two floating-point vectors
+ * @param[in]    pA         First vector (not modified)
+ * @param[in]    pB         Second vector (not modified)
+ * @param[in]    blockSize  Number of elements in each vector
+ * @return Cityblock (Manhattan) distance between pA and pB
+ *
+ */
+float32_t arm_cityblock_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Correlation distance between two floating-point vectors
+ *
+ * The input vectors are modified in place, which is why they are not const-qualified.
+ *
+ * @param[in,out] pA         First vector (overwritten by the computation)
+ * @param[in,out] pB         Second vector (overwritten by the computation)
+ * @param[in]     blockSize  Number of elements in each vector
+ * @return Correlation distance between pA and pB
+ *
+ */
+float32_t arm_correlation_distance_f32(float32_t *pA,float32_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Cosine distance between two floating-point vectors
+ *
+ * @param[in]    pA         First vector (not modified)
+ * @param[in]    pB         Second vector (not modified)
+ * @param[in]    blockSize  Number of elements in each vector
+ * @return Cosine distance between pA and pB
+ *
+ */
+
+float32_t arm_cosine_distance_f32(const float32_t *pA,const float32_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Jensen-Shannon distance between two floating-point vectors
+ *
+ * This function assumes that the elements of the second vector are > 0,
+ * and 0 only when the corresponding element of the first vector is 0.
+ * Otherwise the result of the computation does not make sense
+ * and, for speed reasons, the cases returning NaN or Infinity are not
+ * managed.
+ *
+ * When the function is computing x log (x / y) with x = 0 and y = 0,
+ * it will compute the right value (0) but a division by zero will occur
+ * and should be ignored in client code.
+ *
+ * @param[in]    pA         First vector (not modified)
+ * @param[in]    pB         Second vector (not modified)
+ * @param[in]    blockSize  Number of elements in each vector
+ * @return Jensen-Shannon distance between pA and pB
+ *
+ */
+
+float32_t arm_jensenshannon_distance_f32(const float32_t *pA,const float32_t *pB,uint32_t blockSize);
+
+/**
+ * @brief        Minkowski distance between two floating-point vectors
+ *
+ * @param[in]    pA         First vector (not modified)
+ * @param[in]    pB         Second vector (not modified)
+ * @param[in]    order      Norm order (>= 2)
+ * @param[in]    blockSize  Number of elements in each vector
+ * @return Minkowski distance of the given order between pA and pB
+ *
+ */
+
+
+
+float32_t arm_minkowski_distance_f32(const float32_t *pA,const float32_t *pB, int32_t order, uint32_t blockSize);
+
+/**
+ * @brief        Dice distance between two vectors
+ *
+ * @param[in]    pA              First vector of packed booleans
+ * @param[in]    pB              Second vector of packed booleans
+ * @param[in]    numberOfBools   Number of booleans
+ *                               (the booleans are packed into 32-bit words)
+ * @return distance
+ *
+ */
+
+
+float32_t arm_dice_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Hamming distance between two vectors of packed booleans
+ *
+ * @param[in]    pA              First vector of packed booleans (32-bit words, not modified)
+ * @param[in]    pB              Second vector of packed booleans (32-bit words, not modified)
+ * @param[in]    numberOfBools   Number of booleans
+ * @return Hamming distance between pA and pB
+ *
+ */
+
+float32_t arm_hamming_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Jaccard distance between two vectors of packed booleans
+ *
+ * @param[in]    pA              First vector of packed booleans (32-bit words, not modified)
+ * @param[in]    pB              Second vector of packed booleans (32-bit words, not modified)
+ * @param[in]    numberOfBools   Number of booleans
+ * @return Jaccard distance between pA and pB
+ *
+ */
+
+float32_t arm_jaccard_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Kulsinski distance between two vectors of packed booleans
+ *
+ * @param[in]    pA              First vector of packed booleans (32-bit words, not modified)
+ * @param[in]    pB              Second vector of packed booleans (32-bit words, not modified)
+ * @param[in]    numberOfBools   Number of booleans
+ * @return Kulsinski distance between pA and pB
+ *
+ */
+
+float32_t arm_kulsinski_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Rogers-Tanimoto distance between two vectors of packed booleans
+ *
+ * @param[in]    pA              First vector of packed booleans (32-bit words, not modified)
+ * @param[in]    pB              Second vector of packed booleans (32-bit words, not modified)
+ * @param[in]    numberOfBools   Number of booleans
+ * @return Rogers-Tanimoto distance between pA and pB
+ *
+ */
+
+float32_t arm_rogerstanimoto_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Russell-Rao distance between two vectors of packed booleans
+ *
+ * @param[in]    pA              First vector of packed booleans (32-bit words, not modified)
+ * @param[in]    pB              Second vector of packed booleans (32-bit words, not modified)
+ * @param[in]    numberOfBools   Number of booleans
+ * @return Russell-Rao distance between pA and pB
+ *
+ */
+
+float32_t arm_russellrao_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Sokal-Michener distance between two vectors of packed booleans
+ *
+ * @param[in]    pA              First vector of packed booleans (32-bit words, not modified)
+ * @param[in]    pB              Second vector of packed booleans (32-bit words, not modified)
+ * @param[in]    numberOfBools   Number of booleans
+ * @return Sokal-Michener distance between pA and pB
+ *
+ */
+
+float32_t arm_sokalmichener_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Sokal-Sneath distance between two vectors of packed booleans
+ *
+ * @param[in]    pA              First vector of packed booleans (32-bit words, not modified)
+ * @param[in]    pB              Second vector of packed booleans (32-bit words, not modified)
+ * @param[in]    numberOfBools   Number of booleans
+ * @return Sokal-Sneath distance between pA and pB
+ *
+ */
+
+float32_t arm_sokalsneath_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+/**
+ * @brief        Yule distance between two vectors of packed booleans
+ *
+ * @param[in]    pA              First vector of packed booleans (32-bit words, not modified)
+ * @param[in]    pB              Second vector of packed booleans (32-bit words, not modified)
+ * @param[in]    numberOfBools   Number of booleans
+ * @return Yule distance between pA and pB
+ *
+ */
+
+float32_t arm_yule_distance(const uint32_t *pA, const uint32_t *pB, uint32_t numberOfBools);
+
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _DISTANCE_FUNCTIONS_H_ */
diff --git a/CMSIS/DSP/Include/dsp/distance_functions_f16.h b/CMSIS/DSP/Include/dsp/distance_functions_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab01fc6ffefcefb4867cd7b467110c2fa39aadba
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/distance_functions_f16.h
@@ -0,0 +1,180 @@
+/******************************************************************************
+ * @file     distance_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _DISTANCE_FUNCTIONS_F16_H_
+#define _DISTANCE_FUNCTIONS_F16_H_
+
+#include "arm_math_types_f16.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+/* Workaround for an Arm Compiler 6.14 bug; applied to the __ARMCC_VERSION range tested below */
+#if defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100) && (__ARMCC_VERSION < 6150001)
+/* Weak declaration only; the definition is in minkowski_f32 */
+__attribute__((weak)) float __powisf2(float a, int b);
+#endif 
+
+#include "dsp/statistics_functions_f16.h"
+#include "dsp/basic_math_functions_f16.h"
+
+#include "dsp/fast_math_functions_f16.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+ * @brief        Euclidean distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float16_t arm_euclidean_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Bray-Curtis distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Canberra distance between two vectors
+ *
+ * This function may divide by zero when samples pA[i] and pB[i] are both zero.
+ * The result of the computation will still be correct, so the division by zero
+ * may be ignored.
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize);
+
+
+/**
+ * @brief        Chebyshev distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize);
+
+
+/**
+ * @brief        Cityblock (Manhattan) distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float16_t arm_cityblock_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Correlation distance between two vectors
+ *
+ * The input vectors are modified in place! Copy them first if the original values are still needed.
+ *
+ * @param[in,out] pA        First vector (modified in place)
+ * @param[in,out] pB        Second vector (modified in place)
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+float16_t arm_correlation_distance_f16(float16_t *pA,float16_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Cosine distance between two vectors
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float16_t arm_cosine_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize);
+
+/**
+ * @brief        Jensen-Shannon distance between two vectors
+ *
+ * This function is assuming that elements of second vector are > 0
+ * and 0 only when the corresponding element of first vector is 0.
+ * Otherwise the result of the computation does not make sense
+ * and for speed reasons, the cases returning NaN or Infinity are not
+ * managed.
+ *
+ * When the function is computing x log (x / y) with x = 0 and y = 0,
+ * it will compute the right value (0) but a division by zero will occur
+ * and should be ignored in client code.
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float16_t arm_jensenshannon_distance_f16(const float16_t *pA,const float16_t *pB,uint32_t blockSize);
+
+/**
+ * @brief        Minkowski distance between two vectors
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    order      Norm order (>= 2)
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+
+
+float16_t arm_minkowski_distance_f16(const float16_t *pA,const float16_t *pB, int32_t order, uint32_t blockSize);
+
+
+#endif /*defined(ARM_FLOAT16_SUPPORTED)*/
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _DISTANCE_FUNCTIONS_F16_H_ */
diff --git a/CMSIS/DSP/Include/dsp/fast_math_functions.h b/CMSIS/DSP/Include/dsp/fast_math_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9e72b436911b782e1cf4b62b115f0cdde7b621e
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/fast_math_functions.h
@@ -0,0 +1,305 @@
+/******************************************************************************
+ * @file     fast_math_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _FAST_MATH_FUNCTIONS_H_
+#define _FAST_MATH_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * @brief Macros required for SINE and COSINE Fast math approximations
+   */
+
+#define FAST_MATH_TABLE_SIZE  512
+#define FAST_MATH_Q31_SHIFT   (32 - 10)
+#define FAST_MATH_Q15_SHIFT   (16 - 10)
+  
+#ifndef PI
+  #define PI               3.14159265358979f
+#endif
+
+
+/**
+ * @defgroup groupFastMath Fast Math Functions
+ * This set of functions provides a fast approximation to sine, cosine, and square root.
+ * As compared to most of the other functions in the CMSIS math library, the fast math functions
+ * operate on individual values and not arrays.
+ * There are separate functions for Q15, Q31, and floating-point data.
+ *
+ */
+
+  /**
+   * @ingroup groupFastMath
+   */
+
+
+/**
+  @addtogroup sin
+  @{
+ */
+
+/**
+   * @brief  Fast approximation to the trigonometric sine function for floating-point data.
+   * @param[in] x  input value in radians.
+   * @return  sin(x).
+   */
+  float32_t arm_sin_f32(
+  float32_t x);
+
+
+  /**
+   * @brief  Fast approximation to the trigonometric sine function for Q31 data.
+   * @param[in] x  Scaled input value in radians.
+   * @return  sin(x).
+   */
+  q31_t arm_sin_q31(
+  q31_t x);
+
+
+  /**
+   * @brief  Fast approximation to the trigonometric sine function for Q15 data.
+   * @param[in] x  Scaled input value in radians.
+   * @return  sin(x).
+   */
+  q15_t arm_sin_q15(
+  q15_t x);
+
+/**
+  @} end of sin group
+ */
+
+/**
+  @addtogroup cos
+  @{
+ */
+
+  /**
+   * @brief  Fast approximation to the trigonometric cosine function for floating-point data.
+   * @param[in] x  input value in radians.
+   * @return  cos(x).
+   */
+  float32_t arm_cos_f32(
+  float32_t x);
+
+
+  /**
+   * @brief Fast approximation to the trigonometric cosine function for Q31 data.
+   * @param[in] x  Scaled input value in radians.
+   * @return  cos(x).
+   */
+  q31_t arm_cos_q31(
+  q31_t x);
+
+
+  /**
+   * @brief  Fast approximation to the trigonometric cosine function for Q15 data.
+   * @param[in] x  Scaled input value in radians.
+   * @return  cos(x).
+   */
+  q15_t arm_cos_q15(
+  q15_t x);
+
+/**
+  @} end of cos group
+ */
+
+
+/**
+  @brief         Floating-point vector of log values.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+  void arm_vlog_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+/**
+  @brief         Floating-point vector of exp values.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+  void arm_vexp_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+ /**
+   * @defgroup SQRT Square Root
+   *
+   * Computes the square root of a number.
+   * There are separate functions for Q15, Q31, and floating-point data types.
+   * The square root function is computed using the Newton-Raphson algorithm.
+   * This is an iterative algorithm of the form:
+   * <pre>
+   *      x1 = x0 - f(x0)/f'(x0)
+   * </pre>
+   * where <code>x1</code> is the current estimate,
+   * <code>x0</code> is the previous estimate, and
+   * <code>f'(x0)</code> is the derivative of <code>f()</code> evaluated at <code>x0</code>.
+   * For the square root function, the algorithm reduces to:
+   * <pre>
+   *     x0 = in/2                         [initial guess]
+   *     x1 = 1/2 * ( x0 + in / x0)        [each iteration]
+   * </pre>
+   */
+
+
+  /**
+   * @addtogroup SQRT
+   * @{
+   */
+
+/**
+  @brief         Floating-point square root function.
+  @param[in]     in    input value
+  @param[out]    pOut  square root of input value
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : input value is zero or positive
+                   - \ref ARM_MATH_ARGUMENT_ERROR : input value fails the (in >= 0) test, e.g. it is negative; *pOut is set to 0
+ */
+__STATIC_FORCEINLINE arm_status arm_sqrt_f32(
+  float32_t in,
+  float32_t * pOut)
+  {
+    if (in >= 0.0f) /* zero is accepted: the test is >=, not > */
+    {
+#if defined ( __CC_ARM )
+  #if defined __TARGET_FPU_VFP
+      *pOut = __sqrtf(in);
+  #else
+      *pOut = sqrtf(in);
+  #endif
+
+#elif defined ( __ICCARM__ )
+  #if defined __ARMVFP__
+      __ASM("VSQRT.F32 %0,%1" : "=t"(*pOut) : "t"(in)); /* VSQRT.F32 issued through inline assembly */
+  #else
+      *pOut = sqrtf(in);
+  #endif
+
+#else
+      *pOut = sqrtf(in); /* any other toolchain: C library sqrtf */
+#endif
+
+      return (ARM_MATH_SUCCESS);
+    }
+    else
+    {
+      *pOut = 0.0f; /* the output is always written, even on error */
+      return (ARM_MATH_ARGUMENT_ERROR);
+    }
+  }
+
+
+/**
+  @brief         Q31 square root function.
+  @param[in]     in    input value.  The range of the input value is [0 +1) or 0x00000000 to 0x7FFFFFFF
+  @param[out]    pOut  points to square root of input value
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : input value is positive
+                   - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
+ */
+arm_status arm_sqrt_q31(
+  q31_t in,
+  q31_t * pOut);
+
+
+/**
+  @brief         Q15 square root function.
+  @param[in]     in    input value.  The range of the input value is [0 +1) or 0x0000 to 0x7FFF
+  @param[out]    pOut  points to square root of input value
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : input value is positive
+                   - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
+ */
+arm_status arm_sqrt_q15(
+  q15_t in,
+  q15_t * pOut);
+
+  /**
+   * @brief  Vector Floating-point square root function.
+   * @param[in]  pIn   input vector.
+   * @param[out] pOut  vector of square roots of input elements.
+   * @param[in]  len   length of input vector.
+   * @return none (the function is declared void, so no status code is reported to the caller).
+   * NOTE(review): negative elements are expected to produce a zero output - confirm against the implementation.
+   */
+  void arm_vsqrt_f32(
+  float32_t * pIn,
+  float32_t * pOut,
+  uint16_t len);
+  /** @brief Vector Q31 square root; parameters mirror arm_vsqrt_f32 (pIn, pOut, len). NOTE(review): semantics inferred from the name - confirm. */
+  void arm_vsqrt_q31(
+  q31_t * pIn,
+  q31_t * pOut,
+  uint16_t len);
+  /** @brief Vector Q15 square root; parameters mirror arm_vsqrt_f32 (pIn, pOut, len). NOTE(review): semantics inferred from the name - confirm. */
+  void arm_vsqrt_q15(
+  q15_t * pIn,
+  q15_t * pOut,
+  uint16_t len);
+
+  /**
+   * @} end of SQRT group
+   */
+
+  /**
+  @brief         Fixed point division
+  @param[in]     numerator    Numerator
+  @param[in]     denominator  Denominator
+  @param[out]    quotient     Quotient value normalized between -1.0 and 1.0
+  @param[out]    shift        Shift left value to get the unnormalized quotient
+  @return        error status
+
+  When dividing by 0, an error ARM_MATH_NANINF is returned. And the quotient is forced
+  to the saturated negative or positive value.
+ */
+
+arm_status arm_divide_q15(q15_t numerator,
+  q15_t denominator,
+  q15_t *quotient,
+  int16_t *shift);
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _FAST_MATH_FUNCTIONS_H_ */
diff --git a/CMSIS/DSP/Include/dsp/fast_math_functions_f16.h b/CMSIS/DSP/Include/dsp/fast_math_functions_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..98a13cb3bbf05fb26ec688d13b0623af35bfba64
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/fast_math_functions_f16.h
@@ -0,0 +1,116 @@
+/******************************************************************************
+ * @file     fast_math_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _FAST_MATH_FUNCTIONS_F16_H_
+#define _FAST_MATH_FUNCTIONS_F16_H_
+
+#include "arm_math_types_f16.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+/* For sqrt_f32 */
+#include "dsp/fast_math_functions.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+ /**
+   * @addtogroup SQRT
+   * @{
+   */
+
+/**
+  @brief         Half-precision floating-point square root function (computed through arm_sqrt_f32).
+  @param[in]     in    input value
+  @param[out]    pOut  square root of input value
+  @return        execution status, as returned by arm_sqrt_f32
+                   - \ref ARM_MATH_SUCCESS        : input value is zero or positive
+                   - \ref ARM_MATH_ARGUMENT_ERROR : input value is negative; *pOut is set to 0
+ */
+__STATIC_FORCEINLINE arm_status arm_sqrt_f16(
+  float16_t in,
+  float16_t * pOut)
+  {
+    float32_t r; /* single-precision intermediate result */
+    arm_status status;
+    status=arm_sqrt_f32((float32_t)in,&r); /* widen the input and reuse the f32 routine */
+    *pOut=(float16_t)r; /* narrow the result back to half precision */
+    return(status);
+  }
+
+
+/**
+  @} end of SQRT group
+ */
+  
+/**
+  @brief         Floating-point vector of log values.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+  void arm_vlog_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize);
+
+/**
+  @brief         Floating-point vector of exp values.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+  void arm_vexp_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize);
+
+  /**
+  @brief         Floating-point vector of inverse values.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+  void arm_vinverse_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize);
+
+#endif /*defined(ARM_FLOAT16_SUPPORTED)*/
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _FAST_MATH_FUNCTIONS_F16_H_ */
diff --git a/CMSIS/DSP/Include/dsp/filtering_functions.h b/CMSIS/DSP/Include/dsp/filtering_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..634edbfb396bec1242740b6782ddb05193e0d2c7
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/filtering_functions.h
@@ -0,0 +1,2468 @@
+/******************************************************************************
+ * @file     filtering_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _FILTERING_FUNCTIONS_H_
+#define _FILTERING_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#include "dsp/support_functions.h"
+#include "dsp/fast_math_functions.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+
+
+#define DELTA_Q31          ((q31_t)(0x100))
+#define DELTA_Q15          ((q15_t)0x5)
+
+/**
+ * @defgroup groupFilters Filtering Functions
+ */
+    
+  /**
+   * @brief Instance structure for the Q7 FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;        /**< number of filter coefficients in the filter. */
+          q7_t *pState;            /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+    const q7_t *pCoeffs;           /**< points to the coefficient array. The array is of length numTaps.*/
+  } arm_fir_instance_q7;
+
+  /**
+   * @brief Instance structure for the Q15 FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;         /**< number of filter coefficients in the filter. */
+          q15_t *pState;            /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+    const q15_t *pCoeffs;           /**< points to the coefficient array. The array is of length numTaps.*/
+  } arm_fir_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;         /**< number of filter coefficients in the filter. */
+          q31_t *pState;            /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+    const q31_t *pCoeffs;           /**< points to the coefficient array. The array is of length numTaps. */
+  } arm_fir_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;     /**< number of filter coefficients in the filter. */
+          float32_t *pState;    /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+    const float32_t *pCoeffs;   /**< points to the coefficient array. The array is of length numTaps. */
+  } arm_fir_instance_f32;
+
+  /**
+   * @brief Processing function for the Q7 FIR filter.
+   * @param[in]  S          points to an instance of the Q7 FIR filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_q7(
+  const arm_fir_instance_q7 * S,
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the Q7 FIR filter.
+   * @param[in,out] S          points to an instance of the Q7 FIR structure.
+   * @param[in]     numTaps    Number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of samples that are processed.
+   *
+   * For the MVE version, the coefficient length must be a multiple of 16.
+   * You can pad with zeros if you have fewer coefficients.
+   */
+  void arm_fir_init_q7(
+        arm_fir_instance_q7 * S,
+        uint16_t numTaps,
+  const q7_t * pCoeffs,
+        q7_t * pState,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q15 FIR filter.
+   * @param[in]  S          points to an instance of the Q15 FIR structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_q15(
+  const arm_fir_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the fast Q15 FIR filter (fast version).
+   * @param[in]  S          points to an instance of the Q15 FIR filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_fast_q15(
+  const arm_fir_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the Q15 FIR filter.
+   * @param[in,out] S          points to an instance of the Q15 FIR filter structure.
+   * @param[in]     numTaps    Number of filter coefficients in the filter. Must be even and greater than or equal to 4.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of samples that are processed at a time.
+   * @return     The function returns either
+   * <code>ARM_MATH_SUCCESS</code> if initialization was successful or
+   * <code>ARM_MATH_ARGUMENT_ERROR</code> if <code>numTaps</code> is not a supported value.
+   *
+   * For the MVE version, the coefficient length must be a multiple of 8.
+   * You can pad with zeros if you have fewer coefficients.
+   *
+   */
+  arm_status arm_fir_init_q15(
+        arm_fir_instance_q15 * S,
+        uint16_t numTaps,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q31 FIR filter.
+   * @param[in]  S          points to an instance of the Q31 FIR filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_q31(
+  const arm_fir_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the fast Q31 FIR filter (fast version).
+   * @param[in]  S          points to an instance of the Q31 FIR filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_fast_q31(
+  const arm_fir_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the Q31 FIR filter.
+   * @param[in,out] S          points to an instance of the Q31 FIR structure.
+   * @param[in]     numTaps    Number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of samples that are processed at a time.
+   *
+   * For the MVE version, the coefficient length must be a multiple of 4.
+   * You can pad with zeros if you have fewer coefficients.
+   */
+  void arm_fir_init_q31(
+        arm_fir_instance_q31 * S,
+        uint16_t numTaps,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the floating-point FIR filter.
+   * @param[in]  S          points to an instance of the floating-point FIR structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_f32(
+  const arm_fir_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the floating-point FIR filter.
+   * @param[in,out] S          points to an instance of the floating-point FIR filter structure.
+   * @param[in]     numTaps    Number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of samples that are processed at a time.
+   */
+  void arm_fir_init_f32(
+        arm_fir_instance_f32 * S,
+        uint16_t numTaps,
+  const float32_t * pCoeffs,
+        float32_t * pState,
+        uint32_t blockSize);
+
+  /**
+   * @brief Instance structure for the Q15 Biquad cascade filter.
+   */
+  typedef struct
+  {
+          int8_t numStages;        /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          q15_t *pState;           /**< Points to the array of state coefficients.  The array is of length 4*numStages. */
+    const q15_t *pCoeffs;          /**< Points to the array of coefficients.  The array is of length 5*numStages. */
+          int8_t postShift;        /**< Additional shift, in bits, applied to each output sample. */
+  } arm_biquad_casd_df1_inst_q15;
+
+  /**
+   * @brief Instance structure for the Q31 Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint32_t numStages;      /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          q31_t *pState;           /**< Points to the array of state coefficients.  The array is of length 4*numStages. */
+    const q31_t *pCoeffs;          /**< Points to the array of coefficients.  The array is of length 5*numStages. */
+          uint8_t postShift;       /**< Additional shift, in bits, applied to each output sample. */
+  } arm_biquad_casd_df1_inst_q31;
+
+  /**
+   * @brief Instance structure for the floating-point Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint32_t numStages;      /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          float32_t *pState;       /**< Points to the array of state coefficients.  The array is of length 4*numStages. */
+    const float32_t *pCoeffs;      /**< Points to the array of coefficients.  The array is of length 5*numStages. */
+  } arm_biquad_casd_df1_inst_f32;
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+  /**
+   * @brief Instance structure for the modified Biquad coefs required by vectorized code (only declared when ARM_MATH_MVEF is defined and ARM_MATH_AUTOVECTORIZE is not).
+   */
+  typedef struct
+  {
+      float32_t coeffs[8][4]; /**< Modified coefficients, stored inline as an 8x4 array (32 values), not behind a pointer. There is one such structure per stage */
+  } arm_biquad_mod_coef_f32;
+#endif 
+
+  /**
+   * @brief Processing function for the Q15 Biquad cascade filter.
+   * @param[in]  S          points to an instance of the Q15 Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df1_q15(
+  const arm_biquad_casd_df1_inst_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the Q15 Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the Q15 Biquad cascade structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     postShift  Shift to be applied to the output. Varies according to the coefficients format
+   */
+  void arm_biquad_cascade_df1_init_q15(
+        arm_biquad_casd_df1_inst_q15 * S,
+        uint8_t numStages,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        int8_t postShift);
+
+  /**
+   * @brief Fast but less precise processing function for the Q15 Biquad cascade filter for Cortex-M3 and Cortex-M4.
+   * @param[in]  S          points to an instance of the Q15 Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df1_fast_q15(
+  const arm_biquad_casd_df1_inst_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q31 Biquad cascade filter
+   * @param[in]  S          points to an instance of the Q31 Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df1_q31(
+  const arm_biquad_casd_df1_inst_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q31 Biquad cascade filter (fast but less precise variant for Cortex-M3 and Cortex-M4).
+   * @param[in]  S          points to an instance of the Q31 Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df1_fast_q31(
+  const arm_biquad_casd_df1_inst_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Initialization function for the Q31 Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the Q31 Biquad cascade structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     postShift  shift to be applied to the output; varies according to the coefficients format.
+   */
+  void arm_biquad_cascade_df1_init_q31(
+        arm_biquad_casd_df1_inst_q31 * S,
+        uint8_t numStages,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        int8_t postShift);
+
+  /**
+   * @brief Processing function for the floating-point (float32_t) Biquad cascade filter.
+   * @param[in]  S          points to an instance of the floating-point Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df1_f32(
+  const arm_biquad_casd_df1_inst_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Initialization function for the floating-point Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the floating-point Biquad cascade structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pCoeffsMod points to the modified filter coefficients (only MVE version).
+   * @param[in]     pState     points to the state buffer.
+   *
+   * @note This description covers both prototypes that follow.
+   *       arm_biquad_cascade_df1_mve_init_f32 is the only one taking pCoeffsMod and is declared
+   *       only when ARM_MATH_MVEF is defined and ARM_MATH_AUTOVECTORIZE is not defined;
+   *       arm_biquad_cascade_df1_init_f32 is declared outside that guard.
+   */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+  void arm_biquad_cascade_df1_mve_init_f32(
+      arm_biquad_casd_df1_inst_f32 * S,
+      uint8_t numStages,
+      const float32_t * pCoeffs, 
+      arm_biquad_mod_coef_f32 * pCoeffsMod, 
+      float32_t * pState);
+#endif
+  
+  void arm_biquad_cascade_df1_init_f32(
+        arm_biquad_casd_df1_inst_f32 * S,
+        uint8_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState);
+
+
+  /**
+   * @brief Convolution of floating-point sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the location where the output result is written. Length srcALen+srcBLen-1.
+   */
+  void arm_conv_f32(
+  const float32_t * pSrcA,
+        uint32_t srcALen,
+  const float32_t * pSrcB,
+        uint32_t srcBLen,
+        float32_t * pDst);
+
+
+  /**
+   * @brief Convolution of Q15 sequences (variant taking caller-provided scratch buffers).
+   * @param[in]  pSrcA      points to the first input sequence.
+   * @param[in]  srcALen    length of the first input sequence.
+   * @param[in]  pSrcB      points to the second input sequence.
+   * @param[in]  srcBLen    length of the second input sequence.
+   * @param[out] pDst       points to the block of output data. Length srcALen+srcBLen-1.
+   * @param[in]  pScratch1  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2  points to scratch buffer of size min(srcALen, srcBLen).
+   */
+  void arm_conv_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Convolution of Q15 sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the location where the output result is written. Length srcALen+srcBLen-1.
+   */
+  void arm_conv_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst);
+
+
+  /**
+   * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data. Length srcALen+srcBLen-1.
+   */
+  void arm_conv_fast_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst);
+
+
+  /**
+   * @brief Convolution of Q15 sequences (fast version taking caller-provided scratch buffers) for Cortex-M3 and Cortex-M4.
+   * @param[in]  pSrcA      points to the first input sequence.
+   * @param[in]  srcALen    length of the first input sequence.
+   * @param[in]  pSrcB      points to the second input sequence.
+   * @param[in]  srcBLen    length of the second input sequence.
+   * @param[out] pDst       points to the block of output data. Length srcALen+srcBLen-1.
+   * @param[in]  pScratch1  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2  points to scratch buffer of size min(srcALen, srcBLen).
+   */
+  void arm_conv_fast_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Convolution of Q31 sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data. Length srcALen+srcBLen-1.
+   */
+  void arm_conv_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst);
+
+
+  /**
+   * @brief Convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data. Length srcALen+srcBLen-1.
+   */
+  void arm_conv_fast_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst);
+
+
+  /**
+   * @brief Convolution of Q7 sequences (variant taking caller-provided q15_t scratch buffers).
+   * @param[in]  pSrcA      points to the first input sequence.
+   * @param[in]  srcALen    length of the first input sequence.
+   * @param[in]  pSrcB      points to the second input sequence.
+   * @param[in]  srcBLen    length of the second input sequence.
+   * @param[out] pDst       points to the block of output data. Length srcALen+srcBLen-1.
+   * @param[in]  pScratch1  points to scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2  points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+   */
+  void arm_conv_opt_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Convolution of Q7 sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data. Length srcALen+srcBLen-1.
+   */
+  void arm_conv_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst);
+
+
+  /**
+   * @brief Partial convolution of floating-point sequences.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  ARM_MATH_SUCCESS if the function completed correctly, or ARM_MATH_ARGUMENT_ERROR
+   *          if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_f32(
+  const float32_t * pSrcA,
+        uint32_t srcALen,
+  const float32_t * pSrcB,
+        uint32_t srcBLen,
+        float32_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q15 sequences (variant taking caller-provided scratch buffers).
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @param[in]  pScratch1   points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2   points to scratch buffer of size min(srcALen, srcBLen).
+   * @return  ARM_MATH_SUCCESS if the function completed correctly, or ARM_MATH_ARGUMENT_ERROR
+   *          if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Partial convolution of Q15 sequences.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  ARM_MATH_SUCCESS if the function completed correctly, or ARM_MATH_ARGUMENT_ERROR
+   *          if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  ARM_MATH_SUCCESS if the function completed correctly, or ARM_MATH_ARGUMENT_ERROR
+   *          if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_fast_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q15 sequences (fast version taking caller-provided scratch buffers) for Cortex-M3 and Cortex-M4.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @param[in]  pScratch1   points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2   points to scratch buffer of size min(srcALen, srcBLen).
+   * @return  ARM_MATH_SUCCESS if the function completed correctly, or ARM_MATH_ARGUMENT_ERROR
+   *          if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_fast_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Partial convolution of Q31 sequences.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  ARM_MATH_SUCCESS if the function completed correctly, or ARM_MATH_ARGUMENT_ERROR
+   *          if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  ARM_MATH_SUCCESS if the function completed correctly, or ARM_MATH_ARGUMENT_ERROR
+   *          if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_fast_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Partial convolution of Q7 sequences (variant taking caller-provided q15_t scratch buffers).
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @param[in]  pScratch1   points to scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2   points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+   * @return  ARM_MATH_SUCCESS if the function completed correctly, or ARM_MATH_ARGUMENT_ERROR
+   *          if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_opt_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Partial convolution of Q7 sequences.
+   * @param[in]  pSrcA       points to the first input sequence.
+   * @param[in]  srcALen     length of the first input sequence.
+   * @param[in]  pSrcB       points to the second input sequence.
+   * @param[in]  srcBLen     length of the second input sequence.
+   * @param[out] pDst        points to the block of output data.
+   * @param[in]  firstIndex  is the first output sample to start with.
+   * @param[in]  numPoints   is the number of output points to be computed.
+   * @return  ARM_MATH_SUCCESS if the function completed correctly, or ARM_MATH_ARGUMENT_ERROR
+   *          if the requested subset is not in the range [0, srcALen+srcBLen-2].
+   */
+  arm_status arm_conv_partial_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints);
+
+
+  /**
+   * @brief Instance structure for the Q15 FIR decimator.
+   *
+   * Holds the decimation factor, the tap count, and pointers to the
+   * coefficient and state arrays.
+   */
+  typedef struct
+  {
+          uint8_t M;                  /**< decimation factor. */
+          uint16_t numTaps;           /**< number of coefficients in the filter. */
+    const q15_t *pCoeffs;             /**< points to the coefficient array. The array is of length numTaps. */
+          q15_t *pState;              /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+  } arm_fir_decimate_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 FIR decimator.
+   *
+   * Holds the decimation factor, the tap count, and pointers to the
+   * coefficient and state arrays.
+   */
+  typedef struct
+  {
+          uint8_t M;                  /**< decimation factor. */
+          uint16_t numTaps;           /**< number of coefficients in the filter. */
+    const q31_t *pCoeffs;             /**< points to the coefficient array. The array is of length numTaps. */
+          q31_t *pState;              /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+  } arm_fir_decimate_instance_q31;
+
+/**
+  @brief Instance structure for the floating-point FIR decimator.
+
+  Holds the decimation factor, the tap count, and pointers to the
+  coefficient and state arrays.
+ */
+typedef struct
+  {
+          uint8_t M;                  /**< decimation factor. */
+          uint16_t numTaps;           /**< number of coefficients in the filter. */
+    const float32_t *pCoeffs;         /**< points to the coefficient array. The array is of length numTaps. */
+          float32_t *pState;          /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+  } arm_fir_decimate_instance_f32;
+
+
+/**
+  @brief         Processing function for the floating-point FIR decimator.
+  @param[in]     S         points to an instance of the floating-point FIR decimator structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize number of input samples to process per call
+  @note          The state array referenced by S is documented in
+                 arm_fir_decimate_instance_f32 as numTaps+blockSize-1 elements long.
+ */
+void arm_fir_decimate_f32(
+  const arm_fir_decimate_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+/**
+  @brief         Initialization function for the floating-point FIR decimator.
+  @param[in,out] S          points to an instance of the floating-point FIR decimator structure
+  @param[in]     numTaps    number of coefficients in the filter
+  @param[in]     M          decimation factor
+  @param[in]     pCoeffs    points to the filter coefficients (array of length numTaps, per the instance structure)
+  @param[in]     pState     points to the state buffer (array of length numTaps+blockSize-1, per the instance structure)
+  @param[in]     blockSize  number of input samples to process per call
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS      : Operation successful
+                   - \ref ARM_MATH_LENGTH_ERROR : <code>blockSize</code> is not a multiple of <code>M</code>
+ */
+arm_status arm_fir_decimate_init_f32(
+        arm_fir_decimate_instance_f32 * S,
+        uint16_t numTaps,
+        uint8_t M,
+  const float32_t * pCoeffs,
+        float32_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q15 FIR decimator.
+   * @param[in]  S          points to an instance of the Q15 FIR decimator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   * @note The state array referenced by S is documented in
+   *       arm_fir_decimate_instance_q15 as numTaps+blockSize-1 elements long.
+   */
+  void arm_fir_decimate_q15(
+  const arm_fir_decimate_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
+   * @param[in]  S          points to an instance of the Q15 FIR decimator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   * @note The state array referenced by S is documented in
+   *       arm_fir_decimate_instance_q15 as numTaps+blockSize-1 elements long.
+   */
+  void arm_fir_decimate_fast_q15(
+  const arm_fir_decimate_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the Q15 FIR decimator.
+   * @param[in,out] S          points to an instance of the Q15 FIR decimator structure.
+   * @param[in]     numTaps    number of coefficients in the filter.
+   * @param[in]     M          decimation factor.
+   * @param[in]     pCoeffs    points to the filter coefficients (array of length numTaps, per the instance structure).
+   * @param[in]     pState     points to the state buffer (array of length numTaps+blockSize-1, per the instance structure).
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return        execution status
+   *                  - ARM_MATH_SUCCESS      : initialization is successful
+   *                  - ARM_MATH_LENGTH_ERROR : <code>blockSize</code> is not a multiple of <code>M</code>
+   */
+  arm_status arm_fir_decimate_init_q15(
+        arm_fir_decimate_instance_q15 * S,
+        uint16_t numTaps,
+        uint8_t M,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q31 FIR decimator.
+   * @param[in]  S          points to an instance of the Q31 FIR decimator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   * @note The state array referenced by S is documented in
+   *       arm_fir_decimate_instance_q31 as numTaps+blockSize-1 elements long.
+   */
+  void arm_fir_decimate_q31(
+  const arm_fir_decimate_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the Q31 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
+   * @param[in]  S          points to an instance of the Q31 FIR decimator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   * @note The state array referenced by S is documented in
+   *       arm_fir_decimate_instance_q31 as numTaps+blockSize-1 elements long.
+   */
+  void arm_fir_decimate_fast_q31(
+  const arm_fir_decimate_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the Q31 FIR decimator.
+   * @param[in,out] S          points to an instance of the Q31 FIR decimator structure.
+   * @param[in]     numTaps    number of coefficients in the filter.
+   * @param[in]     M          decimation factor.
+   * @param[in]     pCoeffs    points to the filter coefficients (array of length numTaps, per the instance structure).
+   * @param[in]     pState     points to the state buffer (array of length numTaps+blockSize-1, per the instance structure).
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return        execution status
+   *                  - ARM_MATH_SUCCESS      : initialization is successful
+   *                  - ARM_MATH_LENGTH_ERROR : <code>blockSize</code> is not a multiple of <code>M</code>
+   */
+  arm_status arm_fir_decimate_init_q31(
+        arm_fir_decimate_instance_q31 * S,
+        uint16_t numTaps,
+        uint8_t M,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q15 FIR interpolator.
+   *
+   * Holds the upsample factor, the per-phase filter length, and pointers to
+   * the coefficient and state arrays.
+   */
+  typedef struct
+  {
+        uint8_t L;                      /**< upsample factor. */
+        uint16_t phaseLength;           /**< length of each polyphase filter component. */
+  const q15_t *pCoeffs;                 /**< points to the coefficient array. The array is of length L*phaseLength. */
+        q15_t *pState;                  /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */
+  } arm_fir_interpolate_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 FIR interpolator.
+   *
+   * Holds the upsample factor, the per-phase filter length, and pointers to
+   * the coefficient and state arrays.
+   */
+  typedef struct
+  {
+        uint8_t L;                      /**< upsample factor. */
+        uint16_t phaseLength;           /**< length of each polyphase filter component. */
+  const q31_t *pCoeffs;                 /**< points to the coefficient array. The array is of length L*phaseLength. */
+        q31_t *pState;                  /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */
+  } arm_fir_interpolate_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point FIR interpolator.
+   *
+   * Holds the upsample factor, the per-phase filter length, and pointers to
+   * the coefficient and state arrays.
+   *
+   * NOTE(review): the pState length was previously documented here as
+   * phaseLength+numTaps-1, which disagrees with the Q15 and Q31 instance
+   * structures (blockSize+phaseLength-1). It has been aligned with them;
+   * confirm against the implementation.
+   */
+  typedef struct
+  {
+        uint8_t L;                     /**< upsample factor. */
+        uint16_t phaseLength;          /**< length of each polyphase filter component. */
+  const float32_t *pCoeffs;            /**< points to the coefficient array. The array is of length L*phaseLength. */
+        float32_t *pState;             /**< points to the state variable array. The array is of length blockSize+phaseLength-1. */
+  } arm_fir_interpolate_instance_f32;
+
+
+  /**
+   * @brief Processing function for the Q15 FIR interpolator.
+   * @param[in]  S          points to an instance of the Q15 FIR interpolator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   * @note The state array referenced by S is documented in
+   *       arm_fir_interpolate_instance_q15 as blockSize+phaseLength-1 elements long.
+   */
+  void arm_fir_interpolate_q15(
+  const arm_fir_interpolate_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the Q15 FIR interpolator.
+   * @param[in,out] S          points to an instance of the Q15 FIR interpolator structure.
+   * @param[in]     L          upsample factor.
+   * @param[in]     numTaps    number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficient buffer.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return        execution status
+   *                  - ARM_MATH_SUCCESS      : initialization is successful
+   *                  - ARM_MATH_LENGTH_ERROR : the filter length <code>numTaps</code> is not a multiple
+   *                                            of the interpolation factor <code>L</code>
+   */
+  arm_status arm_fir_interpolate_init_q15(
+        arm_fir_interpolate_instance_q15 * S,
+        uint8_t L,
+        uint16_t numTaps,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q31 FIR interpolator.
+   * @param[in]  S          points to an instance of the Q31 FIR interpolator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   * @note The state array referenced by S is documented in
+   *       arm_fir_interpolate_instance_q31 as blockSize+phaseLength-1 elements long.
+   */
+  void arm_fir_interpolate_q31(
+  const arm_fir_interpolate_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the Q31 FIR interpolator.
+   * @param[in,out] S          points to an instance of the Q31 FIR interpolator structure.
+   * @param[in]     L          upsample factor.
+   * @param[in]     numTaps    number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficient buffer.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return        execution status
+   *                  - ARM_MATH_SUCCESS      : initialization is successful
+   *                  - ARM_MATH_LENGTH_ERROR : the filter length <code>numTaps</code> is not a multiple
+   *                                            of the interpolation factor <code>L</code>
+   */
+  arm_status arm_fir_interpolate_init_q31(
+        arm_fir_interpolate_instance_q31 * S,
+        uint8_t L,
+        uint16_t numTaps,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the floating-point (float32_t) FIR interpolator.
+   * @param[in]  S          points to an instance of the floating-point FIR interpolator structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of input samples to process per call.
+   */
+  void arm_fir_interpolate_f32(
+  const arm_fir_interpolate_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the floating-point FIR interpolator.
+   * @param[in,out] S          points to an instance of the floating-point FIR interpolator structure.
+   * @param[in]     L          upsample factor.
+   * @param[in]     numTaps    number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficient buffer.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of input samples to process per call.
+   * @return        execution status
+   *                  - ARM_MATH_SUCCESS      : initialization is successful
+   *                  - ARM_MATH_LENGTH_ERROR : the filter length <code>numTaps</code> is not a multiple
+   *                                            of the interpolation factor <code>L</code>
+   */
+  arm_status arm_fir_interpolate_init_f32(
+        arm_fir_interpolate_instance_f32 * S,
+        uint8_t L,
+        uint16_t numTaps,
+  const float32_t * pCoeffs,
+        float32_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the high precision Q31 Biquad cascade filter.
+   *
+   * Coefficients are stored as q31_t while the state array uses q63_t elements.
+   */
+  typedef struct
+  {
+          uint8_t numStages;       /**< number of 2nd order stages in the filter. Overall order is 2*numStages. */
+          q63_t *pState;           /**< points to the array of state variables. The array is of length 4*numStages. */
+    const q31_t *pCoeffs;          /**< points to the array of coefficients. The array is of length 5*numStages. */
+          uint8_t postShift;       /**< additional shift, in bits, applied to each output sample. */
+  } arm_biquad_cas_df1_32x64_ins_q31;
+
+
+  /**
+   * @brief Processing function for the high precision Q31 Biquad cascade filter.
+   * @param[in]  S          points to an instance of the high precision Q31 Biquad cascade filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cas_df1_32x64_q31(
+  const arm_biquad_cas_df1_32x64_ins_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /** @brief Initialization function for the high precision Q31 Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the high precision Q31 Biquad cascade filter structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.  The array is of length 5*numStages.
+   * @param[in]     pState     points to the state buffer.  The array is of length 4*numStages.
+   * @param[in]     postShift  shift to be applied to the output. Varies according to the coefficients format.
+   */
+  void arm_biquad_cas_df1_32x64_init_q31(
+        arm_biquad_cas_df1_32x64_ins_q31 * S,
+        uint8_t numStages,
+  const q31_t * pCoeffs,
+        q63_t * pState,
+        uint8_t postShift);
+
+
+  /**
+   * @brief Instance structure for the floating-point (float32_t) transposed direct form II Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint8_t numStages;         /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          float32_t *pState;         /**< points to the array of state variables.  The array is of length 2*numStages. */
+    const float32_t *pCoeffs;        /**< points to the array of coefficients.  The array is of length 5*numStages. */
+  } arm_biquad_cascade_df2T_instance_f32;
+
+  /**
+   * @brief Instance structure for the stereo (2 channels) floating-point transposed direct form II Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint8_t numStages;         /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          float32_t *pState;         /**< points to the array of state variables.  The array is of length 4*numStages. */
+    const float32_t *pCoeffs;        /**< points to the array of coefficients.  The array is of length 5*numStages. */
+  } arm_biquad_cascade_stereo_df2T_instance_f32;
+
+  /**
+   * @brief Instance structure for the floating-point (float64_t) transposed direct form II Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint8_t numStages;         /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          float64_t *pState;         /**< points to the array of state variables.  The array is of length 2*numStages. */
+    const float64_t *pCoeffs;        /**< points to the array of coefficients.  The array is of length 5*numStages. */
+  } arm_biquad_cascade_df2T_instance_f64;
+
+
+  /**
+   * @brief Processing function for the floating-point (float32_t) transposed direct form II Biquad cascade filter.
+   * @param[in]  S          points to an instance of the filter data structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df2T_f32(
+  const arm_biquad_cascade_df2T_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the stereo (2 channels) floating-point transposed direct form II Biquad cascade filter.
+   * @param[in]  S          points to an instance of the filter data structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_stereo_df2T_f32(
+  const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the floating-point (float64_t) transposed direct form II Biquad cascade filter.
+   * @param[in]  S          points to an instance of the filter data structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df2T_f64(
+  const arm_biquad_cascade_df2T_instance_f64 * S,
+  const float64_t * pSrc,
+        float64_t * pDst,
+        uint32_t blockSize);
+
+
+#if defined(ARM_MATH_NEON) /* The helper below is only declared when ARM_MATH_NEON is defined. */
+void arm_biquad_cascade_df2T_compute_coefs_f32( /* NOTE(review): no description in this header; presumably derives the coefficient set used by the Neon df2T code - confirm in the .c file. */
+  arm_biquad_cascade_df2T_instance_f32 * S, /* filter instance; passed as a non-const pointer. */
+  uint8_t numStages, /* presumably the number of 2nd order stages, as in the instance structure - confirm. */
+  float32_t * pCoeffs); /* coefficient buffer; non-const here, unlike the init function's const pCoeffs. */
+#endif /* defined(ARM_MATH_NEON) */
+  /**
+   * @brief  Initialization function for the floating-point (float32_t) transposed direct form II Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the filter data structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.  The array is of length 2*numStages.
+   */
+  void arm_biquad_cascade_df2T_init_f32(
+        arm_biquad_cascade_df2T_instance_f32 * S,
+        uint8_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState);
+
+
+  /**
+   * @brief  Initialization function for the stereo (2 channels) floating-point transposed direct form II Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the filter data structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.  The array is of length 4*numStages.
+   */
+  void arm_biquad_cascade_stereo_df2T_init_f32(
+        arm_biquad_cascade_stereo_df2T_instance_f32 * S,
+        uint8_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState);
+
+
+  /**
+   * @brief  Initialization function for the floating-point (float64_t) transposed direct form II Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the filter data structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.  The array is of length 2*numStages.
+   */
+  void arm_biquad_cascade_df2T_init_f64(
+        arm_biquad_cascade_df2T_instance_f64 * S,
+        uint8_t numStages,
+        const float64_t * pCoeffs,
+        float64_t * pState);
+
+
+  /**
+   * @brief Instance structure for the Q15 FIR lattice filter (see arm_fir_lattice_init_q15() and arm_fir_lattice_q15()).
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of filter stages. */
+          q15_t *pState;                       /**< points to the state variable array. The array is of length numStages. */
+    const q15_t *pCoeffs;                      /**< points to the coefficient array. The array is of length numStages. */
+  } arm_fir_lattice_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 FIR lattice filter (see arm_fir_lattice_init_q31() and arm_fir_lattice_q31()).
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of filter stages. */
+          q31_t *pState;                       /**< points to the state variable array. The array is of length numStages. */
+    const q31_t *pCoeffs;                      /**< points to the coefficient array. The array is of length numStages. */
+  } arm_fir_lattice_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point FIR lattice filter (see arm_fir_lattice_init_f32() and arm_fir_lattice_f32()).
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of filter stages. */
+          float32_t *pState;                   /**< points to the state variable array. The array is of length numStages. */
+    const float32_t *pCoeffs;                  /**< points to the coefficient array. The array is of length numStages. */
+  } arm_fir_lattice_instance_f32;
+
+
+  /**
+   * @brief Initialization function for the Q15 FIR lattice filter.
+   * @param[in,out] S          points to an instance of the Q15 FIR lattice structure.
+   * @param[in]     numStages  number of filter stages.
+   * @param[in]     pCoeffs    points to the coefficient buffer.  The array is of length numStages.
+   * @param[in]     pState     points to the state buffer.  The array is of length numStages.
+   */
+  void arm_fir_lattice_init_q15(
+        arm_fir_lattice_instance_q15 * S,
+        uint16_t numStages,
+  const q15_t * pCoeffs,
+        q15_t * pState);
+
+
+  /**
+   * @brief Processing function for the Q15 FIR lattice filter.
+   * @param[in]  S          points to an instance of the Q15 FIR lattice structure (see arm_fir_lattice_init_q15()).
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_lattice_q15(
+  const arm_fir_lattice_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the Q31 FIR lattice filter.
+   * @param[in,out] S          points to an instance of the Q31 FIR lattice structure.
+   * @param[in]     numStages  number of filter stages.
+   * @param[in]     pCoeffs    points to the coefficient buffer.  The array is of length numStages.
+   * @param[in]     pState     points to the state buffer.  The array is of length numStages.
+   */
+  void arm_fir_lattice_init_q31(
+        arm_fir_lattice_instance_q31 * S,
+        uint16_t numStages,
+  const q31_t * pCoeffs,
+        q31_t * pState);
+
+
+  /**
+   * @brief Processing function for the Q31 FIR lattice filter.
+   * @param[in]  S          points to an instance of the Q31 FIR lattice structure (see arm_fir_lattice_init_q31()).
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_lattice_q31(
+  const arm_fir_lattice_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the floating-point FIR lattice filter.
+   * @param[in,out] S          points to an instance of the floating-point FIR lattice structure.
+   * @param[in]     numStages  number of filter stages.
+   * @param[in]     pCoeffs    points to the coefficient buffer.  The array is of length numStages.
+   * @param[in]     pState     points to the state buffer.  The array is of length numStages.
+   */
+  void arm_fir_lattice_init_f32(
+        arm_fir_lattice_instance_f32 * S,
+        uint16_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState);
+
+
+  /**
+   * @brief Processing function for the floating-point FIR lattice filter.
+   * @param[in]  S          points to an instance of the floating-point FIR lattice structure (see arm_fir_lattice_init_f32()).
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_lattice_f32(
+  const arm_fir_lattice_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q15 IIR lattice filter (see arm_iir_lattice_init_q15() and arm_iir_lattice_q15()).
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of stages in the filter. */
+          q15_t *pState;                       /**< points to the state variable array. The array is of length numStages+blockSize. */
+          q15_t *pkCoeffs;                     /**< points to the reflection coefficient array. The array is of length numStages. */
+          q15_t *pvCoeffs;                     /**< points to the ladder coefficient array. The array is of length numStages+1. */
+  } arm_iir_lattice_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 IIR lattice filter (see arm_iir_lattice_init_q31() and arm_iir_lattice_q31()).
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of stages in the filter. */
+          q31_t *pState;                       /**< points to the state variable array. The array is of length numStages+blockSize. */
+          q31_t *pkCoeffs;                     /**< points to the reflection coefficient array. The array is of length numStages. */
+          q31_t *pvCoeffs;                     /**< points to the ladder coefficient array. The array is of length numStages+1. */
+  } arm_iir_lattice_instance_q31;
+
+  /**
+   * @brief Instance structure for the floating-point IIR lattice filter (see arm_iir_lattice_init_f32() and arm_iir_lattice_f32()).
+   */
+  typedef struct
+  {
+          uint16_t numStages;                  /**< number of stages in the filter. */
+          float32_t *pState;                   /**< points to the state variable array. The array is of length numStages+blockSize. */
+          float32_t *pkCoeffs;                 /**< points to the reflection coefficient array. The array is of length numStages. */
+          float32_t *pvCoeffs;                 /**< points to the ladder coefficient array. The array is of length numStages+1. */
+  } arm_iir_lattice_instance_f32;
+
+
+  /**
+   * @brief Processing function for the floating-point IIR lattice filter.
+   * @param[in]  S          points to an instance of the floating-point IIR lattice structure (see arm_iir_lattice_init_f32()).
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_iir_lattice_f32(
+  const arm_iir_lattice_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the floating-point IIR lattice filter.
+   * @param[in,out] S          points to an instance of the floating-point IIR lattice structure.
+   * @param[in]     numStages  number of stages in the filter.
+   * @param[in]     pkCoeffs   points to the reflection coefficient buffer.  The array is of length numStages.
+   * @param[in]     pvCoeffs   points to the ladder coefficient buffer.  The array is of length numStages+1.
+   * @param[in]     pState     points to the state buffer.  The array is of length numStages+blockSize.
+   * @param[in]     blockSize  number of samples to process.
+   */
+  void arm_iir_lattice_init_f32(
+        arm_iir_lattice_instance_f32 * S,
+        uint16_t numStages,
+        float32_t * pkCoeffs,
+        float32_t * pvCoeffs,
+        float32_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q31 IIR lattice filter.
+   * @param[in]  S          points to an instance of the Q31 IIR lattice structure (see arm_iir_lattice_init_q31()).
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_iir_lattice_q31(
+  const arm_iir_lattice_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the Q31 IIR lattice filter.
+   * @param[in,out] S          points to an instance of the Q31 IIR lattice structure.
+   * @param[in]     numStages  number of stages in the filter.
+   * @param[in]     pkCoeffs   points to the reflection coefficient buffer.  The array is of length numStages.
+   * @param[in]     pvCoeffs   points to the ladder coefficient buffer.  The array is of length numStages+1.
+   * @param[in]     pState     points to the state buffer.  The array is of length numStages+blockSize.
+   * @param[in]     blockSize  number of samples to process.
+   */
+  void arm_iir_lattice_init_q31(
+        arm_iir_lattice_instance_q31 * S,
+        uint16_t numStages,
+        q31_t * pkCoeffs,
+        q31_t * pvCoeffs,
+        q31_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q15 IIR lattice filter.
+   * @param[in]  S          points to an instance of the Q15 IIR lattice structure (see arm_iir_lattice_init_q15()).
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_iir_lattice_q15(
+  const arm_iir_lattice_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the Q15 IIR lattice filter.
+   * @param[in,out] S          points to an instance of the fixed-point Q15 IIR lattice structure.
+   * @param[in]     numStages  number of stages in the filter.
+   * @param[in]     pkCoeffs   points to the reflection coefficient buffer.  The array is of length numStages.
+   * @param[in]     pvCoeffs   points to the ladder coefficient buffer.  The array is of length numStages+1.
+   * @param[in]     pState     points to the state buffer.  The array is of length numStages+blockSize.
+   * @param[in]     blockSize  number of samples to process per call.
+   */
+  void arm_iir_lattice_init_q15(
+        arm_iir_lattice_instance_q15 * S,
+        uint16_t numStages,
+        q15_t * pkCoeffs,
+        q15_t * pvCoeffs,
+        q15_t * pState,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the floating-point LMS filter (see arm_lms_init_f32() and arm_lms_f32()).
+   */
+  typedef struct
+  {
+          uint16_t numTaps;    /**< number of coefficients in the filter. */
+          float32_t *pState;   /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          float32_t *pCoeffs;  /**< points to the coefficient array. The array is of length numTaps. */
+          float32_t mu;        /**< step size that controls filter coefficient updates. */
+  } arm_lms_instance_f32;
+
+
+  /**
+   * @brief Processing function for the floating-point LMS filter.
+   * @param[in]  S          points to an instance of the floating-point LMS filter structure (see arm_lms_init_f32()).
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data.
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_lms_f32(
+  const arm_lms_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pRef,
+        float32_t * pOut,
+        float32_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the floating-point LMS filter.
+   * @param[in,out] S          points to an instance of the floating-point LMS filter structure.
+   * @param[in]     numTaps    number of filter coefficients.
+   * @param[in]     pCoeffs    points to the coefficient buffer.  The array is of length numTaps.
+   * @param[in]     pState     points to the state buffer.  The array is of length numTaps+blockSize-1.
+   * @param[in]     mu         step size that controls filter coefficient updates.
+   * @param[in]     blockSize  number of samples to process.
+   */
+  void arm_lms_init_f32(
+        arm_lms_instance_f32 * S,
+        uint16_t numTaps,
+        float32_t * pCoeffs,
+        float32_t * pState,
+        float32_t mu,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q15 LMS filter (see arm_lms_init_q15() and arm_lms_q15()).
+   */
+  typedef struct
+  {
+          uint16_t numTaps;    /**< number of coefficients in the filter. */
+          q15_t *pState;       /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          q15_t *pCoeffs;      /**< points to the coefficient array. The array is of length numTaps. */
+          q15_t mu;            /**< step size that controls filter coefficient updates. */
+          uint32_t postShift;  /**< bit shift applied to coefficients. */
+  } arm_lms_instance_q15;
+
+
+  /**
+   * @brief Initialization function for the Q15 LMS filter.
+   * @param[in,out] S          points to an instance of the Q15 LMS filter structure.
+   * @param[in]     numTaps    number of filter coefficients.
+   * @param[in]     pCoeffs    points to the coefficient buffer.  The array is of length numTaps.
+   * @param[in]     pState     points to the state buffer.  The array is of length numTaps+blockSize-1.
+   * @param[in]     mu         step size that controls filter coefficient updates.
+   * @param[in]     blockSize  number of samples to process.
+   * @param[in]     postShift  bit shift applied to coefficients.
+   */
+  void arm_lms_init_q15(
+        arm_lms_instance_q15 * S,
+        uint16_t numTaps,
+        q15_t * pCoeffs,
+        q15_t * pState,
+        q15_t mu,
+        uint32_t blockSize,
+        uint32_t postShift);
+
+
+  /**
+   * @brief Processing function for the Q15 LMS filter.
+   * @param[in]  S          points to an instance of the Q15 LMS filter structure (see arm_lms_init_q15()).
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data.
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_lms_q15(
+  const arm_lms_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pRef,
+        q15_t * pOut,
+        q15_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q31 LMS filter (see arm_lms_init_q31() and arm_lms_q31()).
+   */
+  typedef struct
+  {
+          uint16_t numTaps;    /**< number of coefficients in the filter. */
+          q31_t *pState;       /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          q31_t *pCoeffs;      /**< points to the coefficient array. The array is of length numTaps. */
+          q31_t mu;            /**< step size that controls filter coefficient updates. */
+          uint32_t postShift;  /**< bit shift applied to coefficients. */
+  } arm_lms_instance_q31;
+
+
+  /**
+   * @brief Processing function for the Q31 LMS filter.
+   * @param[in]  S          points to an instance of the Q31 LMS filter structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[in]  pRef       points to the block of reference data.
+   * @param[out] pOut       points to the block of output data.
+   * @param[out] pErr       points to the block of error data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_lms_q31(
+  const arm_lms_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pRef,
+        q31_t * pOut,
+        q31_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the Q31 LMS filter.
+   * @param[in,out] S          points to an instance of the Q31 LMS filter structure.
+   * @param[in]     numTaps    number of filter coefficients.
+   * @param[in]     pCoeffs    points to the coefficient buffer.  The array is of length numTaps.
+   * @param[in]     pState     points to the state buffer.  The array is of length numTaps+blockSize-1.
+   * @param[in]     mu         step size that controls filter coefficient updates.
+   * @param[in]     blockSize  number of samples to process.
+   * @param[in]     postShift  bit shift applied to coefficients.
+   */
+  void arm_lms_init_q31(
+        arm_lms_instance_q31 * S,
+        uint16_t numTaps,
+        q31_t * pCoeffs,
+        q31_t * pState,
+        q31_t mu,
+        uint32_t blockSize,
+        uint32_t postShift);
+
+
+  /**
+   * @brief Instance structure for the floating-point normalized LMS filter (see arm_lms_norm_init_f32() and arm_lms_norm_f32()).
+   */
+  typedef struct
+  {
+          uint16_t numTaps;     /**< number of coefficients in the filter. */
+          float32_t *pState;    /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          float32_t *pCoeffs;   /**< points to the coefficient array. The array is of length numTaps. */
+          float32_t mu;         /**< step size that controls filter coefficient updates. */
+          float32_t energy;     /**< saves previous frame energy. */
+          float32_t x0;         /**< saves previous input sample. */
+  } arm_lms_norm_instance_f32;
+
+
+  /**
+   * @brief Processing function for the floating-point normalized LMS filter.
+   * @param[in,out] S          points to an instance of the floating-point normalized LMS filter structure.
+   * @param[in]     pSrc       points to the block of input data.
+   * @param[in]     pRef       points to the block of reference data.
+   * @param[out]    pOut       points to the block of output data.
+   * @param[out]    pErr       points to the block of error data.
+   * @param[in]     blockSize  number of samples to process.
+   */
+  void arm_lms_norm_f32(
+        arm_lms_norm_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pRef,
+        float32_t * pOut,
+        float32_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the floating-point normalized LMS filter.
+   * @param[in,out] S          points to an instance of the floating-point normalized LMS filter structure.
+   * @param[in]     numTaps    number of filter coefficients.
+   * @param[in]     pCoeffs    points to the coefficient buffer.  The array is of length numTaps.
+   * @param[in]     pState     points to the state buffer.  The array is of length numTaps+blockSize-1.
+   * @param[in]     mu         step size that controls filter coefficient updates.
+   * @param[in]     blockSize  number of samples to process.
+   */
+  void arm_lms_norm_init_f32(
+        arm_lms_norm_instance_f32 * S,
+        uint16_t numTaps,
+        float32_t * pCoeffs,
+        float32_t * pState,
+        float32_t mu,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the Q31 normalized LMS filter (see arm_lms_norm_init_q31() and arm_lms_norm_q31()).
+   */
+  typedef struct
+  {
+          uint16_t numTaps;     /**< number of coefficients in the filter. */
+          q31_t *pState;        /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          q31_t *pCoeffs;       /**< points to the coefficient array. The array is of length numTaps. */
+          q31_t mu;             /**< step size that controls filter coefficient updates. */
+          uint8_t postShift;    /**< bit shift applied to coefficients. */
+    const q31_t *recipTable;    /**< points to the reciprocal initial value table. */
+          q31_t energy;         /**< saves previous frame energy. */
+          q31_t x0;             /**< saves previous input sample. */
+  } arm_lms_norm_instance_q31;
+
+
+  /**
+   * @brief Processing function for the Q31 normalized LMS filter.
+   * @param[in,out] S          points to an instance of the Q31 normalized LMS filter structure.
+   * @param[in]     pSrc       points to the block of input data.
+   * @param[in]     pRef       points to the block of reference data.
+   * @param[out]    pOut       points to the block of output data.
+   * @param[out]    pErr       points to the block of error data.
+   * @param[in]     blockSize  number of samples to process.
+   */
+  void arm_lms_norm_q31(
+        arm_lms_norm_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pRef,
+        q31_t * pOut,
+        q31_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the Q31 normalized LMS filter.
+   * @param[in,out] S          points to an instance of the Q31 normalized LMS filter structure.
+   * @param[in]     numTaps    number of filter coefficients.
+   * @param[in]     pCoeffs    points to the coefficient buffer.  The array is of length numTaps.
+   * @param[in]     pState     points to the state buffer.  The array is of length numTaps+blockSize-1.
+   * @param[in]     mu         step size that controls filter coefficient updates.
+   * @param[in]     blockSize  number of samples to process.
+   * @param[in]     postShift  bit shift applied to coefficients.
+   */
+  void arm_lms_norm_init_q31(
+        arm_lms_norm_instance_q31 * S,
+        uint16_t numTaps,
+        q31_t * pCoeffs,
+        q31_t * pState,
+        q31_t mu,
+        uint32_t blockSize,
+        uint8_t postShift);
+
+
+  /**
+   * @brief Instance structure for the Q15 normalized LMS filter (see arm_lms_norm_init_q15() and arm_lms_norm_q15()).
+   */
+  typedef struct
+  {
+          uint16_t numTaps;     /**< number of coefficients in the filter. */
+          q15_t *pState;        /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+          q15_t *pCoeffs;       /**< points to the coefficient array. The array is of length numTaps. */
+          q15_t mu;             /**< step size that controls filter coefficient updates. */
+          uint8_t postShift;    /**< bit shift applied to coefficients. */
+    const q15_t *recipTable;    /**< points to the reciprocal initial value table. */
+          q15_t energy;         /**< saves previous frame energy. */
+          q15_t x0;             /**< saves previous input sample. */
+  } arm_lms_norm_instance_q15;
+
+
+  /**
+   * @brief Processing function for the Q15 normalized LMS filter.
+   * @param[in,out] S          points to an instance of the Q15 normalized LMS filter structure.
+   * @param[in]     pSrc       points to the block of input data.
+   * @param[in]     pRef       points to the block of reference data.
+   * @param[out]    pOut       points to the block of output data.
+   * @param[out]    pErr       points to the block of error data.
+   * @param[in]     blockSize  number of samples to process.
+   */
+  void arm_lms_norm_q15(
+        arm_lms_norm_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pRef,
+        q15_t * pOut,
+        q15_t * pErr,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Initialization function for the Q15 normalized LMS filter.
+   * @param[in,out] S          points to an instance of the Q15 normalized LMS filter structure.
+   * @param[in]     numTaps    number of filter coefficients.
+   * @param[in]     pCoeffs    points to the coefficient buffer.  The array is of length numTaps.
+   * @param[in]     pState     points to the state buffer.  The array is of length numTaps+blockSize-1.
+   * @param[in]     mu         step size that controls filter coefficient updates.
+   * @param[in]     blockSize  number of samples to process.
+   * @param[in]     postShift  bit shift applied to coefficients.
+   */
+  void arm_lms_norm_init_q15(
+        arm_lms_norm_instance_q15 * S,
+        uint16_t numTaps,
+        q15_t * pCoeffs,
+        q15_t * pState,
+        q15_t mu,
+        uint32_t blockSize,
+        uint8_t postShift);
+
+
+  /**
+   * @brief Correlation of floating-point sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data.  Length 2 * max(srcALen, srcBLen) - 1.
+   */
+  void arm_correlate_f32(
+  const float32_t * pSrcA,
+        uint32_t srcALen,
+  const float32_t * pSrcB,
+        uint32_t srcBLen,
+        float32_t * pDst);
+
+
+/**
+ @brief Correlation of Q15 sequences (variant that uses a caller-supplied scratch buffer).
+ @param[in]  pSrcA     points to the first input sequence.
+ @param[in]  srcALen   length of the first input sequence.
+ @param[in]  pSrcB     points to the second input sequence.
+ @param[in]  srcBLen   length of the second input sequence.
+ @param[out] pDst      points to the block of output data.  Length 2 * max(srcALen, srcBLen) - 1.
+ @param[in]  pScratch  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+*/
+void arm_correlate_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        q15_t * pScratch);
+
+
+/**
+  @brief Correlation of Q15 sequences.
+  @param[in]  pSrcA    points to the first input sequence.
+  @param[in]  srcALen  length of the first input sequence.
+  @param[in]  pSrcB    points to the second input sequence.
+  @param[in]  srcBLen  length of the second input sequence.
+  @param[out] pDst     points to the block of output data.  Length 2 * max(srcALen, srcBLen) - 1.
+ */
+  void arm_correlate_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst);
+
+
+/**
+  @brief         Correlation of Q15 sequences (fast version).
+  @param[in]     pSrcA      points to the first input sequence.
+  @param[in]     srcALen    length of the first input sequence.
+  @param[in]     pSrcB      points to the second input sequence.
+  @param[in]     srcBLen    length of the second input sequence.
+  @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
+  @return        none
+ */
+void arm_correlate_fast_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst);
+
+
+/**
+  @brief Correlation of Q15 sequences (fast version that uses a caller-supplied scratch buffer).
+  @param[in]  pSrcA     points to the first input sequence.
+  @param[in]  srcALen   length of the first input sequence.
+  @param[in]  pSrcB     points to the second input sequence.
+  @param[in]  srcBLen   length of the second input sequence.
+  @param[out] pDst      points to the block of output data.  Length 2 * max(srcALen, srcBLen) - 1.
+  @param[in]  pScratch  points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ */
+void arm_correlate_fast_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        q15_t * pScratch);
+
+
+  /**
+   * @brief Correlation of Q31 sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
+   */
+  void arm_correlate_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst);
+
+
+/**
+  @brief Correlation of Q31 sequences (fast version).
+  @param[in]  pSrcA    points to the first input sequence
+  @param[in]  srcALen  length of the first input sequence
+  @param[in]  pSrcB    points to the second input sequence
+  @param[in]  srcBLen  length of the second input sequence
+  @param[out] pDst     points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
+ */
+void arm_correlate_fast_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst);
+
+
+ /**
+   * @brief Correlation of Q7 sequences (variant that uses two caller-supplied q15_t scratch buffers).
+   * @param[in]  pSrcA      points to the first input sequence.
+   * @param[in]  srcALen    length of the first input sequence.
+   * @param[in]  pSrcB      points to the second input sequence.
+   * @param[in]  srcBLen    length of the second input sequence.
+   * @param[out] pDst       points to the block of output data.  Length 2 * max(srcALen, srcBLen) - 1.
+   * @param[in]  pScratch1  points to scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+   * @param[in]  pScratch2  points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+   */
+  void arm_correlate_opt_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst,
+        q15_t * pScratch1,
+        q15_t * pScratch2);
+
+
+  /**
+   * @brief Correlation of Q7 sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
+   */
+  void arm_correlate_q7(
+  const q7_t * pSrcA,
+        uint32_t srcALen,
+  const q7_t * pSrcB,
+        uint32_t srcBLen,
+        q7_t * pDst);
+
+
+  /**
+   * @brief Instance structure for the floating-point sparse FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;             /**< number of coefficients in the filter. */
+          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
+          float32_t *pState;            /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
+    const float32_t *pCoeffs;           /**< points to the coefficient array. The array is of length numTaps.*/
+          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
+          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
+  } arm_fir_sparse_instance_f32;
+
+  /**
+   * @brief Instance structure for the Q31 sparse FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;             /**< number of coefficients in the filter. */
+          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
+          q31_t *pState;                /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
+    const q31_t *pCoeffs;               /**< points to the coefficient array. The array is of length numTaps.*/
+          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
+          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
+  } arm_fir_sparse_instance_q31;
+
+  /**
+   * @brief Instance structure for the Q15 sparse FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;             /**< number of coefficients in the filter. */
+          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
+          q15_t *pState;                /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
+    const q15_t *pCoeffs;               /**< points to the coefficient array. The array is of length numTaps.*/
+          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
+          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
+  } arm_fir_sparse_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q7 sparse FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;             /**< number of coefficients in the filter. */
+          uint16_t stateIndex;          /**< state buffer index.  Points to the oldest sample in the state buffer. */
+          q7_t *pState;                 /**< points to the state buffer array. The array is of length maxDelay+blockSize-1. */
+    const q7_t *pCoeffs;                /**< points to the coefficient array. The array is of length numTaps.*/
+          uint16_t maxDelay;            /**< maximum offset specified by the pTapDelay array. */
+          int32_t *pTapDelay;           /**< points to the array of delay values.  The array is of length numTaps. */
+  } arm_fir_sparse_instance_q7;
+
+
+  /**
+   * @brief Processing function for the floating-point sparse FIR filter.
+   * @param[in]  S           points to an instance of the floating-point sparse FIR structure.
+   * @param[in]  pSrc        points to the block of input data.
+   * @param[out] pDst        points to the block of output data
+   * @param[in]  pScratchIn  points to a temporary buffer of size blockSize.
+   * @param[in]  blockSize   number of input samples to process per call.
+   */
+  void arm_fir_sparse_f32(
+        arm_fir_sparse_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        float32_t * pScratchIn,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the floating-point sparse FIR filter.
+   * @param[in,out] S          points to an instance of the floating-point sparse FIR structure.
+   * @param[in]     numTaps    number of nonzero coefficients in the filter.
+   * @param[in]     pCoeffs    points to the array of filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     pTapDelay  points to the array of offset times.
+   * @param[in]     maxDelay   maximum offset time supported.
+   * @param[in]     blockSize  number of samples that will be processed per block.
+   */
+  void arm_fir_sparse_init_f32(
+        arm_fir_sparse_instance_f32 * S,
+        uint16_t numTaps,
+  const float32_t * pCoeffs,
+        float32_t * pState,
+        int32_t * pTapDelay,
+        uint16_t maxDelay,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q31 sparse FIR filter.
+   * @param[in]  S           points to an instance of the Q31 sparse FIR structure.
+   * @param[in]  pSrc        points to the block of input data.
+   * @param[out] pDst        points to the block of output data
+   * @param[in]  pScratchIn  points to a temporary buffer of size blockSize.
+   * @param[in]  blockSize   number of input samples to process per call.
+   */
+  void arm_fir_sparse_q31(
+        arm_fir_sparse_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        q31_t * pScratchIn,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q31 sparse FIR filter.
+   * @param[in,out] S          points to an instance of the Q31 sparse FIR structure.
+   * @param[in]     numTaps    number of nonzero coefficients in the filter.
+   * @param[in]     pCoeffs    points to the array of filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     pTapDelay  points to the array of offset times.
+   * @param[in]     maxDelay   maximum offset time supported.
+   * @param[in]     blockSize  number of samples that will be processed per block.
+   */
+  void arm_fir_sparse_init_q31(
+        arm_fir_sparse_instance_q31 * S,
+        uint16_t numTaps,
+  const q31_t * pCoeffs,
+        q31_t * pState,
+        int32_t * pTapDelay,
+        uint16_t maxDelay,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q15 sparse FIR filter.
+   * @param[in]  S            points to an instance of the Q15 sparse FIR structure.
+   * @param[in]  pSrc         points to the block of input data.
+   * @param[out] pDst         points to the block of output data
+   * @param[in]  pScratchIn   points to a temporary buffer of size blockSize.
+   * @param[in]  pScratchOut  points to a temporary buffer of size blockSize.
+   * @param[in]  blockSize    number of input samples to process per call.
+   */
+  void arm_fir_sparse_q15(
+        arm_fir_sparse_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        q15_t * pScratchIn,
+        q31_t * pScratchOut,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q15 sparse FIR filter.
+   * @param[in,out] S          points to an instance of the Q15 sparse FIR structure.
+   * @param[in]     numTaps    number of nonzero coefficients in the filter.
+   * @param[in]     pCoeffs    points to the array of filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     pTapDelay  points to the array of offset times.
+   * @param[in]     maxDelay   maximum offset time supported.
+   * @param[in]     blockSize  number of samples that will be processed per block.
+   */
+  void arm_fir_sparse_init_q15(
+        arm_fir_sparse_instance_q15 * S,
+        uint16_t numTaps,
+  const q15_t * pCoeffs,
+        q15_t * pState,
+        int32_t * pTapDelay,
+        uint16_t maxDelay,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Processing function for the Q7 sparse FIR filter.
+   * @param[in]  S            points to an instance of the Q7 sparse FIR structure.
+   * @param[in]  pSrc         points to the block of input data.
+   * @param[out] pDst         points to the block of output data
+   * @param[in]  pScratchIn   points to a temporary buffer of size blockSize.
+   * @param[in]  pScratchOut  points to a temporary buffer of size blockSize.
+   * @param[in]  blockSize    number of input samples to process per call.
+   */
+  void arm_fir_sparse_q7(
+        arm_fir_sparse_instance_q7 * S,
+  const q7_t * pSrc,
+        q7_t * pDst,
+        q7_t * pScratchIn,
+        q31_t * pScratchOut,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Initialization function for the Q7 sparse FIR filter.
+   * @param[in,out] S          points to an instance of the Q7 sparse FIR structure.
+   * @param[in]     numTaps    number of nonzero coefficients in the filter.
+   * @param[in]     pCoeffs    points to the array of filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     pTapDelay  points to the array of offset times.
+   * @param[in]     maxDelay   maximum offset time supported.
+   * @param[in]     blockSize  number of samples that will be processed per block.
+   */
+  void arm_fir_sparse_init_q7(
+        arm_fir_sparse_instance_q7 * S,
+        uint16_t numTaps,
+  const q7_t * pCoeffs,
+        q7_t * pState,
+        int32_t * pTapDelay,
+        uint16_t maxDelay,
+        uint32_t blockSize);
+
+
+
+
+ 
+
+  /**
+   * @brief floating-point Circular write function: copies blockSize samples (handled as int32_t words) from src into circBuffer.
+   * @note  Only wOffset >= L is wrapped, so a negative bufferInc is not handled; the offset is written back as uint16_t. */
+  __STATIC_FORCEINLINE void arm_circularWrite_f32(
+  int32_t * circBuffer,
+  int32_t L,
+  uint16_t * writeOffset,
+  int32_t bufferInc,
+  const int32_t * src,
+  int32_t srcInc,
+  uint32_t blockSize)
+  {
+    uint32_t i = 0U;
+    int32_t wOffset;
+
+    /* Load the current write index: the position in circBuffer
+     * where the next input sample will be stored */
+    wOffset = *writeOffset;
+
+    /* Loop over the blockSize */
+    i = blockSize;
+
+    while (i > 0U)
+    {
+      /* copy the input sample to the circular buffer */
+      circBuffer[wOffset] = *src;
+
+      /* Advance the source pointer by srcInc elements */
+      src += srcInc;
+
+      /* Advance wOffset and wrap past the end; only wOffset >= L is corrected, not wOffset < 0 */
+      wOffset += bufferInc;
+      if (wOffset >= L)
+        wOffset -= L;
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Store the updated index back to the caller (narrowed to uint16_t) */
+    *writeOffset = (uint16_t)wOffset;
+  }
+
+
+
+  /**
+   * @brief floating-point Circular Read function: copies blockSize samples (handled as int32_t words) from circBuffer to dst.
+   * @note  dst restarts at dst_base only when it lands exactly on dst_base + dst_length; only rOffset >= L is wrapped. */
+  __STATIC_FORCEINLINE void arm_circularRead_f32(
+  int32_t * circBuffer,
+  int32_t L,
+  int32_t * readOffset,
+  int32_t bufferInc,
+  int32_t * dst,
+  int32_t * dst_base,
+  int32_t dst_length,
+  int32_t dstInc,
+  uint32_t blockSize)
+  {
+    uint32_t i = 0U;
+    int32_t rOffset;
+    int32_t* dst_end;
+
+    /* Load the current read index: the position in circBuffer
+     * from which the next sample will be read */
+    rOffset = *readOffset;
+    dst_end = dst_base + dst_length;
+
+    /* Loop over the blockSize */
+    i = blockSize;
+
+    while (i > 0U)
+    {
+      /* copy the sample from the circular buffer to the destination buffer */
+      *dst = circBuffer[rOffset];
+
+      /* Advance the destination pointer by dstInc elements */
+      dst += dstInc;
+
+      if (dst == dst_end)
+      {
+        dst = dst_base;
+      }
+
+      /* Advance rOffset and wrap past the end; only rOffset >= L is corrected, not rOffset < 0 */
+      rOffset += bufferInc;
+
+      if (rOffset >= L)
+      {
+        rOffset -= L;
+      }
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Store the updated read index back to the caller */
+    *readOffset = rOffset;
+  }
+
+
+  /**
+   * @brief Q15 Circular write function: copies blockSize q15_t samples from src into circBuffer.
+   * @note  Only wOffset >= L is wrapped, so a negative bufferInc is not handled; the offset is written back as uint16_t. */
+  __STATIC_FORCEINLINE void arm_circularWrite_q15(
+  q15_t * circBuffer,
+  int32_t L,
+  uint16_t * writeOffset,
+  int32_t bufferInc,
+  const q15_t * src,
+  int32_t srcInc,
+  uint32_t blockSize)
+  {
+    uint32_t i = 0U;
+    int32_t wOffset;
+
+    /* Load the current write index: the position in circBuffer
+     * where the next input sample will be stored */
+    wOffset = *writeOffset;
+
+    /* Loop over the blockSize */
+    i = blockSize;
+
+    while (i > 0U)
+    {
+      /* copy the input sample to the circular buffer */
+      circBuffer[wOffset] = *src;
+
+      /* Advance the source pointer by srcInc elements */
+      src += srcInc;
+
+      /* Advance wOffset and wrap past the end; only wOffset >= L is corrected, not wOffset < 0 */
+      wOffset += bufferInc;
+      if (wOffset >= L)
+        wOffset -= L;
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Store the updated index back to the caller (narrowed to uint16_t) */
+    *writeOffset = (uint16_t)wOffset;
+  }
+
+
+  /**
+   * @brief Q15 Circular Read function: copies blockSize q15_t samples from circBuffer to dst.
+   * @note  dst restarts at dst_base only when it lands exactly on dst_base + dst_length; only rOffset >= L is wrapped. */
+  __STATIC_FORCEINLINE void arm_circularRead_q15(
+  q15_t * circBuffer,
+  int32_t L,
+  int32_t * readOffset,
+  int32_t bufferInc,
+  q15_t * dst,
+  q15_t * dst_base,
+  int32_t dst_length,
+  int32_t dstInc,
+  uint32_t blockSize)
+  {
+    uint32_t i = 0;
+    int32_t rOffset;
+    q15_t* dst_end;
+
+    /* Load the current read index: the position in circBuffer
+     * from which the next sample will be read */
+    rOffset = *readOffset;
+
+    dst_end = dst_base + dst_length;
+
+    /* Loop over the blockSize */
+    i = blockSize;
+
+    while (i > 0U)
+    {
+      /* copy the sample from the circular buffer to the destination buffer */
+      *dst = circBuffer[rOffset];
+
+      /* Advance the destination pointer by dstInc elements */
+      dst += dstInc;
+
+      if (dst == dst_end)
+      {
+        dst = dst_base;
+      }
+
+      /* Advance rOffset and wrap past the end; only rOffset >= L is corrected, not rOffset < 0 */
+      rOffset += bufferInc;
+
+      if (rOffset >= L)
+      {
+        rOffset -= L;
+      }
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Store the updated read index back to the caller */
+    *readOffset = rOffset;
+  }
+
+
+  /**
+   * @brief Q7 Circular write function: copies blockSize q7_t samples from src into circBuffer.
+   * @note  Only wOffset >= L is wrapped, so a negative bufferInc is not handled; the offset is written back as uint16_t. */
+  __STATIC_FORCEINLINE void arm_circularWrite_q7(
+  q7_t * circBuffer,
+  int32_t L,
+  uint16_t * writeOffset,
+  int32_t bufferInc,
+  const q7_t * src,
+  int32_t srcInc,
+  uint32_t blockSize)
+  {
+    uint32_t i = 0U;
+    int32_t wOffset;
+
+    /* Load the current write index: the position in circBuffer
+     * where the next input sample will be stored */
+    wOffset = *writeOffset;
+
+    /* Loop over the blockSize */
+    i = blockSize;
+
+    while (i > 0U)
+    {
+      /* copy the input sample to the circular buffer */
+      circBuffer[wOffset] = *src;
+
+      /* Advance the source pointer by srcInc elements */
+      src += srcInc;
+
+      /* Advance wOffset and wrap past the end; only wOffset >= L is corrected, not wOffset < 0 */
+      wOffset += bufferInc;
+      if (wOffset >= L)
+        wOffset -= L;
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Store the updated index back to the caller (narrowed to uint16_t) */
+    *writeOffset = (uint16_t)wOffset;
+  }
+
+
+  /**
+   * @brief Q7 Circular Read function: copies blockSize q7_t samples from circBuffer to dst.
+   * @note  dst restarts at dst_base only when it lands exactly on dst_base + dst_length; only rOffset >= L is wrapped. */
+  __STATIC_FORCEINLINE void arm_circularRead_q7(
+  q7_t * circBuffer,
+  int32_t L,
+  int32_t * readOffset,
+  int32_t bufferInc,
+  q7_t * dst,
+  q7_t * dst_base,
+  int32_t dst_length,
+  int32_t dstInc,
+  uint32_t blockSize)
+  {
+    uint32_t i = 0;
+    int32_t rOffset;
+    q7_t* dst_end;
+
+    /* Load the current read index: the position in circBuffer
+     * from which the next sample will be read */
+    rOffset = *readOffset;
+
+    dst_end = dst_base + dst_length;
+
+    /* Loop over the blockSize */
+    i = blockSize;
+
+    while (i > 0U)
+    {
+      /* copy the sample from the circular buffer to the destination buffer */
+      *dst = circBuffer[rOffset];
+
+      /* Advance the destination pointer by dstInc elements */
+      dst += dstInc;
+
+      if (dst == dst_end)
+      {
+        dst = dst_base;
+      }
+
+      /* Advance rOffset and wrap past the end; only rOffset >= L is corrected, not rOffset < 0 */
+      rOffset += bufferInc;
+
+      if (rOffset >= L)
+      {
+        rOffset -= L;
+      }
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Store the updated read index back to the caller */
+    *readOffset = rOffset;
+  }
+
+
+/**
+  @brief         Levinson Durbin
+  @param[in]     phi      autocovariance vector starting with lag 0 (length is nbCoefs + 1)
+  @param[out]    a        autoregressive coefficients
+  @param[out]    err      prediction error (variance)
+  @param[in]     nbCoefs  number of autoregressive coefficients
+  @return        none
+ */
+void arm_levinson_durbin_f32(const float32_t *phi,
+  float32_t *a, 
+  float32_t *err,
+  int nbCoefs);
+
+
+/**
+  @brief         Levinson Durbin
+  @param[in]     phi      autocovariance vector starting with lag 0 (length is nbCoefs + 1)
+  @param[out]    a        autoregressive coefficients
+  @param[out]    err      prediction error (variance)
+  @param[in]     nbCoefs  number of autoregressive coefficients
+  @return        none
+ */
+void arm_levinson_durbin_q31(const q31_t *phi,
+  q31_t *a, 
+  q31_t *err,
+  int nbCoefs);
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _FILTERING_FUNCTIONS_H_ */
diff --git a/CMSIS/DSP/Include/dsp/filtering_functions_f16.h b/CMSIS/DSP/Include/dsp/filtering_functions_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ccb8a2d0d64b99e1513df5a04e3738d8b50d7b7
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/filtering_functions_f16.h
@@ -0,0 +1,237 @@
+/******************************************************************************
+ * @file     filtering_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _FILTERING_FUNCTIONS_F16_H_
+#define _FILTERING_FUNCTIONS_F16_H_
+
+#include "arm_math_types_f16.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+ /**
+   * @brief Instance structure for the floating-point FIR filter.
+   */
+  typedef struct
+  {
+          uint16_t numTaps;     /**< number of filter coefficients in the filter. */
+          float16_t *pState;    /**< points to the state variable array. The array is of length numTaps+blockSize-1. */
+    const float16_t *pCoeffs;   /**< points to the coefficient array. The array is of length numTaps. */
+  } arm_fir_instance_f16;
+
+  /**
+   * @brief  Initialization function for the floating-point FIR filter.
+   * @param[in,out] S          points to an instance of the floating-point FIR filter structure.
+   * @param[in]     numTaps    Number of filter coefficients in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   * @param[in]     blockSize  number of samples that are processed at a time.
+   */
+  void arm_fir_init_f16(
+        arm_fir_instance_f16 * S,
+        uint16_t numTaps,
+  const float16_t * pCoeffs,
+        float16_t * pState,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the floating-point FIR filter.
+   * @param[in]  S          points to an instance of the floating-point FIR structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_fir_f16(
+  const arm_fir_instance_f16 * S,
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Instance structure for the floating-point Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint32_t numStages;      /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          float16_t *pState;       /**< Points to the array of state coefficients.  The array is of length 4*numStages. */
+    const float16_t *pCoeffs;      /**< Points to the array of coefficients.  The array is of length 5*numStages. */
+  } arm_biquad_casd_df1_inst_f16;
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+  /**
+   * @brief Instance structure for the modified Biquad coefs required by vectorized code.
+   */
+  typedef struct
+  {
+      float16_t coeffs[12][8]; /**< Array of modified coefficients: 12 rows of 8 values, i.e. 96 float16_t elements. There is one per stage */
+  } arm_biquad_mod_coef_f16;
+#endif 
+
+  /**
+   * @brief Processing function for the floating-point Biquad cascade filter.
+   * @param[in]  S          points to an instance of the floating-point Biquad cascade structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df1_f16(
+  const arm_biquad_casd_df1_inst_f16 * S,
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize);
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+  void arm_biquad_cascade_df1_mve_init_f16(
+      arm_biquad_casd_df1_inst_f16 * S,
+      uint8_t numStages,
+      const float16_t * pCoeffs, 
+      arm_biquad_mod_coef_f16 * pCoeffsMod, 
+      float16_t * pState);
+#endif
+
+  void arm_biquad_cascade_df1_init_f16(
+        arm_biquad_casd_df1_inst_f16 * S,
+        uint8_t numStages,
+  const float16_t * pCoeffs,
+        float16_t * pState);
+
+  /**
+   * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter.
+   */
+  typedef struct
+  {
+          uint8_t numStages;         /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          float16_t *pState;         /**< points to the array of state coefficients.  The array is of length 2*numStages. */
+    const float16_t *pCoeffs;        /**< points to the array of coefficients.  The array is of length 5*numStages. */
+  } arm_biquad_cascade_df2T_instance_f16;
+
+  /**
+   * @brief Instance structure for the floating-point transposed direct form II Biquad cascade filter, stereo (2 channels) variant.
+   */
+  typedef struct
+  {
+          uint8_t numStages;         /**< number of 2nd order stages in the filter.  Overall order is 2*numStages. */
+          float16_t *pState;         /**< points to the array of state coefficients.  The array is of length 4*numStages. */
+    const float16_t *pCoeffs;        /**< points to the array of coefficients.  The array is of length 5*numStages. */
+  } arm_biquad_cascade_stereo_df2T_instance_f16;
+
+  /**
+   * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
+   * @param[in]  S          points to an instance of the filter data structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_df2T_f16(
+  const arm_biquad_cascade_df2T_instance_f16 * S,
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. 2 channels
+   * @param[in]  S          points to an instance of the filter data structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_biquad_cascade_stereo_df2T_f16(
+  const arm_biquad_cascade_stereo_df2T_instance_f16 * S,
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+   * @param[in,out] S          points to an instance of the filter data structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   */
+  void arm_biquad_cascade_df2T_init_f16(
+        arm_biquad_cascade_df2T_instance_f16 * S,
+        uint8_t numStages,
+  const float16_t * pCoeffs,
+        float16_t * pState);
+
+  /**
+   * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter. 2 channels
+   * @param[in,out] S          points to an instance of the stereo filter data structure.
+   * @param[in]     numStages  number of 2nd order stages in the filter.
+   * @param[in]     pCoeffs    points to the filter coefficients.
+   * @param[in]     pState     points to the state buffer.
+   */
+  void arm_biquad_cascade_stereo_df2T_init_f16(
+        arm_biquad_cascade_stereo_df2T_instance_f16 * S,
+        uint8_t numStages,
+  const float16_t * pCoeffs,
+        float16_t * pState);
+
+  /**
+   * @brief Correlation of floating-point sequences.
+   * @param[in]  pSrcA    points to the first input sequence.
+   * @param[in]  srcALen  length of the first input sequence.
+   * @param[in]  pSrcB    points to the second input sequence.
+   * @param[in]  srcBLen  length of the second input sequence.
+   * @param[out] pDst     points to the block of output data  Length 2 * max(srcALen, srcBLen) - 1.
+   */
+  void arm_correlate_f16(
+  const float16_t * pSrcA,
+        uint32_t srcALen,
+  const float16_t * pSrcB,
+        uint32_t srcBLen,
+        float16_t * pDst);
+
+
+/**
+  @brief         Levinson Durbin
+  @param[in]     phi      autocovariance vector starting with lag 0 (length is nbCoefs + 1)
+  @param[out]    a        autoregressive coefficients
+  @param[out]    err      prediction error (variance)
+  @param[in]     nbCoefs  number of autoregressive coefficients
+  @return        none
+ */
+void arm_levinson_durbin_f16(const float16_t *phi,
+  float16_t *a, 
+  float16_t *err,
+  int nbCoefs);
+
+#endif /*defined(ARM_FLOAT16_SUPPORTED)*/
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _FILTERING_FUNCTIONS_F16_H_ */
diff --git a/CMSIS/DSP/Include/dsp/interpolation_functions.h b/CMSIS/DSP/Include/dsp/interpolation_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..42bf746c46dfb44b40d137307e065a9621f2b1e6
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/interpolation_functions.h
@@ -0,0 +1,319 @@
+/******************************************************************************
+ * @file     interpolation_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _INTERPOLATION_FUNCTIONS_H_
+#define _INTERPOLATION_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+
+/**
+ * @defgroup groupInterpolation Interpolation Functions
+ * These functions perform 1- and 2-dimensional interpolation of data.
+ * Linear interpolation is used for 1-dimensional data and
+ * bilinear interpolation is used for 2-dimensional data.
+ */
+
+
+  /**
+   * @brief Instance structure for the floating-point Linear Interpolate function.
+   */
+  typedef struct
+  {
+          uint32_t nValues;           /**< number of values in the Y table */
+          float32_t x1;               /**< x1 */
+          float32_t xSpacing;         /**< xSpacing */
+          float32_t *pYData;          /**< pointer to the table of Y values */
+  } arm_linear_interp_instance_f32;
+
+  /**
+   * @brief Instance structure for the floating-point bilinear interpolation function.
+   */
+  typedef struct
+  {
+          uint16_t numRows;   /**< number of rows in the data table. */
+          uint16_t numCols;   /**< number of columns in the data table. */
+          float32_t *pData;   /**< points to the data table. */
+  } arm_bilinear_interp_instance_f32;
+
+   /**
+   * @brief Instance structure for the Q31 bilinear interpolation function.
+   */
+  typedef struct
+  {
+          uint16_t numRows;   /**< number of rows in the data table. */
+          uint16_t numCols;   /**< number of columns in the data table. */
+          q31_t *pData;       /**< points to the data table. */
+  } arm_bilinear_interp_instance_q31;
+
+   /**
+   * @brief Instance structure for the Q15 bilinear interpolation function.
+   */
+  typedef struct
+  {
+          uint16_t numRows;   /**< number of rows in the data table. */
+          uint16_t numCols;   /**< number of columns in the data table. */
+          q15_t *pData;       /**< points to the data table. */
+  } arm_bilinear_interp_instance_q15;
+
+   /**
+   * @brief Instance structure for the Q7 bilinear interpolation function.
+   */
+  typedef struct
+  {
+          uint16_t numRows;   /**< number of rows in the data table. */
+          uint16_t numCols;   /**< number of columns in the data table. */
+          q7_t *pData;        /**< points to the data table. */
+  } arm_bilinear_interp_instance_q7;
+
+
+  /**
+   * @brief Enumeration for specifying the cubic spline type (boundary conditions)
+   */
+  typedef enum
+  {
+    ARM_SPLINE_NATURAL = 0,           /**< Natural spline */
+    ARM_SPLINE_PARABOLIC_RUNOUT = 1   /**< Parabolic runout spline */
+  } arm_spline_type;
+
+  /**
+   * @brief Instance structure for the floating-point cubic spline interpolation.
+   */
+  typedef struct
+  {
+    arm_spline_type type;      /**< Type (boundary conditions) */
+    const float32_t * x;       /**< x values */
+    const float32_t * y;       /**< y values */
+    uint32_t n_x;              /**< Number of known data points */
+    float32_t * coeffs;        /**< Coefficients buffer (b,c, and d) */
+  } arm_spline_instance_f32;
+
+
+
+
+  /**
+   * @ingroup groupInterpolation
+   */
+
+  /**
+   * @addtogroup SplineInterpolate
+   * @{
+   */
+
+  
+  /**
+   * @brief Processing function for the floating-point cubic spline interpolation.
+   * @param[in]  S          points to an instance of the floating-point spline structure.
+   * @param[in]  xq         points to the x values of the interpolated data points.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples of output data.
+   */
+  void arm_spline_f32(
+        arm_spline_instance_f32 * S, 
+  const float32_t * xq,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+  /**
+   * @brief Initialization function for the floating-point cubic spline interpolation.
+   * @param[in,out] S        points to an instance of the floating-point spline structure.
+   * @param[in]     type     type of cubic spline interpolation (boundary conditions)
+   * @param[in]     x        points to the x values of the known data points.
+   * @param[in]     y        points to the y values of the known data points.
+   * @param[in]     n        number of known data points.
+   * @param[in]     coeffs   coefficients array for b, c, and d
+   * @param[in]     tempBuffer   buffer array for internal computations
+   */
+  void arm_spline_init_f32(
+          arm_spline_instance_f32 * S,
+          arm_spline_type type,
+    const float32_t * x,
+    const float32_t * y,
+          uint32_t n, 
+          float32_t * coeffs,
+          float32_t * tempBuffer);
+
+
+  /**
+   * @} end of SplineInterpolate group
+   */
+
+
+  
+  /**
+   * @addtogroup LinearInterpolate
+   * @{
+   */
+
+    /**
+   * @brief  Process function for the floating-point Linear Interpolation Function.
+   * @param[in,out] S  is an instance of the floating-point Linear Interpolation structure
+   * @param[in]     x  input sample to process
+   * @return y processed output sample.
+   *
+   */
+  float32_t arm_linear_interp_f32(
+  arm_linear_interp_instance_f32 * S,
+  float32_t x);
+
+   /**
+   *
+   * @brief  Process function for the Q31 Linear Interpolation Function.
+   * @param[in] pYData   pointer to Q31 Linear Interpolation table
+   * @param[in] x        input sample to process
+   * @param[in] nValues  number of table values
+   * @return y processed output sample.
+   *
+   * \par
+   * Input sample <code>x</code> is in 12.20 format: 12 bits for the table index and 20 bits for the fractional part.
+   * The maximum supported table size is therefore 2^12 entries.
+   *
+   */
+  q31_t arm_linear_interp_q31(
+  q31_t * pYData,
+  q31_t x,
+  uint32_t nValues);
+
+  /**
+   *
+   * @brief  Process function for the Q15 Linear Interpolation Function.
+   * @param[in] pYData   pointer to Q15 Linear Interpolation table
+   * @param[in] x        input sample to process
+   * @param[in] nValues  number of table values
+   * @return y processed output sample.
+   *
+   * \par
+   * Input sample <code>x</code> is in 12.20 format: 12 bits for the table index and 20 bits for the fractional part.
+   * The maximum supported table size is therefore 2^12 entries.
+   *
+   */
+  q15_t arm_linear_interp_q15(
+  q15_t * pYData,
+  q31_t x,
+  uint32_t nValues);
+
+  /**
+   *
+   * @brief  Process function for the Q7 Linear Interpolation Function.
+   * @param[in] pYData   pointer to Q7 Linear Interpolation table
+   * @param[in] x        input sample to process
+   * @param[in] nValues  number of table values
+   * @return y processed output sample.
+   *
+   * \par
+   * Input sample <code>x</code> is in 12.20 format: 12 bits for the table index and 20 bits for the fractional part.
+   * The maximum supported table size is therefore 2^12 entries.
+   */
+q7_t arm_linear_interp_q7(
+  q7_t * pYData,
+  q31_t x,
+  uint32_t nValues);
+
+  /**
+   * @} end of LinearInterpolate group
+   */
+
+  
+
+
+  /**
+   * @ingroup groupInterpolation
+   */
+
+
+  /**
+   * @addtogroup BilinearInterpolate
+   * @{
+   */
+
+  /**
+  * @brief  Floating-point bilinear interpolation.
+  * @param[in]     S  points to an instance of the interpolation structure (read-only).
+  * @param[in]     X  interpolation coordinate.
+  * @param[in]     Y  interpolation coordinate.
+  * @return out interpolated value.
+  */
+  float32_t arm_bilinear_interp_f32(
+  const arm_bilinear_interp_instance_f32 * S,
+  float32_t X,
+  float32_t Y);
+
+  /**
+  * @brief  Q31 bilinear interpolation.
+  * @param[in,out] S  points to an instance of the interpolation structure.
+  * @param[in]     X  interpolation coordinate in 12.20 format.
+  * @param[in]     Y  interpolation coordinate in 12.20 format.
+  * @return out interpolated value.
+  */
+  q31_t arm_bilinear_interp_q31(
+  arm_bilinear_interp_instance_q31 * S,
+  q31_t X,
+  q31_t Y);
+
+
+  /**
+  * @brief  Q15 bilinear interpolation.
+  * @param[in,out] S  points to an instance of the interpolation structure.
+  * @param[in]     X  interpolation coordinate in 12.20 format.
+  * @param[in]     Y  interpolation coordinate in 12.20 format.
+  * @return out interpolated value.
+  */
+  q15_t arm_bilinear_interp_q15(
+  arm_bilinear_interp_instance_q15 * S,
+  q31_t X,
+  q31_t Y);
+
+  /**
+  * @brief  Q7 bilinear interpolation.
+  * @param[in,out] S  points to an instance of the interpolation structure.
+  * @param[in]     X  interpolation coordinate in 12.20 format.
+  * @param[in]     Y  interpolation coordinate in 12.20 format.
+  * @return out interpolated value.
+  */
+  q7_t arm_bilinear_interp_q7(
+  arm_bilinear_interp_instance_q7 * S,
+  q31_t X,
+  q31_t Y);
+  /**
+   * @} end of BilinearInterpolate group
+   */
+
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _INTERPOLATION_FUNCTIONS_H_ */
diff --git a/CMSIS/DSP/Include/dsp/interpolation_functions_f16.h b/CMSIS/DSP/Include/dsp/interpolation_functions_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..01fd87acca97d329dbae168381bd85dde35b8a09
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/interpolation_functions_f16.h
@@ -0,0 +1,107 @@
+/******************************************************************************
+ * @file     interpolation_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _INTERPOLATION_FUNCTIONS_F16_H_
+#define _INTERPOLATION_FUNCTIONS_F16_H_
+
+#include "arm_math_types_f16.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+/** @brief Instance structure for the half-precision floating-point (float16_t) Linear Interpolate function. */
+typedef struct
+{
+    uint32_t  nValues;        /**< number of values in the Y table */
+    float16_t x1;             /**< x1 */
+    float16_t xSpacing;       /**< xSpacing */
+    float16_t *pYData;        /**< pointer to the table of Y values */
+} arm_linear_interp_instance_f16;
+
+/**
+ * @brief Instance structure for the half-precision floating-point (float16_t) bilinear interpolation function.
+ */
+typedef struct
+{
+    uint16_t  numRows;/**< number of rows in the data table. */
+    uint16_t  numCols;/**< number of columns in the data table. */
+    float16_t *pData; /**< points to the data table. */
+} arm_bilinear_interp_instance_f16;
+
+  /**
+   * @addtogroup LinearInterpolate
+   * @{
+   */
+
+    /**
+   * @brief  Process function for the half-precision floating-point (float16_t) Linear Interpolation Function.
+   * @param[in,out] S  is an instance of the half-precision floating-point Linear Interpolation structure
+   * @param[in]     x  input sample to process
+   * @return y processed output sample.
+   *
+   */
+  float16_t arm_linear_interp_f16(
+  arm_linear_interp_instance_f16 * S,
+  float16_t x);
+
+    /**
+   * @} end of LinearInterpolate group
+   */
+
+/**
+   * @addtogroup BilinearInterpolate
+   * @{
+   */
+
+  /**
+  * @brief  Half-precision floating-point (float16_t) bilinear interpolation.
+  * @param[in]     S  points to an instance of the interpolation structure (read-only).
+  * @param[in]     X  interpolation coordinate.
+  * @param[in]     Y  interpolation coordinate.
+  * @return out interpolated value.
+  */
+  float16_t arm_bilinear_interp_f16(
+  const arm_bilinear_interp_instance_f16 * S,
+  float16_t X,
+  float16_t Y);
+
+
+  /**
+   * @} end of BilinearInterpolate group
+   */
+#endif /*defined(ARM_FLOAT16_SUPPORTED)*/
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _INTERPOLATION_FUNCTIONS_F16_H_ */
diff --git a/CMSIS/DSP/Include/dsp/matrix_functions.h b/CMSIS/DSP/Include/dsp/matrix_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..e03a2f18b67126a5e0bc1f41ddbe51dce614dc19
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/matrix_functions.h
@@ -0,0 +1,742 @@
+/******************************************************************************
+ * @file     matrix_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _MATRIX_FUNCTIONS_H_
+#define _MATRIX_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @defgroup groupMatrix Matrix Functions
+ *
+ * This set of functions provides basic matrix math operations.
+ * The functions operate on matrix data structures.  For example,
+ * the type
+ * definition for the floating-point matrix structure is shown
+ * below:
+ * <pre>
+ *     typedef struct
+ *     {
+ *       uint16_t numRows;     // number of rows of the matrix.
+ *       uint16_t numCols;     // number of columns of the matrix.
+ *       float32_t *pData;     // points to the data of the matrix.
+ *     } arm_matrix_instance_f32;
+ * </pre>
+ * There are similar definitions for the float64, Q7, Q15 and Q31 data types.
+ *
+ * The structure specifies the size of the matrix and then points to
+ * an array of data.  The array is of size <code>numRows X numCols</code>
+ * and the values are arranged in row order.  That is, the
+ * matrix element (i, j) is stored at:
+ * <pre>
+ *     pData[i*numCols + j]
+ * </pre>
+ *
+ * \par Init Functions
+ * There is an associated initialization function for each type of matrix
+ * data structure.
+ * The initialization function sets the values of the internal structure fields.
+ * Refer to \ref arm_mat_init_f32(), \ref arm_mat_init_q31() and \ref arm_mat_init_q15()
+ * for floating-point, Q31 and Q15 types,  respectively.
+ *
+ * \par
+ * Use of the initialization function is optional. However, if the initialization function is used,
+ * the instance structure cannot be placed into a const data section.
+ * To place the instance structure in a const data
+ * section, initialize the data structure manually.  For example:
+ * <pre>
+ * <code>arm_matrix_instance_f32 S = {nRows, nColumns, pData};</code>
+ * <code>arm_matrix_instance_q31 S = {nRows, nColumns, pData};</code>
+ * <code>arm_matrix_instance_q15 S = {nRows, nColumns, pData};</code>
+ * </pre>
+ * where <code>nRows</code> specifies the number of rows, <code>nColumns</code>
+ * specifies the number of columns, and <code>pData</code> points to the
+ * data array.
+ *
+ * \par Size Checking
+ * By default all of the matrix functions perform size checking on the input and
+ * output matrices. For example, the matrix addition function verifies that the
+ * two input matrices and the output matrix all have the same number of rows and
+ * columns. If the size check fails the functions return:
+ * <pre>
+ *     ARM_MATH_SIZE_MISMATCH
+ * </pre>
+ * Otherwise the functions return
+ * <pre>
+ *     ARM_MATH_SUCCESS
+ * </pre>
+ * There is some overhead associated with this matrix size checking.
+ * The matrix size checking is enabled via the \#define
+ * <pre>
+ *     ARM_MATH_MATRIX_CHECK
+ * </pre>
+ * within the library project settings.  By default this macro is defined
+ * and size checking is enabled. By changing the project settings and
+ * undefining this macro size checking is eliminated and the functions
+ * run a bit faster. With size checking disabled the functions always
+ * return <code>ARM_MATH_SUCCESS</code>.
+ */
+
+  /**
+   * @brief Instance structure for the floating-point matrix structure.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    float32_t *pData;     /**< points to the data of the matrix. */
+  } arm_matrix_instance_f32;
+ 
+ /**
+   * @brief Instance structure for the double-precision floating-point (float64_t) matrix structure.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    float64_t *pData;     /**< points to the data of the matrix. */
+  } arm_matrix_instance_f64;
+
+ /**
+   * @brief Instance structure for the Q7 matrix structure.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    q7_t *pData;         /**< points to the data of the matrix. */
+  } arm_matrix_instance_q7;
+
+  /**
+   * @brief Instance structure for the Q15 matrix structure.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    q15_t *pData;         /**< points to the data of the matrix. */
+  } arm_matrix_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 matrix structure.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    q31_t *pData;         /**< points to the data of the matrix. */
+  } arm_matrix_instance_q31;
+
+  /**
+   * @brief Floating-point matrix addition.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_add_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15 matrix addition.
+   * @param[in]   pSrcA  points to the first input matrix structure
+   * @param[in]   pSrcB  points to the second input matrix structure
+   * @param[out]  pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_add_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q31 matrix addition.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_add_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point, complex, matrix multiplication.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_mult_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15, complex, matrix multiplication.
+   * @param[in]  pSrcA     points to the first input matrix structure
+   * @param[in]  pSrcB     points to the second input matrix structure
+   * @param[out] pDst      points to output matrix structure
+   * @param[in]  pScratch  points to the array for storing intermediate results
+   * @return     The function returns either <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_mult_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst,
+        q15_t * pScratch);
+
+  /**
+   * @brief Q31, complex, matrix multiplication.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_mult_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_f32(
+  const arm_matrix_instance_f32 * pSrc,
+        arm_matrix_instance_f32 * pDst);
+
+/**
+   * @brief Double-precision floating-point (float64_t) matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_f64(
+  const arm_matrix_instance_f64 * pSrc,
+        arm_matrix_instance_f64 * pDst);
+
+  /**
+   * @brief Floating-point complex matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_trans_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  arm_matrix_instance_f32 * pDst);
+
+
+  /**
+   * @brief Q15 matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_q15(
+  const arm_matrix_instance_q15 * pSrc,
+        arm_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q15 complex matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_trans_q15(
+  const arm_matrix_instance_q15 * pSrc,
+  arm_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q7 matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_q7(
+  const arm_matrix_instance_q7 * pSrc,
+        arm_matrix_instance_q7 * pDst);
+
+  /**
+   * @brief Q31 matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_q31(
+  const arm_matrix_instance_q31 * pSrc,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Q31 complex matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_trans_q31(
+  const arm_matrix_instance_q31 * pSrc,
+  arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix multiplication
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Double-precision floating-point (float64_t) matrix multiplication
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_f64(
+  const arm_matrix_instance_f64 * pSrcA,
+  const arm_matrix_instance_f64 * pSrcB,
+        arm_matrix_instance_f64 * pDst);
+
+  /**
+   * @brief Floating-point matrix and vector multiplication
+   * @param[in]  pSrcMat  points to the input matrix structure
+   * @param[in]  pVec     points to vector
+   * @param[out] pDst     points to output vector
+   */
+void arm_mat_vec_mult_f32(
+  const arm_matrix_instance_f32 *pSrcMat, 
+  const float32_t *pVec, 
+  float32_t *pDst);
+
+  /**
+   * @brief Q7 matrix multiplication
+   * @param[in]  pSrcA   points to the first input matrix structure
+   * @param[in]  pSrcB   points to the second input matrix structure
+   * @param[out] pDst    points to output matrix structure
+   * @param[in]  pState  points to the array for storing intermediate results
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_q7(
+  const arm_matrix_instance_q7 * pSrcA,
+  const arm_matrix_instance_q7 * pSrcB,
+        arm_matrix_instance_q7 * pDst,
+        q7_t * pState);
+
+  /**
+   * @brief Q7 matrix and vector multiplication
+   * @param[in]  pSrcMat  points to the input matrix structure
+   * @param[in]  pVec     points to vector
+   * @param[out] pDst     points to output vector
+   */
+void arm_mat_vec_mult_q7(
+  const arm_matrix_instance_q7 *pSrcMat, 
+  const q7_t *pVec, 
+  q7_t *pDst);
+
+  /**
+   * @brief Q15 matrix multiplication
+   * @param[in]  pSrcA   points to the first input matrix structure
+   * @param[in]  pSrcB   points to the second input matrix structure
+   * @param[out] pDst    points to output matrix structure
+   * @param[in]  pState  points to the array for storing intermediate results
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst,
+        q15_t * pState);
+
+  /**
+   * @brief Q15 matrix and vector multiplication
+   * @param[in]  pSrcMat  points to the input matrix structure
+   * @param[in]  pVec     points to vector
+   * @param[out] pDst     points to output vector
+   */
+void arm_mat_vec_mult_q15(
+  const arm_matrix_instance_q15 *pSrcMat, 
+  const q15_t *pVec, 
+  q15_t *pDst);
+
+  /**
+   * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
+   * @param[in]  pSrcA   points to the first input matrix structure
+   * @param[in]  pSrcB   points to the second input matrix structure
+   * @param[out] pDst    points to output matrix structure
+   * @param[in]  pState  points to the array for storing intermediate results
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_fast_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst,
+        q15_t * pState);
+
+  /**
+   * @brief Q31 matrix multiplication
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Q31 matrix and vector multiplication
+   * @param[in]  pSrcMat  points to the input matrix structure
+   * @param[in]  pVec     points to vector
+   * @param[out] pDst     points to output vector
+   */
+void arm_mat_vec_mult_q31(
+  const arm_matrix_instance_q31 *pSrcMat, 
+  const q31_t *pVec, 
+  q31_t *pDst);
+
+  /**
+   * @brief Q31 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_fast_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix subtraction
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_sub_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Floating-point (f64) matrix subtraction
+   * @param[in]  pSrcA  points to the first input f64 matrix structure
+   * @param[in]  pSrcB  points to the second input f64 matrix structure
+   * @param[out] pDst   points to output f64 matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_sub_f64(
+  const arm_matrix_instance_f64 * pSrcA,
+  const arm_matrix_instance_f64 * pSrcB,
+        arm_matrix_instance_f64 * pDst);
+
+  /**
+   * @brief Q15 matrix subtraction
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_sub_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q31 matrix subtraction
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_sub_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix scaling.
+   * @param[in]  pSrc   points to the input matrix
+   * @param[in]  scale  scale factor
+   * @param[out] pDst   points to the output matrix
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_scale_f32(
+  const arm_matrix_instance_f32 * pSrc,
+        float32_t scale,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15 matrix scaling.
+   * @param[in]  pSrc        points to input matrix
+   * @param[in]  scaleFract  fractional portion of the scale factor
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to output matrix
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_scale_q15(
+  const arm_matrix_instance_q15 * pSrc,
+        q15_t scaleFract,
+        int32_t shift,
+        arm_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q31 matrix scaling.
+   * @param[in]  pSrc        points to input matrix
+   * @param[in]  scaleFract  fractional portion of the scale factor
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_scale_q31(
+  const arm_matrix_instance_q31 * pSrc,
+        q31_t scaleFract,
+        int32_t shift,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief  Q31 matrix initialization.
+   * @param[in,out] S         points to an instance of the Q31 matrix structure.
+   * @param[in]     nRows     number of rows in the matrix.
+   * @param[in]     nColumns  number of columns in the matrix.
+   * @param[in]     pData     points to the matrix data array (q31_t elements).
+   */
+void arm_mat_init_q31(
+        arm_matrix_instance_q31 * S,
+        uint16_t nRows,
+        uint16_t nColumns,
+        q31_t * pData);
+
+  /**
+   * @brief  Q15 matrix initialization.
+   * @param[in,out] S         points to an instance of the Q15 matrix structure.
+   * @param[in]     nRows     number of rows in the matrix.
+   * @param[in]     nColumns  number of columns in the matrix.
+   * @param[in]     pData     points to the matrix data array (q15_t elements).
+   */
+void arm_mat_init_q15(
+        arm_matrix_instance_q15 * S,
+        uint16_t nRows,
+        uint16_t nColumns,
+        q15_t * pData);
+
+  /**
+   * @brief  Floating-point matrix initialization.
+   * @param[in,out] S         points to an instance of the floating-point matrix structure.
+   * @param[in]     nRows     number of rows in the matrix.
+   * @param[in]     nColumns  number of columns in the matrix.
+   * @param[in]     pData     points to the matrix data array.
+   */
+void arm_mat_init_f32(
+        arm_matrix_instance_f32 * S,
+        uint16_t nRows,
+        uint16_t nColumns,
+        float32_t * pData);
+
+
+
+  /**
+   * @brief Floating-point matrix inverse.
+   * @param[in]  src   points to the instance of the input floating-point matrix structure.
+   * @param[out] dst   points to the instance of the output floating-point matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR.
+   */
+  arm_status arm_mat_inverse_f32(
+  const arm_matrix_instance_f32 * src,
+  arm_matrix_instance_f32 * dst);
+
+
+  /**
+   * @brief Floating-point (f64) matrix inverse.
+   * @param[in]  src   points to the instance of the input f64 matrix structure.
+   * @param[out] dst   points to the instance of the output f64 matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR.
+   */
+  arm_status arm_mat_inverse_f64(
+  const arm_matrix_instance_f64 * src,
+  arm_matrix_instance_f64 * dst);
+
+ /**
+   * @brief Floating-point (f64) Cholesky decomposition of a Symmetric Positive Definite Matrix.
+   * @param[in]  src   points to the instance of the input f64 matrix structure.
+   * @param[out] dst   points to the instance of the output f64 matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix does not have a decomposition, then the algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
+   * If the matrix is ill conditioned or only semi-definite, then it is better to use the LDL^t decomposition.
+   * The decomposition returns a lower triangular matrix.
+   */
+  arm_status arm_mat_cholesky_f64(
+  const arm_matrix_instance_f64 * src,
+  arm_matrix_instance_f64 * dst);
+
+ /**
+   * @brief Floating-point (f32) Cholesky decomposition of a Symmetric Positive Definite Matrix.
+   * @param[in]  src   points to the instance of the input f32 matrix structure.
+   * @param[out] dst   points to the instance of the output f32 matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix does not have a decomposition, then the algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
+   * If the matrix is ill conditioned or only semi-definite, then it is better to use the LDL^t decomposition.
+   * The decomposition returns a lower triangular matrix.
+   */
+  arm_status arm_mat_cholesky_f32(
+  const arm_matrix_instance_f32 * src,
+  arm_matrix_instance_f32 * dst);
+
+  /**
+   * @brief Solve UT . X = A where UT is an upper triangular matrix
+   * @param[in]  ut  The upper triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of UT . X = A
+   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+  */
+  arm_status arm_mat_solve_upper_triangular_f32(
+  const arm_matrix_instance_f32 * ut,
+  const arm_matrix_instance_f32 * a,
+  arm_matrix_instance_f32 * dst);
+
+ /**
+   * @brief Solve LT . X = A where LT is a lower triangular matrix
+   * @param[in]  lt  The lower triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of LT . X = A
+   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+   */
+  arm_status arm_mat_solve_lower_triangular_f32(
+  const arm_matrix_instance_f32 * lt,
+  const arm_matrix_instance_f32 * a,
+  arm_matrix_instance_f32 * dst);
+
+
+  /**
+   * @brief Solve UT . X = A where UT is an upper triangular matrix
+   * @param[in]  ut  The upper triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of UT . X = A
+   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+  */
+  arm_status arm_mat_solve_upper_triangular_f64(
+  const arm_matrix_instance_f64 * ut,
+  const arm_matrix_instance_f64 * a,
+  arm_matrix_instance_f64 * dst);
+
+ /**
+   * @brief Solve LT . X = A where LT is a lower triangular matrix
+   * @param[in]  lt  The lower triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of LT . X = A
+   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+   */
+  arm_status arm_mat_solve_lower_triangular_f64(
+  const arm_matrix_instance_f64 * lt,
+  const arm_matrix_instance_f64 * a,
+  arm_matrix_instance_f64 * dst);
+
+
+  /**
+   * @brief Floating-point LDL decomposition of Symmetric Positive Semi-Definite Matrix.
+   * @param[in]  src   points to the instance of the input floating-point matrix structure.
+   * @param[out] l   points to the instance of the output floating-point triangular matrix structure.
+   * @param[out] d   points to the instance of the output floating-point diagonal matrix structure.
+   * @param[out] pp  points to the output permutation vector (array of uint16_t).
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix does not have a decomposition, then the algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
+   * The decomposition returns a lower triangular matrix.
+   */
+  arm_status arm_mat_ldlt_f32(
+  const arm_matrix_instance_f32 * src,
+  arm_matrix_instance_f32 * l,
+  arm_matrix_instance_f32 * d,
+  uint16_t * pp);
+
+ /**
+   * @brief Floating-point (f64) LDL decomposition of Symmetric Positive Semi-Definite Matrix.
+   * @param[in]  src   points to the instance of the input f64 matrix structure.
+   * @param[out] l   points to the instance of the output f64 triangular matrix structure.
+   * @param[out] d   points to the instance of the output f64 diagonal matrix structure.
+   * @param[out] pp  points to the output permutation vector (array of uint16_t).
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix does not have a decomposition, then the algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
+   * The decomposition returns a lower triangular matrix.
+   */
+  arm_status arm_mat_ldlt_f64(
+  const arm_matrix_instance_f64 * src,
+  arm_matrix_instance_f64 * l,
+  arm_matrix_instance_f64 * d,
+  uint16_t * pp);
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _MATRIX_FUNCTIONS_H_ */
diff --git a/CMSIS/DSP/Include/dsp/matrix_functions_f16.h b/CMSIS/DSP/Include/dsp/matrix_functions_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..62876a76b099a2898ae864c06f60fc92b965c401
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/matrix_functions_f16.h
@@ -0,0 +1,221 @@
+/******************************************************************************
+ * @file     matrix_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _MATRIX_FUNCTIONS_F16_H_
+#define _MATRIX_FUNCTIONS_F16_H_
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+
+#include "arm_math_types_f16.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+    
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+ /**
+   * @brief Instance structure for the f16 floating-point matrix (elements are float16_t).
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    float16_t *pData;     /**< points to the data of the matrix. */
+  } arm_matrix_instance_f16;
+
+ /**
+   * @brief Floating-point matrix addition.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_add_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+        arm_matrix_instance_f16 * pDst);
+
+  /**
+   * @brief Floating-point, complex, matrix multiplication.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_mult_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+        arm_matrix_instance_f16 * pDst);
+
+  /**
+   * @brief Floating-point matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_f16(
+  const arm_matrix_instance_f16 * pSrc,
+        arm_matrix_instance_f16 * pDst);
+
+  /**
+   * @brief Floating-point complex matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_trans_f16(
+  const arm_matrix_instance_f16 * pSrc,
+  arm_matrix_instance_f16 * pDst);
+
+  /**
+   * @brief Floating-point matrix multiplication
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+        arm_matrix_instance_f16 * pDst);
+  /**
+   * @brief Floating-point matrix and vector multiplication
+   * @param[in]  pSrcMat  points to the input matrix structure
+   * @param[in]  pVec     points to vector
+   * @param[out] pDst     points to output vector
+   */
+void arm_mat_vec_mult_f16(
+  const arm_matrix_instance_f16 *pSrcMat, 
+  const float16_t *pVec, 
+  float16_t *pDst);
+
+  /**
+   * @brief Floating-point matrix subtraction
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_sub_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+        arm_matrix_instance_f16 * pDst);
+
+  /**
+   * @brief Floating-point matrix scaling.
+   * @param[in]  pSrc   points to the input matrix
+   * @param[in]  scale  scale factor
+   * @param[out] pDst   points to the output matrix
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_scale_f16(
+  const arm_matrix_instance_f16 * pSrc,
+        float16_t scale,
+        arm_matrix_instance_f16 * pDst);
+
+  /**
+   * @brief  Floating-point matrix initialization.
+   * @param[in,out] S         points to an instance of the floating-point matrix structure.
+   * @param[in]     nRows     number of rows in the matrix.
+   * @param[in]     nColumns  number of columns in the matrix.
+   * @param[in]     pData     points to the matrix data array.
+   */
+void arm_mat_init_f16(
+        arm_matrix_instance_f16 * S,
+        uint16_t nRows,
+        uint16_t nColumns,
+        float16_t * pData);
+
+
+  /**
+   * @brief Floating-point matrix inverse.
+   * @param[in]  src   points to the instance of the input floating-point matrix structure.
+   * @param[out] dst   points to the instance of the output floating-point matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR.
+   */
+  arm_status arm_mat_inverse_f16(
+  const arm_matrix_instance_f16 * src,
+  arm_matrix_instance_f16 * dst);
+
+
+ /**
+   * @brief Floating-point (f16) Cholesky decomposition of a Symmetric Positive Definite Matrix.
+   * @param[in]  src   points to the instance of the input f16 matrix structure.
+   * @param[out] dst   points to the instance of the output f16 matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix does not have a decomposition, then the algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
+   * If the matrix is ill conditioned or only semi-definite, then it is better to use the LDL^t decomposition.
+   * The decomposition returns a lower triangular matrix.
+   */
+  arm_status arm_mat_cholesky_f16(
+  const arm_matrix_instance_f16 * src,
+  arm_matrix_instance_f16 * dst);
+
+ /**
+   * @brief Solve UT . X = A where UT is an upper triangular matrix
+   * @param[in]  ut  The upper triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of UT . X = A
+   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+  */
+  arm_status arm_mat_solve_upper_triangular_f16(
+  const arm_matrix_instance_f16 * ut,
+  const arm_matrix_instance_f16 * a,
+  arm_matrix_instance_f16 * dst);
+
+ /**
+   * @brief Solve LT . X = A where LT is a lower triangular matrix
+   * @param[in]  lt  The lower triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of LT . X = A
+   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+   */
+  arm_status arm_mat_solve_lower_triangular_f16(
+  const arm_matrix_instance_f16 * lt,
+  const arm_matrix_instance_f16 * a,
+  arm_matrix_instance_f16 * dst);
+
+
+
+#endif /*defined(ARM_FLOAT16_SUPPORTED)*/
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _MATRIX_FUNCTIONS_F16_H_ */
diff --git a/CMSIS/DSP/Include/dsp/none.h b/CMSIS/DSP/Include/dsp/none.h
new file mode 100644
index 0000000000000000000000000000000000000000..62f2d144a663acd0f3c265d2c07ef912d117d5d8
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/none.h
@@ -0,0 +1,576 @@
+/******************************************************************************
+ * @file     none.h
+ * @brief    Intrinsics when no DSP extension available
+ * @version  V1.9.0
+ * @date     20. July 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+
+The definitions in this file allow some versions of the CMSIS-DSP
+functions to be built for a core (M0 for instance) or a host where
+the DSP extension is not available.
+
+Ideally a pure C version should have been used instead.
+But those are not always available or use a restricted set
+of intrinsics.
+
+*/
+ 
+#ifndef _NONE_H_
+#define _NONE_H_
+
+#include "arm_math_types.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+ 
+
+/*
+
+Normally these kinds of definitions live in a compiler file
+in Core or Core_A.
+
+But for MSVC compiler it is a bit special. The goal is very specific
+to CMSIS-DSP and only to allow the use of this library from other
+systems like Python or Matlab.
+
+MSVC is not going to be used to cross-compile to ARM. So, having a MSVC
+compiler file in Core or Core_A would not make sense.
+
+*/
+#if defined ( _MSC_VER ) || defined(__GNUC_PYTHON__)
+    __STATIC_FORCEINLINE uint8_t __CLZ(uint32_t data)
+    {
+      /* Software count-leading-zeros: 32 for a zero input, else the number of clear bits above the top set bit. */
+      uint32_t zeros = 0U;
+      uint32_t probe = 0x80000000U;
+
+      if (data == 0U) { return 32U; }
+      /* Slide a one-bit probe down from bit 31 until it meets a set bit. */
+      for (; (data & probe) == 0U; probe >>= 1U)
+      {
+        zeros++;
+      }
+      return zeros;
+    }
+
+  __STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
+  {
+    int32_t upper;
+    int32_t lower;
+
+    /* Signed saturation to 'sat' bits; widths outside 1..32 pass val through untouched. */
+    if ((sat < 1U) || (sat > 32U))
+    {
+      return val;
+    }
+
+    /* Representable range is [-(upper + 1), upper] with upper = 2^(sat-1) - 1. */
+    upper = (int32_t)((1U << (sat - 1U)) - 1U);
+    lower = -1 - upper;
+    if (val > upper) { return upper; }
+    return (val < lower) ? lower : val;
+  }
+
+  __STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
+  {
+    uint32_t ceiling;
+
+    /* Unsigned saturation to 'sat' bits; widths above 31 only reinterpret val. */
+    if (sat > 31U)
+    {
+      return (uint32_t)val;
+    }
+
+    /* Representable range is [0, 2^sat - 1]. */
+    ceiling = (1U << sat) - 1U;
+    if (val > (int32_t)ceiling) { return ceiling; }
+    if (val < 0) { return 0U; }
+    return (uint32_t)val;
+  }
+
+ /**
+  \brief   Rotate Right in unsigned value (32 bit)
+  \details Returns op1 rotated right by op2 bit positions; the count is taken modulo 32.
+  \param [in]    op1  Value to rotate
+  \param [in]    op2  Number of Bits to rotate
+  \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
+{
+  const uint32_t amount = op2 % 32U;
+
+  /* A zero count is returned as-is: the left shift below would otherwise be by 32 bits. */
+  return (amount == 0U)
+           ? op1
+           : ((op1 >> amount) | (op1 << (32U - amount)));
+}
+
+
+#endif
+
+/**
+   * @brief Saturate Q63 to Q31: x when it fits in 32 signed bits, else 0x7FFFFFFF / 0x80000000 by sign.
+   */
+  __STATIC_FORCEINLINE q31_t clip_q63_to_q31(q63_t x)
+  {
+    const q31_t upperWord = (q31_t) (x >> 32);
+    const q31_t lowerSign = (q31_t) x >> 31;
+    return (upperWord != lowerSign) ? (0x7FFFFFFF ^ ((q31_t) (x >> 63))) : (q31_t) x;
+  }
+
+  /**
+   * @brief Q63 to Q15: (q15_t)(x >> 15) when x fits in 32 signed bits, else 0x7FFF ^ (q15_t)(x >> 63).
+   */
+  __STATIC_FORCEINLINE q15_t clip_q63_to_q15(q63_t x)
+  {
+    if ((q31_t) (x >> 32) == ((q31_t) x >> 31)) { return (q15_t) (x >> 15); }
+    /* Outside the 32-bit range: the positive limit is flipped when x is negative. */
+    return (0x7FFF ^ ((q15_t) (x >> 63)));
+  }
+
+  /**
+   * @brief Clips Q31 to Q7 values: x when it fits in 8 signed bits, else 0x7F (x >= 0) or -0x80 (x < 0).
+   */
+  __STATIC_FORCEINLINE q7_t clip_q31_to_q7(
+  q31_t x)
+  {
+    /* Compare from bit 7 upwards (clip_q31_to_q15 does so from bit 15); testing bit 23 let values such as 1000 wrap. */
+    return ((q31_t) (x >> 8) != ((q31_t) x >> 7)) ? ((0x7F ^ ((q7_t) (x >> 31)))) : (q7_t) x;
+  }
+
+  /**
+   * @brief Saturate a Q31 value to Q15: x when it fits in 16 signed bits, else 0x7FFF / 0x8000 by sign.
+   */
+  __STATIC_FORCEINLINE q15_t clip_q31_to_q15(q31_t x)
+  {
+    const int fitsIn16 = ((q31_t) (x >> 16) == ((q31_t) x >> 15));
+    if (fitsIn16) { return (q15_t) x; }
+    return (0x7FFF ^ ((q15_t) (x >> 31)));
+  }
+
+  /**
+   * @brief Multiply a 64-bit value by a 32-bit value, with the product scaled down by 2^32.
+   * @return ((low 32 bits of x) * y >> 32) + ((x >> 32) * y), as a q63_t.
+   */
+  __STATIC_FORCEINLINE q63_t mult32x64(q63_t x, q31_t y)
+  {
+    const q63_t lowTerm  = ((q63_t) (x & 0x00000000FFFFFFFF) * y) >> 32;
+    const q63_t highTerm = (q63_t) (x >> 32) * y;
+    return (lowTerm + highTerm);
+  }
+
+/* SMMLAR: a = high word of (a * 2^32 + x * y + 2^31); parameters are parenthesized so expression arguments expand safely */
+#define multAcc_32x32_keep32_R(a, x, y) \
+    (a) = (q31_t) (((((q63_t) (a)) << 32) + ((q63_t) (x) * (y)) + 0x80000000LL ) >> 32)
+
+/* SMMLSR: a = high word of (a * 2^32 - x * y + 2^31) */
+#define multSub_32x32_keep32_R(a, x, y) \
+    (a) = (q31_t) (((((q63_t) (a)) << 32) - ((q63_t) (x) * (y)) + 0x80000000LL ) >> 32)
+
+/* SMMULR: a = high word of (x * y + 2^31) */
+#define mult_32x32_keep32_R(a, x, y) \
+    (a) = (q31_t) (((q63_t) (x) * (y) + 0x80000000LL ) >> 32)
+
+/* SMMLA: a += high word of (x * y), no rounding */
+#define multAcc_32x32_keep32(a, x, y) \
+    (a) += (q31_t) (((q63_t) (x) * (y)) >> 32)
+
+/* SMMLS: a -= high word of (x * y), no rounding */
+#define multSub_32x32_keep32(a, x, y) \
+    (a) -= (q31_t) (((q63_t) (x) * (y)) >> 32)
+
+/* SMMUL: a = high word of (x * y), no rounding */
+#define mult_32x32_keep32(a, x, y) \
+    (a) = (q31_t) (((q63_t) (x) * (y) ) >> 32)
+
+#ifndef ARM_MATH_DSP
+  /** @brief Pack two 16-bit values into one 32-bit word (only defined when ARM_MATH_DSP is not).
+   *  __PKHBT: bits 15..0 from ARG1, bits 31..16 from (ARG2 << ARG3).
+   *  __PKHTB: bits 31..16 from ARG1, bits 15..0 from (ARG2 >> ARG3). ARG3 is parenthesized so expressions expand safely. */
+  #define __PKHBT(ARG1, ARG2, ARG3) ( (((int32_t)(ARG1) <<      0) & (int32_t)0x0000FFFF) | \
+                                      (((int32_t)(ARG2) << (ARG3)) & (int32_t)0xFFFF0000)  )
+  #define __PKHTB(ARG1, ARG2, ARG3) ( (((int32_t)(ARG1) <<      0) & (int32_t)0xFFFF0000) | \
+                                      (((int32_t)(ARG2) >> (ARG3)) & (int32_t)0x0000FFFF)  )
+#endif
+
+   /**
+   * @brief Pack four 8-bit values into one 32-bit word: v0 goes to the lowest byte and v3 to the highest,
+   *        or the reverse order (v0 highest, v3 lowest) when ARM_MATH_BIG_ENDIAN is defined. */
+#ifndef ARM_MATH_BIG_ENDIAN
+  #define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v0) <<  0) & (int32_t)0x000000FF) | \
+                                  (((int32_t)(v1) <<  8) & (int32_t)0x0000FF00) | \
+                                  (((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | \
+                                  (((int32_t)(v3) << 24) & (int32_t)0xFF000000)  )
+#else
+  #define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v3) <<  0) & (int32_t)0x000000FF) | \
+                                  (((int32_t)(v2) <<  8) & (int32_t)0x0000FF00) | \
+                                  (((int32_t)(v1) << 16) & (int32_t)0x00FF0000) | \
+                                  (((int32_t)(v0) << 24) & (int32_t)0xFF000000)  )
+#endif
+
+
+ 
+
+/*
+ * @brief C custom defined intrinsic functions
+ */
+#if !defined (ARM_MATH_DSP)
+
+
+  /*
+   * @brief C custom defined QADD8: lane-wise signed add of four 8-bit lanes, each saturated with __SSAT(.., 8).
+   */
+  __STATIC_FORCEINLINE uint32_t __QADD8(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s, t, u;
+    /* Lanes are moved to the top in uint32_t: left-shifting a signed value into the sign bit is undefined. */
+    r = __SSAT((((q31_t)(x << 24) >> 24) + ((q31_t)(y << 24) >> 24)), 8) & (int32_t)0x000000FF;
+    s = __SSAT((((q31_t)(x << 16) >> 24) + ((q31_t)(y << 16) >> 24)), 8) & (int32_t)0x000000FF;
+    t = __SSAT((((q31_t)(x <<  8) >> 24) + ((q31_t)(y <<  8) >> 24)), 8) & (int32_t)0x000000FF;
+    u = __SSAT(((((q31_t)x      ) >> 24) + (((q31_t)y      ) >> 24)), 8) & (int32_t)0x000000FF;
+    /* Repack as unsigned so that placing u in bits 31..24 cannot overflow a signed int. */
+    return (((uint32_t)u << 24) | ((uint32_t)t << 16) | ((uint32_t)s <<  8) | ((uint32_t)r      ));
+  }
+
+
+  /*
+   * @brief C custom defined QSUB8: lane-wise signed subtract (x - y) of four 8-bit lanes, each saturated with __SSAT(.., 8).
+   */
+  __STATIC_FORCEINLINE uint32_t __QSUB8(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s, t, u;
+    /* Lanes are moved to the top in uint32_t: left-shifting a signed value into the sign bit is undefined. */
+    r = __SSAT((((q31_t)(x << 24) >> 24) - ((q31_t)(y << 24) >> 24)), 8) & (int32_t)0x000000FF;
+    s = __SSAT((((q31_t)(x << 16) >> 24) - ((q31_t)(y << 16) >> 24)), 8) & (int32_t)0x000000FF;
+    t = __SSAT((((q31_t)(x <<  8) >> 24) - ((q31_t)(y <<  8) >> 24)), 8) & (int32_t)0x000000FF;
+    u = __SSAT(((((q31_t)x      ) >> 24) - (((q31_t)y      ) >> 24)), 8) & (int32_t)0x000000FF;
+    /* Repack as unsigned so that placing u in bits 31..24 cannot overflow a signed int. */
+    return (((uint32_t)u << 24) | ((uint32_t)t << 16) | ((uint32_t)s <<  8) | ((uint32_t)r      ));
+  }
+
+
+  /*
+   * @brief C custom defined QADD16: lane-wise signed add of two 16-bit lanes, each saturated with __SSAT(.., 16).
+   */
+  __STATIC_FORCEINLINE uint32_t __QADD16(
+  uint32_t x,
+  uint32_t y)
+  {
+/*  q31_t r,     s;  without initialisation 'arm_offset_q15 test' fails  but 'intrinsic' tests pass! for armCC */
+    q31_t r = 0, s = 0;
+    /* The low lane is moved to the top in uint32_t: a signed left shift into the sign bit is undefined. */
+    r = __SSAT((((q31_t)(x << 16) >> 16) + ((q31_t)(y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
+    s = __SSAT(((((q31_t)x      ) >> 16) + (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
+    /* Repack as unsigned so that placing s in bits 31..16 cannot overflow a signed int. */
+    return (((uint32_t)s << 16) | ((uint32_t)r      ));
+  }
+
+
+  /*
+   * @brief C custom defined SHADD16: lane-wise signed halving add, (a + b) >> 1 per 16-bit lane, no saturation.
+   */
+  __STATIC_FORCEINLINE uint32_t __SHADD16(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+    /* The low lane is moved to the top in uint32_t: a signed left shift into the sign bit is undefined. */
+    r = ((((q31_t)(x << 16) >> 16) + ((q31_t)(y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+    s = (((((q31_t)x      ) >> 16) + (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+    /* Repack as unsigned so that placing s in bits 31..16 cannot overflow a signed int. */
+    return (((uint32_t)s << 16) | ((uint32_t)r      ));
+  }
+
+
+  /*
+   * @brief C custom defined QSUB16: lane-wise signed subtract (x - y) of two 16-bit lanes, each saturated with __SSAT(.., 16).
+   */
+  __STATIC_FORCEINLINE uint32_t __QSUB16(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+    /* The low lane is moved to the top in uint32_t: a signed left shift into the sign bit is undefined. */
+    r = __SSAT((((q31_t)(x << 16) >> 16) - ((q31_t)(y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
+    s = __SSAT(((((q31_t)x      ) >> 16) - (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
+    /* Repack as unsigned so that placing s in bits 31..16 cannot overflow a signed int. */
+    return (((uint32_t)s << 16) | ((uint32_t)r      ));
+  }
+
+
+  /*
+   * @brief C custom defined SHSUB16: lane-wise signed halving subtract, (a - b) >> 1 per 16-bit lane, no saturation.
+   */
+  __STATIC_FORCEINLINE uint32_t __SHSUB16(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+    /* The low lane is moved to the top in uint32_t: a signed left shift into the sign bit is undefined. */
+    r = ((((q31_t)(x << 16) >> 16) - ((q31_t)(y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+    s = (((((q31_t)x      ) >> 16) - (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+    /* Repack as unsigned so that placing s in bits 31..16 cannot overflow a signed int. */
+    return (((uint32_t)s << 16) | ((uint32_t)r      ));
+  }
+
+
+  /*
+   * @brief C custom defined QASX: low lane = sat16(x.low - y.high), high lane = sat16(x.high + y.low).
+   */
+  __STATIC_FORCEINLINE uint32_t __QASX(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+    /* Low lanes are moved to the top in uint32_t: a signed left shift into the sign bit is undefined. */
+    r = __SSAT((((q31_t)(x << 16) >> 16) - (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
+    s = __SSAT(((((q31_t)x      ) >> 16) + ((q31_t)(y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
+    /* Repack as unsigned so that placing s in bits 31..16 cannot overflow a signed int. */
+    return (((uint32_t)s << 16) | ((uint32_t)r      ));
+  }
+
+
+  /*
+   * @brief C custom defined SHASX: low lane = (x.low - y.high) >> 1, high lane = (x.high + y.low) >> 1, no saturation.
+   */
+  __STATIC_FORCEINLINE uint32_t __SHASX(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+    /* Low lanes are moved to the top in uint32_t: a signed left shift into the sign bit is undefined. */
+    r = ((((q31_t)(x << 16) >> 16) - (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+    s = (((((q31_t)x      ) >> 16) + ((q31_t)(y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+    /* Repack as unsigned so that placing s in bits 31..16 cannot overflow a signed int. */
+    return (((uint32_t)s << 16) | ((uint32_t)r      ));
+  }
+
+
+  /*
+   * @brief C custom defined QSAX: low lane = sat16(x.low + y.high), high lane = sat16(x.high - y.low).
+   */
+  __STATIC_FORCEINLINE uint32_t __QSAX(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+    /* Low lanes are moved to the top in uint32_t: a signed left shift into the sign bit is undefined. */
+    r = __SSAT((((q31_t)(x << 16) >> 16) + (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
+    s = __SSAT(((((q31_t)x      ) >> 16) - ((q31_t)(y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
+    /* Repack as unsigned so that placing s in bits 31..16 cannot overflow a signed int. */
+    return (((uint32_t)s << 16) | ((uint32_t)r      ));
+  }
+
+
+  /*
+   * @brief C implementation of SHSAX: low lane = (x[15:0] + y[31:16]) >> 1, high lane = (x[31:16] - y[15:0]) >> 1
+   */
+  __STATIC_FORCEINLINE uint32_t __SHSAX(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t xLo = ((q31_t)x << 16) >> 16, xHi = ((q31_t)x) >> 16;   /* sign-extended halfwords of x */
+    const q31_t yLo = ((q31_t)y << 16) >> 16, yHi = ((q31_t)y) >> 16;   /* sign-extended halfwords of y */
+    const q31_t lo  = ((xLo + yHi) >> 1) & (int32_t)0x0000FFFF;
+    const q31_t hi  = ((xHi - yLo) >> 1) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((hi << 16) | lo));
+  }
+
+
+  /*
+   * @brief C implementation of SMUSDX: (x[15:0] * y[31:16]) - (x[31:16] * y[15:0]) on signed halfwords
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUSDX(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t crossLo = (((q31_t)x << 16) >> 16) * (((q31_t)y) >> 16);   /* x[15:0] * y[31:16] */
+    return ((uint32_t)(crossLo - ((((q31_t)x) >> 16) * (((q31_t)y << 16) >> 16))));
+  }
+
+  /*
+   * @brief C implementation of SMUADX: (x[15:0] * y[31:16]) + (x[31:16] * y[15:0]) on signed halfwords
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUADX(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t crossLo = (((q31_t)x << 16) >> 16) * (((q31_t)y) >> 16);   /* x[15:0] * y[31:16] */
+    return ((uint32_t)(crossLo + ((((q31_t)x) >> 16) * (((q31_t)y << 16) >> 16))));
+  }
+
+
+  /*
+   * @brief C implementation of QADD: x + y computed in 64 bits, then clipped with clip_q63_to_q31
+   */
+  __STATIC_FORCEINLINE int32_t __QADD(
+  int32_t x,
+  int32_t y)
+  {
+    return (int32_t)clip_q63_to_q31((q63_t)x + (q63_t)y);
+  }
+
+
+  /*
+   * @brief C implementation of QSUB: x - y computed in 64 bits, then clipped with clip_q63_to_q31
+   */
+  __STATIC_FORCEINLINE int32_t __QSUB(
+  int32_t x,
+  int32_t y)
+  {
+    return (int32_t)clip_q63_to_q31((q63_t)x - (q63_t)y);
+  }
+
+
+  /*
+   * @brief C implementation of SMLAD: (x[15:0] * y[15:0]) + (x[31:16] * y[31:16]) + sum on signed halfwords
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLAD(
+  uint32_t x,
+  uint32_t y,
+  uint32_t sum)
+  {
+    const q31_t xLo = ((q31_t)x << 16) >> 16, xHi = ((q31_t)x) >> 16;   /* sign-extended halfwords of x */
+    const q31_t yLo = ((q31_t)y << 16) >> 16, yHi = ((q31_t)y) >> 16;   /* sign-extended halfwords of y */
+    return ((uint32_t)((xLo * yLo) + (xHi * yHi) + (q31_t)sum));
+  }
+
+
+  /*
+   * @brief C implementation of SMLADX: (x[15:0] * y[31:16]) + (x[31:16] * y[15:0]) + sum on signed halfwords
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLADX(
+  uint32_t x,
+  uint32_t y,
+  uint32_t sum)
+  {
+    const q31_t xLo = ((q31_t)x << 16) >> 16, xHi = ((q31_t)x) >> 16;   /* sign-extended halfwords of x */
+    const q31_t yLo = ((q31_t)y << 16) >> 16, yHi = ((q31_t)y) >> 16;   /* sign-extended halfwords of y */
+    return ((uint32_t)((xLo * yHi) + (xHi * yLo) + (q31_t)sum));
+  }
+
+
+  /*
+   * @brief C implementation of SMLSDX: (x[15:0] * y[31:16]) - (x[31:16] * y[15:0]) + sum on signed halfwords
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLSDX(
+  uint32_t x,
+  uint32_t y,
+  uint32_t sum)
+  {
+    const q31_t xLo = ((q31_t)x << 16) >> 16, xHi = ((q31_t)x) >> 16;   /* sign-extended halfwords of x */
+    const q31_t yLo = ((q31_t)y << 16) >> 16, yHi = ((q31_t)y) >> 16;   /* sign-extended halfwords of y */
+    return ((uint32_t)((xLo * yHi) - (xHi * yLo) + (q31_t)sum));
+  }
+
+
+  /*
+   * @brief C implementation of SMLALD: sum + (x[15:0] * y[15:0]) + (x[31:16] * y[31:16]), accumulated modulo 2^64
+   */
+  __STATIC_FORCEINLINE uint64_t __SMLALD(
+  uint32_t x,
+  uint32_t y,
+  uint64_t sum)
+  {
+    const q31_t xLo = ((q31_t)x << 16) >> 16, xHi = ((q31_t)x) >> 16;   /* sign-extended halfwords of x */
+    const q31_t yLo = ((q31_t)y << 16) >> 16, yHi = ((q31_t)y) >> 16;   /* sign-extended halfwords of y */
+    /* Widen each product to q63_t before adding: two (-32768 * -32768) products total 2^31, which overflows q31_t. */
+    return (sum + (uint64_t)((q63_t)(xLo * yLo) + (q63_t)(xHi * yHi)));
+  }
+
+
+  /*
+   * @brief C implementation of SMLALDX: sum + (x[15:0] * y[31:16]) + (x[31:16] * y[15:0]), accumulated modulo 2^64
+   */
+  __STATIC_FORCEINLINE uint64_t __SMLALDX(
+  uint32_t x,
+  uint32_t y,
+  uint64_t sum)
+  {
+    const q31_t xLo = ((q31_t)x << 16) >> 16, xHi = ((q31_t)x) >> 16;   /* sign-extended halfwords of x */
+    const q31_t yLo = ((q31_t)y << 16) >> 16, yHi = ((q31_t)y) >> 16;   /* sign-extended halfwords of y */
+    /* Widen each product to q63_t before adding: two (-32768 * -32768) products total 2^31, which overflows q31_t. */
+    return (sum + (uint64_t)((q63_t)(xLo * yHi) + (q63_t)(xHi * yLo)));
+  }
+
+
+  /*
+   * @brief C implementation of SMUAD: (x[15:0] * y[15:0]) + (x[31:16] * y[31:16]) on signed halfwords
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUAD(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t loProd = (((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16);   /* x[15:0] * y[15:0] */
+    return ((uint32_t)(loProd + ((((q31_t)x) >> 16) * (((q31_t)y) >> 16))));
+  }
+
+
+  /*
+   * @brief C implementation of SMUSD: (x[15:0] * y[15:0]) - (x[31:16] * y[31:16]) on signed halfwords
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUSD(
+  uint32_t x,
+  uint32_t y)
+  {
+    const q31_t loProd = (((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16);   /* x[15:0] * y[15:0] */
+    return ((uint32_t)(loProd - ((((q31_t)x) >> 16) * (((q31_t)y) >> 16))));
+  }
+
+
+  /*
+   * @brief C implementation of SXTB16: sign-extend byte x[7:0] into bits [15:0] and byte x[23:16] into bits [31:16]
+   */
+  __STATIC_FORCEINLINE uint32_t __SXTB16(
+  uint32_t x)
+  {
+    const q31_t lo = (((q31_t)x << 24) >> 24) & (q31_t)0x0000FFFF;   /* sign-extended x[7:0], low halfword */
+    return ((uint32_t)(lo | ((((q31_t)x <<  8) >>  8) & (q31_t)0xFFFF0000)));
+  }
+
+  /*
+   * @brief C implementation of SMMLA: sum + the upper 32 bits of the 64-bit product x * y
+   */
+  __STATIC_FORCEINLINE int32_t __SMMLA(
+  int32_t x,
+  int32_t y,
+  int32_t sum)
+  {
+    return ((int32_t)(((int64_t)x * (int64_t)y) >> 32) + sum);
+  }
+
+#endif /* !defined (ARM_MATH_DSP) */
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _NONE_H_ */
diff --git a/CMSIS/DSP/Include/dsp/quaternion_math_functions.h b/CMSIS/DSP/Include/dsp/quaternion_math_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e1f2e0af91445fea519d60c8e36dd3bcdd0fe98
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/quaternion_math_functions.h
@@ -0,0 +1,159 @@
+/******************************************************************************
+ * @file     quaternion_math_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _QUATERNION_MATH_FUNCTIONS_H_
+#define _QUATERNION_MATH_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @defgroup groupQuaternionMath Quaternion Math Functions
+ * Functions that operate on quaternions and convert between
+ * rotation-matrix and quaternion representations.
+ */
+
+
+/**
+  @brief         Floating-point quaternion norm.
+  @param[in]     pInputQuaternions       points to the input vector of quaternions
+  @param[out]    pNorms                  points to the output vector of norms (one norm per input quaternion)
+  @param[in]     nbQuaternions           number of quaternions in the input vector, and of norms in the output vector
+  @return        none
+ */
+
+
+
+void arm_quaternion_norm_f32(const float32_t *pInputQuaternions, 
+    float32_t *pNorms,
+    uint32_t nbQuaternions);
+
+
+/**
+  @brief         Floating-point quaternion inverse, computed for each quaternion of a vector.
+  @param[in]     pInputQuaternions            points to the input vector of quaternions
+  @param[out]    pInverseQuaternions          points to the output vector of inverse quaternions
+  @param[in]     nbQuaternions                number of quaternions in each vector
+  @return        none
+ */
+
+void arm_quaternion_inverse_f32(const float32_t *pInputQuaternions, 
+    float32_t *pInverseQuaternions, 
+    uint32_t nbQuaternions);
+
+/**
+  @brief         Floating-point quaternion conjugate, computed for each quaternion of a vector.
+  @param[in]     pInputQuaternions            points to the input vector of quaternions
+  @param[out]    pConjugateQuaternions        points to the output vector of conjugate quaternions
+  @param[in]     nbQuaternions                number of quaternions in each vector
+  @return        none
+ */
+void arm_quaternion_conjugate_f32(const float32_t *pInputQuaternions,
+    float32_t *pConjugateQuaternions,
+    uint32_t nbQuaternions);
+
+/**
+  @brief         Floating-point normalization of quaternions, computed for each quaternion of a vector.
+  @param[in]     pInputQuaternions            points to the input vector of quaternions
+  @param[out]    pNormalizedQuaternions       points to the output vector of normalized quaternions
+  @param[in]     nbQuaternions                number of quaternions in each vector
+  @return        none
+ */
+void arm_quaternion_normalize_f32(const float32_t *pInputQuaternions,
+    float32_t *pNormalizedQuaternions,
+    uint32_t nbQuaternions);
+
+
+/**
+  @brief         Floating-point product of two quaternions.
+  @param[in]     qa       points to the first quaternion
+  @param[in]     qb       points to the second quaternion
+  @param[out]    r        points to where the product of the two quaternions is stored
+  @return        none
+ */
+void arm_quaternion_product_single_f32(const float32_t *qa, 
+    const float32_t *qb, 
+    float32_t *r);
+
+/**
+  @brief         Floating-point elementwise product of two arrays of quaternions.
+  @param[in]     qa                  points to the first array of quaternions
+  @param[in]     qb                  points to the second array of quaternions
+  @param[out]    r                   points to the array receiving the elementwise products
+  @param[in]     nbQuaternions       number of quaternions in each array
+  @return        none
+ */
+void arm_quaternion_product_f32(const float32_t *qa, 
+    const float32_t *qb, 
+    float32_t *r,
+    uint32_t nbQuaternions);
+
+/**
+ * @brief Conversion of quaternions to their equivalent rotation matrices.
+ * @param[in]       pInputQuaternions points to an array of normalized quaternions
+ * @param[out]      pOutputRotations points to an array of 3x3 rotations (in row order)
+ * @param[in]       nbQuaternions number of quaternions in the array
+ * @return none.
+ *
+ * <b>Format of rotation matrix</b>
+ * \par
+ * The quaternion a + ib + jc + kd is converted into rotation matrix:
+ *   a^2 + b^2 - c^2 - d^2                 2bc - 2ad                 2bd + 2ac
+ *               2bc + 2ad     a^2 - b^2 + c^2 - d^2                 2cd - 2ab
+ *               2bd - 2ac                 2cd + 2ab     a^2 - b^2 - c^2 + d^2
+ *
+ * Each rotation matrix is saved in row order : R00 R01 R02 R10 R11 R12 R20 R21 R22
+ */
+void arm_quaternion2rotation_f32(const float32_t *pInputQuaternions, 
+    float32_t *pOutputRotations, 
+    uint32_t nbQuaternions);
+
+/**
+ * @brief Conversion of rotation matrices to their equivalent quaternions.
+ * @param[in]       pInputRotations points to an array of 3x3 rotation matrices (in row order)
+ * @param[out]      pOutputQuaternions points to an array of quaternions
+ * @param[in]       nbQuaternions number of quaternions in the array
+ * @return none.
+*/
+void arm_rotation2quaternion_f32(const float32_t *pInputRotations, 
+    float32_t *pOutputQuaternions,  
+    uint32_t nbQuaternions);
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _QUATERNION_MATH_FUNCTIONS_H_ */
diff --git a/CMSIS/DSP/Include/dsp/statistics_functions.h b/CMSIS/DSP/Include/dsp/statistics_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee5c6926536924b0724ace876b727fbd45203ba9
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/statistics_functions.h
@@ -0,0 +1,586 @@
+/******************************************************************************
+ * @file     statistics_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _STATISTICS_FUNCTIONS_H_
+#define _STATISTICS_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#include "dsp/basic_math_functions.h"
+#include "dsp/fast_math_functions.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+
+/**
+ * @defgroup groupStats Statistics Functions
+ */
+
+/**
+ * @brief Computation of the LogSumExp
+ *
+ * In probabilistic computations, the dynamic range of the probability values can be very
+ * wide because they come from Gaussian functions.
+ * To avoid underflow and overflow issues, the values are represented by their log.
+ * In this representation, multiplying the original exp values is easy: their logs are added.
+ * But adding the original exp values requires some special handling, and that is the
+ * goal of the LogSumExp function.
+ *
+ * If the values are x1...xn, the function computes:
+ *
+ * ln(exp(x1) + ... + exp(xn)) and the computation is done in such a way that
+ * rounding issues are minimised.
+ *
+ * The maximum xm of the values is extracted and the function computes:
+ * xm + ln(exp(x1 - xm) + ... + exp(xn - xm))
+ *
+ * @param[in]  in          Pointer to an array of input values.
+ * @param[in]  blockSize   Number of samples in the input array.
+ * @return LogSumExp of the input values
+ *
+ */
+
+
+float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize);
+
+/**
+ * @brief Dot product with log arithmetic
+ *
+ * The vectors contain the logs of the samples.
+ *
+ * @param[in]       pSrcA points to the first input vector
+ * @param[in]       pSrcB points to the second input vector
+ * @param[in]       blockSize number of samples in each vector
+ * @param[in]       pTmpBuffer temporary buffer of length blockSize (non-const; presumably used as scratch space - confirm)
+ * @return The log of the dot product.
+ *
+ */
+
+
+float32_t arm_logsumexp_dot_prod_f32(const float32_t * pSrcA,
+  const float32_t * pSrcB,
+  uint32_t blockSize,
+  float32_t *pTmpBuffer);
+
+/**
+ * @brief Entropy of a float32_t vector
+ *
+ * @param[in]  pSrcA        Pointer to the array of input values.
+ * @param[in]  blockSize    Number of samples in the input array.
+ * @return     Entropy      -Sum(p ln p)
+ *
+ */
+
+
+float32_t arm_entropy_f32(const float32_t * pSrcA,uint32_t blockSize);
+
+
+/**
+ * @brief Entropy of a float64_t vector
+ *
+ * @param[in]  pSrcA        Pointer to the array of input values.
+ * @param[in]  blockSize    Number of samples in the input array.
+ * @return     Entropy      -Sum(p ln p)
+ *
+ */
+
+
+float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize);
+
+
+/**
+ * @brief Kullback-Leibler divergence between two probability distributions (float32_t)
+ *
+ * @param[in]  pSrcA         Pointer to an array of input values for probability distribution A.
+ * @param[in]  pSrcB         Pointer to an array of input values for probability distribution B.
+ * @param[in]  blockSize     Number of samples in each input array.
+ * @return Kullback-Leibler divergence D(A || B)
+ *
+ */
+float32_t arm_kullback_leibler_f32(const float32_t * pSrcA
+  ,const float32_t * pSrcB
+  ,uint32_t blockSize);
+
+
+/**
+ * @brief Kullback-Leibler divergence between two probability distributions (float64_t)
+ *
+ * @param[in]  pSrcA         Pointer to an array of input values for probability distribution A.
+ * @param[in]  pSrcB         Pointer to an array of input values for probability distribution B.
+ * @param[in]  blockSize     Number of samples in each input array.
+ * @return Kullback-Leibler divergence D(A || B)
+ *
+ */
+float64_t arm_kullback_leibler_f64(const float64_t * pSrcA, 
+                const float64_t * pSrcB, 
+                uint32_t blockSize);
+
+
+ /**
+   * @brief  Sum of the squares of the elements of a Q31 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the sum of squares is written (as q63_t)
+   */
+  void arm_power_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q63_t * pResult);
+
+
+  /**
+   * @brief  Sum of the squares of the elements of a floating-point vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the sum of squares is written
+   */
+  void arm_power_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Sum of the squares of the elements of a Q15 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the sum of squares is written (as q63_t)
+   */
+  void arm_power_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q63_t * pResult);
+
+
+  /**
+   * @brief  Sum of the squares of the elements of a Q7 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the sum of squares is written (as q31_t)
+   */
+  void arm_power_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Mean value of a Q7 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the mean value is written
+   */
+  void arm_mean_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * pResult);
+
+
+  /**
+   * @brief  Mean value of a Q15 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the mean value is written
+   */
+  void arm_mean_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult);
+
+
+  /**
+   * @brief  Mean value of a Q31 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the mean value is written
+   */
+  void arm_mean_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Mean value of a floating-point vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the mean value is written
+   */
+  void arm_mean_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Variance of the elements of a floating-point vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the variance is written
+   */
+  void arm_var_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Variance of the elements of a Q31 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the variance is written
+   */
+  void arm_var_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Variance of the elements of a Q15 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the variance is written
+   */
+  void arm_var_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult);
+
+
+  /**
+   * @brief  Root Mean Square of the elements of a floating-point vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the root mean square is written
+   */
+  void arm_rms_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Root Mean Square of the elements of a Q31 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the root mean square is written
+   */
+  void arm_rms_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Root Mean Square of the elements of a Q15 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the root mean square is written
+   */
+  void arm_rms_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult);
+
+
+  /**
+   * @brief  Standard deviation of the elements of a floating-point vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the standard deviation is written
+   */
+  void arm_std_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult);
+
+
+  /**
+   * @brief  Standard deviation of the elements of a Q31 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the standard deviation is written
+   */
+  void arm_std_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult);
+
+
+  /**
+   * @brief  Standard deviation of the elements of a Q15 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the standard deviation is written
+   */
+  void arm_std_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult);
+
+
+  
+  /**
+   * @brief  Minimum value of a Q7 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] result     points to where the minimum value is written
+   * @param[out] index      points to where the array index of the minimum value in the input buffer is written
+   */
+  void arm_min_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * result,
+        uint32_t * index);
+
+  /**
+   * @brief  Minimum value of absolute values of a Q7 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] result     points to where the minimum of the absolute values is written
+   * @param[out] index      points to where the array index of that minimum in the input buffer is written
+   */
+  void arm_absmin_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * result,
+        uint32_t * index);
+
+
+  /**
+   * @brief  Minimum value of a Q15 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the minimum value is written
+   * @param[out] pIndex     points to where the array index of the minimum value in the input buffer is written
+   */
+  void arm_min_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex);
+
+/**
+   * @brief  Minimum value of absolute values of a Q15 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the minimum of the absolute values is written
+   * @param[out] pIndex     points to where the array index of that minimum in the input buffer is written
+   */
+  void arm_absmin_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex);
+
+
+  /**
+   * @brief  Minimum value of a Q31 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the minimum value is written
+   * @param[out] pIndex     points to where the array index of the minimum value in the input buffer is written
+   */
+  void arm_min_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex);
+
+  /**
+   * @brief  Minimum value of absolute values of a Q31 vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the minimum of the absolute values is written
+   * @param[out] pIndex     points to where the array index of that minimum in the input buffer is written
+   */
+  void arm_absmin_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex);
+
+
+  /**
+   * @brief  Minimum value of a floating-point vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the minimum value is written
+   * @param[out] pIndex     points to where the array index of the minimum value in the input buffer is written
+   */
+  void arm_min_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex);
+
+  /**
+   * @brief  Minimum value of absolute values of a floating-point vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    points to where the minimum of the absolute values is written
+   * @param[out] pIndex     points to where the array index of that minimum in the input buffer is written
+   */
+  void arm_absmin_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex);
+
+
+/**
+ * @brief Maximum value of a Q7 vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum value returned here
+ * @param[out] pIndex     index of the maximum value in the input buffer returned here
+ */
+  void arm_max_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * pResult,
+        uint32_t * pIndex);
+
+/**
+ * @brief Maximum value of absolute values of a Q7 vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum of the absolute values returned here
+ * @param[out] pIndex     index of that maximum in the input buffer returned here
+ */
+  void arm_absmax_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * pResult,
+        uint32_t * pIndex);
+
+
+/**
+ * @brief Maximum value of a Q15 vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum value returned here
+ * @param[out] pIndex     index of the maximum value in the input buffer returned here
+ */
+  void arm_max_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex);
+
+/**
+ * @brief Maximum value of absolute values of a Q15 vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum of the absolute values returned here
+ * @param[out] pIndex     index of that maximum in the input buffer returned here
+ */
+  void arm_absmax_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex);
+
+/**
+ * @brief Maximum value of a Q31 vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum value returned here
+ * @param[out] pIndex     index of the maximum value in the input buffer returned here
+ */
+  void arm_max_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex);
+
+/**
+ * @brief Maximum value of absolute values of a Q31 vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum of the absolute values returned here
+ * @param[out] pIndex     index of that maximum in the input buffer returned here
+ */
+  void arm_absmax_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex);
+
+/**
+ * @brief Maximum value of a floating-point vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum value returned here
+ * @param[out] pIndex     index of the maximum value in the input buffer returned here
+ */
+  void arm_max_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex);
+
+/**
+ * @brief Maximum value of absolute values of a floating-point vector.
+ * @param[in]  pSrc       points to the input buffer
+ * @param[in]  blockSize  length of the input vector
+ * @param[out] pResult    maximum of the absolute values returned here
+ * @param[out] pIndex     index of that maximum in the input buffer returned here
+ */
+  void arm_absmax_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex);
+
+  /**
+    @brief         Maximum value of a floating-point vector (variant without an index output).
+    @param[in]     pSrc       points to the input vector
+    @param[in]     blockSize  number of samples in input vector
+    @param[out]    pResult    maximum value returned here
+    @return        none
+   */
+  void arm_max_no_idx_f32(
+      const float32_t *pSrc,
+      uint32_t   blockSize,
+      float32_t *pResult);
+
+
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _STATISTICS_FUNCTIONS_H_ */
diff --git a/CMSIS/DSP/Include/dsp/statistics_functions_f16.h b/CMSIS/DSP/Include/dsp/statistics_functions_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ed3a844ce8088224c815165a25ace3b63fa1849
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/statistics_functions_f16.h
@@ -0,0 +1,218 @@
+/******************************************************************************
+ * @file     statistics_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _STATISTICS_FUNCTIONS_F16_H_
+#define _STATISTICS_FUNCTIONS_F16_H_
+
+#include "arm_math_types_f16.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#include "dsp/basic_math_functions_f16.h"
+#include "dsp/fast_math_functions_f16.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+ /**
+   * @brief  Sum of the squares of the elements of a half-precision floating-point (f16) vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    sum of the squares returned here
+   */
+  void arm_power_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult);
+
+ /**
+   * @brief  Mean value of a half-precision floating-point (f16) vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    mean value returned here
+   */
+  void arm_mean_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult);
+
+  /**
+   * @brief  Variance of the elements of a half-precision floating-point (f16) vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    variance returned here
+   */
+  void arm_var_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult);
+
+ /**
+   * @brief  Root Mean Square of the elements of a half-precision floating-point (f16) vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    root mean square value returned here
+   */
+  void arm_rms_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult);
+
+ /**
+   * @brief  Standard deviation of the elements of a half-precision floating-point (f16) vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    standard deviation returned here
+   */
+  void arm_std_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult);
+
+ /**
+   * @brief  Minimum value of a half-precision floating-point (f16) vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    minimum value returned here
+   * @param[out] pIndex     array index of the minimum value in the input buffer returned here
+   */
+  void arm_min_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult,
+        uint32_t * pIndex);
+
+ /**
+   * @brief  Minimum of the absolute values of a half-precision floating-point (f16) vector.
+   * @param[in]  pSrc       points to the input vector
+   * @param[in]  blockSize  number of samples to process
+   * @param[out] pResult    minimum value returned here
+   * @param[out] pIndex     array index of the minimum value in the input buffer returned here
+   */
+  void arm_absmin_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult,
+        uint32_t * pIndex);
+
+/**
+ * @brief Maximum value of a half-precision floating-point (f16) vector.
+ * @param[in]  pSrc       points to the input vector
+ * @param[in]  blockSize  number of samples in the input vector
+ * @param[out] pResult    maximum value returned here
+ * @param[out] pIndex     index of the maximum value returned here
+ */
+  void arm_max_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult,
+        uint32_t * pIndex);
+
+/**
+ * @brief Maximum of the absolute values of a half-precision floating-point (f16) vector.
+ * @param[in]  pSrc       points to the input vector
+ * @param[in]  blockSize  number of samples in the input vector
+ * @param[out] pResult    maximum value returned here
+ * @param[out] pIndex     index of the maximum value returned here
+ */
+  void arm_absmax_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult,
+        uint32_t * pIndex);
+
+/**
+ * @brief Entropy of an f16 vector
+ *
+ * @param[in]  pSrcA        Array of input values.
+ * @param[in]  blockSize    Number of samples in the input array.
+ * @return     Entropy, -Sum(p ln p)
+ *
+ */
+
+
+float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize);
+/** @brief Log-sum-exp of an f16 vector; presumably log(sum(exp(in[i]))) over blockSize samples (TODO confirm). */
+float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize);
+
+/**
+ * @brief Dot product with log arithmetic
+ *
+ * The input vectors contain the logarithms of the samples.
+ *
+ * @param[in]       pSrcA points to the first input vector
+ * @param[in]       pSrcB points to the second input vector
+ * @param[in]       blockSize number of samples in each vector
+ * @param[in]       pTmpBuffer temporary buffer of length blockSize
+ * @return The log of the dot product.
+ *
+ */
+
+
+float16_t arm_logsumexp_dot_prod_f16(const float16_t * pSrcA,
+  const float16_t * pSrcB,
+  uint32_t blockSize,
+  float16_t *pTmpBuffer);
+
+/**
+ * @brief Kullback-Leibler divergence between two f16 probability distributions
+ *
+ * @param[in]  pSrcA         Pointer to an array of input values for probability distribution A.
+ * @param[in]  pSrcB         Pointer to an array of input values for probability distribution B.
+ * @param[in]  blockSize     Number of samples in each input array.
+ * @return Kullback-Leibler divergence D(A || B)
+ *
+ */
+float16_t arm_kullback_leibler_f16(const float16_t * pSrcA
+  ,const float16_t * pSrcB
+  ,uint32_t blockSize);
+
+/**
+    @brief         Maximum value of an f16 vector, without reporting its index.
+    @param[in]     pSrc       points to the input vector
+    @param[in]     blockSize  number of samples in the input vector
+    @param[out]    pResult    maximum value returned here
+    @return        none
+   */
+  void arm_max_no_idx_f16(
+      const float16_t *pSrc,
+      uint32_t   blockSize,
+      float16_t *pResult);
+
+
+
+#endif /*defined(ARM_FLOAT16_SUPPORTED)*/
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _STATISTICS_FUNCTIONS_F16_H_ */
diff --git a/CMSIS/DSP/Include/dsp/support_functions.h b/CMSIS/DSP/Include/dsp/support_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c2a7de3e7ed796bc07ee8b44d321f86b6f27cf9
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/support_functions.h
@@ -0,0 +1,427 @@
+/******************************************************************************
+ * @file     support_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _SUPPORT_FUNCTIONS_H_
+#define _SUPPORT_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @defgroup groupSupport Support Functions
+ */
+
+
+/**
+   * @brief Converts the elements of a floating-point vector to a Q31 vector.
+   * @param[in]  pSrc       points to the floating-point input vector
+   * @param[out] pDst       points to the Q31 output vector
+   * @param[in]  blockSize  number of samples to convert
+   */
+  void arm_float_to_q31(
+  const float32_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Converts the elements of a floating-point vector to a Q15 vector.
+   * @param[in]  pSrc       points to the floating-point input vector
+   * @param[out] pDst       points to the Q15 output vector
+   * @param[in]  blockSize  number of samples to convert
+   */
+  void arm_float_to_q15(
+  const float32_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief Converts the elements of a floating-point vector to a Q7 vector.
+   * @param[in]  pSrc       points to the floating-point input vector
+   * @param[out] pDst       points to the Q7 output vector
+   * @param[in]  blockSize  number of samples to convert
+   */
+  void arm_float_to_q7(
+  const float32_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of a Q31 vector to a floating-point vector.
+   * @param[in]  pSrc       points to the Q31 input vector
+   * @param[out] pDst       points to the floating-point output vector
+   * @param[in]  blockSize  number of samples to convert
+   */
+  void arm_q31_to_float(
+  const q31_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of a Q31 vector to a Q15 vector.
+   * @param[in]  pSrc       points to the Q31 input vector
+   * @param[out] pDst       points to the Q15 output vector
+   * @param[in]  blockSize  number of samples to convert
+   */
+  void arm_q31_to_q15(
+  const q31_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of a Q31 vector to a Q7 vector.
+   * @param[in]  pSrc       points to the Q31 input vector
+   * @param[out] pDst       points to the Q7 output vector
+   * @param[in]  blockSize  number of samples to convert
+   */
+  void arm_q31_to_q7(
+  const q31_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of a Q15 vector to a floating-point vector.
+   * @param[in]  pSrc       points to the Q15 input vector
+   * @param[out] pDst       points to the floating-point output vector
+   * @param[in]  blockSize  number of samples to convert
+   */
+  void arm_q15_to_float(
+  const q15_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of a Q15 vector to a Q31 vector.
+   * @param[in]  pSrc       points to the Q15 input vector
+   * @param[out] pDst       points to the Q31 output vector
+   * @param[in]  blockSize  number of samples to convert
+   */
+  void arm_q15_to_q31(
+  const q15_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of a Q15 vector to a Q7 vector.
+   * @param[in]  pSrc       points to the Q15 input vector
+   * @param[out] pDst       points to the Q7 output vector
+   * @param[in]  blockSize  number of samples to convert
+   */
+  void arm_q15_to_q7(
+  const q15_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of a Q7 vector to a floating-point vector.
+   * @param[in]  pSrc       points to the Q7 input vector
+   * @param[out] pDst       points to the floating-point output vector
+   * @param[in]  blockSize  number of samples to convert
+   */
+  void arm_q7_to_float(
+  const q7_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of a Q7 vector to a Q31 vector.
+   * @param[in]  pSrc       points to the Q7 input vector
+   * @param[out] pDst       points to the Q31 output vector
+   * @param[in]  blockSize  number of samples to convert
+   */
+  void arm_q7_to_q31(
+  const q7_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Converts the elements of a Q7 vector to a Q15 vector.
+   * @param[in]  pSrc       points to the Q7 input vector
+   * @param[out] pDst       points to the Q15 output vector
+   * @param[in]  blockSize  number of samples to convert
+   */
+  void arm_q7_to_q15(
+  const q7_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+
+
+  
+  /**
+   * @brief Enum for specifying the sorting algorithm
+   */
+  typedef enum
+  {
+    ARM_SORT_BITONIC   = 0,
+             /**< Bitonic sort   */
+    ARM_SORT_BUBBLE    = 1,
+             /**< Bubble sort    */
+    ARM_SORT_HEAP      = 2,
+             /**< Heap sort      */
+    ARM_SORT_INSERTION = 3,
+             /**< Insertion sort */
+    ARM_SORT_QUICK     = 4,
+             /**< Quick sort     */
+    ARM_SORT_SELECTION = 5
+             /**< Selection sort */
+  } arm_sort_alg;
+
+  /**
+   * @brief Enum for specifying the sorting order (direction)
+   */
+  typedef enum
+  {
+    ARM_SORT_DESCENDING = 0,
+             /**< Descending order (9 to 0) */
+    ARM_SORT_ASCENDING = 1
+             /**< Ascending order (0 to 9) */
+  } arm_sort_dir;
+
+  /**
+   * @brief Instance structure for the generic sorting function (see arm_sort_f32).
+   */
+  typedef struct            
+  {
+    arm_sort_alg alg;        /**< Sorting algorithm selected */
+    arm_sort_dir dir;        /**< Sorting order (direction)  */
+  } arm_sort_instance_f32;  
+
+  /** @brief Sorting function for f32 data, configured by the instance S.
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+  void arm_sort_f32(
+    const arm_sort_instance_f32 * S, 
+          float32_t * pSrc, 
+          float32_t * pDst, 
+          uint32_t blockSize);
+
+  /** @brief Initialization function for the sorting instance structure.
+   * @param[in,out]  S            points to an instance of the sorting structure.
+   * @param[in]      alg          Selected algorithm.
+   * @param[in]      dir          Sorting order.
+   */
+  void arm_sort_init_f32(
+    arm_sort_instance_f32 * S, 
+    arm_sort_alg alg, 
+    arm_sort_dir dir); 
+
+  /**
+   * @brief Instance structure for the merge sort function (see arm_merge_sort_f32).
+   */
+  typedef struct            
+  {
+    arm_sort_dir dir;        /**< Sorting order (direction)  */
+    float32_t * buffer;      /**< Working buffer */
+  } arm_merge_sort_instance_f32;  
+
+  /** @brief Merge sort function for f32 data.
+   * @param[in]      S          points to an instance of the merge sort structure.
+   * @param[in,out]  pSrc       points to the block of input data.
+   * @param[out]     pDst       points to the block of output data.
+   * @param[in]      blockSize  number of samples to process.
+   */
+  void arm_merge_sort_f32(
+    const arm_merge_sort_instance_f32 * S,
+          float32_t *pSrc,
+          float32_t *pDst,
+          uint32_t blockSize);
+
+  /** @brief Initialization function for the merge sort instance structure.
+   * @param[in,out]  S            points to an instance of the merge sort structure.
+   * @param[in]      dir          Sorting order.
+   * @param[in]      buffer       Working buffer.
+   */
+  void arm_merge_sort_init_f32(
+    arm_merge_sort_instance_f32 * S,
+    arm_sort_dir dir,
+    float32_t * buffer);
+
+ 
+ 
+  /**
+   * @brief  Copies the elements of a floating-point vector.
+   * @param[in]  pSrc       points to the source vector
+   * @param[out] pDst       points to the destination vector
+   * @param[in]  blockSize  number of samples to copy
+   */
+  void arm_copy_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Copies the elements of a Q7 vector.
+   * @param[in]  pSrc       points to the source vector
+   * @param[out] pDst       points to the destination vector
+   * @param[in]  blockSize  number of samples to copy
+   */
+  void arm_copy_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Copies the elements of a Q15 vector.
+   * @param[in]  pSrc       points to the source vector
+   * @param[out] pDst       points to the destination vector
+   * @param[in]  blockSize  number of samples to copy
+   */
+  void arm_copy_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Copies the elements of a Q31 vector.
+   * @param[in]  pSrc       points to the source vector
+   * @param[out] pDst       points to the destination vector
+   * @param[in]  blockSize  number of samples to copy
+   */
+  void arm_copy_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Fills a floating-point vector with a constant value.
+   * @param[in]  value      value to fill the vector with
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples to fill
+   */
+  void arm_fill_f32(
+        float32_t value,
+        float32_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Fills a Q7 vector with a constant value.
+   * @param[in]  value      value to fill the vector with
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples to fill
+   */
+  void arm_fill_q7(
+        q7_t value,
+        q7_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Fills a Q15 vector with a constant value.
+   * @param[in]  value      value to fill the vector with
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples to fill
+   */
+  void arm_fill_q15(
+        q15_t value,
+        q15_t * pDst,
+        uint32_t blockSize);
+
+
+  /**
+   * @brief  Fills a Q31 vector with a constant value.
+   * @param[in]  value      value to fill the vector with
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples to fill
+   */
+  void arm_fill_q31(
+        q31_t value,
+        q31_t * pDst,
+        uint32_t blockSize);
+
+
+
+
+
+
+
+/**
+ * @brief Weighted sum
+ *
+ *
+ * @param[in]    in            Array of input values.
+ * @param[in]    weigths       Array of weights.
+ * @param[in]    blockSize     Number of samples in the input array.
+ * @return Weighted sum
+ *
+ */
+float32_t arm_weighted_sum_f32(const float32_t *in
+  , const float32_t *weigths
+  , uint32_t blockSize);
+
+
+/**
+ * @brief Barycenter of a list of weighted vectors
+ *
+ *
+ * @param[in]    in         List of vectors (nbVectors vectors of dimension vecDim)
+ * @param[in]    weights    Weights of the vectors
+ * @param[out]   out        Barycenter
+ * @param[in]    nbVectors  Number of vectors
+ * @param[in]    vecDim     Dimension of space (vector dimension)
+ * @return       None
+ *
+ */
+void arm_barycenter_f32(const float32_t *in
+  , const float32_t *weights
+  , float32_t *out
+  , uint32_t nbVectors
+  , uint32_t vecDim);
+
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _SUPPORT_FUNCTIONS_H_ */
diff --git a/CMSIS/DSP/Include/dsp/support_functions_f16.h b/CMSIS/DSP/Include/dsp/support_functions_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..47b6535f1eda04bd09b1fca0a8a6552cfc4cd331
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/support_functions_f16.h
@@ -0,0 +1,129 @@
+/******************************************************************************
+ * @file     support_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _SUPPORT_FUNCTIONS_F16_H_
+#define _SUPPORT_FUNCTIONS_F16_H_
+
+#include "arm_math_types_f16.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+  /**
+   * @brief  Copies the elements of a half-precision floating-point (f16) vector.
+   * @param[in]  pSrc       points to the source vector
+   * @param[out] pDst       points to the destination vector
+   * @param[in]  blockSize  number of samples to copy
+   */
+void arm_copy_f16(const float16_t * pSrc, float16_t * pDst, uint32_t blockSize);
+
+  /**
+   * @brief  Fills a half-precision floating-point (f16) vector with a constant value.
+   * @param[in]  value      value to fill the vector with
+   * @param[out] pDst       points to the output vector
+   * @param[in]  blockSize  number of samples to fill
+   */
+void arm_fill_f16(float16_t value, float16_t * pDst, uint32_t blockSize);
+
+/**
+   * @brief Converts the elements of an f16 vector to a Q15 vector.
+   * @param[in]  pSrc       points to the f16 input vector
+   * @param[out] pDst       points to the q15 output vector
+   * @param[in]  blockSize  length of the input vector
+   */
+void arm_f16_to_q15(const float16_t * pSrc, q15_t * pDst, uint32_t blockSize);
+
+/**
+   * @brief Converts the elements of a Q15 vector to an f16 vector.
+   * @param[in]  pSrc       points to the q15 input vector
+   * @param[out] pDst       points to the f16 output vector
+   * @param[in]  blockSize  length of the input vector
+   */
+void arm_q15_to_f16(const q15_t * pSrc, float16_t * pDst, uint32_t blockSize);
+
+
+/**
+   * @brief Converts the elements of an f32 vector to an f16 vector.
+   * @param[in]  pSrc       points to the f32 input vector
+   * @param[out] pDst       points to the f16 output vector
+   * @param[in]  blockSize  length of the input vector
+   */
+void arm_float_to_f16(const float32_t * pSrc, float16_t * pDst, uint32_t blockSize);
+
+/**
+   * @brief Converts the elements of an f16 vector to an f32 vector.
+   * @param[in]  pSrc       points to the f16 input vector
+   * @param[out] pDst       points to the f32 output vector
+   * @param[in]  blockSize  length of the input vector
+   */
+void arm_f16_to_float(const float16_t * pSrc, float32_t * pDst, uint32_t blockSize);
+
+/**
+ * @brief Weighted sum
+ *
+ *
+ * @param[in]    in            Array of input values.
+ * @param[in]    weigths       Array of weights.
+ * @param[in]    blockSize     Number of samples in the input array.
+ * @return Weighted sum
+ *
+ */
+float16_t arm_weighted_sum_f16(const float16_t *in
+  , const float16_t *weigths
+  , uint32_t blockSize);
+
+/**
+ * @brief Barycenter of a list of weighted vectors
+ *
+ *
+ * @param[in]    in         List of vectors (nbVectors vectors of dimension vecDim)
+ * @param[in]    weights    Weights of the vectors
+ * @param[out]   out        Barycenter
+ * @param[in]    nbVectors  Number of vectors
+ * @param[in]    vecDim     Dimension of space (vector dimension)
+ * @return       None
+ *
+ */
+void arm_barycenter_f16(const float16_t *in
+  , const float16_t *weights
+  , float16_t *out
+  , uint32_t nbVectors
+  , uint32_t vecDim);
+
+#endif /*defined(ARM_FLOAT16_SUPPORTED)*/
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _SUPPORT_FUNCTIONS_F16_H_ */
diff --git a/CMSIS/DSP/Include/dsp/svm_defines.h b/CMSIS/DSP/Include/dsp/svm_defines.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f6001f327f52afaf3e9c37226ec8994c62c42b3
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/svm_defines.h
@@ -0,0 +1,46 @@
+/******************************************************************************
+ * @file     svm_defines.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _SVM_DEFINES_H_
+#define _SVM_DEFINES_H_
+
+/**
+ * @brief Enum for specifying the SVM kernel type
+ */
+typedef enum
+{
+    ARM_ML_KERNEL_LINEAR = 0,
+             /**< Linear kernel */
+    ARM_ML_KERNEL_POLYNOMIAL = 1,
+             /**< Polynomial kernel */
+    ARM_ML_KERNEL_RBF = 2,
+             /**< Radial Basis Function kernel */
+    ARM_ML_KERNEL_SIGMOID = 3
+             /**< Sigmoid kernel */
+} arm_ml_kernel_type;
+
+#endif
diff --git a/CMSIS/DSP/Include/dsp/svm_functions.h b/CMSIS/DSP/Include/dsp/svm_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..8fdcb13e111f2d3d37ae158839ec014376940b77
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/svm_functions.h
@@ -0,0 +1,299 @@
+/******************************************************************************
+ * @file     svm_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _SVM_FUNCTIONS_H_
+#define _SVM_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+#include "dsp/svm_defines.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#define STEP(x) (((x) <= 0) ? 0 : 1) /* 0 if x <= 0, else 1; fully parenthesized for safe use in expressions */
+
+/**
+ * @defgroup groupSVM SVM Functions
+ * This set of functions implements SVM classification on 2 classes.
+ * The training must be done from scikit-learn. The parameters can be easily
+ * generated from the scikit-learn object. Some examples are given in
+ * DSP/Testing/PatternGeneration/SVM.py
+ *
+ * If more than 2 classes are needed, the functions in this folder 
+ * will have to be used, as building blocks, to do multi-class classification.
+ *
+ * No multi-class classification is provided in this SVM folder.
+ * 
+ */
+
+/**
+ * @brief Raises a value to an integer power by repeated multiplication
+ * @param[in]    x           base value
+ * @param[in]    nb          integer exponent, expected to be >= 1
+ * @return x^nb (x itself is returned when nb <= 1)
+ *
+ */
+__STATIC_INLINE float32_t arm_exponent_f32(float32_t x, int32_t nb)
+{
+    float32_t acc = x;
+    int32_t left;
+
+    for (left = nb - 1; left > 0; left--)
+    {
+        acc *= x;
+    }
+    return acc;
+}
+
+  
+
+
+
+/**
+ * @brief Instance structure for the linear SVM prediction function.
+ */
+typedef struct
+{
+  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
+  uint32_t        vectorDimension;        /**< Dimension of vector space */
+  float32_t       intercept;              /**< Intercept */
+  const float32_t *dualCoefficients;      /**< Array of dual coefficients */
+  const float32_t *supportVectors;        /**< Array of support vectors */
+  const int32_t   *classes;               /**< IDs of the two SVM classes */
+} arm_svm_linear_instance_f32;
+
+
+/**
+ * @brief Instance structure for the polynomial SVM prediction function.
+ */
+typedef struct
+{
+  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
+  uint32_t        vectorDimension;        /**< Dimension of vector space */
+  float32_t       intercept;              /**< Intercept */
+  const float32_t *dualCoefficients;      /**< Array of dual coefficients */
+  const float32_t *supportVectors;        /**< Array of support vectors */
+  const int32_t   *classes;               /**< IDs of the two SVM classes */
+  int32_t         degree;                 /**< Polynomial degree */
+  float32_t       coef0;                  /**< Polynomial constant (coef0 in scikit-learn terminology) */
+  float32_t       gamma;                  /**< Gamma factor */
+} arm_svm_polynomial_instance_f32;
+
+/**
+ * @brief Instance structure for the radial basis function (RBF) SVM prediction function.
+ */
+typedef struct
+{
+  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
+  uint32_t        vectorDimension;        /**< Dimension of vector space */
+  float32_t       intercept;              /**< Intercept */
+  const float32_t *dualCoefficients;      /**< Array of dual coefficients */
+  const float32_t *supportVectors;        /**< Array of support vectors */
+  const int32_t   *classes;               /**< IDs of the two SVM classes */
+  float32_t       gamma;                  /**< Gamma factor */
+} arm_svm_rbf_instance_f32;
+
+/**
+ * @brief Instance structure for the sigmoid SVM prediction function.
+ */
+typedef struct
+{
+  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
+  uint32_t        vectorDimension;        /**< Dimension of vector space */
+  float32_t       intercept;              /**< Intercept */
+  const float32_t *dualCoefficients;      /**< Array of dual coefficients */
+  const float32_t *supportVectors;        /**< Array of support vectors */
+  const int32_t   *classes;               /**< IDs of the two SVM classes */
+  float32_t       coef0;                  /**< Independent constant */
+  float32_t       gamma;                  /**< Gamma factor */
+} arm_svm_sigmoid_instance_f32;
+
+/**
+ * @brief        SVM linear instance init function
+ * @param[in]    S                      points to an instance of the linear SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of the 2 class IDs
+ * @return none.
+ *
+ */
+
+
+void arm_svm_linear_init_f32(arm_svm_linear_instance_f32 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float32_t intercept,
+  const float32_t *dualCoefficients,
+  const float32_t *supportVectors,
+  const int32_t  *classes);
+
+/**
+ * @brief SVM linear prediction
+ * @param[in]    S          points to an instance of the linear SVM structure.
+ * @param[in]    in         points to the input vector
+ * @param[out]   pResult    decision value returned here
+ * @return none.
+ *
+ */
+  
+void arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 *S, 
+   const float32_t * in, 
+   int32_t * pResult);
+
+
+/**
+ * @brief        SVM polynomial instance init function
+ * @param[in]    S                      points to an instance of the polynomial SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of the 2 class IDs
+ * @param[in]    degree                 Polynomial degree
+ * @param[in]    coef0                  coef0 (scikit-learn terminology)
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+
+void arm_svm_polynomial_init_f32(arm_svm_polynomial_instance_f32 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float32_t intercept,
+  const float32_t *dualCoefficients,
+  const float32_t *supportVectors,
+  const int32_t   *classes,
+  int32_t      degree,
+  float32_t coef0,
+  float32_t gamma
+  );
+
+/**
+ * @brief        SVM polynomial prediction function
+ * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
+ * @param[in]    in         Pointer to the input vector
+ * @param[out]   pResult    Pointer to the location receiving the decision value
+ * @return none.
+ *
+ */
+void arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 *S, 
+   const float32_t * in, 
+   int32_t * pResult);
+
+
+/**
+ * @brief        SVM radial basis function instance init function
+ * @param[in]    S                      Points to an instance of the rbf SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of the 2 class IDs
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_rbf_init_f32(arm_svm_rbf_instance_f32 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float32_t intercept,
+  const float32_t *dualCoefficients,
+  const float32_t *supportVectors,
+  const int32_t   *classes,
+  float32_t gamma
+  );
+
+/**
+ * @brief        SVM rbf prediction function
+ * @param[in]    S         Pointer to an instance of the rbf SVM structure.
+ * @param[in]    in        Pointer to the input vector
+ * @param[out]   pResult   Pointer to the location receiving the decision value
+ * @return none.
+ *
+ */
+void arm_svm_rbf_predict_f32(const arm_svm_rbf_instance_f32 *S, 
+   const float32_t * in, 
+   int32_t * pResult);
+
+/**
+ * @brief        SVM sigmoid instance init function
+ * @param[in]    S                      Points to an instance of the sigmoid SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of the 2 class IDs
+ * @param[in]    coef0                  coef0 (scikit-learn terminology)
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_sigmoid_init_f32(arm_svm_sigmoid_instance_f32 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float32_t intercept,
+  const float32_t *dualCoefficients,
+  const float32_t *supportVectors,
+  const int32_t   *classes,
+  float32_t coef0,
+  float32_t gamma
+  );
+
+/**
+ * @brief        SVM sigmoid prediction function
+ * @param[in]    S        Pointer to an instance of the sigmoid SVM structure.
+ * @param[in]    in       Pointer to the input vector
+ * @param[out]   pResult  Pointer to the location receiving the decision value
+ * @return none.
+ *
+ */
+void arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 *S, 
+   const float32_t * in, 
+   int32_t * pResult);
+
+
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _SVM_FUNCTIONS_H_ */
diff --git a/CMSIS/DSP/Include/dsp/svm_functions_f16.h b/CMSIS/DSP/Include/dsp/svm_functions_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..b80ed7cf9041dcf43300d65c648b9506d46a6afd
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/svm_functions_f16.h
@@ -0,0 +1,298 @@
+/******************************************************************************
+ * @file     svm_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _SVM_FUNCTIONS_F16_H_
+#define _SVM_FUNCTIONS_F16_H_
+
+#include "arm_math_types_f16.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+#include "dsp/svm_defines.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#define STEP(x) (x) <= 0 ? 0 : 1 /* NOTE(review): yields 0 when x <= 0, else 1; the expansion is not parenthesized, so use it only where it forms a complete expression (e.g. an array index) */
+
+/**
+ * @defgroup groupSVM SVM Functions
+ * This set of functions implements SVM classification on 2 classes.
+ * The training must be done from scikit-learn. The parameters can be easily
+ * generated from the scikit-learn object. Some examples are given in
+ * DSP/Testing/PatternGeneration/SVM.py
+ *
+ * If more than 2 classes are needed, the functions in this folder 
+ * will have to be used, as building blocks, to do multi-class classification.
+ *
+ * No multi-class classification is provided in this SVM folder.
+ * 
+ */
+
+/**
+ * @brief Raise a value to a positive integer power by repeated multiplication
+ * @param[in]    x           base value
+ * @param[in]    nb          integer exponent, expected to be >= 1
+ * @return x^nb (x itself is returned when nb <= 1)
+ *
+ */
+__STATIC_INLINE float16_t arm_exponent_f16(float16_t x, int32_t nb)
+{
+    float16_t power = x;
+    int32_t remaining;
+    /* power already holds x^1, so nb - 1 further multiplications remain */
+    for (remaining = nb - 1; remaining > 0; remaining--)
+    {
+        power = power * x;
+    }
+    return(power);
+}
+
+
+/**
+ * @brief Instance structure for the f16 linear SVM prediction function.
+ */
+typedef struct
+{
+  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
+  uint32_t        vectorDimension;        /**< Dimension of vector space */
+  float16_t       intercept;              /**< Intercept */
+  const float16_t *dualCoefficients;      /**< Points to the dual coefficients */
+  const float16_t *supportVectors;        /**< Points to the support vectors */
+  const int32_t   *classes;               /**< Points to the two SVM class IDs */
+} arm_svm_linear_instance_f16;
+
+
+/**
+ * @brief Instance structure for the f16 polynomial SVM prediction function.
+ */
+typedef struct
+{
+  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
+  uint32_t        vectorDimension;        /**< Dimension of vector space */
+  float16_t       intercept;              /**< Intercept */
+  const float16_t *dualCoefficients;      /**< Points to the dual coefficients */
+  const float16_t *supportVectors;        /**< Points to the support vectors */
+  const int32_t   *classes;               /**< Points to the two SVM class IDs */
+  int32_t         degree;                 /**< Polynomial degree */
+  float16_t       coef0;                  /**< Polynomial constant (coef0) */
+  float16_t       gamma;                  /**< Gamma factor */
+} arm_svm_polynomial_instance_f16;
+
+/**
+ * @brief Instance structure for the f16 rbf SVM prediction function.
+ */
+typedef struct
+{
+  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
+  uint32_t        vectorDimension;        /**< Dimension of vector space */
+  float16_t       intercept;              /**< Intercept */
+  const float16_t *dualCoefficients;      /**< Points to the dual coefficients */
+  const float16_t *supportVectors;        /**< Points to the support vectors */
+  const int32_t   *classes;               /**< Points to the two SVM class IDs */
+  float16_t       gamma;                  /**< Gamma factor */
+} arm_svm_rbf_instance_f16;
+
+/**
+ * @brief Instance structure for the f16 sigmoid SVM prediction function.
+ */
+typedef struct
+{
+  uint32_t        nbOfSupportVectors;     /**< Number of support vectors */
+  uint32_t        vectorDimension;        /**< Dimension of vector space */
+  float16_t       intercept;              /**< Intercept */
+  const float16_t *dualCoefficients;      /**< Points to the dual coefficients */
+  const float16_t *supportVectors;        /**< Points to the support vectors */
+  const int32_t   *classes;               /**< Points to the two SVM class IDs */
+  float16_t       coef0;                  /**< Independent constant (coef0) */
+  float16_t       gamma;                  /**< Gamma factor */
+} arm_svm_sigmoid_instance_f16;
+
+/**
+ * @brief        SVM linear instance init function
+ * @param[in]    S                      Points to an instance of the linear SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of the 2 class IDs
+ * @return none.
+ *
+ */
+
+
+void arm_svm_linear_init_f16(arm_svm_linear_instance_f16 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float16_t intercept,
+  const float16_t *dualCoefficients,
+  const float16_t *supportVectors,
+  const int32_t  *classes);
+
+/**
+ * @brief        SVM linear prediction function
+ * @param[in]    S          Pointer to an instance of the linear SVM structure.
+ * @param[in]    in         Pointer to the input vector
+ * @param[out]   pResult    Pointer to the location receiving the decision value
+ * @return none.
+ *
+ */
+  
+void arm_svm_linear_predict_f16(const arm_svm_linear_instance_f16 *S, 
+   const float16_t * in, 
+   int32_t * pResult);
+
+
+/**
+ * @brief        SVM polynomial instance init function
+ * @param[in]    S                      Points to an instance of the polynomial SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of the 2 class IDs
+ * @param[in]    degree                 Polynomial degree
+ * @param[in]    coef0                  coef0 (scikit-learn terminology)
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+
+void arm_svm_polynomial_init_f16(arm_svm_polynomial_instance_f16 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float16_t intercept,
+  const float16_t *dualCoefficients,
+  const float16_t *supportVectors,
+  const int32_t   *classes,
+  int32_t      degree,
+  float16_t coef0,
+  float16_t gamma
+  );
+
+/**
+ * @brief        SVM polynomial prediction function
+ * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
+ * @param[in]    in         Pointer to the input vector
+ * @param[out]   pResult    Pointer to the location receiving the decision value
+ * @return none.
+ *
+ */
+void arm_svm_polynomial_predict_f16(const arm_svm_polynomial_instance_f16 *S, 
+   const float16_t * in, 
+   int32_t * pResult);
+
+
+/**
+ * @brief        SVM radial basis function instance init function
+ * @param[in]    S                      Points to an instance of the rbf SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of the 2 class IDs
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_rbf_init_f16(arm_svm_rbf_instance_f16 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float16_t intercept,
+  const float16_t *dualCoefficients,
+  const float16_t *supportVectors,
+  const int32_t   *classes,
+  float16_t gamma
+  );
+
+/**
+ * @brief        SVM rbf prediction function
+ * @param[in]    S         Pointer to an instance of the rbf SVM structure.
+ * @param[in]    in        Pointer to the input vector
+ * @param[out]   pResult   Pointer to the location receiving the decision value
+ * @return none.
+ *
+ */
+void arm_svm_rbf_predict_f16(const arm_svm_rbf_instance_f16 *S, 
+   const float16_t * in, 
+   int32_t * pResult);
+
+/**
+ * @brief        SVM sigmoid instance init function
+ * @param[in]    S                      Points to an instance of the sigmoid SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of the 2 class IDs
+ * @param[in]    coef0                  coef0 (scikit-learn terminology)
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_sigmoid_init_f16(arm_svm_sigmoid_instance_f16 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float16_t intercept,
+  const float16_t *dualCoefficients,
+  const float16_t *supportVectors,
+  const int32_t   *classes,
+  float16_t coef0,
+  float16_t gamma
+  );
+
+/**
+ * @brief        SVM sigmoid prediction function
+ * @param[in]    S        Pointer to an instance of the sigmoid SVM structure.
+ * @param[in]    in       Pointer to the input vector
+ * @param[out]   pResult  Pointer to the location receiving the decision value
+ * @return none.
+ *
+ */
+void arm_svm_sigmoid_predict_f16(const arm_svm_sigmoid_instance_f16 *S, 
+   const float16_t * in, 
+   int32_t * pResult);
+
+
+
+#endif /*defined(ARM_FLOAT16_SUPPORTED)*/
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _SVM_FUNCTIONS_F16_H_ */
diff --git a/CMSIS/DSP/Include/dsp/transform_functions.h b/CMSIS/DSP/Include/dsp/transform_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf9c43c87789047188b97def0d2758ddf7b548b5
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/transform_functions.h
@@ -0,0 +1,592 @@
+/******************************************************************************
+ * @file     transform_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _TRANSFORM_FUNCTIONS_H_
+#define _TRANSFORM_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#include "dsp/basic_math_functions.h"
+#include "dsp/complex_math_functions.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+
+/**
+ * @defgroup groupTransforms Transform Functions
+ */
+
+
+  /**
+   * @brief Instance structure for the Radix-2 Q15 CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                 /**< length of the FFT. */
+          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const q15_t *pTwiddle;                 /**< points to the Sin twiddle factor table. */
+    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+  } arm_cfft_radix2_instance_q15;
+
+/* Deprecated */
+  arm_status arm_cfft_radix2_init_q15(
+        arm_cfft_radix2_instance_q15 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void arm_cfft_radix2_q15(
+  const arm_cfft_radix2_instance_q15 * S,
+        q15_t * pSrc);
+
+
+  /**
+   * @brief Instance structure for the Radix-4 Q15 CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                 /**< length of the FFT. */
+          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const q15_t *pTwiddle;                 /**< points to the twiddle factor table. */
+    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+  } arm_cfft_radix4_instance_q15;
+
+/* Deprecated */
+  arm_status arm_cfft_radix4_init_q15(
+        arm_cfft_radix4_instance_q15 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void arm_cfft_radix4_q15(
+  const arm_cfft_radix4_instance_q15 * S,
+        q15_t * pSrc);
+
+  /**
+   * @brief Instance structure for the Radix-2 Q31 CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                 /**< length of the FFT. */
+          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const q31_t *pTwiddle;                 /**< points to the twiddle factor table. */
+    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+  } arm_cfft_radix2_instance_q31;
+
+/* Deprecated */
+  arm_status arm_cfft_radix2_init_q31(
+        arm_cfft_radix2_instance_q31 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void arm_cfft_radix2_q31(
+  const arm_cfft_radix2_instance_q31 * S,
+        q31_t * pSrc);
+
+  /**
+   * @brief Instance structure for the Radix-4 Q31 CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                 /**< length of the FFT. */
+          uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const q31_t *pTwiddle;                 /**< points to the twiddle factor table. */
+    const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+  } arm_cfft_radix4_instance_q31;
+
+/* Deprecated */
+  void arm_cfft_radix4_q31(
+  const arm_cfft_radix4_instance_q31 * S,
+        q31_t * pSrc);
+
+/* Deprecated */
+  arm_status arm_cfft_radix4_init_q31(
+        arm_cfft_radix4_instance_q31 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+  /**
+   * @brief Instance structure for the Radix-2 floating-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+          uint8_t ifftFlag;                  /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;            /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const float32_t *pTwiddle;               /**< points to the twiddle factor table. */
+    const uint16_t *pBitRevTable;            /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;         /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;             /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+          float32_t onebyfftLen;             /**< value of 1/fftLen. */
+  } arm_cfft_radix2_instance_f32;
+
+
+/* Deprecated */
+  arm_status arm_cfft_radix2_init_f32(
+        arm_cfft_radix2_instance_f32 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void arm_cfft_radix2_f32(
+  const arm_cfft_radix2_instance_f32 * S,
+        float32_t * pSrc);
+
+  /**
+   * @brief Instance structure for the Radix-4 floating-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+          uint8_t ifftFlag;                  /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;            /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const float32_t *pTwiddle;               /**< points to the twiddle factor table. */
+    const uint16_t *pBitRevTable;            /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;         /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;             /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+          float32_t onebyfftLen;             /**< value of 1/fftLen. */
+  } arm_cfft_radix4_instance_f32;
+
+
+
+/* Deprecated */
+  arm_status arm_cfft_radix4_init_f32(
+        arm_cfft_radix4_instance_f32 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void arm_cfft_radix4_f32(
+  const arm_cfft_radix4_instance_f32 * S,
+        float32_t * pSrc);
+
+  /**
+   * @brief Instance structure for the Q15 fixed-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+    const q15_t *pTwiddle;             /**< points to the twiddle factor table. */
+    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
+          uint16_t bitRevLength;             /**< bit reversal table length. */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+   const uint32_t *rearranged_twiddle_tab_stride1_arr;        /**< Per stage reordered twiddle pointer (offset 1) */
+   const uint32_t *rearranged_twiddle_tab_stride2_arr;        /**< Per stage reordered twiddle pointer (offset 2) */
+   const uint32_t *rearranged_twiddle_tab_stride3_arr;        /**< Per stage reordered twiddle pointer (offset 3) */
+   const q15_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */
+   const q15_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */
+   const q15_t *rearranged_twiddle_stride3; /**< reordered twiddle offset 3 storage */
+#endif
+  } arm_cfft_instance_q15;
+
+arm_status arm_cfft_init_q15(
+  arm_cfft_instance_q15 * S,
+  uint16_t fftLen);
+
+void arm_cfft_q15(
+    const arm_cfft_instance_q15 * S,
+          q15_t * p1,
+          uint8_t ifftFlag,
+          uint8_t bitReverseFlag);
+
+  /**
+   * @brief Instance structure for the Q31 fixed-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+    const q31_t *pTwiddle;             /**< points to the twiddle factor table. */
+    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
+          uint16_t bitRevLength;             /**< bit reversal table length. */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+   const uint32_t *rearranged_twiddle_tab_stride1_arr;        /**< Per stage reordered twiddle pointer (offset 1) */
+   const uint32_t *rearranged_twiddle_tab_stride2_arr;        /**< Per stage reordered twiddle pointer (offset 2) */
+   const uint32_t *rearranged_twiddle_tab_stride3_arr;        /**< Per stage reordered twiddle pointer (offset 3) */
+   const q31_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */
+   const q31_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */
+   const q31_t *rearranged_twiddle_stride3; /**< reordered twiddle offset 3 storage */
+#endif
+  } arm_cfft_instance_q31;
+
+arm_status arm_cfft_init_q31(
+  arm_cfft_instance_q31 * S,
+  uint16_t fftLen);
+
+void arm_cfft_q31(
+    const arm_cfft_instance_q31 * S,
+          q31_t * p1,
+          uint8_t ifftFlag,
+          uint8_t bitReverseFlag);
+
+  /**
+   * @brief Instance structure for the single-precision floating-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+    const float32_t *pTwiddle;         /**< points to the twiddle factor table. */
+    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
+          uint16_t bitRevLength;             /**< bit reversal table length. */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+   const uint32_t *rearranged_twiddle_tab_stride1_arr;        /**< Per stage reordered twiddle pointer (offset 1) */
+   const uint32_t *rearranged_twiddle_tab_stride2_arr;        /**< Per stage reordered twiddle pointer (offset 2) */
+   const uint32_t *rearranged_twiddle_tab_stride3_arr;        /**< Per stage reordered twiddle pointer (offset 3) */
+   const float32_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */
+   const float32_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */
+   const float32_t *rearranged_twiddle_stride3; /**< reordered twiddle offset 3 storage */
+#endif
+  } arm_cfft_instance_f32;
+
+
+
+  arm_status arm_cfft_init_f32(
+  arm_cfft_instance_f32 * S,
+  uint16_t fftLen);
+
+  void arm_cfft_f32(
+  const arm_cfft_instance_f32 * S,
+        float32_t * p1,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+
+  /**
+   * @brief Instance structure for the double-precision floating-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+    const float64_t *pTwiddle;         /**< points to the twiddle factor table. */
+    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
+          uint16_t bitRevLength;             /**< bit reversal table length. */
+  } arm_cfft_instance_f64;
+
+  arm_status arm_cfft_init_f64(
+  arm_cfft_instance_f64 * S,
+  uint16_t fftLen);
+  
+  void arm_cfft_f64(
+  const arm_cfft_instance_f64 * S,
+        float64_t * p1,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+  /**
+   * @brief Instance structure for the Q15 RFFT/RIFFT function.
+   */
+  typedef struct
+  {
+          uint32_t fftLenReal;                      /**< length of the real FFT. */
+          uint8_t ifftFlagR;                        /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
+          uint8_t bitReverseFlagR;                  /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
+          uint32_t twidCoefRModifier;               /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+    const q15_t *pTwiddleAReal;                     /**< points to the real twiddle factor table. */
+    const q15_t *pTwiddleBReal;                     /**< points to the imag twiddle factor table. */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    arm_cfft_instance_q15 cfftInst;           /**< complex FFT instance, stored by value in this build configuration. */
+#else
+    const arm_cfft_instance_q15 *pCfft;       /**< points to the complex FFT instance. */
+#endif
+  } arm_rfft_instance_q15;
+
+  arm_status arm_rfft_init_q15(
+        arm_rfft_instance_q15 * S,
+        uint32_t fftLenReal,
+        uint32_t ifftFlagR,
+        uint32_t bitReverseFlag);
+
+  void arm_rfft_q15(
+  const arm_rfft_instance_q15 * S,
+        q15_t * pSrc,
+        q15_t * pDst);
+
+  /**
+   * @brief Instance structure for the Q31 RFFT/RIFFT function.
+   */
+  typedef struct
+  {
+          uint32_t fftLenReal;                        /**< length of the real FFT. */
+          uint8_t ifftFlagR;                          /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
+          uint8_t bitReverseFlagR;                    /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
+          uint32_t twidCoefRModifier;                 /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+    const q31_t *pTwiddleAReal;                       /**< points to the real twiddle factor table. */
+    const q31_t *pTwiddleBReal;                       /**< points to the imag twiddle factor table. */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+    arm_cfft_instance_q31 cfftInst;             /**< complex FFT instance, stored by value in this build configuration. */
+#else
+    const arm_cfft_instance_q31 *pCfft;         /**< points to the complex FFT instance. */
+#endif
+  } arm_rfft_instance_q31;
+
+  arm_status arm_rfft_init_q31(
+        arm_rfft_instance_q31 * S,
+        uint32_t fftLenReal,
+        uint32_t ifftFlagR,
+        uint32_t bitReverseFlag);
+
+  void arm_rfft_q31(
+  const arm_rfft_instance_q31 * S,
+        q31_t * pSrc,
+        q31_t * pDst);
+
+  /**
+   * @brief Instance structure for the floating-point RFFT/RIFFT function.
+   */
+  typedef struct
+  {
+          uint32_t fftLenReal;                        /**< length of the real FFT. */
+          uint16_t fftLenBy2;                         /**< length of the complex FFT. */
+          uint8_t ifftFlagR;                          /**< flag that selects forward (ifftFlagR=0) or inverse (ifftFlagR=1) transform. */
+          uint8_t bitReverseFlagR;                    /**< flag that enables (bitReverseFlagR=1) or disables (bitReverseFlagR=0) bit reversal of output. */
+          uint32_t twidCoefRModifier;                     /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+    const float32_t *pTwiddleAReal;                   /**< points to the real twiddle factor table. */
+    const float32_t *pTwiddleBReal;                   /**< points to the imag twiddle factor table. */
+          arm_cfft_radix4_instance_f32 *pCfft;        /**< points to the complex FFT instance. */
+  } arm_rfft_instance_f32;
+
+  arm_status arm_rfft_init_f32(
+        arm_rfft_instance_f32 * S,
+        arm_cfft_radix4_instance_f32 * S_CFFT,
+        uint32_t fftLenReal,
+        uint32_t ifftFlagR,
+        uint32_t bitReverseFlag);
+
+  void arm_rfft_f32(
+  const arm_rfft_instance_f32 * S,
+        float32_t * pSrc,
+        float32_t * pDst);
+
+  /**
+   * @brief Instance structure for the Double Precision Floating-point RFFT/RIFFT function.
+   */
+typedef struct
+  {
+          arm_cfft_instance_f64 Sint;      /**< Internal CFFT structure. */
+          uint16_t fftLenRFFT;             /**< length of the real sequence */
+    const float64_t * pTwiddleRFFT;        /**< Twiddle factors real stage  */
+  } arm_rfft_fast_instance_f64 ;
+
+arm_status arm_rfft_fast_init_f64 (
+         arm_rfft_fast_instance_f64 * S,
+         uint16_t fftLen);
+
+
+void arm_rfft_fast_f64(
+    arm_rfft_fast_instance_f64 * S,
+    float64_t * p, float64_t * pOut,
+    uint8_t ifftFlag);
+
+
+  /**
+   * @brief Instance structure for the floating-point fast RFFT/RIFFT function (arm_rfft_fast_f32).
+   */
+typedef struct
+  {
+          arm_cfft_instance_f32 Sint;      /**< Internal CFFT structure. */
+          uint16_t fftLenRFFT;             /**< length of the real sequence */
+    const float32_t * pTwiddleRFFT;        /**< Twiddle factors real stage  */
+  } arm_rfft_fast_instance_f32 ;
+
+arm_status arm_rfft_fast_init_f32 (
+         arm_rfft_fast_instance_f32 * S,
+         uint16_t fftLen);
+
+
+  void arm_rfft_fast_f32(
+        const arm_rfft_fast_instance_f32 * S,
+        float32_t * p, float32_t * pOut,
+        uint8_t ifftFlag);
+
+  /**
+   * @brief Instance structure for the floating-point DCT4/IDCT4 function.
+   */
+  typedef struct
+  {
+          uint16_t N;                          /**< length of the DCT4. */
+          uint16_t Nby2;                       /**< half of the length of the DCT4. */
+          float32_t normalize;                 /**< normalizing factor. */
+    const float32_t *pTwiddle;                 /**< points to the twiddle factor table. */
+    const float32_t *pCosFactor;               /**< points to the cosFactor table. */
+          arm_rfft_instance_f32 *pRfft;        /**< points to the real FFT instance. */
+          arm_cfft_radix4_instance_f32 *pCfft; /**< points to the complex FFT instance. */
+  } arm_dct4_instance_f32;
+
+
+  /**
+   * @brief  Initialization function for the floating-point DCT4/IDCT4.
+   * @param[in,out] S          points to an instance of floating-point DCT4/IDCT4 structure.
+   * @param[in]     S_RFFT     points to an instance of floating-point RFFT/RIFFT structure.
+   * @param[in]     S_CFFT     points to an instance of floating-point CFFT/CIFFT structure.
+   * @param[in]     N          length of the DCT4.
+   * @param[in]     Nby2       half of the length of the DCT4.
+   * @param[in]     normalize  normalizing factor.
+   * @return      arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if <code>N</code> is not a supported transform length.
+   */
+  arm_status arm_dct4_init_f32(
+        arm_dct4_instance_f32 * S,
+        arm_rfft_instance_f32 * S_RFFT,
+        arm_cfft_radix4_instance_f32 * S_CFFT,
+        uint16_t N,
+        uint16_t Nby2,
+        float32_t normalize);
+
+
+  /**
+   * @brief Processing function for the floating-point DCT4/IDCT4.
+   * @param[in]     S              points to an instance of the floating-point DCT4/IDCT4 structure.
+   * @param[in]     pState         points to state buffer.
+   * @param[in,out] pInlineBuffer  points to the in-place input and output buffer.
+   */
+  void arm_dct4_f32(
+  const arm_dct4_instance_f32 * S,
+        float32_t * pState,
+        float32_t * pInlineBuffer);
+
+
+  /**
+   * @brief Instance structure for the Q31 DCT4/IDCT4 function.
+   */
+  typedef struct
+  {
+          uint16_t N;                          /**< length of the DCT4. */
+          uint16_t Nby2;                       /**< half of the length of the DCT4. */
+          q31_t normalize;                     /**< normalizing factor. */
+    const q31_t *pTwiddle;                     /**< points to the twiddle factor table. */
+    const q31_t *pCosFactor;                   /**< points to the cosFactor table. */
+          arm_rfft_instance_q31 *pRfft;        /**< points to the real FFT instance. */
+          arm_cfft_radix4_instance_q31 *pCfft; /**< points to the complex FFT instance. */
+  } arm_dct4_instance_q31;
+
+
+  /**
+   * @brief  Initialization function for the Q31 DCT4/IDCT4.
+   * @param[in,out] S          points to an instance of Q31 DCT4/IDCT4 structure.
+   * @param[in]     S_RFFT     points to an instance of Q31 RFFT/RIFFT structure
+   * @param[in]     S_CFFT     points to an instance of Q31 CFFT/CIFFT structure
+   * @param[in]     N          length of the DCT4.
+   * @param[in]     Nby2       half of the length of the DCT4.
+   * @param[in]     normalize  normalizing factor.
+   * @return      arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if <code>N</code> is not a supported transform length.
+   */
+  arm_status arm_dct4_init_q31(
+        arm_dct4_instance_q31 * S,
+        arm_rfft_instance_q31 * S_RFFT,
+        arm_cfft_radix4_instance_q31 * S_CFFT,
+        uint16_t N,
+        uint16_t Nby2,
+        q31_t normalize);
+
+
+  /**
+   * @brief Processing function for the Q31 DCT4/IDCT4.
+   * @param[in]     S              points to an instance of the Q31 DCT4 structure.
+   * @param[in]     pState         points to state buffer.
+   * @param[in,out] pInlineBuffer  points to the in-place input and output buffer.
+   */
+  void arm_dct4_q31(
+  const arm_dct4_instance_q31 * S,
+        q31_t * pState,
+        q31_t * pInlineBuffer);
+
+
+  /**
+   * @brief Instance structure for the Q15 DCT4/IDCT4 function.
+   */
+  typedef struct
+  {
+          uint16_t N;                          /**< length of the DCT4. */
+          uint16_t Nby2;                       /**< half of the length of the DCT4. */
+          q15_t normalize;                     /**< normalizing factor. */
+    const q15_t *pTwiddle;                     /**< points to the twiddle factor table. */
+    const q15_t *pCosFactor;                   /**< points to the cosFactor table. */
+          arm_rfft_instance_q15 *pRfft;        /**< points to the real FFT instance. */
+          arm_cfft_radix4_instance_q15 *pCfft; /**< points to the complex FFT instance. */
+  } arm_dct4_instance_q15;
+
+
+  /**
+   * @brief  Initialization function for the Q15 DCT4/IDCT4.
+   * @param[in,out] S          points to an instance of Q15 DCT4/IDCT4 structure.
+   * @param[in]     S_RFFT     points to an instance of Q15 RFFT/RIFFT structure.
+   * @param[in]     S_CFFT     points to an instance of Q15 CFFT/CIFFT structure.
+   * @param[in]     N          length of the DCT4.
+   * @param[in]     Nby2       half of the length of the DCT4.
+   * @param[in]     normalize  normalizing factor.
+   * @return      arm_status function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if <code>N</code> is not a supported transform length.
+   */
+  arm_status arm_dct4_init_q15(
+        arm_dct4_instance_q15 * S,
+        arm_rfft_instance_q15 * S_RFFT,
+        arm_cfft_radix4_instance_q15 * S_CFFT,
+        uint16_t N,
+        uint16_t Nby2,
+        q15_t normalize);
+
+
+  /**
+   * @brief Processing function for the Q15 DCT4/IDCT4.
+   * @param[in]     S              points to an instance of the Q15 DCT4 structure.
+   * @param[in]     pState         points to state buffer.
+   * @param[in,out] pInlineBuffer  points to the in-place input and output buffer.
+   */
+  void arm_dct4_q15(
+  const arm_dct4_instance_q15 * S,
+        q15_t * pState,
+        q15_t * pInlineBuffer);
+
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _TRANSFORM_FUNCTIONS_H_ */
diff --git a/CMSIS/DSP/Include/dsp/transform_functions_f16.h b/CMSIS/DSP/Include/dsp/transform_functions_f16.h
new file mode 100644
index 0000000000000000000000000000000000000000..67f1adc21897651f677cc5e0604b1150f2357324
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/transform_functions_f16.h
@@ -0,0 +1,157 @@
+/******************************************************************************
+ * @file     transform_functions_f16.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     23 April 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _TRANSFORM_FUNCTIONS_F16_H_
+#define _TRANSFORM_FUNCTIONS_F16_H_
+
+#include "arm_math_types_f16.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+  /**
+   * @brief Instance structure for the half-precision floating-point radix-2 CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+          uint8_t ifftFlag;                  /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;            /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const float16_t *pTwiddle;               /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;            /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;         /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;             /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+          float16_t onebyfftLen;             /**< value of 1/fftLen. */
+  } arm_cfft_radix2_instance_f16;
+
+  /**
+   * @brief Instance structure for the half-precision floating-point radix-4 CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+          uint8_t ifftFlag;                  /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
+          uint8_t bitReverseFlag;            /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
+    const float16_t *pTwiddle;               /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;            /**< points to the bit reversal table. */
+          uint16_t twidCoefModifier;         /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
+          uint16_t bitRevFactor;             /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
+          float16_t onebyfftLen;             /**< value of 1/fftLen. */
+  } arm_cfft_radix4_instance_f16;
+
+  /**
+   * @brief Instance structure for the half-precision floating-point CFFT/CIFFT function.
+   */
+  typedef struct
+  {
+          uint16_t fftLen;                   /**< length of the FFT. */
+    const float16_t *pTwiddle;         /**< points to the Twiddle factor table. */
+    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
+          uint16_t bitRevLength;             /**< bit reversal table length. */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+   const uint32_t *rearranged_twiddle_tab_stride1_arr;        /**< Per stage reordered twiddle pointer (offset 1) */
+   const uint32_t *rearranged_twiddle_tab_stride2_arr;        /**< Per stage reordered twiddle pointer (offset 2) */
+   const uint32_t *rearranged_twiddle_tab_stride3_arr;        /**< Per stage reordered twiddle pointer (offset 3) */
+   const float16_t *rearranged_twiddle_stride1; /**< reordered twiddle offset 1 storage */
+   const float16_t *rearranged_twiddle_stride2; /**< reordered twiddle offset 2 storage */
+   const float16_t *rearranged_twiddle_stride3; /**< reordered twiddle offset 3 storage */
+#endif
+  } arm_cfft_instance_f16;
+
+
+  arm_status arm_cfft_init_f16(
+  arm_cfft_instance_f16 * S,
+  uint16_t fftLen);
+
+  void arm_cfft_f16(
+  const arm_cfft_instance_f16 * S,
+        float16_t * p1,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+  /**
+   * @brief Instance structure for the half-precision floating-point fast RFFT/RIFFT function.
+   */
+typedef struct
+  {
+          arm_cfft_instance_f16 Sint;      /**< Internal CFFT structure. */
+          uint16_t fftLenRFFT;             /**< length of the real sequence */
+    const float16_t * pTwiddleRFFT;        /**< Twiddle factors real stage  */
+  } arm_rfft_fast_instance_f16 ;
+
+arm_status arm_rfft_fast_init_f16 (
+         arm_rfft_fast_instance_f16 * S,
+         uint16_t fftLen);
+
+
+  void arm_rfft_fast_f16(
+        const arm_rfft_fast_instance_f16 * S,
+        float16_t * p, float16_t * pOut,
+        uint8_t ifftFlag);
+
+/* Deprecated */
+  arm_status arm_cfft_radix4_init_f16(
+        arm_cfft_radix4_instance_f16 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void arm_cfft_radix4_f16(
+  const arm_cfft_radix4_instance_f16 * S,
+        float16_t * pSrc);
+
+
+/* Deprecated */
+  arm_status arm_cfft_radix2_init_f16(
+        arm_cfft_radix2_instance_f16 * S,
+        uint16_t fftLen,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag);
+
+/* Deprecated */
+  void arm_cfft_radix2_f16(
+  const arm_cfft_radix2_instance_f16 * S,
+        float16_t * pSrc);
+  
+#endif /* defined(ARM_FLOAT16_SUPPORTED)*/
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _TRANSFORM_FUNCTIONS_F16_H_ */
diff --git a/CMSIS/DSP/Include/dsp/utils.h b/CMSIS/DSP/Include/dsp/utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f5acb3747e79bc8650702f396d2da91ea3c441c
--- /dev/null
+++ b/CMSIS/DSP/Include/dsp/utils.h
@@ -0,0 +1,240 @@
+/******************************************************************************
+ * @file     utils.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     20. July 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_MATH_UTILS_H_
+
+#define _ARM_MATH_UTILS_H_
+
+#include "arm_math_types.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * @brief Macros required for reciprocal calculation in Normalized LMS
+   */
+
+#define INDEX_MASK         0x0000003F   /* keeps the low 6 bits, i.e. a table index in 0..63 */
+
+/* Square of x. The argument is expanded twice, so do not pass expressions with side effects. */
+#define SQ(x) ((x) * (x))
+/* N rounded up to a multiple of S (integer division); presumably meant for positive integers. Arguments are expanded more than once. */
+#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))
+
+
+  /**
+   * @brief Function to calculate the 1/in (reciprocal) value of Q31 data type: table seed refined by two iterations, result stored in *dst.
+   */
+  __STATIC_FORCEINLINE uint32_t arm_recip_q31(
+        q31_t in,
+        q31_t * dst,
+  const q31_t * pRecipTable)
+  {
+    q31_t out;
+    uint32_t tempVal;
+    uint32_t index, i;
+    uint32_t signBits;
+
+    /* signBits = leading-zero count of |in| minus one (the sign bit); used below as the normalising left shift */
+    if (in > 0)
+    {
+      signBits = ((uint32_t) (__CLZ( in) - 1));
+    }
+    else
+    {
+      signBits = ((uint32_t) (__CLZ(-in) - 1));   /* NOTE(review): this branch also takes in == 0 and the most negative value, where -in overflows - confirm callers exclude them */
+    }
+
+    /* Convert input sample to 1.31 format */
+    in = (in << signBits);
+
+    /* calculation of index for initial approximated Val */
+    index = (uint32_t)(in >> 24);
+    index = (index & INDEX_MASK);   /* 6-bit index, so pRecipTable is read at positions 0..63 */
+
+    /* 1.31 with exp 1 */
+    out = pRecipTable[index];
+
+    /* calculation of reciprocal value */
+    /* running approximation for two iterations */
+    for (i = 0U; i < 2U; i++)
+    {
+      tempVal = (uint32_t) (((q63_t) in * out) >> 31);
+      tempVal = 0x7FFFFFFFu - tempVal;
+      /*      1.31 with exp 1 */
+      /* the product is narrowed through clip_q63_to_q31 rather than a plain (q31_t) cast */
+      out = clip_q63_to_q31(((q63_t) out * tempVal) >> 30);
+    }
+
+    /* write output */
+    *dst = out;
+
+    /* return num of signbits of out = 1/in value */
+    return (signBits + 1U);
+  }
+
+
+  /**
+   * @brief Function to calculate the 1/in (reciprocal) value of Q15 data type: table seed refined by two iterations, result stored in *dst.
+   */
+  __STATIC_FORCEINLINE uint32_t arm_recip_q15(
+        q15_t in,
+        q15_t * dst,
+  const q15_t * pRecipTable)
+  {
+    q15_t out = 0;
+    uint32_t tempVal = 0;
+    uint32_t index = 0, i = 0;
+    uint32_t signBits = 0;
+
+    /* signBits = leading-zero count of |in| minus 17; presumably 16 for the upper half of the 32-bit count plus 1 for the sign bit */
+    if (in > 0)
+    {
+      signBits = ((uint32_t)(__CLZ( in) - 17));
+    }
+    else
+    {
+      signBits = ((uint32_t)(__CLZ(-in) - 17));   /* NOTE(review): this branch also takes in == 0, giving a shift of 15 - confirm callers never pass zero */
+    }
+
+    /* Convert input sample to 1.15 format */
+    in = (in << signBits);
+
+    /* calculation of index for initial approximated Val */
+    index = (uint32_t)(in >>  8);
+    index = (index & INDEX_MASK);   /* 6-bit index, so pRecipTable is read at positions 0..63 */
+
+    /*      1.15 with exp 1  */
+    out = pRecipTable[index];
+
+    /* calculation of reciprocal value */
+    /* running approximation for two iterations */
+    for (i = 0U; i < 2U; i++)
+    {
+      tempVal = (uint32_t) (((q31_t) in * out) >> 15);
+      tempVal = 0x7FFFu - tempVal;
+      /*      1.15 with exp 1 */
+      out = (q15_t) (((q31_t) out * tempVal) >> 14);
+      /* NOTE(review): plain (q15_t) cast, no saturation; a clip_q31_to_q15 variant was left disabled here - confirm intended */
+    }
+
+    /* write output */
+    *dst = out;
+
+    /* return num of signbits of out = 1/in value */
+    return (signBits + 1);
+  }
+
+
+/**
+ * @brief  64-bit to 32-bit unsigned normalization
+ * @param[in]  in           is input unsigned long long value
+ * @param[out] normalized   is the 32-bit normalized value (most significant set bit of in moved to bit 30; 0 when in is 0)
+ * @param[out] norm         is norm scale: the left shift that was applied to in; a negative value means a right shift
+ */
+__STATIC_INLINE  void arm_norm_64_to_32u(uint64_t in, int32_t * normalized, int32_t *norm)
+{
+    int32_t     n1;
+    int32_t     hi = (int32_t) (in >> 32);
+    int32_t     lo = (int32_t) ((in << 32) >> 32);
+
+    n1 = __CLZ(hi) - 32;
+    if (!n1)
+    {
+        /*
+         * upper word is zero: input fits in 32-bit
+         */
+        n1 = __CLZ(lo);
+        if (!n1)
+        {
+            /*
+             * MSB set, need to scale down by 1
+             */
+            *norm = -1;
+            *normalized = (((uint32_t) lo) >> 1);
+        } else
+        {
+            if (n1 == 32)
+            {
+                /*
+                 * input is zero
+                 */
+                *norm = 0;
+                *normalized = 0;
+            } else
+            {
+                /*
+                 * 32-bit normalization: shift left until the top set bit sits at bit 30
+                 */
+                *norm = n1 - 1;
+                *normalized = lo << *norm;
+            }
+        }
+    } else
+    {
+        /*
+         * input needs more than 32 bits: n1 becomes 33 - CLZ(hi), the right shift to apply
+         */
+        n1 = 1 - n1;
+        *norm = -n1;
+        /* 64 bit normalization: n1 is in 2..33, so shift the 64-bit input directly;
+         * shifting the 32-bit halves by 32 or more (or by a negative count) is undefined.
+         */
+        *normalized = (int32_t) (in >> n1);
+    }
+}
+
+/**
+ * @brief  Divides a 64-bit numerator by a 32-bit denominator and returns the quotient as q31_t.
+ * @param[in] num  numerator
+ * @param[in] den  denominator; no zero check is performed here
+ * @return     num / den converted to q31_t (no saturation is applied)
+ */
+__STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
+{
+    q31_t     result;
+    uint64_t  absNum;
+    int32_t   normalized;   /* required by the helper's signature; its value is not used here */
+    int32_t   norm;
+
+    /* Magnitude by unsigned negation: a signed -num overflows for the most negative numerator. */
+    absNum = num > 0 ? (uint64_t) num : (uint64_t) 0 - (uint64_t) num;
+    arm_norm_64_to_32u(absNum, &normalized, &norm);
+
+    /* norm > 0 means the magnitude fits in 32 bits with headroom: avoid the costly 64-bit division. */
+    if (norm > 0) {
+        result = (q31_t) num / den;
+    } else {
+        result = (q31_t) (num / den);
+    }
+
+    return result;
+}
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /*ifndef _ARM_MATH_UTILS_H_ */
diff --git a/CMSIS/DSP/PrivateInclude/arm_vec_fft.h b/CMSIS/DSP/PrivateInclude/arm_vec_fft.h
index b372d5a62a908e732ad432ce06fbde90736a2b86..30dcb0e2347ef034f2fffd8513c1c7c44f7d0766 100644
--- a/CMSIS/DSP/PrivateInclude/arm_vec_fft.h
+++ b/CMSIS/DSP/PrivateInclude/arm_vec_fft.h
@@ -40,13 +40,280 @@ extern "C"
 #define MVE_CMPLX_MULT_FLT_AxB(A,B)         vcmlaq_rot90(vcmulq(A, B), A, B)
 #define MVE_CMPLX_MULT_FLT_Conj_AxB(A,B)    vcmlaq_rot270(vcmulq(A, B), A, B)
 
-#define MVE_CMPLX_MULT_FX_AxB(A,B)          vqdmladhxq(vqdmlsdhq((__typeof(A))vuninitializedq_s32(), A, B), A, B);
-#define MVE_CMPLX_MULT_FX_AxConjB(A,B)      vqdmladhq(vqdmlsdhxq((__typeof(A))vuninitializedq_s32(), A, B), A, B);
+#define MVE_CMPLX_MULT_FX_AxB(A,B)          vqdmladhxq(vqdmlsdhq((__typeof(A))vuninitializedq_s32(), A, B), A, B)
+#define MVE_CMPLX_MULT_FX_AxConjB(A,B)      vqdmladhq(vqdmlsdhxq((__typeof(A))vuninitializedq_s32(), A, B), A, B)
 
 #define MVE_CMPLX_ADD_FX_A_ixB(A, B)        vhcaddq_rot90(A,B)
 #define MVE_CMPLX_SUB_FX_A_ixB(A,B)         vhcaddq_rot270(A,B)
 
 
+/**
+  @brief         In-place 32 bit reversal function for helium; swaps 64-bit elements of the buffer pairwise using offsets from the table
+  @param[in,out] pSrc        points to in-place buffer of unknown 32-bit data type
+  @param[in]     bitRevLen   bit reversal table length
+  @param[in]     pBitRevTab  points to bit reversal table
+  @return        none
+*/
+
+__STATIC_INLINE void arm_bitreversal_32_inpl_mve(
+        uint32_t *pSrc,
+  const uint16_t  bitRevLen,
+  const uint16_t *pBitRevTab)
+
+{
+    uint64_t       *src = (uint64_t *) pSrc;
+    int32_t         blkCnt;     /* loop counters */
+    uint32x4_t      bitRevTabOff;
+    uint32x4_t      one = vdupq_n_u32(1);
+    uint64x2_t      inLow, inHigh;
+    uint64x2_t      bitRevOff1Low, bitRevOff0Low;
+    uint64x2_t      bitRevOff1High, bitRevOff0High;
+
+    /* load scheduling to increase gather load idx update / gather load distance */
+    bitRevTabOff = vldrhq_u32(pBitRevTab);
+    pBitRevTab += 4;
+
+    /* multiply by 1 to widen the table entries into the two 64-bit offset vectors */
+    bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
+    bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
+
+    /* each iteration consumes 8 table entries: the set loaded previously, then the set loaded at the top of the loop */
+    blkCnt = bitRevLen / 8;
+    while (blkCnt > 0) {
+        bitRevTabOff = vldrhq_u32(pBitRevTab);
+        pBitRevTab += 4;
+
+        /* 64-bit index expansion */
+        bitRevOff1Low = vmullbq_int_u32(bitRevTabOff, one);
+        bitRevOff1High = vmulltq_int_u32(bitRevTabOff, one);
+
+        inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
+        inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
+
+        /* swap: data read at the Low offsets is written to the High offsets and vice versa */
+        vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
+        vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
+
+
+        /* unrolled; NOTE(review): this load also runs on the final iteration, reading 4 entries past those consumed - confirm the table tolerates it */
+        bitRevTabOff = vldrhq_u32(pBitRevTab);
+        pBitRevTab += 4;
+
+        bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
+
+        inLow = vldrdq_gather_offset_u64(src, bitRevOff1Low);
+        inHigh = vldrdq_gather_offset_u64(src, bitRevOff1High);
+
+        vstrdq_scatter_offset_u64(src, bitRevOff1Low, inHigh);
+        vstrdq_scatter_offset_u64(src, bitRevOff1High, inLow);
+
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+
+    if (bitRevLen & 7) {
+        /* FFT size = 16: table length is not a multiple of 8, so swap the entries still pending in bitRevOff0Low/High */
+        inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
+        inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
+
+        vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
+        vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
+    }
+}
+
+
+
+/**
+  @brief         In-place 16 bit reversal function for helium; swaps 32-bit elements of the buffer pairwise using offsets from the table
+  @param[in,out] pSrc        points to in-place buffer of unknown 16-bit data type
+  @param[in]     bitRevLen   bit reversal table length
+  @param[in]     pBitRevTab  points to bit reversal table
+  @return        none
+*/
+
+__STATIC_INLINE void arm_bitreversal_16_inpl_mve(
+        uint16_t *pSrc,
+  const uint16_t bitRevLen,
+  const uint16_t *pBitRevTab)
+
+{
+    uint32_t       *src = (uint32_t *) pSrc;
+    int32_t         blkCnt;     /* loop counters */
+    uint32x4_t      bitRevTabOff;   /* NOTE(review): declared as 32-bit lanes but used with the _u16 intrinsics below - presumably relies on lax vector conversions, confirm */
+    uint16x8_t      one = vdupq_n_u16(1);
+    uint32x4_t      bitRevOff1Low, bitRevOff0Low;
+    uint32x4_t      bitRevOff1High, bitRevOff0High;
+    uint32x4_t      inLow, inHigh;
+
+    /* load scheduling to increase gather load idx update / gather load distance */
+    bitRevTabOff = vldrhq_u16(pBitRevTab);
+    pBitRevTab += 8;
+
+    /* multiply by 1 to widen the table entries, then shift right by 3 to form the element indices used by the shifted-offset gathers */
+    bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
+    bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
+    bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
+    bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+
+    /* each iteration consumes 16 table entries: the set loaded previously, then the set loaded at the top of the loop */
+    blkCnt = (bitRevLen / 16);
+    while (blkCnt > 0) {
+        bitRevTabOff = vldrhq_u16(pBitRevTab);
+        pBitRevTab += 8;
+
+        bitRevOff1Low = vmullbq_int_u16(bitRevTabOff, one);
+        bitRevOff1High = vmulltq_int_u16(bitRevTabOff, one);
+        bitRevOff1Low = vshrq_n_u16(bitRevOff1Low, 3);
+        bitRevOff1High = vshrq_n_u16(bitRevOff1High, 3);
+
+        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
+        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
+
+        /* swap: data read at the Low offsets is written to the High offsets and vice versa */
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
+
+        /* loop unrolling */
+        bitRevTabOff = vldrhq_u16(pBitRevTab);
+        pBitRevTab += 8;
+
+        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+
+        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
+        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);
+
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1Low, inHigh);
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1High, inLow);
+
+        blkCnt--;
+    }
+
+    /* tail handling: only remainders of 8 and 12 table entries are processed; other remainders do nothing */
+    blkCnt = bitRevLen & 0xf;
+    if (blkCnt == 8) {
+        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
+        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
+
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
+    } else if (blkCnt == 12) {
+        /* FFT 16 special case: 8 pending entries, then 4 more handled under a predicate built with vctp16q(4) */
+        mve_pred16_t    p = vctp16q(4);
+
+        bitRevTabOff = vldrhq_z_u16(pBitRevTab, p);
+
+        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
+        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
+
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
+
+        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+
+        inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
+        inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);
+
+        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0Low, inHigh, p);
+        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0High, inLow, p);
+    }
+}
+
+/**
+  @brief         Out-of-place 32 bit reversal function for helium; gathers from pSrc at bit-reversed offsets and stores to pDst sequentially
+  @param[out]   pDst        points to destination buffer of unknown 32-bit data type
+  @param[in]    pSrc        points to input buffer of unknown 32-bit data type
+  @param[in]    fftLen      FFT length
+  @return       none
+*/
+__STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
+{
+    uint32x4_t      idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
+    uint32_t        bitRevPos, blkCnt;
+    uint32_t       *pDst32 = (uint32_t *) pDst;
+
+    /* fwd indexes: only lanes 0 and 2 are set, since the vectors are later reinterpreted as two 64-bit offsets */
+    idxOffs0 = vdupq_n_u32(0);
+    idxOffs1 = vdupq_n_u32(0);
+    idxOffs0[0] = 0;    idxOffs0[2] = 4;
+    idxOffs1[0] = 8;    idxOffs1[2] = 12;
+
+    bitRevPos = (31 - __CLZ(fftLen)) + 5;   /* floor(log2(fftLen)) + 5 */
+    blkCnt = fftLen >> 2;
+
+    /* issued earlier to increase gather load idx update / gather load distance */
+    /* bit-reverse fwd indexes */
+    bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
+    bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
+    /* each iteration stores two 4-word vectors (8 uint32_t) to the destination */
+    while (blkCnt > 0) {
+        uint64x2_t      vecIn;
+
+        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs0);
+        idxOffs0 = idxOffs0 + 16;   /* advance the forward indexes for the next iteration */
+        vst1q(pDst32, (uint32x4_t) vecIn);
+        pDst32 += 4;
+        bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
+
+        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs1);
+        idxOffs1 = idxOffs1 + 16;
+        vst1q(pDst32, (uint32x4_t) vecIn);
+        pDst32 += 4;
+        bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
+
+        blkCnt--;
+    }
+}
+
+
+/**
+  @brief         Out-of-place 16 bit reversal function for helium; gathers from pSrc at bit-reversed offsets and stores to pDst sequentially
+  @param[out]   pDst        points to destination buffer of unknown 16-bit data type
+  @param[in]    pSrc        points to input buffer of unknown 16-bit data type
+  @param[in]    fftLen      FFT length
+  @return       none
+*/
+
+__STATIC_INLINE void arm_bitreversal_16_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
+{
+    uint32x4_t      idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
+    uint32_t        bitRevPos, blkCnt;
+    uint16_t       *pDst16 = (uint16_t *) pDst;
+    uint32_t        incrIdx = 0;
+
+    /* fwd indexes */
+    idxOffs0 = vidupq_wb_u32(&incrIdx, 4);    // {0, 4, 8, 12}
+    idxOffs1 = vidupq_wb_u32(&incrIdx, 4);    // {16, 20, 24, 28}
+
+    bitRevPos = (31 - __CLZ(fftLen)) + 4;   /* floor(log2(fftLen)) + 4 */
+    blkCnt = fftLen >> 3;
+
+    /* issued earlier to increase gather load idx update / gather load distance */
+    /* bit-reverse fwd indexes */
+    bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
+    bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
+    /* each iteration stores two 8-halfword vectors (16 uint16_t) to the destination */
+    while (blkCnt > 0) {
+        uint32x4_t      vecIn;
+
+        vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs0);   /* NOTE(review): signed gather variant assigned to an unsigned vector - presumably relies on lax vector conversions, confirm */
+        idxOffs0 = idxOffs0 + 32;   /* advance the forward indexes for the next iteration */
+        vst1q(pDst16, (uint16x8_t) vecIn);
+        pDst16 += 8;
+        bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
+
+        vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs1);
+        idxOffs1 = idxOffs1 + 32;
+        vst1q(pDst16, (uint16x8_t) vecIn);
+        pDst16 += 8;
+        bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
+
+        blkCnt--;
+    }
+}
+
+
 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
 
 
diff --git a/CMSIS/DSP/PrivateInclude/arm_vec_filtering.h b/CMSIS/DSP/PrivateInclude/arm_vec_filtering.h
index 65fc752aedac83653890e5db29eb5c5b7449c222..b2a06903dfb49cafa0bb7f3fd87c06c907965190 100644
--- a/CMSIS/DSP/PrivateInclude/arm_vec_filtering.h
+++ b/CMSIS/DSP/PrivateInclude/arm_vec_filtering.h
@@ -253,291 +253,223 @@ extern "C"
     acc1 = vecAddAcrossF32Mve(acc1Vec);                                 \
 }
 
-#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    float32_t const *pSrcX;                                                                         \
-    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                       \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    acc0Vec = vdupq_n_f32(0.0f);                                                                    \
-    acc1Vec = vdupq_n_f32(0.0f);                                                                    \
-    pSrcX = (float32_t const *) pX;                                                                 \
-    k = (count - 1) >> 2;                                                                           \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        xVec = vldrwq_f32(&pSrcX[1]);                                                               \
-        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                   \
-        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
-        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                   \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = (count - 1) % 0x4U;                                                                         \
-    mve_pred16_t p0 = vctp32q(k);                                                        \
-    yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                        \
-    xVec = vldrwq_f32(&pSrcX[1]);                                                                   \
-    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                                 \
-    xVec = vld1q(pSrcX);  pSrcX += 4;                                                   \
-    p0 = vctp32q(k+1);                                                                   \
-    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                                 \
-                                                                                                    \
-    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                             \
-    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                             \
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count)                             \
+{                                                                                                    \
+    float32_t const *pSrcX;                                                                          \
+    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                          \
+    uint32_t    k;                                                                                   \
+    /* acc0 = sum of count x*y products starting at pX[0]; acc1 = count-1 starting at pX[1] */       \
+    acc0Vec = vdupq_n_f32(0.0f);                                                                     \
+    acc1Vec = vdupq_n_f32(0.0f);                                                                     \
+    pSrcX = (float32_t const *) pX;                                                                  \
+    k = (count - 1) >> 2;                                                                            \
+    /* NOTE: pY (macro argument) is modified; decrIdxVec must be in scope at the expansion site */   \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
+        pY-=4;                                                                                       \
+        xVec = vldrwq_f32(&pSrcX[1]);                                                                \
+        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
+        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                    \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Tail: loads stay unpredicated; only the MACs are masked (k lanes for acc1, k+1 for acc0) */   \
+    k = (count - 1) % 0x4U;                                                                          \
+    mve_pred16_t p0 = vctp32q(k);                                                                    \
+    yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                         \
+    xVec = vldrwq_f32(&pSrcX[1]);                                                                    \
+    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                                  \
+    xVec = vld1q(pSrcX);  pSrcX += 4;                                                                \
+    p0 = vctp32q(k+1);                                                                               \
+    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                                  \
+    /* Reduce both vector accumulators to the scalar outputs acc0 / acc1 */                          \
+    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                              \
+    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                              \
 }
 
-#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count)                          \
-{                                                                                                   \
-    float32_t const *pSrcX;                                                                         \
-    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                       \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    acc0Vec = vdupq_n_f32(0.0f);                                                                    \
-    acc1Vec = vdupq_n_f32(0.0f);                                                                    \
-    pSrcX = (float32_t const *) pX;                                                                 \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        xVec = vldrwq_f32(&pSrcX[1]);                                                               \
-        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                   \
-        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
-        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                   \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
-        xVec = vldrwq_f32(&pSrcX[1]);                                                               \
-        acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                             \
-        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
-        acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                             \
-    }                                                                                               \
-    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                             \
-    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                             \
+#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count)                           \
+{                                                                                                    \
+    float32_t const *pSrcX;                                                                          \
+    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                          \
+    uint32_t    k;                                                                                   \
+    /* acc0 and acc1 each sum count x*y products; acc0 starts at pX[0], acc1 at pX[1] */             \
+    acc0Vec = vdupq_n_f32(0.0f);                                                                     \
+    acc1Vec = vdupq_n_f32(0.0f);                                                                     \
+    pSrcX = (float32_t const *) pX;                                                                  \
+    k = count >> 2;                                                                                  \
+    /* NOTE: pY (macro argument) is modified; decrIdxVec must be in scope at the expansion site */   \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
+        pY-=4;                                                                                       \
+        xVec = vldrwq_f32(&pSrcX[1]);                                                                \
+        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
+        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                    \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Tail: last count % 4 products; one predicate for both accumulators, loads unpredicated */     \
+    k = count % 0x4U;                                                                                \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp32q(k);                                                                \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
+        xVec = vldrwq_f32(&pSrcX[1]);                                                                \
+        acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                              \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
+        acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                              \
+    }                                                                                                \
+    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                              \
+    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                              \
 }
 
-#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    float32_t   const *pSrcX;                                                                       \
-    const float32_t  *pY1 = pY + 1;                                                                       \
-    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                       \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    acc0Vec = vdupq_n_f32(0.0f);                                                                    \
-    acc1Vec = vdupq_n_f32(0.0f);                                                                    \
-    pSrcX = (float32_t const *) pX;                                                                 \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
-        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                   \
-        yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec);                                   \
-        pY1-=4;                                                                                     \
-        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                   \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = count % 0x4U;                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc0 requires exact number of sample  */                                                     \
-    /* disable extra lanes in final MAC computation  */                                             \
-    mve_pred16_t p0 = vctp32q(k);                                                        \
-    xVec = vld1q(pSrcX);  pSrcX += 4;                                                   \
-    yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                        \
-    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                                 \
-    yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec);                                       \
-    /* acc1 requires 1 additional sample  */                                                        \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp32q(k+1);                                                                   \
-    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                                 \
-                                                                                                    \
-    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                             \
-    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                             \
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count)\
+{                                                                       \
+    float32_t   const *pSrcX;                                           \
+    const float32_t  *pY1 = pY + 1;                                     \
+    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                             \
+    uint32_t    k;                                                      \
+    /* acc0 = count products using pY, acc1 = count+1 using pY+1 */     \
+    acc0Vec = vdupq_n_f32(0.0f);                                        \
+    acc1Vec = vdupq_n_f32(0.0f);                                        \
+    pSrcX = (float32_t const *) pX;                                     \
+    k = count >> 2;                                                     \
+    /* NOTE: pY (macro argument) is modified; decrIdxVec is external */ \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                               \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);        \
+        pY-=4;                                                          \
+        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                       \
+        yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec);       \
+        pY1-=4;                                                         \
+        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                       \
+        /*  Decrement the loop counter   */                             \
+        k--;                                                            \
+    }                                                                   \
+    k = count % 0x4U;                                                   \
+    /* use predication to finalize MAC sum */                           \
+    /* acc0 requires exact number of samples */                         \
+    /* disable extra lanes in final MAC computation  */                 \
+    mve_pred16_t p0 = vctp32q(k);                                       \
+    xVec = vld1q(pSrcX);  pSrcX += 4;                                   \
+    yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);            \
+    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                     \
+    yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec);           \
+    /* acc1 requires 1 additional sample  */                            \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */   \
+    p0 = vctp32q(k+1);                                                  \
+    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                     \
+    /* Reduce both vector accumulators to scalars acc0 / acc1 */        \
+    acc0 = vecAddAcrossF32Mve(acc0Vec);                                 \
+    acc1 = vecAddAcrossF32Mve(acc1Vec);                                 \
 }
 
-#define MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count)                                                \
-{                                                                                                   \
-    float32_t const *pSrcX;                                                                         \
-    f32x4_t   accVec, xVec, yVec;                                                                 \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    accVec = vdupq_n_f32(0.0f);                                                                     \
-    pSrcX = (float32_t const *) pX;                                                                 \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
-        accVec = vfmaq_f32(accVec, xVec, yVec);                                                     \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
-        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
-        accVec = vfmaq_m_f32(accVec, xVec, yVec, p0);                                               \
-    }                                                                                               \
-    acc = vecAddAcrossF32Mve(accVec);                                                               \
+#define MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count)                                                 \
+{                                                                                                    \
+    float32_t const *pSrcX;                                                                          \
+    f32x4_t   accVec, xVec, yVec;                                                                    \
+    uint32_t    k;                                                                                   \
+    /* acc = sum of count products of x (read forward from pX) and y (gathered from pY) */           \
+    accVec = vdupq_n_f32(0.0f);                                                                      \
+    pSrcX = (float32_t const *) pX;                                                                  \
+    k = count >> 2;                                                                                  \
+    /* NOTE: pY (macro argument) is modified; decrIdxVec must be in scope at the expansion site */   \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
+        pY-=4;                                                                                       \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
+        accVec = vfmaq_f32(accVec, xVec, yVec);                                                      \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Tail: last count % 4 products; only the MAC is predicated, the loads are not */               \
+    k = count % 0x4U;                                                                                \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp32q(k);                                                                \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
+        accVec = vfmaq_m_f32(accVec, xVec, yVec, p0);                                                \
+    }                                                                                                \
+    acc = vecAddAcrossF32Mve(accVec);                                                                \
 }
 
 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
 
 #if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
 
-#define MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count)                                                \
-{                                                                                                   \
-    q31_t const *pSrcX;                                                                             \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc = vmlaldavaq(acc, xVec, yVec);                                                          \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
-        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                    \
-    }                                                                                               \
-    acc = asrl(acc, 31);                                                                            \
-}
-
-
-
-#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    q31_t const *pSrcX;                                                                             \
-    const q31_t       *pY1 = pY + 1;                                                                      \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec);                                   \
-        pY1-=4;                                                                                     \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = count % 0x4U;                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc0 requires exact number of sample  */                                                     \
-    /* disable extra lanes in final MAC computation  */                                             \
-    mve_pred16_t p0 = vctp32q(k);                                                        \
-    xVec = vld1q(pSrcX); pSrcX += 4;                                                    \
-    yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                        \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
-    yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec);                                       \
-    /* acc1 requires 1 additional sample  */                                                        \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp32q(k+1);                                                                   \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
-                                                                                                    \
-    acc0 = asrl(acc0, 31);                                                                          \
-    acc1 = asrl(acc1, 31);                                                                          \
+#define MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count)                                                 \
+{                                                                                                    \
+    q31_t const *pSrcX;                                                                              \
+    q31x4_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q31_t const *) pX;                                                                      \
+    k = count >> 2;                                                                                  \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
+        pY-=4;                                                                                       \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
+        acc = vmlaldavaq(acc, xVec, yVec);                                                           \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Loop with tail predication expected here  */                                                  \
+    k = count % 0x4U;                                                                                \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp32q(k);                                                                \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
+        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                     \
+    }                                                                                                \
+    acc = asrl(acc, 31);                                                                             \
 }
 
 
 
-
-#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count)\
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)\
 {                                                                       \
     q31_t const *pSrcX;                                                 \
+    const q31_t       *pY1 = pY + 1;                                    \
     q31x4_t   xVec, yVec;                                               \
     uint32_t    k;                                                      \
                                                                         \
     pSrcX = (q31_t const *) pX;                                         \
-    k = (count-1) >> 2;                                                 \
+    k = count >> 2;                                                     \
                                                                         \
     while (k > 0U)                                                      \
     {                                                                   \
-        /* note */                                                      \
-        /* could can be more efficient using Vector Scatter Store: */   \
-        /* + pre-increment + WB */                                      \
-        /* To be revisited when intrinsic available */                  \
-        /* SDCOMP-52618 */                                              \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                \
         yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);        \
         pY-=4;                                                          \
-        xVec = vldrwq_s32(&pSrcX[1]);                                   \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
-        xVec = vld1q(pSrcX);                                            \
-        pSrcX += 4;                                                     \
         acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
+        yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec);       \
+        pY1-=4;                                                         \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
         /*  Decrement the loop counter   */                             \
         k--;                                                            \
     }                                                                   \
-    k = (count - 1) % 0x4U;                                             \
+    k = count % 0x4U;                                                   \
     /* use predication to finalize MAC sum */                           \
-    /* acc1 requires exact number of sample (count-1)  */               \
+    /* acc0 requires exact number of samples */                         \
     /* disable extra lanes in final MAC computation  */                 \
     mve_pred16_t p0 = vctp32q(k);                                       \
+    xVec = vld1q(pSrcX); pSrcX += 4;                                    \
     yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);            \
-    xVec = vldrwq_s32(&pSrcX[1]);                                       \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
-    /* acc0 requires 1 additional sample  (count) */                    \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
+    yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec);           \
+    /* acc1 requires 1 additional sample  */                            \
     /* so add 1 to unmask an extra lane  in final MAC computation  */   \
     p0 = vctp32q(k+1);                                                  \
-    xVec = vld1q(pSrcX);                                                \
-    pSrcX += 4;                                                         \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
                                                                         \
     acc0 = asrl(acc0, 31);                                              \
     acc1 = asrl(acc1, 31);                                              \
@@ -545,1110 +477,1103 @@ extern "C"
 
 
 
-#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)                          \
-{                                                                                                   \
-    q31_t const *pSrcX;                                                                             \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
-    }                                                                                               \
-    acc0 = asrl(acc0, 31);                                                                          \
-    acc1 = asrl(acc1, 31);                                                                          \
+
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count) \
+{                                                                        \
+    q31_t const *pSrcX;                                                  \
+    q31x4_t   xVec, yVec;                                                \
+    uint32_t    k;                                                       \
+                                                                         \
+    pSrcX = (q31_t const *) pX;                                          \
+    k = (count-1) >> 2;                                                  \
+                                                                         \
+    while (k > 0U)                                                       \
+    {                                                                    \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);         \
+        pY-=4;                                                           \
+        xVec = vldrwq_s32(&pSrcX[1]);                                    \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                             \
+        xVec = vld1q(pSrcX);                                             \
+        pSrcX += 4;                                                      \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                             \
+        /*  Decrement the loop counter   */                              \
+        k--;                                                             \
+    }                                                                    \
+    k = (count - 1) % 0x4U;                                              \
+    /* use predication to finalize MAC sum */                            \
+    /* acc1 requires exact number of samples (count-1) */                \
+    /* disable extra lanes in final MAC computation  */                  \
+    mve_pred16_t p0 = vctp32q(k);                                        \
+    yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);             \
+    xVec = vldrwq_s32(&pSrcX[1]);                                        \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                           \
+    /* acc0 requires 1 additional sample  (count) */                     \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */    \
+    p0 = vctp32q(k+1);                                                   \
+    xVec = vld1q(pSrcX);                                                 \
+    pSrcX += 4;                                                          \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                           \
+                                                                         \
+    acc0 = asrl(acc0, 31);                                               \
+    acc1 = asrl(acc1, 31);                                               \
 }
 
 
 
-#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)              \
-{                                                                                                   \
-    q31_t const *pSrcX;                                                                             \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vldrwq_s32(&pSrcX[2]);                                                               \
-        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                        \
-        xVec = vldrwq_s32(&pSrcX[3]);                                                               \
-        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
-        xVec = vldrwq_s32(&pSrcX[2]);                                                               \
-        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                  \
-        xVec = vldrwq_s32(&pSrcX[3]);                                                               \
-        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                  \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
-    }                                                                                               \
-    acc0 = asrl(acc0, 31);                                                                          \
-    acc1 = asrl(acc1, 31);                                                                          \
-    acc2 = asrl(acc2, 31);                                                                          \
-    acc3 = asrl(acc3, 31);                                                                          \
+#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)                           \
+{                                                                                                    \
+    q31_t const *pSrcX;                                                                              \
+    q31x4_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q31_t const *) pX;                                                                      \
+    k = count >> 2;                                                                                  \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
+        pY-=4;                                                                                       \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Loop with tail predication expected here  */                                                  \
+    k = count % 0x4U;                                                                                \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp32q(k);                                                                \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                   \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                   \
+    }                                                                                                \
+    acc0 = asrl(acc0, 31);                                                                           \
+    acc1 = asrl(acc1, 31);                                                                           \
 }
 
-#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    q31_t const *pSrcX, *pSrcY;                                                                     \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    pSrcY  = (q31_t const *) pY;                                                                    \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        yVec = vldrwq_s32(&pSrcY[-1]);                                                              \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = count % 0x4U;                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc1 requires 1 additional sample  */                                                        \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    mve_pred16_t p0 = vctp32q(k+1);                                                      \
-    xVec = vld1q(pSrcX); pSrcX += 4;                                                    \
-    yVec = vldrwq_s32(&pSrcY[-1]);                                                                  \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0);                                                       \
-    /* acc0 requires exact number of sample  */                                                     \
-    /* disable extra lanes in final MAC computation  */                                             \
-    p0 = vctp32q(k);                                                                     \
-    yVec = vld1q(pSrcY);  pSrcY += 4;                                                   \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0);                                                       \
-                                                                                                    \
-    acc0 = asrl(acc0, 31);                                                                          \
-    acc1 = asrl(acc1, 31);                                                                          \
+
+
+#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)               \
+{   /* Four Q31 MAC sums: accN += pX[N + i] * y[i], y gathered from pY through decrIdxVec */         \
+    q31_t const *pSrcX;                                                                              \
+    q31x4_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+    /* decrIdxVec is not a parameter: it must be in scope where the macro is expanded */             \
+    pSrcX = (q31_t const *) pX;                                                                      \
+    k = count >> 2;                                                                                  \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
+        pY-=4; /* writes to the pY argument itself: back four samples */                             \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
+        xVec = vldrwq_s32(&pSrcX[2]);                                                                \
+        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                         \
+        xVec = vldrwq_s32(&pSrcX[3]);                                                                \
+        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
+        /* one block of four samples done */                                                         \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Tail: the count % 4 leftover samples; p0 limits each MAC to the first k lanes */              \
+    k = count % 0x4U;                                                                                \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp32q(k);                                                                \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
+        xVec = vldrwq_s32(&pSrcX[1]); /* loads stay full-width; only the MACs are predicated */      \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                   \
+        xVec = vldrwq_s32(&pSrcX[2]);                                                                \
+        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                   \
+        xVec = vldrwq_s32(&pSrcX[3]);                                                                \
+        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                   \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                   \
+    }                                                                                                \
+    acc0 = asrl(acc0, 31); /* arithmetic right shift by 31 on every accumulator */                   \
+    acc1 = asrl(acc1, 31);                                                                           \
+    acc2 = asrl(acc2, 31);                                                                           \
+    acc3 = asrl(acc3, 31);                                                                           \
 }
 
-#define MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count)                                                \
-{                                                                                                   \
-    q31_t const *pSrcX, *pSrcY;                                                                     \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    pSrcY = (q31_t const *) pY;                                                                     \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        acc = vmlaldavaq(acc, xVec, yVec);                                                          \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /*  tail predication expected here  */                                                          \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                    \
-    }                                                                                               \
-    acc = asrl(acc, 31);                                                                            \
+#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)\
+{ /* acc0: count terms pX[i]*pY[i]; acc1: count+1 terms pX[i]*pY[i-1] */ \
+    q31_t const *pSrcX, *pSrcY;                                         \
+    q31x4_t   xVec, yVec;                                               \
+    uint32_t    k;                                                      \
+    /* NOTE(review): reads pSrcY[-1]; check pY is past buffer start */  \
+    pSrcX = (q31_t const *) pX;                                         \
+    pSrcY  = (q31_t const *) pY;                                        \
+    k = count >> 2;                                                     \
+                                                                        \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                \
+        yVec = vldrwq_s32(&pSrcY[-1]);                                  \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
+        /* one block of four samples done */                            \
+        k--;                                                            \
+    }                                                                   \
+    k = count % 0x4U;                                                   \
+    /* Unconditional tail: predication finalizes both MAC sums */       \
+    /* acc1 takes one more sample than acc0, */                         \
+    /* so k+1 lanes are enabled for its final MAC */                    \
+    mve_pred16_t p0 = vctp32q(k+1);                                     \
+    xVec = vld1q(pSrcX); pSrcX += 4;                                    \
+    yVec = vldrwq_s32(&pSrcY[-1]);                                      \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0);                           \
+    /* acc0 takes exactly k samples, */                                 \
+    /* so lanes k and above are masked off in its final MAC */          \
+    p0 = vctp32q(k);                                                    \
+    yVec = vld1q(pSrcY);  pSrcY += 4;                                   \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0);                           \
+    /* arithmetic shift right by 31 on both sums */                     \
+    acc0 = asrl(acc0, 31);                                              \
+    acc1 = asrl(acc1, 31);                                              \
 }
 
-#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)              \
-{                                                                                                   \
-    q31_t const *pSrcX, *pSrcY;                                                                     \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    pSrcY  = (q31_t const *) pY;                                                                    \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vldrwq_s32(&pSrcX[2]);                                                               \
-        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                        \
-        xVec = vldrwq_s32(&pSrcX[3]);                                                               \
-        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* loop + tail predication expected here  */                                                    \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
-        xVec = vldrwq_s32(&pSrcX[2]);                                                               \
-        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                  \
-        xVec = vldrwq_s32(&pSrcX[3]);                                                               \
-        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                  \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
-    }                                                                                               \
-                                                                                                    \
-    acc0 = asrl(acc0, 31);                                                                          \
-    acc1 = asrl(acc1, 31);                                                                          \
-    acc2 = asrl(acc2, 31);                                                                          \
-    acc3 = asrl(acc3, 31);                                                                          \
+#define MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count)\
+{ /* acc += sum of pX[i]*pY[i] for i < count */     \
+    q31_t const *pSrcX, *pSrcY;                     \
+    q31x4_t   xVec, yVec;                           \
+    uint32_t    k;                                  \
+    /* local copies: pX and pY are not modified */  \
+    pSrcX = (q31_t const *) pX;                     \
+    pSrcY = (q31_t const *) pY;                     \
+    k = count >> 2;                                 \
+                                                    \
+    while (k > 0U)                                  \
+    {                                               \
+        xVec = vld1q(pSrcX); pSrcX += 4;            \
+        yVec = vld1q(pSrcY); pSrcY += 4;            \
+        acc = vmlaldavaq(acc, xVec, yVec);          \
+        /* four samples consumed */                 \
+        k--;                                        \
+    }                                               \
+    /* tail: first k = count % 4 lanes only */      \
+    k = count % 0x4U;                               \
+    if (k > 0U)                                     \
+    {                                               \
+        mve_pred16_t p0 = vctp32q(k);               \
+        xVec = vld1q(pSrcX); pSrcX += 4;            \
+        yVec = vld1q(pSrcY); pSrcY += 4;            \
+        acc = vmlaldavaq_p(acc, xVec, yVec, p0);    \
+    }                                               \
+    acc = asrl(acc, 31); /* arithmetic >> 31 */     \
 }
 
-#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)                          \
-{                                                                                                   \
-    q31_t const *pSrcX, *pSrcY;                                                                     \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    pSrcY  = (q31_t const *) pY;                                                                    \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* loop + tail predication expected here  */                                                    \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
-    }                                                                                               \
-                                                                                                    \
-    acc0 = asrl(acc0, 31);                                                                          \
-    acc1 = asrl(acc1, 31);                                                                          \
+#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)\
+{ /* Four MAC sums sharing one pY load: accN += pX[N + i] * pY[i], i < count */       \
+    q31_t const *pSrcX, *pSrcY;                                                       \
+    q31x4_t   xVec, yVec;                                                             \
+    uint32_t    k;                                                                    \
+    /* pX and pY are copied to locals and are not modified */                         \
+    pSrcX = (q31_t const *) pX;                                                       \
+    pSrcY  = (q31_t const *) pY;                                                      \
+    k = count >> 2;                                                                   \
+                                                                                      \
+    while (k > 0U)                                                                    \
+    {                                                                                 \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                              \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                 \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                          \
+        xVec = vldrwq_s32(&pSrcX[2]);                                                 \
+        acc2 = vmlaldavaq(acc2, xVec, yVec);                                          \
+        xVec = vldrwq_s32(&pSrcX[3]);                                                 \
+        acc3 = vmlaldavaq(acc3, xVec, yVec);                                          \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                              \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                          \
+        /* one block of four samples done */                                          \
+        k--;                                                                          \
+    }                                                                                 \
+    /* Tail: count % 4 leftover samples, MACs limited to the first k lanes */         \
+    k = count % 0x4U;                                                                 \
+    if (k > 0U)                                                                       \
+    {                                                                                 \
+        mve_pred16_t p0 = vctp32q(k);                                                 \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                              \
+        xVec = vldrwq_s32(&pSrcX[1]); /* loads are full-width; only MACs use p0 */    \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                    \
+        xVec = vldrwq_s32(&pSrcX[2]);                                                 \
+        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                    \
+        xVec = vldrwq_s32(&pSrcX[3]);                                                 \
+        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                    \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                              \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                    \
+    }                                                                                 \
+    /* arithmetic shift right by 31 on every accumulator */                           \
+    acc0 = asrl(acc0, 31);                                                            \
+    acc1 = asrl(acc1, 31);                                                            \
+    acc2 = asrl(acc2, 31);                                                            \
+    acc3 = asrl(acc3, 31);                                                            \
 }
 
-#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    q31_t const *pSrcX, *pSrcY;                                                                     \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    pSrcY  = (q31_t const *) pY;                                                                    \
-    k = (count-1) >> 2;                                                                             \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc1 requires exact number of sample (count-1)  */                                           \
-    /* disable extra lanes in final MAC computation  */                                             \
-    k = (count-1) % 0x4U;                                                                           \
-    mve_pred16_t p0 = vctp32q(k);                                                        \
-    yVec = vld1q(pSrcY);  pSrcY += 4;                                                   \
-    xVec = vldrwq_s32(&pSrcX[1]);                                                                   \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
-    /* acc0 requires 1 additional sample  (count) */                                                \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp32q(k+1);                                                                   \
-    xVec = vld1q(pSrcX); pSrcX += 4;                                                    \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
-                                                                                                    \
-    acc0 = asrl(acc0, 31);                                                                          \
-    acc1 = asrl(acc1, 31);                                                                          \
+#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)\
+{ /* acc0 += pX[i] * pY[i]; acc1 += pX[i + 1] * pY[i]; i < count */       \
+    q31_t const *pSrcX, *pSrcY;                                           \
+    q31x4_t   xVec, yVec;                                                 \
+    uint32_t    k;                                                        \
+    /* pX and pY are copied to locals and are not modified */             \
+    pSrcX = (q31_t const *) pX;                                           \
+    pSrcY  = (q31_t const *) pY;                                          \
+    k = count >> 2;                                                       \
+                                                                          \
+    while (k > 0U)                                                        \
+    {                                                                     \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                  \
+        xVec = vldrwq_s32(&pSrcX[1]);                                     \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                              \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                  \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                              \
+        /* one block of four samples done */                              \
+        k--;                                                              \
+    }                                                                     \
+    /* Tail: count % 4 leftover samples, MACs limited to first k lanes */ \
+    k = count % 0x4U;                                                     \
+    if (k > 0U)                                                           \
+    {                                                                     \
+        mve_pred16_t p0 = vctp32q(k);                                     \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                  \
+        xVec = vldrwq_s32(&pSrcX[1]);                                     \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                        \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                  \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                        \
+    }                                                                     \
+    /* arithmetic shift right by 31 on both sums */                       \
+    acc0 = asrl(acc0, 31);                                                \
+    acc1 = asrl(acc1, 31);                                                \
 }
 
-#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    q15_t const *pSrcX, *pSrcY;                                                                     \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    pSrcY  = (q15_t const *) pY;                                                                    \
-    k = count >> 3;                                                                                 \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        yVec = vldrhq_s16(&pSrcY[-1]);                                                              \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = count % 0x8U;                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc1 requires 1 additional sample  */                                                        \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    mve_pred16_t p0 = vctp16q(k+1);                                                      \
-    xVec = vld1q(pSrcX);  pSrcX += 8;                                                   \
-    yVec = vldrhq_s16(&pSrcY[-1]);                                                                  \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0);                                                       \
-    /* acc0 requires exact number of sample  */                                                     \
-    /* disable extra lanes in final MAC computation  */                                             \
-    p0 = vctp16q(k);                                                                     \
-    yVec = vld1q(pSrcY);  pSrcY += 8;                                                   \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0);                                                       \
-                                                                                                    \
-    acc0 = asrl(acc0, 15);                                                                          \
-    acc1 = asrl(acc1, 15);                                                                          \
-    acc0 = __SSAT(acc0, 16);                                                                        \
-    acc1 = __SSAT(acc1, 16);                                                                        \
+#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count)\
+{ /* acc0 += pX[i]*pY[i] (count terms); acc1 uses pX[i+1] (count-1) */  \
+    q31_t const *pSrcX, *pSrcY;                                         \
+    q31x4_t   xVec, yVec;                                               \
+    uint32_t    k;                                                      \
+                                                                        \
+    pSrcX = (q31_t const *) pX;                                         \
+    pSrcY  = (q31_t const *) pY;                                        \
+    k = (count-1) >> 2;                                                 \
+    /* main loop: 4 samples (one full vector) per iteration */          \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                \
+        xVec = vldrwq_s32(&pSrcX[1]);                                   \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                             \
+        k--;                                                            \
+    }                                                                   \
+    /* tail: loads stay full-width, only the MACs are predicated */     \
+    /* acc1 takes the remaining (count-1) % 4 samples, */               \
+    /* so lanes beyond that are disabled in the final MAC */            \
+    k = (count-1) % 0x4U;                                               \
+    mve_pred16_t p0 = vctp32q(k);                                       \
+    yVec = vld1q(pSrcY);  pSrcY += 4;                                   \
+    xVec = vldrwq_s32(&pSrcX[1]);                                       \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
+    /* acc0 takes 1 additional sample (count in total), */              \
+    /* so add 1 to unmask an extra lane in the final MAC */             \
+    p0 = vctp32q(k+1);                                                  \
+    xVec = vld1q(pSrcX); pSrcX += 4;                                    \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
+    /* arithmetic right shift of both sums by 31 bits */                \
+    acc0 = asrl(acc0, 31);                                              \
+    acc1 = asrl(acc1, 31);                                              \
+}
+
+#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)\
+{ /* acc0 += pX[i]*pY[i] (count terms); acc1 uses pY[i-1] (count+1) */  \
+    q15_t const *pSrcX, *pSrcY;                                         \
+    q15x8_t   xVec, yVec;                                               \
+    uint32_t    k;                                                      \
+                                                                        \
+    pSrcX = (q15_t const *) pX;                                         \
+    pSrcY  = (q15_t const *) pY;                                        \
+    k = count >> 3;                                                     \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                               \
+        yVec = vldrhq_s16(&pSrcY[-1]);                                  \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                               \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                             \
+        k--;                                                            \
+    }                                                                   \
+    k = count % 0x8U;                                                   \
+    /* tail: loads stay full-width, only the MACs are predicated */     \
+    /* acc1 takes 1 additional sample (count+1 in total), */            \
+    /* so add 1 to unmask an extra lane in the final MAC */             \
+    mve_pred16_t p0 = vctp16q(k+1);                                     \
+    xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
+    yVec = vldrhq_s16(&pSrcY[-1]);                                      \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0);                           \
+    /* acc0 takes the remaining count % 8 samples, */                   \
+    /* so lanes beyond that are disabled in the final MAC */            \
+    p0 = vctp16q(k);                                                    \
+    yVec = vld1q(pSrcY);  pSrcY += 8;                                   \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0);                           \
+    /* shift sums right by 15, then saturate to signed 16 bits */       \
+    acc0 = asrl(acc0, 15);                                              \
+    acc1 = asrl(acc1, 15);                                              \
+    acc0 = __SSAT(acc0, 16);                                            \
+    acc1 = __SSAT(acc1, 16);                                            \
 }
 
-#define MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count)                                                \
-{                                                                                                   \
-    q15_t const *pSrcX, *pSrcY;                                                                     \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    pSrcY = (q15_t const *) pY;                                                                     \
-    k = count >> 3;                                                                                 \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        acc = vmlaldavaq(acc, xVec, yVec);                                                          \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /*  tail predication expected here  */                                                          \
-    k = count % 0x8U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp16q(k);                                                    \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                    \
-    }                                                                                               \
-    acc = asrl(acc, 15);                                                                            \
-    acc = __SSAT(acc, 16);                                                                          \
+#define MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count)\
+{ /* acc += pX[i]*pY[i] over count samples */       \
+    q15_t const *pSrcX, *pSrcY;                     \
+    q15x8_t   xVec, yVec;                           \
+    uint32_t    k;                                  \
+                                                    \
+    pSrcX = (q15_t const *) pX;                     \
+    pSrcY = (q15_t const *) pY;                     \
+    k = count >> 3;                                 \
+    while (k > 0U)                                  \
+    {                                               \
+        xVec = vld1q(pSrcX);  pSrcX += 8;           \
+        yVec = vld1q(pSrcY);  pSrcY += 8;           \
+        acc = vmlaldavaq(acc, xVec, yVec);          \
+        /*  Decrement the loop counter   */         \
+        k--;                                        \
+    }                                               \
+    /*  tail predication expected here  */          \
+    k = count % 0x8U;                               \
+    if (k > 0U)                                     \
+    {                                               \
+        mve_pred16_t p0 = vctp16q(k);               \
+        xVec = vld1q(pSrcX);  pSrcX += 8;           \
+        yVec = vld1q(pSrcY);  pSrcY += 8;           \
+        acc = vmlaldavaq_p(acc, xVec, yVec, p0);    \
+    }                                               \
+    acc = asrl(acc, 15);                            \
+    acc = __SSAT(acc, 16); /* clamp to 16 bits */   \
 }
 
-#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)              \
-{                                                                                                   \
-    q15_t const *pSrcX, *pSrcY;                                                                     \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    pSrcY  = (q15_t const *) pY;                                                                    \
-    k = count >> 3;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vldrhq_s16(&pSrcX[2]);                                                               \
-        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                        \
-        xVec = vldrhq_s16(&pSrcX[3]);                                                               \
-        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* loop + tail predication expected here  */                                                    \
-    k = count % 0x8U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp16q(k);                                                    \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
-        xVec = vldrhq_s16(&pSrcX[2]);                                                               \
-        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                  \
-        xVec = vldrhq_s16(&pSrcX[3]);                                                               \
-        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                  \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
-    }                                                                                               \
-                                                                                                    \
-    acc0 = asrl(acc0, 15);                                                                          \
-    acc1 = asrl(acc1, 15);                                                                          \
-    acc2 = asrl(acc2, 15);                                                                          \
-    acc3 = asrl(acc3, 15);                                                                          \
-    acc0 = __SSAT(acc0, 16);                                                                        \
-    acc1 = __SSAT(acc1, 16);                                                                        \
-    acc2 = __SSAT(acc2, 16);                                                                        \
-    acc3 = __SSAT(acc3, 16);                                                                        \
+#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)\
+{ /* accN += pX[i+N]*pY[i] over count samples, for N = 0..3 */                        \
+    q15_t const *pSrcX, *pSrcY;                                                       \
+    q15x8_t   xVec, yVec;                                                             \
+    uint32_t    k;                                                                    \
+                                                                                      \
+    pSrcX = (q15_t const *) pX;                                                       \
+    pSrcY  = (q15_t const *) pY;                                                      \
+    k = count >> 3;                                                                   \
+    /* main loop: 8 samples (one full vector) per iteration */                        \
+    while (k > 0U)                                                                    \
+    {                                                                                 \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                             \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                 \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                          \
+        xVec = vldrhq_s16(&pSrcX[2]);                                                 \
+        acc2 = vmlaldavaq(acc2, xVec, yVec);                                          \
+        xVec = vldrhq_s16(&pSrcX[3]);                                                 \
+        acc3 = vmlaldavaq(acc3, xVec, yVec);                                          \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                             \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                          \
+        /*  Decrement the loop counter   */                                           \
+        k--;                                                                          \
+    }                                                                                 \
+    /* loop + tail predication expected here  */                                      \
+    k = count % 0x8U;                                                                 \
+    if (k > 0U)                                                                       \
+    {                                                                                 \
+        mve_pred16_t p0 = vctp16q(k);                                                 \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                             \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                 \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                    \
+        xVec = vldrhq_s16(&pSrcX[2]);                                                 \
+        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                    \
+        xVec = vldrhq_s16(&pSrcX[3]);                                                 \
+        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                             \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                    \
+    }                                                                                 \
+    /* shift sums right by 15, then saturate to signed 16 bits */                     \
+    acc0 = asrl(acc0, 15);                                                            \
+    acc1 = asrl(acc1, 15);                                                            \
+    acc2 = asrl(acc2, 15);                                                            \
+    acc3 = asrl(acc3, 15);                                                            \
+    acc0 = __SSAT(acc0, 16);                                                          \
+    acc1 = __SSAT(acc1, 16);                                                          \
+    acc2 = __SSAT(acc2, 16);                                                          \
+    acc3 = __SSAT(acc3, 16);                                                          \
 }
 
-#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)                          \
-{                                                                                                   \
-    q15_t const *pSrcX, *pSrcY;                                                                     \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    pSrcY  = (q15_t const *) pY;                                                                    \
-    k = count >> 3;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* loop + tail predication expected here  */                                                    \
-    k = count % 0x8U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp16q(k);                                                    \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
-    }                                                                                               \
-                                                                                                    \
-    acc0 = asrl(acc0, 15);                                                                          \
-    acc1 = asrl(acc1, 15);                                                                          \
-    acc0 = __SSAT(acc0, 16);                                                                        \
-    acc1 = __SSAT(acc1, 16);                                                                        \
+#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)\
+{ /* acc0 += pX[i]*pY[i]; acc1 += pX[i+1]*pY[i]; both count terms */      \
+    q15_t const *pSrcX, *pSrcY;                                           \
+    q15x8_t   xVec, yVec;                                                 \
+    uint32_t    k;                                                        \
+                                                                          \
+    pSrcX = (q15_t const *) pX;                                           \
+    pSrcY  = (q15_t const *) pY;                                          \
+    k = count >> 3;                                                       \
+    /* main loop: 8 samples (one full vector) per iteration */            \
+    while (k > 0U)                                                        \
+    {                                                                     \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                 \
+        xVec = vldrhq_s16(&pSrcX[1]);                                     \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                              \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                 \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                              \
+        /*  Decrement the loop counter   */                               \
+        k--;                                                              \
+    }                                                                     \
+    /* loop + tail predication expected here  */                          \
+    k = count % 0x8U;                                                     \
+    if (k > 0U)                                                           \
+    {                                                                     \
+        mve_pred16_t p0 = vctp16q(k);                                     \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                 \
+        xVec = vldrhq_s16(&pSrcX[1]);                                     \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                        \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                 \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                        \
+    }                                                                     \
+    /* shift sums right by 15, then saturate to signed 16 bits */         \
+    acc0 = asrl(acc0, 15);                                                \
+    acc1 = asrl(acc1, 15);                                                \
+    acc0 = __SSAT(acc0, 16);                                              \
+    acc1 = __SSAT(acc1, 16);                                              \
 }
 
-#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    q15_t const *pSrcX, *pSrcY;                                                                     \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    pSrcY  = (q15_t const *) pY;                                                                    \
-    k = (count-1) >> 3;                                                                             \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc1 requires exact number of sample (count-1)  */                                           \
-    /* disable extra lanes in final MAC computation  */                                             \
-    k = (count-1) % 0x8U;                                                                           \
-    mve_pred16_t p0 = vctp16q(k);                                                        \
-    yVec = vld1q(pSrcY);  pSrcY += 8;                                                   \
-    xVec = vldrhq_s16(&pSrcX[1]);                                                                   \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
-    /* acc0 requires 1 additional sample  (count) */                                                \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp16q(k+1);                                                                   \
-    xVec = vld1q(pSrcX);  pSrcX += 8;                                                   \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
-                                                                                                    \
-    acc0 = asrl(acc0, 15);                                                                          \
-    acc1 = asrl(acc1, 15);                                                                          \
-    acc0 = __SSAT(acc0, 16);                                                                        \
-    acc1 = __SSAT(acc1, 16);                                                                        \
+#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)\
+{ /* acc0 += pX[i]*pY[i] (count terms); acc1 uses pX[i+1] (count-1) */  \
+    q15_t const *pSrcX, *pSrcY;                                         \
+    q15x8_t   xVec, yVec;                                               \
+    uint32_t    k;                                                      \
+                                                                        \
+    pSrcX = (q15_t const *) pX;                                         \
+    pSrcY  = (q15_t const *) pY;                                        \
+    k = (count-1) >> 3;                                                 \
+    /* main loop: 8 samples (one full vector) per iteration */          \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                               \
+        xVec = vldrhq_s16(&pSrcX[1]);                                   \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                               \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                             \
+        k--;                                                            \
+    }                                                                   \
+    /* tail: loads stay full-width, only the MACs are predicated */     \
+    /* acc1 takes the remaining (count-1) % 8 samples, */               \
+    /* so lanes beyond that are disabled in the final MAC */            \
+    k = (count-1) % 0x8U;                                               \
+    mve_pred16_t p0 = vctp16q(k);                                       \
+    yVec = vld1q(pSrcY);  pSrcY += 8;                                   \
+    xVec = vldrhq_s16(&pSrcX[1]);                                       \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
+    /* acc0 takes 1 additional sample (count in total), */              \
+    /* so add 1 to unmask an extra lane in the final MAC */             \
+    p0 = vctp16q(k+1);                                                  \
+    xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
+    /* shift sums right by 15, then saturate to signed 16 bits */       \
+    acc0 = asrl(acc0, 15);                                              \
+    acc1 = asrl(acc1, 15);                                              \
+    acc0 = __SSAT(acc0, 16);                                            \
+    acc1 = __SSAT(acc1, 16);                                            \
 }
 
-#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    q15_t const *pSrcX;                                                                             \
-    const q15_t       *pY1 = pY + 1;                                                                      \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    k = count >> 3;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
-        pY-=8;                                                                                      \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec);                                   \
-        pY1-=8;                                                                                     \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = count % 0x8U;                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc0 requires exact number of sample  */                                                     \
-    /* disable extra lanes in final MAC computation  */                                             \
-    mve_pred16_t p0 = vctp16q(k);                                                        \
-    xVec = vld1q(pSrcX);  pSrcX += 8;                                                   \
-    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                        \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
-    yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec);                                       \
-    /* acc1 requires 1 additional sample  */                                                        \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp16q(k+1);                                                                   \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
-                                                                                                    \
-    acc0 = asrl(acc0, 15);                                                                          \
-    acc1 = asrl(acc1, 15);                                                                          \
-    acc0 = __SSAT(acc0, 16);                                                                        \
-    acc1 = __SSAT(acc1, 16);                                                                        \
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)\
+{   /* acc0: MAC over count samples; acc1: MAC over count+1 samples */  \
+    q15_t const *pSrcX;                                                 \
+    const q15_t       *pY1 = pY + 1;                                    \
+    q15x8_t   xVec, yVec;                                               \
+    uint32_t    k;                                                      \
+    /* pX/count may be expressions; pY is updated in place */           \
+    pSrcX = (q15_t const *) (pX);                                       \
+    k = (count) >> 3;                                                   \
+    /* full passes: 8 lanes each; pY and pY1 step back by 8 per pass */ \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                               \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);        \
+        pY-=8;                                                          \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
+        yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec);       \
+        pY1-=8;                                                         \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                             \
+        k--;                                                            \
+    }                                                                   \
+    k = (count) % 0x8U;                                                 \
+    /* tail always runs: even for k == 0, acc1 still needs one sample */\
+    /* acc0 needs exactly k more samples, so lanes >= k are masked */   \
+    /* (vmlaldavaq_p only accumulates lanes enabled in p0) */           \
+    mve_pred16_t p0 = vctp16q(k);                                       \
+    xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
+    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);            \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
+    yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec);           \
+    /* acc1 requires 1 additional sample */                             \
+    /* so add 1 to unmask an extra lane in the final MAC computation */ \
+    p0 = vctp16q(k+1);                                                  \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
+    /* shift each sum right by 15, then saturate it to 16 bits */       \
+    acc0 = asrl(acc0, 15);                                              \
+    acc1 = asrl(acc1, 15);                                              \
+    acc0 = __SSAT(acc0, 16);                                            \
+    acc1 = __SSAT(acc1, 16);                                            \
 }
 
-#define MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count)                                                \
-{                                                                                                   \
-    q15_t const *pSrcX;                                                                             \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    k = count >> 3;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
-        pY-=8;                                                                                      \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc = vmlaldavaq(acc, xVec, yVec);                                                          \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x8U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp16q(k);                                                    \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
-        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                    \
-    }                                                                                               \
-    acc = asrl(acc, 15);                                                                            \
-    acc = __SSAT(acc, 16);                                                                          \
+#define MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count)                                                 \
+{   /* one MAC sum over count samples; Y is gathered through decrIdxVec */                           \
+    q15_t const *pSrcX;                                                                              \
+    q15x8_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+    /* pX/count may be expressions; pY is updated in place */                                        \
+    pSrcX = (q15_t const *) (pX);                                                                    \
+    k = (count) >> 3;                                                                                \
+    /* full passes: 8 lanes each; pY steps back by 8 per pass */                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
+        pY-=8;                                                                                       \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
+        acc = vmlaldavaq(acc, xVec, yVec);                                                           \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* remaining count % 8 samples use a predicated (tail) pass */                                   \
+    k = (count) % 0x8U;                                                                              \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp16q(k);                                                                \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
+        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                     \
+    }                                                                                                \
+    acc = asrl(acc, 15);       /* shift the sum right by 15 ... */                                   \
+    acc = __SSAT(acc, 16);     /* ... then saturate it to 16 bits */                                 \
 }
 
-#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)              \
-{                                                                                                   \
-    q15_t const *pSrcX;                                                                             \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    k = count >> 3;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
-        pY-=8;                                                                                      \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vldrhq_s16(&pSrcX[2]);                                                               \
-        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                        \
-        xVec = vldrhq_s16(&pSrcX[3]);                                                               \
-        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x8U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp16q(k);                                                    \
-        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
-        xVec = vldrhq_s16(&pSrcX[2]);                                                               \
-        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                  \
-        xVec = vldrhq_s16(&pSrcX[3]);                                                               \
-        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                  \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
-    }                                                                                               \
-    acc0 = asrl(acc0, 15);                                                                          \
-    acc1 = asrl(acc1, 15);                                                                          \
-    acc2 = asrl(acc2, 15);                                                                          \
-    acc3 = asrl(acc3, 15);                                                                          \
-    acc0 = __SSAT(acc0, 16);                                                                        \
-    acc1 = __SSAT(acc1, 16);                                                                        \
-    acc2 = __SSAT(acc2, 16);                                                                        \
-    acc3 = __SSAT(acc3, 16);                                                                        \
+#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)               \
+{   /* accN: MAC sum over count X samples starting at pX[N], N = 0..3 */                             \
+    q15_t const *pSrcX;                                                                              \
+    q15x8_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+    /* pX/count may be expressions; pY is updated in place */                                        \
+    pSrcX = (q15_t const *) (pX);                                                                    \
+    k = (count) >> 3;                                                                                \
+    /* one Y gather per pass is reused for all four X offsets */                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
+        pY-=8;                                                                                       \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
+        xVec = vldrhq_s16(&pSrcX[2]);                                                                \
+        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                         \
+        xVec = vldrhq_s16(&pSrcX[3]);                                                                \
+        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* remaining count % 8 samples use a predicated (tail) pass */                                   \
+    k = (count) % 0x8U;                                                                              \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp16q(k);                                                                \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                   \
+        xVec = vldrhq_s16(&pSrcX[2]);                                                                \
+        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                   \
+        xVec = vldrhq_s16(&pSrcX[3]);                                                                \
+        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                   \
+    }                                                                                                \
+    acc0 = asrl(acc0, 15);     /* shift each sum right by 15 ... */                                  \
+    acc1 = asrl(acc1, 15);                                                                           \
+    acc2 = asrl(acc2, 15);                                                                           \
+    acc3 = asrl(acc3, 15);                                                                           \
+    acc0 = __SSAT(acc0, 16);   /* ... then saturate each to 16 bits */                               \
+    acc1 = __SSAT(acc1, 16);                                                                         \
+    acc2 = __SSAT(acc2, 16);                                                                         \
+    acc3 = __SSAT(acc3, 16);                                                                         \
 }
 
-#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)                          \
-{                                                                                                   \
-    q15_t const *pSrcX;                                                                             \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    k = count >> 3;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
-        pY-=8;                                                                                      \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x8U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp16q(k);                                                    \
-        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
-    }                                                                                               \
-    acc0 = asrl(acc0, 15);                                                                          \
-    acc1 = asrl(acc1, 15);                                                                          \
-    acc0 = __SSAT(acc0, 16);                                                                        \
-    acc1 = __SSAT(acc1, 16);                                                                        \
+#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)                           \
+{   /* accN: MAC sum over count X samples starting at pX[N], N = 0..1 */                             \
+    q15_t const *pSrcX;                                                                              \
+    q15x8_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+    /* pX/count may be expressions; pY is updated in place */                                        \
+    pSrcX = (q15_t const *) (pX);                                                                    \
+    k = (count) >> 3;                                                                                \
+    /* one Y gather per pass is reused for both X offsets */                                         \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
+        pY-=8;                                                                                       \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* remaining count % 8 samples use a predicated (tail) pass */                                   \
+    k = (count) % 0x8U;                                                                              \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp16q(k);                                                                \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                   \
+    }                                                                                                \
+    acc0 = asrl(acc0, 15);     /* shift each sum right by 15 ... */                                  \
+    acc1 = asrl(acc1, 15);                                                                           \
+    acc0 = __SSAT(acc0, 16);   /* ... then saturate each to 16 bits */                               \
+    acc1 = __SSAT(acc1, 16);                                                                         \
 }
 
-#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    q15_t const *pSrcX;                                                                             \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    k = (count-1) >> 3;                                                                             \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
-        pY-=8;                                                                                      \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = (count - 1) % 0x8U;                                                                         \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc1 requires exact number of sample (count-1)  */                                           \
-    /* disable extra lanes in final MAC computation  */                                             \
-    mve_pred16_t p0 = vctp16q(k);                                                        \
-    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                        \
-    xVec = vldrhq_s16(&pSrcX[1]);                                                                   \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
-    /* acc0 requires 1 additional sample  (count) */                                                \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp16q(k+1);                                                                   \
-    xVec = vld1q(pSrcX);  pSrcX += 8;                                                   \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
-                                                                                                    \
-    acc0 = asrl(acc0, 15);                                                                          \
-    acc1 = asrl(acc1, 15);                                                                          \
-    acc0 = __SSAT(acc0, 16);                                                                        \
-    acc1 = __SSAT(acc1, 16);                                                                        \
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)                             \
+{                                                                                                    \
+    q15_t const *pSrcX;                                                                              \
+    q15x8_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+    /* acc0 sums count products, acc1 sums count-1 products taken one x sample later. */             \
+    pSrcX = (q15_t const *) (pX);                                                                    \
+    k = ((count) - 1) >> 3;                                                                          \
+    /* decrIdxVec is taken from the expansion scope; pY is stepped back in place (lvalue). */        \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
+        pY-=8;                                                                                       \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    k = ((count) - 1) % 0x8U;                                                                        \
+    /* use predication to finalize MAC sum */                                                        \
+    /* acc1 requires the exact number of samples (count-1) */                                        \
+    /* disable extra lanes in final MAC computation  */                                              \
+    mve_pred16_t p0 = vctp16q(k);                                                                    \
+    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                         \
+    xVec = vldrhq_s16(&pSrcX[1]);                                                                    \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                       \
+    /* acc0 requires 1 additional sample  (count) */                                                 \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                                \
+    p0 = vctp16q(k+1);                                                                               \
+    xVec = vld1q(pSrcX);  pSrcX += 8;                                                                \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                       \
+    /* Shift each sum right by 15, then saturate it to 16 bits. */                                   \
+    acc0 = asrl(acc0, 15);                                                                           \
+    acc1 = asrl(acc1, 15);                                                                           \
+    acc0 = __SSAT(acc0, 16);                                                                         \
+    acc1 = __SSAT(acc1, 16);                                                                         \
 }
 
-#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)                             \
-{                                                                                                   \
-    q7_t const *pSrcX, *pSrcY;                                                                      \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    pSrcY = (q7_t const *) pY;                                                                      \
-    k = count >> 4;                                                                                 \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        yVec = vldrbq_s8(&pSrcY[-1]);                                                               \
-        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
-        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
-        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = count % 0x10U;                                                                              \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc1 requires 1 additional sample  */                                                        \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    mve_pred16_t p0 = vctp8q(k+1);                                                       \
-    xVec = vld1q(pSrcX);  pSrcX += 16;                                                    \
-    yVec = vldrbq_s8(&pSrcY[-1]);                                                                   \
-    acc1 = vmladavaq_p(acc1, xVec, yVec,p0);                                                        \
-    /* acc0 requires exact number of sample  */                                                     \
-    /* disable extra lanes in final MAC computation  */                                             \
-    p0 = vctp8q(k);                                                                      \
-    yVec = vld1q(pSrcY);  pSrcY += 16;                                                    \
-    acc0 = vmladavaq_p(acc0, xVec, yVec,p0);                                                        \
-                                                                                                    \
-    acc0 = (acc0 >> 7);                                                                             \
-    acc1 = (acc1 >> 7);                                                                             \
-    acc0 = __SSAT(acc0, 8);                                                                         \
-    acc1 = __SSAT(acc1, 8);                                                                         \
+#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)\
+{                                                                      \
+    q7_t const *pSrcX, *pSrcY;                                         \
+    q7x16_t   xVec, yVec;                                              \
+    uint32_t    k;                                                     \
+    /* acc0: count products; acc1: count+1 products, y one earlier */  \
+    pSrcX = (q7_t const *) (pX);                                       \
+    pSrcY = (q7_t const *) (pY);                                       \
+    k = (count) >> 4;                                                  \
+    while (k > 0U)                                                     \
+    {                                                                  \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                             \
+        yVec = vldrbq_s8(&pSrcY[-1]);                                  \
+        acc1 = vmladavaq(acc1, xVec, yVec);                            \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                             \
+        acc0 = vmladavaq(acc0, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                            \
+        k--;                                                           \
+    }                                                                  \
+    k = (count) % 0x10U;                                               \
+    /* use predication to finalize MAC sum */                          \
+    /* acc1 requires 1 additional sample  */                           \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */  \
+    mve_pred16_t p0 = vctp8q(k+1);                                     \
+    xVec = vld1q(pSrcX);  pSrcX += 16;                                 \
+    yVec = vldrbq_s8(&pSrcY[-1]);                                      \
+    acc1 = vmladavaq_p(acc1, xVec, yVec,p0);                           \
+    /* acc0 requires exact number of sample  */                        \
+    /* disable extra lanes in final MAC computation  */                \
+    p0 = vctp8q(k);                                                    \
+    yVec = vld1q(pSrcY);  pSrcY += 16;                                 \
+    acc0 = vmladavaq_p(acc0, xVec, yVec,p0);                           \
+    /* Shift each sum right by 7, then saturate it to 8 bits. */       \
+    acc0 = (acc0 >> 7);                                                \
+    acc1 = (acc1 >> 7);                                                \
+    acc0 = __SSAT(acc0, 8);                                            \
+    acc1 = __SSAT(acc1, 8);                                            \
 }
 
-#define MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count)                                                 \
-{                                                                                                   \
-    q7_t const *pSrcX, *pSrcY;                                                                      \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    pSrcY = (q7_t const *) pY;                                                                      \
-    k = count >> 4;                                                                                 \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
-        acc = vmladavaq(acc, xVec, yVec);                                                           \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /*  tail predication expected here  */                                                          \
-    k = count % 0x10U;                                                                              \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp8q(k);                                                     \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
-        acc = vmladavaq_p(acc, xVec, yVec, p0);                                                     \
-    }                                                                                               \
-    acc =(acc >> 7);                                                                                \
-    acc = __SSAT(acc, 8);                                                                           \
+#define MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count)\
+{                                                  \
+    q7_t const *pSrcX, *pSrcY;                     \
+    q7x16_t   xVec, yVec;                          \
+    uint32_t    k;                                 \
+    /* acc += count products pX[i] * pY[i] */      \
+    pSrcX = (q7_t const *) (pX);                   \
+    pSrcY = (q7_t const *) (pY);                   \
+    k = (count) >> 4;                              \
+    while (k > 0U)                                 \
+    {                                              \
+        xVec = vld1q(pSrcX);  pSrcX += 16;         \
+        yVec = vld1q(pSrcY);  pSrcY += 16;         \
+        acc = vmladavaq(acc, xVec, yVec);          \
+        /*  Decrement the loop counter   */        \
+        k--;                                       \
+    }                                              \
+    /* predicated tail: last count % 16 lanes */   \
+    k = (count) % 0x10U;                           \
+    if (k > 0U)                                    \
+    {                                              \
+        mve_pred16_t p0 = vctp8q(k);               \
+        xVec = vld1q(pSrcX);  pSrcX += 16;         \
+        yVec = vld1q(pSrcY);  pSrcY += 16;         \
+        acc = vmladavaq_p(acc, xVec, yVec, p0);    \
+    }                                              \
+    acc =(acc >> 7);                               \
+    acc = __SSAT(acc, 8);  /* saturate to 8 bits */\
 }
 
-#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)               \
-{                                                                                                   \
-    q7_t const *pSrcX, *pSrcY;                                                                      \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    pSrcY = (q7_t const *) pY;                                                                      \
-    k = count >> 4;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
-        xVec = vldrbq_s8(&pSrcX[2]);                                                                \
-        acc2 = vmladavaq(acc2, xVec, yVec);                                                         \
-        xVec = vldrbq_s8(&pSrcX[3]);                                                                \
-        acc3 = vmladavaq(acc3, xVec, yVec);                                                         \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* loop + tail predication expected here  */                                                    \
-    k = count % 0x10U;                                                                              \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp8q(k);                                                     \
-        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                   \
-        xVec = vldrbq_s8(&pSrcX[2]);                                                                \
-        acc2 = vmladavaq_p(acc2, xVec, yVec, p0);                                                   \
-        xVec = vldrbq_s8(&pSrcX[3]);                                                                \
-        acc3 = vmladavaq_p(acc3, xVec, yVec, p0);                                                   \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                   \
-    }                                                                                               \
-                                                                                                    \
-    acc0 = (acc0 >> 7);                                                                             \
-    acc1 = (acc1 >> 7);                                                                             \
-    acc2 = (acc2 >> 7);                                                                             \
-    acc3 = (acc3 >> 7);                                                                             \
-    acc0 = __SSAT(acc0, 8);                                                                         \
-    acc1 = __SSAT(acc1, 8);                                                                         \
-    acc2 = __SSAT(acc2, 8);                                                                         \
-    acc3 = __SSAT(acc3, 8);                                                                         \
+#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)\
+{                                                                                    \
+    q7_t const *pSrcX, *pSrcY;                                                       \
+    q7x16_t   xVec, yVec;                                                            \
+    uint32_t    k;                                                                   \
+    /* accN += count products pX[i + N] * pY[i], for N = 0..3 */                     \
+    pSrcX = (q7_t const *) (pX);                                                     \
+    pSrcY = (q7_t const *) (pY);                                                     \
+    k = (count) >> 4;                                                                \
+    /* Each pass loads y once and reuses it against four x windows. */               \
+    while (k > 0U)                                                                   \
+    {                                                                                \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                                           \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                 \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                          \
+        xVec = vldrbq_s8(&pSrcX[2]);                                                 \
+        acc2 = vmladavaq(acc2, xVec, yVec);                                          \
+        xVec = vldrbq_s8(&pSrcX[3]);                                                 \
+        acc3 = vmladavaq(acc3, xVec, yVec);                                          \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                           \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                          \
+        /*  Decrement the loop counter   */                                          \
+        k--;                                                                         \
+    }                                                                                \
+    /* loop + tail predication expected here  */                                     \
+    k = (count) % 0x10U;                                                             \
+    if (k > 0U)                                                                      \
+    {                                                                                \
+        mve_pred16_t p0 = vctp8q(k);                                                 \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                                           \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                 \
+        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                    \
+        xVec = vldrbq_s8(&pSrcX[2]);                                                 \
+        acc2 = vmladavaq_p(acc2, xVec, yVec, p0);                                    \
+        xVec = vldrbq_s8(&pSrcX[3]);                                                 \
+        acc3 = vmladavaq_p(acc3, xVec, yVec, p0);                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                           \
+        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                    \
+    }                                                                                \
+    /* Shift each sum right by 7, then saturate it to 8 bits. */                     \
+    acc0 = (acc0 >> 7);                                                              \
+    acc1 = (acc1 >> 7);                                                              \
+    acc2 = (acc2 >> 7);                                                              \
+    acc3 = (acc3 >> 7);                                                              \
+    acc0 = __SSAT(acc0, 8);                                                          \
+    acc1 = __SSAT(acc1, 8);                                                          \
+    acc2 = __SSAT(acc2, 8);                                                          \
+    acc3 = __SSAT(acc3, 8);                                                          \
 }
 
-#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)                           \
-{                                                                                                   \
-    q7_t const *pSrcX, *pSrcY;                                                                      \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    pSrcY = (q7_t const *) pY;                                                                      \
-    k = count >> 4;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* loop + tail predication expected here  */                                                    \
-    k = count % 0x10U;                                                                              \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp8q(k);                                                     \
-        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                   \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                   \
-    }                                                                                               \
-                                                                                                    \
-    acc0 = (acc0 >> 7);                                                                             \
-    acc1 = (acc1 >> 7);                                                                             \
-    acc0 = __SSAT(acc0, 8);                                                                         \
-    acc1 = __SSAT(acc1, 8);                                                                         \
+#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)\
+{                                                                        \
+    q7_t const *pSrcX, *pSrcY;                                           \
+    q7x16_t   xVec, yVec;                                                \
+    uint32_t    k;                                                       \
+    /* accN += count products pX[i + N] * pY[i], for N = 0, 1 */         \
+    pSrcX = (q7_t const *) (pX);                                         \
+    pSrcY = (q7_t const *) (pY);                                         \
+    k = (count) >> 4;                                                    \
+    /* Each pass loads y once and reuses it against two x windows. */    \
+    while (k > 0U)                                                       \
+    {                                                                    \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                               \
+        xVec = vldrbq_s8(&pSrcX[1]);                                     \
+        acc1 = vmladavaq(acc1, xVec, yVec);                              \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                               \
+        acc0 = vmladavaq(acc0, xVec, yVec);                              \
+        /*  Decrement the loop counter   */                              \
+        k--;                                                             \
+    }                                                                    \
+    /* loop + tail predication expected here  */                         \
+    k = (count) % 0x10U;                                                 \
+    if (k > 0U)                                                          \
+    {                                                                    \
+        mve_pred16_t p0 = vctp8q(k);                                     \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                               \
+        xVec = vldrbq_s8(&pSrcX[1]);                                     \
+        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                        \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                               \
+        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                        \
+    }                                                                    \
+    /* Shift each sum right by 7, then saturate it to 8 bits. */         \
+    acc0 = (acc0 >> 7);                                                  \
+    acc1 = (acc1 >> 7);                                                  \
+    acc0 = __SSAT(acc0, 8);                                              \
+    acc1 = __SSAT(acc1, 8);                                              \
 }
 
-#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)                             \
-{                                                                                                   \
-    q7_t const *pSrcX, *pSrcY;                                                                      \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    pSrcY = (q7_t const *) pY;                                                                      \
-    k = (count-1) >> 4;                                                                             \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc1 requires exact number of sample (count-1)  */                                           \
-    /* disable extra lanes in final MAC computation  */                                             \
-    k = (count-1) % 0x10U;                                                                          \
-    mve_pred16_t p0 = vctp8q(k);                                                         \
-    yVec = vld1q(pSrcY);  pSrcY += 16;                                                    \
-    xVec = vldrbq_s8(&pSrcX[1]);                                                                    \
-    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                       \
-    /* acc0 requires 1 additional sample  (count) */                                                \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp8q(k+1);                                                                    \
-    xVec = vld1q(pSrcX);  pSrcX += 16;                                                    \
-    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                       \
-                                                                                                    \
-    acc0 = (acc0 >> 7);                                                                             \
-    acc1 = (acc1 >> 7);                                                                             \
-    acc0 = __SSAT(acc0, 8);                                                                         \
-    acc1 = __SSAT(acc1, 8);                                                                         \
+#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)\
+{ /* Q7 MAC sums: acc0 over count samples, acc1 over count-1 (pX+1) */ \
+    q7_t const *pSrcX, *pSrcY;                                         \
+    q7x16_t   xVec, yVec;                                              \
+    uint32_t    k;                                                     \
+    /* Local read cursors; the caller's pX and pY are not modified. */ \
+    pSrcX = (q7_t const *) pX;                                         \
+    pSrcY = (q7_t const *) pY;                                         \
+    k = (count-1) >> 4;  /* NOTE(review): assumes count >= 1? */       \
+    /* Main loop: (count-1)/16 full blocks of 16, no predication.   */ \
+    while (k > 0U)                                                     \
+    {                                                                  \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                             \
+        xVec = vldrbq_s8(&pSrcX[1]);                                   \
+        acc1 = vmladavaq(acc1, xVec, yVec);                            \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                             \
+        acc0 = vmladavaq(acc0, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                            \
+        k--;                                                           \
+    }                                                                  \
+    /* Tail (always executed): predication finalizes the MAC sums   */ \
+    /* acc1 requires exact number of samples (count-1)  */             \
+    /* disable extra lanes in final MAC computation  */                \
+    k = (count-1) % 0x10U;                                             \
+    mve_pred16_t p0 = vctp8q(k);                                       \
+    yVec = vld1q(pSrcY);  pSrcY += 16;                                 \
+    xVec = vldrbq_s8(&pSrcX[1]);                                       \
+    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                          \
+    /* acc0 requires 1 additional sample  (count) */                   \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */  \
+    p0 = vctp8q(k+1);                                                  \
+    xVec = vld1q(pSrcX);  pSrcX += 16;                                 \
+    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                          \
+    /* Shift each sum right by 7, then saturate to signed 8 bits.   */ \
+    acc0 = (acc0 >> 7);                                                \
+    acc1 = (acc1 >> 7);                                                \
+    acc0 = __SSAT(acc0, 8);                                            \
+    acc1 = __SSAT(acc1, 8);                                            \
 }
 
-#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)                             \
-{                                                                                                   \
-    q7_t const *pSrcX;                                                                              \
-    const q7_t       *pY1 = pY + 1;                                                                       \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    k = count >> 4;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        pY-=16;                                                                                     \
-        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
-        yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec);                                            \
-        pY1-=16;                                                                                    \
-        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = count % 0x10U;                                                                              \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc0 requires exact number of sample  */                                                     \
-    /* disable extra lanes in final MAC computation  */                                             \
-    mve_pred16_t p0 = vctp8q(k);                                                         \
-    xVec = vld1q(pSrcX);  pSrcX += 16;                                                    \
-    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                                 \
-    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                       \
-    yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec);                                                \
-    /* acc1 requires 1 additional sample  */                                                        \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp8q(k+1);                                                                    \
-    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                       \
-                                                                                                    \
-    acc0 = (acc0 >> 7);                                                                             \
-    acc1 = (acc1 >> 7);                                                                             \
-    acc0 = __SSAT(acc0, 8);                                                                         \
-    acc1 = __SSAT(acc1, 8);                                                                         \
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)\
+{ /* Q7 MAC sums: acc0 over count samples (pY), acc1 count+1 (pY+1) */ \
+    q7_t const *pSrcX;                                                 \
+    const q7_t       *pY1 = pY + 1;                                    \
+    q7x16_t   xVec, yVec;                                              \
+    uint32_t    k;                                                     \
+    /* pY is decremented in place: the argument must be an lvalue.  */ \
+    pSrcX = (q7_t const *) pX;                                         \
+    k = count >> 4;                                                    \
+    /* decrIdxVec is not an argument; the call site must define it. */ \
+    while (k > 0U)                                                     \
+    { /* Main loop: k full blocks of 16 samples, no predication. */    \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                             \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                \
+        pY-=16;                                                        \
+        acc0 = vmladavaq(acc0, xVec, yVec);                            \
+        yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec);               \
+        pY1-=16;                                                       \
+        acc1 = vmladavaq(acc1, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                            \
+        k--;                                                           \
+    }                                                                  \
+    k = count % 0x10U;  /* NOTE(review): count not parenthesized */    \
+    /* Tail (always executed): predication finalizes the MAC sums   */ \
+    /* acc0 requires exact number of samples  */                       \
+    /* disable extra lanes in final MAC computation  */                \
+    mve_pred16_t p0 = vctp8q(k);                                       \
+    xVec = vld1q(pSrcX);  pSrcX += 16;                                 \
+    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                    \
+    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                          \
+    yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec);                   \
+    /* acc1 requires 1 additional sample  */                           \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */  \
+    p0 = vctp8q(k+1);                                                  \
+    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                          \
+    /* Shift each sum right by 7, then saturate to signed 8 bits.   */ \
+    acc0 = (acc0 >> 7);                                                \
+    acc1 = (acc1 >> 7);                                                \
+    acc0 = __SSAT(acc0, 8);                                            \
+    acc1 = __SSAT(acc1, 8);                                            \
 }
 
-#define MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count)                                                 \
-{                                                                                                   \
-    q7_t const *pSrcX;                                                                              \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    k = count >> 4;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        pY-=16;                                                                                     \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc = vmladavaq(acc, xVec, yVec);                                                           \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x10U;                                                                              \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp8q(k);                                                     \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        acc = vmladavaq_p(acc, xVec, yVec, p0);                                                     \
-    }                                                                                               \
-    acc = __SSAT(acc >> 7, 8);                                                                      \
+#define MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count)                                                  \
+{ /* Q7 MAC sum over count samples: x read forward from pX, y gathered from pY via decrIdxVec. */    \
+    q7_t const *pSrcX;                                                                               \
+    q7x16_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+    /* pY is decremented in place: the argument must be an lvalue. */                                \
+    pSrcX = (q7_t const *) pX;                                                                       \
+    k = count >> 4;                                                                                  \
+    /* decrIdxVec is not an argument; the call site must define it. */                               \
+    while (k > 0U)                                                                                   \
+    { /* Main loop: k full blocks of 16 samples, no predication. */                                  \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
+        pY-=16;                                                                                      \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
+        acc = vmladavaq(acc, xVec, yVec);                                                            \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Tail: the remaining count % 16 samples, via lane predication. */                              \
+    k = count % 0x10U;  /* NOTE(review): count not parenthesized */                                  \
+    if (k > 0U)                                                                                      \
+    { /* NOTE(review): loads below are full-width; only the MAC uses p0 */                           \
+        mve_pred16_t p0 = vctp8q(k);                                                                 \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
+        acc = vmladavaq_p(acc, xVec, yVec, p0);                                                      \
+    }                                                                                                \
+    acc = __SSAT(acc >> 7, 8);  /* shift right by 7, then saturate to signed 8 bits */               \
 }
 
-#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)               \
-{                                                                                                   \
-    q7_t const *pSrcX;                                                                              \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    k = count >> 4;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        pY-=16;                                                                                     \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
-        xVec = vldrbq_s8(&pSrcX[2]);                                                                \
-        acc2 = vmladavaq(acc2, xVec, yVec);                                                         \
-        xVec = vldrbq_s8(&pSrcX[3]);                                                                \
-        acc3 = vmladavaq(acc3, xVec, yVec);                                                         \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x10U;                                                                              \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp8q(k);                                                     \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                   \
-        xVec = vldrbq_s8(&pSrcX[2]);                                                                \
-        acc2 = vmladavaq_p(acc2, xVec, yVec, p0);                                                   \
-        xVec = vldrbq_s8(&pSrcX[3]);                                                                \
-        acc3 = vmladavaq_p(acc3, xVec, yVec, p0);                                                   \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                   \
-    }                                                                                               \
-    acc0 = __SSAT(acc0 >> 7, 8);                                                                    \
-    acc1 = __SSAT(acc1 >> 7, 8);                                                                    \
-    acc2 = __SSAT(acc2 >> 7, 8);                                                                    \
-    acc3 = __SSAT(acc3 >> 7, 8);                                                                    \
+#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)                \
+{ /* Four Q7 MAC sums over count samples: accN pairs x from pX+N with y gathered from pY. */         \
+    q7_t const *pSrcX;                                                                               \
+    q7x16_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+    /* pY is decremented in place: the argument must be an lvalue. */                                \
+    pSrcX = (q7_t const *) pX;                                                                       \
+    k = count >> 4;                                                                                  \
+    /* decrIdxVec is not an argument; the call site must define it. */                               \
+    while (k > 0U)                                                                                   \
+    { /* Main loop: k full blocks of 16 samples, no predication. */                                  \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
+        pY-=16;                                                                                      \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                                          \
+        xVec = vldrbq_s8(&pSrcX[2]);                                                                 \
+        acc2 = vmladavaq(acc2, xVec, yVec);                                                          \
+        xVec = vldrbq_s8(&pSrcX[3]);                                                                 \
+        acc3 = vmladavaq(acc3, xVec, yVec);                                                          \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                                          \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Tail: the remaining count % 16 samples, via lane predication. */                              \
+    k = count % 0x10U;  /* NOTE(review): count not parenthesized */                                  \
+    if (k > 0U)                                                                                      \
+    { /* NOTE(review): loads below are full-width; only the MACs use p0 */                           \
+        mve_pred16_t p0 = vctp8q(k);                                                                 \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
+        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                    \
+        xVec = vldrbq_s8(&pSrcX[2]);                                                                 \
+        acc2 = vmladavaq_p(acc2, xVec, yVec, p0);                                                    \
+        xVec = vldrbq_s8(&pSrcX[3]);                                                                 \
+        acc3 = vmladavaq_p(acc3, xVec, yVec, p0);                                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
+        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                    \
+    }                                                                                                \
+    acc0 = __SSAT(acc0 >> 7, 8);  /* shift right by 7, then saturate to signed 8 bits */             \
+    acc1 = __SSAT(acc1 >> 7, 8);                                                                     \
+    acc2 = __SSAT(acc2 >> 7, 8);                                                                     \
+    acc3 = __SSAT(acc3 >> 7, 8);                                                                     \
 }
 
-#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)                           \
-{                                                                                                   \
-    q7_t const *pSrcX;                                                                              \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    k = count >> 4;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        pY-=16;                                                                                     \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x10U;                                                                              \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp8q(k);                                                     \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                   \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                   \
-    }                                                                                               \
-    acc0 = __SSAT(acc0 >> 7, 8);                                                                    \
-    acc1 = __SSAT(acc1 >> 7, 8);                                                                    \
+#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)                            \
+{   /* acc0 += sum(pX[i]*y), acc1 += sum(pX[i+1]*y); then >> 7 and __SSAT to 8 bits */               \
+    q7_t const *pSrcX;                                                                               \
+    q7x16_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q7_t const *) pX;                                                                       \
+    k = count >> 4;                                                                                  \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
+        pY-=16;                                                                                      \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                                          \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                                          \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Loop with tail predication expected here  */                                                  \
+    k = count % 0x10U;                                                                               \
+    if (k > 0U)                                                                                      \
+    {   /* only the first k = count % 16 byte lanes take part in the MACs */                         \
+        mve_pred16_t p0 = vctp8q(k);                                                                 \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
+        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
+        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                    \
+    }                                                                                                \
+    acc0 = __SSAT(acc0 >> 7, 8);                                                                     \
+    acc1 = __SSAT(acc1 >> 7, 8);                                                                     \
 }
 
 
-#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)                             \
-{                                                                                                   \
-    q7_t const *pSrcX;                                                                              \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    k = (count-1) >> 4;                                                                             \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        pY-=16;                                                                                     \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = (count - 1) % 0x10U;                                                                        \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc1 requires exact number of sample (count-1)  */                                           \
-    /* disable extra lanes in final MAC computation  */                                             \
-    mve_pred16_t p0 = vctp8q(k);                                                         \
-    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                                 \
-    xVec = vldrbq_s8(&pSrcX[1]);                                                                    \
-    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                       \
-    /* acc0 requires 1 additional sample  (count) */                                                \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp8q(k+1);                                                                    \
-    xVec = vld1q(pSrcX);  pSrcX += 16;                                                    \
-    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                       \
-                                                                                                    \
-    acc0 = (acc0 >> 7);                                                                             \
-    acc1 = (acc1 >> 7);                                                                             \
-    acc0 = __SSAT(acc0, 8);                                                                         \
-    acc1 = __SSAT(acc1, 8);                                                                         \
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)                              \
+{   /* Dual Q7 MAC: acc0 accumulates count products, acc1 count-1 (its x is pX[i+1]) */              \
+    q7_t const *pSrcX;                                                                               \
+    q7x16_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q7_t const *) pX;                                                                       \
+    k = (count-1) >> 4;                                                                              \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
+        pY-=16;                                                                                      \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                                          \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                                          \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    k = (count - 1) % 0x10U;                                                                         \
+    /* use predication to finalize MAC sum */                                                        \
+    /* acc1 requires exact number of samples (count-1) */                                            \
+    /* disable extra lanes in final MAC computation  */                                              \
+    mve_pred16_t p0 = vctp8q(k);                                                                     \
+    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                                  \
+    xVec = vldrbq_s8(&pSrcX[1]);                                                                     \
+    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                        \
+    /* acc0 requires 1 additional sample  (count) */                                                 \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                                \
+    p0 = vctp8q(k+1);                                                                                \
+    xVec = vld1q(pSrcX);  pSrcX += 16;                                                               \
+    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                        \
+                                                                                                     \
+    acc0 = (acc0 >> 7);                                                                              \
+    acc1 = (acc1 >> 7);                                                                              \
+    acc0 = __SSAT(acc0, 8);                                                                          \
+    acc1 = __SSAT(acc1, 8);                                                                          \
 }
 
 #endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_f16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..fb1b9b4c66ec07337e333ae42073863d6325b88d
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_f16.c
@@ -0,0 +1,198 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_abs_f16.c
+ * Description:  Floating-point vector absolute value
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/basic_math_functions_f16.h"
+#include <math.h>
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicAbs Vector Absolute Value
+
+  Computes the absolute value of a vector on an element-by-element basis.
+
+  <pre>
+      pDst[n] = abs(pSrc[n]),   0 <= n < blockSize.
+  </pre>
+
+  The functions support in-place computation allowing the source and
+  destination pointers to reference the same memory buffer.
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicAbs
+  @{
+ */
+
+/**
+  @brief         Floating-point vector absolute value.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_abs_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+    f16x8_t vec1;
+    f16x8_t res;
+
+    /* Writes |pSrc[n]| to pDst[n] for 0 <= n < blockSize. */
+    /* Compute 8 outputs at a time (blockSize >> 3 full f16x8_t vectors) */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = |A| */
+
+        /* Calculate absolute values and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vabsq(vec1);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 8;
+        pDst += 8;
+        
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the blockSize & 7 samples that do not fill a vector */
+    blkCnt = blockSize & 0x7;
+
+
+    if (blkCnt > 0U)
+    {
+      /* C = |A|; the store is predicated by p0, the vld1q load is not */
+      mve_pred16_t p0 = vctp16q(blkCnt);
+      vec1 = vld1q(pSrc);
+      vstrhq_p(pDst, vabsq(vec1), p0);
+    }
+
+}
+
+#else
+#if defined(ARM_FLOAT16_SUPPORTED)
+void arm_abs_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+  /* Writes |pSrc[n]| to pDst[n] for 0 <= n < blockSize: vector or unrolled main loop, scalar tail. */
+#if defined(ARM_MATH_NEON_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f16x8_t vec1;
+    f16x8_t res;
+
+    /* Compute 8 outputs at a time: one f16x8_t load/store moves 8 half-precision lanes */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = |A| */
+
+        /* Calculate absolute values and then store the results in the destination buffer. */
+        vec1 = vld1q_f16(pSrc);
+        res = vabsq_f16(vec1);
+        vst1q_f16(pDst, res);
+
+        /* Advance by the 8 samples just handled so no store reaches past blockSize */
+        pSrc += 8;
+        pDst += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the blockSize & 7 leftover samples are finished by the scalar loop below */
+    blkCnt = blockSize & 0x7;
+
+#else
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = |A| */
+
+    /* fabsf works in single precision, so convert explicitly in both directions. */
+    *pDst++ = (float16_t)fabsf((float)*pSrc++);
+
+    *pDst++ = (float16_t)fabsf((float)*pSrc++);
+
+    *pDst++ = (float16_t)fabsf((float)*pSrc++);
+
+    *pDst++ = (float16_t)fabsf((float)*pSrc++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+#endif /* #if defined(ARM_MATH_NEON_FLOAT16) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = |A| */
+
+    /* Calculate absolute and store result in destination buffer. */
+    *pDst++ = (float16_t)fabsf((float)*pSrc++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_FLOAT16_SUPPORTED) */
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of BasicAbs group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_f32.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_f32.c
index ccae6d31874344bced6a08a4c7bf0b83abd30f4a..1e67f4e3345839b449de640b8396ba284780527e 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_f32.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_abs_f32.c
  * Description:  Floating-point vector absolute value
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 #include <math.h>
 
 /**
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_q15.c
index 854fbd9bc34271a99183e70ad88df3a4fc11e8bf..7c38ef518bd2345223abc37d267247165e6f7980 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_q15.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_abs_q15.c
  * Description:  Q15 vector absolute value
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -49,7 +49,7 @@
                    The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_q31.c
index 9f3d97b5b4cedfdc7cdb9980b8f8d29414c152cc..7043aa0cf2eaf71de83ab9b6f6f3ba9aa9ebfe18 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_q31.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_abs_q31.c
  * Description:  Q31 vector absolute value
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -49,7 +49,7 @@
                    The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_q7.c
index e016286c53fe098c40bd63ea6a1162954582e2cb..bd79582e40c72af188046d76613cf32e588723af 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_q7.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_abs_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_abs_q7.c
  * Description:  Q7 vector absolute value
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -51,7 +51,7 @@
                    The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_add_f16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_add_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..3c313c65321d6313e28ab97c43284335563a83f6
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_add_f16.c
@@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_add_f16.c
+ * Description:  Floating-point vector addition
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/basic_math_functions_f16.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicAdd Vector Addition
+
+  Element-by-element addition of two vectors.
+
+  <pre>
+      pDst[n] = pSrcA[n] + pSrcB[n],   0 <= n < blockSize.
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicAdd
+  @{
+ */
+
+/**
+  @brief         Floating-point vector addition.
+  @param[in]     pSrcA      points to first input vector
+  @param[in]     pSrcB      points to second input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_add_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+
+    f16x8_t vec1;
+    f16x8_t vec2;
+    f16x8_t res;
+
+    /* Compute 8 outputs at a time (blockSize >> 3 full f16x8_t vectors) */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A + B */
+
+        /* Add and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrcA);
+        vec2 = vld1q(pSrcB);
+        res = vaddq(vec1, vec2);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 8;
+        pSrcB += 8; 
+        pDst += 8;
+        
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: the blockSize & 7 samples that do not fill a vector */
+    blkCnt = blockSize & 0x7;
+
+    if (blkCnt > 0U)
+    {
+      /* C = A + B; the store is predicated by p0, the two vld1q loads are not */
+      mve_pred16_t p0 = vctp16q(blkCnt);
+      vec1 = vld1q(pSrcA);
+      vec2 = vld1q(pSrcB);
+      vstrhq_p(pDst, vaddq(vec1,vec2), p0);
+    }
+
+}
+
+#else
+#if defined(ARM_FLOAT16_SUPPORTED)
+void arm_add_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+  /* Element-wise sum of two float16_t vectors:
+   *   pDst[n] = pSrcA[n] + pSrcB[n],   0 <= n < blockSize
+   */
+  uint32_t remaining;          /* Samples left for the one-at-a-time loop */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+  uint32_t quads;              /* Groups of four samples still to process */
+
+  /* Unrolled part: each pass writes four consecutive sums, C = A + B */
+  for (quads = blockSize / 4U; quads != 0U; quads--)
+  {
+    pDst[0] = pSrcA[0] + pSrcB[0];
+    pDst[1] = pSrcA[1] + pSrcB[1];
+    pDst[2] = pSrcA[2] + pSrcB[2];
+    pDst[3] = pSrcA[3] + pSrcB[3];
+
+    /* Step all three pointers past the group just handled */
+    pSrcA += 4;
+    pSrcB += 4;
+    pDst  += 4;
+  }
+
+  /* Whatever does not fill a group of four is finished below */
+  remaining = blockSize & 3U;
+
+#else
+
+  /* No unrolling: the loop below covers every sample */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* C = A + B, one sample per pass */
+  for (; remaining != 0U; remaining--)
+  {
+    *pDst = *pSrcA + *pSrcB;
+
+    pSrcA++;
+    pSrcB++;
+    pDst++;
+  }
+
+}
+#endif /* defined(ARM_FLOAT16_SUPPORTED) */
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicAdd group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_add_f32.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_add_f32.c
index abc0a0624b894a8867391ac88dca12e0fb87e47f..9741e4a6b2a2811deaed44369ecbd6db1bb86e9d 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_add_f32.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_add_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_add_f32.c
  * Description:  Floating-point vector addition
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_add_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_add_q15.c
index ff8ae065423caefb2f16325a7c1a2738f2fb3298..f7725ef512b4b99c301e5bf55be75569c6b7d5b3 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_add_q15.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_add_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_add_q15.c
  * Description:  Q15 vector addition
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -50,7 +50,7 @@
                    Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_add_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_add_q31.c
index f4ef3de053ec4b927a2b8e15b20dba11ed244645..a6783e7f6f8e76ad533ac49cd1a9eb8df1cbd6b6 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_add_q31.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_add_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_add_q31.c
  * Description:  Q31 vector addition
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -50,7 +50,7 @@
                    Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_add_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_add_q7.c
index 8836c85a18ecad3fdbb34c069543b2db3128758d..07dfbd5c6bc2b9e7a166b684a8a3dd694a8a53b5 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_add_q7.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_add_q7.c
@@ -1,15 +1,5 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_add_q7.c
- * Description:  Q7 vector addition
- *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
- *
- * Target Processor: Cortex-M cores
- * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +16,18 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_add_q7.c
+ * Description:  Q7 vector addition
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -50,7 +51,7 @@
                    Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_and_u16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_u16.c
index 684df9a50bb97c6b9c4e186012290d0f4468c32b..7a7427dcf765c5e96c232877a10216dfc6523271 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_and_u16.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_u16.c
@@ -3,13 +3,13 @@
  * Title:        arm_and_u16.c
  * Description:  uint16_t bitwise AND
  *
- * $Date:        14 November 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -63,7 +63,7 @@ void arm_and_u16(
     uint32_t blkCnt;      /* Loop counter */
 
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-    q15x8_t vecSrcA, vecSrcB;
+    uint16x8_t vecSrcA, vecSrcB;
 
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_and_u32.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_u32.c
index 0a0274445287fe15363c92e9b035b9fc952c73b0..ddf838c25b0a4c609ba715b05b0632ab701ce7f6 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_and_u32.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_u32.c
@@ -3,13 +3,13 @@
  * Title:        arm_and_u32.c
  * Description:  uint32_t bitwise AND
  *
- * $Date:        14 November 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -55,7 +55,7 @@ void arm_and_u32(
     uint32_t blkCnt;      /* Loop counter */
 
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-    q31x4_t vecSrcA, vecSrcB;
+    uint32x4_t vecSrcA, vecSrcB;
 
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_and_u8.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_u8.c
index 00f72ec7e348e4abc634cabc3117f14cb17311ef..bd1a1abe54300ac3f6b93cfa1d89cb878fff2087 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_and_u8.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_u8.c
@@ -3,13 +3,13 @@
  * Title:        arm_and_u8.c
  * Description:  uint8_t bitwise AND
  *
- * $Date:        14 November 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -56,7 +56,7 @@ void arm_and_u8(
     uint32_t blkCnt;      /* Loop counter */
 
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-    q7x16_t vecSrcA, vecSrcB;
+    uint8x16_t vecSrcA, vecSrcB;
 
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_f16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..ec4a6dda6db086485254e8923ab88498223d3d75
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_f16.c
@@ -0,0 +1,141 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_clip_f16.c
+ * Description:  Half-precision floating-point vector clipping
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/basic_math_functions_f16.h"
+
+/**
+  @ingroup groupMath
+ */
+
+
+/**
+  @addtogroup BasicClip
+  @{
+ */
+
+/**
+  @brief         Elementwise floating-point clipping
+  @param[in]     pSrc          points to input values
+  @param[out]    pDst          points to output clipped values
+  @param[in]     low           lower bound
+  @param[in]     high          higher bound
+  @param[in]     numSamples    number of samples to clip
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_clip_f16(const float16_t * pSrc,   /* input samples */
+  float16_t * pDst,                         /* clipped output samples */
+  float16_t low,                            /* lower bound */
+  float16_t high,                           /* upper bound */
+  uint32_t numSamples)                      /* number of samples to clip */
+{
+    uint32_t  blkCnt;           /* loop count, later reused as the tail length */
+    f16x8_t curVec0, curVec1;   /* the two vectors kept in flight (8 samples each) */
+    f16x8_t vecLow, vecHigh;    /* low / high copied into every lane */
+
+    vecLow = vdupq_n_f16(low);
+    vecHigh = vdupq_n_f16(high);
+
+    curVec0 = vld1q(pSrc);      /* preload the first vector ahead of the loop */
+    pSrc += 8;                  /* NOTE(review): the load above runs before numSamples is examined, so 8 elements are read even for shorter inputs - confirm callers tolerate this */
+    /*
+     * Unrolled x 2 (16 samples per pass) to allow
+     * vldr/vstr/vmin/vmax stall free interleaving;
+     * every pass ends by preloading curVec0 for what follows
+     */
+    blkCnt = numSamples >> 4;
+    while (blkCnt--)
+    {
+        curVec0 = vmaxnmq(curVec0, vecLow);     /* lane-wise max against low */
+        curVec1 = vld1q(pSrc);
+        pSrc += 8;
+        curVec0 = vminnmq(curVec0, vecHigh);    /* lane-wise min against high */
+        vst1q(pDst, curVec0);
+        pDst += 8;
+        curVec1 = vmaxnmq(curVec1, vecLow);
+        curVec0 = vld1q(pSrc);                  /* preload for the next pass or the tail; NOTE(review): on the last pass this may read beyond the numSamples inputs - confirm */
+        pSrc += 8;
+        curVec1 = vminnmq(curVec1, vecHigh);
+        vst1q(pDst, curVec1);
+        pDst += 8;
+    }
+    /*
+     * Tail handling: numSamples modulo 16 samples remain
+     */
+    blkCnt = numSamples - ((numSamples >> 4) << 4);
+    if (blkCnt >= 8)                            /* one more full vector, already held in curVec0 */
+    {
+        curVec0 = vmaxnmq(curVec0, vecLow);
+        curVec0 = vminnmq(curVec0, vecHigh);
+        vst1q(pDst, curVec0);
+        pDst += 8;
+        curVec0 = vld1q(pSrc);                  /* load the remaining (possibly partial) vector */
+        pSrc += 8;
+    }
+
+    if (blkCnt > 0)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt & 7);  /* enable only the leftover (blkCnt mod 8) lanes; none when blkCnt == 8 */
+        curVec0 = vmaxnmq(curVec0, vecLow);
+        curVec0 = vminnmq(curVec0, vecHigh);
+        vstrhq_p(pDst, curVec0, p0);            /* predicated store: only the enabled lanes are written */
+    }
+}
+
+#else
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+void arm_clip_f16(const float16_t * pSrc, 
+  float16_t * pDst, 
+  float16_t low, 
+  float16_t high, 
+  uint32_t numSamples)
+{
+    while (numSamples-- > 0U)           /* one pass per remaining sample */
+    {
+        float16_t clipped = *pSrc++;    /* start from the unmodified input */
+        if (clipped > high)             /* upper bound is tested first */
+            clipped = high;
+        else if (clipped < low)
+            clipped = low;
+        *pDst++ = clipped;              /* in-range values pass through unchanged */
+    }
+}
+#endif /* defined(ARM_FLOAT16_SUPPORTED) */
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/**
+  @} end of BasicClip group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_f32.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..bc06d9949518176d82a6df3ebb0bfd3930d4dea5
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_f32.c
@@ -0,0 +1,143 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_clip_f32.c
+ * Description:  Floating-point vector clipping
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/basic_math_functions.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicClip Elementwise clipping
+
+  Element-by-element clipping of a value.
+
+  The value is constrained between 2 bounds.
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicClip
+  @{
+ */
+
+/**
+  @brief         Elementwise floating-point clipping
+  @param[in]     pSrc          points to input values
+  @param[out]    pDst          points to output clipped values
+  @param[in]     low           lower bound
+  @param[in]     high          higher bound
+  @param[in]     numSamples    number of samples to clip
+  @return        none
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_clip_f32(const float32_t * pSrc,   /* input samples */
+  float32_t * pDst,                         /* clipped output samples */
+  float32_t low,                            /* lower bound */
+  float32_t high,                           /* upper bound */
+  uint32_t numSamples)                      /* number of samples to clip */
+{
+    uint32_t  blkCnt;           /* loop count, later reused as the tail length */
+    f32x4_t curVec0, curVec1;   /* the two vectors kept in flight (4 samples each) */
+    f32x4_t vecLow, vecHigh;    /* low / high copied into every lane */
+
+    vecLow = vdupq_n_f32(low);
+    vecHigh = vdupq_n_f32(high);
+
+    curVec0 = vld1q(pSrc);      /* preload the first vector ahead of the loop */
+    pSrc += 4;                  /* NOTE(review): the load above runs before numSamples is examined, so 4 elements are read even for shorter inputs - confirm callers tolerate this */
+    /*
+     * Unrolled x 2 (8 samples per pass) to allow
+     * vldr/vstr/vmin/vmax stall free interleaving;
+     * every pass ends by preloading curVec0 for what follows
+     */
+    blkCnt = numSamples >> 3;
+    while (blkCnt--)
+    {
+        curVec0 = vmaxnmq(curVec0, vecLow);     /* lane-wise max against low */
+        curVec1 = vld1q(pSrc);
+        pSrc += 4;
+        curVec0 = vminnmq(curVec0, vecHigh);    /* lane-wise min against high */
+        vst1q(pDst, curVec0);
+        pDst += 4;
+        curVec1 = vmaxnmq(curVec1, vecLow);
+        curVec0 = vld1q(pSrc);                  /* preload for the next pass or the tail; NOTE(review): on the last pass this may read beyond the numSamples inputs - confirm */
+        pSrc += 4;
+        curVec1 = vminnmq(curVec1, vecHigh);
+        vst1q(pDst, curVec1);
+        pDst += 4;
+    }
+    /*
+     * Tail handling: numSamples modulo 8 samples remain
+     */
+    blkCnt = numSamples - ((numSamples >> 3) << 3);
+    if (blkCnt >= 4)                            /* one more full vector, already held in curVec0 */
+    {
+        curVec0 = vmaxnmq(curVec0, vecLow);
+        curVec0 = vminnmq(curVec0, vecHigh);
+        vst1q(pDst, curVec0);
+        pDst += 4;
+        curVec0 = vld1q(pSrc);                  /* load the remaining (possibly partial) vector */
+        pSrc += 4;
+    }
+
+    if (blkCnt > 0)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt & 3);  /* enable only the leftover (blkCnt mod 4) lanes; none when blkCnt == 4 */
+        curVec0 = vmaxnmq(curVec0, vecLow);
+        curVec0 = vminnmq(curVec0, vecHigh);
+        vstrwq_p(pDst, curVec0, p0);            /* predicated store: only the enabled lanes are written */
+    }
+}
+
+#else
+void arm_clip_f32(const float32_t * pSrc, 
+  float32_t * pDst, 
+  float32_t low, 
+  float32_t high, 
+  uint32_t numSamples)
+{
+    while (numSamples-- > 0U)           /* one pass per remaining sample */
+    {
+        float32_t clipped = *pSrc++;    /* start from the unmodified input */
+        if (clipped > high)             /* upper bound is tested first */
+            clipped = high;
+        else if (clipped < low)
+            clipped = low;
+        *pDst++ = clipped;              /* in-range values pass through unchanged */
+    }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicClip group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_q15.c
new file mode 100644
index 0000000000000000000000000000000000000000..1d302873765f3a25a9dfbf6f90c0e02222da0e85
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_q15.c
@@ -0,0 +1,133 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_clip_q15.c
+ * Description:  Q15 vector clipping
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/basic_math_functions.h"
+
+/**
+  @ingroup groupMath
+ */
+
+
+/**
+  @addtogroup BasicClip
+  @{
+ */
+
+/**
+  @brief         Elementwise fixed-point clipping
+  @param[in]     pSrc          points to input values
+  @param[out]    pDst          points to output clipped values
+  @param[in]     low           lower bound
+  @param[in]     high          higher bound
+  @param[in]     numSamples    number of samples to clip
+  @return        none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+void arm_clip_q15(const q15_t * pSrc,       /* input samples */
+  q15_t * pDst,                             /* clipped output samples */
+  q15_t low,                                /* lower bound */
+  q15_t high,                               /* upper bound */
+  uint32_t numSamples)                      /* number of samples to clip */
+{
+    uint32_t  blkCnt;           /* loop count, later reused as the tail length */
+    q15x8_t curVec0, curVec1;   /* the two vectors kept in flight (8 samples each) */
+    q15x8_t vecLow, vecHigh;    /* low / high copied into every lane */
+
+    vecLow = vdupq_n_s16(low);
+    vecHigh = vdupq_n_s16(high);
+
+    curVec0 = vld1q(pSrc);      /* preload the first vector ahead of the loop */
+    pSrc += 8;                  /* NOTE(review): the load above runs before numSamples is examined, so 8 elements are read even for shorter inputs - confirm callers tolerate this */
+    /*
+     * Unrolled x 2 (16 samples per pass) to allow
+     * vldr/vstr/vmin/vmax stall free interleaving;
+     * every pass ends by preloading curVec0 for what follows
+     */
+    blkCnt = numSamples >> 4;
+    while (blkCnt--)
+    {
+        curVec0 = vmaxq(curVec0, vecLow);       /* lane-wise max against low */
+        curVec1 = vld1q(pSrc);
+        pSrc += 8;
+        curVec0 = vminq(curVec0, vecHigh);      /* lane-wise min against high */
+        vst1q(pDst, curVec0);
+        pDst += 8;
+        curVec1 = vmaxq(curVec1, vecLow);
+        curVec0 = vld1q(pSrc);                  /* preload for the next pass or the tail; NOTE(review): on the last pass this may read beyond the numSamples inputs - confirm */
+        pSrc += 8;
+        curVec1 = vminq(curVec1, vecHigh);
+        vst1q(pDst, curVec1);
+        pDst += 8;
+    }
+    /*
+     * Tail handling: numSamples modulo 16 samples remain
+     */
+    blkCnt = numSamples - ((numSamples >> 4) << 4);
+    if (blkCnt >= 8)                            /* one more full vector, already held in curVec0 */
+    {
+        curVec0 = vmaxq(curVec0, vecLow);
+        curVec0 = vminq(curVec0, vecHigh);
+        vst1q(pDst, curVec0);
+        pDst += 8;
+        curVec0 = vld1q(pSrc);                  /* load the remaining (possibly partial) vector */
+        pSrc += 8;
+    }
+
+    if (blkCnt > 0)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt & 7);  /* enable only the leftover (blkCnt mod 8) lanes; none when blkCnt == 8 */
+        curVec0 = vmaxq(curVec0, vecLow);
+        curVec0 = vminq(curVec0, vecHigh);
+        vstrhq_p(pDst, curVec0, p0);            /* predicated store: only the enabled lanes are written */
+    }
+}
+
+#else
+void arm_clip_q15(const q15_t * pSrc, 
+  q15_t * pDst, 
+  q15_t low, 
+  q15_t high, 
+  uint32_t numSamples)
+{
+    while (numSamples-- > 0U)           /* one pass per remaining sample */
+    {
+        q15_t clipped = *pSrc++;        /* start from the unmodified input */
+        if (clipped > high)             /* upper bound is tested first */
+            clipped = high;
+        else if (clipped < low)
+            clipped = low;
+        *pDst++ = clipped;              /* in-range values pass through unchanged */
+    }
+}
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicClip group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_q31.c
new file mode 100644
index 0000000000000000000000000000000000000000..36f6526acae510fb4a78b8cdeac0af380584b41a
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_q31.c
@@ -0,0 +1,133 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_clip_q31.c
+ * Description:  Q31 vector clipping
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/basic_math_functions.h"
+
+/**
+  @ingroup groupMath
+ */
+
+
+/**
+  @addtogroup BasicClip
+  @{
+ */
+
+/**
+  @brief         Elementwise fixed-point clipping
+  @param[in]     pSrc          points to input values
+  @param[out]    pDst          points to output clipped values
+  @param[in]     low           lower bound
+  @param[in]     high          higher bound
+  @param[in]     numSamples    number of samples to clip
+  @return        none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+void arm_clip_q31(const q31_t * pSrc,       /* input samples */
+  q31_t * pDst,                             /* clipped output samples */
+  q31_t low,                                /* lower bound */
+  q31_t high,                               /* upper bound */
+  uint32_t numSamples)                      /* number of samples to clip */
+{
+    uint32_t  blkCnt;           /* loop count, later reused as the tail length */
+    q31x4_t curVec0, curVec1;   /* the two vectors kept in flight (4 samples each) */
+    q31x4_t vecLow, vecHigh;    /* low / high copied into every lane */
+
+    vecLow = vdupq_n_s32(low);
+    vecHigh = vdupq_n_s32(high);
+
+    curVec0 = vld1q(pSrc);      /* preload the first vector ahead of the loop */
+    pSrc += 4;                  /* NOTE(review): the load above runs before numSamples is examined, so 4 elements are read even for shorter inputs - confirm callers tolerate this */
+    /*
+     * Unrolled x 2 (8 samples per pass) to allow
+     * vldr/vstr/vmin/vmax stall free interleaving;
+     * every pass ends by preloading curVec0 for what follows
+     */
+    blkCnt = numSamples >> 3;
+    while (blkCnt--)
+    {
+        curVec0 = vmaxq(curVec0, vecLow);       /* lane-wise max against low */
+        curVec1 = vld1q(pSrc);
+        pSrc += 4;
+        curVec0 = vminq(curVec0, vecHigh);      /* lane-wise min against high */
+        vst1q(pDst, curVec0);
+        pDst += 4;
+        curVec1 = vmaxq(curVec1, vecLow);
+        curVec0 = vld1q(pSrc);                  /* preload for the next pass or the tail; NOTE(review): on the last pass this may read beyond the numSamples inputs - confirm */
+        pSrc += 4;
+        curVec1 = vminq(curVec1, vecHigh);
+        vst1q(pDst, curVec1);
+        pDst += 4;
+    }
+    /*
+     * Tail handling: numSamples modulo 8 samples remain
+     */
+    blkCnt = numSamples - ((numSamples >> 3) << 3);
+    if (blkCnt >= 4)                            /* one more full vector, already held in curVec0 */
+    {
+        curVec0 = vmaxq(curVec0, vecLow);
+        curVec0 = vminq(curVec0, vecHigh);
+        vst1q(pDst, curVec0);
+        pDst += 4;
+        curVec0 = vld1q(pSrc);                  /* load the remaining (possibly partial) vector */
+        pSrc += 4;
+    }
+
+    if (blkCnt > 0)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt & 3);  /* enable only the leftover (blkCnt mod 4) lanes; none when blkCnt == 4 */
+        curVec0 = vmaxq(curVec0, vecLow);
+        curVec0 = vminq(curVec0, vecHigh);
+        vstrwq_p(pDst, curVec0, p0);            /* predicated store: only the enabled lanes are written */
+    }
+}
+
+#else
+void arm_clip_q31(const q31_t * pSrc, 
+  q31_t * pDst, 
+  q31_t low, 
+  q31_t high, 
+  uint32_t numSamples)
+{
+    while (numSamples-- > 0U)           /* one pass per remaining sample */
+    {
+        q31_t clipped = *pSrc++;        /* start from the unmodified input */
+        if (clipped > high)             /* upper bound is tested first */
+            clipped = high;
+        else if (clipped < low)
+            clipped = low;
+        *pDst++ = clipped;              /* in-range values pass through unchanged */
+    }
+}
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicClip group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_q7.c
new file mode 100644
index 0000000000000000000000000000000000000000..89e16af9ecb5c51be6562a5513e2520c46bd3539
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_clip_q7.c
@@ -0,0 +1,133 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_clip_q7.c
+ * Description:  Q7 vector clipping
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/basic_math_functions.h"
+
+/**
+  @ingroup groupMath
+ */
+
+
+/**
+  @addtogroup BasicClip
+  @{
+ */
+
+/**
+  @brief         Elementwise fixed-point clipping
+  @param[in]     pSrc          points to input values
+  @param[out]    pDst          points to output clipped values
+  @param[in]     low           lower bound
+  @param[in]     high          higher bound
+  @param[in]     numSamples    number of samples to clip
+  @return        none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+void arm_clip_q7(const q7_t * pSrc,         /* input samples */
+  q7_t * pDst,                              /* clipped output samples */
+  q7_t low,                                 /* lower bound */
+  q7_t high,                                /* upper bound */
+  uint32_t numSamples)                      /* number of samples to clip */
+{
+    uint32_t  blkCnt;           /* loop count, later reused as the tail length */
+    q7x16_t curVec0, curVec1;   /* the two vectors kept in flight (16 samples each) */
+    q7x16_t vecLow, vecHigh;    /* low / high copied into every lane */
+
+    vecLow = vdupq_n_s8(low);
+    vecHigh = vdupq_n_s8(high);
+
+    curVec0 = vld1q(pSrc);      /* preload the first vector ahead of the loop */
+    pSrc += 16;                 /* NOTE(review): the load above runs before numSamples is examined, so 16 elements are read even for shorter inputs - confirm callers tolerate this */
+    /*
+     * Unrolled x 2 (32 samples per pass) to allow
+     * vldr/vstr/vmin/vmax stall free interleaving;
+     * every pass ends by preloading curVec0 for what follows
+     */
+    blkCnt = numSamples >> 5;
+    while (blkCnt--)
+    {
+        curVec0 = vmaxq(curVec0, vecLow);       /* lane-wise max against low */
+        curVec1 = vld1q(pSrc);
+        pSrc += 16;
+        curVec0 = vminq(curVec0, vecHigh);      /* lane-wise min against high */
+        vst1q(pDst, curVec0);
+        pDst += 16;
+        curVec1 = vmaxq(curVec1, vecLow);
+        curVec0 = vld1q(pSrc);                  /* preload for the next pass or the tail; NOTE(review): on the last pass this may read beyond the numSamples inputs - confirm */
+        pSrc += 16;
+        curVec1 = vminq(curVec1, vecHigh);
+        vst1q(pDst, curVec1);
+        pDst += 16;
+    }
+    /*
+     * Tail handling: numSamples modulo 32 samples remain
+     */
+    blkCnt = numSamples - ((numSamples >> 5) << 5);
+    if (blkCnt >= 16)                           /* one more full vector, already held in curVec0 */
+    {
+        curVec0 = vmaxq(curVec0, vecLow);
+        curVec0 = vminq(curVec0, vecHigh);
+        vst1q(pDst, curVec0);
+        pDst += 16;
+        curVec0 = vld1q(pSrc);                  /* load the remaining (possibly partial) vector */
+        pSrc += 16;
+    }
+
+    if (blkCnt > 0)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt & 0xf); /* enable only the leftover (blkCnt mod 16) lanes; none when blkCnt == 16 */
+        curVec0 = vmaxq(curVec0, vecLow);
+        curVec0 = vminq(curVec0, vecHigh);
+        vstrbq_p(pDst, curVec0, p0);            /* predicated store: only the enabled lanes are written */
+    }
+}
+
+#else
+void arm_clip_q7(const q7_t * pSrc, 
+  q7_t * pDst, 
+  q7_t low, 
+  q7_t high, 
+  uint32_t numSamples)
+{
+    while (numSamples-- > 0U)           /* one pass per remaining sample */
+    {
+        q7_t clipped = *pSrc++;         /* start from the unmodified input */
+        if (clipped > high)             /* upper bound is tested first */
+            clipped = high;
+        else if (clipped < low)
+            clipped = low;
+        *pDst++ = clipped;              /* in-range values pass through unchanged */
+    }
+}
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicClip group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..55c88dfccf656e43522abba30b990c52bad049f6
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f16.c
@@ -0,0 +1,184 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dot_prod_f16.c
+ * Description:  Floating-point dot product
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/basic_math_functions_f16.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicDotProd Vector Dot Product
+
+  Computes the dot product of two vectors.
+  The vectors are multiplied element-by-element and then summed.
+
+  <pre>
+      sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicDotProd
+  @{
+ */
+
+/**
+  @brief         Dot product of floating-point vectors.
+  @param[in]     pSrcA      points to the first input vector.
+  @param[in]     pSrcB      points to the second input vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @param[out]    result     output result returned here.
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+
+void arm_dot_prod_f16(
+    const float16_t * pSrcA,
+    const float16_t * pSrcB,
+    uint32_t    blockSize,
+    float16_t * result)
+{
+    f16x8_t vecA, vecB;         /* current 8 samples of each input */
+    f16x8_t vecSum;             /* per-lane partial sums */
+    uint32_t blkCnt;            /* loop counter, then tail length */
+    float16_t sum = 0.0f;       /* final scalar result */
+    vecSum = vdupq_n_f16(0.0f);
+
+    /* Process 8 samples per iteration (one f16x8_t vector from each input) */
+    blkCnt = blockSize >> 3U;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
+         * Load 8 samples from each source, advance both source pointers,
+         * and accumulate the lane-wise products into vecSum
+         */
+        vecA = vld1q(pSrcA);
+        pSrcA += 8;
+        
+        vecB = vld1q(pSrcB);
+        pSrcB += 8;
+
+        vecSum = vfmaq(vecSum, vecA, vecB);
+        /*
+         * Decrement the vector loop counter
+         */
+        blkCnt --;
+    }
+
+
+    blkCnt = blockSize & 7;     /* samples left over after the full vectors */
+    if (blkCnt > 0U)
+    {
+        /* Tail: only the first blkCnt lanes (predicate p0) are accumulated */
+
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecA = vld1q(pSrcA);    /* NOTE(review): unpredicated full-vector load, reads 8 elements although fewer remain - confirm the over-read is acceptable */
+        vecB = vld1q(pSrcB);    /* NOTE(review): same full-vector load on the second input */
+        vecSum = vfmaq_m(vecSum, vecA, vecB, p0);
+    }
+
+    sum = vecAddAcrossF16Mve(vecSum);   /* helper defined elsewhere; presumably adds the lanes of vecSum - TODO confirm */
+
+    /* Store result in destination buffer */
+    *result = sum;
+
+}
+
+#else
+#if defined(ARM_FLOAT16_SUPPORTED)
+void arm_dot_prod_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        uint32_t blockSize,
+        float16_t * result)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        _Float16 sum = 0.0f;                          /* Running sum of products, kept in _Float16 */
+
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: accumulate 4 products per iteration */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling: 4 samples per iteration.
+   ** The second loop below handles the remaining 0 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+    /* Multiply the next pair of samples and add the product to the running sum. */
+    sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
+
+    sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
+
+    sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
+
+    sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: samples left over after the groups of 4 */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolling: the loop below processes every sample */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+    /* Multiply the next pair of samples and add the product to the running sum. */
+    sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store result in destination buffer */
+  *result = sum;
+}
+#endif
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of BasicDotProd group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f32.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f32.c
index 2bce15bf3cc0fc726e0ac853777e2a847f1cfad3..df568f32f032c15fa9866e2779bcfe9a8fdaf587 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f32.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_dot_prod_f32.c
  * Description:  Floating-point dot product
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_q15.c
index 7a6a52555159ffd2eaf3009670ce6abe414b8a9b..3aa1cb3c1e84284ab4c08a294c33463a69fe4a87 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_q15.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_dot_prod_q15.c
  * Description:  Q15 dot product
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -52,7 +52,7 @@
                    there is no risk of overflow.
                    The return result is in 34.30 format.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_q31.c
index 0606f94037d6fda6c95105afc1fc8114a580a900..b85da3798258d17d214eae2b442060c22cda09e9 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_q31.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_dot_prod_q31.c
  * Description:  Q31 dot product
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -54,7 +54,7 @@
                    The return result is in 16.48 format.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_q7.c
index dff7ecec04f899bbd03f447c39f04cfc92de0826..268be604fe796ce31e7346fe762eddd5a2702a6f 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_q7.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_dot_prod_q7.c
  * Description:  Q7 dot product
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -53,7 +53,7 @@
                    The return result is in 18.14 format.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_f16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..4d99b8cacdeead8782065e75cb318bd6e28fb25c
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_f16.c
@@ -0,0 +1,171 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mult_f16.c
+ * Description:  Floating-point vector multiplication
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/basic_math_functions_f16.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicMult Vector Multiplication
+
+  Element-by-element multiplication of two vectors.
+
+  <pre>
+      pDst[n] = pSrcA[n] * pSrcB[n],   0 <= n < blockSize.
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicMult
+  @{
+ */
+
+/**
+  @brief         Floating-point vector multiplication.
+  @param[in]     pSrcA      points to the first input vector.
+  @param[in]     pSrcB      points to the second input vector.
+  @param[out]    pDst       points to the output vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @return        none
+ */
+
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) 
+
+#include "arm_helium_utils.h"
+
+void arm_mult_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+
+    f16x8_t vec1;                                  /* Samples loaded from pSrcA */
+    f16x8_t vec2;                                  /* Samples loaded from pSrcB */
+    f16x8_t res;                                   /* Lane-wise products */
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3U;
+    while (blkCnt > 0U)
+    {
+        /* C = A * B */
+
+        /* Multiply and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrcA);
+        vec2 = vld1q(pSrcB);
+        res = vmulq(vec1, vec2);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1 to 7 leftover samples, handled with a lane predicate */
+    blkCnt = blockSize & 0x7;
+    if (blkCnt > 0U)
+    {
+      /* C = A * B; loads are predicated so nothing past blockSize is read */
+      mve_pred16_t p0 = vctp16q(blkCnt);
+      vec1 = vld1q_z(pSrcA, p0);
+      vec2 = vld1q_z(pSrcB, p0);
+      vstrhq_p(pDst, vmulq(vec1,vec2), p0);
+    }
+
+}
+
+#else
+#if defined(ARM_FLOAT16_SUPPORTED)
+void arm_mult_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+    /* Scalar path: pDst[n] = pSrcA[n] * pSrcB[n] for 0 <= n < blockSize. */
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+    /* Multiply inputs and store result in destination buffer (4 samples per pass). */
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs (0 to 3 samples are left) */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * B */
+
+    /* Multiply input and store result in destination buffer, one sample per pass. */
+    *pDst++ = (*pSrcA++) * (*pSrcB++);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicMult group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_f32.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_f32.c
index 687de7afb8d7a8dafcf5cceb40d8931210d2b5e3..4df04e9629d38bce7bd9824938df141131fef71b 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_f32.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_mult_f32.c
  * Description:  Floating-point vector multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q15.c
index 8e9dc79debaf3467fa933b983f200ba4944bfb16..167220a4b7a74facceadc0e72a4563be5b50b70c 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q15.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_mult_q15.c
  * Description:  Q15 vector multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -49,7 +49,7 @@
                    The function uses saturating arithmetic.
                    Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q31.c
index 606c000ab39fed325659185623a761de3ffafdc2..63ad73c410af252da61e459bac97bdf154114cf3 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q31.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_mult_q31.c
  * Description:  Q31 vector multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -49,7 +49,7 @@
                    The function uses saturating arithmetic.
                    Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q7.c
index 06f56d311317467c097e408d04643a04225ae925..7be80db5066193cc9e0b8d926217c38c663b6c07 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q7.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_mult_q7.c
  * Description:  Q7 vector multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -49,7 +49,7 @@
                    The function uses saturating arithmetic.
                    Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_f16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..2fe26a240db29bcde41650f9dae356646b406104
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_f16.c
@@ -0,0 +1,166 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_negate_f16.c
+ * Description:  Negates floating-point vectors
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/basic_math_functions_f16.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicNegate Vector Negate
+
+  Negates the elements of a vector.
+
+  <pre>
+      pDst[n] = -pSrc[n],   0 <= n < blockSize.
+  </pre>
+
+  The functions support in-place computation allowing the source and
+  destination pointers to reference the same memory buffer.
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicNegate
+  @{
+ */
+
+/**
+  @brief         Negates the elements of a floating-point vector.
+  @param[in]     pSrc       points to input vector.
+  @param[out]    pDst       points to output vector.
+  @param[in]     blockSize  number of samples in each vector.
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_negate_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+    f16x8_t vec1;                                  /* Samples loaded from pSrc */
+    f16x8_t res;                                   /* Negated samples */
+
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3U;
+    while (blkCnt > 0U)
+    {
+        /* C = -A */
+
+        /* Negate and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vnegq(vec1);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 8;
+        pDst += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1 to 7 leftover samples, handled with a lane predicate */
+    blkCnt = blockSize & 0x7;
+    if (blkCnt > 0U)
+    {
+      /* C = -A; the load is predicated so nothing past blockSize is read */
+      mve_pred16_t p0 = vctp16q(blkCnt);
+      vec1 = vld1q_z((float16_t const *) pSrc, p0);
+      vstrhq_p(pDst, vnegq(vec1), p0);
+    }
+
+}
+
+#else
+#if defined(ARM_FLOAT16_SUPPORTED)
+void arm_negate_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+    /* Scalar path: pDst[n] = -pSrc[n] for 0 <= n < blockSize. */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+    /* Negate and store result in destination buffer (4 samples per pass). */
+    *pDst++ = -*pSrc++;
+
+    *pDst++ = -*pSrc++;
+
+    *pDst++ = -*pSrc++;
+
+    *pDst++ = -*pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs (0 to 3 samples are left) */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = -A */
+
+    /* Negate and store result in destination buffer, one sample per pass. */
+    *pDst++ = -*pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicNegate group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_f32.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_f32.c
index 4eae2a284a3b53435b2026ecef68ff817c6a650c..4f243cdac95bd0165c37956bbeb0c8815d00e42d 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_f32.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_negate_f32.c
  * Description:  Negates floating-point vectors
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_q15.c
index ea0b008996731264841bc9625721a4f19032eb8c..a1a7e8c4770011a4e385b1d9a27601b5f251b447 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_q15.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_negate_q15.c
  * Description:  Negates Q15 vectors
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -50,7 +50,7 @@
                    The function uses saturating arithmetic.
                    The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_q31.c
index f962fe483b88c0f579f8b29c873e0b139e1d4bfa..3ee77ed742eb04e7573aa4b75145d2afc4b8f215 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_q31.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_negate_q31.c
  * Description:  Negates Q31 vectors
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -49,7 +49,7 @@
                    The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_q7.c
index 03ace0e8a8d78b329328cde9376a8859d6d4ac84..2e76567f786cbb1b61c14807e14ef2acba69b415 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_q7.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_negate_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_negate_q7.c
  * Description:  Negates Q7 vectors
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -48,7 +48,7 @@
                    The function uses saturating arithmetic.
                    The Q7 value -1 (0x80) is saturated to the maximum allowable positive value 0x7F.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_not_u16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_u16.c
index e1ce1b77d7a7340add45faf7ad63d1eed76ae7a7..e553d28793698cc5e2236b4b320f72486e7e6478 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_not_u16.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_u16.c
@@ -3,13 +3,13 @@
  * Title:        arm_not_u16.c
  * Description:  uint16_t bitwise NOT
  *
- * $Date:        14 November 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -61,7 +61,7 @@ void arm_not_u16(
     uint32_t blkCnt;      /* Loop counter */
 
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-    q15x8_t vecSrc;
+    uint16x8_t vecSrc;
 
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_not_u32.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_u32.c
index eed0408f1c98b5c10e269f4af7d4984e09b03614..f46284c87845a47bd09a9c837533131e244d301d 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_not_u32.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_u32.c
@@ -3,13 +3,13 @@
  * Title:        arm_not_u32.c
  * Description:  uint32_t bitwise NOT
  *
- * $Date:        14 November 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -53,7 +53,7 @@ void arm_not_u32(
     uint32_t blkCnt;      /* Loop counter */
 
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-    q31x4_t vecSrc;
+    uint32x4_t vecSrc;
 
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 2;
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_not_u8.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_u8.c
index 83e0de57d0c75b00eb72c24162abeb2574da258a..e06828386c8e19b36dbf381a8a7998a861da1879 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_not_u8.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_u8.c
@@ -3,13 +3,13 @@
  * Title:        arm_not_u8.c
  * Description:  uint8_t bitwise NOT
  *
- * $Date:        14 November 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -53,7 +53,7 @@ void arm_not_u8(
     uint32_t blkCnt;      /* Loop counter */
 
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-    q7x16_t vecSrc;
+    uint8x16_t vecSrc;
 
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_f16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..2ae2f80ce9a2497703153bb35230e246c8e50a36
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_f16.c
@@ -0,0 +1,170 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_offset_f16.c
+ * Description:  Floating-point vector offset
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/basic_math_functions_f16.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicOffset Vector Offset
+
+  Adds a constant offset to each element of a vector.
+
+  <pre>
+      pDst[n] = pSrc[n] + offset,   0 <= n < blockSize.
+  </pre>
+
+  The functions support in-place computation allowing the source and
+  destination pointers to reference the same memory buffer.
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicOffset
+  @{
+ */
+
+/**
+  @brief         Adds a constant offset to a floating-point vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     offset     is the offset to be added
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_offset_f16(
+  const float16_t * pSrc,
+        float16_t offset,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+
+    f16x8_t vec1;                                  /* Samples loaded from pSrc */
+    f16x8_t res;                                   /* Samples with offset added */
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3U;
+    while (blkCnt > 0U)
+    {
+        /* C = A + offset */
+
+        /* Add offset and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vaddq(vec1,offset);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 8;
+        pDst += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1 to 7 leftover samples, handled with a lane predicate */
+    blkCnt = blockSize & 0x7;
+    /* C = A + offset; the load is predicated so nothing past blockSize is read */
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vec1 = vld1q_z((float16_t const *) pSrc, p0);
+        vstrhq_p(pDst, vaddq(vec1, offset), p0);
+    }
+
+
+}
+
+#else
+#if defined(ARM_FLOAT16_SUPPORTED)
+void arm_offset_f16(
+  const float16_t * pSrc,
+        float16_t offset,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+    /* Scalar path: pDst[n] = pSrc[n] + offset for 0 <= n < blockSize. */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + offset */
+
+    /* Add offset and store result in destination buffer (4 samples per pass). */
+    *pDst++ = (*pSrc++) + offset;
+
+    *pDst++ = (*pSrc++) + offset;
+
+    *pDst++ = (*pSrc++) + offset;
+
+    *pDst++ = (*pSrc++) + offset;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs (0 to 3 samples are left) */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A + offset */
+
+    /* Add offset and store result in destination buffer, one sample per pass. */
+    *pDst++ = (*pSrc++) + offset;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicOffset group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_f32.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_f32.c
index 3bed80651319793b602a99afdb94194c10c74975..a68df050a04d5b27ea672045235a56f077de391d 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_f32.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_offset_f32.c
  * Description:  Floating-point vector offset
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_q15.c
index 81f820bd01a3205b7952b4b601ac166293681911..ff5515c777680aabac5fe68ad3cd73fa6342a3da 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_q15.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_offset_q15.c
  * Description:  Q15 vector offset
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -49,7 +49,7 @@
                    The function uses saturating arithmetic.
                    Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_q31.c
index bdc8cf33153fabd483257de9ec960c67e8a91ff5..2f702b0ff71e934d09dc7b1c979f7fa8bea04716 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_q31.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_offset_q31.c
  * Description:  Q31 vector offset
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -50,7 +50,7 @@
                    Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
@@ -114,29 +114,13 @@ void arm_offset_q31(
     /* C = A + offset */
 
     /* Add offset and store result in destination buffer. */
-#if defined (ARM_MATH_DSP)
     *pDst++ = __QADD(*pSrc++, offset);
-#else
-    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
-#endif
-
-#if defined (ARM_MATH_DSP)
+    
     *pDst++ = __QADD(*pSrc++, offset);
-#else
-    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
-#endif
-
-#if defined (ARM_MATH_DSP)
+    
     *pDst++ = __QADD(*pSrc++, offset);
-#else
-    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
-#endif
-
-#if defined (ARM_MATH_DSP)
+    
     *pDst++ = __QADD(*pSrc++, offset);
-#else
-    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
-#endif
 
     /* Decrement loop counter */
     blkCnt--;
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_q7.c
index a9de6cc38a4399deea3c33d90757704afa98b68a..9199a9a4ebf5485516b48c59431c335736b73bf6 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_q7.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_offset_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_offset_q7.c
  * Description:  Q7 vector offset
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -49,7 +49,7 @@
                    The function uses saturating arithmetic.
                    Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
@@ -123,10 +123,10 @@ void arm_offset_q7(
     /* Add offset and store result in destination buffer (4 samples at a time). */
     write_q7x4_ia (&pDst, __QADD8(read_q7x4_ia ((q7_t **) &pSrc), offset_packed));
 #else
-    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
-    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
-    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
-    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
+    *pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
 #endif
 
     /* Decrement loop counter */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_or_u16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_u16.c
index 8f2abbccd40accecf774c3de2ab8fda88dee00a9..92d28036b9479e166f3be4384a4df9b090e26b27 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_or_u16.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_u16.c
@@ -3,13 +3,13 @@
  * Title:        arm_or_u16.c
  * Description:  uint16_t bitwise inclusive OR
  *
- * $Date:        14 November 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -63,7 +63,7 @@ void arm_or_u16(
     uint32_t blkCnt;      /* Loop counter */
 
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-    q15x8_t vecSrcA, vecSrcB;
+    uint16x8_t vecSrcA, vecSrcB;
 
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_or_u32.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_u32.c
index 66b97c051fbeca97879940eaae1740cedbc45896..14cc83bb5f7b535e78e74e3e4831daaf992de4ae 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_or_u32.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_u32.c
@@ -3,13 +3,13 @@
  * Title:        arm_or_u32.c
  * Description:  uint32_t bitwise inclusive OR
  *
- * $Date:        14 November 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -55,7 +55,7 @@ void arm_or_u32(
     uint32_t blkCnt;      /* Loop counter */
 
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-    q31x4_t vecSrcA, vecSrcB;
+    uint32x4_t vecSrcA, vecSrcB;
 
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_or_u8.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_u8.c
index 32ef686d26b26dd633d3a66483f1912e669fa840..beee07b69aaf03e3cb827e681066bf17170e5f76 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_or_u8.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_u8.c
@@ -3,13 +3,13 @@
  * Title:        arm_or_u8.c
  * Description:  uint8_t bitwise inclusive OR
  *
- * $Date:        14 November 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -55,7 +55,7 @@ void arm_or_u8(
     uint32_t blkCnt;      /* Loop counter */
 
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-    q7x16_t vecSrcA, vecSrcB;
+    uint8x16_t vecSrcA, vecSrcB;
 
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_f16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..449d74847ae59d2ef803451fdbb3018f39c96880
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_f16.c
@@ -0,0 +1,183 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_scale_f16.c
+ * Description:  Multiplies a floating-point vector by a scalar
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/basic_math_functions_f16.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicScale Vector Scale
+
+  Multiply a vector by a scalar value.  For floating-point data, the algorithm used is:
+
+  <pre>
+      pDst[n] = pSrc[n] * scale,   0 <= n < blockSize.
+  </pre>
+
+  In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by
+  a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
+  The shift allows the gain of the scaling operation to exceed 1.0.
+  The algorithm used with fixed-point data is:
+
+  <pre>
+      pDst[n] = (pSrc[n] * scaleFract) << shift,   0 <= n < blockSize.
+  </pre>
+
+  The overall scale factor applied to the fixed-point data is
+  <pre>
+      scale = scaleFract * 2^shift.
+  </pre>
+
+  The functions support in-place computation allowing the source and destination
+  pointers to reference the same memory buffer.
+ */
+
+/**
+  @addtogroup BasicScale
+  @{
+ */
+
+/**
+  @brief         Multiplies a floating-point vector by a scalar.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     scale      scale factor to be applied
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_scale_f16(
+  const float16_t * pSrc,
+        float16_t scale,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+
+    f16x8_t vec1;                                  /* Eight input samples */
+    f16x8_t res;                                   /* Eight scaled samples */
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A * scale */
+
+        /* Scale the inputs and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vmulq(vec1, scale);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 8;
+        pDst += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: samples left over when blockSize is not a multiple of 8 */
+    blkCnt = blockSize & 0x7;
+
+    if (blkCnt > 0U)
+    {
+        /* Predicate the load as well as the store so nothing past the end of pSrc is read */
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vec1 = vld1q_z(pSrc, p0);
+        vstrhq_p(pDst, vmulq(vec1, scale), p0);
+    }
+
+}
+
+#else
+#if defined(ARM_FLOAT16_SUPPORTED)
+void arm_scale_f16(
+  const float16_t *pSrc,
+        float16_t scale,
+        float16_t *pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time (not used when ARM_MATH_AUTOVECTORIZE is defined) */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * scale */
+
+    /* Scale input and store result in destination buffer. */
+    *pDst++ = (*pSrc++) * scale;
+
+    *pDst++ = (*pSrc++) * scale;
+
+    *pDst++ = (*pSrc++) * scale;
+
+    *pDst++ = (*pSrc++) * scale;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * scale */
+
+    /* Scale input and store result in destination buffer. */
+    *pDst++ = (*pSrc++) * scale;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicScale group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_f32.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_f32.c
index 053492f6dbba9375888a665b2f7cfe5455965d6b..59bc813fab319181a014965bfa67bc04351e2f63 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_f32.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_scale_f32.c
  * Description:  Multiplies a floating-point vector by a scalar
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_q15.c
index 159c7db984667b8b12403c65d377d9762adcceb2..f7062022180a87e97c516cb05e9810294e25d4c1 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_q15.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_scale_q15.c
  * Description:  Multiplies a Q15 vector by a scalar
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -51,7 +51,7 @@
                    These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_q31.c
index e4639bb6ca968bf1105471cb47e5558e3edf4749..4e28441b2b7a746dbbb937cc6ea02f95e8766fb1 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_q31.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_scale_q31.c
  * Description:  Multiplies a Q31 vector by a scalar
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -51,7 +51,7 @@
                    These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_q7.c
index d12353729c760ceb0c52028dc405a69364894eab..5bb4580f86676413ecb187bc0ba403b1b6cec565 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_q7.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_scale_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_scale_q7.c
  * Description:  Multiplies a Q7 vector by a scalar
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -51,7 +51,7 @@
                    These are multiplied to yield a 2.14 intermediate result and this is shifted with saturation to 1.7 format.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_shift_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_shift_q15.c
index 0de2eb4fa131a747566604036a1e8f6a1c5ab944..2de3b2b02fac2955a87f6d2fea7cf722912ebc69 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_shift_q15.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_shift_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_shift_q15.c
  * Description:  Shifts the elements of a Q15 vector by a specified number of bits
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -50,7 +50,7 @@
                    Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
@@ -131,11 +131,11 @@ void arm_shift_q15(
 
       /* Shift the inputs and then store the results in the destination buffer. */
 #ifndef ARM_MATH_BIG_ENDIAN
-      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in1 << shiftBits), 16),
-                                     __SSAT((in2 << shiftBits), 16), 16));
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT(((q31_t) in1 << shiftBits), 16),
+                                     __SSAT(((q31_t) in2 << shiftBits), 16), 16));
 #else
-      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in2 << shiftBits), 16),
-                                      __SSAT((in1 << shiftBits), 16), 16));
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT(((q31_t) in2 << shiftBits), 16),
+                                      __SSAT(((q31_t) in1 << shiftBits), 16), 16));
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 
       /* read 2 samples from source */
@@ -143,11 +143,11 @@ void arm_shift_q15(
       in2 = *pSrc++;
 
 #ifndef ARM_MATH_BIG_ENDIAN
-      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in1 << shiftBits), 16),
-                                     __SSAT((in2 << shiftBits), 16), 16));
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT(((q31_t) in1 << shiftBits), 16),
+                                     __SSAT(((q31_t) in2 << shiftBits), 16), 16));
 #else
-      write_q15x2_ia (&pDst, __PKHBT(__SSAT((in2 << shiftBits), 16),
-                                     __SSAT((in1 << shiftBits), 16), 16));
+      write_q15x2_ia (&pDst, __PKHBT(__SSAT(((q31_t) in2 << shiftBits), 16),
+                                     __SSAT(((q31_t) in1 << shiftBits), 16), 16));
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 
 #else
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_shift_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_shift_q31.c
index c8cd660ebe73b690ea158d6fd40cd706c238b4e1..5405c2411e32b88092323892fcb96835bdc00d46 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_shift_q31.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_shift_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_shift_q31.c
  * Description:  Shifts the elements of a Q31 vector by a specified number of bits
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -67,7 +67,7 @@
                    Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_shift_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_shift_q7.c
index ecfe1984f72b32fe0dd8a662227995c9980ea64f..4f83edcccd992474f5613d82ec68a067b19c7afe 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_shift_q7.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_shift_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_shift_q7.c
  * Description:  Processing function for the Q7 Shifting
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -52,7 +52,7 @@
                    Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
@@ -134,10 +134,10 @@ void arm_shift_q7(
       in4 = *pSrc++;
 
     /* Pack and store result in destination buffer (in single write) */
-      write_q7x4_ia (&pDst, __PACKq7(__SSAT((in1 << shiftBits), 8),
-                                     __SSAT((in2 << shiftBits), 8),
-                                     __SSAT((in3 << shiftBits), 8),
-                                     __SSAT((in4 << shiftBits), 8) ));
+      write_q7x4_ia (&pDst, __PACKq7(__SSAT(((q15_t) in1 << shiftBits), 8),
+                                     __SSAT(((q15_t) in2 << shiftBits), 8),
+                                     __SSAT(((q15_t) in3 << shiftBits), 8),
+                                     __SSAT(((q15_t) in4 << shiftBits), 8) ));
 #else
       *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
       *pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_f16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..eddbbd25b77e39fa1c658a56966ac7707ad6404c
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_f16.c
@@ -0,0 +1,171 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_sub_f16.c
+ * Description:  Floating-point vector subtraction
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/basic_math_functions_f16.h"
+
+/**
+  @ingroup groupMath
+ */
+
+/**
+  @defgroup BasicSub Vector Subtraction
+
+  Element-by-element subtraction of two vectors.
+
+  <pre>
+      pDst[n] = pSrcA[n] - pSrcB[n],   0 <= n < blockSize.
+  </pre>
+
+  There are separate functions for floating-point, Q7, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup BasicSub
+  @{
+ */
+
+/**
+  @brief         Floating-point vector subtraction.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include "arm_helium_utils.h"
+
+void arm_sub_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+
+    f16x8_t vec1;                                  /* Eight samples of A */
+    f16x8_t vec2;                                  /* Eight samples of B */
+    f16x8_t res;                                   /* Eight differences A - B */
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A - B */
+
+        /* Subtract and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrcA);
+        vec2 = vld1q(pSrcB);
+        res = vsubq(vec1, vec2);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst += 8;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: samples left over when blockSize is not a multiple of 8 */
+    blkCnt = blockSize & 0x7;
+
+    if (blkCnt > 0U)
+    {
+        /* C = A - B; loads and store are predicated so nothing past the end of the buffers is touched */
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vec1 = vld1q_z(pSrcA, p0);
+        vec2 = vld1q_z(pSrcB, p0);
+        vstrhq_p(pDst, vsubq(vec1, vec2), p0);
+    }
+
+}
+
+#else
+#if defined(ARM_FLOAT16_SUPPORTED)
+void arm_sub_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+  /* Samples (or groups of four samples) still waiting to be processed. */
+  uint32_t remaining;
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Unrolled section: every pass of the loop below produces four
+   * outputs, so it runs blockSize / 4 times. */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+    /*
+     * dst[k] = a[k] - b[k] for four consecutive samples; each pair of
+     * inputs is read before the matching output is written.
+     */
+    pDst[0] = pSrcA[0] - pSrcB[0];
+    pDst[1] = pSrcA[1] - pSrcB[1];
+    pDst[2] = pSrcA[2] - pSrcB[2];
+    pDst[3] = pSrcA[3] - pSrcB[3];
+
+    /* Step all three pointers past the four samples just handled. */
+    pSrcA += 4;
+    pSrcB += 4;
+    pDst += 4;
+  }
+
+  /* The blockSize % 4 leftover samples go through the loop below. */
+  remaining = blockSize & 0x3U;
+
+#else
+
+  /* No unrolling: the loop below handles every sample. */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* One output per pass. */
+  for (; remaining > 0U; remaining--)
+  {
+    /* dst = a - b */
+    *pDst = *pSrcA - *pSrcB;
+
+    pSrcA++;
+    pSrcB++;
+    pDst++;
+  }
+
+}
+#endif
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicSub group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_f32.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_f32.c
index 9f4865d7e17abc565428cbe68095392f622415c2..2a07c0c6c28cae4342090625aa10ca2f6c0098e0 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_f32.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_sub_f32.c
  * Description:  Floating-point vector subtraction
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_q15.c
index b4b267bb9d49187a5faed398d4c814741b8eccb6..5f05d4f2030c18da24e6f683a2694f11bda0a62a 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_q15.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_sub_q15.c
  * Description:  Q15 vector subtraction
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -50,7 +50,7 @@
                    Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_q31.c
index b4905df9a9620b93b8578770a8ce8a19aeb65297..ed879a1e285804f3a42e79ea300fc2bd0c914fc4 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_q31.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_sub_q31.c
  * Description:  Q31 vector subtraction
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -50,7 +50,7 @@
                    Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_q7.c
index e34e74ee321ee8822c2e32a926c98234c6f70766..3fb9d94e8a27b370abce008bb59b7d5fd521cf48 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_q7.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_sub_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_sub_q7.c
  * Description:  Q7 vector subtraction
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -49,7 +49,7 @@
                    The function uses saturating arithmetic.
                    Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_u16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_u16.c
index 02960377b564a31302ecda5e5cc6bb0a625c6698..54042ea7270c13020586737a735bfcbb9f24e48b 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_u16.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_u16.c
@@ -3,13 +3,13 @@
  * Title:        arm_xor_u16.c
  * Description:  uint16_t bitwise exclusive OR
  *
- * $Date:        14 November 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -63,7 +63,7 @@ void arm_xor_u16(
     uint32_t blkCnt;      /* Loop counter */
 
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-    q15x8_t vecSrcA, vecSrcB;
+    uint16x8_t vecSrcA, vecSrcB;
 
     /* Compute 8 outputs at a time */
     blkCnt = blockSize >> 3;
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_u32.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_u32.c
index 526a1a0a7b88a2f6e79830c4419f8cd5d0554a37..9d149084402c5359c801eed6587ccaa07ba5be3c 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_u32.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_u32.c
@@ -3,13 +3,13 @@
  * Title:        arm_xor_u32.c
  * Description:  uint32_t bitwise exclusive OR
  *
- * $Date:        14 November 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -55,7 +55,7 @@ void arm_xor_u32(
     uint32_t blkCnt;      /* Loop counter */
 
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-    q31x4_t vecSrcA, vecSrcB;
+    uint32x4_t vecSrcA, vecSrcB;
 
     /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2;
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_u8.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_u8.c
index 769e27cd9e873a0b96b7444cdaa49263ee9822e9..d708cd4e02d8ecd6657e845a1322564b7bac6442 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_u8.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_u8.c
@@ -3,13 +3,13 @@
  * Title:        arm_xor_u8.c
  * Description:  uint8_t bitwise exclusive OR
  *
- * $Date:        14 November 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/basic_math_functions.h"
 
 /**
   @ingroup groupMath
@@ -55,7 +55,7 @@ void arm_xor_u8(
     uint32_t blkCnt;      /* Loop counter */
 
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-    q7x16_t vecSrcA, vecSrcB;
+    uint8x16_t vecSrcA, vecSrcB;
 
     /* Compute 16 outputs at a time */
     blkCnt = blockSize >> 4;
diff --git a/CMSIS/DSP/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f16.c b/CMSIS/DSP/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..b918f704bf73bed0178f0c2a21b6a6787f8851e6
--- /dev/null
+++ b/CMSIS/DSP/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f16.c
@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_gaussian_naive_bayes_predict_f16.c
+ * Description:  Naive Gaussian Bayesian Estimator
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/bayes_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+#define PI_F 3.1415926535897932384626433832795f16
+
+/**
+ * @addtogroup groupBayes
+ * @{
+ */
+
+/**
+ * @brief Naive Gaussian Bayesian Estimator
+ *
+ * @param[in]  *S                       points to a naive bayes instance structure
+ * @param[in]  *in                      points to the elements of the input vector.
+ * @param[out] *pOutputProbabilities    points to a buffer of length numberOfClasses containing estimated probabilities
+ * @param[out] *pBufferB                points to a temporary buffer of length numberOfClasses
+ * @return The predicted class
+ *
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_instance_f16 *S, 
+   const float16_t * in, 
+   float16_t *pOutputProbabilities,
+   float16_t *pBufferB
+   )
+{
+    uint32_t         nbClass;
+    const float16_t *pTheta = S->theta;
+    const float16_t *pSigma = S->sigma;
+    float16_t      *buffer = pOutputProbabilities;
+    const float16_t *pIn = in;
+    float16_t       result;
+    f16x8_t         vsigma;
+    _Float16       tmp;
+    f16x8_t         vacc1, vacc2;
+    uint32_t        index;
+    float16_t       *logclassPriors=pBufferB;
+    float16_t      *pLogPrior = logclassPriors;
+    /* MVE path. pBufferB receives arm_vlog_f16(classPriors); presumably natural logs -- TODO confirm */
+    arm_vlog_f16((float16_t *) S->classPriors, logclassPriors, S->numberOfClasses);
+
+    pTheta = S->theta;
+    pSigma = S->sigma;
+    /* pTheta and pSigma run on from class to class; pIn is rewound to `in` for every class */
+    for (nbClass = 0; nbClass < S->numberOfClasses; nbClass++) {
+        pIn = in;
+        /* Fresh 8-lane accumulators: vacc1 for the log terms, vacc2 for the weighted squares */
+        vacc1 = vdupq_n_f16(0.0f16);
+        vacc2 = vdupq_n_f16(0.0f16);
+        /* Full 8-element blocks first */
+        uint32_t         blkCnt =S->vectorDimension >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vinvSigma, vtmp;
+            /* vsigma = sigma + epsilon; add vlogq_f16(2*PI*vsigma) to vacc1 (presumably a lane-wise log) */
+            vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon);
+            vacc1 = vaddq(vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f16 * (_Float16)PI)));
+            /* presumably an approximate 1/vsigma, going by the helper's name -- TODO confirm */
+            vinvSigma = vrecip_medprec_f16(vsigma);
+
+            vtmp = vsubq(vld1q(pIn), vld1q(pTheta));
+            /* squaring */
+            vtmp = vmulq(vtmp, vtmp);
+            /* vacc2 += (in - theta)^2 * vinvSigma */
+            vacc2 = vfmaq(vacc2, vtmp, vinvSigma);
+
+            pIn += 8;
+            pTheta += 8;
+            pSigma += 8;
+            blkCnt--;
+        }
+        /* Tail: the remaining 1..7 elements, with lanes >= blkCnt masked out of the sums by p0 */
+        blkCnt = S->vectorDimension & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vinvSigma, vtmp;
+            /* NOTE(review): vld1q below is not predicated (8 lanes are read); only the sums are masked by p0 */
+            vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon);
+            vacc1 =
+                vaddq_m_f16(vacc1, vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f16 * (_Float16)PI)), p0);
+
+            vinvSigma = vrecip_medprec_f16(vsigma);
+
+            vtmp = vsubq(vld1q(pIn), vld1q(pTheta));
+            /* squaring */
+            vtmp = vmulq(vtmp, vtmp);
+            /* Predicated multiply-accumulate: lanes outside p0 keep their vacc2 value */
+            vacc2 = vfmaq_m_f16(vacc2, vtmp, vinvSigma, p0);
+            /* pIn is not advanced here: it is reset at the top of the class loop */
+            pTheta += blkCnt;
+            pSigma += blkCnt;
+        }
+        /* tmp = -0.5 * (lane sum of vacc1) - 0.5 * (lane sum of vacc2); lane sum presumed from the helper's name */
+        tmp = -0.5f16 * (_Float16)vecAddAcrossF16Mve(vacc1);
+        tmp -= 0.5f16 * (_Float16)vecAddAcrossF16Mve(vacc2);
+        /* Add this class's entry from the log-prior buffer and store the score */
+        *buffer = tmp + *pLogPrior++;
+        buffer++;
+    }
+    /* arm_max_f16 fills result and index from the per-class scores; only index is used */
+    arm_max_f16(pOutputProbabilities, S->numberOfClasses, &result, &index);
+
+    return (index);
+}
+
+#else
+
+uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_instance_f16 *S,
+   const float16_t * in,
+   float16_t *pOutputProbabilities,
+   float16_t *pBufferB)
+{
+    /*
+     * Scalar path. For each class, two sums are built over all dimensions:
+     *   logTerm  += logf(2 * PI_F * (sigma + epsilon))
+     *   distTerm += (in - theta)^2 / (sigma + epsilon)
+     * The class score -0.5 * logTerm - 0.5 * distTerm + logf(prior) is written
+     * to pOutputProbabilities, and the index reported by arm_max_f16 over those
+     * scores is returned. pBufferB is not used on this path.
+     */
+    const float16_t *priors = S->classPriors;
+    const float16_t *classTheta = S->theta;   /* theta values of the current class */
+    const float16_t *classSigma = S->sigma;   /* sigma values of the current class */
+    uint32_t classIdx, dim;
+    uint32_t bestIdx;
+    float16_t bestVal;
+    _Float16 variance;
+    _Float16 score;
+    _Float16 logTerm, distTerm;
+
+    (void)pBufferB;
+
+    for (classIdx = 0; classIdx < S->numberOfClasses; classIdx++)
+    {
+        logTerm = 0.0f16;
+        distTerm = 0.0f16;
+
+        /* The input is re-read from its first element for every class */
+        for (dim = 0; dim < S->vectorDimension; dim++)
+        {
+           variance = classSigma[dim] + S->epsilon;
+           logTerm += logf(2.0f16 * (_Float16)PI_F * variance);
+           distTerm += (in[dim] - classTheta[dim]) * (in[dim] - classTheta[dim]) / variance;
+        }
+
+        /* dim equals the number of dimensions just consumed: step to the next class */
+        classTheta += dim;
+        classSigma += dim;
+
+        /* Halve and negate each sum in its own statement before combining */
+        score = -0.5f16 * logTerm;
+        score -= 0.5f16 * distTerm;
+
+        pOutputProbabilities[classIdx] = score + logf(priors[classIdx]);
+    }
+
+    /* arm_max_f16 reports the largest score and its position */
+    arm_max_f16(pOutputProbabilities, S->numberOfClasses, &bestVal, &bestIdx);
+
+    return(bestIdx);
+}
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of groupBayes group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c b/CMSIS/DSP/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c
index 27bd793fb7803dc5dd8c001119a88586c37ce8aa..56331ff523c2a86b8034fe144da4bbbb53c3a2bf 100644
--- a/CMSIS/DSP/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c
+++ b/CMSIS/DSP/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_naive_gaussian_bayes_predict_f32
  * Description:  Naive Gaussian Bayesian Estimator
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/bayes_functions.h"
 #include <limits.h>
 #include <math.h>
 
@@ -39,13 +41,12 @@
 /**
  * @brief Naive Gaussian Bayesian Estimator
  *
- * @param[in]  *S         points to a naive bayes instance structure
- * @param[in]  *in        points to the elements of the input vector.
- * @param[in]  *pBuffer   points to a buffer of length numberOfClasses
+ * @param[in]   *S                      points to a naive bayes instance structure
+ * @param[in]   *in                     points to the elements of the input vector.
+ * @param[out]  *pOutputProbabilities   points to a buffer of length numberOfClasses containing estimated probabilities
+ * @param[out]  *pBufferB               points to a temporary buffer of length numberOfClasses
  * @return The predicted class
  *
- * @par If the number of classes is big, MVE version will consume lot of
- * stack since the log prior are computed on the stack.
  *
  */
 
@@ -56,19 +57,21 @@
 
 uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S, 
    const float32_t * in, 
-   float32_t *pBuffer)
+   float32_t *pOutputProbabilities,
+   float32_t *pBufferB
+   )
 {
     uint32_t         nbClass;
     const float32_t *pTheta = S->theta;
     const float32_t *pSigma = S->sigma;
-    float32_t      *buffer = pBuffer;
+    float32_t      *buffer = pOutputProbabilities;
     const float32_t *pIn = in;
     float32_t       result;
     f32x4_t         vsigma;
     float32_t       tmp;
     f32x4_t         vacc1, vacc2;
     uint32_t        index;
-    float32_t       logclassPriors[S->numberOfClasses];
+    float32_t       *logclassPriors=pBufferB;
     float32_t      *pLogPrior = logclassPriors;
 
     arm_vlog_f32((float32_t *) S->classPriors, logclassPriors, S->numberOfClasses);
@@ -131,7 +134,7 @@ uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_ins
         buffer++;
     }
 
-    arm_max_f32(pBuffer, S->numberOfClasses, &result, &index);
+    arm_max_f32(pOutputProbabilities, S->numberOfClasses, &result, &index);
 
     return (index);
 }
@@ -146,7 +149,8 @@ uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_ins
 
 uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S, 
    const float32_t * in, 
-   float32_t *pBuffer)
+   float32_t *pOutputProbabilities,
+   float32_t *pBufferB)
 {
     
     const float32_t *pPrior = S->classPriors;
@@ -157,7 +161,7 @@ uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_ins
     const float32_t *pTheta1 = S->theta + S->vectorDimension;
     const float32_t *pSigma1 = S->sigma + S->vectorDimension;
 
-    float32_t *buffer = pBuffer;
+    float32_t *buffer = pOutputProbabilities;
     const float32_t *pIn=in;
 
     float32_t result;
@@ -172,6 +176,7 @@ uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_ins
     float32x2_t tmpV2;
     float32x4_t thetaV,thetaV1;
     float32x4_t inV;
+    (void)pBufferB;
 
     epsilonV = vdupq_n_f32(S->epsilon);
 
@@ -320,32 +325,24 @@ uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_ins
         classBlkCnt--;
     }
 
-    arm_max_f32(pBuffer,S->numberOfClasses,&result,&index);
+    arm_max_f32(pOutputProbabilities,S->numberOfClasses,&result,&index);
 
     return(index);
 }
 
 #else
 
-/**
- * @brief Naive Gaussian Bayesian Estimator
- *
- * @param[in]  *S         points to a naive bayes instance structure
- * @param[in]  *in        points to the elements of the input vector.
- * @param[in]  *pBuffer   points to a buffer of length numberOfClasses
- * @return The predicted class
- *
- */
 uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S, 
    const float32_t * in, 
-   float32_t *pBuffer)
+   float32_t *pOutputProbabilities,
+   float32_t *pBufferB)
 {
     uint32_t nbClass;
     uint32_t nbDim;
     const float32_t *pPrior = S->classPriors;
     const float32_t *pTheta = S->theta;
     const float32_t *pSigma = S->sigma;
-    float32_t *buffer = pBuffer;
+    float32_t *buffer = pOutputProbabilities;
     const float32_t *pIn=in;
     float32_t result;
     float32_t sigma;
@@ -353,6 +350,8 @@ uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_ins
     float32_t acc1,acc2;
     uint32_t index;
 
+    (void)pBufferB;
+
     pTheta=S->theta;
     pSigma=S->sigma;
 
@@ -384,7 +383,7 @@ uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_ins
         buffer++;
     }
 
-    arm_max_f32(pBuffer,S->numberOfClasses,&result,&index);
+    arm_max_f32(pOutputProbabilities,S->numberOfClasses,&result,&index);
 
     return(index);
 }
diff --git a/CMSIS/DSP/Source/CommonTables/arm_common_tables.c b/CMSIS/DSP/Source/CommonTables/arm_common_tables.c
index 140e26358b3dd50dde5ed5c41603688f531a9cbe..54ce0e0fb667c2c23540f46dea5b612a45bbc172 100644
--- a/CMSIS/DSP/Source/CommonTables/arm_common_tables.c
+++ b/CMSIS/DSP/Source/CommonTables/arm_common_tables.c
@@ -3,13 +3,13 @@
  * Title:        arm_common_tables.c
  * Description:  common tables like fft twiddle factors, Bitreverse, reciprocal etc
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "arm_math_types.h"
 #include "arm_common_tables.h"
 
 /**
@@ -8538,10 +8538,6 @@ const uint64_t twiddleCoefF64_4096[8192] = {
 
 #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
 
-/**
-  @brief  Floating-point Twiddle factors Table Generation
-*/
-
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_16)
 /**
   @par
@@ -70387,7 +70383,7 @@ const q15_t sinTable_q15[FAST_MATH_TABLE_SIZE + 1] = {
 };
 #endif /* defined(ARM_ALL_FAST_TABLES) */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
      #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
 const q31_t sqrtTable_Q31[256] = {
     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
@@ -70539,7 +70535,7 @@ const float32_t __logf_lut_f32[8] = {
 
 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
-#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) 
+#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))  && !defined(ARM_MATH_AUTOVECTORIZE)
 
 /* haming weight LUT for bytes */
 #define B2(n) n, n + 1, n + 1, n + 2
diff --git a/CMSIS/DSP/Source/CommonTables/arm_common_tables_f16.c b/CMSIS/DSP/Source/CommonTables/arm_common_tables_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..ef14c84b9f0a8decd74bb3021344f7ba0a4a0622
--- /dev/null
+++ b/CMSIS/DSP/Source/CommonTables/arm_common_tables_f16.c
@@ -0,0 +1,12586 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_common_tables_f16.c
+ * Description:  common tables like fft twiddle factors, Bitreverse, reciprocal etc
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math_types_f16.h"
+/**
+  @ingroup ComplexFFT
+ */
+
+/**
+  @addtogroup CFFT_CIFFT Complex FFT Tables
+  @{
+ */
+
+
+/**
+  @brief  Floating-point Twiddle factors Table Generation
+*/
+
+/* F16 */
+#if !defined(__CC_ARM)
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include "arm_common_tables_f16.h"
+
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_16)
+
+/**
+* \par
+* Example code for Floating-point Twiddle factors Generation:
+* \par
+* <pre>for(i = 0; i < N; i++)
+* {
+* twiddleCoef[2*i]= cos(i * 2*PI/(float)N);
+* twiddleCoef[2*i+1]= sin(i * 2*PI/(float)N);
+* } </pre>
+* \par
+* where N = 16  and PI = 3.14159265358979
+* \par
+* Cos and Sin values are interleaved: N (cos, sin) pairs, 2*N values in total
+*
+*/
+const float16_t twiddleCoefF16_16[32] = {
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+   (float16_t)-0.382683432f,  (float16_t)0.923879533f,
+   (float16_t)-0.707106781f,  (float16_t)0.707106781f,
+   (float16_t)-0.923879533f,  (float16_t)0.382683432f,
+   (float16_t)-1.000000000f,  (float16_t)0.000000000f,
+   (float16_t)-0.923879533f, (float16_t)-0.382683432f,
+   (float16_t)-0.707106781f, (float16_t)-0.707106781f,
+   (float16_t)-0.382683432f, (float16_t)-0.923879533f,
+   (float16_t)-0.000000000f, (float16_t)-1.000000000f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f
+};
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_32)
+
+/**
+* \par
+* Example code for Floating-point Twiddle factors Generation:
+* \par
+* <pre>for(i = 0; i < N; i++)
+* {
+* twiddleCoef[2*i]= cos(i * 2*PI/(float)N);
+* twiddleCoef[2*i+1]= sin(i * 2*PI/(float)N);
+* } </pre>
+* \par
+* where N = 32  and PI = 3.14159265358979
+* \par
+* Cos and Sin values are interleaved: N (cos, sin) pairs, 2*N values in total
+*
+*/
+const float16_t twiddleCoefF16_32[64] = {
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+   (float16_t)-0.195090322f,  (float16_t)0.980785280f,
+   (float16_t)-0.382683432f,  (float16_t)0.923879533f,
+   (float16_t)-0.555570233f,  (float16_t)0.831469612f,
+   (float16_t)-0.707106781f,  (float16_t)0.707106781f,
+   (float16_t)-0.831469612f,  (float16_t)0.555570233f,
+   (float16_t)-0.923879533f,  (float16_t)0.382683432f,
+   (float16_t)-0.980785280f,  (float16_t)0.195090322f,
+   (float16_t)-1.000000000f,  (float16_t)0.000000000f,
+   (float16_t)-0.980785280f, (float16_t)-0.195090322f,
+   (float16_t)-0.923879533f, (float16_t)-0.382683432f,
+   (float16_t)-0.831469612f, (float16_t)-0.555570233f,
+   (float16_t)-0.707106781f, (float16_t)-0.707106781f,
+   (float16_t)-0.555570233f, (float16_t)-0.831469612f,
+   (float16_t)-0.382683432f, (float16_t)-0.923879533f,
+   (float16_t)-0.195090322f, (float16_t)-0.980785280f,
+   (float16_t)-0.000000000f, (float16_t)-1.000000000f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f
+};
+
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_64)
+
+/**
+* \par
+* Example code for Floating-point Twiddle factors Generation:
+* \par
+* <pre>for(i = 0; i < N; i++)
+* {
+* twiddleCoef[2*i]= cos(i * 2*PI/(float)N);
+* twiddleCoef[2*i+1]= sin(i * 2*PI/(float)N);
+* } </pre>
+* \par
+* where N = 64  and PI = 3.14159265358979
+* \par
+* Cos and Sin values are interleaved: N (cos, sin) pairs, 2*N values in total
+*
+*/
+const float16_t twiddleCoefF16_64[128] = {
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.995184727f,  (float16_t)0.098017140f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)0.956940336f,  (float16_t)0.290284677f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.881921264f,  (float16_t)0.471396737f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.773010453f,  (float16_t)0.634393284f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.634393284f,  (float16_t)0.773010453f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.471396737f,  (float16_t)0.881921264f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.290284677f,  (float16_t)0.956940336f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.098017140f,  (float16_t)0.995184727f,
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+   (float16_t)-0.098017140f,  (float16_t)0.995184727f,
+   (float16_t)-0.195090322f,  (float16_t)0.980785280f,
+   (float16_t)-0.290284677f,  (float16_t)0.956940336f,
+   (float16_t)-0.382683432f,  (float16_t)0.923879533f,
+   (float16_t)-0.471396737f,  (float16_t)0.881921264f,
+   (float16_t)-0.555570233f,  (float16_t)0.831469612f,
+   (float16_t)-0.634393284f,  (float16_t)0.773010453f,
+   (float16_t)-0.707106781f,  (float16_t)0.707106781f,
+   (float16_t)-0.773010453f,  (float16_t)0.634393284f,
+   (float16_t)-0.831469612f,  (float16_t)0.555570233f,
+   (float16_t)-0.881921264f,  (float16_t)0.471396737f,
+   (float16_t)-0.923879533f,  (float16_t)0.382683432f,
+   (float16_t)-0.956940336f,  (float16_t)0.290284677f,
+   (float16_t)-0.980785280f,  (float16_t)0.195090322f,
+   (float16_t)-0.995184727f,  (float16_t)0.098017140f,
+   (float16_t)-1.000000000f,  (float16_t)0.000000000f,
+   (float16_t)-0.995184727f, (float16_t)-0.098017140f,
+   (float16_t)-0.980785280f, (float16_t)-0.195090322f,
+   (float16_t)-0.956940336f, (float16_t)-0.290284677f,
+   (float16_t)-0.923879533f, (float16_t)-0.382683432f,
+   (float16_t)-0.881921264f, (float16_t)-0.471396737f,
+   (float16_t)-0.831469612f, (float16_t)-0.555570233f,
+   (float16_t)-0.773010453f, (float16_t)-0.634393284f,
+   (float16_t)-0.707106781f, (float16_t)-0.707106781f,
+   (float16_t)-0.634393284f, (float16_t)-0.773010453f,
+   (float16_t)-0.555570233f, (float16_t)-0.831469612f,
+   (float16_t)-0.471396737f, (float16_t)-0.881921264f,
+   (float16_t)-0.382683432f, (float16_t)-0.923879533f,
+   (float16_t)-0.290284677f, (float16_t)-0.956940336f,
+   (float16_t)-0.195090322f, (float16_t)-0.980785280f,
+   (float16_t)-0.098017140f, (float16_t)-0.995184727f,
+   (float16_t)-0.000000000f, (float16_t)-1.000000000f,
+    (float16_t)0.098017140f, (float16_t)-0.995184727f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f,
+    (float16_t)0.290284677f, (float16_t)-0.956940336f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.471396737f, (float16_t)-0.881921264f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.634393284f, (float16_t)-0.773010453f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.773010453f, (float16_t)-0.634393284f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.881921264f, (float16_t)-0.471396737f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.956940336f, (float16_t)-0.290284677f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f,
+    (float16_t)0.995184727f, (float16_t)-0.098017140f
+};
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_128)
+
+/**
+* \par
+* Example code for Floating-point Twiddle factors Generation:
+* \par
+* <pre>for(i = 0; i < N; i++)
+* {
+* twiddleCoef[2*i]= cos(i * 2*PI/(float)N);
+* twiddleCoef[2*i+1]= sin(i * 2*PI/(float)N);
+* } </pre>
+* \par
+* where N = 128 and PI = 3.14159265358979
+* \par
+* Cos and Sin values are in interleaved fashion
+*
+*/
+
+const float16_t twiddleCoefF16_128[256] = {
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.998795456f,  (float16_t)0.049067674f,
+    (float16_t)0.995184727f,  (float16_t)0.098017140f,
+    (float16_t)0.989176510f,  (float16_t)0.146730474f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)0.970031253f,  (float16_t)0.242980180f,
+    (float16_t)0.956940336f,  (float16_t)0.290284677f,
+    (float16_t)0.941544065f,  (float16_t)0.336889853f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.903989293f,  (float16_t)0.427555093f,
+    (float16_t)0.881921264f,  (float16_t)0.471396737f,
+    (float16_t)0.857728610f,  (float16_t)0.514102744f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.803207531f,  (float16_t)0.595699304f,
+    (float16_t)0.773010453f,  (float16_t)0.634393284f,
+    (float16_t)0.740951125f,  (float16_t)0.671558955f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.671558955f,  (float16_t)0.740951125f,
+    (float16_t)0.634393284f,  (float16_t)0.773010453f,
+    (float16_t)0.595699304f,  (float16_t)0.803207531f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.514102744f,  (float16_t)0.857728610f,
+    (float16_t)0.471396737f,  (float16_t)0.881921264f,
+    (float16_t)0.427555093f,  (float16_t)0.903989293f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.336889853f,  (float16_t)0.941544065f,
+    (float16_t)0.290284677f,  (float16_t)0.956940336f,
+    (float16_t)0.242980180f,  (float16_t)0.970031253f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.146730474f,  (float16_t)0.989176510f,
+    (float16_t)0.098017140f,  (float16_t)0.995184727f,
+    (float16_t)0.049067674f,  (float16_t)0.998795456f,
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+   (float16_t)-0.049067674f,  (float16_t)0.998795456f,
+   (float16_t)-0.098017140f,  (float16_t)0.995184727f,
+   (float16_t)-0.146730474f,  (float16_t)0.989176510f,
+   (float16_t)-0.195090322f,  (float16_t)0.980785280f,
+   (float16_t)-0.242980180f,  (float16_t)0.970031253f,
+   (float16_t)-0.290284677f,  (float16_t)0.956940336f,
+   (float16_t)-0.336889853f,  (float16_t)0.941544065f,
+   (float16_t)-0.382683432f,  (float16_t)0.923879533f,
+   (float16_t)-0.427555093f,  (float16_t)0.903989293f,
+   (float16_t)-0.471396737f,  (float16_t)0.881921264f,
+   (float16_t)-0.514102744f,  (float16_t)0.857728610f,
+   (float16_t)-0.555570233f,  (float16_t)0.831469612f,
+   (float16_t)-0.595699304f,  (float16_t)0.803207531f,
+   (float16_t)-0.634393284f,  (float16_t)0.773010453f,
+   (float16_t)-0.671558955f,  (float16_t)0.740951125f,
+   (float16_t)-0.707106781f,  (float16_t)0.707106781f,
+   (float16_t)-0.740951125f,  (float16_t)0.671558955f,
+   (float16_t)-0.773010453f,  (float16_t)0.634393284f,
+   (float16_t)-0.803207531f,  (float16_t)0.595699304f,
+   (float16_t)-0.831469612f,  (float16_t)0.555570233f,
+   (float16_t)-0.857728610f,  (float16_t)0.514102744f,
+   (float16_t)-0.881921264f,  (float16_t)0.471396737f,
+   (float16_t)-0.903989293f,  (float16_t)0.427555093f,
+   (float16_t)-0.923879533f,  (float16_t)0.382683432f,
+   (float16_t)-0.941544065f,  (float16_t)0.336889853f,
+   (float16_t)-0.956940336f,  (float16_t)0.290284677f,
+   (float16_t)-0.970031253f,  (float16_t)0.242980180f,
+   (float16_t)-0.980785280f,  (float16_t)0.195090322f,
+   (float16_t)-0.989176510f,  (float16_t)0.146730474f,
+   (float16_t)-0.995184727f,  (float16_t)0.098017140f,
+   (float16_t)-0.998795456f,  (float16_t)0.049067674f,
+   (float16_t)-1.000000000f,  (float16_t)0.000000000f,
+   (float16_t)-0.998795456f, (float16_t)-0.049067674f,
+   (float16_t)-0.995184727f, (float16_t)-0.098017140f,
+   (float16_t)-0.989176510f, (float16_t)-0.146730474f,
+   (float16_t)-0.980785280f, (float16_t)-0.195090322f,
+   (float16_t)-0.970031253f, (float16_t)-0.242980180f,
+   (float16_t)-0.956940336f, (float16_t)-0.290284677f,
+   (float16_t)-0.941544065f, (float16_t)-0.336889853f,
+   (float16_t)-0.923879533f, (float16_t)-0.382683432f,
+   (float16_t)-0.903989293f, (float16_t)-0.427555093f,
+   (float16_t)-0.881921264f, (float16_t)-0.471396737f,
+   (float16_t)-0.857728610f, (float16_t)-0.514102744f,
+   (float16_t)-0.831469612f, (float16_t)-0.555570233f,
+   (float16_t)-0.803207531f, (float16_t)-0.595699304f,
+   (float16_t)-0.773010453f, (float16_t)-0.634393284f,
+   (float16_t)-0.740951125f, (float16_t)-0.671558955f,
+   (float16_t)-0.707106781f, (float16_t)-0.707106781f,
+   (float16_t)-0.671558955f, (float16_t)-0.740951125f,
+   (float16_t)-0.634393284f, (float16_t)-0.773010453f,
+   (float16_t)-0.595699304f, (float16_t)-0.803207531f,
+   (float16_t)-0.555570233f, (float16_t)-0.831469612f,
+   (float16_t)-0.514102744f, (float16_t)-0.857728610f,
+   (float16_t)-0.471396737f, (float16_t)-0.881921264f,
+   (float16_t)-0.427555093f, (float16_t)-0.903989293f,
+   (float16_t)-0.382683432f, (float16_t)-0.923879533f,
+   (float16_t)-0.336889853f, (float16_t)-0.941544065f,
+   (float16_t)-0.290284677f, (float16_t)-0.956940336f,
+   (float16_t)-0.242980180f, (float16_t)-0.970031253f,
+   (float16_t)-0.195090322f, (float16_t)-0.980785280f,
+   (float16_t)-0.146730474f, (float16_t)-0.989176510f,
+   (float16_t)-0.098017140f, (float16_t)-0.995184727f,
+   (float16_t)-0.049067674f, (float16_t)-0.998795456f,
+   (float16_t)-0.000000000f, (float16_t)-1.000000000f,
+    (float16_t)0.049067674f, (float16_t)-0.998795456f,
+    (float16_t)0.098017140f, (float16_t)-0.995184727f,
+    (float16_t)0.146730474f, (float16_t)-0.989176510f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f,
+    (float16_t)0.242980180f, (float16_t)-0.970031253f,
+    (float16_t)0.290284677f, (float16_t)-0.956940336f,
+    (float16_t)0.336889853f, (float16_t)-0.941544065f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.427555093f, (float16_t)-0.903989293f,
+    (float16_t)0.471396737f, (float16_t)-0.881921264f,
+    (float16_t)0.514102744f, (float16_t)-0.857728610f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.595699304f, (float16_t)-0.803207531f,
+    (float16_t)0.634393284f, (float16_t)-0.773010453f,
+    (float16_t)0.671558955f, (float16_t)-0.740951125f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.740951125f, (float16_t)-0.671558955f,
+    (float16_t)0.773010453f, (float16_t)-0.634393284f,
+    (float16_t)0.803207531f, (float16_t)-0.595699304f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.857728610f, (float16_t)-0.514102744f,
+    (float16_t)0.881921264f, (float16_t)-0.471396737f,
+    (float16_t)0.903989293f, (float16_t)-0.427555093f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.941544065f, (float16_t)-0.336889853f,
+    (float16_t)0.956940336f, (float16_t)-0.290284677f,
+    (float16_t)0.970031253f, (float16_t)-0.242980180f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f,
+    (float16_t)0.989176510f, (float16_t)-0.146730474f,
+    (float16_t)0.995184727f, (float16_t)-0.098017140f,
+    (float16_t)0.998795456f, (float16_t)-0.049067674f
+};
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_256)
+
+/**
+* \par
+* Example code for Floating-point Twiddle factors Generation:
+* \par
+* <pre>for(i = 0; i < N; i++)
+* {
+* twiddleCoef[2*i]= cos(i * 2*PI/(float)N);
+* twiddleCoef[2*i+1]= sin(i * 2*PI/(float)N);
+* } </pre>
+* \par
+* where N = 256 and PI = 3.14159265358979
+* \par
+* Cos and Sin values are in interleaved fashion
+*
+*/
+const float16_t twiddleCoefF16_256[512] = {
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.999698819f,  (float16_t)0.024541229f,
+    (float16_t)0.998795456f,  (float16_t)0.049067674f,
+    (float16_t)0.997290457f,  (float16_t)0.073564564f,
+    (float16_t)0.995184727f,  (float16_t)0.098017140f,
+    (float16_t)0.992479535f,  (float16_t)0.122410675f,
+    (float16_t)0.989176510f,  (float16_t)0.146730474f,
+    (float16_t)0.985277642f,  (float16_t)0.170961889f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)0.975702130f,  (float16_t)0.219101240f,
+    (float16_t)0.970031253f,  (float16_t)0.242980180f,
+    (float16_t)0.963776066f,  (float16_t)0.266712757f,
+    (float16_t)0.956940336f,  (float16_t)0.290284677f,
+    (float16_t)0.949528181f,  (float16_t)0.313681740f,
+    (float16_t)0.941544065f,  (float16_t)0.336889853f,
+    (float16_t)0.932992799f,  (float16_t)0.359895037f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.914209756f,  (float16_t)0.405241314f,
+    (float16_t)0.903989293f,  (float16_t)0.427555093f,
+    (float16_t)0.893224301f,  (float16_t)0.449611330f,
+    (float16_t)0.881921264f,  (float16_t)0.471396737f,
+    (float16_t)0.870086991f,  (float16_t)0.492898192f,
+    (float16_t)0.857728610f,  (float16_t)0.514102744f,
+    (float16_t)0.844853565f,  (float16_t)0.534997620f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.817584813f,  (float16_t)0.575808191f,
+    (float16_t)0.803207531f,  (float16_t)0.595699304f,
+    (float16_t)0.788346428f,  (float16_t)0.615231591f,
+    (float16_t)0.773010453f,  (float16_t)0.634393284f,
+    (float16_t)0.757208847f,  (float16_t)0.653172843f,
+    (float16_t)0.740951125f,  (float16_t)0.671558955f,
+    (float16_t)0.724247083f,  (float16_t)0.689540545f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.689540545f,  (float16_t)0.724247083f,
+    (float16_t)0.671558955f,  (float16_t)0.740951125f,
+    (float16_t)0.653172843f,  (float16_t)0.757208847f,
+    (float16_t)0.634393284f,  (float16_t)0.773010453f,
+    (float16_t)0.615231591f,  (float16_t)0.788346428f,
+    (float16_t)0.595699304f,  (float16_t)0.803207531f,
+    (float16_t)0.575808191f,  (float16_t)0.817584813f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.534997620f,  (float16_t)0.844853565f,
+    (float16_t)0.514102744f,  (float16_t)0.857728610f,
+    (float16_t)0.492898192f,  (float16_t)0.870086991f,
+    (float16_t)0.471396737f,  (float16_t)0.881921264f,
+    (float16_t)0.449611330f,  (float16_t)0.893224301f,
+    (float16_t)0.427555093f,  (float16_t)0.903989293f,
+    (float16_t)0.405241314f,  (float16_t)0.914209756f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.359895037f,  (float16_t)0.932992799f,
+    (float16_t)0.336889853f,  (float16_t)0.941544065f,
+    (float16_t)0.313681740f,  (float16_t)0.949528181f,
+    (float16_t)0.290284677f,  (float16_t)0.956940336f,
+    (float16_t)0.266712757f,  (float16_t)0.963776066f,
+    (float16_t)0.242980180f,  (float16_t)0.970031253f,
+    (float16_t)0.219101240f,  (float16_t)0.975702130f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.170961889f,  (float16_t)0.985277642f,
+    (float16_t)0.146730474f,  (float16_t)0.989176510f,
+    (float16_t)0.122410675f,  (float16_t)0.992479535f,
+    (float16_t)0.098017140f,  (float16_t)0.995184727f,
+    (float16_t)0.073564564f,  (float16_t)0.997290457f,
+    (float16_t)0.049067674f,  (float16_t)0.998795456f,
+    (float16_t)0.024541229f,  (float16_t)0.999698819f,
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+   (float16_t)-0.024541229f,  (float16_t)0.999698819f,
+   (float16_t)-0.049067674f,  (float16_t)0.998795456f,
+   (float16_t)-0.073564564f,  (float16_t)0.997290457f,
+   (float16_t)-0.098017140f,  (float16_t)0.995184727f,
+   (float16_t)-0.122410675f,  (float16_t)0.992479535f,
+   (float16_t)-0.146730474f,  (float16_t)0.989176510f,
+   (float16_t)-0.170961889f,  (float16_t)0.985277642f,
+   (float16_t)-0.195090322f,  (float16_t)0.980785280f,
+   (float16_t)-0.219101240f,  (float16_t)0.975702130f,
+   (float16_t)-0.242980180f,  (float16_t)0.970031253f,
+   (float16_t)-0.266712757f,  (float16_t)0.963776066f,
+   (float16_t)-0.290284677f,  (float16_t)0.956940336f,
+   (float16_t)-0.313681740f,  (float16_t)0.949528181f,
+   (float16_t)-0.336889853f,  (float16_t)0.941544065f,
+   (float16_t)-0.359895037f,  (float16_t)0.932992799f,
+   (float16_t)-0.382683432f,  (float16_t)0.923879533f,
+   (float16_t)-0.405241314f,  (float16_t)0.914209756f,
+   (float16_t)-0.427555093f,  (float16_t)0.903989293f,
+   (float16_t)-0.449611330f,  (float16_t)0.893224301f,
+   (float16_t)-0.471396737f,  (float16_t)0.881921264f,
+   (float16_t)-0.492898192f,  (float16_t)0.870086991f,
+   (float16_t)-0.514102744f,  (float16_t)0.857728610f,
+   (float16_t)-0.534997620f,  (float16_t)0.844853565f,
+   (float16_t)-0.555570233f,  (float16_t)0.831469612f,
+   (float16_t)-0.575808191f,  (float16_t)0.817584813f,
+   (float16_t)-0.595699304f,  (float16_t)0.803207531f,
+   (float16_t)-0.615231591f,  (float16_t)0.788346428f,
+   (float16_t)-0.634393284f,  (float16_t)0.773010453f,
+   (float16_t)-0.653172843f,  (float16_t)0.757208847f,
+   (float16_t)-0.671558955f,  (float16_t)0.740951125f,
+   (float16_t)-0.689540545f,  (float16_t)0.724247083f,
+   (float16_t)-0.707106781f,  (float16_t)0.707106781f,
+   (float16_t)-0.724247083f,  (float16_t)0.689540545f,
+   (float16_t)-0.740951125f,  (float16_t)0.671558955f,
+   (float16_t)-0.757208847f,  (float16_t)0.653172843f,
+   (float16_t)-0.773010453f,  (float16_t)0.634393284f,
+   (float16_t)-0.788346428f,  (float16_t)0.615231591f,
+   (float16_t)-0.803207531f,  (float16_t)0.595699304f,
+   (float16_t)-0.817584813f,  (float16_t)0.575808191f,
+   (float16_t)-0.831469612f,  (float16_t)0.555570233f,
+   (float16_t)-0.844853565f,  (float16_t)0.534997620f,
+   (float16_t)-0.857728610f,  (float16_t)0.514102744f,
+   (float16_t)-0.870086991f,  (float16_t)0.492898192f,
+   (float16_t)-0.881921264f,  (float16_t)0.471396737f,
+   (float16_t)-0.893224301f,  (float16_t)0.449611330f,
+   (float16_t)-0.903989293f,  (float16_t)0.427555093f,
+   (float16_t)-0.914209756f,  (float16_t)0.405241314f,
+   (float16_t)-0.923879533f,  (float16_t)0.382683432f,
+   (float16_t)-0.932992799f,  (float16_t)0.359895037f,
+   (float16_t)-0.941544065f,  (float16_t)0.336889853f,
+   (float16_t)-0.949528181f,  (float16_t)0.313681740f,
+   (float16_t)-0.956940336f,  (float16_t)0.290284677f,
+   (float16_t)-0.963776066f,  (float16_t)0.266712757f,
+   (float16_t)-0.970031253f,  (float16_t)0.242980180f,
+   (float16_t)-0.975702130f,  (float16_t)0.219101240f,
+   (float16_t)-0.980785280f,  (float16_t)0.195090322f,
+   (float16_t)-0.985277642f,  (float16_t)0.170961889f,
+   (float16_t)-0.989176510f,  (float16_t)0.146730474f,
+   (float16_t)-0.992479535f,  (float16_t)0.122410675f,
+   (float16_t)-0.995184727f,  (float16_t)0.098017140f,
+   (float16_t)-0.997290457f,  (float16_t)0.073564564f,
+   (float16_t)-0.998795456f,  (float16_t)0.049067674f,
+   (float16_t)-0.999698819f,  (float16_t)0.024541229f,
+   (float16_t)-1.000000000f,  (float16_t)0.000000000f,
+   (float16_t)-0.999698819f, (float16_t)-0.024541229f,
+   (float16_t)-0.998795456f, (float16_t)-0.049067674f,
+   (float16_t)-0.997290457f, (float16_t)-0.073564564f,
+   (float16_t)-0.995184727f, (float16_t)-0.098017140f,
+   (float16_t)-0.992479535f, (float16_t)-0.122410675f,
+   (float16_t)-0.989176510f, (float16_t)-0.146730474f,
+   (float16_t)-0.985277642f, (float16_t)-0.170961889f,
+   (float16_t)-0.980785280f, (float16_t)-0.195090322f,
+   (float16_t)-0.975702130f, (float16_t)-0.219101240f,
+   (float16_t)-0.970031253f, (float16_t)-0.242980180f,
+   (float16_t)-0.963776066f, (float16_t)-0.266712757f,
+   (float16_t)-0.956940336f, (float16_t)-0.290284677f,
+   (float16_t)-0.949528181f, (float16_t)-0.313681740f,
+   (float16_t)-0.941544065f, (float16_t)-0.336889853f,
+   (float16_t)-0.932992799f, (float16_t)-0.359895037f,
+   (float16_t)-0.923879533f, (float16_t)-0.382683432f,
+   (float16_t)-0.914209756f, (float16_t)-0.405241314f,
+   (float16_t)-0.903989293f, (float16_t)-0.427555093f,
+   (float16_t)-0.893224301f, (float16_t)-0.449611330f,
+   (float16_t)-0.881921264f, (float16_t)-0.471396737f,
+   (float16_t)-0.870086991f, (float16_t)-0.492898192f,
+   (float16_t)-0.857728610f, (float16_t)-0.514102744f,
+   (float16_t)-0.844853565f, (float16_t)-0.534997620f,
+   (float16_t)-0.831469612f, (float16_t)-0.555570233f,
+   (float16_t)-0.817584813f, (float16_t)-0.575808191f,
+   (float16_t)-0.803207531f, (float16_t)-0.595699304f,
+   (float16_t)-0.788346428f, (float16_t)-0.615231591f,
+   (float16_t)-0.773010453f, (float16_t)-0.634393284f,
+   (float16_t)-0.757208847f, (float16_t)-0.653172843f,
+   (float16_t)-0.740951125f, (float16_t)-0.671558955f,
+   (float16_t)-0.724247083f, (float16_t)-0.689540545f,
+   (float16_t)-0.707106781f, (float16_t)-0.707106781f,
+   (float16_t)-0.689540545f, (float16_t)-0.724247083f,
+   (float16_t)-0.671558955f, (float16_t)-0.740951125f,
+   (float16_t)-0.653172843f, (float16_t)-0.757208847f,
+   (float16_t)-0.634393284f, (float16_t)-0.773010453f,
+   (float16_t)-0.615231591f, (float16_t)-0.788346428f,
+   (float16_t)-0.595699304f, (float16_t)-0.803207531f,
+   (float16_t)-0.575808191f, (float16_t)-0.817584813f,
+   (float16_t)-0.555570233f, (float16_t)-0.831469612f,
+   (float16_t)-0.534997620f, (float16_t)-0.844853565f,
+   (float16_t)-0.514102744f, (float16_t)-0.857728610f,
+   (float16_t)-0.492898192f, (float16_t)-0.870086991f,
+   (float16_t)-0.471396737f, (float16_t)-0.881921264f,
+   (float16_t)-0.449611330f, (float16_t)-0.893224301f,
+   (float16_t)-0.427555093f, (float16_t)-0.903989293f,
+   (float16_t)-0.405241314f, (float16_t)-0.914209756f,
+   (float16_t)-0.382683432f, (float16_t)-0.923879533f,
+   (float16_t)-0.359895037f, (float16_t)-0.932992799f,
+   (float16_t)-0.336889853f, (float16_t)-0.941544065f,
+   (float16_t)-0.313681740f, (float16_t)-0.949528181f,
+   (float16_t)-0.290284677f, (float16_t)-0.956940336f,
+   (float16_t)-0.266712757f, (float16_t)-0.963776066f,
+   (float16_t)-0.242980180f, (float16_t)-0.970031253f,
+   (float16_t)-0.219101240f, (float16_t)-0.975702130f,
+   (float16_t)-0.195090322f, (float16_t)-0.980785280f,
+   (float16_t)-0.170961889f, (float16_t)-0.985277642f,
+   (float16_t)-0.146730474f, (float16_t)-0.989176510f,
+   (float16_t)-0.122410675f, (float16_t)-0.992479535f,
+   (float16_t)-0.098017140f, (float16_t)-0.995184727f,
+   (float16_t)-0.073564564f, (float16_t)-0.997290457f,
+   (float16_t)-0.049067674f, (float16_t)-0.998795456f,
+   (float16_t)-0.024541229f, (float16_t)-0.999698819f,
+   (float16_t)-0.000000000f, (float16_t)-1.000000000f,
+    (float16_t)0.024541229f, (float16_t)-0.999698819f,
+    (float16_t)0.049067674f, (float16_t)-0.998795456f,
+    (float16_t)0.073564564f, (float16_t)-0.997290457f,
+    (float16_t)0.098017140f, (float16_t)-0.995184727f,
+    (float16_t)0.122410675f, (float16_t)-0.992479535f,
+    (float16_t)0.146730474f, (float16_t)-0.989176510f,
+    (float16_t)0.170961889f, (float16_t)-0.985277642f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f,
+    (float16_t)0.219101240f, (float16_t)-0.975702130f,
+    (float16_t)0.242980180f, (float16_t)-0.970031253f,
+    (float16_t)0.266712757f, (float16_t)-0.963776066f,
+    (float16_t)0.290284677f, (float16_t)-0.956940336f,
+    (float16_t)0.313681740f, (float16_t)-0.949528181f,
+    (float16_t)0.336889853f, (float16_t)-0.941544065f,
+    (float16_t)0.359895037f, (float16_t)-0.932992799f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.405241314f, (float16_t)-0.914209756f,
+    (float16_t)0.427555093f, (float16_t)-0.903989293f,
+    (float16_t)0.449611330f, (float16_t)-0.893224301f,
+    (float16_t)0.471396737f, (float16_t)-0.881921264f,
+    (float16_t)0.492898192f, (float16_t)-0.870086991f,
+    (float16_t)0.514102744f, (float16_t)-0.857728610f,
+    (float16_t)0.534997620f, (float16_t)-0.844853565f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.575808191f, (float16_t)-0.817584813f,
+    (float16_t)0.595699304f, (float16_t)-0.803207531f,
+    (float16_t)0.615231591f, (float16_t)-0.788346428f,
+    (float16_t)0.634393284f, (float16_t)-0.773010453f,
+    (float16_t)0.653172843f, (float16_t)-0.757208847f,
+    (float16_t)0.671558955f, (float16_t)-0.740951125f,
+    (float16_t)0.689540545f, (float16_t)-0.724247083f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.724247083f, (float16_t)-0.689540545f,
+    (float16_t)0.740951125f, (float16_t)-0.671558955f,
+    (float16_t)0.757208847f, (float16_t)-0.653172843f,
+    (float16_t)0.773010453f, (float16_t)-0.634393284f,
+    (float16_t)0.788346428f, (float16_t)-0.615231591f,
+    (float16_t)0.803207531f, (float16_t)-0.595699304f,
+    (float16_t)0.817584813f, (float16_t)-0.575808191f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.844853565f, (float16_t)-0.534997620f,
+    (float16_t)0.857728610f, (float16_t)-0.514102744f,
+    (float16_t)0.870086991f, (float16_t)-0.492898192f,
+    (float16_t)0.881921264f, (float16_t)-0.471396737f,
+    (float16_t)0.893224301f, (float16_t)-0.449611330f,
+    (float16_t)0.903989293f, (float16_t)-0.427555093f,
+    (float16_t)0.914209756f, (float16_t)-0.405241314f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.932992799f, (float16_t)-0.359895037f,
+    (float16_t)0.941544065f, (float16_t)-0.336889853f,
+    (float16_t)0.949528181f, (float16_t)-0.313681740f,
+    (float16_t)0.956940336f, (float16_t)-0.290284677f,
+    (float16_t)0.963776066f, (float16_t)-0.266712757f,
+    (float16_t)0.970031253f, (float16_t)-0.242980180f,
+    (float16_t)0.975702130f, (float16_t)-0.219101240f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f,
+    (float16_t)0.985277642f, (float16_t)-0.170961889f,
+    (float16_t)0.989176510f, (float16_t)-0.146730474f,
+    (float16_t)0.992479535f, (float16_t)-0.122410675f,
+    (float16_t)0.995184727f, (float16_t)-0.098017140f,
+    (float16_t)0.997290457f, (float16_t)-0.073564564f,
+    (float16_t)0.998795456f, (float16_t)-0.049067674f,
+    (float16_t)0.999698819f, (float16_t)-0.024541229f
+};
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_512)
+
+/**
+* \par
+* Example code for Floating-point Twiddle factors Generation:
+* \par
+* <pre>for(i = 0; i < N; i++)
+* {
+* twiddleCoef[2*i]= cos(i * 2*PI/(float)N);
+* twiddleCoef[2*i+1]= sin(i * 2*PI/(float)N);
+* } </pre>
+* \par
+* where N = 512 and PI = 3.14159265358979
+* \par
+* Cos and Sin values are in interleaved fashion
+*
+*/
+const float16_t twiddleCoefF16_512[1024] = {
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.999924702f,  (float16_t)0.012271538f,
+    (float16_t)0.999698819f,  (float16_t)0.024541229f,
+    (float16_t)0.999322385f,  (float16_t)0.036807223f,
+    (float16_t)0.998795456f,  (float16_t)0.049067674f,
+    (float16_t)0.998118113f,  (float16_t)0.061320736f,
+    (float16_t)0.997290457f,  (float16_t)0.073564564f,
+    (float16_t)0.996312612f,  (float16_t)0.085797312f,
+    (float16_t)0.995184727f,  (float16_t)0.098017140f,
+    (float16_t)0.993906970f,  (float16_t)0.110222207f,
+    (float16_t)0.992479535f,  (float16_t)0.122410675f,
+    (float16_t)0.990902635f,  (float16_t)0.134580709f,
+    (float16_t)0.989176510f,  (float16_t)0.146730474f,
+    (float16_t)0.987301418f,  (float16_t)0.158858143f,
+    (float16_t)0.985277642f,  (float16_t)0.170961889f,
+    (float16_t)0.983105487f,  (float16_t)0.183039888f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)0.978317371f,  (float16_t)0.207111376f,
+    (float16_t)0.975702130f,  (float16_t)0.219101240f,
+    (float16_t)0.972939952f,  (float16_t)0.231058108f,
+    (float16_t)0.970031253f,  (float16_t)0.242980180f,
+    (float16_t)0.966976471f,  (float16_t)0.254865660f,
+    (float16_t)0.963776066f,  (float16_t)0.266712757f,
+    (float16_t)0.960430519f,  (float16_t)0.278519689f,
+    (float16_t)0.956940336f,  (float16_t)0.290284677f,
+    (float16_t)0.953306040f,  (float16_t)0.302005949f,
+    (float16_t)0.949528181f,  (float16_t)0.313681740f,
+    (float16_t)0.945607325f,  (float16_t)0.325310292f,
+    (float16_t)0.941544065f,  (float16_t)0.336889853f,
+    (float16_t)0.937339012f,  (float16_t)0.348418680f,
+    (float16_t)0.932992799f,  (float16_t)0.359895037f,
+    (float16_t)0.928506080f,  (float16_t)0.371317194f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.919113852f,  (float16_t)0.393992040f,
+    (float16_t)0.914209756f,  (float16_t)0.405241314f,
+    (float16_t)0.909167983f,  (float16_t)0.416429560f,
+    (float16_t)0.903989293f,  (float16_t)0.427555093f,
+    (float16_t)0.898674466f,  (float16_t)0.438616239f,
+    (float16_t)0.893224301f,  (float16_t)0.449611330f,
+    (float16_t)0.887639620f,  (float16_t)0.460538711f,
+    (float16_t)0.881921264f,  (float16_t)0.471396737f,
+    (float16_t)0.876070094f,  (float16_t)0.482183772f,
+    (float16_t)0.870086991f,  (float16_t)0.492898192f,
+    (float16_t)0.863972856f,  (float16_t)0.503538384f,
+    (float16_t)0.857728610f,  (float16_t)0.514102744f,
+    (float16_t)0.851355193f,  (float16_t)0.524589683f,
+    (float16_t)0.844853565f,  (float16_t)0.534997620f,
+    (float16_t)0.838224706f,  (float16_t)0.545324988f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.824589303f,  (float16_t)0.565731811f,
+    (float16_t)0.817584813f,  (float16_t)0.575808191f,
+    (float16_t)0.810457198f,  (float16_t)0.585797857f,
+    (float16_t)0.803207531f,  (float16_t)0.595699304f,
+    (float16_t)0.795836905f,  (float16_t)0.605511041f,
+    (float16_t)0.788346428f,  (float16_t)0.615231591f,
+    (float16_t)0.780737229f,  (float16_t)0.624859488f,
+    (float16_t)0.773010453f,  (float16_t)0.634393284f,
+    (float16_t)0.765167266f,  (float16_t)0.643831543f,
+    (float16_t)0.757208847f,  (float16_t)0.653172843f,
+    (float16_t)0.749136395f,  (float16_t)0.662415778f,
+    (float16_t)0.740951125f,  (float16_t)0.671558955f,
+    (float16_t)0.732654272f,  (float16_t)0.680600998f,
+    (float16_t)0.724247083f,  (float16_t)0.689540545f,
+    (float16_t)0.715730825f,  (float16_t)0.698376249f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.698376249f,  (float16_t)0.715730825f,
+    (float16_t)0.689540545f,  (float16_t)0.724247083f,
+    (float16_t)0.680600998f,  (float16_t)0.732654272f,
+    (float16_t)0.671558955f,  (float16_t)0.740951125f,
+    (float16_t)0.662415778f,  (float16_t)0.749136395f,
+    (float16_t)0.653172843f,  (float16_t)0.757208847f,
+    (float16_t)0.643831543f,  (float16_t)0.765167266f,
+    (float16_t)0.634393284f,  (float16_t)0.773010453f,
+    (float16_t)0.624859488f,  (float16_t)0.780737229f,
+    (float16_t)0.615231591f,  (float16_t)0.788346428f,
+    (float16_t)0.605511041f,  (float16_t)0.795836905f,
+    (float16_t)0.595699304f,  (float16_t)0.803207531f,
+    (float16_t)0.585797857f,  (float16_t)0.810457198f,
+    (float16_t)0.575808191f,  (float16_t)0.817584813f,
+    (float16_t)0.565731811f,  (float16_t)0.824589303f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.545324988f,  (float16_t)0.838224706f,
+    (float16_t)0.534997620f,  (float16_t)0.844853565f,
+    (float16_t)0.524589683f,  (float16_t)0.851355193f,
+    (float16_t)0.514102744f,  (float16_t)0.857728610f,
+    (float16_t)0.503538384f,  (float16_t)0.863972856f,
+    (float16_t)0.492898192f,  (float16_t)0.870086991f,
+    (float16_t)0.482183772f,  (float16_t)0.876070094f,
+    (float16_t)0.471396737f,  (float16_t)0.881921264f,
+    (float16_t)0.460538711f,  (float16_t)0.887639620f,
+    (float16_t)0.449611330f,  (float16_t)0.893224301f,
+    (float16_t)0.438616239f,  (float16_t)0.898674466f,
+    (float16_t)0.427555093f,  (float16_t)0.903989293f,
+    (float16_t)0.416429560f,  (float16_t)0.909167983f,
+    (float16_t)0.405241314f,  (float16_t)0.914209756f,
+    (float16_t)0.393992040f,  (float16_t)0.919113852f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.371317194f,  (float16_t)0.928506080f,
+    (float16_t)0.359895037f,  (float16_t)0.932992799f,
+    (float16_t)0.348418680f,  (float16_t)0.937339012f,
+    (float16_t)0.336889853f,  (float16_t)0.941544065f,
+    (float16_t)0.325310292f,  (float16_t)0.945607325f,
+    (float16_t)0.313681740f,  (float16_t)0.949528181f,
+    (float16_t)0.302005949f,  (float16_t)0.953306040f,
+    (float16_t)0.290284677f,  (float16_t)0.956940336f,
+    (float16_t)0.278519689f,  (float16_t)0.960430519f,
+    (float16_t)0.266712757f,  (float16_t)0.963776066f,
+    (float16_t)0.254865660f,  (float16_t)0.966976471f,
+    (float16_t)0.242980180f,  (float16_t)0.970031253f,
+    (float16_t)0.231058108f,  (float16_t)0.972939952f,
+    (float16_t)0.219101240f,  (float16_t)0.975702130f,
+    (float16_t)0.207111376f,  (float16_t)0.978317371f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.183039888f,  (float16_t)0.983105487f,
+    (float16_t)0.170961889f,  (float16_t)0.985277642f,
+    (float16_t)0.158858143f,  (float16_t)0.987301418f,
+    (float16_t)0.146730474f,  (float16_t)0.989176510f,
+    (float16_t)0.134580709f,  (float16_t)0.990902635f,
+    (float16_t)0.122410675f,  (float16_t)0.992479535f,
+    (float16_t)0.110222207f,  (float16_t)0.993906970f,
+    (float16_t)0.098017140f,  (float16_t)0.995184727f,
+    (float16_t)0.085797312f,  (float16_t)0.996312612f,
+    (float16_t)0.073564564f,  (float16_t)0.997290457f,
+    (float16_t)0.061320736f,  (float16_t)0.998118113f,
+    (float16_t)0.049067674f,  (float16_t)0.998795456f,
+    (float16_t)0.036807223f,  (float16_t)0.999322385f,
+    (float16_t)0.024541229f,  (float16_t)0.999698819f,
+    (float16_t)0.012271538f,  (float16_t)0.999924702f,
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+   (float16_t)-0.012271538f,  (float16_t)0.999924702f,
+   (float16_t)-0.024541229f,  (float16_t)0.999698819f,
+   (float16_t)-0.036807223f,  (float16_t)0.999322385f,
+   (float16_t)-0.049067674f,  (float16_t)0.998795456f,
+   (float16_t)-0.061320736f,  (float16_t)0.998118113f,
+   (float16_t)-0.073564564f,  (float16_t)0.997290457f,
+   (float16_t)-0.085797312f,  (float16_t)0.996312612f,
+   (float16_t)-0.098017140f,  (float16_t)0.995184727f,
+   (float16_t)-0.110222207f,  (float16_t)0.993906970f,
+   (float16_t)-0.122410675f,  (float16_t)0.992479535f,
+   (float16_t)-0.134580709f,  (float16_t)0.990902635f,
+   (float16_t)-0.146730474f,  (float16_t)0.989176510f,
+   (float16_t)-0.158858143f,  (float16_t)0.987301418f,
+   (float16_t)-0.170961889f,  (float16_t)0.985277642f,
+   (float16_t)-0.183039888f,  (float16_t)0.983105487f,
+   (float16_t)-0.195090322f,  (float16_t)0.980785280f,
+   (float16_t)-0.207111376f,  (float16_t)0.978317371f,
+   (float16_t)-0.219101240f,  (float16_t)0.975702130f,
+   (float16_t)-0.231058108f,  (float16_t)0.972939952f,
+   (float16_t)-0.242980180f,  (float16_t)0.970031253f,
+   (float16_t)-0.254865660f,  (float16_t)0.966976471f,
+   (float16_t)-0.266712757f,  (float16_t)0.963776066f,
+   (float16_t)-0.278519689f,  (float16_t)0.960430519f,
+   (float16_t)-0.290284677f,  (float16_t)0.956940336f,
+   (float16_t)-0.302005949f,  (float16_t)0.953306040f,
+   (float16_t)-0.313681740f,  (float16_t)0.949528181f,
+   (float16_t)-0.325310292f,  (float16_t)0.945607325f,
+   (float16_t)-0.336889853f,  (float16_t)0.941544065f,
+   (float16_t)-0.348418680f,  (float16_t)0.937339012f,
+   (float16_t)-0.359895037f,  (float16_t)0.932992799f,
+   (float16_t)-0.371317194f,  (float16_t)0.928506080f,
+   (float16_t)-0.382683432f,  (float16_t)0.923879533f,
+   (float16_t)-0.393992040f,  (float16_t)0.919113852f,
+   (float16_t)-0.405241314f,  (float16_t)0.914209756f,
+   (float16_t)-0.416429560f,  (float16_t)0.909167983f,
+   (float16_t)-0.427555093f,  (float16_t)0.903989293f,
+   (float16_t)-0.438616239f,  (float16_t)0.898674466f,
+   (float16_t)-0.449611330f,  (float16_t)0.893224301f,
+   (float16_t)-0.460538711f,  (float16_t)0.887639620f,
+   (float16_t)-0.471396737f,  (float16_t)0.881921264f,
+   (float16_t)-0.482183772f,  (float16_t)0.876070094f,
+   (float16_t)-0.492898192f,  (float16_t)0.870086991f,
+   (float16_t)-0.503538384f,  (float16_t)0.863972856f,
+   (float16_t)-0.514102744f,  (float16_t)0.857728610f,
+   (float16_t)-0.524589683f,  (float16_t)0.851355193f,
+   (float16_t)-0.534997620f,  (float16_t)0.844853565f,
+   (float16_t)-0.545324988f,  (float16_t)0.838224706f,
+   (float16_t)-0.555570233f,  (float16_t)0.831469612f,
+   (float16_t)-0.565731811f,  (float16_t)0.824589303f,
+   (float16_t)-0.575808191f,  (float16_t)0.817584813f,
+   (float16_t)-0.585797857f,  (float16_t)0.810457198f,
+   (float16_t)-0.595699304f,  (float16_t)0.803207531f,
+   (float16_t)-0.605511041f,  (float16_t)0.795836905f,
+   (float16_t)-0.615231591f,  (float16_t)0.788346428f,
+   (float16_t)-0.624859488f,  (float16_t)0.780737229f,
+   (float16_t)-0.634393284f,  (float16_t)0.773010453f,
+   (float16_t)-0.643831543f,  (float16_t)0.765167266f,
+   (float16_t)-0.653172843f,  (float16_t)0.757208847f,
+   (float16_t)-0.662415778f,  (float16_t)0.749136395f,
+   (float16_t)-0.671558955f,  (float16_t)0.740951125f,
+   (float16_t)-0.680600998f,  (float16_t)0.732654272f,
+   (float16_t)-0.689540545f,  (float16_t)0.724247083f,
+   (float16_t)-0.698376249f,  (float16_t)0.715730825f,
+   (float16_t)-0.707106781f,  (float16_t)0.707106781f,
+   (float16_t)-0.715730825f,  (float16_t)0.698376249f,
+   (float16_t)-0.724247083f,  (float16_t)0.689540545f,
+   (float16_t)-0.732654272f,  (float16_t)0.680600998f,
+   (float16_t)-0.740951125f,  (float16_t)0.671558955f,
+   (float16_t)-0.749136395f,  (float16_t)0.662415778f,
+   (float16_t)-0.757208847f,  (float16_t)0.653172843f,
+   (float16_t)-0.765167266f,  (float16_t)0.643831543f,
+   (float16_t)-0.773010453f,  (float16_t)0.634393284f,
+   (float16_t)-0.780737229f,  (float16_t)0.624859488f,
+   (float16_t)-0.788346428f,  (float16_t)0.615231591f,
+   (float16_t)-0.795836905f,  (float16_t)0.605511041f,
+   (float16_t)-0.803207531f,  (float16_t)0.595699304f,
+   (float16_t)-0.810457198f,  (float16_t)0.585797857f,
+   (float16_t)-0.817584813f,  (float16_t)0.575808191f,
+   (float16_t)-0.824589303f,  (float16_t)0.565731811f,
+   (float16_t)-0.831469612f,  (float16_t)0.555570233f,
+   (float16_t)-0.838224706f,  (float16_t)0.545324988f,
+   (float16_t)-0.844853565f,  (float16_t)0.534997620f,
+   (float16_t)-0.851355193f,  (float16_t)0.524589683f,
+   (float16_t)-0.857728610f,  (float16_t)0.514102744f,
+   (float16_t)-0.863972856f,  (float16_t)0.503538384f,
+   (float16_t)-0.870086991f,  (float16_t)0.492898192f,
+   (float16_t)-0.876070094f,  (float16_t)0.482183772f,
+   (float16_t)-0.881921264f,  (float16_t)0.471396737f,
+   (float16_t)-0.887639620f,  (float16_t)0.460538711f,
+   (float16_t)-0.893224301f,  (float16_t)0.449611330f,
+   (float16_t)-0.898674466f,  (float16_t)0.438616239f,
+   (float16_t)-0.903989293f,  (float16_t)0.427555093f,
+   (float16_t)-0.909167983f,  (float16_t)0.416429560f,
+   (float16_t)-0.914209756f,  (float16_t)0.405241314f,
+   (float16_t)-0.919113852f,  (float16_t)0.393992040f,
+   (float16_t)-0.923879533f,  (float16_t)0.382683432f,
+   (float16_t)-0.928506080f,  (float16_t)0.371317194f,
+   (float16_t)-0.932992799f,  (float16_t)0.359895037f,
+   (float16_t)-0.937339012f,  (float16_t)0.348418680f,
+   (float16_t)-0.941544065f,  (float16_t)0.336889853f,
+   (float16_t)-0.945607325f,  (float16_t)0.325310292f,
+   (float16_t)-0.949528181f,  (float16_t)0.313681740f,
+   (float16_t)-0.953306040f,  (float16_t)0.302005949f,
+   (float16_t)-0.956940336f,  (float16_t)0.290284677f,
+   (float16_t)-0.960430519f,  (float16_t)0.278519689f,
+   (float16_t)-0.963776066f,  (float16_t)0.266712757f,
+   (float16_t)-0.966976471f,  (float16_t)0.254865660f,
+   (float16_t)-0.970031253f,  (float16_t)0.242980180f,
+   (float16_t)-0.972939952f,  (float16_t)0.231058108f,
+   (float16_t)-0.975702130f,  (float16_t)0.219101240f,
+   (float16_t)-0.978317371f,  (float16_t)0.207111376f,
+   (float16_t)-0.980785280f,  (float16_t)0.195090322f,
+   (float16_t)-0.983105487f,  (float16_t)0.183039888f,
+   (float16_t)-0.985277642f,  (float16_t)0.170961889f,
+   (float16_t)-0.987301418f,  (float16_t)0.158858143f,
+   (float16_t)-0.989176510f,  (float16_t)0.146730474f,
+   (float16_t)-0.990902635f,  (float16_t)0.134580709f,
+   (float16_t)-0.992479535f,  (float16_t)0.122410675f,
+   (float16_t)-0.993906970f,  (float16_t)0.110222207f,
+   (float16_t)-0.995184727f,  (float16_t)0.098017140f,
+   (float16_t)-0.996312612f,  (float16_t)0.085797312f,
+   (float16_t)-0.997290457f,  (float16_t)0.073564564f,
+   (float16_t)-0.998118113f,  (float16_t)0.061320736f,
+   (float16_t)-0.998795456f,  (float16_t)0.049067674f,
+   (float16_t)-0.999322385f,  (float16_t)0.036807223f,
+   (float16_t)-0.999698819f,  (float16_t)0.024541229f,
+   (float16_t)-0.999924702f,  (float16_t)0.012271538f,
+   (float16_t)-1.000000000f,  (float16_t)0.000000000f,
+   (float16_t)-0.999924702f, (float16_t)-0.012271538f,
+   (float16_t)-0.999698819f, (float16_t)-0.024541229f,
+   (float16_t)-0.999322385f, (float16_t)-0.036807223f,
+   (float16_t)-0.998795456f, (float16_t)-0.049067674f,
+   (float16_t)-0.998118113f, (float16_t)-0.061320736f,
+   (float16_t)-0.997290457f, (float16_t)-0.073564564f,
+   (float16_t)-0.996312612f, (float16_t)-0.085797312f,
+   (float16_t)-0.995184727f, (float16_t)-0.098017140f,
+   (float16_t)-0.993906970f, (float16_t)-0.110222207f,
+   (float16_t)-0.992479535f, (float16_t)-0.122410675f,
+   (float16_t)-0.990902635f, (float16_t)-0.134580709f,
+   (float16_t)-0.989176510f, (float16_t)-0.146730474f,
+   (float16_t)-0.987301418f, (float16_t)-0.158858143f,
+   (float16_t)-0.985277642f, (float16_t)-0.170961889f,
+   (float16_t)-0.983105487f, (float16_t)-0.183039888f,
+   (float16_t)-0.980785280f, (float16_t)-0.195090322f,
+   (float16_t)-0.978317371f, (float16_t)-0.207111376f,
+   (float16_t)-0.975702130f, (float16_t)-0.219101240f,
+   (float16_t)-0.972939952f, (float16_t)-0.231058108f,
+   (float16_t)-0.970031253f, (float16_t)-0.242980180f,
+   (float16_t)-0.966976471f, (float16_t)-0.254865660f,
+   (float16_t)-0.963776066f, (float16_t)-0.266712757f,
+   (float16_t)-0.960430519f, (float16_t)-0.278519689f,
+   (float16_t)-0.956940336f, (float16_t)-0.290284677f,
+   (float16_t)-0.953306040f, (float16_t)-0.302005949f,
+   (float16_t)-0.949528181f, (float16_t)-0.313681740f,
+   (float16_t)-0.945607325f, (float16_t)-0.325310292f,
+   (float16_t)-0.941544065f, (float16_t)-0.336889853f,
+   (float16_t)-0.937339012f, (float16_t)-0.348418680f,
+   (float16_t)-0.932992799f, (float16_t)-0.359895037f,
+   (float16_t)-0.928506080f, (float16_t)-0.371317194f,
+   (float16_t)-0.923879533f, (float16_t)-0.382683432f,
+   (float16_t)-0.919113852f, (float16_t)-0.393992040f,
+   (float16_t)-0.914209756f, (float16_t)-0.405241314f,
+   (float16_t)-0.909167983f, (float16_t)-0.416429560f,
+   (float16_t)-0.903989293f, (float16_t)-0.427555093f,
+   (float16_t)-0.898674466f, (float16_t)-0.438616239f,
+   (float16_t)-0.893224301f, (float16_t)-0.449611330f,
+   (float16_t)-0.887639620f, (float16_t)-0.460538711f,
+   (float16_t)-0.881921264f, (float16_t)-0.471396737f,
+   (float16_t)-0.876070094f, (float16_t)-0.482183772f,
+   (float16_t)-0.870086991f, (float16_t)-0.492898192f,
+   (float16_t)-0.863972856f, (float16_t)-0.503538384f,
+   (float16_t)-0.857728610f, (float16_t)-0.514102744f,
+   (float16_t)-0.851355193f, (float16_t)-0.524589683f,
+   (float16_t)-0.844853565f, (float16_t)-0.534997620f,
+   (float16_t)-0.838224706f, (float16_t)-0.545324988f,
+   (float16_t)-0.831469612f, (float16_t)-0.555570233f,
+   (float16_t)-0.824589303f, (float16_t)-0.565731811f,
+   (float16_t)-0.817584813f, (float16_t)-0.575808191f,
+   (float16_t)-0.810457198f, (float16_t)-0.585797857f,
+   (float16_t)-0.803207531f, (float16_t)-0.595699304f,
+   (float16_t)-0.795836905f, (float16_t)-0.605511041f,
+   (float16_t)-0.788346428f, (float16_t)-0.615231591f,
+   (float16_t)-0.780737229f, (float16_t)-0.624859488f,
+   (float16_t)-0.773010453f, (float16_t)-0.634393284f,
+   (float16_t)-0.765167266f, (float16_t)-0.643831543f,
+   (float16_t)-0.757208847f, (float16_t)-0.653172843f,
+   (float16_t)-0.749136395f, (float16_t)-0.662415778f,
+   (float16_t)-0.740951125f, (float16_t)-0.671558955f,
+   (float16_t)-0.732654272f, (float16_t)-0.680600998f,
+   (float16_t)-0.724247083f, (float16_t)-0.689540545f,
+   (float16_t)-0.715730825f, (float16_t)-0.698376249f,
+   (float16_t)-0.707106781f, (float16_t)-0.707106781f,
+   (float16_t)-0.698376249f, (float16_t)-0.715730825f,
+   (float16_t)-0.689540545f, (float16_t)-0.724247083f,
+   (float16_t)-0.680600998f, (float16_t)-0.732654272f,
+   (float16_t)-0.671558955f, (float16_t)-0.740951125f,
+   (float16_t)-0.662415778f, (float16_t)-0.749136395f,
+   (float16_t)-0.653172843f, (float16_t)-0.757208847f,
+   (float16_t)-0.643831543f, (float16_t)-0.765167266f,
+   (float16_t)-0.634393284f, (float16_t)-0.773010453f,
+   (float16_t)-0.624859488f, (float16_t)-0.780737229f,
+   (float16_t)-0.615231591f, (float16_t)-0.788346428f,
+   (float16_t)-0.605511041f, (float16_t)-0.795836905f,
+   (float16_t)-0.595699304f, (float16_t)-0.803207531f,
+   (float16_t)-0.585797857f, (float16_t)-0.810457198f,
+   (float16_t)-0.575808191f, (float16_t)-0.817584813f,
+   (float16_t)-0.565731811f, (float16_t)-0.824589303f,
+   (float16_t)-0.555570233f, (float16_t)-0.831469612f,
+   (float16_t)-0.545324988f, (float16_t)-0.838224706f,
+   (float16_t)-0.534997620f, (float16_t)-0.844853565f,
+   (float16_t)-0.524589683f, (float16_t)-0.851355193f,
+   (float16_t)-0.514102744f, (float16_t)-0.857728610f,
+   (float16_t)-0.503538384f, (float16_t)-0.863972856f,
+   (float16_t)-0.492898192f, (float16_t)-0.870086991f,
+   (float16_t)-0.482183772f, (float16_t)-0.876070094f,
+   (float16_t)-0.471396737f, (float16_t)-0.881921264f,
+   (float16_t)-0.460538711f, (float16_t)-0.887639620f,
+   (float16_t)-0.449611330f, (float16_t)-0.893224301f,
+   (float16_t)-0.438616239f, (float16_t)-0.898674466f,
+   (float16_t)-0.427555093f, (float16_t)-0.903989293f,
+   (float16_t)-0.416429560f, (float16_t)-0.909167983f,
+   (float16_t)-0.405241314f, (float16_t)-0.914209756f,
+   (float16_t)-0.393992040f, (float16_t)-0.919113852f,
+   (float16_t)-0.382683432f, (float16_t)-0.923879533f,
+   (float16_t)-0.371317194f, (float16_t)-0.928506080f,
+   (float16_t)-0.359895037f, (float16_t)-0.932992799f,
+   (float16_t)-0.348418680f, (float16_t)-0.937339012f,
+   (float16_t)-0.336889853f, (float16_t)-0.941544065f,
+   (float16_t)-0.325310292f, (float16_t)-0.945607325f,
+   (float16_t)-0.313681740f, (float16_t)-0.949528181f,
+   (float16_t)-0.302005949f, (float16_t)-0.953306040f,
+   (float16_t)-0.290284677f, (float16_t)-0.956940336f,
+   (float16_t)-0.278519689f, (float16_t)-0.960430519f,
+   (float16_t)-0.266712757f, (float16_t)-0.963776066f,
+   (float16_t)-0.254865660f, (float16_t)-0.966976471f,
+   (float16_t)-0.242980180f, (float16_t)-0.970031253f,
+   (float16_t)-0.231058108f, (float16_t)-0.972939952f,
+   (float16_t)-0.219101240f, (float16_t)-0.975702130f,
+   (float16_t)-0.207111376f, (float16_t)-0.978317371f,
+   (float16_t)-0.195090322f, (float16_t)-0.980785280f,
+   (float16_t)-0.183039888f, (float16_t)-0.983105487f,
+   (float16_t)-0.170961889f, (float16_t)-0.985277642f,
+   (float16_t)-0.158858143f, (float16_t)-0.987301418f,
+   (float16_t)-0.146730474f, (float16_t)-0.989176510f,
+   (float16_t)-0.134580709f, (float16_t)-0.990902635f,
+   (float16_t)-0.122410675f, (float16_t)-0.992479535f,
+   (float16_t)-0.110222207f, (float16_t)-0.993906970f,
+   (float16_t)-0.098017140f, (float16_t)-0.995184727f,
+   (float16_t)-0.085797312f, (float16_t)-0.996312612f,
+   (float16_t)-0.073564564f, (float16_t)-0.997290457f,
+   (float16_t)-0.061320736f, (float16_t)-0.998118113f,
+   (float16_t)-0.049067674f, (float16_t)-0.998795456f,
+   (float16_t)-0.036807223f, (float16_t)-0.999322385f,
+   (float16_t)-0.024541229f, (float16_t)-0.999698819f,
+   (float16_t)-0.012271538f, (float16_t)-0.999924702f,
+   (float16_t)-0.000000000f, (float16_t)-1.000000000f,
+    (float16_t)0.012271538f, (float16_t)-0.999924702f,
+    (float16_t)0.024541229f, (float16_t)-0.999698819f,
+    (float16_t)0.036807223f, (float16_t)-0.999322385f,
+    (float16_t)0.049067674f, (float16_t)-0.998795456f,
+    (float16_t)0.061320736f, (float16_t)-0.998118113f,
+    (float16_t)0.073564564f, (float16_t)-0.997290457f,
+    (float16_t)0.085797312f, (float16_t)-0.996312612f,
+    (float16_t)0.098017140f, (float16_t)-0.995184727f,
+    (float16_t)0.110222207f, (float16_t)-0.993906970f,
+    (float16_t)0.122410675f, (float16_t)-0.992479535f,
+    (float16_t)0.134580709f, (float16_t)-0.990902635f,
+    (float16_t)0.146730474f, (float16_t)-0.989176510f,
+    (float16_t)0.158858143f, (float16_t)-0.987301418f,
+    (float16_t)0.170961889f, (float16_t)-0.985277642f,
+    (float16_t)0.183039888f, (float16_t)-0.983105487f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f,
+    (float16_t)0.207111376f, (float16_t)-0.978317371f,
+    (float16_t)0.219101240f, (float16_t)-0.975702130f,
+    (float16_t)0.231058108f, (float16_t)-0.972939952f,
+    (float16_t)0.242980180f, (float16_t)-0.970031253f,
+    (float16_t)0.254865660f, (float16_t)-0.966976471f,
+    (float16_t)0.266712757f, (float16_t)-0.963776066f,
+    (float16_t)0.278519689f, (float16_t)-0.960430519f,
+    (float16_t)0.290284677f, (float16_t)-0.956940336f,
+    (float16_t)0.302005949f, (float16_t)-0.953306040f,
+    (float16_t)0.313681740f, (float16_t)-0.949528181f,
+    (float16_t)0.325310292f, (float16_t)-0.945607325f,
+    (float16_t)0.336889853f, (float16_t)-0.941544065f,
+    (float16_t)0.348418680f, (float16_t)-0.937339012f,
+    (float16_t)0.359895037f, (float16_t)-0.932992799f,
+    (float16_t)0.371317194f, (float16_t)-0.928506080f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.393992040f, (float16_t)-0.919113852f,
+    (float16_t)0.405241314f, (float16_t)-0.914209756f,
+    (float16_t)0.416429560f, (float16_t)-0.909167983f,
+    (float16_t)0.427555093f, (float16_t)-0.903989293f,
+    (float16_t)0.438616239f, (float16_t)-0.898674466f,
+    (float16_t)0.449611330f, (float16_t)-0.893224301f,
+    (float16_t)0.460538711f, (float16_t)-0.887639620f,
+    (float16_t)0.471396737f, (float16_t)-0.881921264f,
+    (float16_t)0.482183772f, (float16_t)-0.876070094f,
+    (float16_t)0.492898192f, (float16_t)-0.870086991f,
+    (float16_t)0.503538384f, (float16_t)-0.863972856f,
+    (float16_t)0.514102744f, (float16_t)-0.857728610f,
+    (float16_t)0.524589683f, (float16_t)-0.851355193f,
+    (float16_t)0.534997620f, (float16_t)-0.844853565f,
+    (float16_t)0.545324988f, (float16_t)-0.838224706f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.565731811f, (float16_t)-0.824589303f,
+    (float16_t)0.575808191f, (float16_t)-0.817584813f,
+    (float16_t)0.585797857f, (float16_t)-0.810457198f,
+    (float16_t)0.595699304f, (float16_t)-0.803207531f,
+    (float16_t)0.605511041f, (float16_t)-0.795836905f,
+    (float16_t)0.615231591f, (float16_t)-0.788346428f,
+    (float16_t)0.624859488f, (float16_t)-0.780737229f,
+    (float16_t)0.634393284f, (float16_t)-0.773010453f,
+    (float16_t)0.643831543f, (float16_t)-0.765167266f,
+    (float16_t)0.653172843f, (float16_t)-0.757208847f,
+    (float16_t)0.662415778f, (float16_t)-0.749136395f,
+    (float16_t)0.671558955f, (float16_t)-0.740951125f,
+    (float16_t)0.680600998f, (float16_t)-0.732654272f,
+    (float16_t)0.689540545f, (float16_t)-0.724247083f,
+    (float16_t)0.698376249f, (float16_t)-0.715730825f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.715730825f, (float16_t)-0.698376249f,
+    (float16_t)0.724247083f, (float16_t)-0.689540545f,
+    (float16_t)0.732654272f, (float16_t)-0.680600998f,
+    (float16_t)0.740951125f, (float16_t)-0.671558955f,
+    (float16_t)0.749136395f, (float16_t)-0.662415778f,
+    (float16_t)0.757208847f, (float16_t)-0.653172843f,
+    (float16_t)0.765167266f, (float16_t)-0.643831543f,
+    (float16_t)0.773010453f, (float16_t)-0.634393284f,
+    (float16_t)0.780737229f, (float16_t)-0.624859488f,
+    (float16_t)0.788346428f, (float16_t)-0.615231591f,
+    (float16_t)0.795836905f, (float16_t)-0.605511041f,
+    (float16_t)0.803207531f, (float16_t)-0.595699304f,
+    (float16_t)0.810457198f, (float16_t)-0.585797857f,
+    (float16_t)0.817584813f, (float16_t)-0.575808191f,
+    (float16_t)0.824589303f, (float16_t)-0.565731811f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.838224706f, (float16_t)-0.545324988f,
+    (float16_t)0.844853565f, (float16_t)-0.534997620f,
+    (float16_t)0.851355193f, (float16_t)-0.524589683f,
+    (float16_t)0.857728610f, (float16_t)-0.514102744f,
+    (float16_t)0.863972856f, (float16_t)-0.503538384f,
+    (float16_t)0.870086991f, (float16_t)-0.492898192f,
+    (float16_t)0.876070094f, (float16_t)-0.482183772f,
+    (float16_t)0.881921264f, (float16_t)-0.471396737f,
+    (float16_t)0.887639620f, (float16_t)-0.460538711f,
+    (float16_t)0.893224301f, (float16_t)-0.449611330f,
+    (float16_t)0.898674466f, (float16_t)-0.438616239f,
+    (float16_t)0.903989293f, (float16_t)-0.427555093f,
+    (float16_t)0.909167983f, (float16_t)-0.416429560f,
+    (float16_t)0.914209756f, (float16_t)-0.405241314f,
+    (float16_t)0.919113852f, (float16_t)-0.393992040f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.928506080f, (float16_t)-0.371317194f,
+    (float16_t)0.932992799f, (float16_t)-0.359895037f,
+    (float16_t)0.937339012f, (float16_t)-0.348418680f,
+    (float16_t)0.941544065f, (float16_t)-0.336889853f,
+    (float16_t)0.945607325f, (float16_t)-0.325310292f,
+    (float16_t)0.949528181f, (float16_t)-0.313681740f,
+    (float16_t)0.953306040f, (float16_t)-0.302005949f,
+    (float16_t)0.956940336f, (float16_t)-0.290284677f,
+    (float16_t)0.960430519f, (float16_t)-0.278519689f,
+    (float16_t)0.963776066f, (float16_t)-0.266712757f,
+    (float16_t)0.966976471f, (float16_t)-0.254865660f,
+    (float16_t)0.970031253f, (float16_t)-0.242980180f,
+    (float16_t)0.972939952f, (float16_t)-0.231058108f,
+    (float16_t)0.975702130f, (float16_t)-0.219101240f,
+    (float16_t)0.978317371f, (float16_t)-0.207111376f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f,
+    (float16_t)0.983105487f, (float16_t)-0.183039888f,
+    (float16_t)0.985277642f, (float16_t)-0.170961889f,
+    (float16_t)0.987301418f, (float16_t)-0.158858143f,
+    (float16_t)0.989176510f, (float16_t)-0.146730474f,
+    (float16_t)0.990902635f, (float16_t)-0.134580709f,
+    (float16_t)0.992479535f, (float16_t)-0.122410675f,
+    (float16_t)0.993906970f, (float16_t)-0.110222207f,
+    (float16_t)0.995184727f, (float16_t)-0.098017140f,
+    (float16_t)0.996312612f, (float16_t)-0.085797312f,
+    (float16_t)0.997290457f, (float16_t)-0.073564564f,
+    (float16_t)0.998118113f, (float16_t)-0.061320736f,
+    (float16_t)0.998795456f, (float16_t)-0.049067674f,
+    (float16_t)0.999322385f, (float16_t)-0.036807223f,
+    (float16_t)0.999698819f, (float16_t)-0.024541229f,
+    (float16_t)0.999924702f, (float16_t)-0.012271538f
+};
+
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_1024)
+
+/**
+* \par
+* Example code for Floating-point Twiddle factors Generation:
+* \par
+* <pre>for(i = 0; i < N; i++)
+* {
+* twiddleCoef[2*i]= cos(i * 2*PI/(float)N);
+* twiddleCoef[2*i+1]= sin(i * 2*PI/(float)N);
+* } </pre>
+* \par
+* where N = 1024  and PI = 3.14159265358979
+* \par
+* Cos and Sin values are in interleaved fashion
+*
+*/
+const float16_t twiddleCoefF16_1024[2048] = {
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.999981175f,  (float16_t)0.006135885f,
+    (float16_t)0.999924702f,  (float16_t)0.012271538f,
+    (float16_t)0.999830582f,  (float16_t)0.018406730f,
+    (float16_t)0.999698819f,  (float16_t)0.024541229f,
+    (float16_t)0.999529418f,  (float16_t)0.030674803f,
+    (float16_t)0.999322385f,  (float16_t)0.036807223f,
+    (float16_t)0.999077728f,  (float16_t)0.042938257f,
+    (float16_t)0.998795456f,  (float16_t)0.049067674f,
+    (float16_t)0.998475581f,  (float16_t)0.055195244f,
+    (float16_t)0.998118113f,  (float16_t)0.061320736f,
+    (float16_t)0.997723067f,  (float16_t)0.067443920f,
+    (float16_t)0.997290457f,  (float16_t)0.073564564f,
+    (float16_t)0.996820299f,  (float16_t)0.079682438f,
+    (float16_t)0.996312612f,  (float16_t)0.085797312f,
+    (float16_t)0.995767414f,  (float16_t)0.091908956f,
+    (float16_t)0.995184727f,  (float16_t)0.098017140f,
+    (float16_t)0.994564571f,  (float16_t)0.104121634f,
+    (float16_t)0.993906970f,  (float16_t)0.110222207f,
+    (float16_t)0.993211949f,  (float16_t)0.116318631f,
+    (float16_t)0.992479535f,  (float16_t)0.122410675f,
+    (float16_t)0.991709754f,  (float16_t)0.128498111f,
+    (float16_t)0.990902635f,  (float16_t)0.134580709f,
+    (float16_t)0.990058210f,  (float16_t)0.140658239f,
+    (float16_t)0.989176510f,  (float16_t)0.146730474f,
+    (float16_t)0.988257568f,  (float16_t)0.152797185f,
+    (float16_t)0.987301418f,  (float16_t)0.158858143f,
+    (float16_t)0.986308097f,  (float16_t)0.164913120f,
+    (float16_t)0.985277642f,  (float16_t)0.170961889f,
+    (float16_t)0.984210092f,  (float16_t)0.177004220f,
+    (float16_t)0.983105487f,  (float16_t)0.183039888f,
+    (float16_t)0.981963869f,  (float16_t)0.189068664f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)0.979569766f,  (float16_t)0.201104635f,
+    (float16_t)0.978317371f,  (float16_t)0.207111376f,
+    (float16_t)0.977028143f,  (float16_t)0.213110320f,
+    (float16_t)0.975702130f,  (float16_t)0.219101240f,
+    (float16_t)0.974339383f,  (float16_t)0.225083911f,
+    (float16_t)0.972939952f,  (float16_t)0.231058108f,
+    (float16_t)0.971503891f,  (float16_t)0.237023606f,
+    (float16_t)0.970031253f,  (float16_t)0.242980180f,
+    (float16_t)0.968522094f,  (float16_t)0.248927606f,
+    (float16_t)0.966976471f,  (float16_t)0.254865660f,
+    (float16_t)0.965394442f,  (float16_t)0.260794118f,
+    (float16_t)0.963776066f,  (float16_t)0.266712757f,
+    (float16_t)0.962121404f,  (float16_t)0.272621355f,
+    (float16_t)0.960430519f,  (float16_t)0.278519689f,
+    (float16_t)0.958703475f,  (float16_t)0.284407537f,
+    (float16_t)0.956940336f,  (float16_t)0.290284677f,
+    (float16_t)0.955141168f,  (float16_t)0.296150888f,
+    (float16_t)0.953306040f,  (float16_t)0.302005949f,
+    (float16_t)0.951435021f,  (float16_t)0.307849640f,
+    (float16_t)0.949528181f,  (float16_t)0.313681740f,
+    (float16_t)0.947585591f,  (float16_t)0.319502031f,
+    (float16_t)0.945607325f,  (float16_t)0.325310292f,
+    (float16_t)0.943593458f,  (float16_t)0.331106306f,
+    (float16_t)0.941544065f,  (float16_t)0.336889853f,
+    (float16_t)0.939459224f,  (float16_t)0.342660717f,
+    (float16_t)0.937339012f,  (float16_t)0.348418680f,
+    (float16_t)0.935183510f,  (float16_t)0.354163525f,
+    (float16_t)0.932992799f,  (float16_t)0.359895037f,
+    (float16_t)0.930766961f,  (float16_t)0.365612998f,
+    (float16_t)0.928506080f,  (float16_t)0.371317194f,
+    (float16_t)0.926210242f,  (float16_t)0.377007410f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.921514039f,  (float16_t)0.388345047f,
+    (float16_t)0.919113852f,  (float16_t)0.393992040f,
+    (float16_t)0.916679060f,  (float16_t)0.399624200f,
+    (float16_t)0.914209756f,  (float16_t)0.405241314f,
+    (float16_t)0.911706032f,  (float16_t)0.410843171f,
+    (float16_t)0.909167983f,  (float16_t)0.416429560f,
+    (float16_t)0.906595705f,  (float16_t)0.422000271f,
+    (float16_t)0.903989293f,  (float16_t)0.427555093f,
+    (float16_t)0.901348847f,  (float16_t)0.433093819f,
+    (float16_t)0.898674466f,  (float16_t)0.438616239f,
+    (float16_t)0.895966250f,  (float16_t)0.444122145f,
+    (float16_t)0.893224301f,  (float16_t)0.449611330f,
+    (float16_t)0.890448723f,  (float16_t)0.455083587f,
+    (float16_t)0.887639620f,  (float16_t)0.460538711f,
+    (float16_t)0.884797098f,  (float16_t)0.465976496f,
+    (float16_t)0.881921264f,  (float16_t)0.471396737f,
+    (float16_t)0.879012226f,  (float16_t)0.476799230f,
+    (float16_t)0.876070094f,  (float16_t)0.482183772f,
+    (float16_t)0.873094978f,  (float16_t)0.487550160f,
+    (float16_t)0.870086991f,  (float16_t)0.492898192f,
+    (float16_t)0.867046246f,  (float16_t)0.498227667f,
+    (float16_t)0.863972856f,  (float16_t)0.503538384f,
+    (float16_t)0.860866939f,  (float16_t)0.508830143f,
+    (float16_t)0.857728610f,  (float16_t)0.514102744f,
+    (float16_t)0.854557988f,  (float16_t)0.519355990f,
+    (float16_t)0.851355193f,  (float16_t)0.524589683f,
+    (float16_t)0.848120345f,  (float16_t)0.529803625f,
+    (float16_t)0.844853565f,  (float16_t)0.534997620f,
+    (float16_t)0.841554977f,  (float16_t)0.540171473f,
+    (float16_t)0.838224706f,  (float16_t)0.545324988f,
+    (float16_t)0.834862875f,  (float16_t)0.550457973f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.828045045f,  (float16_t)0.560661576f,
+    (float16_t)0.824589303f,  (float16_t)0.565731811f,
+    (float16_t)0.821102515f,  (float16_t)0.570780746f,
+    (float16_t)0.817584813f,  (float16_t)0.575808191f,
+    (float16_t)0.814036330f,  (float16_t)0.580813958f,
+    (float16_t)0.810457198f,  (float16_t)0.585797857f,
+    (float16_t)0.806847554f,  (float16_t)0.590759702f,
+    (float16_t)0.803207531f,  (float16_t)0.595699304f,
+    (float16_t)0.799537269f,  (float16_t)0.600616479f,
+    (float16_t)0.795836905f,  (float16_t)0.605511041f,
+    (float16_t)0.792106577f,  (float16_t)0.610382806f,
+    (float16_t)0.788346428f,  (float16_t)0.615231591f,
+    (float16_t)0.784556597f,  (float16_t)0.620057212f,
+    (float16_t)0.780737229f,  (float16_t)0.624859488f,
+    (float16_t)0.776888466f,  (float16_t)0.629638239f,
+    (float16_t)0.773010453f,  (float16_t)0.634393284f,
+    (float16_t)0.769103338f,  (float16_t)0.639124445f,
+    (float16_t)0.765167266f,  (float16_t)0.643831543f,
+    (float16_t)0.761202385f,  (float16_t)0.648514401f,
+    (float16_t)0.757208847f,  (float16_t)0.653172843f,
+    (float16_t)0.753186799f,  (float16_t)0.657806693f,
+    (float16_t)0.749136395f,  (float16_t)0.662415778f,
+    (float16_t)0.745057785f,  (float16_t)0.666999922f,
+    (float16_t)0.740951125f,  (float16_t)0.671558955f,
+    (float16_t)0.736816569f,  (float16_t)0.676092704f,
+    (float16_t)0.732654272f,  (float16_t)0.680600998f,
+    (float16_t)0.728464390f,  (float16_t)0.685083668f,
+    (float16_t)0.724247083f,  (float16_t)0.689540545f,
+    (float16_t)0.720002508f,  (float16_t)0.693971461f,
+    (float16_t)0.715730825f,  (float16_t)0.698376249f,
+    (float16_t)0.711432196f,  (float16_t)0.702754744f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.702754744f,  (float16_t)0.711432196f,
+    (float16_t)0.698376249f,  (float16_t)0.715730825f,
+    (float16_t)0.693971461f,  (float16_t)0.720002508f,
+    (float16_t)0.689540545f,  (float16_t)0.724247083f,
+    (float16_t)0.685083668f,  (float16_t)0.728464390f,
+    (float16_t)0.680600998f,  (float16_t)0.732654272f,
+    (float16_t)0.676092704f,  (float16_t)0.736816569f,
+    (float16_t)0.671558955f,  (float16_t)0.740951125f,
+    (float16_t)0.666999922f,  (float16_t)0.745057785f,
+    (float16_t)0.662415778f,  (float16_t)0.749136395f,
+    (float16_t)0.657806693f,  (float16_t)0.753186799f,
+    (float16_t)0.653172843f,  (float16_t)0.757208847f,
+    (float16_t)0.648514401f,  (float16_t)0.761202385f,
+    (float16_t)0.643831543f,  (float16_t)0.765167266f,
+    (float16_t)0.639124445f,  (float16_t)0.769103338f,
+    (float16_t)0.634393284f,  (float16_t)0.773010453f,
+    (float16_t)0.629638239f,  (float16_t)0.776888466f,
+    (float16_t)0.624859488f,  (float16_t)0.780737229f,
+    (float16_t)0.620057212f,  (float16_t)0.784556597f,
+    (float16_t)0.615231591f,  (float16_t)0.788346428f,
+    (float16_t)0.610382806f,  (float16_t)0.792106577f,
+    (float16_t)0.605511041f,  (float16_t)0.795836905f,
+    (float16_t)0.600616479f,  (float16_t)0.799537269f,
+    (float16_t)0.595699304f,  (float16_t)0.803207531f,
+    (float16_t)0.590759702f,  (float16_t)0.806847554f,
+    (float16_t)0.585797857f,  (float16_t)0.810457198f,
+    (float16_t)0.580813958f,  (float16_t)0.814036330f,
+    (float16_t)0.575808191f,  (float16_t)0.817584813f,
+    (float16_t)0.570780746f,  (float16_t)0.821102515f,
+    (float16_t)0.565731811f,  (float16_t)0.824589303f,
+    (float16_t)0.560661576f,  (float16_t)0.828045045f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.550457973f,  (float16_t)0.834862875f,
+    (float16_t)0.545324988f,  (float16_t)0.838224706f,
+    (float16_t)0.540171473f,  (float16_t)0.841554977f,
+    (float16_t)0.534997620f,  (float16_t)0.844853565f,
+    (float16_t)0.529803625f,  (float16_t)0.848120345f,
+    (float16_t)0.524589683f,  (float16_t)0.851355193f,
+    (float16_t)0.519355990f,  (float16_t)0.854557988f,
+    (float16_t)0.514102744f,  (float16_t)0.857728610f,
+    (float16_t)0.508830143f,  (float16_t)0.860866939f,
+    (float16_t)0.503538384f,  (float16_t)0.863972856f,
+    (float16_t)0.498227667f,  (float16_t)0.867046246f,
+    (float16_t)0.492898192f,  (float16_t)0.870086991f,
+    (float16_t)0.487550160f,  (float16_t)0.873094978f,
+    (float16_t)0.482183772f,  (float16_t)0.876070094f,
+    (float16_t)0.476799230f,  (float16_t)0.879012226f,
+    (float16_t)0.471396737f,  (float16_t)0.881921264f,
+    (float16_t)0.465976496f,  (float16_t)0.884797098f,
+    (float16_t)0.460538711f,  (float16_t)0.887639620f,
+    (float16_t)0.455083587f,  (float16_t)0.890448723f,
+    (float16_t)0.449611330f,  (float16_t)0.893224301f,
+    (float16_t)0.444122145f,  (float16_t)0.895966250f,
+    (float16_t)0.438616239f,  (float16_t)0.898674466f,
+    (float16_t)0.433093819f,  (float16_t)0.901348847f,
+    (float16_t)0.427555093f,  (float16_t)0.903989293f,
+    (float16_t)0.422000271f,  (float16_t)0.906595705f,
+    (float16_t)0.416429560f,  (float16_t)0.909167983f,
+    (float16_t)0.410843171f,  (float16_t)0.911706032f,
+    (float16_t)0.405241314f,  (float16_t)0.914209756f,
+    (float16_t)0.399624200f,  (float16_t)0.916679060f,
+    (float16_t)0.393992040f,  (float16_t)0.919113852f,
+    (float16_t)0.388345047f,  (float16_t)0.921514039f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.377007410f,  (float16_t)0.926210242f,
+    (float16_t)0.371317194f,  (float16_t)0.928506080f,
+    (float16_t)0.365612998f,  (float16_t)0.930766961f,
+    (float16_t)0.359895037f,  (float16_t)0.932992799f,
+    (float16_t)0.354163525f,  (float16_t)0.935183510f,
+    (float16_t)0.348418680f,  (float16_t)0.937339012f,
+    (float16_t)0.342660717f,  (float16_t)0.939459224f,
+    (float16_t)0.336889853f,  (float16_t)0.941544065f,
+    (float16_t)0.331106306f,  (float16_t)0.943593458f,
+    (float16_t)0.325310292f,  (float16_t)0.945607325f,
+    (float16_t)0.319502031f,  (float16_t)0.947585591f,
+    (float16_t)0.313681740f,  (float16_t)0.949528181f,
+    (float16_t)0.307849640f,  (float16_t)0.951435021f,
+    (float16_t)0.302005949f,  (float16_t)0.953306040f,
+    (float16_t)0.296150888f,  (float16_t)0.955141168f,
+    (float16_t)0.290284677f,  (float16_t)0.956940336f,
+    (float16_t)0.284407537f,  (float16_t)0.958703475f,
+    (float16_t)0.278519689f,  (float16_t)0.960430519f,
+    (float16_t)0.272621355f,  (float16_t)0.962121404f,
+    (float16_t)0.266712757f,  (float16_t)0.963776066f,
+    (float16_t)0.260794118f,  (float16_t)0.965394442f,
+    (float16_t)0.254865660f,  (float16_t)0.966976471f,
+    (float16_t)0.248927606f,  (float16_t)0.968522094f,
+    (float16_t)0.242980180f,  (float16_t)0.970031253f,
+    (float16_t)0.237023606f,  (float16_t)0.971503891f,
+    (float16_t)0.231058108f,  (float16_t)0.972939952f,
+    (float16_t)0.225083911f,  (float16_t)0.974339383f,
+    (float16_t)0.219101240f,  (float16_t)0.975702130f,
+    (float16_t)0.213110320f,  (float16_t)0.977028143f,
+    (float16_t)0.207111376f,  (float16_t)0.978317371f,
+    (float16_t)0.201104635f,  (float16_t)0.979569766f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.189068664f,  (float16_t)0.981963869f,
+    (float16_t)0.183039888f,  (float16_t)0.983105487f,
+    (float16_t)0.177004220f,  (float16_t)0.984210092f,
+    (float16_t)0.170961889f,  (float16_t)0.985277642f,
+    (float16_t)0.164913120f,  (float16_t)0.986308097f,
+    (float16_t)0.158858143f,  (float16_t)0.987301418f,
+    (float16_t)0.152797185f,  (float16_t)0.988257568f,
+    (float16_t)0.146730474f,  (float16_t)0.989176510f,
+    (float16_t)0.140658239f,  (float16_t)0.990058210f,
+    (float16_t)0.134580709f,  (float16_t)0.990902635f,
+    (float16_t)0.128498111f,  (float16_t)0.991709754f,
+    (float16_t)0.122410675f,  (float16_t)0.992479535f,
+    (float16_t)0.116318631f,  (float16_t)0.993211949f,
+    (float16_t)0.110222207f,  (float16_t)0.993906970f,
+    (float16_t)0.104121634f,  (float16_t)0.994564571f,
+    (float16_t)0.098017140f,  (float16_t)0.995184727f,
+    (float16_t)0.091908956f,  (float16_t)0.995767414f,
+    (float16_t)0.085797312f,  (float16_t)0.996312612f,
+    (float16_t)0.079682438f,  (float16_t)0.996820299f,
+    (float16_t)0.073564564f,  (float16_t)0.997290457f,
+    (float16_t)0.067443920f,  (float16_t)0.997723067f,
+    (float16_t)0.061320736f,  (float16_t)0.998118113f,
+    (float16_t)0.055195244f,  (float16_t)0.998475581f,
+    (float16_t)0.049067674f,  (float16_t)0.998795456f,
+    (float16_t)0.042938257f,  (float16_t)0.999077728f,
+    (float16_t)0.036807223f,  (float16_t)0.999322385f,
+    (float16_t)0.030674803f,  (float16_t)0.999529418f,
+    (float16_t)0.024541229f,  (float16_t)0.999698819f,
+    (float16_t)0.018406730f,  (float16_t)0.999830582f,
+    (float16_t)0.012271538f,  (float16_t)0.999924702f,
+    (float16_t)0.006135885f,  (float16_t)0.999981175f,
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+   (float16_t)-0.006135885f,  (float16_t)0.999981175f,
+   (float16_t)-0.012271538f,  (float16_t)0.999924702f,
+   (float16_t)-0.018406730f,  (float16_t)0.999830582f,
+   (float16_t)-0.024541229f,  (float16_t)0.999698819f,
+   (float16_t)-0.030674803f,  (float16_t)0.999529418f,
+   (float16_t)-0.036807223f,  (float16_t)0.999322385f,
+   (float16_t)-0.042938257f,  (float16_t)0.999077728f,
+   (float16_t)-0.049067674f,  (float16_t)0.998795456f,
+   (float16_t)-0.055195244f,  (float16_t)0.998475581f,
+   (float16_t)-0.061320736f,  (float16_t)0.998118113f,
+   (float16_t)-0.067443920f,  (float16_t)0.997723067f,
+   (float16_t)-0.073564564f,  (float16_t)0.997290457f,
+   (float16_t)-0.079682438f,  (float16_t)0.996820299f,
+   (float16_t)-0.085797312f,  (float16_t)0.996312612f,
+   (float16_t)-0.091908956f,  (float16_t)0.995767414f,
+   (float16_t)-0.098017140f,  (float16_t)0.995184727f,
+   (float16_t)-0.104121634f,  (float16_t)0.994564571f,
+   (float16_t)-0.110222207f,  (float16_t)0.993906970f,
+   (float16_t)-0.116318631f,  (float16_t)0.993211949f,
+   (float16_t)-0.122410675f,  (float16_t)0.992479535f,
+   (float16_t)-0.128498111f,  (float16_t)0.991709754f,
+   (float16_t)-0.134580709f,  (float16_t)0.990902635f,
+   (float16_t)-0.140658239f,  (float16_t)0.990058210f,
+   (float16_t)-0.146730474f,  (float16_t)0.989176510f,
+   (float16_t)-0.152797185f,  (float16_t)0.988257568f,
+   (float16_t)-0.158858143f,  (float16_t)0.987301418f,
+   (float16_t)-0.164913120f,  (float16_t)0.986308097f,
+   (float16_t)-0.170961889f,  (float16_t)0.985277642f,
+   (float16_t)-0.177004220f,  (float16_t)0.984210092f,
+   (float16_t)-0.183039888f,  (float16_t)0.983105487f,
+   (float16_t)-0.189068664f,  (float16_t)0.981963869f,
+   (float16_t)-0.195090322f,  (float16_t)0.980785280f,
+   (float16_t)-0.201104635f,  (float16_t)0.979569766f,
+   (float16_t)-0.207111376f,  (float16_t)0.978317371f,
+   (float16_t)-0.213110320f,  (float16_t)0.977028143f,
+   (float16_t)-0.219101240f,  (float16_t)0.975702130f,
+   (float16_t)-0.225083911f,  (float16_t)0.974339383f,
+   (float16_t)-0.231058108f,  (float16_t)0.972939952f,
+   (float16_t)-0.237023606f,  (float16_t)0.971503891f,
+   (float16_t)-0.242980180f,  (float16_t)0.970031253f,
+   (float16_t)-0.248927606f,  (float16_t)0.968522094f,
+   (float16_t)-0.254865660f,  (float16_t)0.966976471f,
+   (float16_t)-0.260794118f,  (float16_t)0.965394442f,
+   (float16_t)-0.266712757f,  (float16_t)0.963776066f,
+   (float16_t)-0.272621355f,  (float16_t)0.962121404f,
+   (float16_t)-0.278519689f,  (float16_t)0.960430519f,
+   (float16_t)-0.284407537f,  (float16_t)0.958703475f,
+   (float16_t)-0.290284677f,  (float16_t)0.956940336f,
+   (float16_t)-0.296150888f,  (float16_t)0.955141168f,
+   (float16_t)-0.302005949f,  (float16_t)0.953306040f,
+   (float16_t)-0.307849640f,  (float16_t)0.951435021f,
+   (float16_t)-0.313681740f,  (float16_t)0.949528181f,
+   (float16_t)-0.319502031f,  (float16_t)0.947585591f,
+   (float16_t)-0.325310292f,  (float16_t)0.945607325f,
+   (float16_t)-0.331106306f,  (float16_t)0.943593458f,
+   (float16_t)-0.336889853f,  (float16_t)0.941544065f,
+   (float16_t)-0.342660717f,  (float16_t)0.939459224f,
+   (float16_t)-0.348418680f,  (float16_t)0.937339012f,
+   (float16_t)-0.354163525f,  (float16_t)0.935183510f,
+   (float16_t)-0.359895037f,  (float16_t)0.932992799f,
+   (float16_t)-0.365612998f,  (float16_t)0.930766961f,
+   (float16_t)-0.371317194f,  (float16_t)0.928506080f,
+   (float16_t)-0.377007410f,  (float16_t)0.926210242f,
+   (float16_t)-0.382683432f,  (float16_t)0.923879533f,
+   (float16_t)-0.388345047f,  (float16_t)0.921514039f,
+   (float16_t)-0.393992040f,  (float16_t)0.919113852f,
+   (float16_t)-0.399624200f,  (float16_t)0.916679060f,
+   (float16_t)-0.405241314f,  (float16_t)0.914209756f,
+   (float16_t)-0.410843171f,  (float16_t)0.911706032f,
+   (float16_t)-0.416429560f,  (float16_t)0.909167983f,
+   (float16_t)-0.422000271f,  (float16_t)0.906595705f,
+   (float16_t)-0.427555093f,  (float16_t)0.903989293f,
+   (float16_t)-0.433093819f,  (float16_t)0.901348847f,
+   (float16_t)-0.438616239f,  (float16_t)0.898674466f,
+   (float16_t)-0.444122145f,  (float16_t)0.895966250f,
+   (float16_t)-0.449611330f,  (float16_t)0.893224301f,
+   (float16_t)-0.455083587f,  (float16_t)0.890448723f,
+   (float16_t)-0.460538711f,  (float16_t)0.887639620f,
+   (float16_t)-0.465976496f,  (float16_t)0.884797098f,
+   (float16_t)-0.471396737f,  (float16_t)0.881921264f,
+   (float16_t)-0.476799230f,  (float16_t)0.879012226f,
+   (float16_t)-0.482183772f,  (float16_t)0.876070094f,
+   (float16_t)-0.487550160f,  (float16_t)0.873094978f,
+   (float16_t)-0.492898192f,  (float16_t)0.870086991f,
+   (float16_t)-0.498227667f,  (float16_t)0.867046246f,
+   (float16_t)-0.503538384f,  (float16_t)0.863972856f,
+   (float16_t)-0.508830143f,  (float16_t)0.860866939f,
+   (float16_t)-0.514102744f,  (float16_t)0.857728610f,
+   (float16_t)-0.519355990f,  (float16_t)0.854557988f,
+   (float16_t)-0.524589683f,  (float16_t)0.851355193f,
+   (float16_t)-0.529803625f,  (float16_t)0.848120345f,
+   (float16_t)-0.534997620f,  (float16_t)0.844853565f,
+   (float16_t)-0.540171473f,  (float16_t)0.841554977f,
+   (float16_t)-0.545324988f,  (float16_t)0.838224706f,
+   (float16_t)-0.550457973f,  (float16_t)0.834862875f,
+   (float16_t)-0.555570233f,  (float16_t)0.831469612f,
+   (float16_t)-0.560661576f,  (float16_t)0.828045045f,
+   (float16_t)-0.565731811f,  (float16_t)0.824589303f,
+   (float16_t)-0.570780746f,  (float16_t)0.821102515f,
+   (float16_t)-0.575808191f,  (float16_t)0.817584813f,
+   (float16_t)-0.580813958f,  (float16_t)0.814036330f,
+   (float16_t)-0.585797857f,  (float16_t)0.810457198f,
+   (float16_t)-0.590759702f,  (float16_t)0.806847554f,
+   (float16_t)-0.595699304f,  (float16_t)0.803207531f,
+   (float16_t)-0.600616479f,  (float16_t)0.799537269f,
+   (float16_t)-0.605511041f,  (float16_t)0.795836905f,
+   (float16_t)-0.610382806f,  (float16_t)0.792106577f,
+   (float16_t)-0.615231591f,  (float16_t)0.788346428f,
+   (float16_t)-0.620057212f,  (float16_t)0.784556597f,
+   (float16_t)-0.624859488f,  (float16_t)0.780737229f,
+   (float16_t)-0.629638239f,  (float16_t)0.776888466f,
+   (float16_t)-0.634393284f,  (float16_t)0.773010453f,
+   (float16_t)-0.639124445f,  (float16_t)0.769103338f,
+   (float16_t)-0.643831543f,  (float16_t)0.765167266f,
+   (float16_t)-0.648514401f,  (float16_t)0.761202385f,
+   (float16_t)-0.653172843f,  (float16_t)0.757208847f,
+   (float16_t)-0.657806693f,  (float16_t)0.753186799f,
+   (float16_t)-0.662415778f,  (float16_t)0.749136395f,
+   (float16_t)-0.666999922f,  (float16_t)0.745057785f,
+   (float16_t)-0.671558955f,  (float16_t)0.740951125f,
+   (float16_t)-0.676092704f,  (float16_t)0.736816569f,
+   (float16_t)-0.680600998f,  (float16_t)0.732654272f,
+   (float16_t)-0.685083668f,  (float16_t)0.728464390f,
+   (float16_t)-0.689540545f,  (float16_t)0.724247083f,
+   (float16_t)-0.693971461f,  (float16_t)0.720002508f,
+   (float16_t)-0.698376249f,  (float16_t)0.715730825f,
+   (float16_t)-0.702754744f,  (float16_t)0.711432196f,
+   (float16_t)-0.707106781f,  (float16_t)0.707106781f,
+   (float16_t)-0.711432196f,  (float16_t)0.702754744f,
+   (float16_t)-0.715730825f,  (float16_t)0.698376249f,
+   (float16_t)-0.720002508f,  (float16_t)0.693971461f,
+   (float16_t)-0.724247083f,  (float16_t)0.689540545f,
+   (float16_t)-0.728464390f,  (float16_t)0.685083668f,
+   (float16_t)-0.732654272f,  (float16_t)0.680600998f,
+   (float16_t)-0.736816569f,  (float16_t)0.676092704f,
+   (float16_t)-0.740951125f,  (float16_t)0.671558955f,
+   (float16_t)-0.745057785f,  (float16_t)0.666999922f,
+   (float16_t)-0.749136395f,  (float16_t)0.662415778f,
+   (float16_t)-0.753186799f,  (float16_t)0.657806693f,
+   (float16_t)-0.757208847f,  (float16_t)0.653172843f,
+   (float16_t)-0.761202385f,  (float16_t)0.648514401f,
+   (float16_t)-0.765167266f,  (float16_t)0.643831543f,
+   (float16_t)-0.769103338f,  (float16_t)0.639124445f,
+   (float16_t)-0.773010453f,  (float16_t)0.634393284f,
+   (float16_t)-0.776888466f,  (float16_t)0.629638239f,
+   (float16_t)-0.780737229f,  (float16_t)0.624859488f,
+   (float16_t)-0.784556597f,  (float16_t)0.620057212f,
+   (float16_t)-0.788346428f,  (float16_t)0.615231591f,
+   (float16_t)-0.792106577f,  (float16_t)0.610382806f,
+   (float16_t)-0.795836905f,  (float16_t)0.605511041f,
+   (float16_t)-0.799537269f,  (float16_t)0.600616479f,
+   (float16_t)-0.803207531f,  (float16_t)0.595699304f,
+   (float16_t)-0.806847554f,  (float16_t)0.590759702f,
+   (float16_t)-0.810457198f,  (float16_t)0.585797857f,
+   (float16_t)-0.814036330f,  (float16_t)0.580813958f,
+   (float16_t)-0.817584813f,  (float16_t)0.575808191f,
+   (float16_t)-0.821102515f,  (float16_t)0.570780746f,
+   (float16_t)-0.824589303f,  (float16_t)0.565731811f,
+   (float16_t)-0.828045045f,  (float16_t)0.560661576f,
+   (float16_t)-0.831469612f,  (float16_t)0.555570233f,
+   (float16_t)-0.834862875f,  (float16_t)0.550457973f,
+   (float16_t)-0.838224706f,  (float16_t)0.545324988f,
+   (float16_t)-0.841554977f,  (float16_t)0.540171473f,
+   (float16_t)-0.844853565f,  (float16_t)0.534997620f,
+   (float16_t)-0.848120345f,  (float16_t)0.529803625f,
+   (float16_t)-0.851355193f,  (float16_t)0.524589683f,
+   (float16_t)-0.854557988f,  (float16_t)0.519355990f,
+   (float16_t)-0.857728610f,  (float16_t)0.514102744f,
+   (float16_t)-0.860866939f,  (float16_t)0.508830143f,
+   (float16_t)-0.863972856f,  (float16_t)0.503538384f,
+   (float16_t)-0.867046246f,  (float16_t)0.498227667f,
+   (float16_t)-0.870086991f,  (float16_t)0.492898192f,
+   (float16_t)-0.873094978f,  (float16_t)0.487550160f,
+   (float16_t)-0.876070094f,  (float16_t)0.482183772f,
+   (float16_t)-0.879012226f,  (float16_t)0.476799230f,
+   (float16_t)-0.881921264f,  (float16_t)0.471396737f,
+   (float16_t)-0.884797098f,  (float16_t)0.465976496f,
+   (float16_t)-0.887639620f,  (float16_t)0.460538711f,
+   (float16_t)-0.890448723f,  (float16_t)0.455083587f,
+   (float16_t)-0.893224301f,  (float16_t)0.449611330f,
+   (float16_t)-0.895966250f,  (float16_t)0.444122145f,
+   (float16_t)-0.898674466f,  (float16_t)0.438616239f,
+   (float16_t)-0.901348847f,  (float16_t)0.433093819f,
+   (float16_t)-0.903989293f,  (float16_t)0.427555093f,
+   (float16_t)-0.906595705f,  (float16_t)0.422000271f,
+   (float16_t)-0.909167983f,  (float16_t)0.416429560f,
+   (float16_t)-0.911706032f,  (float16_t)0.410843171f,
+   (float16_t)-0.914209756f,  (float16_t)0.405241314f,
+   (float16_t)-0.916679060f,  (float16_t)0.399624200f,
+   (float16_t)-0.919113852f,  (float16_t)0.393992040f,
+   (float16_t)-0.921514039f,  (float16_t)0.388345047f,
+   (float16_t)-0.923879533f,  (float16_t)0.382683432f,
+   (float16_t)-0.926210242f,  (float16_t)0.377007410f,
+   (float16_t)-0.928506080f,  (float16_t)0.371317194f,
+   (float16_t)-0.930766961f,  (float16_t)0.365612998f,
+   (float16_t)-0.932992799f,  (float16_t)0.359895037f,
+   (float16_t)-0.935183510f,  (float16_t)0.354163525f,
+   (float16_t)-0.937339012f,  (float16_t)0.348418680f,
+   (float16_t)-0.939459224f,  (float16_t)0.342660717f,
+   (float16_t)-0.941544065f,  (float16_t)0.336889853f,
+   (float16_t)-0.943593458f,  (float16_t)0.331106306f,
+   (float16_t)-0.945607325f,  (float16_t)0.325310292f,
+   (float16_t)-0.947585591f,  (float16_t)0.319502031f,
+   (float16_t)-0.949528181f,  (float16_t)0.313681740f,
+   (float16_t)-0.951435021f,  (float16_t)0.307849640f,
+   (float16_t)-0.953306040f,  (float16_t)0.302005949f,
+   (float16_t)-0.955141168f,  (float16_t)0.296150888f,
+   (float16_t)-0.956940336f,  (float16_t)0.290284677f,
+   (float16_t)-0.958703475f,  (float16_t)0.284407537f,
+   (float16_t)-0.960430519f,  (float16_t)0.278519689f,
+   (float16_t)-0.962121404f,  (float16_t)0.272621355f,
+   (float16_t)-0.963776066f,  (float16_t)0.266712757f,
+   (float16_t)-0.965394442f,  (float16_t)0.260794118f,
+   (float16_t)-0.966976471f,  (float16_t)0.254865660f,
+   (float16_t)-0.968522094f,  (float16_t)0.248927606f,
+   (float16_t)-0.970031253f,  (float16_t)0.242980180f,
+   (float16_t)-0.971503891f,  (float16_t)0.237023606f,
+   (float16_t)-0.972939952f,  (float16_t)0.231058108f,
+   (float16_t)-0.974339383f,  (float16_t)0.225083911f,
+   (float16_t)-0.975702130f,  (float16_t)0.219101240f,
+   (float16_t)-0.977028143f,  (float16_t)0.213110320f,
+   (float16_t)-0.978317371f,  (float16_t)0.207111376f,
+   (float16_t)-0.979569766f,  (float16_t)0.201104635f,
+   (float16_t)-0.980785280f,  (float16_t)0.195090322f,
+   (float16_t)-0.981963869f,  (float16_t)0.189068664f,
+   (float16_t)-0.983105487f,  (float16_t)0.183039888f,
+   (float16_t)-0.984210092f,  (float16_t)0.177004220f,
+   (float16_t)-0.985277642f,  (float16_t)0.170961889f,
+   (float16_t)-0.986308097f,  (float16_t)0.164913120f,
+   (float16_t)-0.987301418f,  (float16_t)0.158858143f,
+   (float16_t)-0.988257568f,  (float16_t)0.152797185f,
+   (float16_t)-0.989176510f,  (float16_t)0.146730474f,
+   (float16_t)-0.990058210f,  (float16_t)0.140658239f,
+   (float16_t)-0.990902635f,  (float16_t)0.134580709f,
+   (float16_t)-0.991709754f,  (float16_t)0.128498111f,
+   (float16_t)-0.992479535f,  (float16_t)0.122410675f,
+   (float16_t)-0.993211949f,  (float16_t)0.116318631f,
+   (float16_t)-0.993906970f,  (float16_t)0.110222207f,
+   (float16_t)-0.994564571f,  (float16_t)0.104121634f,
+   (float16_t)-0.995184727f,  (float16_t)0.098017140f,
+   (float16_t)-0.995767414f,  (float16_t)0.091908956f,
+   (float16_t)-0.996312612f,  (float16_t)0.085797312f,
+   (float16_t)-0.996820299f,  (float16_t)0.079682438f,
+   (float16_t)-0.997290457f,  (float16_t)0.073564564f,
+   (float16_t)-0.997723067f,  (float16_t)0.067443920f,
+   (float16_t)-0.998118113f,  (float16_t)0.061320736f,
+   (float16_t)-0.998475581f,  (float16_t)0.055195244f,
+   (float16_t)-0.998795456f,  (float16_t)0.049067674f,
+   (float16_t)-0.999077728f,  (float16_t)0.042938257f,
+   (float16_t)-0.999322385f,  (float16_t)0.036807223f,
+   (float16_t)-0.999529418f,  (float16_t)0.030674803f,
+   (float16_t)-0.999698819f,  (float16_t)0.024541229f,
+   (float16_t)-0.999830582f,  (float16_t)0.018406730f,
+   (float16_t)-0.999924702f,  (float16_t)0.012271538f,
+   (float16_t)-0.999981175f,  (float16_t)0.006135885f,
+   (float16_t)-1.000000000f,  (float16_t)0.000000000f,
+   (float16_t)-0.999981175f, (float16_t)-0.006135885f,
+   (float16_t)-0.999924702f, (float16_t)-0.012271538f,
+   (float16_t)-0.999830582f, (float16_t)-0.018406730f,
+   (float16_t)-0.999698819f, (float16_t)-0.024541229f,
+   (float16_t)-0.999529418f, (float16_t)-0.030674803f,
+   (float16_t)-0.999322385f, (float16_t)-0.036807223f,
+   (float16_t)-0.999077728f, (float16_t)-0.042938257f,
+   (float16_t)-0.998795456f, (float16_t)-0.049067674f,
+   (float16_t)-0.998475581f, (float16_t)-0.055195244f,
+   (float16_t)-0.998118113f, (float16_t)-0.061320736f,
+   (float16_t)-0.997723067f, (float16_t)-0.067443920f,
+   (float16_t)-0.997290457f, (float16_t)-0.073564564f,
+   (float16_t)-0.996820299f, (float16_t)-0.079682438f,
+   (float16_t)-0.996312612f, (float16_t)-0.085797312f,
+   (float16_t)-0.995767414f, (float16_t)-0.091908956f,
+   (float16_t)-0.995184727f, (float16_t)-0.098017140f,
+   (float16_t)-0.994564571f, (float16_t)-0.104121634f,
+   (float16_t)-0.993906970f, (float16_t)-0.110222207f,
+   (float16_t)-0.993211949f, (float16_t)-0.116318631f,
+   (float16_t)-0.992479535f, (float16_t)-0.122410675f,
+   (float16_t)-0.991709754f, (float16_t)-0.128498111f,
+   (float16_t)-0.990902635f, (float16_t)-0.134580709f,
+   (float16_t)-0.990058210f, (float16_t)-0.140658239f,
+   (float16_t)-0.989176510f, (float16_t)-0.146730474f,
+   (float16_t)-0.988257568f, (float16_t)-0.152797185f,
+   (float16_t)-0.987301418f, (float16_t)-0.158858143f,
+   (float16_t)-0.986308097f, (float16_t)-0.164913120f,
+   (float16_t)-0.985277642f, (float16_t)-0.170961889f,
+   (float16_t)-0.984210092f, (float16_t)-0.177004220f,
+   (float16_t)-0.983105487f, (float16_t)-0.183039888f,
+   (float16_t)-0.981963869f, (float16_t)-0.189068664f,
+   (float16_t)-0.980785280f, (float16_t)-0.195090322f,
+   (float16_t)-0.979569766f, (float16_t)-0.201104635f,
+   (float16_t)-0.978317371f, (float16_t)-0.207111376f,
+   (float16_t)-0.977028143f, (float16_t)-0.213110320f,
+   (float16_t)-0.975702130f, (float16_t)-0.219101240f,
+   (float16_t)-0.974339383f, (float16_t)-0.225083911f,
+   (float16_t)-0.972939952f, (float16_t)-0.231058108f,
+   (float16_t)-0.971503891f, (float16_t)-0.237023606f,
+   (float16_t)-0.970031253f, (float16_t)-0.242980180f,
+   (float16_t)-0.968522094f, (float16_t)-0.248927606f,
+   (float16_t)-0.966976471f, (float16_t)-0.254865660f,
+   (float16_t)-0.965394442f, (float16_t)-0.260794118f,
+   (float16_t)-0.963776066f, (float16_t)-0.266712757f,
+   (float16_t)-0.962121404f, (float16_t)-0.272621355f,
+   (float16_t)-0.960430519f, (float16_t)-0.278519689f,
+   (float16_t)-0.958703475f, (float16_t)-0.284407537f,
+   (float16_t)-0.956940336f, (float16_t)-0.290284677f,
+   (float16_t)-0.955141168f, (float16_t)-0.296150888f,
+   (float16_t)-0.953306040f, (float16_t)-0.302005949f,
+   (float16_t)-0.951435021f, (float16_t)-0.307849640f,
+   (float16_t)-0.949528181f, (float16_t)-0.313681740f,
+   (float16_t)-0.947585591f, (float16_t)-0.319502031f,
+   (float16_t)-0.945607325f, (float16_t)-0.325310292f,
+   (float16_t)-0.943593458f, (float16_t)-0.331106306f,
+   (float16_t)-0.941544065f, (float16_t)-0.336889853f,
+   (float16_t)-0.939459224f, (float16_t)-0.342660717f,
+   (float16_t)-0.937339012f, (float16_t)-0.348418680f,
+   (float16_t)-0.935183510f, (float16_t)-0.354163525f,
+   (float16_t)-0.932992799f, (float16_t)-0.359895037f,
+   (float16_t)-0.930766961f, (float16_t)-0.365612998f,
+   (float16_t)-0.928506080f, (float16_t)-0.371317194f,
+   (float16_t)-0.926210242f, (float16_t)-0.377007410f,
+   (float16_t)-0.923879533f, (float16_t)-0.382683432f,
+   (float16_t)-0.921514039f, (float16_t)-0.388345047f,
+   (float16_t)-0.919113852f, (float16_t)-0.393992040f,
+   (float16_t)-0.916679060f, (float16_t)-0.399624200f,
+   (float16_t)-0.914209756f, (float16_t)-0.405241314f,
+   (float16_t)-0.911706032f, (float16_t)-0.410843171f,
+   (float16_t)-0.909167983f, (float16_t)-0.416429560f,
+   (float16_t)-0.906595705f, (float16_t)-0.422000271f,
+   (float16_t)-0.903989293f, (float16_t)-0.427555093f,
+   (float16_t)-0.901348847f, (float16_t)-0.433093819f,
+   (float16_t)-0.898674466f, (float16_t)-0.438616239f,
+   (float16_t)-0.895966250f, (float16_t)-0.444122145f,
+   (float16_t)-0.893224301f, (float16_t)-0.449611330f,
+   (float16_t)-0.890448723f, (float16_t)-0.455083587f,
+   (float16_t)-0.887639620f, (float16_t)-0.460538711f,
+   (float16_t)-0.884797098f, (float16_t)-0.465976496f,
+   (float16_t)-0.881921264f, (float16_t)-0.471396737f,
+   (float16_t)-0.879012226f, (float16_t)-0.476799230f,
+   (float16_t)-0.876070094f, (float16_t)-0.482183772f,
+   (float16_t)-0.873094978f, (float16_t)-0.487550160f,
+   (float16_t)-0.870086991f, (float16_t)-0.492898192f,
+   (float16_t)-0.867046246f, (float16_t)-0.498227667f,
+   (float16_t)-0.863972856f, (float16_t)-0.503538384f,
+   (float16_t)-0.860866939f, (float16_t)-0.508830143f,
+   (float16_t)-0.857728610f, (float16_t)-0.514102744f,
+   (float16_t)-0.854557988f, (float16_t)-0.519355990f,
+   (float16_t)-0.851355193f, (float16_t)-0.524589683f,
+   (float16_t)-0.848120345f, (float16_t)-0.529803625f,
+   (float16_t)-0.844853565f, (float16_t)-0.534997620f,
+   (float16_t)-0.841554977f, (float16_t)-0.540171473f,
+   (float16_t)-0.838224706f, (float16_t)-0.545324988f,
+   (float16_t)-0.834862875f, (float16_t)-0.550457973f,
+   (float16_t)-0.831469612f, (float16_t)-0.555570233f,
+   (float16_t)-0.828045045f, (float16_t)-0.560661576f,
+   (float16_t)-0.824589303f, (float16_t)-0.565731811f,
+   (float16_t)-0.821102515f, (float16_t)-0.570780746f,
+   (float16_t)-0.817584813f, (float16_t)-0.575808191f,
+   (float16_t)-0.814036330f, (float16_t)-0.580813958f,
+   (float16_t)-0.810457198f, (float16_t)-0.585797857f,
+   (float16_t)-0.806847554f, (float16_t)-0.590759702f,
+   (float16_t)-0.803207531f, (float16_t)-0.595699304f,
+   (float16_t)-0.799537269f, (float16_t)-0.600616479f,
+   (float16_t)-0.795836905f, (float16_t)-0.605511041f,
+   (float16_t)-0.792106577f, (float16_t)-0.610382806f,
+   (float16_t)-0.788346428f, (float16_t)-0.615231591f,
+   (float16_t)-0.784556597f, (float16_t)-0.620057212f,
+   (float16_t)-0.780737229f, (float16_t)-0.624859488f,
+   (float16_t)-0.776888466f, (float16_t)-0.629638239f,
+   (float16_t)-0.773010453f, (float16_t)-0.634393284f,
+   (float16_t)-0.769103338f, (float16_t)-0.639124445f,
+   (float16_t)-0.765167266f, (float16_t)-0.643831543f,
+   (float16_t)-0.761202385f, (float16_t)-0.648514401f,
+   (float16_t)-0.757208847f, (float16_t)-0.653172843f,
+   (float16_t)-0.753186799f, (float16_t)-0.657806693f,
+   (float16_t)-0.749136395f, (float16_t)-0.662415778f,
+   (float16_t)-0.745057785f, (float16_t)-0.666999922f,
+   (float16_t)-0.740951125f, (float16_t)-0.671558955f,
+   (float16_t)-0.736816569f, (float16_t)-0.676092704f,
+   (float16_t)-0.732654272f, (float16_t)-0.680600998f,
+   (float16_t)-0.728464390f, (float16_t)-0.685083668f,
+   (float16_t)-0.724247083f, (float16_t)-0.689540545f,
+   (float16_t)-0.720002508f, (float16_t)-0.693971461f,
+   (float16_t)-0.715730825f, (float16_t)-0.698376249f,
+   (float16_t)-0.711432196f, (float16_t)-0.702754744f,
+   (float16_t)-0.707106781f, (float16_t)-0.707106781f,
+   (float16_t)-0.702754744f, (float16_t)-0.711432196f,
+   (float16_t)-0.698376249f, (float16_t)-0.715730825f,
+   (float16_t)-0.693971461f, (float16_t)-0.720002508f,
+   (float16_t)-0.689540545f, (float16_t)-0.724247083f,
+   (float16_t)-0.685083668f, (float16_t)-0.728464390f,
+   (float16_t)-0.680600998f, (float16_t)-0.732654272f,
+   (float16_t)-0.676092704f, (float16_t)-0.736816569f,
+   (float16_t)-0.671558955f, (float16_t)-0.740951125f,
+   (float16_t)-0.666999922f, (float16_t)-0.745057785f,
+   (float16_t)-0.662415778f, (float16_t)-0.749136395f,
+   (float16_t)-0.657806693f, (float16_t)-0.753186799f,
+   (float16_t)-0.653172843f, (float16_t)-0.757208847f,
+   (float16_t)-0.648514401f, (float16_t)-0.761202385f,
+   (float16_t)-0.643831543f, (float16_t)-0.765167266f,
+   (float16_t)-0.639124445f, (float16_t)-0.769103338f,
+   (float16_t)-0.634393284f, (float16_t)-0.773010453f,
+   (float16_t)-0.629638239f, (float16_t)-0.776888466f,
+   (float16_t)-0.624859488f, (float16_t)-0.780737229f,
+   (float16_t)-0.620057212f, (float16_t)-0.784556597f,
+   (float16_t)-0.615231591f, (float16_t)-0.788346428f,
+   (float16_t)-0.610382806f, (float16_t)-0.792106577f,
+   (float16_t)-0.605511041f, (float16_t)-0.795836905f,
+   (float16_t)-0.600616479f, (float16_t)-0.799537269f,
+   (float16_t)-0.595699304f, (float16_t)-0.803207531f,
+   (float16_t)-0.590759702f, (float16_t)-0.806847554f,
+   (float16_t)-0.585797857f, (float16_t)-0.810457198f,
+   (float16_t)-0.580813958f, (float16_t)-0.814036330f,
+   (float16_t)-0.575808191f, (float16_t)-0.817584813f,
+   (float16_t)-0.570780746f, (float16_t)-0.821102515f,
+   (float16_t)-0.565731811f, (float16_t)-0.824589303f,
+   (float16_t)-0.560661576f, (float16_t)-0.828045045f,
+   (float16_t)-0.555570233f, (float16_t)-0.831469612f,
+   (float16_t)-0.550457973f, (float16_t)-0.834862875f,
+   (float16_t)-0.545324988f, (float16_t)-0.838224706f,
+   (float16_t)-0.540171473f, (float16_t)-0.841554977f,
+   (float16_t)-0.534997620f, (float16_t)-0.844853565f,
+   (float16_t)-0.529803625f, (float16_t)-0.848120345f,
+   (float16_t)-0.524589683f, (float16_t)-0.851355193f,
+   (float16_t)-0.519355990f, (float16_t)-0.854557988f,
+   (float16_t)-0.514102744f, (float16_t)-0.857728610f,
+   (float16_t)-0.508830143f, (float16_t)-0.860866939f,
+   (float16_t)-0.503538384f, (float16_t)-0.863972856f,
+   (float16_t)-0.498227667f, (float16_t)-0.867046246f,
+   (float16_t)-0.492898192f, (float16_t)-0.870086991f,
+   (float16_t)-0.487550160f, (float16_t)-0.873094978f,
+   (float16_t)-0.482183772f, (float16_t)-0.876070094f,
+   (float16_t)-0.476799230f, (float16_t)-0.879012226f,
+   (float16_t)-0.471396737f, (float16_t)-0.881921264f,
+   (float16_t)-0.465976496f, (float16_t)-0.884797098f,
+   (float16_t)-0.460538711f, (float16_t)-0.887639620f,
+   (float16_t)-0.455083587f, (float16_t)-0.890448723f,
+   (float16_t)-0.449611330f, (float16_t)-0.893224301f,
+   (float16_t)-0.444122145f, (float16_t)-0.895966250f,
+   (float16_t)-0.438616239f, (float16_t)-0.898674466f,
+   (float16_t)-0.433093819f, (float16_t)-0.901348847f,
+   (float16_t)-0.427555093f, (float16_t)-0.903989293f,
+   (float16_t)-0.422000271f, (float16_t)-0.906595705f,
+   (float16_t)-0.416429560f, (float16_t)-0.909167983f,
+   (float16_t)-0.410843171f, (float16_t)-0.911706032f,
+   (float16_t)-0.405241314f, (float16_t)-0.914209756f,
+   (float16_t)-0.399624200f, (float16_t)-0.916679060f,
+   (float16_t)-0.393992040f, (float16_t)-0.919113852f,
+   (float16_t)-0.388345047f, (float16_t)-0.921514039f,
+   (float16_t)-0.382683432f, (float16_t)-0.923879533f,
+   (float16_t)-0.377007410f, (float16_t)-0.926210242f,
+   (float16_t)-0.371317194f, (float16_t)-0.928506080f,
+   (float16_t)-0.365612998f, (float16_t)-0.930766961f,
+   (float16_t)-0.359895037f, (float16_t)-0.932992799f,
+   (float16_t)-0.354163525f, (float16_t)-0.935183510f,
+   (float16_t)-0.348418680f, (float16_t)-0.937339012f,
+   (float16_t)-0.342660717f, (float16_t)-0.939459224f,
+   (float16_t)-0.336889853f, (float16_t)-0.941544065f,
+   (float16_t)-0.331106306f, (float16_t)-0.943593458f,
+   (float16_t)-0.325310292f, (float16_t)-0.945607325f,
+   (float16_t)-0.319502031f, (float16_t)-0.947585591f,
+   (float16_t)-0.313681740f, (float16_t)-0.949528181f,
+   (float16_t)-0.307849640f, (float16_t)-0.951435021f,
+   (float16_t)-0.302005949f, (float16_t)-0.953306040f,
+   (float16_t)-0.296150888f, (float16_t)-0.955141168f,
+   (float16_t)-0.290284677f, (float16_t)-0.956940336f,
+   (float16_t)-0.284407537f, (float16_t)-0.958703475f,
+   (float16_t)-0.278519689f, (float16_t)-0.960430519f,
+   (float16_t)-0.272621355f, (float16_t)-0.962121404f,
+   (float16_t)-0.266712757f, (float16_t)-0.963776066f,
+   (float16_t)-0.260794118f, (float16_t)-0.965394442f,
+   (float16_t)-0.254865660f, (float16_t)-0.966976471f,
+   (float16_t)-0.248927606f, (float16_t)-0.968522094f,
+   (float16_t)-0.242980180f, (float16_t)-0.970031253f,
+   (float16_t)-0.237023606f, (float16_t)-0.971503891f,
+   (float16_t)-0.231058108f, (float16_t)-0.972939952f,
+   (float16_t)-0.225083911f, (float16_t)-0.974339383f,
+   (float16_t)-0.219101240f, (float16_t)-0.975702130f,
+   (float16_t)-0.213110320f, (float16_t)-0.977028143f,
+   (float16_t)-0.207111376f, (float16_t)-0.978317371f,
+   (float16_t)-0.201104635f, (float16_t)-0.979569766f,
+   (float16_t)-0.195090322f, (float16_t)-0.980785280f,
+   (float16_t)-0.189068664f, (float16_t)-0.981963869f,
+   (float16_t)-0.183039888f, (float16_t)-0.983105487f,
+   (float16_t)-0.177004220f, (float16_t)-0.984210092f,
+   (float16_t)-0.170961889f, (float16_t)-0.985277642f,
+   (float16_t)-0.164913120f, (float16_t)-0.986308097f,
+   (float16_t)-0.158858143f, (float16_t)-0.987301418f,
+   (float16_t)-0.152797185f, (float16_t)-0.988257568f,
+   (float16_t)-0.146730474f, (float16_t)-0.989176510f,
+   (float16_t)-0.140658239f, (float16_t)-0.990058210f,
+   (float16_t)-0.134580709f, (float16_t)-0.990902635f,
+   (float16_t)-0.128498111f, (float16_t)-0.991709754f,
+   (float16_t)-0.122410675f, (float16_t)-0.992479535f,
+   (float16_t)-0.116318631f, (float16_t)-0.993211949f,
+   (float16_t)-0.110222207f, (float16_t)-0.993906970f,
+   (float16_t)-0.104121634f, (float16_t)-0.994564571f,
+   (float16_t)-0.098017140f, (float16_t)-0.995184727f,
+   (float16_t)-0.091908956f, (float16_t)-0.995767414f,
+   (float16_t)-0.085797312f, (float16_t)-0.996312612f,
+   (float16_t)-0.079682438f, (float16_t)-0.996820299f,
+   (float16_t)-0.073564564f, (float16_t)-0.997290457f,
+   (float16_t)-0.067443920f, (float16_t)-0.997723067f,
+   (float16_t)-0.061320736f, (float16_t)-0.998118113f,
+   (float16_t)-0.055195244f, (float16_t)-0.998475581f,
+   (float16_t)-0.049067674f, (float16_t)-0.998795456f,
+   (float16_t)-0.042938257f, (float16_t)-0.999077728f,
+   (float16_t)-0.036807223f, (float16_t)-0.999322385f,
+   (float16_t)-0.030674803f, (float16_t)-0.999529418f,
+   (float16_t)-0.024541229f, (float16_t)-0.999698819f,
+   (float16_t)-0.018406730f, (float16_t)-0.999830582f,
+   (float16_t)-0.012271538f, (float16_t)-0.999924702f,
+   (float16_t)-0.006135885f, (float16_t)-0.999981175f,
+   (float16_t)-0.000000000f, (float16_t)-1.000000000f,
+    (float16_t)0.006135885f, (float16_t)-0.999981175f,
+    (float16_t)0.012271538f, (float16_t)-0.999924702f,
+    (float16_t)0.018406730f, (float16_t)-0.999830582f,
+    (float16_t)0.024541229f, (float16_t)-0.999698819f,
+    (float16_t)0.030674803f, (float16_t)-0.999529418f,
+    (float16_t)0.036807223f, (float16_t)-0.999322385f,
+    (float16_t)0.042938257f, (float16_t)-0.999077728f,
+    (float16_t)0.049067674f, (float16_t)-0.998795456f,
+    (float16_t)0.055195244f, (float16_t)-0.998475581f,
+    (float16_t)0.061320736f, (float16_t)-0.998118113f,
+    (float16_t)0.067443920f, (float16_t)-0.997723067f,
+    (float16_t)0.073564564f, (float16_t)-0.997290457f,
+    (float16_t)0.079682438f, (float16_t)-0.996820299f,
+    (float16_t)0.085797312f, (float16_t)-0.996312612f,
+    (float16_t)0.091908956f, (float16_t)-0.995767414f,
+    (float16_t)0.098017140f, (float16_t)-0.995184727f,
+    (float16_t)0.104121634f, (float16_t)-0.994564571f,
+    (float16_t)0.110222207f, (float16_t)-0.993906970f,
+    (float16_t)0.116318631f, (float16_t)-0.993211949f,
+    (float16_t)0.122410675f, (float16_t)-0.992479535f,
+    (float16_t)0.128498111f, (float16_t)-0.991709754f,
+    (float16_t)0.134580709f, (float16_t)-0.990902635f,
+    (float16_t)0.140658239f, (float16_t)-0.990058210f,
+    (float16_t)0.146730474f, (float16_t)-0.989176510f,
+    (float16_t)0.152797185f, (float16_t)-0.988257568f,
+    (float16_t)0.158858143f, (float16_t)-0.987301418f,
+    (float16_t)0.164913120f, (float16_t)-0.986308097f,
+    (float16_t)0.170961889f, (float16_t)-0.985277642f,
+    (float16_t)0.177004220f, (float16_t)-0.984210092f,
+    (float16_t)0.183039888f, (float16_t)-0.983105487f,
+    (float16_t)0.189068664f, (float16_t)-0.981963869f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f,
+    (float16_t)0.201104635f, (float16_t)-0.979569766f,
+    (float16_t)0.207111376f, (float16_t)-0.978317371f,
+    (float16_t)0.213110320f, (float16_t)-0.977028143f,
+    (float16_t)0.219101240f, (float16_t)-0.975702130f,
+    (float16_t)0.225083911f, (float16_t)-0.974339383f,
+    (float16_t)0.231058108f, (float16_t)-0.972939952f,
+    (float16_t)0.237023606f, (float16_t)-0.971503891f,
+    (float16_t)0.242980180f, (float16_t)-0.970031253f,
+    (float16_t)0.248927606f, (float16_t)-0.968522094f,
+    (float16_t)0.254865660f, (float16_t)-0.966976471f,
+    (float16_t)0.260794118f, (float16_t)-0.965394442f,
+    (float16_t)0.266712757f, (float16_t)-0.963776066f,
+    (float16_t)0.272621355f, (float16_t)-0.962121404f,
+    (float16_t)0.278519689f, (float16_t)-0.960430519f,
+    (float16_t)0.284407537f, (float16_t)-0.958703475f,
+    (float16_t)0.290284677f, (float16_t)-0.956940336f,
+    (float16_t)0.296150888f, (float16_t)-0.955141168f,
+    (float16_t)0.302005949f, (float16_t)-0.953306040f,
+    (float16_t)0.307849640f, (float16_t)-0.951435021f,
+    (float16_t)0.313681740f, (float16_t)-0.949528181f,
+    (float16_t)0.319502031f, (float16_t)-0.947585591f,
+    (float16_t)0.325310292f, (float16_t)-0.945607325f,
+    (float16_t)0.331106306f, (float16_t)-0.943593458f,
+    (float16_t)0.336889853f, (float16_t)-0.941544065f,
+    (float16_t)0.342660717f, (float16_t)-0.939459224f,
+    (float16_t)0.348418680f, (float16_t)-0.937339012f,
+    (float16_t)0.354163525f, (float16_t)-0.935183510f,
+    (float16_t)0.359895037f, (float16_t)-0.932992799f,
+    (float16_t)0.365612998f, (float16_t)-0.930766961f,
+    (float16_t)0.371317194f, (float16_t)-0.928506080f,
+    (float16_t)0.377007410f, (float16_t)-0.926210242f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.388345047f, (float16_t)-0.921514039f,
+    (float16_t)0.393992040f, (float16_t)-0.919113852f,
+    (float16_t)0.399624200f, (float16_t)-0.916679060f,
+    (float16_t)0.405241314f, (float16_t)-0.914209756f,
+    (float16_t)0.410843171f, (float16_t)-0.911706032f,
+    (float16_t)0.416429560f, (float16_t)-0.909167983f,
+    (float16_t)0.422000271f, (float16_t)-0.906595705f,
+    (float16_t)0.427555093f, (float16_t)-0.903989293f,
+    (float16_t)0.433093819f, (float16_t)-0.901348847f,
+    (float16_t)0.438616239f, (float16_t)-0.898674466f,
+    (float16_t)0.444122145f, (float16_t)-0.895966250f,
+    (float16_t)0.449611330f, (float16_t)-0.893224301f,
+    (float16_t)0.455083587f, (float16_t)-0.890448723f,
+    (float16_t)0.460538711f, (float16_t)-0.887639620f,
+    (float16_t)0.465976496f, (float16_t)-0.884797098f,
+    (float16_t)0.471396737f, (float16_t)-0.881921264f,
+    (float16_t)0.476799230f, (float16_t)-0.879012226f,
+    (float16_t)0.482183772f, (float16_t)-0.876070094f,
+    (float16_t)0.487550160f, (float16_t)-0.873094978f,
+    (float16_t)0.492898192f, (float16_t)-0.870086991f,
+    (float16_t)0.498227667f, (float16_t)-0.867046246f,
+    (float16_t)0.503538384f, (float16_t)-0.863972856f,
+    (float16_t)0.508830143f, (float16_t)-0.860866939f,
+    (float16_t)0.514102744f, (float16_t)-0.857728610f,
+    (float16_t)0.519355990f, (float16_t)-0.854557988f,
+    (float16_t)0.524589683f, (float16_t)-0.851355193f,
+    (float16_t)0.529803625f, (float16_t)-0.848120345f,
+    (float16_t)0.534997620f, (float16_t)-0.844853565f,
+    (float16_t)0.540171473f, (float16_t)-0.841554977f,
+    (float16_t)0.545324988f, (float16_t)-0.838224706f,
+    (float16_t)0.550457973f, (float16_t)-0.834862875f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.560661576f, (float16_t)-0.828045045f,
+    (float16_t)0.565731811f, (float16_t)-0.824589303f,
+    (float16_t)0.570780746f, (float16_t)-0.821102515f,
+    (float16_t)0.575808191f, (float16_t)-0.817584813f,
+    (float16_t)0.580813958f, (float16_t)-0.814036330f,
+    (float16_t)0.585797857f, (float16_t)-0.810457198f,
+    (float16_t)0.590759702f, (float16_t)-0.806847554f,
+    (float16_t)0.595699304f, (float16_t)-0.803207531f,
+    (float16_t)0.600616479f, (float16_t)-0.799537269f,
+    (float16_t)0.605511041f, (float16_t)-0.795836905f,
+    (float16_t)0.610382806f, (float16_t)-0.792106577f,
+    (float16_t)0.615231591f, (float16_t)-0.788346428f,
+    (float16_t)0.620057212f, (float16_t)-0.784556597f,
+    (float16_t)0.624859488f, (float16_t)-0.780737229f,
+    (float16_t)0.629638239f, (float16_t)-0.776888466f,
+    (float16_t)0.634393284f, (float16_t)-0.773010453f,
+    (float16_t)0.639124445f, (float16_t)-0.769103338f,
+    (float16_t)0.643831543f, (float16_t)-0.765167266f,
+    (float16_t)0.648514401f, (float16_t)-0.761202385f,
+    (float16_t)0.653172843f, (float16_t)-0.757208847f,
+    (float16_t)0.657806693f, (float16_t)-0.753186799f,
+    (float16_t)0.662415778f, (float16_t)-0.749136395f,
+    (float16_t)0.666999922f, (float16_t)-0.745057785f,
+    (float16_t)0.671558955f, (float16_t)-0.740951125f,
+    (float16_t)0.676092704f, (float16_t)-0.736816569f,
+    (float16_t)0.680600998f, (float16_t)-0.732654272f,
+    (float16_t)0.685083668f, (float16_t)-0.728464390f,
+    (float16_t)0.689540545f, (float16_t)-0.724247083f,
+    (float16_t)0.693971461f, (float16_t)-0.720002508f,
+    (float16_t)0.698376249f, (float16_t)-0.715730825f,
+    (float16_t)0.702754744f, (float16_t)-0.711432196f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.711432196f, (float16_t)-0.702754744f,
+    (float16_t)0.715730825f, (float16_t)-0.698376249f,
+    (float16_t)0.720002508f, (float16_t)-0.693971461f,
+    (float16_t)0.724247083f, (float16_t)-0.689540545f,
+    (float16_t)0.728464390f, (float16_t)-0.685083668f,
+    (float16_t)0.732654272f, (float16_t)-0.680600998f,
+    (float16_t)0.736816569f, (float16_t)-0.676092704f,
+    (float16_t)0.740951125f, (float16_t)-0.671558955f,
+    (float16_t)0.745057785f, (float16_t)-0.666999922f,
+    (float16_t)0.749136395f, (float16_t)-0.662415778f,
+    (float16_t)0.753186799f, (float16_t)-0.657806693f,
+    (float16_t)0.757208847f, (float16_t)-0.653172843f,
+    (float16_t)0.761202385f, (float16_t)-0.648514401f,
+    (float16_t)0.765167266f, (float16_t)-0.643831543f,
+    (float16_t)0.769103338f, (float16_t)-0.639124445f,
+    (float16_t)0.773010453f, (float16_t)-0.634393284f,
+    (float16_t)0.776888466f, (float16_t)-0.629638239f,
+    (float16_t)0.780737229f, (float16_t)-0.624859488f,
+    (float16_t)0.784556597f, (float16_t)-0.620057212f,
+    (float16_t)0.788346428f, (float16_t)-0.615231591f,
+    (float16_t)0.792106577f, (float16_t)-0.610382806f,
+    (float16_t)0.795836905f, (float16_t)-0.605511041f,
+    (float16_t)0.799537269f, (float16_t)-0.600616479f,
+    (float16_t)0.803207531f, (float16_t)-0.595699304f,
+    (float16_t)0.806847554f, (float16_t)-0.590759702f,
+    (float16_t)0.810457198f, (float16_t)-0.585797857f,
+    (float16_t)0.814036330f, (float16_t)-0.580813958f,
+    (float16_t)0.817584813f, (float16_t)-0.575808191f,
+    (float16_t)0.821102515f, (float16_t)-0.570780746f,
+    (float16_t)0.824589303f, (float16_t)-0.565731811f,
+    (float16_t)0.828045045f, (float16_t)-0.560661576f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.834862875f, (float16_t)-0.550457973f,
+    (float16_t)0.838224706f, (float16_t)-0.545324988f,
+    (float16_t)0.841554977f, (float16_t)-0.540171473f,
+    (float16_t)0.844853565f, (float16_t)-0.534997620f,
+    (float16_t)0.848120345f, (float16_t)-0.529803625f,
+    (float16_t)0.851355193f, (float16_t)-0.524589683f,
+    (float16_t)0.854557988f, (float16_t)-0.519355990f,
+    (float16_t)0.857728610f, (float16_t)-0.514102744f,
+    (float16_t)0.860866939f, (float16_t)-0.508830143f,
+    (float16_t)0.863972856f, (float16_t)-0.503538384f,
+    (float16_t)0.867046246f, (float16_t)-0.498227667f,
+    (float16_t)0.870086991f, (float16_t)-0.492898192f,
+    (float16_t)0.873094978f, (float16_t)-0.487550160f,
+    (float16_t)0.876070094f, (float16_t)-0.482183772f,
+    (float16_t)0.879012226f, (float16_t)-0.476799230f,
+    (float16_t)0.881921264f, (float16_t)-0.471396737f,
+    (float16_t)0.884797098f, (float16_t)-0.465976496f,
+    (float16_t)0.887639620f, (float16_t)-0.460538711f,
+    (float16_t)0.890448723f, (float16_t)-0.455083587f,
+    (float16_t)0.893224301f, (float16_t)-0.449611330f,
+    (float16_t)0.895966250f, (float16_t)-0.444122145f,
+    (float16_t)0.898674466f, (float16_t)-0.438616239f,
+    (float16_t)0.901348847f, (float16_t)-0.433093819f,
+    (float16_t)0.903989293f, (float16_t)-0.427555093f,
+    (float16_t)0.906595705f, (float16_t)-0.422000271f,
+    (float16_t)0.909167983f, (float16_t)-0.416429560f,
+    (float16_t)0.911706032f, (float16_t)-0.410843171f,
+    (float16_t)0.914209756f, (float16_t)-0.405241314f,
+    (float16_t)0.916679060f, (float16_t)-0.399624200f,
+    (float16_t)0.919113852f, (float16_t)-0.393992040f,
+    (float16_t)0.921514039f, (float16_t)-0.388345047f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.926210242f, (float16_t)-0.377007410f,
+    (float16_t)0.928506080f, (float16_t)-0.371317194f,
+    (float16_t)0.930766961f, (float16_t)-0.365612998f,
+    (float16_t)0.932992799f, (float16_t)-0.359895037f,
+    (float16_t)0.935183510f, (float16_t)-0.354163525f,
+    (float16_t)0.937339012f, (float16_t)-0.348418680f,
+    (float16_t)0.939459224f, (float16_t)-0.342660717f,
+    (float16_t)0.941544065f, (float16_t)-0.336889853f,
+    (float16_t)0.943593458f, (float16_t)-0.331106306f,
+    (float16_t)0.945607325f, (float16_t)-0.325310292f,
+    (float16_t)0.947585591f, (float16_t)-0.319502031f,
+    (float16_t)0.949528181f, (float16_t)-0.313681740f,
+    (float16_t)0.951435021f, (float16_t)-0.307849640f,
+    (float16_t)0.953306040f, (float16_t)-0.302005949f,
+    (float16_t)0.955141168f, (float16_t)-0.296150888f,
+    (float16_t)0.956940336f, (float16_t)-0.290284677f,
+    (float16_t)0.958703475f, (float16_t)-0.284407537f,
+    (float16_t)0.960430519f, (float16_t)-0.278519689f,
+    (float16_t)0.962121404f, (float16_t)-0.272621355f,
+    (float16_t)0.963776066f, (float16_t)-0.266712757f,
+    (float16_t)0.965394442f, (float16_t)-0.260794118f,
+    (float16_t)0.966976471f, (float16_t)-0.254865660f,
+    (float16_t)0.968522094f, (float16_t)-0.248927606f,
+    (float16_t)0.970031253f, (float16_t)-0.242980180f,
+    (float16_t)0.971503891f, (float16_t)-0.237023606f,
+    (float16_t)0.972939952f, (float16_t)-0.231058108f,
+    (float16_t)0.974339383f, (float16_t)-0.225083911f,
+    (float16_t)0.975702130f, (float16_t)-0.219101240f,
+    (float16_t)0.977028143f, (float16_t)-0.213110320f,
+    (float16_t)0.978317371f, (float16_t)-0.207111376f,
+    (float16_t)0.979569766f, (float16_t)-0.201104635f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f,
+    (float16_t)0.981963869f, (float16_t)-0.189068664f,
+    (float16_t)0.983105487f, (float16_t)-0.183039888f,
+    (float16_t)0.984210092f, (float16_t)-0.177004220f,
+    (float16_t)0.985277642f, (float16_t)-0.170961889f,
+    (float16_t)0.986308097f, (float16_t)-0.164913120f,
+    (float16_t)0.987301418f, (float16_t)-0.158858143f,
+    (float16_t)0.988257568f, (float16_t)-0.152797185f,
+    (float16_t)0.989176510f, (float16_t)-0.146730474f,
+    (float16_t)0.990058210f, (float16_t)-0.140658239f,
+    (float16_t)0.990902635f, (float16_t)-0.134580709f,
+    (float16_t)0.991709754f, (float16_t)-0.128498111f,
+    (float16_t)0.992479535f, (float16_t)-0.122410675f,
+    (float16_t)0.993211949f, (float16_t)-0.116318631f,
+    (float16_t)0.993906970f, (float16_t)-0.110222207f,
+    (float16_t)0.994564571f, (float16_t)-0.104121634f,
+    (float16_t)0.995184727f, (float16_t)-0.098017140f,
+    (float16_t)0.995767414f, (float16_t)-0.091908956f,
+    (float16_t)0.996312612f, (float16_t)-0.085797312f,
+    (float16_t)0.996820299f, (float16_t)-0.079682438f,
+    (float16_t)0.997290457f, (float16_t)-0.073564564f,
+    (float16_t)0.997723067f, (float16_t)-0.067443920f,
+    (float16_t)0.998118113f, (float16_t)-0.061320736f,
+    (float16_t)0.998475581f, (float16_t)-0.055195244f,
+    (float16_t)0.998795456f, (float16_t)-0.049067674f,
+    (float16_t)0.999077728f, (float16_t)-0.042938257f,
+    (float16_t)0.999322385f, (float16_t)-0.036807223f,
+    (float16_t)0.999529418f, (float16_t)-0.030674803f,
+    (float16_t)0.999698819f, (float16_t)-0.024541229f,
+    (float16_t)0.999830582f, (float16_t)-0.018406730f,
+    (float16_t)0.999924702f, (float16_t)-0.012271538f,
+    (float16_t)0.999981175f, (float16_t)-0.006135885f
+};
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_2048)
+
+/**
+* \par
+* Example code for Floating-point Twiddle factors Generation:
+* \par
+* <pre>for(i = 0; i < N; i++)
+* {
+* twiddleCoef[2*i]= cos(i * 2*PI/(float)N);
+* twiddleCoef[2*i+1]= sin(i * 2*PI/(float)N);
+* } </pre>
+* \par
+* where N = 2048  and PI = 3.14159265358979
+* \par
+* Cos and Sin values are in interleaved fashion
+*
+*/
+const float16_t twiddleCoefF16_2048[4096] = {
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.999995294f,  (float16_t)0.003067957f,
+    (float16_t)0.999981175f,  (float16_t)0.006135885f,
+    (float16_t)0.999957645f,  (float16_t)0.009203755f,
+    (float16_t)0.999924702f,  (float16_t)0.012271538f,
+    (float16_t)0.999882347f,  (float16_t)0.015339206f,
+    (float16_t)0.999830582f,  (float16_t)0.018406730f,
+    (float16_t)0.999769405f,  (float16_t)0.021474080f,
+    (float16_t)0.999698819f,  (float16_t)0.024541229f,
+    (float16_t)0.999618822f,  (float16_t)0.027608146f,
+    (float16_t)0.999529418f,  (float16_t)0.030674803f,
+    (float16_t)0.999430605f,  (float16_t)0.033741172f,
+    (float16_t)0.999322385f,  (float16_t)0.036807223f,
+    (float16_t)0.999204759f,  (float16_t)0.039872928f,
+    (float16_t)0.999077728f,  (float16_t)0.042938257f,
+    (float16_t)0.998941293f,  (float16_t)0.046003182f,
+    (float16_t)0.998795456f,  (float16_t)0.049067674f,
+    (float16_t)0.998640218f,  (float16_t)0.052131705f,
+    (float16_t)0.998475581f,  (float16_t)0.055195244f,
+    (float16_t)0.998301545f,  (float16_t)0.058258265f,
+    (float16_t)0.998118113f,  (float16_t)0.061320736f,
+    (float16_t)0.997925286f,  (float16_t)0.064382631f,
+    (float16_t)0.997723067f,  (float16_t)0.067443920f,
+    (float16_t)0.997511456f,  (float16_t)0.070504573f,
+    (float16_t)0.997290457f,  (float16_t)0.073564564f,
+    (float16_t)0.997060070f,  (float16_t)0.076623861f,
+    (float16_t)0.996820299f,  (float16_t)0.079682438f,
+    (float16_t)0.996571146f,  (float16_t)0.082740265f,
+    (float16_t)0.996312612f,  (float16_t)0.085797312f,
+    (float16_t)0.996044701f,  (float16_t)0.088853553f,
+    (float16_t)0.995767414f,  (float16_t)0.091908956f,
+    (float16_t)0.995480755f,  (float16_t)0.094963495f,
+    (float16_t)0.995184727f,  (float16_t)0.098017140f,
+    (float16_t)0.994879331f,  (float16_t)0.101069863f,
+    (float16_t)0.994564571f,  (float16_t)0.104121634f,
+    (float16_t)0.994240449f,  (float16_t)0.107172425f,
+    (float16_t)0.993906970f,  (float16_t)0.110222207f,
+    (float16_t)0.993564136f,  (float16_t)0.113270952f,
+    (float16_t)0.993211949f,  (float16_t)0.116318631f,
+    (float16_t)0.992850414f,  (float16_t)0.119365215f,
+    (float16_t)0.992479535f,  (float16_t)0.122410675f,
+    (float16_t)0.992099313f,  (float16_t)0.125454983f,
+    (float16_t)0.991709754f,  (float16_t)0.128498111f,
+    (float16_t)0.991310860f,  (float16_t)0.131540029f,
+    (float16_t)0.990902635f,  (float16_t)0.134580709f,
+    (float16_t)0.990485084f,  (float16_t)0.137620122f,
+    (float16_t)0.990058210f,  (float16_t)0.140658239f,
+    (float16_t)0.989622017f,  (float16_t)0.143695033f,
+    (float16_t)0.989176510f,  (float16_t)0.146730474f,
+    (float16_t)0.988721692f,  (float16_t)0.149764535f,
+    (float16_t)0.988257568f,  (float16_t)0.152797185f,
+    (float16_t)0.987784142f,  (float16_t)0.155828398f,
+    (float16_t)0.987301418f,  (float16_t)0.158858143f,
+    (float16_t)0.986809402f,  (float16_t)0.161886394f,
+    (float16_t)0.986308097f,  (float16_t)0.164913120f,
+    (float16_t)0.985797509f,  (float16_t)0.167938295f,
+    (float16_t)0.985277642f,  (float16_t)0.170961889f,
+    (float16_t)0.984748502f,  (float16_t)0.173983873f,
+    (float16_t)0.984210092f,  (float16_t)0.177004220f,
+    (float16_t)0.983662419f,  (float16_t)0.180022901f,
+    (float16_t)0.983105487f,  (float16_t)0.183039888f,
+    (float16_t)0.982539302f,  (float16_t)0.186055152f,
+    (float16_t)0.981963869f,  (float16_t)0.189068664f,
+    (float16_t)0.981379193f,  (float16_t)0.192080397f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)0.980182136f,  (float16_t)0.198098411f,
+    (float16_t)0.979569766f,  (float16_t)0.201104635f,
+    (float16_t)0.978948175f,  (float16_t)0.204108966f,
+    (float16_t)0.978317371f,  (float16_t)0.207111376f,
+    (float16_t)0.977677358f,  (float16_t)0.210111837f,
+    (float16_t)0.977028143f,  (float16_t)0.213110320f,
+    (float16_t)0.976369731f,  (float16_t)0.216106797f,
+    (float16_t)0.975702130f,  (float16_t)0.219101240f,
+    (float16_t)0.975025345f,  (float16_t)0.222093621f,
+    (float16_t)0.974339383f,  (float16_t)0.225083911f,
+    (float16_t)0.973644250f,  (float16_t)0.228072083f,
+    (float16_t)0.972939952f,  (float16_t)0.231058108f,
+    (float16_t)0.972226497f,  (float16_t)0.234041959f,
+    (float16_t)0.971503891f,  (float16_t)0.237023606f,
+    (float16_t)0.970772141f,  (float16_t)0.240003022f,
+    (float16_t)0.970031253f,  (float16_t)0.242980180f,
+    (float16_t)0.969281235f,  (float16_t)0.245955050f,
+    (float16_t)0.968522094f,  (float16_t)0.248927606f,
+    (float16_t)0.967753837f,  (float16_t)0.251897818f,
+    (float16_t)0.966976471f,  (float16_t)0.254865660f,
+    (float16_t)0.966190003f,  (float16_t)0.257831102f,
+    (float16_t)0.965394442f,  (float16_t)0.260794118f,
+    (float16_t)0.964589793f,  (float16_t)0.263754679f,
+    (float16_t)0.963776066f,  (float16_t)0.266712757f,
+    (float16_t)0.962953267f,  (float16_t)0.269668326f,
+    (float16_t)0.962121404f,  (float16_t)0.272621355f,
+    (float16_t)0.961280486f,  (float16_t)0.275571819f,
+    (float16_t)0.960430519f,  (float16_t)0.278519689f,
+    (float16_t)0.959571513f,  (float16_t)0.281464938f,
+    (float16_t)0.958703475f,  (float16_t)0.284407537f,
+    (float16_t)0.957826413f,  (float16_t)0.287347460f,
+    (float16_t)0.956940336f,  (float16_t)0.290284677f,
+    (float16_t)0.956045251f,  (float16_t)0.293219163f,
+    (float16_t)0.955141168f,  (float16_t)0.296150888f,
+    (float16_t)0.954228095f,  (float16_t)0.299079826f,
+    (float16_t)0.953306040f,  (float16_t)0.302005949f,
+    (float16_t)0.952375013f,  (float16_t)0.304929230f,
+    (float16_t)0.951435021f,  (float16_t)0.307849640f,
+    (float16_t)0.950486074f,  (float16_t)0.310767153f,
+    (float16_t)0.949528181f,  (float16_t)0.313681740f,
+    (float16_t)0.948561350f,  (float16_t)0.316593376f,
+    (float16_t)0.947585591f,  (float16_t)0.319502031f,
+    (float16_t)0.946600913f,  (float16_t)0.322407679f,
+    (float16_t)0.945607325f,  (float16_t)0.325310292f,
+    (float16_t)0.944604837f,  (float16_t)0.328209844f,
+    (float16_t)0.943593458f,  (float16_t)0.331106306f,
+    (float16_t)0.942573198f,  (float16_t)0.333999651f,
+    (float16_t)0.941544065f,  (float16_t)0.336889853f,
+    (float16_t)0.940506071f,  (float16_t)0.339776884f,
+    (float16_t)0.939459224f,  (float16_t)0.342660717f,
+    (float16_t)0.938403534f,  (float16_t)0.345541325f,
+    (float16_t)0.937339012f,  (float16_t)0.348418680f,
+    (float16_t)0.936265667f,  (float16_t)0.351292756f,
+    (float16_t)0.935183510f,  (float16_t)0.354163525f,
+    (float16_t)0.934092550f,  (float16_t)0.357030961f,
+    (float16_t)0.932992799f,  (float16_t)0.359895037f,
+    (float16_t)0.931884266f,  (float16_t)0.362755724f,
+    (float16_t)0.930766961f,  (float16_t)0.365612998f,
+    (float16_t)0.929640896f,  (float16_t)0.368466830f,
+    (float16_t)0.928506080f,  (float16_t)0.371317194f,
+    (float16_t)0.927362526f,  (float16_t)0.374164063f,
+    (float16_t)0.926210242f,  (float16_t)0.377007410f,
+    (float16_t)0.925049241f,  (float16_t)0.379847209f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.922701128f,  (float16_t)0.385516054f,
+    (float16_t)0.921514039f,  (float16_t)0.388345047f,
+    (float16_t)0.920318277f,  (float16_t)0.391170384f,
+    (float16_t)0.919113852f,  (float16_t)0.393992040f,
+    (float16_t)0.917900776f,  (float16_t)0.396809987f,
+    (float16_t)0.916679060f,  (float16_t)0.399624200f,
+    (float16_t)0.915448716f,  (float16_t)0.402434651f,
+    (float16_t)0.914209756f,  (float16_t)0.405241314f,
+    (float16_t)0.912962190f,  (float16_t)0.408044163f,
+    (float16_t)0.911706032f,  (float16_t)0.410843171f,
+    (float16_t)0.910441292f,  (float16_t)0.413638312f,
+    (float16_t)0.909167983f,  (float16_t)0.416429560f,
+    (float16_t)0.907886116f,  (float16_t)0.419216888f,
+    (float16_t)0.906595705f,  (float16_t)0.422000271f,
+    (float16_t)0.905296759f,  (float16_t)0.424779681f,
+    (float16_t)0.903989293f,  (float16_t)0.427555093f,
+    (float16_t)0.902673318f,  (float16_t)0.430326481f,
+    (float16_t)0.901348847f,  (float16_t)0.433093819f,
+    (float16_t)0.900015892f,  (float16_t)0.435857080f,
+    (float16_t)0.898674466f,  (float16_t)0.438616239f,
+    (float16_t)0.897324581f,  (float16_t)0.441371269f,
+    (float16_t)0.895966250f,  (float16_t)0.444122145f,
+    (float16_t)0.894599486f,  (float16_t)0.446868840f,
+    (float16_t)0.893224301f,  (float16_t)0.449611330f,
+    (float16_t)0.891840709f,  (float16_t)0.452349587f,
+    (float16_t)0.890448723f,  (float16_t)0.455083587f,
+    (float16_t)0.889048356f,  (float16_t)0.457813304f,
+    (float16_t)0.887639620f,  (float16_t)0.460538711f,
+    (float16_t)0.886222530f,  (float16_t)0.463259784f,
+    (float16_t)0.884797098f,  (float16_t)0.465976496f,
+    (float16_t)0.883363339f,  (float16_t)0.468688822f,
+    (float16_t)0.881921264f,  (float16_t)0.471396737f,
+    (float16_t)0.880470889f,  (float16_t)0.474100215f,
+    (float16_t)0.879012226f,  (float16_t)0.476799230f,
+    (float16_t)0.877545290f,  (float16_t)0.479493758f,
+    (float16_t)0.876070094f,  (float16_t)0.482183772f,
+    (float16_t)0.874586652f,  (float16_t)0.484869248f,
+    (float16_t)0.873094978f,  (float16_t)0.487550160f,
+    (float16_t)0.871595087f,  (float16_t)0.490226483f,
+    (float16_t)0.870086991f,  (float16_t)0.492898192f,
+    (float16_t)0.868570706f,  (float16_t)0.495565262f,
+    (float16_t)0.867046246f,  (float16_t)0.498227667f,
+    (float16_t)0.865513624f,  (float16_t)0.500885383f,
+    (float16_t)0.863972856f,  (float16_t)0.503538384f,
+    (float16_t)0.862423956f,  (float16_t)0.506186645f,
+    (float16_t)0.860866939f,  (float16_t)0.508830143f,
+    (float16_t)0.859301818f,  (float16_t)0.511468850f,
+    (float16_t)0.857728610f,  (float16_t)0.514102744f,
+    (float16_t)0.856147328f,  (float16_t)0.516731799f,
+    (float16_t)0.854557988f,  (float16_t)0.519355990f,
+    (float16_t)0.852960605f,  (float16_t)0.521975293f,
+    (float16_t)0.851355193f,  (float16_t)0.524589683f,
+    (float16_t)0.849741768f,  (float16_t)0.527199135f,
+    (float16_t)0.848120345f,  (float16_t)0.529803625f,
+    (float16_t)0.846490939f,  (float16_t)0.532403128f,
+    (float16_t)0.844853565f,  (float16_t)0.534997620f,
+    (float16_t)0.843208240f,  (float16_t)0.537587076f,
+    (float16_t)0.841554977f,  (float16_t)0.540171473f,
+    (float16_t)0.839893794f,  (float16_t)0.542750785f,
+    (float16_t)0.838224706f,  (float16_t)0.545324988f,
+    (float16_t)0.836547727f,  (float16_t)0.547894059f,
+    (float16_t)0.834862875f,  (float16_t)0.550457973f,
+    (float16_t)0.833170165f,  (float16_t)0.553016706f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.829761234f,  (float16_t)0.558118531f,
+    (float16_t)0.828045045f,  (float16_t)0.560661576f,
+    (float16_t)0.826321063f,  (float16_t)0.563199344f,
+    (float16_t)0.824589303f,  (float16_t)0.565731811f,
+    (float16_t)0.822849781f,  (float16_t)0.568258953f,
+    (float16_t)0.821102515f,  (float16_t)0.570780746f,
+    (float16_t)0.819347520f,  (float16_t)0.573297167f,
+    (float16_t)0.817584813f,  (float16_t)0.575808191f,
+    (float16_t)0.815814411f,  (float16_t)0.578313796f,
+    (float16_t)0.814036330f,  (float16_t)0.580813958f,
+    (float16_t)0.812250587f,  (float16_t)0.583308653f,
+    (float16_t)0.810457198f,  (float16_t)0.585797857f,
+    (float16_t)0.808656182f,  (float16_t)0.588281548f,
+    (float16_t)0.806847554f,  (float16_t)0.590759702f,
+    (float16_t)0.805031331f,  (float16_t)0.593232295f,
+    (float16_t)0.803207531f,  (float16_t)0.595699304f,
+    (float16_t)0.801376172f,  (float16_t)0.598160707f,
+    (float16_t)0.799537269f,  (float16_t)0.600616479f,
+    (float16_t)0.797690841f,  (float16_t)0.603066599f,
+    (float16_t)0.795836905f,  (float16_t)0.605511041f,
+    (float16_t)0.793975478f,  (float16_t)0.607949785f,
+    (float16_t)0.792106577f,  (float16_t)0.610382806f,
+    (float16_t)0.790230221f,  (float16_t)0.612810082f,
+    (float16_t)0.788346428f,  (float16_t)0.615231591f,
+    (float16_t)0.786455214f,  (float16_t)0.617647308f,
+    (float16_t)0.784556597f,  (float16_t)0.620057212f,
+    (float16_t)0.782650596f,  (float16_t)0.622461279f,
+    (float16_t)0.780737229f,  (float16_t)0.624859488f,
+    (float16_t)0.778816512f,  (float16_t)0.627251815f,
+    (float16_t)0.776888466f,  (float16_t)0.629638239f,
+    (float16_t)0.774953107f,  (float16_t)0.632018736f,
+    (float16_t)0.773010453f,  (float16_t)0.634393284f,
+    (float16_t)0.771060524f,  (float16_t)0.636761861f,
+    (float16_t)0.769103338f,  (float16_t)0.639124445f,
+    (float16_t)0.767138912f,  (float16_t)0.641481013f,
+    (float16_t)0.765167266f,  (float16_t)0.643831543f,
+    (float16_t)0.763188417f,  (float16_t)0.646176013f,
+    (float16_t)0.761202385f,  (float16_t)0.648514401f,
+    (float16_t)0.759209189f,  (float16_t)0.650846685f,
+    (float16_t)0.757208847f,  (float16_t)0.653172843f,
+    (float16_t)0.755201377f,  (float16_t)0.655492853f,
+    (float16_t)0.753186799f,  (float16_t)0.657806693f,
+    (float16_t)0.751165132f,  (float16_t)0.660114342f,
+    (float16_t)0.749136395f,  (float16_t)0.662415778f,
+    (float16_t)0.747100606f,  (float16_t)0.664710978f,
+    (float16_t)0.745057785f,  (float16_t)0.666999922f,
+    (float16_t)0.743007952f,  (float16_t)0.669282588f,
+    (float16_t)0.740951125f,  (float16_t)0.671558955f,
+    (float16_t)0.738887324f,  (float16_t)0.673829000f,
+    (float16_t)0.736816569f,  (float16_t)0.676092704f,
+    (float16_t)0.734738878f,  (float16_t)0.678350043f,
+    (float16_t)0.732654272f,  (float16_t)0.680600998f,
+    (float16_t)0.730562769f,  (float16_t)0.682845546f,
+    (float16_t)0.728464390f,  (float16_t)0.685083668f,
+    (float16_t)0.726359155f,  (float16_t)0.687315341f,
+    (float16_t)0.724247083f,  (float16_t)0.689540545f,
+    (float16_t)0.722128194f,  (float16_t)0.691759258f,
+    (float16_t)0.720002508f,  (float16_t)0.693971461f,
+    (float16_t)0.717870045f,  (float16_t)0.696177131f,
+    (float16_t)0.715730825f,  (float16_t)0.698376249f,
+    (float16_t)0.713584869f,  (float16_t)0.700568794f,
+    (float16_t)0.711432196f,  (float16_t)0.702754744f,
+    (float16_t)0.709272826f,  (float16_t)0.704934080f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.704934080f,  (float16_t)0.709272826f,
+    (float16_t)0.702754744f,  (float16_t)0.711432196f,
+    (float16_t)0.700568794f,  (float16_t)0.713584869f,
+    (float16_t)0.698376249f,  (float16_t)0.715730825f,
+    (float16_t)0.696177131f,  (float16_t)0.717870045f,
+    (float16_t)0.693971461f,  (float16_t)0.720002508f,
+    (float16_t)0.691759258f,  (float16_t)0.722128194f,
+    (float16_t)0.689540545f,  (float16_t)0.724247083f,
+    (float16_t)0.687315341f,  (float16_t)0.726359155f,
+    (float16_t)0.685083668f,  (float16_t)0.728464390f,
+    (float16_t)0.682845546f,  (float16_t)0.730562769f,
+    (float16_t)0.680600998f,  (float16_t)0.732654272f,
+    (float16_t)0.678350043f,  (float16_t)0.734738878f,
+    (float16_t)0.676092704f,  (float16_t)0.736816569f,
+    (float16_t)0.673829000f,  (float16_t)0.738887324f,
+    (float16_t)0.671558955f,  (float16_t)0.740951125f,
+    (float16_t)0.669282588f,  (float16_t)0.743007952f,
+    (float16_t)0.666999922f,  (float16_t)0.745057785f,
+    (float16_t)0.664710978f,  (float16_t)0.747100606f,
+    (float16_t)0.662415778f,  (float16_t)0.749136395f,
+    (float16_t)0.660114342f,  (float16_t)0.751165132f,
+    (float16_t)0.657806693f,  (float16_t)0.753186799f,
+    (float16_t)0.655492853f,  (float16_t)0.755201377f,
+    (float16_t)0.653172843f,  (float16_t)0.757208847f,
+    (float16_t)0.650846685f,  (float16_t)0.759209189f,
+    (float16_t)0.648514401f,  (float16_t)0.761202385f,
+    (float16_t)0.646176013f,  (float16_t)0.763188417f,
+    (float16_t)0.643831543f,  (float16_t)0.765167266f,
+    (float16_t)0.641481013f,  (float16_t)0.767138912f,
+    (float16_t)0.639124445f,  (float16_t)0.769103338f,
+    (float16_t)0.636761861f,  (float16_t)0.771060524f,
+    (float16_t)0.634393284f,  (float16_t)0.773010453f,
+    (float16_t)0.632018736f,  (float16_t)0.774953107f,
+    (float16_t)0.629638239f,  (float16_t)0.776888466f,
+    (float16_t)0.627251815f,  (float16_t)0.778816512f,
+    (float16_t)0.624859488f,  (float16_t)0.780737229f,
+    (float16_t)0.622461279f,  (float16_t)0.782650596f,
+    (float16_t)0.620057212f,  (float16_t)0.784556597f,
+    (float16_t)0.617647308f,  (float16_t)0.786455214f,
+    (float16_t)0.615231591f,  (float16_t)0.788346428f,
+    (float16_t)0.612810082f,  (float16_t)0.790230221f,
+    (float16_t)0.610382806f,  (float16_t)0.792106577f,
+    (float16_t)0.607949785f,  (float16_t)0.793975478f,
+    (float16_t)0.605511041f,  (float16_t)0.795836905f,
+    (float16_t)0.603066599f,  (float16_t)0.797690841f,
+    (float16_t)0.600616479f,  (float16_t)0.799537269f,
+    (float16_t)0.598160707f,  (float16_t)0.801376172f,
+    (float16_t)0.595699304f,  (float16_t)0.803207531f,
+    (float16_t)0.593232295f,  (float16_t)0.805031331f,
+    (float16_t)0.590759702f,  (float16_t)0.806847554f,
+    (float16_t)0.588281548f,  (float16_t)0.808656182f,
+    (float16_t)0.585797857f,  (float16_t)0.810457198f,
+    (float16_t)0.583308653f,  (float16_t)0.812250587f,
+    (float16_t)0.580813958f,  (float16_t)0.814036330f,
+    (float16_t)0.578313796f,  (float16_t)0.815814411f,
+    (float16_t)0.575808191f,  (float16_t)0.817584813f,
+    (float16_t)0.573297167f,  (float16_t)0.819347520f,
+    (float16_t)0.570780746f,  (float16_t)0.821102515f,
+    (float16_t)0.568258953f,  (float16_t)0.822849781f,
+    (float16_t)0.565731811f,  (float16_t)0.824589303f,
+    (float16_t)0.563199344f,  (float16_t)0.826321063f,
+    (float16_t)0.560661576f,  (float16_t)0.828045045f,
+    (float16_t)0.558118531f,  (float16_t)0.829761234f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.553016706f,  (float16_t)0.833170165f,
+    (float16_t)0.550457973f,  (float16_t)0.834862875f,
+    (float16_t)0.547894059f,  (float16_t)0.836547727f,
+    (float16_t)0.545324988f,  (float16_t)0.838224706f,
+    (float16_t)0.542750785f,  (float16_t)0.839893794f,
+    (float16_t)0.540171473f,  (float16_t)0.841554977f,
+    (float16_t)0.537587076f,  (float16_t)0.843208240f,
+    (float16_t)0.534997620f,  (float16_t)0.844853565f,
+    (float16_t)0.532403128f,  (float16_t)0.846490939f,
+    (float16_t)0.529803625f,  (float16_t)0.848120345f,
+    (float16_t)0.527199135f,  (float16_t)0.849741768f,
+    (float16_t)0.524589683f,  (float16_t)0.851355193f,
+    (float16_t)0.521975293f,  (float16_t)0.852960605f,
+    (float16_t)0.519355990f,  (float16_t)0.854557988f,
+    (float16_t)0.516731799f,  (float16_t)0.856147328f,
+    (float16_t)0.514102744f,  (float16_t)0.857728610f,
+    (float16_t)0.511468850f,  (float16_t)0.859301818f,
+    (float16_t)0.508830143f,  (float16_t)0.860866939f,
+    (float16_t)0.506186645f,  (float16_t)0.862423956f,
+    (float16_t)0.503538384f,  (float16_t)0.863972856f,
+    (float16_t)0.500885383f,  (float16_t)0.865513624f,
+    (float16_t)0.498227667f,  (float16_t)0.867046246f,
+    (float16_t)0.495565262f,  (float16_t)0.868570706f,
+    (float16_t)0.492898192f,  (float16_t)0.870086991f,
+    (float16_t)0.490226483f,  (float16_t)0.871595087f,
+    (float16_t)0.487550160f,  (float16_t)0.873094978f,
+    (float16_t)0.484869248f,  (float16_t)0.874586652f,
+    (float16_t)0.482183772f,  (float16_t)0.876070094f,
+    (float16_t)0.479493758f,  (float16_t)0.877545290f,
+    (float16_t)0.476799230f,  (float16_t)0.879012226f,
+    (float16_t)0.474100215f,  (float16_t)0.880470889f,
+    (float16_t)0.471396737f,  (float16_t)0.881921264f,
+    (float16_t)0.468688822f,  (float16_t)0.883363339f,
+    (float16_t)0.465976496f,  (float16_t)0.884797098f,
+    (float16_t)0.463259784f,  (float16_t)0.886222530f,
+    (float16_t)0.460538711f,  (float16_t)0.887639620f,
+    (float16_t)0.457813304f,  (float16_t)0.889048356f,
+    (float16_t)0.455083587f,  (float16_t)0.890448723f,
+    (float16_t)0.452349587f,  (float16_t)0.891840709f,
+    (float16_t)0.449611330f,  (float16_t)0.893224301f,
+    (float16_t)0.446868840f,  (float16_t)0.894599486f,
+    (float16_t)0.444122145f,  (float16_t)0.895966250f,
+    (float16_t)0.441371269f,  (float16_t)0.897324581f,
+    (float16_t)0.438616239f,  (float16_t)0.898674466f,
+    (float16_t)0.435857080f,  (float16_t)0.900015892f,
+    (float16_t)0.433093819f,  (float16_t)0.901348847f,
+    (float16_t)0.430326481f,  (float16_t)0.902673318f,
+    (float16_t)0.427555093f,  (float16_t)0.903989293f,
+    (float16_t)0.424779681f,  (float16_t)0.905296759f,
+    (float16_t)0.422000271f,  (float16_t)0.906595705f,
+    (float16_t)0.419216888f,  (float16_t)0.907886116f,
+    (float16_t)0.416429560f,  (float16_t)0.909167983f,
+    (float16_t)0.413638312f,  (float16_t)0.910441292f,
+    (float16_t)0.410843171f,  (float16_t)0.911706032f,
+    (float16_t)0.408044163f,  (float16_t)0.912962190f,
+    (float16_t)0.405241314f,  (float16_t)0.914209756f,
+    (float16_t)0.402434651f,  (float16_t)0.915448716f,
+    (float16_t)0.399624200f,  (float16_t)0.916679060f,
+    (float16_t)0.396809987f,  (float16_t)0.917900776f,
+    (float16_t)0.393992040f,  (float16_t)0.919113852f,
+    (float16_t)0.391170384f,  (float16_t)0.920318277f,
+    (float16_t)0.388345047f,  (float16_t)0.921514039f,
+    (float16_t)0.385516054f,  (float16_t)0.922701128f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.379847209f,  (float16_t)0.925049241f,
+    (float16_t)0.377007410f,  (float16_t)0.926210242f,
+    (float16_t)0.374164063f,  (float16_t)0.927362526f,
+    (float16_t)0.371317194f,  (float16_t)0.928506080f,
+    (float16_t)0.368466830f,  (float16_t)0.929640896f,
+    (float16_t)0.365612998f,  (float16_t)0.930766961f,
+    (float16_t)0.362755724f,  (float16_t)0.931884266f,
+    (float16_t)0.359895037f,  (float16_t)0.932992799f,
+    (float16_t)0.357030961f,  (float16_t)0.934092550f,
+    (float16_t)0.354163525f,  (float16_t)0.935183510f,
+    (float16_t)0.351292756f,  (float16_t)0.936265667f,
+    (float16_t)0.348418680f,  (float16_t)0.937339012f,
+    (float16_t)0.345541325f,  (float16_t)0.938403534f,
+    (float16_t)0.342660717f,  (float16_t)0.939459224f,
+    (float16_t)0.339776884f,  (float16_t)0.940506071f,
+    (float16_t)0.336889853f,  (float16_t)0.941544065f,
+    (float16_t)0.333999651f,  (float16_t)0.942573198f,
+    (float16_t)0.331106306f,  (float16_t)0.943593458f,
+    (float16_t)0.328209844f,  (float16_t)0.944604837f,
+    (float16_t)0.325310292f,  (float16_t)0.945607325f,
+    (float16_t)0.322407679f,  (float16_t)0.946600913f,
+    (float16_t)0.319502031f,  (float16_t)0.947585591f,
+    (float16_t)0.316593376f,  (float16_t)0.948561350f,
+    (float16_t)0.313681740f,  (float16_t)0.949528181f,
+    (float16_t)0.310767153f,  (float16_t)0.950486074f,
+    (float16_t)0.307849640f,  (float16_t)0.951435021f,
+    (float16_t)0.304929230f,  (float16_t)0.952375013f,
+    (float16_t)0.302005949f,  (float16_t)0.953306040f,
+    (float16_t)0.299079826f,  (float16_t)0.954228095f,
+    (float16_t)0.296150888f,  (float16_t)0.955141168f,
+    (float16_t)0.293219163f,  (float16_t)0.956045251f,
+    (float16_t)0.290284677f,  (float16_t)0.956940336f,
+    (float16_t)0.287347460f,  (float16_t)0.957826413f,
+    (float16_t)0.284407537f,  (float16_t)0.958703475f,
+    (float16_t)0.281464938f,  (float16_t)0.959571513f,
+    (float16_t)0.278519689f,  (float16_t)0.960430519f,
+    (float16_t)0.275571819f,  (float16_t)0.961280486f,
+    (float16_t)0.272621355f,  (float16_t)0.962121404f,
+    (float16_t)0.269668326f,  (float16_t)0.962953267f,
+    (float16_t)0.266712757f,  (float16_t)0.963776066f,
+    (float16_t)0.263754679f,  (float16_t)0.964589793f,
+    (float16_t)0.260794118f,  (float16_t)0.965394442f,
+    (float16_t)0.257831102f,  (float16_t)0.966190003f,
+    (float16_t)0.254865660f,  (float16_t)0.966976471f,
+    (float16_t)0.251897818f,  (float16_t)0.967753837f,
+    (float16_t)0.248927606f,  (float16_t)0.968522094f,
+    (float16_t)0.245955050f,  (float16_t)0.969281235f,
+    (float16_t)0.242980180f,  (float16_t)0.970031253f,
+    (float16_t)0.240003022f,  (float16_t)0.970772141f,
+    (float16_t)0.237023606f,  (float16_t)0.971503891f,
+    (float16_t)0.234041959f,  (float16_t)0.972226497f,
+    (float16_t)0.231058108f,  (float16_t)0.972939952f,
+    (float16_t)0.228072083f,  (float16_t)0.973644250f,
+    (float16_t)0.225083911f,  (float16_t)0.974339383f,
+    (float16_t)0.222093621f,  (float16_t)0.975025345f,
+    (float16_t)0.219101240f,  (float16_t)0.975702130f,
+    (float16_t)0.216106797f,  (float16_t)0.976369731f,
+    (float16_t)0.213110320f,  (float16_t)0.977028143f,
+    (float16_t)0.210111837f,  (float16_t)0.977677358f,
+    (float16_t)0.207111376f,  (float16_t)0.978317371f,
+    (float16_t)0.204108966f,  (float16_t)0.978948175f,
+    (float16_t)0.201104635f,  (float16_t)0.979569766f,
+    (float16_t)0.198098411f,  (float16_t)0.980182136f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.192080397f,  (float16_t)0.981379193f,
+    (float16_t)0.189068664f,  (float16_t)0.981963869f,
+    (float16_t)0.186055152f,  (float16_t)0.982539302f,
+    (float16_t)0.183039888f,  (float16_t)0.983105487f,
+    (float16_t)0.180022901f,  (float16_t)0.983662419f,
+    (float16_t)0.177004220f,  (float16_t)0.984210092f,
+    (float16_t)0.173983873f,  (float16_t)0.984748502f,
+    (float16_t)0.170961889f,  (float16_t)0.985277642f,
+    (float16_t)0.167938295f,  (float16_t)0.985797509f,
+    (float16_t)0.164913120f,  (float16_t)0.986308097f,
+    (float16_t)0.161886394f,  (float16_t)0.986809402f,
+    (float16_t)0.158858143f,  (float16_t)0.987301418f,
+    (float16_t)0.155828398f,  (float16_t)0.987784142f,
+    (float16_t)0.152797185f,  (float16_t)0.988257568f,
+    (float16_t)0.149764535f,  (float16_t)0.988721692f,
+    (float16_t)0.146730474f,  (float16_t)0.989176510f,
+    (float16_t)0.143695033f,  (float16_t)0.989622017f,
+    (float16_t)0.140658239f,  (float16_t)0.990058210f,
+    (float16_t)0.137620122f,  (float16_t)0.990485084f,
+    (float16_t)0.134580709f,  (float16_t)0.990902635f,
+    (float16_t)0.131540029f,  (float16_t)0.991310860f,
+    (float16_t)0.128498111f,  (float16_t)0.991709754f,
+    (float16_t)0.125454983f,  (float16_t)0.992099313f,
+    (float16_t)0.122410675f,  (float16_t)0.992479535f,
+    (float16_t)0.119365215f,  (float16_t)0.992850414f,
+    (float16_t)0.116318631f,  (float16_t)0.993211949f,
+    (float16_t)0.113270952f,  (float16_t)0.993564136f,
+    (float16_t)0.110222207f,  (float16_t)0.993906970f,
+    (float16_t)0.107172425f,  (float16_t)0.994240449f,
+    (float16_t)0.104121634f,  (float16_t)0.994564571f,
+    (float16_t)0.101069863f,  (float16_t)0.994879331f,
+    (float16_t)0.098017140f,  (float16_t)0.995184727f,
+    (float16_t)0.094963495f,  (float16_t)0.995480755f,
+    (float16_t)0.091908956f,  (float16_t)0.995767414f,
+    (float16_t)0.088853553f,  (float16_t)0.996044701f,
+    (float16_t)0.085797312f,  (float16_t)0.996312612f,
+    (float16_t)0.082740265f,  (float16_t)0.996571146f,
+    (float16_t)0.079682438f,  (float16_t)0.996820299f,
+    (float16_t)0.076623861f,  (float16_t)0.997060070f,
+    (float16_t)0.073564564f,  (float16_t)0.997290457f,
+    (float16_t)0.070504573f,  (float16_t)0.997511456f,
+    (float16_t)0.067443920f,  (float16_t)0.997723067f,
+    (float16_t)0.064382631f,  (float16_t)0.997925286f,
+    (float16_t)0.061320736f,  (float16_t)0.998118113f,
+    (float16_t)0.058258265f,  (float16_t)0.998301545f,
+    (float16_t)0.055195244f,  (float16_t)0.998475581f,
+    (float16_t)0.052131705f,  (float16_t)0.998640218f,
+    (float16_t)0.049067674f,  (float16_t)0.998795456f,
+    (float16_t)0.046003182f,  (float16_t)0.998941293f,
+    (float16_t)0.042938257f,  (float16_t)0.999077728f,
+    (float16_t)0.039872928f,  (float16_t)0.999204759f,
+    (float16_t)0.036807223f,  (float16_t)0.999322385f,
+    (float16_t)0.033741172f,  (float16_t)0.999430605f,
+    (float16_t)0.030674803f,  (float16_t)0.999529418f,
+    (float16_t)0.027608146f,  (float16_t)0.999618822f,
+    (float16_t)0.024541229f,  (float16_t)0.999698819f,
+    (float16_t)0.021474080f,  (float16_t)0.999769405f,
+    (float16_t)0.018406730f,  (float16_t)0.999830582f,
+    (float16_t)0.015339206f,  (float16_t)0.999882347f,
+    (float16_t)0.012271538f,  (float16_t)0.999924702f,
+    (float16_t)0.009203755f,  (float16_t)0.999957645f,
+    (float16_t)0.006135885f,  (float16_t)0.999981175f,
+    (float16_t)0.003067957f,  (float16_t)0.999995294f,
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+   (float16_t)-0.003067957f,  (float16_t)0.999995294f,
+   (float16_t)-0.006135885f,  (float16_t)0.999981175f,
+   (float16_t)-0.009203755f,  (float16_t)0.999957645f,
+   (float16_t)-0.012271538f,  (float16_t)0.999924702f,
+   (float16_t)-0.015339206f,  (float16_t)0.999882347f,
+   (float16_t)-0.018406730f,  (float16_t)0.999830582f,
+   (float16_t)-0.021474080f,  (float16_t)0.999769405f,
+   (float16_t)-0.024541229f,  (float16_t)0.999698819f,
+   (float16_t)-0.027608146f,  (float16_t)0.999618822f,
+   (float16_t)-0.030674803f,  (float16_t)0.999529418f,
+   (float16_t)-0.033741172f,  (float16_t)0.999430605f,
+   (float16_t)-0.036807223f,  (float16_t)0.999322385f,
+   (float16_t)-0.039872928f,  (float16_t)0.999204759f,
+   (float16_t)-0.042938257f,  (float16_t)0.999077728f,
+   (float16_t)-0.046003182f,  (float16_t)0.998941293f,
+   (float16_t)-0.049067674f,  (float16_t)0.998795456f,
+   (float16_t)-0.052131705f,  (float16_t)0.998640218f,
+   (float16_t)-0.055195244f,  (float16_t)0.998475581f,
+   (float16_t)-0.058258265f,  (float16_t)0.998301545f,
+   (float16_t)-0.061320736f,  (float16_t)0.998118113f,
+   (float16_t)-0.064382631f,  (float16_t)0.997925286f,
+   (float16_t)-0.067443920f,  (float16_t)0.997723067f,
+   (float16_t)-0.070504573f,  (float16_t)0.997511456f,
+   (float16_t)-0.073564564f,  (float16_t)0.997290457f,
+   (float16_t)-0.076623861f,  (float16_t)0.997060070f,
+   (float16_t)-0.079682438f,  (float16_t)0.996820299f,
+   (float16_t)-0.082740265f,  (float16_t)0.996571146f,
+   (float16_t)-0.085797312f,  (float16_t)0.996312612f,
+   (float16_t)-0.088853553f,  (float16_t)0.996044701f,
+   (float16_t)-0.091908956f,  (float16_t)0.995767414f,
+   (float16_t)-0.094963495f,  (float16_t)0.995480755f,
+   (float16_t)-0.098017140f,  (float16_t)0.995184727f,
+   (float16_t)-0.101069863f,  (float16_t)0.994879331f,
+   (float16_t)-0.104121634f,  (float16_t)0.994564571f,
+   (float16_t)-0.107172425f,  (float16_t)0.994240449f,
+   (float16_t)-0.110222207f,  (float16_t)0.993906970f,
+   (float16_t)-0.113270952f,  (float16_t)0.993564136f,
+   (float16_t)-0.116318631f,  (float16_t)0.993211949f,
+   (float16_t)-0.119365215f,  (float16_t)0.992850414f,
+   (float16_t)-0.122410675f,  (float16_t)0.992479535f,
+   (float16_t)-0.125454983f,  (float16_t)0.992099313f,
+   (float16_t)-0.128498111f,  (float16_t)0.991709754f,
+   (float16_t)-0.131540029f,  (float16_t)0.991310860f,
+   (float16_t)-0.134580709f,  (float16_t)0.990902635f,
+   (float16_t)-0.137620122f,  (float16_t)0.990485084f,
+   (float16_t)-0.140658239f,  (float16_t)0.990058210f,
+   (float16_t)-0.143695033f,  (float16_t)0.989622017f,
+   (float16_t)-0.146730474f,  (float16_t)0.989176510f,
+   (float16_t)-0.149764535f,  (float16_t)0.988721692f,
+   (float16_t)-0.152797185f,  (float16_t)0.988257568f,
+   (float16_t)-0.155828398f,  (float16_t)0.987784142f,
+   (float16_t)-0.158858143f,  (float16_t)0.987301418f,
+   (float16_t)-0.161886394f,  (float16_t)0.986809402f,
+   (float16_t)-0.164913120f,  (float16_t)0.986308097f,
+   (float16_t)-0.167938295f,  (float16_t)0.985797509f,
+   (float16_t)-0.170961889f,  (float16_t)0.985277642f,
+   (float16_t)-0.173983873f,  (float16_t)0.984748502f,
+   (float16_t)-0.177004220f,  (float16_t)0.984210092f,
+   (float16_t)-0.180022901f,  (float16_t)0.983662419f,
+   (float16_t)-0.183039888f,  (float16_t)0.983105487f,
+   (float16_t)-0.186055152f,  (float16_t)0.982539302f,
+   (float16_t)-0.189068664f,  (float16_t)0.981963869f,
+   (float16_t)-0.192080397f,  (float16_t)0.981379193f,
+   (float16_t)-0.195090322f,  (float16_t)0.980785280f,
+   (float16_t)-0.198098411f,  (float16_t)0.980182136f,
+   (float16_t)-0.201104635f,  (float16_t)0.979569766f,
+   (float16_t)-0.204108966f,  (float16_t)0.978948175f,
+   (float16_t)-0.207111376f,  (float16_t)0.978317371f,
+   (float16_t)-0.210111837f,  (float16_t)0.977677358f,
+   (float16_t)-0.213110320f,  (float16_t)0.977028143f,
+   (float16_t)-0.216106797f,  (float16_t)0.976369731f,
+   (float16_t)-0.219101240f,  (float16_t)0.975702130f,
+   (float16_t)-0.222093621f,  (float16_t)0.975025345f,
+   (float16_t)-0.225083911f,  (float16_t)0.974339383f,
+   (float16_t)-0.228072083f,  (float16_t)0.973644250f,
+   (float16_t)-0.231058108f,  (float16_t)0.972939952f,
+   (float16_t)-0.234041959f,  (float16_t)0.972226497f,
+   (float16_t)-0.237023606f,  (float16_t)0.971503891f,
+   (float16_t)-0.240003022f,  (float16_t)0.970772141f,
+   (float16_t)-0.242980180f,  (float16_t)0.970031253f,
+   (float16_t)-0.245955050f,  (float16_t)0.969281235f,
+   (float16_t)-0.248927606f,  (float16_t)0.968522094f,
+   (float16_t)-0.251897818f,  (float16_t)0.967753837f,
+   (float16_t)-0.254865660f,  (float16_t)0.966976471f,
+   (float16_t)-0.257831102f,  (float16_t)0.966190003f,
+   (float16_t)-0.260794118f,  (float16_t)0.965394442f,
+   (float16_t)-0.263754679f,  (float16_t)0.964589793f,
+   (float16_t)-0.266712757f,  (float16_t)0.963776066f,
+   (float16_t)-0.269668326f,  (float16_t)0.962953267f,
+   (float16_t)-0.272621355f,  (float16_t)0.962121404f,
+   (float16_t)-0.275571819f,  (float16_t)0.961280486f,
+   (float16_t)-0.278519689f,  (float16_t)0.960430519f,
+   (float16_t)-0.281464938f,  (float16_t)0.959571513f,
+   (float16_t)-0.284407537f,  (float16_t)0.958703475f,
+   (float16_t)-0.287347460f,  (float16_t)0.957826413f,
+   (float16_t)-0.290284677f,  (float16_t)0.956940336f,
+   (float16_t)-0.293219163f,  (float16_t)0.956045251f,
+   (float16_t)-0.296150888f,  (float16_t)0.955141168f,
+   (float16_t)-0.299079826f,  (float16_t)0.954228095f,
+   (float16_t)-0.302005949f,  (float16_t)0.953306040f,
+   (float16_t)-0.304929230f,  (float16_t)0.952375013f,
+   (float16_t)-0.307849640f,  (float16_t)0.951435021f,
+   (float16_t)-0.310767153f,  (float16_t)0.950486074f,
+   (float16_t)-0.313681740f,  (float16_t)0.949528181f,
+   (float16_t)-0.316593376f,  (float16_t)0.948561350f,
+   (float16_t)-0.319502031f,  (float16_t)0.947585591f,
+   (float16_t)-0.322407679f,  (float16_t)0.946600913f,
+   (float16_t)-0.325310292f,  (float16_t)0.945607325f,
+   (float16_t)-0.328209844f,  (float16_t)0.944604837f,
+   (float16_t)-0.331106306f,  (float16_t)0.943593458f,
+   (float16_t)-0.333999651f,  (float16_t)0.942573198f,
+   (float16_t)-0.336889853f,  (float16_t)0.941544065f,
+   (float16_t)-0.339776884f,  (float16_t)0.940506071f,
+   (float16_t)-0.342660717f,  (float16_t)0.939459224f,
+   (float16_t)-0.345541325f,  (float16_t)0.938403534f,
+   (float16_t)-0.348418680f,  (float16_t)0.937339012f,
+   (float16_t)-0.351292756f,  (float16_t)0.936265667f,
+   (float16_t)-0.354163525f,  (float16_t)0.935183510f,
+   (float16_t)-0.357030961f,  (float16_t)0.934092550f,
+   (float16_t)-0.359895037f,  (float16_t)0.932992799f,
+   (float16_t)-0.362755724f,  (float16_t)0.931884266f,
+   (float16_t)-0.365612998f,  (float16_t)0.930766961f,
+   (float16_t)-0.368466830f,  (float16_t)0.929640896f,
+   (float16_t)-0.371317194f,  (float16_t)0.928506080f,
+   (float16_t)-0.374164063f,  (float16_t)0.927362526f,
+   (float16_t)-0.377007410f,  (float16_t)0.926210242f,
+   (float16_t)-0.379847209f,  (float16_t)0.925049241f,
+   (float16_t)-0.382683432f,  (float16_t)0.923879533f,
+   (float16_t)-0.385516054f,  (float16_t)0.922701128f,
+   (float16_t)-0.388345047f,  (float16_t)0.921514039f,
+   (float16_t)-0.391170384f,  (float16_t)0.920318277f,
+   (float16_t)-0.393992040f,  (float16_t)0.919113852f,
+   (float16_t)-0.396809987f,  (float16_t)0.917900776f,
+   (float16_t)-0.399624200f,  (float16_t)0.916679060f,
+   (float16_t)-0.402434651f,  (float16_t)0.915448716f,
+   (float16_t)-0.405241314f,  (float16_t)0.914209756f,
+   (float16_t)-0.408044163f,  (float16_t)0.912962190f,
+   (float16_t)-0.410843171f,  (float16_t)0.911706032f,
+   (float16_t)-0.413638312f,  (float16_t)0.910441292f,
+   (float16_t)-0.416429560f,  (float16_t)0.909167983f,
+   (float16_t)-0.419216888f,  (float16_t)0.907886116f,
+   (float16_t)-0.422000271f,  (float16_t)0.906595705f,
+   (float16_t)-0.424779681f,  (float16_t)0.905296759f,
+   (float16_t)-0.427555093f,  (float16_t)0.903989293f,
+   (float16_t)-0.430326481f,  (float16_t)0.902673318f,
+   (float16_t)-0.433093819f,  (float16_t)0.901348847f,
+   (float16_t)-0.435857080f,  (float16_t)0.900015892f,
+   (float16_t)-0.438616239f,  (float16_t)0.898674466f,
+   (float16_t)-0.441371269f,  (float16_t)0.897324581f,
+   (float16_t)-0.444122145f,  (float16_t)0.895966250f,
+   (float16_t)-0.446868840f,  (float16_t)0.894599486f,
+   (float16_t)-0.449611330f,  (float16_t)0.893224301f,
+   (float16_t)-0.452349587f,  (float16_t)0.891840709f,
+   (float16_t)-0.455083587f,  (float16_t)0.890448723f,
+   (float16_t)-0.457813304f,  (float16_t)0.889048356f,
+   (float16_t)-0.460538711f,  (float16_t)0.887639620f,
+   (float16_t)-0.463259784f,  (float16_t)0.886222530f,
+   (float16_t)-0.465976496f,  (float16_t)0.884797098f,
+   (float16_t)-0.468688822f,  (float16_t)0.883363339f,
+   (float16_t)-0.471396737f,  (float16_t)0.881921264f,
+   (float16_t)-0.474100215f,  (float16_t)0.880470889f,
+   (float16_t)-0.476799230f,  (float16_t)0.879012226f,
+   (float16_t)-0.479493758f,  (float16_t)0.877545290f,
+   (float16_t)-0.482183772f,  (float16_t)0.876070094f,
+   (float16_t)-0.484869248f,  (float16_t)0.874586652f,
+   (float16_t)-0.487550160f,  (float16_t)0.873094978f,
+   (float16_t)-0.490226483f,  (float16_t)0.871595087f,
+   (float16_t)-0.492898192f,  (float16_t)0.870086991f,
+   (float16_t)-0.495565262f,  (float16_t)0.868570706f,
+   (float16_t)-0.498227667f,  (float16_t)0.867046246f,
+   (float16_t)-0.500885383f,  (float16_t)0.865513624f,
+   (float16_t)-0.503538384f,  (float16_t)0.863972856f,
+   (float16_t)-0.506186645f,  (float16_t)0.862423956f,
+   (float16_t)-0.508830143f,  (float16_t)0.860866939f,
+   (float16_t)-0.511468850f,  (float16_t)0.859301818f,
+   (float16_t)-0.514102744f,  (float16_t)0.857728610f,
+   (float16_t)-0.516731799f,  (float16_t)0.856147328f,
+   (float16_t)-0.519355990f,  (float16_t)0.854557988f,
+   (float16_t)-0.521975293f,  (float16_t)0.852960605f,
+   (float16_t)-0.524589683f,  (float16_t)0.851355193f,
+   (float16_t)-0.527199135f,  (float16_t)0.849741768f,
+   (float16_t)-0.529803625f,  (float16_t)0.848120345f,
+   (float16_t)-0.532403128f,  (float16_t)0.846490939f,
+   (float16_t)-0.534997620f,  (float16_t)0.844853565f,
+   (float16_t)-0.537587076f,  (float16_t)0.843208240f,
+   (float16_t)-0.540171473f,  (float16_t)0.841554977f,
+   (float16_t)-0.542750785f,  (float16_t)0.839893794f,
+   (float16_t)-0.545324988f,  (float16_t)0.838224706f,
+   (float16_t)-0.547894059f,  (float16_t)0.836547727f,
+   (float16_t)-0.550457973f,  (float16_t)0.834862875f,
+   (float16_t)-0.553016706f,  (float16_t)0.833170165f,
+   (float16_t)-0.555570233f,  (float16_t)0.831469612f,
+   (float16_t)-0.558118531f,  (float16_t)0.829761234f,
+   (float16_t)-0.560661576f,  (float16_t)0.828045045f,
+   (float16_t)-0.563199344f,  (float16_t)0.826321063f,
+   (float16_t)-0.565731811f,  (float16_t)0.824589303f,
+   (float16_t)-0.568258953f,  (float16_t)0.822849781f,
+   (float16_t)-0.570780746f,  (float16_t)0.821102515f,
+   (float16_t)-0.573297167f,  (float16_t)0.819347520f,
+   (float16_t)-0.575808191f,  (float16_t)0.817584813f,
+   (float16_t)-0.578313796f,  (float16_t)0.815814411f,
+   (float16_t)-0.580813958f,  (float16_t)0.814036330f,
+   (float16_t)-0.583308653f,  (float16_t)0.812250587f,
+   (float16_t)-0.585797857f,  (float16_t)0.810457198f,
+   (float16_t)-0.588281548f,  (float16_t)0.808656182f,
+   (float16_t)-0.590759702f,  (float16_t)0.806847554f,
+   (float16_t)-0.593232295f,  (float16_t)0.805031331f,
+   (float16_t)-0.595699304f,  (float16_t)0.803207531f,
+   (float16_t)-0.598160707f,  (float16_t)0.801376172f,
+   (float16_t)-0.600616479f,  (float16_t)0.799537269f,
+   (float16_t)-0.603066599f,  (float16_t)0.797690841f,
+   (float16_t)-0.605511041f,  (float16_t)0.795836905f,
+   (float16_t)-0.607949785f,  (float16_t)0.793975478f,
+   (float16_t)-0.610382806f,  (float16_t)0.792106577f,
+   (float16_t)-0.612810082f,  (float16_t)0.790230221f,
+   (float16_t)-0.615231591f,  (float16_t)0.788346428f,
+   (float16_t)-0.617647308f,  (float16_t)0.786455214f,
+   (float16_t)-0.620057212f,  (float16_t)0.784556597f,
+   (float16_t)-0.622461279f,  (float16_t)0.782650596f,
+   (float16_t)-0.624859488f,  (float16_t)0.780737229f,
+   (float16_t)-0.627251815f,  (float16_t)0.778816512f,
+   (float16_t)-0.629638239f,  (float16_t)0.776888466f,
+   (float16_t)-0.632018736f,  (float16_t)0.774953107f,
+   (float16_t)-0.634393284f,  (float16_t)0.773010453f,
+   (float16_t)-0.636761861f,  (float16_t)0.771060524f,
+   (float16_t)-0.639124445f,  (float16_t)0.769103338f,
+   (float16_t)-0.641481013f,  (float16_t)0.767138912f,
+   (float16_t)-0.643831543f,  (float16_t)0.765167266f,
+   (float16_t)-0.646176013f,  (float16_t)0.763188417f,
+   (float16_t)-0.648514401f,  (float16_t)0.761202385f,
+   (float16_t)-0.650846685f,  (float16_t)0.759209189f,
+   (float16_t)-0.653172843f,  (float16_t)0.757208847f,
+   (float16_t)-0.655492853f,  (float16_t)0.755201377f,
+   (float16_t)-0.657806693f,  (float16_t)0.753186799f,
+   (float16_t)-0.660114342f,  (float16_t)0.751165132f,
+   (float16_t)-0.662415778f,  (float16_t)0.749136395f,
+   (float16_t)-0.664710978f,  (float16_t)0.747100606f,
+   (float16_t)-0.666999922f,  (float16_t)0.745057785f,
+   (float16_t)-0.669282588f,  (float16_t)0.743007952f,
+   (float16_t)-0.671558955f,  (float16_t)0.740951125f,
+   (float16_t)-0.673829000f,  (float16_t)0.738887324f,
+   (float16_t)-0.676092704f,  (float16_t)0.736816569f,
+   (float16_t)-0.678350043f,  (float16_t)0.734738878f,
+   (float16_t)-0.680600998f,  (float16_t)0.732654272f,
+   (float16_t)-0.682845546f,  (float16_t)0.730562769f,
+   (float16_t)-0.685083668f,  (float16_t)0.728464390f,
+   (float16_t)-0.687315341f,  (float16_t)0.726359155f,
+   (float16_t)-0.689540545f,  (float16_t)0.724247083f,
+   (float16_t)-0.691759258f,  (float16_t)0.722128194f,
+   (float16_t)-0.693971461f,  (float16_t)0.720002508f,
+   (float16_t)-0.696177131f,  (float16_t)0.717870045f,
+   (float16_t)-0.698376249f,  (float16_t)0.715730825f,
+   (float16_t)-0.700568794f,  (float16_t)0.713584869f,
+   (float16_t)-0.702754744f,  (float16_t)0.711432196f,
+   (float16_t)-0.704934080f,  (float16_t)0.709272826f,
+   (float16_t)-0.707106781f,  (float16_t)0.707106781f,
+   (float16_t)-0.709272826f,  (float16_t)0.704934080f,
+   (float16_t)-0.711432196f,  (float16_t)0.702754744f,
+   (float16_t)-0.713584869f,  (float16_t)0.700568794f,
+   (float16_t)-0.715730825f,  (float16_t)0.698376249f,
+   (float16_t)-0.717870045f,  (float16_t)0.696177131f,
+   (float16_t)-0.720002508f,  (float16_t)0.693971461f,
+   (float16_t)-0.722128194f,  (float16_t)0.691759258f,
+   (float16_t)-0.724247083f,  (float16_t)0.689540545f,
+   (float16_t)-0.726359155f,  (float16_t)0.687315341f,
+   (float16_t)-0.728464390f,  (float16_t)0.685083668f,
+   (float16_t)-0.730562769f,  (float16_t)0.682845546f,
+   (float16_t)-0.732654272f,  (float16_t)0.680600998f,
+   (float16_t)-0.734738878f,  (float16_t)0.678350043f,
+   (float16_t)-0.736816569f,  (float16_t)0.676092704f,
+   (float16_t)-0.738887324f,  (float16_t)0.673829000f,
+   (float16_t)-0.740951125f,  (float16_t)0.671558955f,
+   (float16_t)-0.743007952f,  (float16_t)0.669282588f,
+   (float16_t)-0.745057785f,  (float16_t)0.666999922f,
+   (float16_t)-0.747100606f,  (float16_t)0.664710978f,
+   (float16_t)-0.749136395f,  (float16_t)0.662415778f,
+   (float16_t)-0.751165132f,  (float16_t)0.660114342f,
+   (float16_t)-0.753186799f,  (float16_t)0.657806693f,
+   (float16_t)-0.755201377f,  (float16_t)0.655492853f,
+   (float16_t)-0.757208847f,  (float16_t)0.653172843f,
+   (float16_t)-0.759209189f,  (float16_t)0.650846685f,
+   (float16_t)-0.761202385f,  (float16_t)0.648514401f,
+   (float16_t)-0.763188417f,  (float16_t)0.646176013f,
+   (float16_t)-0.765167266f,  (float16_t)0.643831543f,
+   (float16_t)-0.767138912f,  (float16_t)0.641481013f,
+   (float16_t)-0.769103338f,  (float16_t)0.639124445f,
+   (float16_t)-0.771060524f,  (float16_t)0.636761861f,
+   (float16_t)-0.773010453f,  (float16_t)0.634393284f,
+   (float16_t)-0.774953107f,  (float16_t)0.632018736f,
+   (float16_t)-0.776888466f,  (float16_t)0.629638239f,
+   (float16_t)-0.778816512f,  (float16_t)0.627251815f,
+   (float16_t)-0.780737229f,  (float16_t)0.624859488f,
+   (float16_t)-0.782650596f,  (float16_t)0.622461279f,
+   (float16_t)-0.784556597f,  (float16_t)0.620057212f,
+   (float16_t)-0.786455214f,  (float16_t)0.617647308f,
+   (float16_t)-0.788346428f,  (float16_t)0.615231591f,
+   (float16_t)-0.790230221f,  (float16_t)0.612810082f,
+   (float16_t)-0.792106577f,  (float16_t)0.610382806f,
+   (float16_t)-0.793975478f,  (float16_t)0.607949785f,
+   (float16_t)-0.795836905f,  (float16_t)0.605511041f,
+   (float16_t)-0.797690841f,  (float16_t)0.603066599f,
+   (float16_t)-0.799537269f,  (float16_t)0.600616479f,
+   (float16_t)-0.801376172f,  (float16_t)0.598160707f,
+   (float16_t)-0.803207531f,  (float16_t)0.595699304f,
+   (float16_t)-0.805031331f,  (float16_t)0.593232295f,
+   (float16_t)-0.806847554f,  (float16_t)0.590759702f,
+   (float16_t)-0.808656182f,  (float16_t)0.588281548f,
+   (float16_t)-0.810457198f,  (float16_t)0.585797857f,
+   (float16_t)-0.812250587f,  (float16_t)0.583308653f,
+   (float16_t)-0.814036330f,  (float16_t)0.580813958f,
+   (float16_t)-0.815814411f,  (float16_t)0.578313796f,
+   (float16_t)-0.817584813f,  (float16_t)0.575808191f,
+   (float16_t)-0.819347520f,  (float16_t)0.573297167f,
+   (float16_t)-0.821102515f,  (float16_t)0.570780746f,
+   (float16_t)-0.822849781f,  (float16_t)0.568258953f,
+   (float16_t)-0.824589303f,  (float16_t)0.565731811f,
+   (float16_t)-0.826321063f,  (float16_t)0.563199344f,
+   (float16_t)-0.828045045f,  (float16_t)0.560661576f,
+   (float16_t)-0.829761234f,  (float16_t)0.558118531f,
+   (float16_t)-0.831469612f,  (float16_t)0.555570233f,
+   (float16_t)-0.833170165f,  (float16_t)0.553016706f,
+   (float16_t)-0.834862875f,  (float16_t)0.550457973f,
+   (float16_t)-0.836547727f,  (float16_t)0.547894059f,
+   (float16_t)-0.838224706f,  (float16_t)0.545324988f,
+   (float16_t)-0.839893794f,  (float16_t)0.542750785f,
+   (float16_t)-0.841554977f,  (float16_t)0.540171473f,
+   (float16_t)-0.843208240f,  (float16_t)0.537587076f,
+   (float16_t)-0.844853565f,  (float16_t)0.534997620f,
+   (float16_t)-0.846490939f,  (float16_t)0.532403128f,
+   (float16_t)-0.848120345f,  (float16_t)0.529803625f,
+   (float16_t)-0.849741768f,  (float16_t)0.527199135f,
+   (float16_t)-0.851355193f,  (float16_t)0.524589683f,
+   (float16_t)-0.852960605f,  (float16_t)0.521975293f,
+   (float16_t)-0.854557988f,  (float16_t)0.519355990f,
+   (float16_t)-0.856147328f,  (float16_t)0.516731799f,
+   (float16_t)-0.857728610f,  (float16_t)0.514102744f,
+   (float16_t)-0.859301818f,  (float16_t)0.511468850f,
+   (float16_t)-0.860866939f,  (float16_t)0.508830143f,
+   (float16_t)-0.862423956f,  (float16_t)0.506186645f,
+   (float16_t)-0.863972856f,  (float16_t)0.503538384f,
+   (float16_t)-0.865513624f,  (float16_t)0.500885383f,
+   (float16_t)-0.867046246f,  (float16_t)0.498227667f,
+   (float16_t)-0.868570706f,  (float16_t)0.495565262f,
+   (float16_t)-0.870086991f,  (float16_t)0.492898192f,
+   (float16_t)-0.871595087f,  (float16_t)0.490226483f,
+   (float16_t)-0.873094978f,  (float16_t)0.487550160f,
+   (float16_t)-0.874586652f,  (float16_t)0.484869248f,
+   (float16_t)-0.876070094f,  (float16_t)0.482183772f,
+   (float16_t)-0.877545290f,  (float16_t)0.479493758f,
+   (float16_t)-0.879012226f,  (float16_t)0.476799230f,
+   (float16_t)-0.880470889f,  (float16_t)0.474100215f,
+   (float16_t)-0.881921264f,  (float16_t)0.471396737f,
+   (float16_t)-0.883363339f,  (float16_t)0.468688822f,
+   (float16_t)-0.884797098f,  (float16_t)0.465976496f,
+   (float16_t)-0.886222530f,  (float16_t)0.463259784f,
+   (float16_t)-0.887639620f,  (float16_t)0.460538711f,
+   (float16_t)-0.889048356f,  (float16_t)0.457813304f,
+   (float16_t)-0.890448723f,  (float16_t)0.455083587f,
+   (float16_t)-0.891840709f,  (float16_t)0.452349587f,
+   (float16_t)-0.893224301f,  (float16_t)0.449611330f,
+   (float16_t)-0.894599486f,  (float16_t)0.446868840f,
+   (float16_t)-0.895966250f,  (float16_t)0.444122145f,
+   (float16_t)-0.897324581f,  (float16_t)0.441371269f,
+   (float16_t)-0.898674466f,  (float16_t)0.438616239f,
+   (float16_t)-0.900015892f,  (float16_t)0.435857080f,
+   (float16_t)-0.901348847f,  (float16_t)0.433093819f,
+   (float16_t)-0.902673318f,  (float16_t)0.430326481f,
+   (float16_t)-0.903989293f,  (float16_t)0.427555093f,
+   (float16_t)-0.905296759f,  (float16_t)0.424779681f,
+   (float16_t)-0.906595705f,  (float16_t)0.422000271f,
+   (float16_t)-0.907886116f,  (float16_t)0.419216888f,
+   (float16_t)-0.909167983f,  (float16_t)0.416429560f,
+   (float16_t)-0.910441292f,  (float16_t)0.413638312f,
+   (float16_t)-0.911706032f,  (float16_t)0.410843171f,
+   (float16_t)-0.912962190f,  (float16_t)0.408044163f,
+   (float16_t)-0.914209756f,  (float16_t)0.405241314f,
+   (float16_t)-0.915448716f,  (float16_t)0.402434651f,
+   (float16_t)-0.916679060f,  (float16_t)0.399624200f,
+   (float16_t)-0.917900776f,  (float16_t)0.396809987f,
+   (float16_t)-0.919113852f,  (float16_t)0.393992040f,
+   (float16_t)-0.920318277f,  (float16_t)0.391170384f,
+   (float16_t)-0.921514039f,  (float16_t)0.388345047f,
+   (float16_t)-0.922701128f,  (float16_t)0.385516054f,
+   (float16_t)-0.923879533f,  (float16_t)0.382683432f,
+   (float16_t)-0.925049241f,  (float16_t)0.379847209f,
+   (float16_t)-0.926210242f,  (float16_t)0.377007410f,
+   (float16_t)-0.927362526f,  (float16_t)0.374164063f,
+   (float16_t)-0.928506080f,  (float16_t)0.371317194f,
+   (float16_t)-0.929640896f,  (float16_t)0.368466830f,
+   (float16_t)-0.930766961f,  (float16_t)0.365612998f,
+   (float16_t)-0.931884266f,  (float16_t)0.362755724f,
+   (float16_t)-0.932992799f,  (float16_t)0.359895037f,
+   (float16_t)-0.934092550f,  (float16_t)0.357030961f,
+   (float16_t)-0.935183510f,  (float16_t)0.354163525f,
+   (float16_t)-0.936265667f,  (float16_t)0.351292756f,
+   (float16_t)-0.937339012f,  (float16_t)0.348418680f,
+   (float16_t)-0.938403534f,  (float16_t)0.345541325f,
+   (float16_t)-0.939459224f,  (float16_t)0.342660717f,
+   (float16_t)-0.940506071f,  (float16_t)0.339776884f,
+   (float16_t)-0.941544065f,  (float16_t)0.336889853f,
+   (float16_t)-0.942573198f,  (float16_t)0.333999651f,
+   (float16_t)-0.943593458f,  (float16_t)0.331106306f,
+   (float16_t)-0.944604837f,  (float16_t)0.328209844f,
+   (float16_t)-0.945607325f,  (float16_t)0.325310292f,
+   (float16_t)-0.946600913f,  (float16_t)0.322407679f,
+   (float16_t)-0.947585591f,  (float16_t)0.319502031f,
+   (float16_t)-0.948561350f,  (float16_t)0.316593376f,
+   (float16_t)-0.949528181f,  (float16_t)0.313681740f,
+   (float16_t)-0.950486074f,  (float16_t)0.310767153f,
+   (float16_t)-0.951435021f,  (float16_t)0.307849640f,
+   (float16_t)-0.952375013f,  (float16_t)0.304929230f,
+   (float16_t)-0.953306040f,  (float16_t)0.302005949f,
+   (float16_t)-0.954228095f,  (float16_t)0.299079826f,
+   (float16_t)-0.955141168f,  (float16_t)0.296150888f,
+   (float16_t)-0.956045251f,  (float16_t)0.293219163f,
+   (float16_t)-0.956940336f,  (float16_t)0.290284677f,
+   (float16_t)-0.957826413f,  (float16_t)0.287347460f,
+   (float16_t)-0.958703475f,  (float16_t)0.284407537f,
+   (float16_t)-0.959571513f,  (float16_t)0.281464938f,
+   (float16_t)-0.960430519f,  (float16_t)0.278519689f,
+   (float16_t)-0.961280486f,  (float16_t)0.275571819f,
+   (float16_t)-0.962121404f,  (float16_t)0.272621355f,
+   (float16_t)-0.962953267f,  (float16_t)0.269668326f,
+   (float16_t)-0.963776066f,  (float16_t)0.266712757f,
+   (float16_t)-0.964589793f,  (float16_t)0.263754679f,
+   (float16_t)-0.965394442f,  (float16_t)0.260794118f,
+   (float16_t)-0.966190003f,  (float16_t)0.257831102f,
+   (float16_t)-0.966976471f,  (float16_t)0.254865660f,
+   (float16_t)-0.967753837f,  (float16_t)0.251897818f,
+   (float16_t)-0.968522094f,  (float16_t)0.248927606f,
+   (float16_t)-0.969281235f,  (float16_t)0.245955050f,
+   (float16_t)-0.970031253f,  (float16_t)0.242980180f,
+   (float16_t)-0.970772141f,  (float16_t)0.240003022f,
+   (float16_t)-0.971503891f,  (float16_t)0.237023606f,
+   (float16_t)-0.972226497f,  (float16_t)0.234041959f,
+   (float16_t)-0.972939952f,  (float16_t)0.231058108f,
+   (float16_t)-0.973644250f,  (float16_t)0.228072083f,
+   (float16_t)-0.974339383f,  (float16_t)0.225083911f,
+   (float16_t)-0.975025345f,  (float16_t)0.222093621f,
+   (float16_t)-0.975702130f,  (float16_t)0.219101240f,
+   (float16_t)-0.976369731f,  (float16_t)0.216106797f,
+   (float16_t)-0.977028143f,  (float16_t)0.213110320f,
+   (float16_t)-0.977677358f,  (float16_t)0.210111837f,
+   (float16_t)-0.978317371f,  (float16_t)0.207111376f,
+   (float16_t)-0.978948175f,  (float16_t)0.204108966f,
+   (float16_t)-0.979569766f,  (float16_t)0.201104635f,
+   (float16_t)-0.980182136f,  (float16_t)0.198098411f,
+   (float16_t)-0.980785280f,  (float16_t)0.195090322f,
+   (float16_t)-0.981379193f,  (float16_t)0.192080397f,
+   (float16_t)-0.981963869f,  (float16_t)0.189068664f,
+   (float16_t)-0.982539302f,  (float16_t)0.186055152f,
+   (float16_t)-0.983105487f,  (float16_t)0.183039888f,
+   (float16_t)-0.983662419f,  (float16_t)0.180022901f,
+   (float16_t)-0.984210092f,  (float16_t)0.177004220f,
+   (float16_t)-0.984748502f,  (float16_t)0.173983873f,
+   (float16_t)-0.985277642f,  (float16_t)0.170961889f,
+   (float16_t)-0.985797509f,  (float16_t)0.167938295f,
+   (float16_t)-0.986308097f,  (float16_t)0.164913120f,
+   (float16_t)-0.986809402f,  (float16_t)0.161886394f,
+   (float16_t)-0.987301418f,  (float16_t)0.158858143f,
+   (float16_t)-0.987784142f,  (float16_t)0.155828398f,
+   (float16_t)-0.988257568f,  (float16_t)0.152797185f,
+   (float16_t)-0.988721692f,  (float16_t)0.149764535f,
+   (float16_t)-0.989176510f,  (float16_t)0.146730474f,
+   (float16_t)-0.989622017f,  (float16_t)0.143695033f,
+   (float16_t)-0.990058210f,  (float16_t)0.140658239f,
+   (float16_t)-0.990485084f,  (float16_t)0.137620122f,
+   (float16_t)-0.990902635f,  (float16_t)0.134580709f,
+   (float16_t)-0.991310860f,  (float16_t)0.131540029f,
+   (float16_t)-0.991709754f,  (float16_t)0.128498111f,
+   (float16_t)-0.992099313f,  (float16_t)0.125454983f,
+   (float16_t)-0.992479535f,  (float16_t)0.122410675f,
+   (float16_t)-0.992850414f,  (float16_t)0.119365215f,
+   (float16_t)-0.993211949f,  (float16_t)0.116318631f,
+   (float16_t)-0.993564136f,  (float16_t)0.113270952f,
+   (float16_t)-0.993906970f,  (float16_t)0.110222207f,
+   (float16_t)-0.994240449f,  (float16_t)0.107172425f,
+   (float16_t)-0.994564571f,  (float16_t)0.104121634f,
+   (float16_t)-0.994879331f,  (float16_t)0.101069863f,
+   (float16_t)-0.995184727f,  (float16_t)0.098017140f,
+   (float16_t)-0.995480755f,  (float16_t)0.094963495f,
+   (float16_t)-0.995767414f,  (float16_t)0.091908956f,
+   (float16_t)-0.996044701f,  (float16_t)0.088853553f,
+   (float16_t)-0.996312612f,  (float16_t)0.085797312f,
+   (float16_t)-0.996571146f,  (float16_t)0.082740265f,
+   (float16_t)-0.996820299f,  (float16_t)0.079682438f,
+   (float16_t)-0.997060070f,  (float16_t)0.076623861f,
+   (float16_t)-0.997290457f,  (float16_t)0.073564564f,
+   (float16_t)-0.997511456f,  (float16_t)0.070504573f,
+   (float16_t)-0.997723067f,  (float16_t)0.067443920f,
+   (float16_t)-0.997925286f,  (float16_t)0.064382631f,
+   (float16_t)-0.998118113f,  (float16_t)0.061320736f,
+   (float16_t)-0.998301545f,  (float16_t)0.058258265f,
+   (float16_t)-0.998475581f,  (float16_t)0.055195244f,
+   (float16_t)-0.998640218f,  (float16_t)0.052131705f,
+   (float16_t)-0.998795456f,  (float16_t)0.049067674f,
+   (float16_t)-0.998941293f,  (float16_t)0.046003182f,
+   (float16_t)-0.999077728f,  (float16_t)0.042938257f,
+   (float16_t)-0.999204759f,  (float16_t)0.039872928f,
+   (float16_t)-0.999322385f,  (float16_t)0.036807223f,
+   (float16_t)-0.999430605f,  (float16_t)0.033741172f,
+   (float16_t)-0.999529418f,  (float16_t)0.030674803f,
+   (float16_t)-0.999618822f,  (float16_t)0.027608146f,
+   (float16_t)-0.999698819f,  (float16_t)0.024541229f,
+   (float16_t)-0.999769405f,  (float16_t)0.021474080f,
+   (float16_t)-0.999830582f,  (float16_t)0.018406730f,
+   (float16_t)-0.999882347f,  (float16_t)0.015339206f,
+   (float16_t)-0.999924702f,  (float16_t)0.012271538f,
+   (float16_t)-0.999957645f,  (float16_t)0.009203755f,
+   (float16_t)-0.999981175f,  (float16_t)0.006135885f,
+   (float16_t)-0.999995294f,  (float16_t)0.003067957f,
+   (float16_t)-1.000000000f,  (float16_t)0.000000000f,
+   (float16_t)-0.999995294f, (float16_t)-0.003067957f,
+   (float16_t)-0.999981175f, (float16_t)-0.006135885f,
+   (float16_t)-0.999957645f, (float16_t)-0.009203755f,
+   (float16_t)-0.999924702f, (float16_t)-0.012271538f,
+   (float16_t)-0.999882347f, (float16_t)-0.015339206f,
+   (float16_t)-0.999830582f, (float16_t)-0.018406730f,
+   (float16_t)-0.999769405f, (float16_t)-0.021474080f,
+   (float16_t)-0.999698819f, (float16_t)-0.024541229f,
+   (float16_t)-0.999618822f, (float16_t)-0.027608146f,
+   (float16_t)-0.999529418f, (float16_t)-0.030674803f,
+   (float16_t)-0.999430605f, (float16_t)-0.033741172f,
+   (float16_t)-0.999322385f, (float16_t)-0.036807223f,
+   (float16_t)-0.999204759f, (float16_t)-0.039872928f,
+   (float16_t)-0.999077728f, (float16_t)-0.042938257f,
+   (float16_t)-0.998941293f, (float16_t)-0.046003182f,
+   (float16_t)-0.998795456f, (float16_t)-0.049067674f,
+   (float16_t)-0.998640218f, (float16_t)-0.052131705f,
+   (float16_t)-0.998475581f, (float16_t)-0.055195244f,
+   (float16_t)-0.998301545f, (float16_t)-0.058258265f,
+   (float16_t)-0.998118113f, (float16_t)-0.061320736f,
+   (float16_t)-0.997925286f, (float16_t)-0.064382631f,
+   (float16_t)-0.997723067f, (float16_t)-0.067443920f,
+   (float16_t)-0.997511456f, (float16_t)-0.070504573f,
+   (float16_t)-0.997290457f, (float16_t)-0.073564564f,
+   (float16_t)-0.997060070f, (float16_t)-0.076623861f,
+   (float16_t)-0.996820299f, (float16_t)-0.079682438f,
+   (float16_t)-0.996571146f, (float16_t)-0.082740265f,
+   (float16_t)-0.996312612f, (float16_t)-0.085797312f,
+   (float16_t)-0.996044701f, (float16_t)-0.088853553f,
+   (float16_t)-0.995767414f, (float16_t)-0.091908956f,
+   (float16_t)-0.995480755f, (float16_t)-0.094963495f,
+   (float16_t)-0.995184727f, (float16_t)-0.098017140f,
+   (float16_t)-0.994879331f, (float16_t)-0.101069863f,
+   (float16_t)-0.994564571f, (float16_t)-0.104121634f,
+   (float16_t)-0.994240449f, (float16_t)-0.107172425f,
+   (float16_t)-0.993906970f, (float16_t)-0.110222207f,
+   (float16_t)-0.993564136f, (float16_t)-0.113270952f,
+   (float16_t)-0.993211949f, (float16_t)-0.116318631f,
+   (float16_t)-0.992850414f, (float16_t)-0.119365215f,
+   (float16_t)-0.992479535f, (float16_t)-0.122410675f,
+   (float16_t)-0.992099313f, (float16_t)-0.125454983f,
+   (float16_t)-0.991709754f, (float16_t)-0.128498111f,
+   (float16_t)-0.991310860f, (float16_t)-0.131540029f,
+   (float16_t)-0.990902635f, (float16_t)-0.134580709f,
+   (float16_t)-0.990485084f, (float16_t)-0.137620122f,
+   (float16_t)-0.990058210f, (float16_t)-0.140658239f,
+   (float16_t)-0.989622017f, (float16_t)-0.143695033f,
+   (float16_t)-0.989176510f, (float16_t)-0.146730474f,
+   (float16_t)-0.988721692f, (float16_t)-0.149764535f,
+   (float16_t)-0.988257568f, (float16_t)-0.152797185f,
+   (float16_t)-0.987784142f, (float16_t)-0.155828398f,
+   (float16_t)-0.987301418f, (float16_t)-0.158858143f,
+   (float16_t)-0.986809402f, (float16_t)-0.161886394f,
+   (float16_t)-0.986308097f, (float16_t)-0.164913120f,
+   (float16_t)-0.985797509f, (float16_t)-0.167938295f,
+   (float16_t)-0.985277642f, (float16_t)-0.170961889f,
+   (float16_t)-0.984748502f, (float16_t)-0.173983873f,
+   (float16_t)-0.984210092f, (float16_t)-0.177004220f,
+   (float16_t)-0.983662419f, (float16_t)-0.180022901f,
+   (float16_t)-0.983105487f, (float16_t)-0.183039888f,
+   (float16_t)-0.982539302f, (float16_t)-0.186055152f,
+   (float16_t)-0.981963869f, (float16_t)-0.189068664f,
+   (float16_t)-0.981379193f, (float16_t)-0.192080397f,
+   (float16_t)-0.980785280f, (float16_t)-0.195090322f,
+   (float16_t)-0.980182136f, (float16_t)-0.198098411f,
+   (float16_t)-0.979569766f, (float16_t)-0.201104635f,
+   (float16_t)-0.978948175f, (float16_t)-0.204108966f,
+   (float16_t)-0.978317371f, (float16_t)-0.207111376f,
+   (float16_t)-0.977677358f, (float16_t)-0.210111837f,
+   (float16_t)-0.977028143f, (float16_t)-0.213110320f,
+   (float16_t)-0.976369731f, (float16_t)-0.216106797f,
+   (float16_t)-0.975702130f, (float16_t)-0.219101240f,
+   (float16_t)-0.975025345f, (float16_t)-0.222093621f,
+   (float16_t)-0.974339383f, (float16_t)-0.225083911f,
+   (float16_t)-0.973644250f, (float16_t)-0.228072083f,
+   (float16_t)-0.972939952f, (float16_t)-0.231058108f,
+   (float16_t)-0.972226497f, (float16_t)-0.234041959f,
+   (float16_t)-0.971503891f, (float16_t)-0.237023606f,
+   (float16_t)-0.970772141f, (float16_t)-0.240003022f,
+   (float16_t)-0.970031253f, (float16_t)-0.242980180f,
+   (float16_t)-0.969281235f, (float16_t)-0.245955050f,
+   (float16_t)-0.968522094f, (float16_t)-0.248927606f,
+   (float16_t)-0.967753837f, (float16_t)-0.251897818f,
+   (float16_t)-0.966976471f, (float16_t)-0.254865660f,
+   (float16_t)-0.966190003f, (float16_t)-0.257831102f,
+   (float16_t)-0.965394442f, (float16_t)-0.260794118f,
+   (float16_t)-0.964589793f, (float16_t)-0.263754679f,
+   (float16_t)-0.963776066f, (float16_t)-0.266712757f,
+   (float16_t)-0.962953267f, (float16_t)-0.269668326f,
+   (float16_t)-0.962121404f, (float16_t)-0.272621355f,
+   (float16_t)-0.961280486f, (float16_t)-0.275571819f,
+   (float16_t)-0.960430519f, (float16_t)-0.278519689f,
+   (float16_t)-0.959571513f, (float16_t)-0.281464938f,
+   (float16_t)-0.958703475f, (float16_t)-0.284407537f,
+   (float16_t)-0.957826413f, (float16_t)-0.287347460f,
+   (float16_t)-0.956940336f, (float16_t)-0.290284677f,
+   (float16_t)-0.956045251f, (float16_t)-0.293219163f,
+   (float16_t)-0.955141168f, (float16_t)-0.296150888f,
+   (float16_t)-0.954228095f, (float16_t)-0.299079826f,
+   (float16_t)-0.953306040f, (float16_t)-0.302005949f,
+   (float16_t)-0.952375013f, (float16_t)-0.304929230f,
+   (float16_t)-0.951435021f, (float16_t)-0.307849640f,
+   (float16_t)-0.950486074f, (float16_t)-0.310767153f,
+   (float16_t)-0.949528181f, (float16_t)-0.313681740f,
+   (float16_t)-0.948561350f, (float16_t)-0.316593376f,
+   (float16_t)-0.947585591f, (float16_t)-0.319502031f,
+   (float16_t)-0.946600913f, (float16_t)-0.322407679f,
+   (float16_t)-0.945607325f, (float16_t)-0.325310292f,
+   (float16_t)-0.944604837f, (float16_t)-0.328209844f,
+   (float16_t)-0.943593458f, (float16_t)-0.331106306f,
+   (float16_t)-0.942573198f, (float16_t)-0.333999651f,
+   (float16_t)-0.941544065f, (float16_t)-0.336889853f,
+   (float16_t)-0.940506071f, (float16_t)-0.339776884f,
+   (float16_t)-0.939459224f, (float16_t)-0.342660717f,
+   (float16_t)-0.938403534f, (float16_t)-0.345541325f,
+   (float16_t)-0.937339012f, (float16_t)-0.348418680f,
+   (float16_t)-0.936265667f, (float16_t)-0.351292756f,
+   (float16_t)-0.935183510f, (float16_t)-0.354163525f,
+   (float16_t)-0.934092550f, (float16_t)-0.357030961f,
+   (float16_t)-0.932992799f, (float16_t)-0.359895037f,
+   (float16_t)-0.931884266f, (float16_t)-0.362755724f,
+   (float16_t)-0.930766961f, (float16_t)-0.365612998f,
+   (float16_t)-0.929640896f, (float16_t)-0.368466830f,
+   (float16_t)-0.928506080f, (float16_t)-0.371317194f,
+   (float16_t)-0.927362526f, (float16_t)-0.374164063f,
+   (float16_t)-0.926210242f, (float16_t)-0.377007410f,
+   (float16_t)-0.925049241f, (float16_t)-0.379847209f,
+   (float16_t)-0.923879533f, (float16_t)-0.382683432f,
+   (float16_t)-0.922701128f, (float16_t)-0.385516054f,
+   (float16_t)-0.921514039f, (float16_t)-0.388345047f,
+   (float16_t)-0.920318277f, (float16_t)-0.391170384f,
+   (float16_t)-0.919113852f, (float16_t)-0.393992040f,
+   (float16_t)-0.917900776f, (float16_t)-0.396809987f,
+   (float16_t)-0.916679060f, (float16_t)-0.399624200f,
+   (float16_t)-0.915448716f, (float16_t)-0.402434651f,
+   (float16_t)-0.914209756f, (float16_t)-0.405241314f,
+   (float16_t)-0.912962190f, (float16_t)-0.408044163f,
+   (float16_t)-0.911706032f, (float16_t)-0.410843171f,
+   (float16_t)-0.910441292f, (float16_t)-0.413638312f,
+   (float16_t)-0.909167983f, (float16_t)-0.416429560f,
+   (float16_t)-0.907886116f, (float16_t)-0.419216888f,
+   (float16_t)-0.906595705f, (float16_t)-0.422000271f,
+   (float16_t)-0.905296759f, (float16_t)-0.424779681f,
+   (float16_t)-0.903989293f, (float16_t)-0.427555093f,
+   (float16_t)-0.902673318f, (float16_t)-0.430326481f,
+   (float16_t)-0.901348847f, (float16_t)-0.433093819f,
+   (float16_t)-0.900015892f, (float16_t)-0.435857080f,
+   (float16_t)-0.898674466f, (float16_t)-0.438616239f,
+   (float16_t)-0.897324581f, (float16_t)-0.441371269f,
+   (float16_t)-0.895966250f, (float16_t)-0.444122145f,
+   (float16_t)-0.894599486f, (float16_t)-0.446868840f,
+   (float16_t)-0.893224301f, (float16_t)-0.449611330f,
+   (float16_t)-0.891840709f, (float16_t)-0.452349587f,
+   (float16_t)-0.890448723f, (float16_t)-0.455083587f,
+   (float16_t)-0.889048356f, (float16_t)-0.457813304f,
+   (float16_t)-0.887639620f, (float16_t)-0.460538711f,
+   (float16_t)-0.886222530f, (float16_t)-0.463259784f,
+   (float16_t)-0.884797098f, (float16_t)-0.465976496f,
+   (float16_t)-0.883363339f, (float16_t)-0.468688822f,
+   (float16_t)-0.881921264f, (float16_t)-0.471396737f,
+   (float16_t)-0.880470889f, (float16_t)-0.474100215f,
+   (float16_t)-0.879012226f, (float16_t)-0.476799230f,
+   (float16_t)-0.877545290f, (float16_t)-0.479493758f,
+   (float16_t)-0.876070094f, (float16_t)-0.482183772f,
+   (float16_t)-0.874586652f, (float16_t)-0.484869248f,
+   (float16_t)-0.873094978f, (float16_t)-0.487550160f,
+   (float16_t)-0.871595087f, (float16_t)-0.490226483f,
+   (float16_t)-0.870086991f, (float16_t)-0.492898192f,
+   (float16_t)-0.868570706f, (float16_t)-0.495565262f,
+   (float16_t)-0.867046246f, (float16_t)-0.498227667f,
+   (float16_t)-0.865513624f, (float16_t)-0.500885383f,
+   (float16_t)-0.863972856f, (float16_t)-0.503538384f,
+   (float16_t)-0.862423956f, (float16_t)-0.506186645f,
+   (float16_t)-0.860866939f, (float16_t)-0.508830143f,
+   (float16_t)-0.859301818f, (float16_t)-0.511468850f,
+   (float16_t)-0.857728610f, (float16_t)-0.514102744f,
+   (float16_t)-0.856147328f, (float16_t)-0.516731799f,
+   (float16_t)-0.854557988f, (float16_t)-0.519355990f,
+   (float16_t)-0.852960605f, (float16_t)-0.521975293f,
+   (float16_t)-0.851355193f, (float16_t)-0.524589683f,
+   (float16_t)-0.849741768f, (float16_t)-0.527199135f,
+   (float16_t)-0.848120345f, (float16_t)-0.529803625f,
+   (float16_t)-0.846490939f, (float16_t)-0.532403128f,
+   (float16_t)-0.844853565f, (float16_t)-0.534997620f,
+   (float16_t)-0.843208240f, (float16_t)-0.537587076f,
+   (float16_t)-0.841554977f, (float16_t)-0.540171473f,
+   (float16_t)-0.839893794f, (float16_t)-0.542750785f,
+   (float16_t)-0.838224706f, (float16_t)-0.545324988f,
+   (float16_t)-0.836547727f, (float16_t)-0.547894059f,
+   (float16_t)-0.834862875f, (float16_t)-0.550457973f,
+   (float16_t)-0.833170165f, (float16_t)-0.553016706f,
+   (float16_t)-0.831469612f, (float16_t)-0.555570233f,
+   (float16_t)-0.829761234f, (float16_t)-0.558118531f,
+   (float16_t)-0.828045045f, (float16_t)-0.560661576f,
+   (float16_t)-0.826321063f, (float16_t)-0.563199344f,
+   (float16_t)-0.824589303f, (float16_t)-0.565731811f,
+   (float16_t)-0.822849781f, (float16_t)-0.568258953f,
+   (float16_t)-0.821102515f, (float16_t)-0.570780746f,
+   (float16_t)-0.819347520f, (float16_t)-0.573297167f,
+   (float16_t)-0.817584813f, (float16_t)-0.575808191f,
+   (float16_t)-0.815814411f, (float16_t)-0.578313796f,
+   (float16_t)-0.814036330f, (float16_t)-0.580813958f,
+   (float16_t)-0.812250587f, (float16_t)-0.583308653f,
+   (float16_t)-0.810457198f, (float16_t)-0.585797857f,
+   (float16_t)-0.808656182f, (float16_t)-0.588281548f,
+   (float16_t)-0.806847554f, (float16_t)-0.590759702f,
+   (float16_t)-0.805031331f, (float16_t)-0.593232295f,
+   (float16_t)-0.803207531f, (float16_t)-0.595699304f,
+   (float16_t)-0.801376172f, (float16_t)-0.598160707f,
+   (float16_t)-0.799537269f, (float16_t)-0.600616479f,
+   (float16_t)-0.797690841f, (float16_t)-0.603066599f,
+   (float16_t)-0.795836905f, (float16_t)-0.605511041f,
+   (float16_t)-0.793975478f, (float16_t)-0.607949785f,
+   (float16_t)-0.792106577f, (float16_t)-0.610382806f,
+   (float16_t)-0.790230221f, (float16_t)-0.612810082f,
+   (float16_t)-0.788346428f, (float16_t)-0.615231591f,
+   (float16_t)-0.786455214f, (float16_t)-0.617647308f,
+   (float16_t)-0.784556597f, (float16_t)-0.620057212f,
+   (float16_t)-0.782650596f, (float16_t)-0.622461279f,
+   (float16_t)-0.780737229f, (float16_t)-0.624859488f,
+   (float16_t)-0.778816512f, (float16_t)-0.627251815f,
+   (float16_t)-0.776888466f, (float16_t)-0.629638239f,
+   (float16_t)-0.774953107f, (float16_t)-0.632018736f,
+   (float16_t)-0.773010453f, (float16_t)-0.634393284f,
+   (float16_t)-0.771060524f, (float16_t)-0.636761861f,
+   (float16_t)-0.769103338f, (float16_t)-0.639124445f,
+   (float16_t)-0.767138912f, (float16_t)-0.641481013f,
+   (float16_t)-0.765167266f, (float16_t)-0.643831543f,
+   (float16_t)-0.763188417f, (float16_t)-0.646176013f,
+   (float16_t)-0.761202385f, (float16_t)-0.648514401f,
+   (float16_t)-0.759209189f, (float16_t)-0.650846685f,
+   (float16_t)-0.757208847f, (float16_t)-0.653172843f,
+   (float16_t)-0.755201377f, (float16_t)-0.655492853f,
+   (float16_t)-0.753186799f, (float16_t)-0.657806693f,
+   (float16_t)-0.751165132f, (float16_t)-0.660114342f,
+   (float16_t)-0.749136395f, (float16_t)-0.662415778f,
+   (float16_t)-0.747100606f, (float16_t)-0.664710978f,
+   (float16_t)-0.745057785f, (float16_t)-0.666999922f,
+   (float16_t)-0.743007952f, (float16_t)-0.669282588f,
+   (float16_t)-0.740951125f, (float16_t)-0.671558955f,
+   (float16_t)-0.738887324f, (float16_t)-0.673829000f,
+   (float16_t)-0.736816569f, (float16_t)-0.676092704f,
+   (float16_t)-0.734738878f, (float16_t)-0.678350043f,
+   (float16_t)-0.732654272f, (float16_t)-0.680600998f,
+   (float16_t)-0.730562769f, (float16_t)-0.682845546f,
+   (float16_t)-0.728464390f, (float16_t)-0.685083668f,
+   (float16_t)-0.726359155f, (float16_t)-0.687315341f,
+   (float16_t)-0.724247083f, (float16_t)-0.689540545f,
+   (float16_t)-0.722128194f, (float16_t)-0.691759258f,
+   (float16_t)-0.720002508f, (float16_t)-0.693971461f,
+   (float16_t)-0.717870045f, (float16_t)-0.696177131f,
+   (float16_t)-0.715730825f, (float16_t)-0.698376249f,
+   (float16_t)-0.713584869f, (float16_t)-0.700568794f,
+   (float16_t)-0.711432196f, (float16_t)-0.702754744f,
+   (float16_t)-0.709272826f, (float16_t)-0.704934080f,
+   (float16_t)-0.707106781f, (float16_t)-0.707106781f,
+   (float16_t)-0.704934080f, (float16_t)-0.709272826f,
+   (float16_t)-0.702754744f, (float16_t)-0.711432196f,
+   (float16_t)-0.700568794f, (float16_t)-0.713584869f,
+   (float16_t)-0.698376249f, (float16_t)-0.715730825f,
+   (float16_t)-0.696177131f, (float16_t)-0.717870045f,
+   (float16_t)-0.693971461f, (float16_t)-0.720002508f,
+   (float16_t)-0.691759258f, (float16_t)-0.722128194f,
+   (float16_t)-0.689540545f, (float16_t)-0.724247083f,
+   (float16_t)-0.687315341f, (float16_t)-0.726359155f,
+   (float16_t)-0.685083668f, (float16_t)-0.728464390f,
+   (float16_t)-0.682845546f, (float16_t)-0.730562769f,
+   (float16_t)-0.680600998f, (float16_t)-0.732654272f,
+   (float16_t)-0.678350043f, (float16_t)-0.734738878f,
+   (float16_t)-0.676092704f, (float16_t)-0.736816569f,
+   (float16_t)-0.673829000f, (float16_t)-0.738887324f,
+   (float16_t)-0.671558955f, (float16_t)-0.740951125f,
+   (float16_t)-0.669282588f, (float16_t)-0.743007952f,
+   (float16_t)-0.666999922f, (float16_t)-0.745057785f,
+   (float16_t)-0.664710978f, (float16_t)-0.747100606f,
+   (float16_t)-0.662415778f, (float16_t)-0.749136395f,
+   (float16_t)-0.660114342f, (float16_t)-0.751165132f,
+   (float16_t)-0.657806693f, (float16_t)-0.753186799f,
+   (float16_t)-0.655492853f, (float16_t)-0.755201377f,
+   (float16_t)-0.653172843f, (float16_t)-0.757208847f,
+   (float16_t)-0.650846685f, (float16_t)-0.759209189f,
+   (float16_t)-0.648514401f, (float16_t)-0.761202385f,
+   (float16_t)-0.646176013f, (float16_t)-0.763188417f,
+   (float16_t)-0.643831543f, (float16_t)-0.765167266f,
+   (float16_t)-0.641481013f, (float16_t)-0.767138912f,
+   (float16_t)-0.639124445f, (float16_t)-0.769103338f,
+   (float16_t)-0.636761861f, (float16_t)-0.771060524f,
+   (float16_t)-0.634393284f, (float16_t)-0.773010453f,
+   (float16_t)-0.632018736f, (float16_t)-0.774953107f,
+   (float16_t)-0.629638239f, (float16_t)-0.776888466f,
+   (float16_t)-0.627251815f, (float16_t)-0.778816512f,
+   (float16_t)-0.624859488f, (float16_t)-0.780737229f,
+   (float16_t)-0.622461279f, (float16_t)-0.782650596f,
+   (float16_t)-0.620057212f, (float16_t)-0.784556597f,
+   (float16_t)-0.617647308f, (float16_t)-0.786455214f,
+   (float16_t)-0.615231591f, (float16_t)-0.788346428f,
+   (float16_t)-0.612810082f, (float16_t)-0.790230221f,
+   (float16_t)-0.610382806f, (float16_t)-0.792106577f,
+   (float16_t)-0.607949785f, (float16_t)-0.793975478f,
+   (float16_t)-0.605511041f, (float16_t)-0.795836905f,
+   (float16_t)-0.603066599f, (float16_t)-0.797690841f,
+   (float16_t)-0.600616479f, (float16_t)-0.799537269f,
+   (float16_t)-0.598160707f, (float16_t)-0.801376172f,
+   (float16_t)-0.595699304f, (float16_t)-0.803207531f,
+   (float16_t)-0.593232295f, (float16_t)-0.805031331f,
+   (float16_t)-0.590759702f, (float16_t)-0.806847554f,
+   (float16_t)-0.588281548f, (float16_t)-0.808656182f,
+   (float16_t)-0.585797857f, (float16_t)-0.810457198f,
+   (float16_t)-0.583308653f, (float16_t)-0.812250587f,
+   (float16_t)-0.580813958f, (float16_t)-0.814036330f,
+   (float16_t)-0.578313796f, (float16_t)-0.815814411f,
+   (float16_t)-0.575808191f, (float16_t)-0.817584813f,
+   (float16_t)-0.573297167f, (float16_t)-0.819347520f,
+   (float16_t)-0.570780746f, (float16_t)-0.821102515f,
+   (float16_t)-0.568258953f, (float16_t)-0.822849781f,
+   (float16_t)-0.565731811f, (float16_t)-0.824589303f,
+   (float16_t)-0.563199344f, (float16_t)-0.826321063f,
+   (float16_t)-0.560661576f, (float16_t)-0.828045045f,
+   (float16_t)-0.558118531f, (float16_t)-0.829761234f,
+   (float16_t)-0.555570233f, (float16_t)-0.831469612f,
+   (float16_t)-0.553016706f, (float16_t)-0.833170165f,
+   (float16_t)-0.550457973f, (float16_t)-0.834862875f,
+   (float16_t)-0.547894059f, (float16_t)-0.836547727f,
+   (float16_t)-0.545324988f, (float16_t)-0.838224706f,
+   (float16_t)-0.542750785f, (float16_t)-0.839893794f,
+   (float16_t)-0.540171473f, (float16_t)-0.841554977f,
+   (float16_t)-0.537587076f, (float16_t)-0.843208240f,
+   (float16_t)-0.534997620f, (float16_t)-0.844853565f,
+   (float16_t)-0.532403128f, (float16_t)-0.846490939f,
+   (float16_t)-0.529803625f, (float16_t)-0.848120345f,
+   (float16_t)-0.527199135f, (float16_t)-0.849741768f,
+   (float16_t)-0.524589683f, (float16_t)-0.851355193f,
+   (float16_t)-0.521975293f, (float16_t)-0.852960605f,
+   (float16_t)-0.519355990f, (float16_t)-0.854557988f,
+   (float16_t)-0.516731799f, (float16_t)-0.856147328f,
+   (float16_t)-0.514102744f, (float16_t)-0.857728610f,
+   (float16_t)-0.511468850f, (float16_t)-0.859301818f,
+   (float16_t)-0.508830143f, (float16_t)-0.860866939f,
+   (float16_t)-0.506186645f, (float16_t)-0.862423956f,
+   (float16_t)-0.503538384f, (float16_t)-0.863972856f,
+   (float16_t)-0.500885383f, (float16_t)-0.865513624f,
+   (float16_t)-0.498227667f, (float16_t)-0.867046246f,
+   (float16_t)-0.495565262f, (float16_t)-0.868570706f,
+   (float16_t)-0.492898192f, (float16_t)-0.870086991f,
+   (float16_t)-0.490226483f, (float16_t)-0.871595087f,
+   (float16_t)-0.487550160f, (float16_t)-0.873094978f,
+   (float16_t)-0.484869248f, (float16_t)-0.874586652f,
+   (float16_t)-0.482183772f, (float16_t)-0.876070094f,
+   (float16_t)-0.479493758f, (float16_t)-0.877545290f,
+   (float16_t)-0.476799230f, (float16_t)-0.879012226f,
+   (float16_t)-0.474100215f, (float16_t)-0.880470889f,
+   (float16_t)-0.471396737f, (float16_t)-0.881921264f,
+   (float16_t)-0.468688822f, (float16_t)-0.883363339f,
+   (float16_t)-0.465976496f, (float16_t)-0.884797098f,
+   (float16_t)-0.463259784f, (float16_t)-0.886222530f,
+   (float16_t)-0.460538711f, (float16_t)-0.887639620f,
+   (float16_t)-0.457813304f, (float16_t)-0.889048356f,
+   (float16_t)-0.455083587f, (float16_t)-0.890448723f,
+   (float16_t)-0.452349587f, (float16_t)-0.891840709f,
+   (float16_t)-0.449611330f, (float16_t)-0.893224301f,
+   (float16_t)-0.446868840f, (float16_t)-0.894599486f,
+   (float16_t)-0.444122145f, (float16_t)-0.895966250f,
+   (float16_t)-0.441371269f, (float16_t)-0.897324581f,
+   (float16_t)-0.438616239f, (float16_t)-0.898674466f,
+   (float16_t)-0.435857080f, (float16_t)-0.900015892f,
+   (float16_t)-0.433093819f, (float16_t)-0.901348847f,
+   (float16_t)-0.430326481f, (float16_t)-0.902673318f,
+   (float16_t)-0.427555093f, (float16_t)-0.903989293f,
+   (float16_t)-0.424779681f, (float16_t)-0.905296759f,
+   (float16_t)-0.422000271f, (float16_t)-0.906595705f,
+   (float16_t)-0.419216888f, (float16_t)-0.907886116f,
+   (float16_t)-0.416429560f, (float16_t)-0.909167983f,
+   (float16_t)-0.413638312f, (float16_t)-0.910441292f,
+   (float16_t)-0.410843171f, (float16_t)-0.911706032f,
+   (float16_t)-0.408044163f, (float16_t)-0.912962190f,
+   (float16_t)-0.405241314f, (float16_t)-0.914209756f,
+   (float16_t)-0.402434651f, (float16_t)-0.915448716f,
+   (float16_t)-0.399624200f, (float16_t)-0.916679060f,
+   (float16_t)-0.396809987f, (float16_t)-0.917900776f,
+   (float16_t)-0.393992040f, (float16_t)-0.919113852f,
+   (float16_t)-0.391170384f, (float16_t)-0.920318277f,
+   (float16_t)-0.388345047f, (float16_t)-0.921514039f,
+   (float16_t)-0.385516054f, (float16_t)-0.922701128f,
+   (float16_t)-0.382683432f, (float16_t)-0.923879533f,
+   (float16_t)-0.379847209f, (float16_t)-0.925049241f,
+   (float16_t)-0.377007410f, (float16_t)-0.926210242f,
+   (float16_t)-0.374164063f, (float16_t)-0.927362526f,
+   (float16_t)-0.371317194f, (float16_t)-0.928506080f,
+   (float16_t)-0.368466830f, (float16_t)-0.929640896f,
+   (float16_t)-0.365612998f, (float16_t)-0.930766961f,
+   (float16_t)-0.362755724f, (float16_t)-0.931884266f,
+   (float16_t)-0.359895037f, (float16_t)-0.932992799f,
+   (float16_t)-0.357030961f, (float16_t)-0.934092550f,
+   (float16_t)-0.354163525f, (float16_t)-0.935183510f,
+   (float16_t)-0.351292756f, (float16_t)-0.936265667f,
+   (float16_t)-0.348418680f, (float16_t)-0.937339012f,
+   (float16_t)-0.345541325f, (float16_t)-0.938403534f,
+   (float16_t)-0.342660717f, (float16_t)-0.939459224f,
+   (float16_t)-0.339776884f, (float16_t)-0.940506071f,
+   (float16_t)-0.336889853f, (float16_t)-0.941544065f,
+   (float16_t)-0.333999651f, (float16_t)-0.942573198f,
+   (float16_t)-0.331106306f, (float16_t)-0.943593458f,
+   (float16_t)-0.328209844f, (float16_t)-0.944604837f,
+   (float16_t)-0.325310292f, (float16_t)-0.945607325f,
+   (float16_t)-0.322407679f, (float16_t)-0.946600913f,
+   (float16_t)-0.319502031f, (float16_t)-0.947585591f,
+   (float16_t)-0.316593376f, (float16_t)-0.948561350f,
+   (float16_t)-0.313681740f, (float16_t)-0.949528181f,
+   (float16_t)-0.310767153f, (float16_t)-0.950486074f,
+   (float16_t)-0.307849640f, (float16_t)-0.951435021f,
+   (float16_t)-0.304929230f, (float16_t)-0.952375013f,
+   (float16_t)-0.302005949f, (float16_t)-0.953306040f,
+   (float16_t)-0.299079826f, (float16_t)-0.954228095f,
+   (float16_t)-0.296150888f, (float16_t)-0.955141168f,
+   (float16_t)-0.293219163f, (float16_t)-0.956045251f,
+   (float16_t)-0.290284677f, (float16_t)-0.956940336f,
+   (float16_t)-0.287347460f, (float16_t)-0.957826413f,
+   (float16_t)-0.284407537f, (float16_t)-0.958703475f,
+   (float16_t)-0.281464938f, (float16_t)-0.959571513f,
+   (float16_t)-0.278519689f, (float16_t)-0.960430519f,
+   (float16_t)-0.275571819f, (float16_t)-0.961280486f,
+   (float16_t)-0.272621355f, (float16_t)-0.962121404f,
+   (float16_t)-0.269668326f, (float16_t)-0.962953267f,
+   (float16_t)-0.266712757f, (float16_t)-0.963776066f,
+   (float16_t)-0.263754679f, (float16_t)-0.964589793f,
+   (float16_t)-0.260794118f, (float16_t)-0.965394442f,
+   (float16_t)-0.257831102f, (float16_t)-0.966190003f,
+   (float16_t)-0.254865660f, (float16_t)-0.966976471f,
+   (float16_t)-0.251897818f, (float16_t)-0.967753837f,
+   (float16_t)-0.248927606f, (float16_t)-0.968522094f,
+   (float16_t)-0.245955050f, (float16_t)-0.969281235f,
+   (float16_t)-0.242980180f, (float16_t)-0.970031253f,
+   (float16_t)-0.240003022f, (float16_t)-0.970772141f,
+   (float16_t)-0.237023606f, (float16_t)-0.971503891f,
+   (float16_t)-0.234041959f, (float16_t)-0.972226497f,
+   (float16_t)-0.231058108f, (float16_t)-0.972939952f,
+   (float16_t)-0.228072083f, (float16_t)-0.973644250f,
+   (float16_t)-0.225083911f, (float16_t)-0.974339383f,
+   (float16_t)-0.222093621f, (float16_t)-0.975025345f,
+   (float16_t)-0.219101240f, (float16_t)-0.975702130f,
+   (float16_t)-0.216106797f, (float16_t)-0.976369731f,
+   (float16_t)-0.213110320f, (float16_t)-0.977028143f,
+   (float16_t)-0.210111837f, (float16_t)-0.977677358f,
+   (float16_t)-0.207111376f, (float16_t)-0.978317371f,
+   (float16_t)-0.204108966f, (float16_t)-0.978948175f,
+   (float16_t)-0.201104635f, (float16_t)-0.979569766f,
+   (float16_t)-0.198098411f, (float16_t)-0.980182136f,
+   (float16_t)-0.195090322f, (float16_t)-0.980785280f,
+   (float16_t)-0.192080397f, (float16_t)-0.981379193f,
+   (float16_t)-0.189068664f, (float16_t)-0.981963869f,
+   (float16_t)-0.186055152f, (float16_t)-0.982539302f,
+   (float16_t)-0.183039888f, (float16_t)-0.983105487f,
+   (float16_t)-0.180022901f, (float16_t)-0.983662419f,
+   (float16_t)-0.177004220f, (float16_t)-0.984210092f,
+   (float16_t)-0.173983873f, (float16_t)-0.984748502f,
+   (float16_t)-0.170961889f, (float16_t)-0.985277642f,
+   (float16_t)-0.167938295f, (float16_t)-0.985797509f,
+   (float16_t)-0.164913120f, (float16_t)-0.986308097f,
+   (float16_t)-0.161886394f, (float16_t)-0.986809402f,
+   (float16_t)-0.158858143f, (float16_t)-0.987301418f,
+   (float16_t)-0.155828398f, (float16_t)-0.987784142f,
+   (float16_t)-0.152797185f, (float16_t)-0.988257568f,
+   (float16_t)-0.149764535f, (float16_t)-0.988721692f,
+   (float16_t)-0.146730474f, (float16_t)-0.989176510f,
+   (float16_t)-0.143695033f, (float16_t)-0.989622017f,
+   (float16_t)-0.140658239f, (float16_t)-0.990058210f,
+   (float16_t)-0.137620122f, (float16_t)-0.990485084f,
+   (float16_t)-0.134580709f, (float16_t)-0.990902635f,
+   (float16_t)-0.131540029f, (float16_t)-0.991310860f,
+   (float16_t)-0.128498111f, (float16_t)-0.991709754f,
+   (float16_t)-0.125454983f, (float16_t)-0.992099313f,
+   (float16_t)-0.122410675f, (float16_t)-0.992479535f,
+   (float16_t)-0.119365215f, (float16_t)-0.992850414f,
+   (float16_t)-0.116318631f, (float16_t)-0.993211949f,
+   (float16_t)-0.113270952f, (float16_t)-0.993564136f,
+   (float16_t)-0.110222207f, (float16_t)-0.993906970f,
+   (float16_t)-0.107172425f, (float16_t)-0.994240449f,
+   (float16_t)-0.104121634f, (float16_t)-0.994564571f,
+   (float16_t)-0.101069863f, (float16_t)-0.994879331f,
+   (float16_t)-0.098017140f, (float16_t)-0.995184727f,
+   (float16_t)-0.094963495f, (float16_t)-0.995480755f,
+   (float16_t)-0.091908956f, (float16_t)-0.995767414f,
+   (float16_t)-0.088853553f, (float16_t)-0.996044701f,
+   (float16_t)-0.085797312f, (float16_t)-0.996312612f,
+   (float16_t)-0.082740265f, (float16_t)-0.996571146f,
+   (float16_t)-0.079682438f, (float16_t)-0.996820299f,
+   (float16_t)-0.076623861f, (float16_t)-0.997060070f,
+   (float16_t)-0.073564564f, (float16_t)-0.997290457f,
+   (float16_t)-0.070504573f, (float16_t)-0.997511456f,
+   (float16_t)-0.067443920f, (float16_t)-0.997723067f,
+   (float16_t)-0.064382631f, (float16_t)-0.997925286f,
+   (float16_t)-0.061320736f, (float16_t)-0.998118113f,
+   (float16_t)-0.058258265f, (float16_t)-0.998301545f,
+   (float16_t)-0.055195244f, (float16_t)-0.998475581f,
+   (float16_t)-0.052131705f, (float16_t)-0.998640218f,
+   (float16_t)-0.049067674f, (float16_t)-0.998795456f,
+   (float16_t)-0.046003182f, (float16_t)-0.998941293f,
+   (float16_t)-0.042938257f, (float16_t)-0.999077728f,
+   (float16_t)-0.039872928f, (float16_t)-0.999204759f,
+   (float16_t)-0.036807223f, (float16_t)-0.999322385f,
+   (float16_t)-0.033741172f, (float16_t)-0.999430605f,
+   (float16_t)-0.030674803f, (float16_t)-0.999529418f,
+   (float16_t)-0.027608146f, (float16_t)-0.999618822f,
+   (float16_t)-0.024541229f, (float16_t)-0.999698819f,
+   (float16_t)-0.021474080f, (float16_t)-0.999769405f,
+   (float16_t)-0.018406730f, (float16_t)-0.999830582f,
+   (float16_t)-0.015339206f, (float16_t)-0.999882347f,
+   (float16_t)-0.012271538f, (float16_t)-0.999924702f,
+   (float16_t)-0.009203755f, (float16_t)-0.999957645f,
+   (float16_t)-0.006135885f, (float16_t)-0.999981175f,
+   (float16_t)-0.003067957f, (float16_t)-0.999995294f,
+   (float16_t)-0.000000000f, (float16_t)-1.000000000f,
+    (float16_t)0.003067957f, (float16_t)-0.999995294f,
+    (float16_t)0.006135885f, (float16_t)-0.999981175f,
+    (float16_t)0.009203755f, (float16_t)-0.999957645f,
+    (float16_t)0.012271538f, (float16_t)-0.999924702f,
+    (float16_t)0.015339206f, (float16_t)-0.999882347f,
+    (float16_t)0.018406730f, (float16_t)-0.999830582f,
+    (float16_t)0.021474080f, (float16_t)-0.999769405f,
+    (float16_t)0.024541229f, (float16_t)-0.999698819f,
+    (float16_t)0.027608146f, (float16_t)-0.999618822f,
+    (float16_t)0.030674803f, (float16_t)-0.999529418f,
+    (float16_t)0.033741172f, (float16_t)-0.999430605f,
+    (float16_t)0.036807223f, (float16_t)-0.999322385f,
+    (float16_t)0.039872928f, (float16_t)-0.999204759f,
+    (float16_t)0.042938257f, (float16_t)-0.999077728f,
+    (float16_t)0.046003182f, (float16_t)-0.998941293f,
+    (float16_t)0.049067674f, (float16_t)-0.998795456f,
+    (float16_t)0.052131705f, (float16_t)-0.998640218f,
+    (float16_t)0.055195244f, (float16_t)-0.998475581f,
+    (float16_t)0.058258265f, (float16_t)-0.998301545f,
+    (float16_t)0.061320736f, (float16_t)-0.998118113f,
+    (float16_t)0.064382631f, (float16_t)-0.997925286f,
+    (float16_t)0.067443920f, (float16_t)-0.997723067f,
+    (float16_t)0.070504573f, (float16_t)-0.997511456f,
+    (float16_t)0.073564564f, (float16_t)-0.997290457f,
+    (float16_t)0.076623861f, (float16_t)-0.997060070f,
+    (float16_t)0.079682438f, (float16_t)-0.996820299f,
+    (float16_t)0.082740265f, (float16_t)-0.996571146f,
+    (float16_t)0.085797312f, (float16_t)-0.996312612f,
+    (float16_t)0.088853553f, (float16_t)-0.996044701f,
+    (float16_t)0.091908956f, (float16_t)-0.995767414f,
+    (float16_t)0.094963495f, (float16_t)-0.995480755f,
+    (float16_t)0.098017140f, (float16_t)-0.995184727f,
+    (float16_t)0.101069863f, (float16_t)-0.994879331f,
+    (float16_t)0.104121634f, (float16_t)-0.994564571f,
+    (float16_t)0.107172425f, (float16_t)-0.994240449f,
+    (float16_t)0.110222207f, (float16_t)-0.993906970f,
+    (float16_t)0.113270952f, (float16_t)-0.993564136f,
+    (float16_t)0.116318631f, (float16_t)-0.993211949f,
+    (float16_t)0.119365215f, (float16_t)-0.992850414f,
+    (float16_t)0.122410675f, (float16_t)-0.992479535f,
+    (float16_t)0.125454983f, (float16_t)-0.992099313f,
+    (float16_t)0.128498111f, (float16_t)-0.991709754f,
+    (float16_t)0.131540029f, (float16_t)-0.991310860f,
+    (float16_t)0.134580709f, (float16_t)-0.990902635f,
+    (float16_t)0.137620122f, (float16_t)-0.990485084f,
+    (float16_t)0.140658239f, (float16_t)-0.990058210f,
+    (float16_t)0.143695033f, (float16_t)-0.989622017f,
+    (float16_t)0.146730474f, (float16_t)-0.989176510f,
+    (float16_t)0.149764535f, (float16_t)-0.988721692f,
+    (float16_t)0.152797185f, (float16_t)-0.988257568f,
+    (float16_t)0.155828398f, (float16_t)-0.987784142f,
+    (float16_t)0.158858143f, (float16_t)-0.987301418f,
+    (float16_t)0.161886394f, (float16_t)-0.986809402f,
+    (float16_t)0.164913120f, (float16_t)-0.986308097f,
+    (float16_t)0.167938295f, (float16_t)-0.985797509f,
+    (float16_t)0.170961889f, (float16_t)-0.985277642f,
+    (float16_t)0.173983873f, (float16_t)-0.984748502f,
+    (float16_t)0.177004220f, (float16_t)-0.984210092f,
+    (float16_t)0.180022901f, (float16_t)-0.983662419f,
+    (float16_t)0.183039888f, (float16_t)-0.983105487f,
+    (float16_t)0.186055152f, (float16_t)-0.982539302f,
+    (float16_t)0.189068664f, (float16_t)-0.981963869f,
+    (float16_t)0.192080397f, (float16_t)-0.981379193f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f,
+    (float16_t)0.198098411f, (float16_t)-0.980182136f,
+    (float16_t)0.201104635f, (float16_t)-0.979569766f,
+    (float16_t)0.204108966f, (float16_t)-0.978948175f,
+    (float16_t)0.207111376f, (float16_t)-0.978317371f,
+    (float16_t)0.210111837f, (float16_t)-0.977677358f,
+    (float16_t)0.213110320f, (float16_t)-0.977028143f,
+    (float16_t)0.216106797f, (float16_t)-0.976369731f,
+    (float16_t)0.219101240f, (float16_t)-0.975702130f,
+    (float16_t)0.222093621f, (float16_t)-0.975025345f,
+    (float16_t)0.225083911f, (float16_t)-0.974339383f,
+    (float16_t)0.228072083f, (float16_t)-0.973644250f,
+    (float16_t)0.231058108f, (float16_t)-0.972939952f,
+    (float16_t)0.234041959f, (float16_t)-0.972226497f,
+    (float16_t)0.237023606f, (float16_t)-0.971503891f,
+    (float16_t)0.240003022f, (float16_t)-0.970772141f,
+    (float16_t)0.242980180f, (float16_t)-0.970031253f,
+    (float16_t)0.245955050f, (float16_t)-0.969281235f,
+    (float16_t)0.248927606f, (float16_t)-0.968522094f,
+    (float16_t)0.251897818f, (float16_t)-0.967753837f,
+    (float16_t)0.254865660f, (float16_t)-0.966976471f,
+    (float16_t)0.257831102f, (float16_t)-0.966190003f,
+    (float16_t)0.260794118f, (float16_t)-0.965394442f,
+    (float16_t)0.263754679f, (float16_t)-0.964589793f,
+    (float16_t)0.266712757f, (float16_t)-0.963776066f,
+    (float16_t)0.269668326f, (float16_t)-0.962953267f,
+    (float16_t)0.272621355f, (float16_t)-0.962121404f,
+    (float16_t)0.275571819f, (float16_t)-0.961280486f,
+    (float16_t)0.278519689f, (float16_t)-0.960430519f,
+    (float16_t)0.281464938f, (float16_t)-0.959571513f,
+    (float16_t)0.284407537f, (float16_t)-0.958703475f,
+    (float16_t)0.287347460f, (float16_t)-0.957826413f,
+    (float16_t)0.290284677f, (float16_t)-0.956940336f,
+    (float16_t)0.293219163f, (float16_t)-0.956045251f,
+    (float16_t)0.296150888f, (float16_t)-0.955141168f,
+    (float16_t)0.299079826f, (float16_t)-0.954228095f,
+    (float16_t)0.302005949f, (float16_t)-0.953306040f,
+    (float16_t)0.304929230f, (float16_t)-0.952375013f,
+    (float16_t)0.307849640f, (float16_t)-0.951435021f,
+    (float16_t)0.310767153f, (float16_t)-0.950486074f,
+    (float16_t)0.313681740f, (float16_t)-0.949528181f,
+    (float16_t)0.316593376f, (float16_t)-0.948561350f,
+    (float16_t)0.319502031f, (float16_t)-0.947585591f,
+    (float16_t)0.322407679f, (float16_t)-0.946600913f,
+    (float16_t)0.325310292f, (float16_t)-0.945607325f,
+    (float16_t)0.328209844f, (float16_t)-0.944604837f,
+    (float16_t)0.331106306f, (float16_t)-0.943593458f,
+    (float16_t)0.333999651f, (float16_t)-0.942573198f,
+    (float16_t)0.336889853f, (float16_t)-0.941544065f,
+    (float16_t)0.339776884f, (float16_t)-0.940506071f,
+    (float16_t)0.342660717f, (float16_t)-0.939459224f,
+    (float16_t)0.345541325f, (float16_t)-0.938403534f,
+    (float16_t)0.348418680f, (float16_t)-0.937339012f,
+    (float16_t)0.351292756f, (float16_t)-0.936265667f,
+    (float16_t)0.354163525f, (float16_t)-0.935183510f,
+    (float16_t)0.357030961f, (float16_t)-0.934092550f,
+    (float16_t)0.359895037f, (float16_t)-0.932992799f,
+    (float16_t)0.362755724f, (float16_t)-0.931884266f,
+    (float16_t)0.365612998f, (float16_t)-0.930766961f,
+    (float16_t)0.368466830f, (float16_t)-0.929640896f,
+    (float16_t)0.371317194f, (float16_t)-0.928506080f,
+    (float16_t)0.374164063f, (float16_t)-0.927362526f,
+    (float16_t)0.377007410f, (float16_t)-0.926210242f,
+    (float16_t)0.379847209f, (float16_t)-0.925049241f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.385516054f, (float16_t)-0.922701128f,
+    (float16_t)0.388345047f, (float16_t)-0.921514039f,
+    (float16_t)0.391170384f, (float16_t)-0.920318277f,
+    (float16_t)0.393992040f, (float16_t)-0.919113852f,
+    (float16_t)0.396809987f, (float16_t)-0.917900776f,
+    (float16_t)0.399624200f, (float16_t)-0.916679060f,
+    (float16_t)0.402434651f, (float16_t)-0.915448716f,
+    (float16_t)0.405241314f, (float16_t)-0.914209756f,
+    (float16_t)0.408044163f, (float16_t)-0.912962190f,
+    (float16_t)0.410843171f, (float16_t)-0.911706032f,
+    (float16_t)0.413638312f, (float16_t)-0.910441292f,
+    (float16_t)0.416429560f, (float16_t)-0.909167983f,
+    (float16_t)0.419216888f, (float16_t)-0.907886116f,
+    (float16_t)0.422000271f, (float16_t)-0.906595705f,
+    (float16_t)0.424779681f, (float16_t)-0.905296759f,
+    (float16_t)0.427555093f, (float16_t)-0.903989293f,
+    (float16_t)0.430326481f, (float16_t)-0.902673318f,
+    (float16_t)0.433093819f, (float16_t)-0.901348847f,
+    (float16_t)0.435857080f, (float16_t)-0.900015892f,
+    (float16_t)0.438616239f, (float16_t)-0.898674466f,
+    (float16_t)0.441371269f, (float16_t)-0.897324581f,
+    (float16_t)0.444122145f, (float16_t)-0.895966250f,
+    (float16_t)0.446868840f, (float16_t)-0.894599486f,
+    (float16_t)0.449611330f, (float16_t)-0.893224301f,
+    (float16_t)0.452349587f, (float16_t)-0.891840709f,
+    (float16_t)0.455083587f, (float16_t)-0.890448723f,
+    (float16_t)0.457813304f, (float16_t)-0.889048356f,
+    (float16_t)0.460538711f, (float16_t)-0.887639620f,
+    (float16_t)0.463259784f, (float16_t)-0.886222530f,
+    (float16_t)0.465976496f, (float16_t)-0.884797098f,
+    (float16_t)0.468688822f, (float16_t)-0.883363339f,
+    (float16_t)0.471396737f, (float16_t)-0.881921264f,
+    (float16_t)0.474100215f, (float16_t)-0.880470889f,
+    (float16_t)0.476799230f, (float16_t)-0.879012226f,
+    (float16_t)0.479493758f, (float16_t)-0.877545290f,
+    (float16_t)0.482183772f, (float16_t)-0.876070094f,
+    (float16_t)0.484869248f, (float16_t)-0.874586652f,
+    (float16_t)0.487550160f, (float16_t)-0.873094978f,
+    (float16_t)0.490226483f, (float16_t)-0.871595087f,
+    (float16_t)0.492898192f, (float16_t)-0.870086991f,
+    (float16_t)0.495565262f, (float16_t)-0.868570706f,
+    (float16_t)0.498227667f, (float16_t)-0.867046246f,
+    (float16_t)0.500885383f, (float16_t)-0.865513624f,
+    (float16_t)0.503538384f, (float16_t)-0.863972856f,
+    (float16_t)0.506186645f, (float16_t)-0.862423956f,
+    (float16_t)0.508830143f, (float16_t)-0.860866939f,
+    (float16_t)0.511468850f, (float16_t)-0.859301818f,
+    (float16_t)0.514102744f, (float16_t)-0.857728610f,
+    (float16_t)0.516731799f, (float16_t)-0.856147328f,
+    (float16_t)0.519355990f, (float16_t)-0.854557988f,
+    (float16_t)0.521975293f, (float16_t)-0.852960605f,
+    (float16_t)0.524589683f, (float16_t)-0.851355193f,
+    (float16_t)0.527199135f, (float16_t)-0.849741768f,
+    (float16_t)0.529803625f, (float16_t)-0.848120345f,
+    (float16_t)0.532403128f, (float16_t)-0.846490939f,
+    (float16_t)0.534997620f, (float16_t)-0.844853565f,
+    (float16_t)0.537587076f, (float16_t)-0.843208240f,
+    (float16_t)0.540171473f, (float16_t)-0.841554977f,
+    (float16_t)0.542750785f, (float16_t)-0.839893794f,
+    (float16_t)0.545324988f, (float16_t)-0.838224706f,
+    (float16_t)0.547894059f, (float16_t)-0.836547727f,
+    (float16_t)0.550457973f, (float16_t)-0.834862875f,
+    (float16_t)0.553016706f, (float16_t)-0.833170165f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.558118531f, (float16_t)-0.829761234f,
+    (float16_t)0.560661576f, (float16_t)-0.828045045f,
+    (float16_t)0.563199344f, (float16_t)-0.826321063f,
+    (float16_t)0.565731811f, (float16_t)-0.824589303f,
+    (float16_t)0.568258953f, (float16_t)-0.822849781f,
+    (float16_t)0.570780746f, (float16_t)-0.821102515f,
+    (float16_t)0.573297167f, (float16_t)-0.819347520f,
+    (float16_t)0.575808191f, (float16_t)-0.817584813f,
+    (float16_t)0.578313796f, (float16_t)-0.815814411f,
+    (float16_t)0.580813958f, (float16_t)-0.814036330f,
+    (float16_t)0.583308653f, (float16_t)-0.812250587f,
+    (float16_t)0.585797857f, (float16_t)-0.810457198f,
+    (float16_t)0.588281548f, (float16_t)-0.808656182f,
+    (float16_t)0.590759702f, (float16_t)-0.806847554f,
+    (float16_t)0.593232295f, (float16_t)-0.805031331f,
+    (float16_t)0.595699304f, (float16_t)-0.803207531f,
+    (float16_t)0.598160707f, (float16_t)-0.801376172f,
+    (float16_t)0.600616479f, (float16_t)-0.799537269f,
+    (float16_t)0.603066599f, (float16_t)-0.797690841f,
+    (float16_t)0.605511041f, (float16_t)-0.795836905f,
+    (float16_t)0.607949785f, (float16_t)-0.793975478f,
+    (float16_t)0.610382806f, (float16_t)-0.792106577f,
+    (float16_t)0.612810082f, (float16_t)-0.790230221f,
+    (float16_t)0.615231591f, (float16_t)-0.788346428f,
+    (float16_t)0.617647308f, (float16_t)-0.786455214f,
+    (float16_t)0.620057212f, (float16_t)-0.784556597f,
+    (float16_t)0.622461279f, (float16_t)-0.782650596f,
+    (float16_t)0.624859488f, (float16_t)-0.780737229f,
+    (float16_t)0.627251815f, (float16_t)-0.778816512f,
+    (float16_t)0.629638239f, (float16_t)-0.776888466f,
+    (float16_t)0.632018736f, (float16_t)-0.774953107f,
+    (float16_t)0.634393284f, (float16_t)-0.773010453f,
+    (float16_t)0.636761861f, (float16_t)-0.771060524f,
+    (float16_t)0.639124445f, (float16_t)-0.769103338f,
+    (float16_t)0.641481013f, (float16_t)-0.767138912f,
+    (float16_t)0.643831543f, (float16_t)-0.765167266f,
+    (float16_t)0.646176013f, (float16_t)-0.763188417f,
+    (float16_t)0.648514401f, (float16_t)-0.761202385f,
+    (float16_t)0.650846685f, (float16_t)-0.759209189f,
+    (float16_t)0.653172843f, (float16_t)-0.757208847f,
+    (float16_t)0.655492853f, (float16_t)-0.755201377f,
+    (float16_t)0.657806693f, (float16_t)-0.753186799f,
+    (float16_t)0.660114342f, (float16_t)-0.751165132f,
+    (float16_t)0.662415778f, (float16_t)-0.749136395f,
+    (float16_t)0.664710978f, (float16_t)-0.747100606f,
+    (float16_t)0.666999922f, (float16_t)-0.745057785f,
+    (float16_t)0.669282588f, (float16_t)-0.743007952f,
+    (float16_t)0.671558955f, (float16_t)-0.740951125f,
+    (float16_t)0.673829000f, (float16_t)-0.738887324f,
+    (float16_t)0.676092704f, (float16_t)-0.736816569f,
+    (float16_t)0.678350043f, (float16_t)-0.734738878f,
+    (float16_t)0.680600998f, (float16_t)-0.732654272f,
+    (float16_t)0.682845546f, (float16_t)-0.730562769f,
+    (float16_t)0.685083668f, (float16_t)-0.728464390f,
+    (float16_t)0.687315341f, (float16_t)-0.726359155f,
+    (float16_t)0.689540545f, (float16_t)-0.724247083f,
+    (float16_t)0.691759258f, (float16_t)-0.722128194f,
+    (float16_t)0.693971461f, (float16_t)-0.720002508f,
+    (float16_t)0.696177131f, (float16_t)-0.717870045f,
+    (float16_t)0.698376249f, (float16_t)-0.715730825f,
+    (float16_t)0.700568794f, (float16_t)-0.713584869f,
+    (float16_t)0.702754744f, (float16_t)-0.711432196f,
+    (float16_t)0.704934080f, (float16_t)-0.709272826f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.709272826f, (float16_t)-0.704934080f,
+    (float16_t)0.711432196f, (float16_t)-0.702754744f,
+    (float16_t)0.713584869f, (float16_t)-0.700568794f,
+    (float16_t)0.715730825f, (float16_t)-0.698376249f,
+    (float16_t)0.717870045f, (float16_t)-0.696177131f,
+    (float16_t)0.720002508f, (float16_t)-0.693971461f,
+    (float16_t)0.722128194f, (float16_t)-0.691759258f,
+    (float16_t)0.724247083f, (float16_t)-0.689540545f,
+    (float16_t)0.726359155f, (float16_t)-0.687315341f,
+    (float16_t)0.728464390f, (float16_t)-0.685083668f,
+    (float16_t)0.730562769f, (float16_t)-0.682845546f,
+    (float16_t)0.732654272f, (float16_t)-0.680600998f,
+    (float16_t)0.734738878f, (float16_t)-0.678350043f,
+    (float16_t)0.736816569f, (float16_t)-0.676092704f,
+    (float16_t)0.738887324f, (float16_t)-0.673829000f,
+    (float16_t)0.740951125f, (float16_t)-0.671558955f,
+    (float16_t)0.743007952f, (float16_t)-0.669282588f,
+    (float16_t)0.745057785f, (float16_t)-0.666999922f,
+    (float16_t)0.747100606f, (float16_t)-0.664710978f,
+    (float16_t)0.749136395f, (float16_t)-0.662415778f,
+    (float16_t)0.751165132f, (float16_t)-0.660114342f,
+    (float16_t)0.753186799f, (float16_t)-0.657806693f,
+    (float16_t)0.755201377f, (float16_t)-0.655492853f,
+    (float16_t)0.757208847f, (float16_t)-0.653172843f,
+    (float16_t)0.759209189f, (float16_t)-0.650846685f,
+    (float16_t)0.761202385f, (float16_t)-0.648514401f,
+    (float16_t)0.763188417f, (float16_t)-0.646176013f,
+    (float16_t)0.765167266f, (float16_t)-0.643831543f,
+    (float16_t)0.767138912f, (float16_t)-0.641481013f,
+    (float16_t)0.769103338f, (float16_t)-0.639124445f,
+    (float16_t)0.771060524f, (float16_t)-0.636761861f,
+    (float16_t)0.773010453f, (float16_t)-0.634393284f,
+    (float16_t)0.774953107f, (float16_t)-0.632018736f,
+    (float16_t)0.776888466f, (float16_t)-0.629638239f,
+    (float16_t)0.778816512f, (float16_t)-0.627251815f,
+    (float16_t)0.780737229f, (float16_t)-0.624859488f,
+    (float16_t)0.782650596f, (float16_t)-0.622461279f,
+    (float16_t)0.784556597f, (float16_t)-0.620057212f,
+    (float16_t)0.786455214f, (float16_t)-0.617647308f,
+    (float16_t)0.788346428f, (float16_t)-0.615231591f,
+    (float16_t)0.790230221f, (float16_t)-0.612810082f,
+    (float16_t)0.792106577f, (float16_t)-0.610382806f,
+    (float16_t)0.793975478f, (float16_t)-0.607949785f,
+    (float16_t)0.795836905f, (float16_t)-0.605511041f,
+    (float16_t)0.797690841f, (float16_t)-0.603066599f,
+    (float16_t)0.799537269f, (float16_t)-0.600616479f,
+    (float16_t)0.801376172f, (float16_t)-0.598160707f,
+    (float16_t)0.803207531f, (float16_t)-0.595699304f,
+    (float16_t)0.805031331f, (float16_t)-0.593232295f,
+    (float16_t)0.806847554f, (float16_t)-0.590759702f,
+    (float16_t)0.808656182f, (float16_t)-0.588281548f,
+    (float16_t)0.810457198f, (float16_t)-0.585797857f,
+    (float16_t)0.812250587f, (float16_t)-0.583308653f,
+    (float16_t)0.814036330f, (float16_t)-0.580813958f,
+    (float16_t)0.815814411f, (float16_t)-0.578313796f,
+    (float16_t)0.817584813f, (float16_t)-0.575808191f,
+    (float16_t)0.819347520f, (float16_t)-0.573297167f,
+    (float16_t)0.821102515f, (float16_t)-0.570780746f,
+    (float16_t)0.822849781f, (float16_t)-0.568258953f,
+    (float16_t)0.824589303f, (float16_t)-0.565731811f,
+    (float16_t)0.826321063f, (float16_t)-0.563199344f,
+    (float16_t)0.828045045f, (float16_t)-0.560661576f,
+    (float16_t)0.829761234f, (float16_t)-0.558118531f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.833170165f, (float16_t)-0.553016706f,
+    (float16_t)0.834862875f, (float16_t)-0.550457973f,
+    (float16_t)0.836547727f, (float16_t)-0.547894059f,
+    (float16_t)0.838224706f, (float16_t)-0.545324988f,
+    (float16_t)0.839893794f, (float16_t)-0.542750785f,
+    (float16_t)0.841554977f, (float16_t)-0.540171473f,
+    (float16_t)0.843208240f, (float16_t)-0.537587076f,
+    (float16_t)0.844853565f, (float16_t)-0.534997620f,
+    (float16_t)0.846490939f, (float16_t)-0.532403128f,
+    (float16_t)0.848120345f, (float16_t)-0.529803625f,
+    (float16_t)0.849741768f, (float16_t)-0.527199135f,
+    (float16_t)0.851355193f, (float16_t)-0.524589683f,
+    (float16_t)0.852960605f, (float16_t)-0.521975293f,
+    (float16_t)0.854557988f, (float16_t)-0.519355990f,
+    (float16_t)0.856147328f, (float16_t)-0.516731799f,
+    (float16_t)0.857728610f, (float16_t)-0.514102744f,
+    (float16_t)0.859301818f, (float16_t)-0.511468850f,
+    (float16_t)0.860866939f, (float16_t)-0.508830143f,
+    (float16_t)0.862423956f, (float16_t)-0.506186645f,
+    (float16_t)0.863972856f, (float16_t)-0.503538384f,
+    (float16_t)0.865513624f, (float16_t)-0.500885383f,
+    (float16_t)0.867046246f, (float16_t)-0.498227667f,
+    (float16_t)0.868570706f, (float16_t)-0.495565262f,
+    (float16_t)0.870086991f, (float16_t)-0.492898192f,
+    (float16_t)0.871595087f, (float16_t)-0.490226483f,
+    (float16_t)0.873094978f, (float16_t)-0.487550160f,
+    (float16_t)0.874586652f, (float16_t)-0.484869248f,
+    (float16_t)0.876070094f, (float16_t)-0.482183772f,
+    (float16_t)0.877545290f, (float16_t)-0.479493758f,
+    (float16_t)0.879012226f, (float16_t)-0.476799230f,
+    (float16_t)0.880470889f, (float16_t)-0.474100215f,
+    (float16_t)0.881921264f, (float16_t)-0.471396737f,
+    (float16_t)0.883363339f, (float16_t)-0.468688822f,
+    (float16_t)0.884797098f, (float16_t)-0.465976496f,
+    (float16_t)0.886222530f, (float16_t)-0.463259784f,
+    (float16_t)0.887639620f, (float16_t)-0.460538711f,
+    (float16_t)0.889048356f, (float16_t)-0.457813304f,
+    (float16_t)0.890448723f, (float16_t)-0.455083587f,
+    (float16_t)0.891840709f, (float16_t)-0.452349587f,
+    (float16_t)0.893224301f, (float16_t)-0.449611330f,
+    (float16_t)0.894599486f, (float16_t)-0.446868840f,
+    (float16_t)0.895966250f, (float16_t)-0.444122145f,
+    (float16_t)0.897324581f, (float16_t)-0.441371269f,
+    (float16_t)0.898674466f, (float16_t)-0.438616239f,
+    (float16_t)0.900015892f, (float16_t)-0.435857080f,
+    (float16_t)0.901348847f, (float16_t)-0.433093819f,
+    (float16_t)0.902673318f, (float16_t)-0.430326481f,
+    (float16_t)0.903989293f, (float16_t)-0.427555093f,
+    (float16_t)0.905296759f, (float16_t)-0.424779681f,
+    (float16_t)0.906595705f, (float16_t)-0.422000271f,
+    (float16_t)0.907886116f, (float16_t)-0.419216888f,
+    (float16_t)0.909167983f, (float16_t)-0.416429560f,
+    (float16_t)0.910441292f, (float16_t)-0.413638312f,
+    (float16_t)0.911706032f, (float16_t)-0.410843171f,
+    (float16_t)0.912962190f, (float16_t)-0.408044163f,
+    (float16_t)0.914209756f, (float16_t)-0.405241314f,
+    (float16_t)0.915448716f, (float16_t)-0.402434651f,
+    (float16_t)0.916679060f, (float16_t)-0.399624200f,
+    (float16_t)0.917900776f, (float16_t)-0.396809987f,
+    (float16_t)0.919113852f, (float16_t)-0.393992040f,
+    (float16_t)0.920318277f, (float16_t)-0.391170384f,
+    (float16_t)0.921514039f, (float16_t)-0.388345047f,
+    (float16_t)0.922701128f, (float16_t)-0.385516054f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.925049241f, (float16_t)-0.379847209f,
+    (float16_t)0.926210242f, (float16_t)-0.377007410f,
+    (float16_t)0.927362526f, (float16_t)-0.374164063f,
+    (float16_t)0.928506080f, (float16_t)-0.371317194f,
+    (float16_t)0.929640896f, (float16_t)-0.368466830f,
+    (float16_t)0.930766961f, (float16_t)-0.365612998f,
+    (float16_t)0.931884266f, (float16_t)-0.362755724f,
+    (float16_t)0.932992799f, (float16_t)-0.359895037f,
+    (float16_t)0.934092550f, (float16_t)-0.357030961f,
+    (float16_t)0.935183510f, (float16_t)-0.354163525f,
+    (float16_t)0.936265667f, (float16_t)-0.351292756f,
+    (float16_t)0.937339012f, (float16_t)-0.348418680f,
+    (float16_t)0.938403534f, (float16_t)-0.345541325f,
+    (float16_t)0.939459224f, (float16_t)-0.342660717f,
+    (float16_t)0.940506071f, (float16_t)-0.339776884f,
+    (float16_t)0.941544065f, (float16_t)-0.336889853f,
+    (float16_t)0.942573198f, (float16_t)-0.333999651f,
+    (float16_t)0.943593458f, (float16_t)-0.331106306f,
+    (float16_t)0.944604837f, (float16_t)-0.328209844f,
+    (float16_t)0.945607325f, (float16_t)-0.325310292f,
+    (float16_t)0.946600913f, (float16_t)-0.322407679f,
+    (float16_t)0.947585591f, (float16_t)-0.319502031f,
+    (float16_t)0.948561350f, (float16_t)-0.316593376f,
+    (float16_t)0.949528181f, (float16_t)-0.313681740f,
+    (float16_t)0.950486074f, (float16_t)-0.310767153f,
+    (float16_t)0.951435021f, (float16_t)-0.307849640f,
+    (float16_t)0.952375013f, (float16_t)-0.304929230f,
+    (float16_t)0.953306040f, (float16_t)-0.302005949f,
+    (float16_t)0.954228095f, (float16_t)-0.299079826f,
+    (float16_t)0.955141168f, (float16_t)-0.296150888f,
+    (float16_t)0.956045251f, (float16_t)-0.293219163f,
+    (float16_t)0.956940336f, (float16_t)-0.290284677f,
+    (float16_t)0.957826413f, (float16_t)-0.287347460f,
+    (float16_t)0.958703475f, (float16_t)-0.284407537f,
+    (float16_t)0.959571513f, (float16_t)-0.281464938f,
+    (float16_t)0.960430519f, (float16_t)-0.278519689f,
+    (float16_t)0.961280486f, (float16_t)-0.275571819f,
+    (float16_t)0.962121404f, (float16_t)-0.272621355f,
+    (float16_t)0.962953267f, (float16_t)-0.269668326f,
+    (float16_t)0.963776066f, (float16_t)-0.266712757f,
+    (float16_t)0.964589793f, (float16_t)-0.263754679f,
+    (float16_t)0.965394442f, (float16_t)-0.260794118f,
+    (float16_t)0.966190003f, (float16_t)-0.257831102f,
+    (float16_t)0.966976471f, (float16_t)-0.254865660f,
+    (float16_t)0.967753837f, (float16_t)-0.251897818f,
+    (float16_t)0.968522094f, (float16_t)-0.248927606f,
+    (float16_t)0.969281235f, (float16_t)-0.245955050f,
+    (float16_t)0.970031253f, (float16_t)-0.242980180f,
+    (float16_t)0.970772141f, (float16_t)-0.240003022f,
+    (float16_t)0.971503891f, (float16_t)-0.237023606f,
+    (float16_t)0.972226497f, (float16_t)-0.234041959f,
+    (float16_t)0.972939952f, (float16_t)-0.231058108f,
+    (float16_t)0.973644250f, (float16_t)-0.228072083f,
+    (float16_t)0.974339383f, (float16_t)-0.225083911f,
+    (float16_t)0.975025345f, (float16_t)-0.222093621f,
+    (float16_t)0.975702130f, (float16_t)-0.219101240f,
+    (float16_t)0.976369731f, (float16_t)-0.216106797f,
+    (float16_t)0.977028143f, (float16_t)-0.213110320f,
+    (float16_t)0.977677358f, (float16_t)-0.210111837f,
+    (float16_t)0.978317371f, (float16_t)-0.207111376f,
+    (float16_t)0.978948175f, (float16_t)-0.204108966f,
+    (float16_t)0.979569766f, (float16_t)-0.201104635f,
+    (float16_t)0.980182136f, (float16_t)-0.198098411f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f,
+    (float16_t)0.981379193f, (float16_t)-0.192080397f,
+    (float16_t)0.981963869f, (float16_t)-0.189068664f,
+    (float16_t)0.982539302f, (float16_t)-0.186055152f,
+    (float16_t)0.983105487f, (float16_t)-0.183039888f,
+    (float16_t)0.983662419f, (float16_t)-0.180022901f,
+    (float16_t)0.984210092f, (float16_t)-0.177004220f,
+    (float16_t)0.984748502f, (float16_t)-0.173983873f,
+    (float16_t)0.985277642f, (float16_t)-0.170961889f,
+    (float16_t)0.985797509f, (float16_t)-0.167938295f,
+    (float16_t)0.986308097f, (float16_t)-0.164913120f,
+    (float16_t)0.986809402f, (float16_t)-0.161886394f,
+    (float16_t)0.987301418f, (float16_t)-0.158858143f,
+    (float16_t)0.987784142f, (float16_t)-0.155828398f,
+    (float16_t)0.988257568f, (float16_t)-0.152797185f,
+    (float16_t)0.988721692f, (float16_t)-0.149764535f,
+    (float16_t)0.989176510f, (float16_t)-0.146730474f,
+    (float16_t)0.989622017f, (float16_t)-0.143695033f,
+    (float16_t)0.990058210f, (float16_t)-0.140658239f,
+    (float16_t)0.990485084f, (float16_t)-0.137620122f,
+    (float16_t)0.990902635f, (float16_t)-0.134580709f,
+    (float16_t)0.991310860f, (float16_t)-0.131540029f,
+    (float16_t)0.991709754f, (float16_t)-0.128498111f,
+    (float16_t)0.992099313f, (float16_t)-0.125454983f,
+    (float16_t)0.992479535f, (float16_t)-0.122410675f,
+    (float16_t)0.992850414f, (float16_t)-0.119365215f,
+    (float16_t)0.993211949f, (float16_t)-0.116318631f,
+    (float16_t)0.993564136f, (float16_t)-0.113270952f,
+    (float16_t)0.993906970f, (float16_t)-0.110222207f,
+    (float16_t)0.994240449f, (float16_t)-0.107172425f,
+    (float16_t)0.994564571f, (float16_t)-0.104121634f,
+    (float16_t)0.994879331f, (float16_t)-0.101069863f,
+    (float16_t)0.995184727f, (float16_t)-0.098017140f,
+    (float16_t)0.995480755f, (float16_t)-0.094963495f,
+    (float16_t)0.995767414f, (float16_t)-0.091908956f,
+    (float16_t)0.996044701f, (float16_t)-0.088853553f,
+    (float16_t)0.996312612f, (float16_t)-0.085797312f,
+    (float16_t)0.996571146f, (float16_t)-0.082740265f,
+    (float16_t)0.996820299f, (float16_t)-0.079682438f,
+    (float16_t)0.997060070f, (float16_t)-0.076623861f,
+    (float16_t)0.997290457f, (float16_t)-0.073564564f,
+    (float16_t)0.997511456f, (float16_t)-0.070504573f,
+    (float16_t)0.997723067f, (float16_t)-0.067443920f,
+    (float16_t)0.997925286f, (float16_t)-0.064382631f,
+    (float16_t)0.998118113f, (float16_t)-0.061320736f,
+    (float16_t)0.998301545f, (float16_t)-0.058258265f,
+    (float16_t)0.998475581f, (float16_t)-0.055195244f,
+    (float16_t)0.998640218f, (float16_t)-0.052131705f,
+    (float16_t)0.998795456f, (float16_t)-0.049067674f,
+    (float16_t)0.998941293f, (float16_t)-0.046003182f,
+    (float16_t)0.999077728f, (float16_t)-0.042938257f,
+    (float16_t)0.999204759f, (float16_t)-0.039872928f,
+    (float16_t)0.999322385f, (float16_t)-0.036807223f,
+    (float16_t)0.999430605f, (float16_t)-0.033741172f,
+    (float16_t)0.999529418f, (float16_t)-0.030674803f,
+    (float16_t)0.999618822f, (float16_t)-0.027608146f,
+    (float16_t)0.999698819f, (float16_t)-0.024541229f,
+    (float16_t)0.999769405f, (float16_t)-0.021474080f,
+    (float16_t)0.999830582f, (float16_t)-0.018406730f,
+    (float16_t)0.999882347f, (float16_t)-0.015339206f,
+    (float16_t)0.999924702f, (float16_t)-0.012271538f,
+    (float16_t)0.999957645f, (float16_t)-0.009203755f,
+    (float16_t)0.999981175f, (float16_t)-0.006135885f,
+    (float16_t)0.999995294f, (float16_t)-0.003067957f
+};
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_4096)
+
+/**
+* \par
+* Example code for Floating-point Twiddle factors Generation:
+* \par
+* <pre>for(i = 0; i< N; i++)
+* {
+* twiddleCoef[2*i]= cos(i * 2*PI/(float)N);
+* twiddleCoef[2*i+1]= sin(i * 2*PI/(float)N);
+* } </pre>
+* \par
+* where N = 4096  and PI = 3.14159265358979
+* \par
+* Cos and Sin values are in interleaved fashion
+*
+*/
+const float16_t twiddleCoefF16_4096[8192] = {
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.999998823f,  (float16_t)0.001533980f,
+    (float16_t)0.999995294f,  (float16_t)0.003067957f,
+    (float16_t)0.999989411f,  (float16_t)0.004601926f,
+    (float16_t)0.999981175f,  (float16_t)0.006135885f,
+    (float16_t)0.999970586f,  (float16_t)0.007669829f,
+    (float16_t)0.999957645f,  (float16_t)0.009203755f,
+    (float16_t)0.999942350f,  (float16_t)0.010737659f,
+    (float16_t)0.999924702f,  (float16_t)0.012271538f,
+    (float16_t)0.999904701f,  (float16_t)0.013805389f,
+    (float16_t)0.999882347f,  (float16_t)0.015339206f,
+    (float16_t)0.999857641f,  (float16_t)0.016872988f,
+    (float16_t)0.999830582f,  (float16_t)0.018406730f,
+    (float16_t)0.999801170f,  (float16_t)0.019940429f,
+    (float16_t)0.999769405f,  (float16_t)0.021474080f,
+    (float16_t)0.999735288f,  (float16_t)0.023007681f,
+    (float16_t)0.999698819f,  (float16_t)0.024541229f,
+    (float16_t)0.999659997f,  (float16_t)0.026074718f,
+    (float16_t)0.999618822f,  (float16_t)0.027608146f,
+    (float16_t)0.999575296f,  (float16_t)0.029141509f,
+    (float16_t)0.999529418f,  (float16_t)0.030674803f,
+    (float16_t)0.999481187f,  (float16_t)0.032208025f,
+    (float16_t)0.999430605f,  (float16_t)0.033741172f,
+    (float16_t)0.999377670f,  (float16_t)0.035274239f,
+    (float16_t)0.999322385f,  (float16_t)0.036807223f,
+    (float16_t)0.999264747f,  (float16_t)0.038340120f,
+    (float16_t)0.999204759f,  (float16_t)0.039872928f,
+    (float16_t)0.999142419f,  (float16_t)0.041405641f,
+    (float16_t)0.999077728f,  (float16_t)0.042938257f,
+    (float16_t)0.999010686f,  (float16_t)0.044470772f,
+    (float16_t)0.998941293f,  (float16_t)0.046003182f,
+    (float16_t)0.998869550f,  (float16_t)0.047535484f,
+    (float16_t)0.998795456f,  (float16_t)0.049067674f,
+    (float16_t)0.998719012f,  (float16_t)0.050599749f,
+    (float16_t)0.998640218f,  (float16_t)0.052131705f,
+    (float16_t)0.998559074f,  (float16_t)0.053663538f,
+    (float16_t)0.998475581f,  (float16_t)0.055195244f,
+    (float16_t)0.998389737f,  (float16_t)0.056726821f,
+    (float16_t)0.998301545f,  (float16_t)0.058258265f,
+    (float16_t)0.998211003f,  (float16_t)0.059789571f,
+    (float16_t)0.998118113f,  (float16_t)0.061320736f,
+    (float16_t)0.998022874f,  (float16_t)0.062851758f,
+    (float16_t)0.997925286f,  (float16_t)0.064382631f,
+    (float16_t)0.997825350f,  (float16_t)0.065913353f,
+    (float16_t)0.997723067f,  (float16_t)0.067443920f,
+    (float16_t)0.997618435f,  (float16_t)0.068974328f,
+    (float16_t)0.997511456f,  (float16_t)0.070504573f,
+    (float16_t)0.997402130f,  (float16_t)0.072034653f,
+    (float16_t)0.997290457f,  (float16_t)0.073564564f,
+    (float16_t)0.997176437f,  (float16_t)0.075094301f,
+    (float16_t)0.997060070f,  (float16_t)0.076623861f,
+    (float16_t)0.996941358f,  (float16_t)0.078153242f,
+    (float16_t)0.996820299f,  (float16_t)0.079682438f,
+    (float16_t)0.996696895f,  (float16_t)0.081211447f,
+    (float16_t)0.996571146f,  (float16_t)0.082740265f,
+    (float16_t)0.996443051f,  (float16_t)0.084268888f,
+    (float16_t)0.996312612f,  (float16_t)0.085797312f,
+    (float16_t)0.996179829f,  (float16_t)0.087325535f,
+    (float16_t)0.996044701f,  (float16_t)0.088853553f,
+    (float16_t)0.995907229f,  (float16_t)0.090381361f,
+    (float16_t)0.995767414f,  (float16_t)0.091908956f,
+    (float16_t)0.995625256f,  (float16_t)0.093436336f,
+    (float16_t)0.995480755f,  (float16_t)0.094963495f,
+    (float16_t)0.995333912f,  (float16_t)0.096490431f,
+    (float16_t)0.995184727f,  (float16_t)0.098017140f,
+    (float16_t)0.995033199f,  (float16_t)0.099543619f,
+    (float16_t)0.994879331f,  (float16_t)0.101069863f,
+    (float16_t)0.994723121f,  (float16_t)0.102595869f,
+    (float16_t)0.994564571f,  (float16_t)0.104121634f,
+    (float16_t)0.994403680f,  (float16_t)0.105647154f,
+    (float16_t)0.994240449f,  (float16_t)0.107172425f,
+    (float16_t)0.994074879f,  (float16_t)0.108697444f,
+    (float16_t)0.993906970f,  (float16_t)0.110222207f,
+    (float16_t)0.993736722f,  (float16_t)0.111746711f,
+    (float16_t)0.993564136f,  (float16_t)0.113270952f,
+    (float16_t)0.993389211f,  (float16_t)0.114794927f,
+    (float16_t)0.993211949f,  (float16_t)0.116318631f,
+    (float16_t)0.993032350f,  (float16_t)0.117842062f,
+    (float16_t)0.992850414f,  (float16_t)0.119365215f,
+    (float16_t)0.992666142f,  (float16_t)0.120888087f,
+    (float16_t)0.992479535f,  (float16_t)0.122410675f,
+    (float16_t)0.992290591f,  (float16_t)0.123932975f,
+    (float16_t)0.992099313f,  (float16_t)0.125454983f,
+    (float16_t)0.991905700f,  (float16_t)0.126976696f,
+    (float16_t)0.991709754f,  (float16_t)0.128498111f,
+    (float16_t)0.991511473f,  (float16_t)0.130019223f,
+    (float16_t)0.991310860f,  (float16_t)0.131540029f,
+    (float16_t)0.991107914f,  (float16_t)0.133060525f,
+    (float16_t)0.990902635f,  (float16_t)0.134580709f,
+    (float16_t)0.990695025f,  (float16_t)0.136100575f,
+    (float16_t)0.990485084f,  (float16_t)0.137620122f,
+    (float16_t)0.990272812f,  (float16_t)0.139139344f,
+    (float16_t)0.990058210f,  (float16_t)0.140658239f,
+    (float16_t)0.989841278f,  (float16_t)0.142176804f,
+    (float16_t)0.989622017f,  (float16_t)0.143695033f,
+    (float16_t)0.989400428f,  (float16_t)0.145212925f,
+    (float16_t)0.989176510f,  (float16_t)0.146730474f,
+    (float16_t)0.988950265f,  (float16_t)0.148247679f,
+    (float16_t)0.988721692f,  (float16_t)0.149764535f,
+    (float16_t)0.988490793f,  (float16_t)0.151281038f,
+    (float16_t)0.988257568f,  (float16_t)0.152797185f,
+    (float16_t)0.988022017f,  (float16_t)0.154312973f,
+    (float16_t)0.987784142f,  (float16_t)0.155828398f,
+    (float16_t)0.987543942f,  (float16_t)0.157343456f,
+    (float16_t)0.987301418f,  (float16_t)0.158858143f,
+    (float16_t)0.987056571f,  (float16_t)0.160372457f,
+    (float16_t)0.986809402f,  (float16_t)0.161886394f,
+    (float16_t)0.986559910f,  (float16_t)0.163399949f,
+    (float16_t)0.986308097f,  (float16_t)0.164913120f,
+    (float16_t)0.986053963f,  (float16_t)0.166425904f,
+    (float16_t)0.985797509f,  (float16_t)0.167938295f,
+    (float16_t)0.985538735f,  (float16_t)0.169450291f,
+    (float16_t)0.985277642f,  (float16_t)0.170961889f,
+    (float16_t)0.985014231f,  (float16_t)0.172473084f,
+    (float16_t)0.984748502f,  (float16_t)0.173983873f,
+    (float16_t)0.984480455f,  (float16_t)0.175494253f,
+    (float16_t)0.984210092f,  (float16_t)0.177004220f,
+    (float16_t)0.983937413f,  (float16_t)0.178513771f,
+    (float16_t)0.983662419f,  (float16_t)0.180022901f,
+    (float16_t)0.983385110f,  (float16_t)0.181531608f,
+    (float16_t)0.983105487f,  (float16_t)0.183039888f,
+    (float16_t)0.982823551f,  (float16_t)0.184547737f,
+    (float16_t)0.982539302f,  (float16_t)0.186055152f,
+    (float16_t)0.982252741f,  (float16_t)0.187562129f,
+    (float16_t)0.981963869f,  (float16_t)0.189068664f,
+    (float16_t)0.981672686f,  (float16_t)0.190574755f,
+    (float16_t)0.981379193f,  (float16_t)0.192080397f,
+    (float16_t)0.981083391f,  (float16_t)0.193585587f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)0.980484862f,  (float16_t)0.196594598f,
+    (float16_t)0.980182136f,  (float16_t)0.198098411f,
+    (float16_t)0.979877104f,  (float16_t)0.199601758f,
+    (float16_t)0.979569766f,  (float16_t)0.201104635f,
+    (float16_t)0.979260123f,  (float16_t)0.202607039f,
+    (float16_t)0.978948175f,  (float16_t)0.204108966f,
+    (float16_t)0.978633924f,  (float16_t)0.205610413f,
+    (float16_t)0.978317371f,  (float16_t)0.207111376f,
+    (float16_t)0.977998515f,  (float16_t)0.208611852f,
+    (float16_t)0.977677358f,  (float16_t)0.210111837f,
+    (float16_t)0.977353900f,  (float16_t)0.211611327f,
+    (float16_t)0.977028143f,  (float16_t)0.213110320f,
+    (float16_t)0.976700086f,  (float16_t)0.214608811f,
+    (float16_t)0.976369731f,  (float16_t)0.216106797f,
+    (float16_t)0.976037079f,  (float16_t)0.217604275f,
+    (float16_t)0.975702130f,  (float16_t)0.219101240f,
+    (float16_t)0.975364885f,  (float16_t)0.220597690f,
+    (float16_t)0.975025345f,  (float16_t)0.222093621f,
+    (float16_t)0.974683511f,  (float16_t)0.223589029f,
+    (float16_t)0.974339383f,  (float16_t)0.225083911f,
+    (float16_t)0.973992962f,  (float16_t)0.226578264f,
+    (float16_t)0.973644250f,  (float16_t)0.228072083f,
+    (float16_t)0.973293246f,  (float16_t)0.229565366f,
+    (float16_t)0.972939952f,  (float16_t)0.231058108f,
+    (float16_t)0.972584369f,  (float16_t)0.232550307f,
+    (float16_t)0.972226497f,  (float16_t)0.234041959f,
+    (float16_t)0.971866337f,  (float16_t)0.235533059f,
+    (float16_t)0.971503891f,  (float16_t)0.237023606f,
+    (float16_t)0.971139158f,  (float16_t)0.238513595f,
+    (float16_t)0.970772141f,  (float16_t)0.240003022f,
+    (float16_t)0.970402839f,  (float16_t)0.241491885f,
+    (float16_t)0.970031253f,  (float16_t)0.242980180f,
+    (float16_t)0.969657385f,  (float16_t)0.244467903f,
+    (float16_t)0.969281235f,  (float16_t)0.245955050f,
+    (float16_t)0.968902805f,  (float16_t)0.247441619f,
+    (float16_t)0.968522094f,  (float16_t)0.248927606f,
+    (float16_t)0.968139105f,  (float16_t)0.250413007f,
+    (float16_t)0.967753837f,  (float16_t)0.251897818f,
+    (float16_t)0.967366292f,  (float16_t)0.253382037f,
+    (float16_t)0.966976471f,  (float16_t)0.254865660f,
+    (float16_t)0.966584374f,  (float16_t)0.256348682f,
+    (float16_t)0.966190003f,  (float16_t)0.257831102f,
+    (float16_t)0.965793359f,  (float16_t)0.259312915f,
+    (float16_t)0.965394442f,  (float16_t)0.260794118f,
+    (float16_t)0.964993253f,  (float16_t)0.262274707f,
+    (float16_t)0.964589793f,  (float16_t)0.263754679f,
+    (float16_t)0.964184064f,  (float16_t)0.265234030f,
+    (float16_t)0.963776066f,  (float16_t)0.266712757f,
+    (float16_t)0.963365800f,  (float16_t)0.268190857f,
+    (float16_t)0.962953267f,  (float16_t)0.269668326f,
+    (float16_t)0.962538468f,  (float16_t)0.271145160f,
+    (float16_t)0.962121404f,  (float16_t)0.272621355f,
+    (float16_t)0.961702077f,  (float16_t)0.274096910f,
+    (float16_t)0.961280486f,  (float16_t)0.275571819f,
+    (float16_t)0.960856633f,  (float16_t)0.277046080f,
+    (float16_t)0.960430519f,  (float16_t)0.278519689f,
+    (float16_t)0.960002146f,  (float16_t)0.279992643f,
+    (float16_t)0.959571513f,  (float16_t)0.281464938f,
+    (float16_t)0.959138622f,  (float16_t)0.282936570f,
+    (float16_t)0.958703475f,  (float16_t)0.284407537f,
+    (float16_t)0.958266071f,  (float16_t)0.285877835f,
+    (float16_t)0.957826413f,  (float16_t)0.287347460f,
+    (float16_t)0.957384501f,  (float16_t)0.288816408f,
+    (float16_t)0.956940336f,  (float16_t)0.290284677f,
+    (float16_t)0.956493919f,  (float16_t)0.291752263f,
+    (float16_t)0.956045251f,  (float16_t)0.293219163f,
+    (float16_t)0.955594334f,  (float16_t)0.294685372f,
+    (float16_t)0.955141168f,  (float16_t)0.296150888f,
+    (float16_t)0.954685755f,  (float16_t)0.297615707f,
+    (float16_t)0.954228095f,  (float16_t)0.299079826f,
+    (float16_t)0.953768190f,  (float16_t)0.300543241f,
+    (float16_t)0.953306040f,  (float16_t)0.302005949f,
+    (float16_t)0.952841648f,  (float16_t)0.303467947f,
+    (float16_t)0.952375013f,  (float16_t)0.304929230f,
+    (float16_t)0.951906137f,  (float16_t)0.306389795f,
+    (float16_t)0.951435021f,  (float16_t)0.307849640f,
+    (float16_t)0.950961666f,  (float16_t)0.309308760f,
+    (float16_t)0.950486074f,  (float16_t)0.310767153f,
+    (float16_t)0.950008245f,  (float16_t)0.312224814f,
+    (float16_t)0.949528181f,  (float16_t)0.313681740f,
+    (float16_t)0.949045882f,  (float16_t)0.315137929f,
+    (float16_t)0.948561350f,  (float16_t)0.316593376f,
+    (float16_t)0.948074586f,  (float16_t)0.318048077f,
+    (float16_t)0.947585591f,  (float16_t)0.319502031f,
+    (float16_t)0.947094366f,  (float16_t)0.320955232f,
+    (float16_t)0.946600913f,  (float16_t)0.322407679f,
+    (float16_t)0.946105232f,  (float16_t)0.323859367f,
+    (float16_t)0.945607325f,  (float16_t)0.325310292f,
+    (float16_t)0.945107193f,  (float16_t)0.326760452f,
+    (float16_t)0.944604837f,  (float16_t)0.328209844f,
+    (float16_t)0.944100258f,  (float16_t)0.329658463f,
+    (float16_t)0.943593458f,  (float16_t)0.331106306f,
+    (float16_t)0.943084437f,  (float16_t)0.332553370f,
+    (float16_t)0.942573198f,  (float16_t)0.333999651f,
+    (float16_t)0.942059740f,  (float16_t)0.335445147f,
+    (float16_t)0.941544065f,  (float16_t)0.336889853f,
+    (float16_t)0.941026175f,  (float16_t)0.338333767f,
+    (float16_t)0.940506071f,  (float16_t)0.339776884f,
+    (float16_t)0.939983753f,  (float16_t)0.341219202f,
+    (float16_t)0.939459224f,  (float16_t)0.342660717f,
+    (float16_t)0.938932484f,  (float16_t)0.344101426f,
+    (float16_t)0.938403534f,  (float16_t)0.345541325f,
+    (float16_t)0.937872376f,  (float16_t)0.346980411f,
+    (float16_t)0.937339012f,  (float16_t)0.348418680f,
+    (float16_t)0.936803442f,  (float16_t)0.349856130f,
+    (float16_t)0.936265667f,  (float16_t)0.351292756f,
+    (float16_t)0.935725689f,  (float16_t)0.352728556f,
+    (float16_t)0.935183510f,  (float16_t)0.354163525f,
+    (float16_t)0.934639130f,  (float16_t)0.355597662f,
+    (float16_t)0.934092550f,  (float16_t)0.357030961f,
+    (float16_t)0.933543773f,  (float16_t)0.358463421f,
+    (float16_t)0.932992799f,  (float16_t)0.359895037f,
+    (float16_t)0.932439629f,  (float16_t)0.361325806f,
+    (float16_t)0.931884266f,  (float16_t)0.362755724f,
+    (float16_t)0.931326709f,  (float16_t)0.364184790f,
+    (float16_t)0.930766961f,  (float16_t)0.365612998f,
+    (float16_t)0.930205023f,  (float16_t)0.367040346f,
+    (float16_t)0.929640896f,  (float16_t)0.368466830f,
+    (float16_t)0.929074581f,  (float16_t)0.369892447f,
+    (float16_t)0.928506080f,  (float16_t)0.371317194f,
+    (float16_t)0.927935395f,  (float16_t)0.372741067f,
+    (float16_t)0.927362526f,  (float16_t)0.374164063f,
+    (float16_t)0.926787474f,  (float16_t)0.375586178f,
+    (float16_t)0.926210242f,  (float16_t)0.377007410f,
+    (float16_t)0.925630831f,  (float16_t)0.378427755f,
+    (float16_t)0.925049241f,  (float16_t)0.379847209f,
+    (float16_t)0.924465474f,  (float16_t)0.381265769f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.923291417f,  (float16_t)0.384100195f,
+    (float16_t)0.922701128f,  (float16_t)0.385516054f,
+    (float16_t)0.922108669f,  (float16_t)0.386931006f,
+    (float16_t)0.921514039f,  (float16_t)0.388345047f,
+    (float16_t)0.920917242f,  (float16_t)0.389758174f,
+    (float16_t)0.920318277f,  (float16_t)0.391170384f,
+    (float16_t)0.919717146f,  (float16_t)0.392581674f,
+    (float16_t)0.919113852f,  (float16_t)0.393992040f,
+    (float16_t)0.918508394f,  (float16_t)0.395401479f,
+    (float16_t)0.917900776f,  (float16_t)0.396809987f,
+    (float16_t)0.917290997f,  (float16_t)0.398217562f,
+    (float16_t)0.916679060f,  (float16_t)0.399624200f,
+    (float16_t)0.916064966f,  (float16_t)0.401029897f,
+    (float16_t)0.915448716f,  (float16_t)0.402434651f,
+    (float16_t)0.914830312f,  (float16_t)0.403838458f,
+    (float16_t)0.914209756f,  (float16_t)0.405241314f,
+    (float16_t)0.913587048f,  (float16_t)0.406643217f,
+    (float16_t)0.912962190f,  (float16_t)0.408044163f,
+    (float16_t)0.912335185f,  (float16_t)0.409444149f,
+    (float16_t)0.911706032f,  (float16_t)0.410843171f,
+    (float16_t)0.911074734f,  (float16_t)0.412241227f,
+    (float16_t)0.910441292f,  (float16_t)0.413638312f,
+    (float16_t)0.909805708f,  (float16_t)0.415034424f,
+    (float16_t)0.909167983f,  (float16_t)0.416429560f,
+    (float16_t)0.908528119f,  (float16_t)0.417823716f,
+    (float16_t)0.907886116f,  (float16_t)0.419216888f,
+    (float16_t)0.907241978f,  (float16_t)0.420609074f,
+    (float16_t)0.906595705f,  (float16_t)0.422000271f,
+    (float16_t)0.905947298f,  (float16_t)0.423390474f,
+    (float16_t)0.905296759f,  (float16_t)0.424779681f,
+    (float16_t)0.904644091f,  (float16_t)0.426167889f,
+    (float16_t)0.903989293f,  (float16_t)0.427555093f,
+    (float16_t)0.903332368f,  (float16_t)0.428941292f,
+    (float16_t)0.902673318f,  (float16_t)0.430326481f,
+    (float16_t)0.902012144f,  (float16_t)0.431710658f,
+    (float16_t)0.901348847f,  (float16_t)0.433093819f,
+    (float16_t)0.900683429f,  (float16_t)0.434475961f,
+    (float16_t)0.900015892f,  (float16_t)0.435857080f,
+    (float16_t)0.899346237f,  (float16_t)0.437237174f,
+    (float16_t)0.898674466f,  (float16_t)0.438616239f,
+    (float16_t)0.898000580f,  (float16_t)0.439994271f,
+    (float16_t)0.897324581f,  (float16_t)0.441371269f,
+    (float16_t)0.896646470f,  (float16_t)0.442747228f,
+    (float16_t)0.895966250f,  (float16_t)0.444122145f,
+    (float16_t)0.895283921f,  (float16_t)0.445496017f,
+    (float16_t)0.894599486f,  (float16_t)0.446868840f,
+    (float16_t)0.893912945f,  (float16_t)0.448240612f,
+    (float16_t)0.893224301f,  (float16_t)0.449611330f,
+    (float16_t)0.892533555f,  (float16_t)0.450980989f,
+    (float16_t)0.891840709f,  (float16_t)0.452349587f,
+    (float16_t)0.891145765f,  (float16_t)0.453717121f,
+    (float16_t)0.890448723f,  (float16_t)0.455083587f,
+    (float16_t)0.889749586f,  (float16_t)0.456448982f,
+    (float16_t)0.889048356f,  (float16_t)0.457813304f,
+    (float16_t)0.888345033f,  (float16_t)0.459176548f,
+    (float16_t)0.887639620f,  (float16_t)0.460538711f,
+    (float16_t)0.886932119f,  (float16_t)0.461899791f,
+    (float16_t)0.886222530f,  (float16_t)0.463259784f,
+    (float16_t)0.885510856f,  (float16_t)0.464618686f,
+    (float16_t)0.884797098f,  (float16_t)0.465976496f,
+    (float16_t)0.884081259f,  (float16_t)0.467333209f,
+    (float16_t)0.883363339f,  (float16_t)0.468688822f,
+    (float16_t)0.882643340f,  (float16_t)0.470043332f,
+    (float16_t)0.881921264f,  (float16_t)0.471396737f,
+    (float16_t)0.881197113f,  (float16_t)0.472749032f,
+    (float16_t)0.880470889f,  (float16_t)0.474100215f,
+    (float16_t)0.879742593f,  (float16_t)0.475450282f,
+    (float16_t)0.879012226f,  (float16_t)0.476799230f,
+    (float16_t)0.878279792f,  (float16_t)0.478147056f,
+    (float16_t)0.877545290f,  (float16_t)0.479493758f,
+    (float16_t)0.876808724f,  (float16_t)0.480839331f,
+    (float16_t)0.876070094f,  (float16_t)0.482183772f,
+    (float16_t)0.875329403f,  (float16_t)0.483527079f,
+    (float16_t)0.874586652f,  (float16_t)0.484869248f,
+    (float16_t)0.873841843f,  (float16_t)0.486210276f,
+    (float16_t)0.873094978f,  (float16_t)0.487550160f,
+    (float16_t)0.872346059f,  (float16_t)0.488888897f,
+    (float16_t)0.871595087f,  (float16_t)0.490226483f,
+    (float16_t)0.870842063f,  (float16_t)0.491562916f,
+    (float16_t)0.870086991f,  (float16_t)0.492898192f,
+    (float16_t)0.869329871f,  (float16_t)0.494232309f,
+    (float16_t)0.868570706f,  (float16_t)0.495565262f,
+    (float16_t)0.867809497f,  (float16_t)0.496897049f,
+    (float16_t)0.867046246f,  (float16_t)0.498227667f,
+    (float16_t)0.866280954f,  (float16_t)0.499557113f,
+    (float16_t)0.865513624f,  (float16_t)0.500885383f,
+    (float16_t)0.864744258f,  (float16_t)0.502212474f,
+    (float16_t)0.863972856f,  (float16_t)0.503538384f,
+    (float16_t)0.863199422f,  (float16_t)0.504863109f,
+    (float16_t)0.862423956f,  (float16_t)0.506186645f,
+    (float16_t)0.861646461f,  (float16_t)0.507508991f,
+    (float16_t)0.860866939f,  (float16_t)0.508830143f,
+    (float16_t)0.860085390f,  (float16_t)0.510150097f,
+    (float16_t)0.859301818f,  (float16_t)0.511468850f,
+    (float16_t)0.858516224f,  (float16_t)0.512786401f,
+    (float16_t)0.857728610f,  (float16_t)0.514102744f,
+    (float16_t)0.856938977f,  (float16_t)0.515417878f,
+    (float16_t)0.856147328f,  (float16_t)0.516731799f,
+    (float16_t)0.855353665f,  (float16_t)0.518044504f,
+    (float16_t)0.854557988f,  (float16_t)0.519355990f,
+    (float16_t)0.853760301f,  (float16_t)0.520666254f,
+    (float16_t)0.852960605f,  (float16_t)0.521975293f,
+    (float16_t)0.852158902f,  (float16_t)0.523283103f,
+    (float16_t)0.851355193f,  (float16_t)0.524589683f,
+    (float16_t)0.850549481f,  (float16_t)0.525895027f,
+    (float16_t)0.849741768f,  (float16_t)0.527199135f,
+    (float16_t)0.848932055f,  (float16_t)0.528502002f,
+    (float16_t)0.848120345f,  (float16_t)0.529803625f,
+    (float16_t)0.847306639f,  (float16_t)0.531104001f,
+    (float16_t)0.846490939f,  (float16_t)0.532403128f,
+    (float16_t)0.845673247f,  (float16_t)0.533701002f,
+    (float16_t)0.844853565f,  (float16_t)0.534997620f,
+    (float16_t)0.844031895f,  (float16_t)0.536292979f,
+    (float16_t)0.843208240f,  (float16_t)0.537587076f,
+    (float16_t)0.842382600f,  (float16_t)0.538879909f,
+    (float16_t)0.841554977f,  (float16_t)0.540171473f,
+    (float16_t)0.840725375f,  (float16_t)0.541461766f,
+    (float16_t)0.839893794f,  (float16_t)0.542750785f,
+    (float16_t)0.839060237f,  (float16_t)0.544038527f,
+    (float16_t)0.838224706f,  (float16_t)0.545324988f,
+    (float16_t)0.837387202f,  (float16_t)0.546610167f,
+    (float16_t)0.836547727f,  (float16_t)0.547894059f,
+    (float16_t)0.835706284f,  (float16_t)0.549176662f,
+    (float16_t)0.834862875f,  (float16_t)0.550457973f,
+    (float16_t)0.834017501f,  (float16_t)0.551737988f,
+    (float16_t)0.833170165f,  (float16_t)0.553016706f,
+    (float16_t)0.832320868f,  (float16_t)0.554294121f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.830616400f,  (float16_t)0.556845037f,
+    (float16_t)0.829761234f,  (float16_t)0.558118531f,
+    (float16_t)0.828904115f,  (float16_t)0.559390712f,
+    (float16_t)0.828045045f,  (float16_t)0.560661576f,
+    (float16_t)0.827184027f,  (float16_t)0.561931121f,
+    (float16_t)0.826321063f,  (float16_t)0.563199344f,
+    (float16_t)0.825456154f,  (float16_t)0.564466242f,
+    (float16_t)0.824589303f,  (float16_t)0.565731811f,
+    (float16_t)0.823720511f,  (float16_t)0.566996049f,
+    (float16_t)0.822849781f,  (float16_t)0.568258953f,
+    (float16_t)0.821977115f,  (float16_t)0.569520519f,
+    (float16_t)0.821102515f,  (float16_t)0.570780746f,
+    (float16_t)0.820225983f,  (float16_t)0.572039629f,
+    (float16_t)0.819347520f,  (float16_t)0.573297167f,
+    (float16_t)0.818467130f,  (float16_t)0.574553355f,
+    (float16_t)0.817584813f,  (float16_t)0.575808191f,
+    (float16_t)0.816700573f,  (float16_t)0.577061673f,
+    (float16_t)0.815814411f,  (float16_t)0.578313796f,
+    (float16_t)0.814926329f,  (float16_t)0.579564559f,
+    (float16_t)0.814036330f,  (float16_t)0.580813958f,
+    (float16_t)0.813144415f,  (float16_t)0.582061990f,
+    (float16_t)0.812250587f,  (float16_t)0.583308653f,
+    (float16_t)0.811354847f,  (float16_t)0.584553943f,
+    (float16_t)0.810457198f,  (float16_t)0.585797857f,
+    (float16_t)0.809557642f,  (float16_t)0.587040394f,
+    (float16_t)0.808656182f,  (float16_t)0.588281548f,
+    (float16_t)0.807752818f,  (float16_t)0.589521319f,
+    (float16_t)0.806847554f,  (float16_t)0.590759702f,
+    (float16_t)0.805940391f,  (float16_t)0.591996695f,
+    (float16_t)0.805031331f,  (float16_t)0.593232295f,
+    (float16_t)0.804120377f,  (float16_t)0.594466499f,
+    (float16_t)0.803207531f,  (float16_t)0.595699304f,
+    (float16_t)0.802292796f,  (float16_t)0.596930708f,
+    (float16_t)0.801376172f,  (float16_t)0.598160707f,
+    (float16_t)0.800457662f,  (float16_t)0.599389298f,
+    (float16_t)0.799537269f,  (float16_t)0.600616479f,
+    (float16_t)0.798614995f,  (float16_t)0.601842247f,
+    (float16_t)0.797690841f,  (float16_t)0.603066599f,
+    (float16_t)0.796764810f,  (float16_t)0.604289531f,
+    (float16_t)0.795836905f,  (float16_t)0.605511041f,
+    (float16_t)0.794907126f,  (float16_t)0.606731127f,
+    (float16_t)0.793975478f,  (float16_t)0.607949785f,
+    (float16_t)0.793041960f,  (float16_t)0.609167012f,
+    (float16_t)0.792106577f,  (float16_t)0.610382806f,
+    (float16_t)0.791169330f,  (float16_t)0.611597164f,
+    (float16_t)0.790230221f,  (float16_t)0.612810082f,
+    (float16_t)0.789289253f,  (float16_t)0.614021559f,
+    (float16_t)0.788346428f,  (float16_t)0.615231591f,
+    (float16_t)0.787401747f,  (float16_t)0.616440175f,
+    (float16_t)0.786455214f,  (float16_t)0.617647308f,
+    (float16_t)0.785506830f,  (float16_t)0.618852988f,
+    (float16_t)0.784556597f,  (float16_t)0.620057212f,
+    (float16_t)0.783604519f,  (float16_t)0.621259977f,
+    (float16_t)0.782650596f,  (float16_t)0.622461279f,
+    (float16_t)0.781694832f,  (float16_t)0.623661118f,
+    (float16_t)0.780737229f,  (float16_t)0.624859488f,
+    (float16_t)0.779777788f,  (float16_t)0.626056388f,
+    (float16_t)0.778816512f,  (float16_t)0.627251815f,
+    (float16_t)0.777853404f,  (float16_t)0.628445767f,
+    (float16_t)0.776888466f,  (float16_t)0.629638239f,
+    (float16_t)0.775921699f,  (float16_t)0.630829230f,
+    (float16_t)0.774953107f,  (float16_t)0.632018736f,
+    (float16_t)0.773982691f,  (float16_t)0.633206755f,
+    (float16_t)0.773010453f,  (float16_t)0.634393284f,
+    (float16_t)0.772036397f,  (float16_t)0.635578320f,
+    (float16_t)0.771060524f,  (float16_t)0.636761861f,
+    (float16_t)0.770082837f,  (float16_t)0.637943904f,
+    (float16_t)0.769103338f,  (float16_t)0.639124445f,
+    (float16_t)0.768122029f,  (float16_t)0.640303482f,
+    (float16_t)0.767138912f,  (float16_t)0.641481013f,
+    (float16_t)0.766153990f,  (float16_t)0.642657034f,
+    (float16_t)0.765167266f,  (float16_t)0.643831543f,
+    (float16_t)0.764178741f,  (float16_t)0.645004537f,
+    (float16_t)0.763188417f,  (float16_t)0.646176013f,
+    (float16_t)0.762196298f,  (float16_t)0.647345969f,
+    (float16_t)0.761202385f,  (float16_t)0.648514401f,
+    (float16_t)0.760206682f,  (float16_t)0.649681307f,
+    (float16_t)0.759209189f,  (float16_t)0.650846685f,
+    (float16_t)0.758209910f,  (float16_t)0.652010531f,
+    (float16_t)0.757208847f,  (float16_t)0.653172843f,
+    (float16_t)0.756206001f,  (float16_t)0.654333618f,
+    (float16_t)0.755201377f,  (float16_t)0.655492853f,
+    (float16_t)0.754194975f,  (float16_t)0.656650546f,
+    (float16_t)0.753186799f,  (float16_t)0.657806693f,
+    (float16_t)0.752176850f,  (float16_t)0.658961293f,
+    (float16_t)0.751165132f,  (float16_t)0.660114342f,
+    (float16_t)0.750151646f,  (float16_t)0.661265838f,
+    (float16_t)0.749136395f,  (float16_t)0.662415778f,
+    (float16_t)0.748119380f,  (float16_t)0.663564159f,
+    (float16_t)0.747100606f,  (float16_t)0.664710978f,
+    (float16_t)0.746080074f,  (float16_t)0.665856234f,
+    (float16_t)0.745057785f,  (float16_t)0.666999922f,
+    (float16_t)0.744033744f,  (float16_t)0.668142041f,
+    (float16_t)0.743007952f,  (float16_t)0.669282588f,
+    (float16_t)0.741980412f,  (float16_t)0.670421560f,
+    (float16_t)0.740951125f,  (float16_t)0.671558955f,
+    (float16_t)0.739920095f,  (float16_t)0.672694769f,
+    (float16_t)0.738887324f,  (float16_t)0.673829000f,
+    (float16_t)0.737852815f,  (float16_t)0.674961646f,
+    (float16_t)0.736816569f,  (float16_t)0.676092704f,
+    (float16_t)0.735778589f,  (float16_t)0.677222170f,
+    (float16_t)0.734738878f,  (float16_t)0.678350043f,
+    (float16_t)0.733697438f,  (float16_t)0.679476320f,
+    (float16_t)0.732654272f,  (float16_t)0.680600998f,
+    (float16_t)0.731609381f,  (float16_t)0.681724074f,
+    (float16_t)0.730562769f,  (float16_t)0.682845546f,
+    (float16_t)0.729514438f,  (float16_t)0.683965412f,
+    (float16_t)0.728464390f,  (float16_t)0.685083668f,
+    (float16_t)0.727412629f,  (float16_t)0.686200312f,
+    (float16_t)0.726359155f,  (float16_t)0.687315341f,
+    (float16_t)0.725303972f,  (float16_t)0.688428753f,
+    (float16_t)0.724247083f,  (float16_t)0.689540545f,
+    (float16_t)0.723188489f,  (float16_t)0.690650714f,
+    (float16_t)0.722128194f,  (float16_t)0.691759258f,
+    (float16_t)0.721066199f,  (float16_t)0.692866175f,
+    (float16_t)0.720002508f,  (float16_t)0.693971461f,
+    (float16_t)0.718937122f,  (float16_t)0.695075114f,
+    (float16_t)0.717870045f,  (float16_t)0.696177131f,
+    (float16_t)0.716801279f,  (float16_t)0.697277511f,
+    (float16_t)0.715730825f,  (float16_t)0.698376249f,
+    (float16_t)0.714658688f,  (float16_t)0.699473345f,
+    (float16_t)0.713584869f,  (float16_t)0.700568794f,
+    (float16_t)0.712509371f,  (float16_t)0.701662595f,
+    (float16_t)0.711432196f,  (float16_t)0.702754744f,
+    (float16_t)0.710353347f,  (float16_t)0.703845241f,
+    (float16_t)0.709272826f,  (float16_t)0.704934080f,
+    (float16_t)0.708190637f,  (float16_t)0.706021261f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.706021261f,  (float16_t)0.708190637f,
+    (float16_t)0.704934080f,  (float16_t)0.709272826f,
+    (float16_t)0.703845241f,  (float16_t)0.710353347f,
+    (float16_t)0.702754744f,  (float16_t)0.711432196f,
+    (float16_t)0.701662595f,  (float16_t)0.712509371f,
+    (float16_t)0.700568794f,  (float16_t)0.713584869f,
+    (float16_t)0.699473345f,  (float16_t)0.714658688f,
+    (float16_t)0.698376249f,  (float16_t)0.715730825f,
+    (float16_t)0.697277511f,  (float16_t)0.716801279f,
+    (float16_t)0.696177131f,  (float16_t)0.717870045f,
+    (float16_t)0.695075114f,  (float16_t)0.718937122f,
+    (float16_t)0.693971461f,  (float16_t)0.720002508f,
+    (float16_t)0.692866175f,  (float16_t)0.721066199f,
+    (float16_t)0.691759258f,  (float16_t)0.722128194f,
+    (float16_t)0.690650714f,  (float16_t)0.723188489f,
+    (float16_t)0.689540545f,  (float16_t)0.724247083f,
+    (float16_t)0.688428753f,  (float16_t)0.725303972f,
+    (float16_t)0.687315341f,  (float16_t)0.726359155f,
+    (float16_t)0.686200312f,  (float16_t)0.727412629f,
+    (float16_t)0.685083668f,  (float16_t)0.728464390f,
+    (float16_t)0.683965412f,  (float16_t)0.729514438f,
+    (float16_t)0.682845546f,  (float16_t)0.730562769f,
+    (float16_t)0.681724074f,  (float16_t)0.731609381f,
+    (float16_t)0.680600998f,  (float16_t)0.732654272f,
+    (float16_t)0.679476320f,  (float16_t)0.733697438f,
+    (float16_t)0.678350043f,  (float16_t)0.734738878f,
+    (float16_t)0.677222170f,  (float16_t)0.735778589f,
+    (float16_t)0.676092704f,  (float16_t)0.736816569f,
+    (float16_t)0.674961646f,  (float16_t)0.737852815f,
+    (float16_t)0.673829000f,  (float16_t)0.738887324f,
+    (float16_t)0.672694769f,  (float16_t)0.739920095f,
+    (float16_t)0.671558955f,  (float16_t)0.740951125f,
+    (float16_t)0.670421560f,  (float16_t)0.741980412f,
+    (float16_t)0.669282588f,  (float16_t)0.743007952f,
+    (float16_t)0.668142041f,  (float16_t)0.744033744f,
+    (float16_t)0.666999922f,  (float16_t)0.745057785f,
+    (float16_t)0.665856234f,  (float16_t)0.746080074f,
+    (float16_t)0.664710978f,  (float16_t)0.747100606f,
+    (float16_t)0.663564159f,  (float16_t)0.748119380f,
+    (float16_t)0.662415778f,  (float16_t)0.749136395f,
+    (float16_t)0.661265838f,  (float16_t)0.750151646f,
+    (float16_t)0.660114342f,  (float16_t)0.751165132f,
+    (float16_t)0.658961293f,  (float16_t)0.752176850f,
+    (float16_t)0.657806693f,  (float16_t)0.753186799f,
+    (float16_t)0.656650546f,  (float16_t)0.754194975f,
+    (float16_t)0.655492853f,  (float16_t)0.755201377f,
+    (float16_t)0.654333618f,  (float16_t)0.756206001f,
+    (float16_t)0.653172843f,  (float16_t)0.757208847f,
+    (float16_t)0.652010531f,  (float16_t)0.758209910f,
+    (float16_t)0.650846685f,  (float16_t)0.759209189f,
+    (float16_t)0.649681307f,  (float16_t)0.760206682f,
+    (float16_t)0.648514401f,  (float16_t)0.761202385f,
+    (float16_t)0.647345969f,  (float16_t)0.762196298f,
+    (float16_t)0.646176013f,  (float16_t)0.763188417f,
+    (float16_t)0.645004537f,  (float16_t)0.764178741f,
+    (float16_t)0.643831543f,  (float16_t)0.765167266f,
+    (float16_t)0.642657034f,  (float16_t)0.766153990f,
+    (float16_t)0.641481013f,  (float16_t)0.767138912f,
+    (float16_t)0.640303482f,  (float16_t)0.768122029f,
+    (float16_t)0.639124445f,  (float16_t)0.769103338f,
+    (float16_t)0.637943904f,  (float16_t)0.770082837f,
+    (float16_t)0.636761861f,  (float16_t)0.771060524f,
+    (float16_t)0.635578320f,  (float16_t)0.772036397f,
+    (float16_t)0.634393284f,  (float16_t)0.773010453f,
+    (float16_t)0.633206755f,  (float16_t)0.773982691f,
+    (float16_t)0.632018736f,  (float16_t)0.774953107f,
+    (float16_t)0.630829230f,  (float16_t)0.775921699f,
+    (float16_t)0.629638239f,  (float16_t)0.776888466f,
+    (float16_t)0.628445767f,  (float16_t)0.777853404f,
+    (float16_t)0.627251815f,  (float16_t)0.778816512f,
+    (float16_t)0.626056388f,  (float16_t)0.779777788f,
+    (float16_t)0.624859488f,  (float16_t)0.780737229f,
+    (float16_t)0.623661118f,  (float16_t)0.781694832f,
+    (float16_t)0.622461279f,  (float16_t)0.782650596f,
+    (float16_t)0.621259977f,  (float16_t)0.783604519f,
+    (float16_t)0.620057212f,  (float16_t)0.784556597f,
+    (float16_t)0.618852988f,  (float16_t)0.785506830f,
+    (float16_t)0.617647308f,  (float16_t)0.786455214f,
+    (float16_t)0.616440175f,  (float16_t)0.787401747f,
+    (float16_t)0.615231591f,  (float16_t)0.788346428f,
+    (float16_t)0.614021559f,  (float16_t)0.789289253f,
+    (float16_t)0.612810082f,  (float16_t)0.790230221f,
+    (float16_t)0.611597164f,  (float16_t)0.791169330f,
+    (float16_t)0.610382806f,  (float16_t)0.792106577f,
+    (float16_t)0.609167012f,  (float16_t)0.793041960f,
+    (float16_t)0.607949785f,  (float16_t)0.793975478f,
+    (float16_t)0.606731127f,  (float16_t)0.794907126f,
+    (float16_t)0.605511041f,  (float16_t)0.795836905f,
+    (float16_t)0.604289531f,  (float16_t)0.796764810f,
+    (float16_t)0.603066599f,  (float16_t)0.797690841f,
+    (float16_t)0.601842247f,  (float16_t)0.798614995f,
+    (float16_t)0.600616479f,  (float16_t)0.799537269f,
+    (float16_t)0.599389298f,  (float16_t)0.800457662f,
+    (float16_t)0.598160707f,  (float16_t)0.801376172f,
+    (float16_t)0.596930708f,  (float16_t)0.802292796f,
+    (float16_t)0.595699304f,  (float16_t)0.803207531f,
+    (float16_t)0.594466499f,  (float16_t)0.804120377f,
+    (float16_t)0.593232295f,  (float16_t)0.805031331f,
+    (float16_t)0.591996695f,  (float16_t)0.805940391f,
+    (float16_t)0.590759702f,  (float16_t)0.806847554f,
+    (float16_t)0.589521319f,  (float16_t)0.807752818f,
+    (float16_t)0.588281548f,  (float16_t)0.808656182f,
+    (float16_t)0.587040394f,  (float16_t)0.809557642f,
+    (float16_t)0.585797857f,  (float16_t)0.810457198f,
+    (float16_t)0.584553943f,  (float16_t)0.811354847f,
+    (float16_t)0.583308653f,  (float16_t)0.812250587f,
+    (float16_t)0.582061990f,  (float16_t)0.813144415f,
+    (float16_t)0.580813958f,  (float16_t)0.814036330f,
+    (float16_t)0.579564559f,  (float16_t)0.814926329f,
+    (float16_t)0.578313796f,  (float16_t)0.815814411f,
+    (float16_t)0.577061673f,  (float16_t)0.816700573f,
+    (float16_t)0.575808191f,  (float16_t)0.817584813f,
+    (float16_t)0.574553355f,  (float16_t)0.818467130f,
+    (float16_t)0.573297167f,  (float16_t)0.819347520f,
+    (float16_t)0.572039629f,  (float16_t)0.820225983f,
+    (float16_t)0.570780746f,  (float16_t)0.821102515f,
+    (float16_t)0.569520519f,  (float16_t)0.821977115f,
+    (float16_t)0.568258953f,  (float16_t)0.822849781f,
+    (float16_t)0.566996049f,  (float16_t)0.823720511f,
+    (float16_t)0.565731811f,  (float16_t)0.824589303f,
+    (float16_t)0.564466242f,  (float16_t)0.825456154f,
+    (float16_t)0.563199344f,  (float16_t)0.826321063f,
+    (float16_t)0.561931121f,  (float16_t)0.827184027f,
+    (float16_t)0.560661576f,  (float16_t)0.828045045f,
+    (float16_t)0.559390712f,  (float16_t)0.828904115f,
+    (float16_t)0.558118531f,  (float16_t)0.829761234f,
+    (float16_t)0.556845037f,  (float16_t)0.830616400f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.554294121f,  (float16_t)0.832320868f,
+    (float16_t)0.553016706f,  (float16_t)0.833170165f,
+    (float16_t)0.551737988f,  (float16_t)0.834017501f,
+    (float16_t)0.550457973f,  (float16_t)0.834862875f,
+    (float16_t)0.549176662f,  (float16_t)0.835706284f,
+    (float16_t)0.547894059f,  (float16_t)0.836547727f,
+    (float16_t)0.546610167f,  (float16_t)0.837387202f,
+    (float16_t)0.545324988f,  (float16_t)0.838224706f,
+    (float16_t)0.544038527f,  (float16_t)0.839060237f,
+    (float16_t)0.542750785f,  (float16_t)0.839893794f,
+    (float16_t)0.541461766f,  (float16_t)0.840725375f,
+    (float16_t)0.540171473f,  (float16_t)0.841554977f,
+    (float16_t)0.538879909f,  (float16_t)0.842382600f,
+    (float16_t)0.537587076f,  (float16_t)0.843208240f,
+    (float16_t)0.536292979f,  (float16_t)0.844031895f,
+    (float16_t)0.534997620f,  (float16_t)0.844853565f,
+    (float16_t)0.533701002f,  (float16_t)0.845673247f,
+    (float16_t)0.532403128f,  (float16_t)0.846490939f,
+    (float16_t)0.531104001f,  (float16_t)0.847306639f,
+    (float16_t)0.529803625f,  (float16_t)0.848120345f,
+    (float16_t)0.528502002f,  (float16_t)0.848932055f,
+    (float16_t)0.527199135f,  (float16_t)0.849741768f,
+    (float16_t)0.525895027f,  (float16_t)0.850549481f,
+    (float16_t)0.524589683f,  (float16_t)0.851355193f,
+    (float16_t)0.523283103f,  (float16_t)0.852158902f,
+    (float16_t)0.521975293f,  (float16_t)0.852960605f,
+    (float16_t)0.520666254f,  (float16_t)0.853760301f,
+    (float16_t)0.519355990f,  (float16_t)0.854557988f,
+    (float16_t)0.518044504f,  (float16_t)0.855353665f,
+    (float16_t)0.516731799f,  (float16_t)0.856147328f,
+    (float16_t)0.515417878f,  (float16_t)0.856938977f,
+    (float16_t)0.514102744f,  (float16_t)0.857728610f,
+    (float16_t)0.512786401f,  (float16_t)0.858516224f,
+    (float16_t)0.511468850f,  (float16_t)0.859301818f,
+    (float16_t)0.510150097f,  (float16_t)0.860085390f,
+    (float16_t)0.508830143f,  (float16_t)0.860866939f,
+    (float16_t)0.507508991f,  (float16_t)0.861646461f,
+    (float16_t)0.506186645f,  (float16_t)0.862423956f,
+    (float16_t)0.504863109f,  (float16_t)0.863199422f,
+    (float16_t)0.503538384f,  (float16_t)0.863972856f,
+    (float16_t)0.502212474f,  (float16_t)0.864744258f,
+    (float16_t)0.500885383f,  (float16_t)0.865513624f,
+    (float16_t)0.499557113f,  (float16_t)0.866280954f,
+    (float16_t)0.498227667f,  (float16_t)0.867046246f,
+    (float16_t)0.496897049f,  (float16_t)0.867809497f,
+    (float16_t)0.495565262f,  (float16_t)0.868570706f,
+    (float16_t)0.494232309f,  (float16_t)0.869329871f,
+    (float16_t)0.492898192f,  (float16_t)0.870086991f,
+    (float16_t)0.491562916f,  (float16_t)0.870842063f,
+    (float16_t)0.490226483f,  (float16_t)0.871595087f,
+    (float16_t)0.488888897f,  (float16_t)0.872346059f,
+    (float16_t)0.487550160f,  (float16_t)0.873094978f,
+    (float16_t)0.486210276f,  (float16_t)0.873841843f,
+    (float16_t)0.484869248f,  (float16_t)0.874586652f,
+    (float16_t)0.483527079f,  (float16_t)0.875329403f,
+    (float16_t)0.482183772f,  (float16_t)0.876070094f,
+    (float16_t)0.480839331f,  (float16_t)0.876808724f,
+    (float16_t)0.479493758f,  (float16_t)0.877545290f,
+    (float16_t)0.478147056f,  (float16_t)0.878279792f,
+    (float16_t)0.476799230f,  (float16_t)0.879012226f,
+    (float16_t)0.475450282f,  (float16_t)0.879742593f,
+    (float16_t)0.474100215f,  (float16_t)0.880470889f,
+    (float16_t)0.472749032f,  (float16_t)0.881197113f,
+    (float16_t)0.471396737f,  (float16_t)0.881921264f,
+    (float16_t)0.470043332f,  (float16_t)0.882643340f,
+    (float16_t)0.468688822f,  (float16_t)0.883363339f,
+    (float16_t)0.467333209f,  (float16_t)0.884081259f,
+    (float16_t)0.465976496f,  (float16_t)0.884797098f,
+    (float16_t)0.464618686f,  (float16_t)0.885510856f,
+    (float16_t)0.463259784f,  (float16_t)0.886222530f,
+    (float16_t)0.461899791f,  (float16_t)0.886932119f,
+    (float16_t)0.460538711f,  (float16_t)0.887639620f,
+    (float16_t)0.459176548f,  (float16_t)0.888345033f,
+    (float16_t)0.457813304f,  (float16_t)0.889048356f,
+    (float16_t)0.456448982f,  (float16_t)0.889749586f,
+    (float16_t)0.455083587f,  (float16_t)0.890448723f,
+    (float16_t)0.453717121f,  (float16_t)0.891145765f,
+    (float16_t)0.452349587f,  (float16_t)0.891840709f,
+    (float16_t)0.450980989f,  (float16_t)0.892533555f,
+    (float16_t)0.449611330f,  (float16_t)0.893224301f,
+    (float16_t)0.448240612f,  (float16_t)0.893912945f,
+    (float16_t)0.446868840f,  (float16_t)0.894599486f,
+    (float16_t)0.445496017f,  (float16_t)0.895283921f,
+    (float16_t)0.444122145f,  (float16_t)0.895966250f,
+    (float16_t)0.442747228f,  (float16_t)0.896646470f,
+    (float16_t)0.441371269f,  (float16_t)0.897324581f,
+    (float16_t)0.439994271f,  (float16_t)0.898000580f,
+    (float16_t)0.438616239f,  (float16_t)0.898674466f,
+    (float16_t)0.437237174f,  (float16_t)0.899346237f,
+    (float16_t)0.435857080f,  (float16_t)0.900015892f,
+    (float16_t)0.434475961f,  (float16_t)0.900683429f,
+    (float16_t)0.433093819f,  (float16_t)0.901348847f,
+    (float16_t)0.431710658f,  (float16_t)0.902012144f,
+    (float16_t)0.430326481f,  (float16_t)0.902673318f,
+    (float16_t)0.428941292f,  (float16_t)0.903332368f,
+    (float16_t)0.427555093f,  (float16_t)0.903989293f,
+    (float16_t)0.426167889f,  (float16_t)0.904644091f,
+    (float16_t)0.424779681f,  (float16_t)0.905296759f,
+    (float16_t)0.423390474f,  (float16_t)0.905947298f,
+    (float16_t)0.422000271f,  (float16_t)0.906595705f,
+    (float16_t)0.420609074f,  (float16_t)0.907241978f,
+    (float16_t)0.419216888f,  (float16_t)0.907886116f,
+    (float16_t)0.417823716f,  (float16_t)0.908528119f,
+    (float16_t)0.416429560f,  (float16_t)0.909167983f,
+    (float16_t)0.415034424f,  (float16_t)0.909805708f,
+    (float16_t)0.413638312f,  (float16_t)0.910441292f,
+    (float16_t)0.412241227f,  (float16_t)0.911074734f,
+    (float16_t)0.410843171f,  (float16_t)0.911706032f,
+    (float16_t)0.409444149f,  (float16_t)0.912335185f,
+    (float16_t)0.408044163f,  (float16_t)0.912962190f,
+    (float16_t)0.406643217f,  (float16_t)0.913587048f,
+    (float16_t)0.405241314f,  (float16_t)0.914209756f,
+    (float16_t)0.403838458f,  (float16_t)0.914830312f,
+    (float16_t)0.402434651f,  (float16_t)0.915448716f,
+    (float16_t)0.401029897f,  (float16_t)0.916064966f,
+    (float16_t)0.399624200f,  (float16_t)0.916679060f,
+    (float16_t)0.398217562f,  (float16_t)0.917290997f,
+    (float16_t)0.396809987f,  (float16_t)0.917900776f,
+    (float16_t)0.395401479f,  (float16_t)0.918508394f,
+    (float16_t)0.393992040f,  (float16_t)0.919113852f,
+    (float16_t)0.392581674f,  (float16_t)0.919717146f,
+    (float16_t)0.391170384f,  (float16_t)0.920318277f,
+    (float16_t)0.389758174f,  (float16_t)0.920917242f,
+    (float16_t)0.388345047f,  (float16_t)0.921514039f,
+    (float16_t)0.386931006f,  (float16_t)0.922108669f,
+    (float16_t)0.385516054f,  (float16_t)0.922701128f,
+    (float16_t)0.384100195f,  (float16_t)0.923291417f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.381265769f,  (float16_t)0.924465474f,
+    (float16_t)0.379847209f,  (float16_t)0.925049241f,
+    (float16_t)0.378427755f,  (float16_t)0.925630831f,
+    (float16_t)0.377007410f,  (float16_t)0.926210242f,
+    (float16_t)0.375586178f,  (float16_t)0.926787474f,
+    (float16_t)0.374164063f,  (float16_t)0.927362526f,
+    (float16_t)0.372741067f,  (float16_t)0.927935395f,
+    (float16_t)0.371317194f,  (float16_t)0.928506080f,
+    (float16_t)0.369892447f,  (float16_t)0.929074581f,
+    (float16_t)0.368466830f,  (float16_t)0.929640896f,
+    (float16_t)0.367040346f,  (float16_t)0.930205023f,
+    (float16_t)0.365612998f,  (float16_t)0.930766961f,
+    (float16_t)0.364184790f,  (float16_t)0.931326709f,
+    (float16_t)0.362755724f,  (float16_t)0.931884266f,
+    (float16_t)0.361325806f,  (float16_t)0.932439629f,
+    (float16_t)0.359895037f,  (float16_t)0.932992799f,
+    (float16_t)0.358463421f,  (float16_t)0.933543773f,
+    (float16_t)0.357030961f,  (float16_t)0.934092550f,
+    (float16_t)0.355597662f,  (float16_t)0.934639130f,
+    (float16_t)0.354163525f,  (float16_t)0.935183510f,
+    (float16_t)0.352728556f,  (float16_t)0.935725689f,
+    (float16_t)0.351292756f,  (float16_t)0.936265667f,
+    (float16_t)0.349856130f,  (float16_t)0.936803442f,
+    (float16_t)0.348418680f,  (float16_t)0.937339012f,
+    (float16_t)0.346980411f,  (float16_t)0.937872376f,
+    (float16_t)0.345541325f,  (float16_t)0.938403534f,
+    (float16_t)0.344101426f,  (float16_t)0.938932484f,
+    (float16_t)0.342660717f,  (float16_t)0.939459224f,
+    (float16_t)0.341219202f,  (float16_t)0.939983753f,
+    (float16_t)0.339776884f,  (float16_t)0.940506071f,
+    (float16_t)0.338333767f,  (float16_t)0.941026175f,
+    (float16_t)0.336889853f,  (float16_t)0.941544065f,
+    (float16_t)0.335445147f,  (float16_t)0.942059740f,
+    (float16_t)0.333999651f,  (float16_t)0.942573198f,
+    (float16_t)0.332553370f,  (float16_t)0.943084437f,
+    (float16_t)0.331106306f,  (float16_t)0.943593458f,
+    (float16_t)0.329658463f,  (float16_t)0.944100258f,
+    (float16_t)0.328209844f,  (float16_t)0.944604837f,
+    (float16_t)0.326760452f,  (float16_t)0.945107193f,
+    (float16_t)0.325310292f,  (float16_t)0.945607325f,
+    (float16_t)0.323859367f,  (float16_t)0.946105232f,
+    (float16_t)0.322407679f,  (float16_t)0.946600913f,
+    (float16_t)0.320955232f,  (float16_t)0.947094366f,
+    (float16_t)0.319502031f,  (float16_t)0.947585591f,
+    (float16_t)0.318048077f,  (float16_t)0.948074586f,
+    (float16_t)0.316593376f,  (float16_t)0.948561350f,
+    (float16_t)0.315137929f,  (float16_t)0.949045882f,
+    (float16_t)0.313681740f,  (float16_t)0.949528181f,
+    (float16_t)0.312224814f,  (float16_t)0.950008245f,
+    (float16_t)0.310767153f,  (float16_t)0.950486074f,
+    (float16_t)0.309308760f,  (float16_t)0.950961666f,
+    (float16_t)0.307849640f,  (float16_t)0.951435021f,
+    (float16_t)0.306389795f,  (float16_t)0.951906137f,
+    (float16_t)0.304929230f,  (float16_t)0.952375013f,
+    (float16_t)0.303467947f,  (float16_t)0.952841648f,
+    (float16_t)0.302005949f,  (float16_t)0.953306040f,
+    (float16_t)0.300543241f,  (float16_t)0.953768190f,
+    (float16_t)0.299079826f,  (float16_t)0.954228095f,
+    (float16_t)0.297615707f,  (float16_t)0.954685755f,
+    (float16_t)0.296150888f,  (float16_t)0.955141168f,
+    (float16_t)0.294685372f,  (float16_t)0.955594334f,
+    (float16_t)0.293219163f,  (float16_t)0.956045251f,
+    (float16_t)0.291752263f,  (float16_t)0.956493919f,
+    (float16_t)0.290284677f,  (float16_t)0.956940336f,
+    (float16_t)0.288816408f,  (float16_t)0.957384501f,
+    (float16_t)0.287347460f,  (float16_t)0.957826413f,
+    (float16_t)0.285877835f,  (float16_t)0.958266071f,
+    (float16_t)0.284407537f,  (float16_t)0.958703475f,
+    (float16_t)0.282936570f,  (float16_t)0.959138622f,
+    (float16_t)0.281464938f,  (float16_t)0.959571513f,
+    (float16_t)0.279992643f,  (float16_t)0.960002146f,
+    (float16_t)0.278519689f,  (float16_t)0.960430519f,
+    (float16_t)0.277046080f,  (float16_t)0.960856633f,
+    (float16_t)0.275571819f,  (float16_t)0.961280486f,
+    (float16_t)0.274096910f,  (float16_t)0.961702077f,
+    (float16_t)0.272621355f,  (float16_t)0.962121404f,
+    (float16_t)0.271145160f,  (float16_t)0.962538468f,
+    (float16_t)0.269668326f,  (float16_t)0.962953267f,
+    (float16_t)0.268190857f,  (float16_t)0.963365800f,
+    (float16_t)0.266712757f,  (float16_t)0.963776066f,
+    (float16_t)0.265234030f,  (float16_t)0.964184064f,
+    (float16_t)0.263754679f,  (float16_t)0.964589793f,
+    (float16_t)0.262274707f,  (float16_t)0.964993253f,
+    (float16_t)0.260794118f,  (float16_t)0.965394442f,
+    (float16_t)0.259312915f,  (float16_t)0.965793359f,
+    (float16_t)0.257831102f,  (float16_t)0.966190003f,
+    (float16_t)0.256348682f,  (float16_t)0.966584374f,
+    (float16_t)0.254865660f,  (float16_t)0.966976471f,
+    (float16_t)0.253382037f,  (float16_t)0.967366292f,
+    (float16_t)0.251897818f,  (float16_t)0.967753837f,
+    (float16_t)0.250413007f,  (float16_t)0.968139105f,
+    (float16_t)0.248927606f,  (float16_t)0.968522094f,
+    (float16_t)0.247441619f,  (float16_t)0.968902805f,
+    (float16_t)0.245955050f,  (float16_t)0.969281235f,
+    (float16_t)0.244467903f,  (float16_t)0.969657385f,
+    (float16_t)0.242980180f,  (float16_t)0.970031253f,
+    (float16_t)0.241491885f,  (float16_t)0.970402839f,
+    (float16_t)0.240003022f,  (float16_t)0.970772141f,
+    (float16_t)0.238513595f,  (float16_t)0.971139158f,
+    (float16_t)0.237023606f,  (float16_t)0.971503891f,
+    (float16_t)0.235533059f,  (float16_t)0.971866337f,
+    (float16_t)0.234041959f,  (float16_t)0.972226497f,
+    (float16_t)0.232550307f,  (float16_t)0.972584369f,
+    (float16_t)0.231058108f,  (float16_t)0.972939952f,
+    (float16_t)0.229565366f,  (float16_t)0.973293246f,
+    (float16_t)0.228072083f,  (float16_t)0.973644250f,
+    (float16_t)0.226578264f,  (float16_t)0.973992962f,
+    (float16_t)0.225083911f,  (float16_t)0.974339383f,
+    (float16_t)0.223589029f,  (float16_t)0.974683511f,
+    (float16_t)0.222093621f,  (float16_t)0.975025345f,
+    (float16_t)0.220597690f,  (float16_t)0.975364885f,
+    (float16_t)0.219101240f,  (float16_t)0.975702130f,
+    (float16_t)0.217604275f,  (float16_t)0.976037079f,
+    (float16_t)0.216106797f,  (float16_t)0.976369731f,
+    (float16_t)0.214608811f,  (float16_t)0.976700086f,
+    (float16_t)0.213110320f,  (float16_t)0.977028143f,
+    (float16_t)0.211611327f,  (float16_t)0.977353900f,
+    (float16_t)0.210111837f,  (float16_t)0.977677358f,
+    (float16_t)0.208611852f,  (float16_t)0.977998515f,
+    (float16_t)0.207111376f,  (float16_t)0.978317371f,
+    (float16_t)0.205610413f,  (float16_t)0.978633924f,
+    (float16_t)0.204108966f,  (float16_t)0.978948175f,
+    (float16_t)0.202607039f,  (float16_t)0.979260123f,
+    (float16_t)0.201104635f,  (float16_t)0.979569766f,
+    (float16_t)0.199601758f,  (float16_t)0.979877104f,
+    (float16_t)0.198098411f,  (float16_t)0.980182136f,
+    (float16_t)0.196594598f,  (float16_t)0.980484862f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.193585587f,  (float16_t)0.981083391f,
+    (float16_t)0.192080397f,  (float16_t)0.981379193f,
+    (float16_t)0.190574755f,  (float16_t)0.981672686f,
+    (float16_t)0.189068664f,  (float16_t)0.981963869f,
+    (float16_t)0.187562129f,  (float16_t)0.982252741f,
+    (float16_t)0.186055152f,  (float16_t)0.982539302f,
+    (float16_t)0.184547737f,  (float16_t)0.982823551f,
+    (float16_t)0.183039888f,  (float16_t)0.983105487f,
+    (float16_t)0.181531608f,  (float16_t)0.983385110f,
+    (float16_t)0.180022901f,  (float16_t)0.983662419f,
+    (float16_t)0.178513771f,  (float16_t)0.983937413f,
+    (float16_t)0.177004220f,  (float16_t)0.984210092f,
+    (float16_t)0.175494253f,  (float16_t)0.984480455f,
+    (float16_t)0.173983873f,  (float16_t)0.984748502f,
+    (float16_t)0.172473084f,  (float16_t)0.985014231f,
+    (float16_t)0.170961889f,  (float16_t)0.985277642f,
+    (float16_t)0.169450291f,  (float16_t)0.985538735f,
+    (float16_t)0.167938295f,  (float16_t)0.985797509f,
+    (float16_t)0.166425904f,  (float16_t)0.986053963f,
+    (float16_t)0.164913120f,  (float16_t)0.986308097f,
+    (float16_t)0.163399949f,  (float16_t)0.986559910f,
+    (float16_t)0.161886394f,  (float16_t)0.986809402f,
+    (float16_t)0.160372457f,  (float16_t)0.987056571f,
+    (float16_t)0.158858143f,  (float16_t)0.987301418f,
+    (float16_t)0.157343456f,  (float16_t)0.987543942f,
+    (float16_t)0.155828398f,  (float16_t)0.987784142f,
+    (float16_t)0.154312973f,  (float16_t)0.988022017f,
+    (float16_t)0.152797185f,  (float16_t)0.988257568f,
+    (float16_t)0.151281038f,  (float16_t)0.988490793f,
+    (float16_t)0.149764535f,  (float16_t)0.988721692f,
+    (float16_t)0.148247679f,  (float16_t)0.988950265f,
+    (float16_t)0.146730474f,  (float16_t)0.989176510f,
+    (float16_t)0.145212925f,  (float16_t)0.989400428f,
+    (float16_t)0.143695033f,  (float16_t)0.989622017f,
+    (float16_t)0.142176804f,  (float16_t)0.989841278f,
+    (float16_t)0.140658239f,  (float16_t)0.990058210f,
+    (float16_t)0.139139344f,  (float16_t)0.990272812f,
+    (float16_t)0.137620122f,  (float16_t)0.990485084f,
+    (float16_t)0.136100575f,  (float16_t)0.990695025f,
+    (float16_t)0.134580709f,  (float16_t)0.990902635f,
+    (float16_t)0.133060525f,  (float16_t)0.991107914f,
+    (float16_t)0.131540029f,  (float16_t)0.991310860f,
+    (float16_t)0.130019223f,  (float16_t)0.991511473f,
+    (float16_t)0.128498111f,  (float16_t)0.991709754f,
+    (float16_t)0.126976696f,  (float16_t)0.991905700f,
+    (float16_t)0.125454983f,  (float16_t)0.992099313f,
+    (float16_t)0.123932975f,  (float16_t)0.992290591f,
+    (float16_t)0.122410675f,  (float16_t)0.992479535f,
+    (float16_t)0.120888087f,  (float16_t)0.992666142f,
+    (float16_t)0.119365215f,  (float16_t)0.992850414f,
+    (float16_t)0.117842062f,  (float16_t)0.993032350f,
+    (float16_t)0.116318631f,  (float16_t)0.993211949f,
+    (float16_t)0.114794927f,  (float16_t)0.993389211f,
+    (float16_t)0.113270952f,  (float16_t)0.993564136f,
+    (float16_t)0.111746711f,  (float16_t)0.993736722f,
+    (float16_t)0.110222207f,  (float16_t)0.993906970f,
+    (float16_t)0.108697444f,  (float16_t)0.994074879f,
+    (float16_t)0.107172425f,  (float16_t)0.994240449f,
+    (float16_t)0.105647154f,  (float16_t)0.994403680f,
+    (float16_t)0.104121634f,  (float16_t)0.994564571f,
+    (float16_t)0.102595869f,  (float16_t)0.994723121f,
+    (float16_t)0.101069863f,  (float16_t)0.994879331f,
+    (float16_t)0.099543619f,  (float16_t)0.995033199f,
+    (float16_t)0.098017140f,  (float16_t)0.995184727f,
+    (float16_t)0.096490431f,  (float16_t)0.995333912f,
+    (float16_t)0.094963495f,  (float16_t)0.995480755f,
+    (float16_t)0.093436336f,  (float16_t)0.995625256f,
+    (float16_t)0.091908956f,  (float16_t)0.995767414f,
+    (float16_t)0.090381361f,  (float16_t)0.995907229f,
+    (float16_t)0.088853553f,  (float16_t)0.996044701f,
+    (float16_t)0.087325535f,  (float16_t)0.996179829f,
+    (float16_t)0.085797312f,  (float16_t)0.996312612f,
+    (float16_t)0.084268888f,  (float16_t)0.996443051f,
+    (float16_t)0.082740265f,  (float16_t)0.996571146f,
+    (float16_t)0.081211447f,  (float16_t)0.996696895f,
+    (float16_t)0.079682438f,  (float16_t)0.996820299f,
+    (float16_t)0.078153242f,  (float16_t)0.996941358f,
+    (float16_t)0.076623861f,  (float16_t)0.997060070f,
+    (float16_t)0.075094301f,  (float16_t)0.997176437f,
+    (float16_t)0.073564564f,  (float16_t)0.997290457f,
+    (float16_t)0.072034653f,  (float16_t)0.997402130f,
+    (float16_t)0.070504573f,  (float16_t)0.997511456f,
+    (float16_t)0.068974328f,  (float16_t)0.997618435f,
+    (float16_t)0.067443920f,  (float16_t)0.997723067f,
+    (float16_t)0.065913353f,  (float16_t)0.997825350f,
+    (float16_t)0.064382631f,  (float16_t)0.997925286f,
+    (float16_t)0.062851758f,  (float16_t)0.998022874f,
+    (float16_t)0.061320736f,  (float16_t)0.998118113f,
+    (float16_t)0.059789571f,  (float16_t)0.998211003f,
+    (float16_t)0.058258265f,  (float16_t)0.998301545f,
+    (float16_t)0.056726821f,  (float16_t)0.998389737f,
+    (float16_t)0.055195244f,  (float16_t)0.998475581f,
+    (float16_t)0.053663538f,  (float16_t)0.998559074f,
+    (float16_t)0.052131705f,  (float16_t)0.998640218f,
+    (float16_t)0.050599749f,  (float16_t)0.998719012f,
+    (float16_t)0.049067674f,  (float16_t)0.998795456f,
+    (float16_t)0.047535484f,  (float16_t)0.998869550f,
+    (float16_t)0.046003182f,  (float16_t)0.998941293f,
+    (float16_t)0.044470772f,  (float16_t)0.999010686f,
+    (float16_t)0.042938257f,  (float16_t)0.999077728f,
+    (float16_t)0.041405641f,  (float16_t)0.999142419f,
+    (float16_t)0.039872928f,  (float16_t)0.999204759f,
+    (float16_t)0.038340120f,  (float16_t)0.999264747f,
+    (float16_t)0.036807223f,  (float16_t)0.999322385f,
+    (float16_t)0.035274239f,  (float16_t)0.999377670f,
+    (float16_t)0.033741172f,  (float16_t)0.999430605f,
+    (float16_t)0.032208025f,  (float16_t)0.999481187f,
+    (float16_t)0.030674803f,  (float16_t)0.999529418f,
+    (float16_t)0.029141509f,  (float16_t)0.999575296f,
+    (float16_t)0.027608146f,  (float16_t)0.999618822f,
+    (float16_t)0.026074718f,  (float16_t)0.999659997f,
+    (float16_t)0.024541229f,  (float16_t)0.999698819f,
+    (float16_t)0.023007681f,  (float16_t)0.999735288f,
+    (float16_t)0.021474080f,  (float16_t)0.999769405f,
+    (float16_t)0.019940429f,  (float16_t)0.999801170f,
+    (float16_t)0.018406730f,  (float16_t)0.999830582f,
+    (float16_t)0.016872988f,  (float16_t)0.999857641f,
+    (float16_t)0.015339206f,  (float16_t)0.999882347f,
+    (float16_t)0.013805389f,  (float16_t)0.999904701f,
+    (float16_t)0.012271538f,  (float16_t)0.999924702f,
+    (float16_t)0.010737659f,  (float16_t)0.999942350f,
+    (float16_t)0.009203755f,  (float16_t)0.999957645f,
+    (float16_t)0.007669829f,  (float16_t)0.999970586f,
+    (float16_t)0.006135885f,  (float16_t)0.999981175f,
+    (float16_t)0.004601926f,  (float16_t)0.999989411f,
+    (float16_t)0.003067957f,  (float16_t)0.999995294f,
+    (float16_t)0.001533980f,  (float16_t)0.999998823f,
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+   (float16_t)-0.001533980f,  (float16_t)0.999998823f,
+   (float16_t)-0.003067957f,  (float16_t)0.999995294f,
+   (float16_t)-0.004601926f,  (float16_t)0.999989411f,
+   (float16_t)-0.006135885f,  (float16_t)0.999981175f,
+   (float16_t)-0.007669829f,  (float16_t)0.999970586f,
+   (float16_t)-0.009203755f,  (float16_t)0.999957645f,
+   (float16_t)-0.010737659f,  (float16_t)0.999942350f,
+   (float16_t)-0.012271538f,  (float16_t)0.999924702f,
+   (float16_t)-0.013805389f,  (float16_t)0.999904701f,
+   (float16_t)-0.015339206f,  (float16_t)0.999882347f,
+   (float16_t)-0.016872988f,  (float16_t)0.999857641f,
+   (float16_t)-0.018406730f,  (float16_t)0.999830582f,
+   (float16_t)-0.019940429f,  (float16_t)0.999801170f,
+   (float16_t)-0.021474080f,  (float16_t)0.999769405f,
+   (float16_t)-0.023007681f,  (float16_t)0.999735288f,
+   (float16_t)-0.024541229f,  (float16_t)0.999698819f,
+   (float16_t)-0.026074718f,  (float16_t)0.999659997f,
+   (float16_t)-0.027608146f,  (float16_t)0.999618822f,
+   (float16_t)-0.029141509f,  (float16_t)0.999575296f,
+   (float16_t)-0.030674803f,  (float16_t)0.999529418f,
+   (float16_t)-0.032208025f,  (float16_t)0.999481187f,
+   (float16_t)-0.033741172f,  (float16_t)0.999430605f,
+   (float16_t)-0.035274239f,  (float16_t)0.999377670f,
+   (float16_t)-0.036807223f,  (float16_t)0.999322385f,
+   (float16_t)-0.038340120f,  (float16_t)0.999264747f,
+   (float16_t)-0.039872928f,  (float16_t)0.999204759f,
+   (float16_t)-0.041405641f,  (float16_t)0.999142419f,
+   (float16_t)-0.042938257f,  (float16_t)0.999077728f,
+   (float16_t)-0.044470772f,  (float16_t)0.999010686f,
+   (float16_t)-0.046003182f,  (float16_t)0.998941293f,
+   (float16_t)-0.047535484f,  (float16_t)0.998869550f,
+   (float16_t)-0.049067674f,  (float16_t)0.998795456f,
+   (float16_t)-0.050599749f,  (float16_t)0.998719012f,
+   (float16_t)-0.052131705f,  (float16_t)0.998640218f,
+   (float16_t)-0.053663538f,  (float16_t)0.998559074f,
+   (float16_t)-0.055195244f,  (float16_t)0.998475581f,
+   (float16_t)-0.056726821f,  (float16_t)0.998389737f,
+   (float16_t)-0.058258265f,  (float16_t)0.998301545f,
+   (float16_t)-0.059789571f,  (float16_t)0.998211003f,
+   (float16_t)-0.061320736f,  (float16_t)0.998118113f,
+   (float16_t)-0.062851758f,  (float16_t)0.998022874f,
+   (float16_t)-0.064382631f,  (float16_t)0.997925286f,
+   (float16_t)-0.065913353f,  (float16_t)0.997825350f,
+   (float16_t)-0.067443920f,  (float16_t)0.997723067f,
+   (float16_t)-0.068974328f,  (float16_t)0.997618435f,
+   (float16_t)-0.070504573f,  (float16_t)0.997511456f,
+   (float16_t)-0.072034653f,  (float16_t)0.997402130f,
+   (float16_t)-0.073564564f,  (float16_t)0.997290457f,
+   (float16_t)-0.075094301f,  (float16_t)0.997176437f,
+   (float16_t)-0.076623861f,  (float16_t)0.997060070f,
+   (float16_t)-0.078153242f,  (float16_t)0.996941358f,
+   (float16_t)-0.079682438f,  (float16_t)0.996820299f,
+   (float16_t)-0.081211447f,  (float16_t)0.996696895f,
+   (float16_t)-0.082740265f,  (float16_t)0.996571146f,
+   (float16_t)-0.084268888f,  (float16_t)0.996443051f,
+   (float16_t)-0.085797312f,  (float16_t)0.996312612f,
+   (float16_t)-0.087325535f,  (float16_t)0.996179829f,
+   (float16_t)-0.088853553f,  (float16_t)0.996044701f,
+   (float16_t)-0.090381361f,  (float16_t)0.995907229f,
+   (float16_t)-0.091908956f,  (float16_t)0.995767414f,
+   (float16_t)-0.093436336f,  (float16_t)0.995625256f,
+   (float16_t)-0.094963495f,  (float16_t)0.995480755f,
+   (float16_t)-0.096490431f,  (float16_t)0.995333912f,
+   (float16_t)-0.098017140f,  (float16_t)0.995184727f,
+   (float16_t)-0.099543619f,  (float16_t)0.995033199f,
+   (float16_t)-0.101069863f,  (float16_t)0.994879331f,
+   (float16_t)-0.102595869f,  (float16_t)0.994723121f,
+   (float16_t)-0.104121634f,  (float16_t)0.994564571f,
+   (float16_t)-0.105647154f,  (float16_t)0.994403680f,
+   (float16_t)-0.107172425f,  (float16_t)0.994240449f,
+   (float16_t)-0.108697444f,  (float16_t)0.994074879f,
+   (float16_t)-0.110222207f,  (float16_t)0.993906970f,
+   (float16_t)-0.111746711f,  (float16_t)0.993736722f,
+   (float16_t)-0.113270952f,  (float16_t)0.993564136f,
+   (float16_t)-0.114794927f,  (float16_t)0.993389211f,
+   (float16_t)-0.116318631f,  (float16_t)0.993211949f,
+   (float16_t)-0.117842062f,  (float16_t)0.993032350f,
+   (float16_t)-0.119365215f,  (float16_t)0.992850414f,
+   (float16_t)-0.120888087f,  (float16_t)0.992666142f,
+   (float16_t)-0.122410675f,  (float16_t)0.992479535f,
+   (float16_t)-0.123932975f,  (float16_t)0.992290591f,
+   (float16_t)-0.125454983f,  (float16_t)0.992099313f,
+   (float16_t)-0.126976696f,  (float16_t)0.991905700f,
+   (float16_t)-0.128498111f,  (float16_t)0.991709754f,
+   (float16_t)-0.130019223f,  (float16_t)0.991511473f,
+   (float16_t)-0.131540029f,  (float16_t)0.991310860f,
+   (float16_t)-0.133060525f,  (float16_t)0.991107914f,
+   (float16_t)-0.134580709f,  (float16_t)0.990902635f,
+   (float16_t)-0.136100575f,  (float16_t)0.990695025f,
+   (float16_t)-0.137620122f,  (float16_t)0.990485084f,
+   (float16_t)-0.139139344f,  (float16_t)0.990272812f,
+   (float16_t)-0.140658239f,  (float16_t)0.990058210f,
+   (float16_t)-0.142176804f,  (float16_t)0.989841278f,
+   (float16_t)-0.143695033f,  (float16_t)0.989622017f,
+   (float16_t)-0.145212925f,  (float16_t)0.989400428f,
+   (float16_t)-0.146730474f,  (float16_t)0.989176510f,
+   (float16_t)-0.148247679f,  (float16_t)0.988950265f,
+   (float16_t)-0.149764535f,  (float16_t)0.988721692f,
+   (float16_t)-0.151281038f,  (float16_t)0.988490793f,
+   (float16_t)-0.152797185f,  (float16_t)0.988257568f,
+   (float16_t)-0.154312973f,  (float16_t)0.988022017f,
+   (float16_t)-0.155828398f,  (float16_t)0.987784142f,
+   (float16_t)-0.157343456f,  (float16_t)0.987543942f,
+   (float16_t)-0.158858143f,  (float16_t)0.987301418f,
+   (float16_t)-0.160372457f,  (float16_t)0.987056571f,
+   (float16_t)-0.161886394f,  (float16_t)0.986809402f,
+   (float16_t)-0.163399949f,  (float16_t)0.986559910f,
+   (float16_t)-0.164913120f,  (float16_t)0.986308097f,
+   (float16_t)-0.166425904f,  (float16_t)0.986053963f,
+   (float16_t)-0.167938295f,  (float16_t)0.985797509f,
+   (float16_t)-0.169450291f,  (float16_t)0.985538735f,
+   (float16_t)-0.170961889f,  (float16_t)0.985277642f,
+   (float16_t)-0.172473084f,  (float16_t)0.985014231f,
+   (float16_t)-0.173983873f,  (float16_t)0.984748502f,
+   (float16_t)-0.175494253f,  (float16_t)0.984480455f,
+   (float16_t)-0.177004220f,  (float16_t)0.984210092f,
+   (float16_t)-0.178513771f,  (float16_t)0.983937413f,
+   (float16_t)-0.180022901f,  (float16_t)0.983662419f,
+   (float16_t)-0.181531608f,  (float16_t)0.983385110f,
+   (float16_t)-0.183039888f,  (float16_t)0.983105487f,
+   (float16_t)-0.184547737f,  (float16_t)0.982823551f,
+   (float16_t)-0.186055152f,  (float16_t)0.982539302f,
+   (float16_t)-0.187562129f,  (float16_t)0.982252741f,
+   (float16_t)-0.189068664f,  (float16_t)0.981963869f,
+   (float16_t)-0.190574755f,  (float16_t)0.981672686f,
+   (float16_t)-0.192080397f,  (float16_t)0.981379193f,
+   (float16_t)-0.193585587f,  (float16_t)0.981083391f,
+   (float16_t)-0.195090322f,  (float16_t)0.980785280f,
+   (float16_t)-0.196594598f,  (float16_t)0.980484862f,
+   (float16_t)-0.198098411f,  (float16_t)0.980182136f,
+   (float16_t)-0.199601758f,  (float16_t)0.979877104f,
+   (float16_t)-0.201104635f,  (float16_t)0.979569766f,
+   (float16_t)-0.202607039f,  (float16_t)0.979260123f,
+   (float16_t)-0.204108966f,  (float16_t)0.978948175f,
+   (float16_t)-0.205610413f,  (float16_t)0.978633924f,
+   (float16_t)-0.207111376f,  (float16_t)0.978317371f,
+   (float16_t)-0.208611852f,  (float16_t)0.977998515f,
+   (float16_t)-0.210111837f,  (float16_t)0.977677358f,
+   (float16_t)-0.211611327f,  (float16_t)0.977353900f,
+   (float16_t)-0.213110320f,  (float16_t)0.977028143f,
+   (float16_t)-0.214608811f,  (float16_t)0.976700086f,
+   (float16_t)-0.216106797f,  (float16_t)0.976369731f,
+   (float16_t)-0.217604275f,  (float16_t)0.976037079f,
+   (float16_t)-0.219101240f,  (float16_t)0.975702130f,
+   (float16_t)-0.220597690f,  (float16_t)0.975364885f,
+   (float16_t)-0.222093621f,  (float16_t)0.975025345f,
+   (float16_t)-0.223589029f,  (float16_t)0.974683511f,
+   (float16_t)-0.225083911f,  (float16_t)0.974339383f,
+   (float16_t)-0.226578264f,  (float16_t)0.973992962f,
+   (float16_t)-0.228072083f,  (float16_t)0.973644250f,
+   (float16_t)-0.229565366f,  (float16_t)0.973293246f,
+   (float16_t)-0.231058108f,  (float16_t)0.972939952f,
+   (float16_t)-0.232550307f,  (float16_t)0.972584369f,
+   (float16_t)-0.234041959f,  (float16_t)0.972226497f,
+   (float16_t)-0.235533059f,  (float16_t)0.971866337f,
+   (float16_t)-0.237023606f,  (float16_t)0.971503891f,
+   (float16_t)-0.238513595f,  (float16_t)0.971139158f,
+   (float16_t)-0.240003022f,  (float16_t)0.970772141f,
+   (float16_t)-0.241491885f,  (float16_t)0.970402839f,
+   (float16_t)-0.242980180f,  (float16_t)0.970031253f,
+   (float16_t)-0.244467903f,  (float16_t)0.969657385f,
+   (float16_t)-0.245955050f,  (float16_t)0.969281235f,
+   (float16_t)-0.247441619f,  (float16_t)0.968902805f,
+   (float16_t)-0.248927606f,  (float16_t)0.968522094f,
+   (float16_t)-0.250413007f,  (float16_t)0.968139105f,
+   (float16_t)-0.251897818f,  (float16_t)0.967753837f,
+   (float16_t)-0.253382037f,  (float16_t)0.967366292f,
+   (float16_t)-0.254865660f,  (float16_t)0.966976471f,
+   (float16_t)-0.256348682f,  (float16_t)0.966584374f,
+   (float16_t)-0.257831102f,  (float16_t)0.966190003f,
+   (float16_t)-0.259312915f,  (float16_t)0.965793359f,
+   (float16_t)-0.260794118f,  (float16_t)0.965394442f,
+   (float16_t)-0.262274707f,  (float16_t)0.964993253f,
+   (float16_t)-0.263754679f,  (float16_t)0.964589793f,
+   (float16_t)-0.265234030f,  (float16_t)0.964184064f,
+   (float16_t)-0.266712757f,  (float16_t)0.963776066f,
+   (float16_t)-0.268190857f,  (float16_t)0.963365800f,
+   (float16_t)-0.269668326f,  (float16_t)0.962953267f,
+   (float16_t)-0.271145160f,  (float16_t)0.962538468f,
+   (float16_t)-0.272621355f,  (float16_t)0.962121404f,
+   (float16_t)-0.274096910f,  (float16_t)0.961702077f,
+   (float16_t)-0.275571819f,  (float16_t)0.961280486f,
+   (float16_t)-0.277046080f,  (float16_t)0.960856633f,
+   (float16_t)-0.278519689f,  (float16_t)0.960430519f,
+   (float16_t)-0.279992643f,  (float16_t)0.960002146f,
+   (float16_t)-0.281464938f,  (float16_t)0.959571513f,
+   (float16_t)-0.282936570f,  (float16_t)0.959138622f,
+   (float16_t)-0.284407537f,  (float16_t)0.958703475f,
+   (float16_t)-0.285877835f,  (float16_t)0.958266071f,
+   (float16_t)-0.287347460f,  (float16_t)0.957826413f,
+   (float16_t)-0.288816408f,  (float16_t)0.957384501f,
+   (float16_t)-0.290284677f,  (float16_t)0.956940336f,
+   (float16_t)-0.291752263f,  (float16_t)0.956493919f,
+   (float16_t)-0.293219163f,  (float16_t)0.956045251f,
+   (float16_t)-0.294685372f,  (float16_t)0.955594334f,
+   (float16_t)-0.296150888f,  (float16_t)0.955141168f,
+   (float16_t)-0.297615707f,  (float16_t)0.954685755f,
+   (float16_t)-0.299079826f,  (float16_t)0.954228095f,
+   (float16_t)-0.300543241f,  (float16_t)0.953768190f,
+   (float16_t)-0.302005949f,  (float16_t)0.953306040f,
+   (float16_t)-0.303467947f,  (float16_t)0.952841648f,
+   (float16_t)-0.304929230f,  (float16_t)0.952375013f,
+   (float16_t)-0.306389795f,  (float16_t)0.951906137f,
+   (float16_t)-0.307849640f,  (float16_t)0.951435021f,
+   (float16_t)-0.309308760f,  (float16_t)0.950961666f,
+   (float16_t)-0.310767153f,  (float16_t)0.950486074f,
+   (float16_t)-0.312224814f,  (float16_t)0.950008245f,
+   (float16_t)-0.313681740f,  (float16_t)0.949528181f,
+   (float16_t)-0.315137929f,  (float16_t)0.949045882f,
+   (float16_t)-0.316593376f,  (float16_t)0.948561350f,
+   (float16_t)-0.318048077f,  (float16_t)0.948074586f,
+   (float16_t)-0.319502031f,  (float16_t)0.947585591f,
+   (float16_t)-0.320955232f,  (float16_t)0.947094366f,
+   (float16_t)-0.322407679f,  (float16_t)0.946600913f,
+   (float16_t)-0.323859367f,  (float16_t)0.946105232f,
+   (float16_t)-0.325310292f,  (float16_t)0.945607325f,
+   (float16_t)-0.326760452f,  (float16_t)0.945107193f,
+   (float16_t)-0.328209844f,  (float16_t)0.944604837f,
+   (float16_t)-0.329658463f,  (float16_t)0.944100258f,
+   (float16_t)-0.331106306f,  (float16_t)0.943593458f,
+   (float16_t)-0.332553370f,  (float16_t)0.943084437f,
+   (float16_t)-0.333999651f,  (float16_t)0.942573198f,
+   (float16_t)-0.335445147f,  (float16_t)0.942059740f,
+   (float16_t)-0.336889853f,  (float16_t)0.941544065f,
+   (float16_t)-0.338333767f,  (float16_t)0.941026175f,
+   (float16_t)-0.339776884f,  (float16_t)0.940506071f,
+   (float16_t)-0.341219202f,  (float16_t)0.939983753f,
+   (float16_t)-0.342660717f,  (float16_t)0.939459224f,
+   (float16_t)-0.344101426f,  (float16_t)0.938932484f,
+   (float16_t)-0.345541325f,  (float16_t)0.938403534f,
+   (float16_t)-0.346980411f,  (float16_t)0.937872376f,
+   (float16_t)-0.348418680f,  (float16_t)0.937339012f,
+   (float16_t)-0.349856130f,  (float16_t)0.936803442f,
+   (float16_t)-0.351292756f,  (float16_t)0.936265667f,
+   (float16_t)-0.352728556f,  (float16_t)0.935725689f,
+   (float16_t)-0.354163525f,  (float16_t)0.935183510f,
+   (float16_t)-0.355597662f,  (float16_t)0.934639130f,
+   (float16_t)-0.357030961f,  (float16_t)0.934092550f,
+   (float16_t)-0.358463421f,  (float16_t)0.933543773f,
+   (float16_t)-0.359895037f,  (float16_t)0.932992799f,
+   (float16_t)-0.361325806f,  (float16_t)0.932439629f,
+   (float16_t)-0.362755724f,  (float16_t)0.931884266f,
+   (float16_t)-0.364184790f,  (float16_t)0.931326709f,
+   (float16_t)-0.365612998f,  (float16_t)0.930766961f,
+   (float16_t)-0.367040346f,  (float16_t)0.930205023f,
+   (float16_t)-0.368466830f,  (float16_t)0.929640896f,
+   (float16_t)-0.369892447f,  (float16_t)0.929074581f,
+   (float16_t)-0.371317194f,  (float16_t)0.928506080f,
+   (float16_t)-0.372741067f,  (float16_t)0.927935395f,
+   (float16_t)-0.374164063f,  (float16_t)0.927362526f,
+   (float16_t)-0.375586178f,  (float16_t)0.926787474f,
+   (float16_t)-0.377007410f,  (float16_t)0.926210242f,
+   (float16_t)-0.378427755f,  (float16_t)0.925630831f,
+   (float16_t)-0.379847209f,  (float16_t)0.925049241f,
+   (float16_t)-0.381265769f,  (float16_t)0.924465474f,
+   (float16_t)-0.382683432f,  (float16_t)0.923879533f,
+   (float16_t)-0.384100195f,  (float16_t)0.923291417f,
+   (float16_t)-0.385516054f,  (float16_t)0.922701128f,
+   (float16_t)-0.386931006f,  (float16_t)0.922108669f,
+   (float16_t)-0.388345047f,  (float16_t)0.921514039f,
+   (float16_t)-0.389758174f,  (float16_t)0.920917242f,
+   (float16_t)-0.391170384f,  (float16_t)0.920318277f,
+   (float16_t)-0.392581674f,  (float16_t)0.919717146f,
+   (float16_t)-0.393992040f,  (float16_t)0.919113852f,
+   (float16_t)-0.395401479f,  (float16_t)0.918508394f,
+   (float16_t)-0.396809987f,  (float16_t)0.917900776f,
+   (float16_t)-0.398217562f,  (float16_t)0.917290997f,
+   (float16_t)-0.399624200f,  (float16_t)0.916679060f,
+   (float16_t)-0.401029897f,  (float16_t)0.916064966f,
+   (float16_t)-0.402434651f,  (float16_t)0.915448716f,
+   (float16_t)-0.403838458f,  (float16_t)0.914830312f,
+   (float16_t)-0.405241314f,  (float16_t)0.914209756f,
+   (float16_t)-0.406643217f,  (float16_t)0.913587048f,
+   (float16_t)-0.408044163f,  (float16_t)0.912962190f,
+   (float16_t)-0.409444149f,  (float16_t)0.912335185f,
+   (float16_t)-0.410843171f,  (float16_t)0.911706032f,
+   (float16_t)-0.412241227f,  (float16_t)0.911074734f,
+   (float16_t)-0.413638312f,  (float16_t)0.910441292f,
+   (float16_t)-0.415034424f,  (float16_t)0.909805708f,
+   (float16_t)-0.416429560f,  (float16_t)0.909167983f,
+   (float16_t)-0.417823716f,  (float16_t)0.908528119f,
+   (float16_t)-0.419216888f,  (float16_t)0.907886116f,
+   (float16_t)-0.420609074f,  (float16_t)0.907241978f,
+   (float16_t)-0.422000271f,  (float16_t)0.906595705f,
+   (float16_t)-0.423390474f,  (float16_t)0.905947298f,
+   (float16_t)-0.424779681f,  (float16_t)0.905296759f,
+   (float16_t)-0.426167889f,  (float16_t)0.904644091f,
+   (float16_t)-0.427555093f,  (float16_t)0.903989293f,
+   (float16_t)-0.428941292f,  (float16_t)0.903332368f,
+   (float16_t)-0.430326481f,  (float16_t)0.902673318f,
+   (float16_t)-0.431710658f,  (float16_t)0.902012144f,
+   (float16_t)-0.433093819f,  (float16_t)0.901348847f,
+   (float16_t)-0.434475961f,  (float16_t)0.900683429f,
+   (float16_t)-0.435857080f,  (float16_t)0.900015892f,
+   (float16_t)-0.437237174f,  (float16_t)0.899346237f,
+   (float16_t)-0.438616239f,  (float16_t)0.898674466f,
+   (float16_t)-0.439994271f,  (float16_t)0.898000580f,
+   (float16_t)-0.441371269f,  (float16_t)0.897324581f,
+   (float16_t)-0.442747228f,  (float16_t)0.896646470f,
+   (float16_t)-0.444122145f,  (float16_t)0.895966250f,
+   (float16_t)-0.445496017f,  (float16_t)0.895283921f,
+   (float16_t)-0.446868840f,  (float16_t)0.894599486f,
+   (float16_t)-0.448240612f,  (float16_t)0.893912945f,
+   (float16_t)-0.449611330f,  (float16_t)0.893224301f,
+   (float16_t)-0.450980989f,  (float16_t)0.892533555f,
+   (float16_t)-0.452349587f,  (float16_t)0.891840709f,
+   (float16_t)-0.453717121f,  (float16_t)0.891145765f,
+   (float16_t)-0.455083587f,  (float16_t)0.890448723f,
+   (float16_t)-0.456448982f,  (float16_t)0.889749586f,
+   (float16_t)-0.457813304f,  (float16_t)0.889048356f,
+   (float16_t)-0.459176548f,  (float16_t)0.888345033f,
+   (float16_t)-0.460538711f,  (float16_t)0.887639620f,
+   (float16_t)-0.461899791f,  (float16_t)0.886932119f,
+   (float16_t)-0.463259784f,  (float16_t)0.886222530f,
+   (float16_t)-0.464618686f,  (float16_t)0.885510856f,
+   (float16_t)-0.465976496f,  (float16_t)0.884797098f,
+   (float16_t)-0.467333209f,  (float16_t)0.884081259f,
+   (float16_t)-0.468688822f,  (float16_t)0.883363339f,
+   (float16_t)-0.470043332f,  (float16_t)0.882643340f,
+   (float16_t)-0.471396737f,  (float16_t)0.881921264f,
+   (float16_t)-0.472749032f,  (float16_t)0.881197113f,
+   (float16_t)-0.474100215f,  (float16_t)0.880470889f,
+   (float16_t)-0.475450282f,  (float16_t)0.879742593f,
+   (float16_t)-0.476799230f,  (float16_t)0.879012226f,
+   (float16_t)-0.478147056f,  (float16_t)0.878279792f,
+   (float16_t)-0.479493758f,  (float16_t)0.877545290f,
+   (float16_t)-0.480839331f,  (float16_t)0.876808724f,
+   (float16_t)-0.482183772f,  (float16_t)0.876070094f,
+   (float16_t)-0.483527079f,  (float16_t)0.875329403f,
+   (float16_t)-0.484869248f,  (float16_t)0.874586652f,
+   (float16_t)-0.486210276f,  (float16_t)0.873841843f,
+   (float16_t)-0.487550160f,  (float16_t)0.873094978f,
+   (float16_t)-0.488888897f,  (float16_t)0.872346059f,
+   (float16_t)-0.490226483f,  (float16_t)0.871595087f,
+   (float16_t)-0.491562916f,  (float16_t)0.870842063f,
+   (float16_t)-0.492898192f,  (float16_t)0.870086991f,
+   (float16_t)-0.494232309f,  (float16_t)0.869329871f,
+   (float16_t)-0.495565262f,  (float16_t)0.868570706f,
+   (float16_t)-0.496897049f,  (float16_t)0.867809497f,
+   (float16_t)-0.498227667f,  (float16_t)0.867046246f,
+   (float16_t)-0.499557113f,  (float16_t)0.866280954f,
+   (float16_t)-0.500885383f,  (float16_t)0.865513624f,
+   (float16_t)-0.502212474f,  (float16_t)0.864744258f,
+   (float16_t)-0.503538384f,  (float16_t)0.863972856f,
+   (float16_t)-0.504863109f,  (float16_t)0.863199422f,
+   (float16_t)-0.506186645f,  (float16_t)0.862423956f,
+   (float16_t)-0.507508991f,  (float16_t)0.861646461f,
+   (float16_t)-0.508830143f,  (float16_t)0.860866939f,
+   (float16_t)-0.510150097f,  (float16_t)0.860085390f,
+   (float16_t)-0.511468850f,  (float16_t)0.859301818f,
+   (float16_t)-0.512786401f,  (float16_t)0.858516224f,
+   (float16_t)-0.514102744f,  (float16_t)0.857728610f,
+   (float16_t)-0.515417878f,  (float16_t)0.856938977f,
+   (float16_t)-0.516731799f,  (float16_t)0.856147328f,
+   (float16_t)-0.518044504f,  (float16_t)0.855353665f,
+   (float16_t)-0.519355990f,  (float16_t)0.854557988f,
+   (float16_t)-0.520666254f,  (float16_t)0.853760301f,
+   (float16_t)-0.521975293f,  (float16_t)0.852960605f,
+   (float16_t)-0.523283103f,  (float16_t)0.852158902f,
+   (float16_t)-0.524589683f,  (float16_t)0.851355193f,
+   (float16_t)-0.525895027f,  (float16_t)0.850549481f,
+   (float16_t)-0.527199135f,  (float16_t)0.849741768f,
+   (float16_t)-0.528502002f,  (float16_t)0.848932055f,
+   (float16_t)-0.529803625f,  (float16_t)0.848120345f,
+   (float16_t)-0.531104001f,  (float16_t)0.847306639f,
+   (float16_t)-0.532403128f,  (float16_t)0.846490939f,
+   (float16_t)-0.533701002f,  (float16_t)0.845673247f,
+   (float16_t)-0.534997620f,  (float16_t)0.844853565f,
+   (float16_t)-0.536292979f,  (float16_t)0.844031895f,
+   (float16_t)-0.537587076f,  (float16_t)0.843208240f,
+   (float16_t)-0.538879909f,  (float16_t)0.842382600f,
+   (float16_t)-0.540171473f,  (float16_t)0.841554977f,
+   (float16_t)-0.541461766f,  (float16_t)0.840725375f,
+   (float16_t)-0.542750785f,  (float16_t)0.839893794f,
+   (float16_t)-0.544038527f,  (float16_t)0.839060237f,
+   (float16_t)-0.545324988f,  (float16_t)0.838224706f,
+   (float16_t)-0.546610167f,  (float16_t)0.837387202f,
+   (float16_t)-0.547894059f,  (float16_t)0.836547727f,
+   (float16_t)-0.549176662f,  (float16_t)0.835706284f,
+   (float16_t)-0.550457973f,  (float16_t)0.834862875f,
+   (float16_t)-0.551737988f,  (float16_t)0.834017501f,
+   (float16_t)-0.553016706f,  (float16_t)0.833170165f,
+   (float16_t)-0.554294121f,  (float16_t)0.832320868f,
+   (float16_t)-0.555570233f,  (float16_t)0.831469612f,
+   (float16_t)-0.556845037f,  (float16_t)0.830616400f,
+   (float16_t)-0.558118531f,  (float16_t)0.829761234f,
+   (float16_t)-0.559390712f,  (float16_t)0.828904115f,
+   (float16_t)-0.560661576f,  (float16_t)0.828045045f,
+   (float16_t)-0.561931121f,  (float16_t)0.827184027f,
+   (float16_t)-0.563199344f,  (float16_t)0.826321063f,
+   (float16_t)-0.564466242f,  (float16_t)0.825456154f,
+   (float16_t)-0.565731811f,  (float16_t)0.824589303f,
+   (float16_t)-0.566996049f,  (float16_t)0.823720511f,
+   (float16_t)-0.568258953f,  (float16_t)0.822849781f,
+   (float16_t)-0.569520519f,  (float16_t)0.821977115f,
+   (float16_t)-0.570780746f,  (float16_t)0.821102515f,
+   (float16_t)-0.572039629f,  (float16_t)0.820225983f,
+   (float16_t)-0.573297167f,  (float16_t)0.819347520f,
+   (float16_t)-0.574553355f,  (float16_t)0.818467130f,
+   (float16_t)-0.575808191f,  (float16_t)0.817584813f,
+   (float16_t)-0.577061673f,  (float16_t)0.816700573f,
+   (float16_t)-0.578313796f,  (float16_t)0.815814411f,
+   (float16_t)-0.579564559f,  (float16_t)0.814926329f,
+   (float16_t)-0.580813958f,  (float16_t)0.814036330f,
+   (float16_t)-0.582061990f,  (float16_t)0.813144415f,
+   (float16_t)-0.583308653f,  (float16_t)0.812250587f,
+   (float16_t)-0.584553943f,  (float16_t)0.811354847f,
+   (float16_t)-0.585797857f,  (float16_t)0.810457198f,
+   (float16_t)-0.587040394f,  (float16_t)0.809557642f,
+   (float16_t)-0.588281548f,  (float16_t)0.808656182f,
+   (float16_t)-0.589521319f,  (float16_t)0.807752818f,
+   (float16_t)-0.590759702f,  (float16_t)0.806847554f,
+   (float16_t)-0.591996695f,  (float16_t)0.805940391f,
+   (float16_t)-0.593232295f,  (float16_t)0.805031331f,
+   (float16_t)-0.594466499f,  (float16_t)0.804120377f,
+   (float16_t)-0.595699304f,  (float16_t)0.803207531f,
+   (float16_t)-0.596930708f,  (float16_t)0.802292796f,
+   (float16_t)-0.598160707f,  (float16_t)0.801376172f,
+   (float16_t)-0.599389298f,  (float16_t)0.800457662f,
+   (float16_t)-0.600616479f,  (float16_t)0.799537269f,
+   (float16_t)-0.601842247f,  (float16_t)0.798614995f,
+   (float16_t)-0.603066599f,  (float16_t)0.797690841f,
+   (float16_t)-0.604289531f,  (float16_t)0.796764810f,
+   (float16_t)-0.605511041f,  (float16_t)0.795836905f,
+   (float16_t)-0.606731127f,  (float16_t)0.794907126f,
+   (float16_t)-0.607949785f,  (float16_t)0.793975478f,
+   (float16_t)-0.609167012f,  (float16_t)0.793041960f,
+   (float16_t)-0.610382806f,  (float16_t)0.792106577f,
+   (float16_t)-0.611597164f,  (float16_t)0.791169330f,
+   (float16_t)-0.612810082f,  (float16_t)0.790230221f,
+   (float16_t)-0.614021559f,  (float16_t)0.789289253f,
+   (float16_t)-0.615231591f,  (float16_t)0.788346428f,
+   (float16_t)-0.616440175f,  (float16_t)0.787401747f,
+   (float16_t)-0.617647308f,  (float16_t)0.786455214f,
+   (float16_t)-0.618852988f,  (float16_t)0.785506830f,
+   (float16_t)-0.620057212f,  (float16_t)0.784556597f,
+   (float16_t)-0.621259977f,  (float16_t)0.783604519f,
+   (float16_t)-0.622461279f,  (float16_t)0.782650596f,
+   (float16_t)-0.623661118f,  (float16_t)0.781694832f,
+   (float16_t)-0.624859488f,  (float16_t)0.780737229f,
+   (float16_t)-0.626056388f,  (float16_t)0.779777788f,
+   (float16_t)-0.627251815f,  (float16_t)0.778816512f,
+   (float16_t)-0.628445767f,  (float16_t)0.777853404f,
+   (float16_t)-0.629638239f,  (float16_t)0.776888466f,
+   (float16_t)-0.630829230f,  (float16_t)0.775921699f,
+   (float16_t)-0.632018736f,  (float16_t)0.774953107f,
+   (float16_t)-0.633206755f,  (float16_t)0.773982691f,
+   (float16_t)-0.634393284f,  (float16_t)0.773010453f,
+   (float16_t)-0.635578320f,  (float16_t)0.772036397f,
+   (float16_t)-0.636761861f,  (float16_t)0.771060524f,
+   (float16_t)-0.637943904f,  (float16_t)0.770082837f,
+   (float16_t)-0.639124445f,  (float16_t)0.769103338f,
+   (float16_t)-0.640303482f,  (float16_t)0.768122029f,
+   (float16_t)-0.641481013f,  (float16_t)0.767138912f,
+   (float16_t)-0.642657034f,  (float16_t)0.766153990f,
+   (float16_t)-0.643831543f,  (float16_t)0.765167266f,
+   (float16_t)-0.645004537f,  (float16_t)0.764178741f,
+   (float16_t)-0.646176013f,  (float16_t)0.763188417f,
+   (float16_t)-0.647345969f,  (float16_t)0.762196298f,
+   (float16_t)-0.648514401f,  (float16_t)0.761202385f,
+   (float16_t)-0.649681307f,  (float16_t)0.760206682f,
+   (float16_t)-0.650846685f,  (float16_t)0.759209189f,
+   (float16_t)-0.652010531f,  (float16_t)0.758209910f,
+   (float16_t)-0.653172843f,  (float16_t)0.757208847f,
+   (float16_t)-0.654333618f,  (float16_t)0.756206001f,
+   (float16_t)-0.655492853f,  (float16_t)0.755201377f,
+   (float16_t)-0.656650546f,  (float16_t)0.754194975f,
+   (float16_t)-0.657806693f,  (float16_t)0.753186799f,
+   (float16_t)-0.658961293f,  (float16_t)0.752176850f,
+   (float16_t)-0.660114342f,  (float16_t)0.751165132f,
+   (float16_t)-0.661265838f,  (float16_t)0.750151646f,
+   (float16_t)-0.662415778f,  (float16_t)0.749136395f,
+   (float16_t)-0.663564159f,  (float16_t)0.748119380f,
+   (float16_t)-0.664710978f,  (float16_t)0.747100606f,
+   (float16_t)-0.665856234f,  (float16_t)0.746080074f,
+   (float16_t)-0.666999922f,  (float16_t)0.745057785f,
+   (float16_t)-0.668142041f,  (float16_t)0.744033744f,
+   (float16_t)-0.669282588f,  (float16_t)0.743007952f,
+   (float16_t)-0.670421560f,  (float16_t)0.741980412f,
+   (float16_t)-0.671558955f,  (float16_t)0.740951125f,
+   (float16_t)-0.672694769f,  (float16_t)0.739920095f,
+   (float16_t)-0.673829000f,  (float16_t)0.738887324f,
+   (float16_t)-0.674961646f,  (float16_t)0.737852815f,
+   (float16_t)-0.676092704f,  (float16_t)0.736816569f,
+   (float16_t)-0.677222170f,  (float16_t)0.735778589f,
+   (float16_t)-0.678350043f,  (float16_t)0.734738878f,
+   (float16_t)-0.679476320f,  (float16_t)0.733697438f,
+   (float16_t)-0.680600998f,  (float16_t)0.732654272f,
+   (float16_t)-0.681724074f,  (float16_t)0.731609381f,
+   (float16_t)-0.682845546f,  (float16_t)0.730562769f,
+   (float16_t)-0.683965412f,  (float16_t)0.729514438f,
+   (float16_t)-0.685083668f,  (float16_t)0.728464390f,
+   (float16_t)-0.686200312f,  (float16_t)0.727412629f,
+   (float16_t)-0.687315341f,  (float16_t)0.726359155f,
+   (float16_t)-0.688428753f,  (float16_t)0.725303972f,
+   (float16_t)-0.689540545f,  (float16_t)0.724247083f,
+   (float16_t)-0.690650714f,  (float16_t)0.723188489f,
+   (float16_t)-0.691759258f,  (float16_t)0.722128194f,
+   (float16_t)-0.692866175f,  (float16_t)0.721066199f,
+   (float16_t)-0.693971461f,  (float16_t)0.720002508f,
+   (float16_t)-0.695075114f,  (float16_t)0.718937122f,
+   (float16_t)-0.696177131f,  (float16_t)0.717870045f,
+   (float16_t)-0.697277511f,  (float16_t)0.716801279f,
+   (float16_t)-0.698376249f,  (float16_t)0.715730825f,
+   (float16_t)-0.699473345f,  (float16_t)0.714658688f,
+   (float16_t)-0.700568794f,  (float16_t)0.713584869f,
+   (float16_t)-0.701662595f,  (float16_t)0.712509371f,
+   (float16_t)-0.702754744f,  (float16_t)0.711432196f,
+   (float16_t)-0.703845241f,  (float16_t)0.710353347f,
+   (float16_t)-0.704934080f,  (float16_t)0.709272826f,
+   (float16_t)-0.706021261f,  (float16_t)0.708190637f,
+   (float16_t)-0.707106781f,  (float16_t)0.707106781f,
+   (float16_t)-0.708190637f,  (float16_t)0.706021261f,
+   (float16_t)-0.709272826f,  (float16_t)0.704934080f,
+   (float16_t)-0.710353347f,  (float16_t)0.703845241f,
+   (float16_t)-0.711432196f,  (float16_t)0.702754744f,
+   (float16_t)-0.712509371f,  (float16_t)0.701662595f,
+   (float16_t)-0.713584869f,  (float16_t)0.700568794f,
+   (float16_t)-0.714658688f,  (float16_t)0.699473345f,
+   (float16_t)-0.715730825f,  (float16_t)0.698376249f,
+   (float16_t)-0.716801279f,  (float16_t)0.697277511f,
+   (float16_t)-0.717870045f,  (float16_t)0.696177131f,
+   (float16_t)-0.718937122f,  (float16_t)0.695075114f,
+   (float16_t)-0.720002508f,  (float16_t)0.693971461f,
+   (float16_t)-0.721066199f,  (float16_t)0.692866175f,
+   (float16_t)-0.722128194f,  (float16_t)0.691759258f,
+   (float16_t)-0.723188489f,  (float16_t)0.690650714f,
+   (float16_t)-0.724247083f,  (float16_t)0.689540545f,
+   (float16_t)-0.725303972f,  (float16_t)0.688428753f,
+   (float16_t)-0.726359155f,  (float16_t)0.687315341f,
+   (float16_t)-0.727412629f,  (float16_t)0.686200312f,
+   (float16_t)-0.728464390f,  (float16_t)0.685083668f,
+   (float16_t)-0.729514438f,  (float16_t)0.683965412f,
+   (float16_t)-0.730562769f,  (float16_t)0.682845546f,
+   (float16_t)-0.731609381f,  (float16_t)0.681724074f,
+   (float16_t)-0.732654272f,  (float16_t)0.680600998f,
+   (float16_t)-0.733697438f,  (float16_t)0.679476320f,
+   (float16_t)-0.734738878f,  (float16_t)0.678350043f,
+   (float16_t)-0.735778589f,  (float16_t)0.677222170f,
+   (float16_t)-0.736816569f,  (float16_t)0.676092704f,
+   (float16_t)-0.737852815f,  (float16_t)0.674961646f,
+   (float16_t)-0.738887324f,  (float16_t)0.673829000f,
+   (float16_t)-0.739920095f,  (float16_t)0.672694769f,
+   (float16_t)-0.740951125f,  (float16_t)0.671558955f,
+   (float16_t)-0.741980412f,  (float16_t)0.670421560f,
+   (float16_t)-0.743007952f,  (float16_t)0.669282588f,
+   (float16_t)-0.744033744f,  (float16_t)0.668142041f,
+   (float16_t)-0.745057785f,  (float16_t)0.666999922f,
+   (float16_t)-0.746080074f,  (float16_t)0.665856234f,
+   (float16_t)-0.747100606f,  (float16_t)0.664710978f,
+   (float16_t)-0.748119380f,  (float16_t)0.663564159f,
+   (float16_t)-0.749136395f,  (float16_t)0.662415778f,
+   (float16_t)-0.750151646f,  (float16_t)0.661265838f,
+   (float16_t)-0.751165132f,  (float16_t)0.660114342f,
+   (float16_t)-0.752176850f,  (float16_t)0.658961293f,
+   (float16_t)-0.753186799f,  (float16_t)0.657806693f,
+   (float16_t)-0.754194975f,  (float16_t)0.656650546f,
+   (float16_t)-0.755201377f,  (float16_t)0.655492853f,
+   (float16_t)-0.756206001f,  (float16_t)0.654333618f,
+   (float16_t)-0.757208847f,  (float16_t)0.653172843f,
+   (float16_t)-0.758209910f,  (float16_t)0.652010531f,
+   (float16_t)-0.759209189f,  (float16_t)0.650846685f,
+   (float16_t)-0.760206682f,  (float16_t)0.649681307f,
+   (float16_t)-0.761202385f,  (float16_t)0.648514401f,
+   (float16_t)-0.762196298f,  (float16_t)0.647345969f,
+   (float16_t)-0.763188417f,  (float16_t)0.646176013f,
+   (float16_t)-0.764178741f,  (float16_t)0.645004537f,
+   (float16_t)-0.765167266f,  (float16_t)0.643831543f,
+   (float16_t)-0.766153990f,  (float16_t)0.642657034f,
+   (float16_t)-0.767138912f,  (float16_t)0.641481013f,
+   (float16_t)-0.768122029f,  (float16_t)0.640303482f,
+   (float16_t)-0.769103338f,  (float16_t)0.639124445f,
+   (float16_t)-0.770082837f,  (float16_t)0.637943904f,
+   (float16_t)-0.771060524f,  (float16_t)0.636761861f,
+   (float16_t)-0.772036397f,  (float16_t)0.635578320f,
+   (float16_t)-0.773010453f,  (float16_t)0.634393284f,
+   (float16_t)-0.773982691f,  (float16_t)0.633206755f,
+   (float16_t)-0.774953107f,  (float16_t)0.632018736f,
+   (float16_t)-0.775921699f,  (float16_t)0.630829230f,
+   (float16_t)-0.776888466f,  (float16_t)0.629638239f,
+   (float16_t)-0.777853404f,  (float16_t)0.628445767f,
+   (float16_t)-0.778816512f,  (float16_t)0.627251815f,
+   (float16_t)-0.779777788f,  (float16_t)0.626056388f,
+   (float16_t)-0.780737229f,  (float16_t)0.624859488f,
+   (float16_t)-0.781694832f,  (float16_t)0.623661118f,
+   (float16_t)-0.782650596f,  (float16_t)0.622461279f,
+   (float16_t)-0.783604519f,  (float16_t)0.621259977f,
+   (float16_t)-0.784556597f,  (float16_t)0.620057212f,
+   (float16_t)-0.785506830f,  (float16_t)0.618852988f,
+   (float16_t)-0.786455214f,  (float16_t)0.617647308f,
+   (float16_t)-0.787401747f,  (float16_t)0.616440175f,
+   (float16_t)-0.788346428f,  (float16_t)0.615231591f,
+   (float16_t)-0.789289253f,  (float16_t)0.614021559f,
+   (float16_t)-0.790230221f,  (float16_t)0.612810082f,
+   (float16_t)-0.791169330f,  (float16_t)0.611597164f,
+   (float16_t)-0.792106577f,  (float16_t)0.610382806f,
+   (float16_t)-0.793041960f,  (float16_t)0.609167012f,
+   (float16_t)-0.793975478f,  (float16_t)0.607949785f,
+   (float16_t)-0.794907126f,  (float16_t)0.606731127f,
+   (float16_t)-0.795836905f,  (float16_t)0.605511041f,
+   (float16_t)-0.796764810f,  (float16_t)0.604289531f,
+   (float16_t)-0.797690841f,  (float16_t)0.603066599f,
+   (float16_t)-0.798614995f,  (float16_t)0.601842247f,
+   (float16_t)-0.799537269f,  (float16_t)0.600616479f,
+   (float16_t)-0.800457662f,  (float16_t)0.599389298f,
+   (float16_t)-0.801376172f,  (float16_t)0.598160707f,
+   (float16_t)-0.802292796f,  (float16_t)0.596930708f,
+   (float16_t)-0.803207531f,  (float16_t)0.595699304f,
+   (float16_t)-0.804120377f,  (float16_t)0.594466499f,
+   (float16_t)-0.805031331f,  (float16_t)0.593232295f,
+   (float16_t)-0.805940391f,  (float16_t)0.591996695f,
+   (float16_t)-0.806847554f,  (float16_t)0.590759702f,
+   (float16_t)-0.807752818f,  (float16_t)0.589521319f,
+   (float16_t)-0.808656182f,  (float16_t)0.588281548f,
+   (float16_t)-0.809557642f,  (float16_t)0.587040394f,
+   (float16_t)-0.810457198f,  (float16_t)0.585797857f,
+   (float16_t)-0.811354847f,  (float16_t)0.584553943f,
+   (float16_t)-0.812250587f,  (float16_t)0.583308653f,
+   (float16_t)-0.813144415f,  (float16_t)0.582061990f,
+   (float16_t)-0.814036330f,  (float16_t)0.580813958f,
+   (float16_t)-0.814926329f,  (float16_t)0.579564559f,
+   (float16_t)-0.815814411f,  (float16_t)0.578313796f,
+   (float16_t)-0.816700573f,  (float16_t)0.577061673f,
+   (float16_t)-0.817584813f,  (float16_t)0.575808191f,
+   (float16_t)-0.818467130f,  (float16_t)0.574553355f,
+   (float16_t)-0.819347520f,  (float16_t)0.573297167f,
+   (float16_t)-0.820225983f,  (float16_t)0.572039629f,
+   (float16_t)-0.821102515f,  (float16_t)0.570780746f,
+   (float16_t)-0.821977115f,  (float16_t)0.569520519f,
+   (float16_t)-0.822849781f,  (float16_t)0.568258953f,
+   (float16_t)-0.823720511f,  (float16_t)0.566996049f,
+   (float16_t)-0.824589303f,  (float16_t)0.565731811f,
+   (float16_t)-0.825456154f,  (float16_t)0.564466242f,
+   (float16_t)-0.826321063f,  (float16_t)0.563199344f,
+   (float16_t)-0.827184027f,  (float16_t)0.561931121f,
+   (float16_t)-0.828045045f,  (float16_t)0.560661576f,
+   (float16_t)-0.828904115f,  (float16_t)0.559390712f,
+   (float16_t)-0.829761234f,  (float16_t)0.558118531f,
+   (float16_t)-0.830616400f,  (float16_t)0.556845037f,
+   (float16_t)-0.831469612f,  (float16_t)0.555570233f,
+   (float16_t)-0.832320868f,  (float16_t)0.554294121f,
+   (float16_t)-0.833170165f,  (float16_t)0.553016706f,
+   (float16_t)-0.834017501f,  (float16_t)0.551737988f,
+   (float16_t)-0.834862875f,  (float16_t)0.550457973f,
+   (float16_t)-0.835706284f,  (float16_t)0.549176662f,
+   (float16_t)-0.836547727f,  (float16_t)0.547894059f,
+   (float16_t)-0.837387202f,  (float16_t)0.546610167f,
+   (float16_t)-0.838224706f,  (float16_t)0.545324988f,
+   (float16_t)-0.839060237f,  (float16_t)0.544038527f,
+   (float16_t)-0.839893794f,  (float16_t)0.542750785f,
+   (float16_t)-0.840725375f,  (float16_t)0.541461766f,
+   (float16_t)-0.841554977f,  (float16_t)0.540171473f,
+   (float16_t)-0.842382600f,  (float16_t)0.538879909f,
+   (float16_t)-0.843208240f,  (float16_t)0.537587076f,
+   (float16_t)-0.844031895f,  (float16_t)0.536292979f,
+   (float16_t)-0.844853565f,  (float16_t)0.534997620f,
+   (float16_t)-0.845673247f,  (float16_t)0.533701002f,
+   (float16_t)-0.846490939f,  (float16_t)0.532403128f,
+   (float16_t)-0.847306639f,  (float16_t)0.531104001f,
+   (float16_t)-0.848120345f,  (float16_t)0.529803625f,
+   (float16_t)-0.848932055f,  (float16_t)0.528502002f,
+   (float16_t)-0.849741768f,  (float16_t)0.527199135f,
+   (float16_t)-0.850549481f,  (float16_t)0.525895027f,
+   (float16_t)-0.851355193f,  (float16_t)0.524589683f,
+   (float16_t)-0.852158902f,  (float16_t)0.523283103f,
+   (float16_t)-0.852960605f,  (float16_t)0.521975293f,
+   (float16_t)-0.853760301f,  (float16_t)0.520666254f,
+   (float16_t)-0.854557988f,  (float16_t)0.519355990f,
+   (float16_t)-0.855353665f,  (float16_t)0.518044504f,
+   (float16_t)-0.856147328f,  (float16_t)0.516731799f,
+   (float16_t)-0.856938977f,  (float16_t)0.515417878f,
+   (float16_t)-0.857728610f,  (float16_t)0.514102744f,
+   (float16_t)-0.858516224f,  (float16_t)0.512786401f,
+   (float16_t)-0.859301818f,  (float16_t)0.511468850f,
+   (float16_t)-0.860085390f,  (float16_t)0.510150097f,
+   (float16_t)-0.860866939f,  (float16_t)0.508830143f,
+   (float16_t)-0.861646461f,  (float16_t)0.507508991f,
+   (float16_t)-0.862423956f,  (float16_t)0.506186645f,
+   (float16_t)-0.863199422f,  (float16_t)0.504863109f,
+   (float16_t)-0.863972856f,  (float16_t)0.503538384f,
+   (float16_t)-0.864744258f,  (float16_t)0.502212474f,
+   (float16_t)-0.865513624f,  (float16_t)0.500885383f,
+   (float16_t)-0.866280954f,  (float16_t)0.499557113f,
+   (float16_t)-0.867046246f,  (float16_t)0.498227667f,
+   (float16_t)-0.867809497f,  (float16_t)0.496897049f,
+   (float16_t)-0.868570706f,  (float16_t)0.495565262f,
+   (float16_t)-0.869329871f,  (float16_t)0.494232309f,
+   (float16_t)-0.870086991f,  (float16_t)0.492898192f,
+   (float16_t)-0.870842063f,  (float16_t)0.491562916f,
+   (float16_t)-0.871595087f,  (float16_t)0.490226483f,
+   (float16_t)-0.872346059f,  (float16_t)0.488888897f,
+   (float16_t)-0.873094978f,  (float16_t)0.487550160f,
+   (float16_t)-0.873841843f,  (float16_t)0.486210276f,
+   (float16_t)-0.874586652f,  (float16_t)0.484869248f,
+   (float16_t)-0.875329403f,  (float16_t)0.483527079f,
+   (float16_t)-0.876070094f,  (float16_t)0.482183772f,
+   (float16_t)-0.876808724f,  (float16_t)0.480839331f,
+   (float16_t)-0.877545290f,  (float16_t)0.479493758f,
+   (float16_t)-0.878279792f,  (float16_t)0.478147056f,
+   (float16_t)-0.879012226f,  (float16_t)0.476799230f,
+   (float16_t)-0.879742593f,  (float16_t)0.475450282f,
+   (float16_t)-0.880470889f,  (float16_t)0.474100215f,
+   (float16_t)-0.881197113f,  (float16_t)0.472749032f,
+   (float16_t)-0.881921264f,  (float16_t)0.471396737f,
+   (float16_t)-0.882643340f,  (float16_t)0.470043332f,
+   (float16_t)-0.883363339f,  (float16_t)0.468688822f,
+   (float16_t)-0.884081259f,  (float16_t)0.467333209f,
+   (float16_t)-0.884797098f,  (float16_t)0.465976496f,
+   (float16_t)-0.885510856f,  (float16_t)0.464618686f,
+   (float16_t)-0.886222530f,  (float16_t)0.463259784f,
+   (float16_t)-0.886932119f,  (float16_t)0.461899791f,
+   (float16_t)-0.887639620f,  (float16_t)0.460538711f,
+   (float16_t)-0.888345033f,  (float16_t)0.459176548f,
+   (float16_t)-0.889048356f,  (float16_t)0.457813304f,
+   (float16_t)-0.889749586f,  (float16_t)0.456448982f,
+   (float16_t)-0.890448723f,  (float16_t)0.455083587f,
+   (float16_t)-0.891145765f,  (float16_t)0.453717121f,
+   (float16_t)-0.891840709f,  (float16_t)0.452349587f,
+   (float16_t)-0.892533555f,  (float16_t)0.450980989f,
+   (float16_t)-0.893224301f,  (float16_t)0.449611330f,
+   (float16_t)-0.893912945f,  (float16_t)0.448240612f,
+   (float16_t)-0.894599486f,  (float16_t)0.446868840f,
+   (float16_t)-0.895283921f,  (float16_t)0.445496017f,
+   (float16_t)-0.895966250f,  (float16_t)0.444122145f,
+   (float16_t)-0.896646470f,  (float16_t)0.442747228f,
+   (float16_t)-0.897324581f,  (float16_t)0.441371269f,
+   (float16_t)-0.898000580f,  (float16_t)0.439994271f,
+   (float16_t)-0.898674466f,  (float16_t)0.438616239f,
+   (float16_t)-0.899346237f,  (float16_t)0.437237174f,
+   (float16_t)-0.900015892f,  (float16_t)0.435857080f,
+   (float16_t)-0.900683429f,  (float16_t)0.434475961f,
+   (float16_t)-0.901348847f,  (float16_t)0.433093819f,
+   (float16_t)-0.902012144f,  (float16_t)0.431710658f,
+   (float16_t)-0.902673318f,  (float16_t)0.430326481f,
+   (float16_t)-0.903332368f,  (float16_t)0.428941292f,
+   (float16_t)-0.903989293f,  (float16_t)0.427555093f,
+   (float16_t)-0.904644091f,  (float16_t)0.426167889f,
+   (float16_t)-0.905296759f,  (float16_t)0.424779681f,
+   (float16_t)-0.905947298f,  (float16_t)0.423390474f,
+   (float16_t)-0.906595705f,  (float16_t)0.422000271f,
+   (float16_t)-0.907241978f,  (float16_t)0.420609074f,
+   (float16_t)-0.907886116f,  (float16_t)0.419216888f,
+   (float16_t)-0.908528119f,  (float16_t)0.417823716f,
+   (float16_t)-0.909167983f,  (float16_t)0.416429560f,
+   (float16_t)-0.909805708f,  (float16_t)0.415034424f,
+   (float16_t)-0.910441292f,  (float16_t)0.413638312f,
+   (float16_t)-0.911074734f,  (float16_t)0.412241227f,
+   (float16_t)-0.911706032f,  (float16_t)0.410843171f,
+   (float16_t)-0.912335185f,  (float16_t)0.409444149f,
+   (float16_t)-0.912962190f,  (float16_t)0.408044163f,
+   (float16_t)-0.913587048f,  (float16_t)0.406643217f,
+   (float16_t)-0.914209756f,  (float16_t)0.405241314f,
+   (float16_t)-0.914830312f,  (float16_t)0.403838458f,
+   (float16_t)-0.915448716f,  (float16_t)0.402434651f,
+   (float16_t)-0.916064966f,  (float16_t)0.401029897f,
+   (float16_t)-0.916679060f,  (float16_t)0.399624200f,
+   (float16_t)-0.917290997f,  (float16_t)0.398217562f,
+   (float16_t)-0.917900776f,  (float16_t)0.396809987f,
+   (float16_t)-0.918508394f,  (float16_t)0.395401479f,
+   (float16_t)-0.919113852f,  (float16_t)0.393992040f,
+   (float16_t)-0.919717146f,  (float16_t)0.392581674f,
+   (float16_t)-0.920318277f,  (float16_t)0.391170384f,
+   (float16_t)-0.920917242f,  (float16_t)0.389758174f,
+   (float16_t)-0.921514039f,  (float16_t)0.388345047f,
+   (float16_t)-0.922108669f,  (float16_t)0.386931006f,
+   (float16_t)-0.922701128f,  (float16_t)0.385516054f,
+   (float16_t)-0.923291417f,  (float16_t)0.384100195f,
+   (float16_t)-0.923879533f,  (float16_t)0.382683432f,
+   (float16_t)-0.924465474f,  (float16_t)0.381265769f,
+   (float16_t)-0.925049241f,  (float16_t)0.379847209f,
+   (float16_t)-0.925630831f,  (float16_t)0.378427755f,
+   (float16_t)-0.926210242f,  (float16_t)0.377007410f,
+   (float16_t)-0.926787474f,  (float16_t)0.375586178f,
+   (float16_t)-0.927362526f,  (float16_t)0.374164063f,
+   (float16_t)-0.927935395f,  (float16_t)0.372741067f,
+   (float16_t)-0.928506080f,  (float16_t)0.371317194f,
+   (float16_t)-0.929074581f,  (float16_t)0.369892447f,
+   (float16_t)-0.929640896f,  (float16_t)0.368466830f,
+   (float16_t)-0.930205023f,  (float16_t)0.367040346f,
+   (float16_t)-0.930766961f,  (float16_t)0.365612998f,
+   (float16_t)-0.931326709f,  (float16_t)0.364184790f,
+   (float16_t)-0.931884266f,  (float16_t)0.362755724f,
+   (float16_t)-0.932439629f,  (float16_t)0.361325806f,
+   (float16_t)-0.932992799f,  (float16_t)0.359895037f,
+   (float16_t)-0.933543773f,  (float16_t)0.358463421f,
+   (float16_t)-0.934092550f,  (float16_t)0.357030961f,
+   (float16_t)-0.934639130f,  (float16_t)0.355597662f,
+   (float16_t)-0.935183510f,  (float16_t)0.354163525f,
+   (float16_t)-0.935725689f,  (float16_t)0.352728556f,
+   (float16_t)-0.936265667f,  (float16_t)0.351292756f,
+   (float16_t)-0.936803442f,  (float16_t)0.349856130f,
+   (float16_t)-0.937339012f,  (float16_t)0.348418680f,
+   (float16_t)-0.937872376f,  (float16_t)0.346980411f,
+   (float16_t)-0.938403534f,  (float16_t)0.345541325f,
+   (float16_t)-0.938932484f,  (float16_t)0.344101426f,
+   (float16_t)-0.939459224f,  (float16_t)0.342660717f,
+   (float16_t)-0.939983753f,  (float16_t)0.341219202f,
+   (float16_t)-0.940506071f,  (float16_t)0.339776884f,
+   (float16_t)-0.941026175f,  (float16_t)0.338333767f,
+   (float16_t)-0.941544065f,  (float16_t)0.336889853f,
+   (float16_t)-0.942059740f,  (float16_t)0.335445147f,
+   (float16_t)-0.942573198f,  (float16_t)0.333999651f,
+   (float16_t)-0.943084437f,  (float16_t)0.332553370f,
+   (float16_t)-0.943593458f,  (float16_t)0.331106306f,
+   (float16_t)-0.944100258f,  (float16_t)0.329658463f,
+   (float16_t)-0.944604837f,  (float16_t)0.328209844f,
+   (float16_t)-0.945107193f,  (float16_t)0.326760452f,
+   (float16_t)-0.945607325f,  (float16_t)0.325310292f,
+   (float16_t)-0.946105232f,  (float16_t)0.323859367f,
+   (float16_t)-0.946600913f,  (float16_t)0.322407679f,
+   (float16_t)-0.947094366f,  (float16_t)0.320955232f,
+   (float16_t)-0.947585591f,  (float16_t)0.319502031f,
+   (float16_t)-0.948074586f,  (float16_t)0.318048077f,
+   (float16_t)-0.948561350f,  (float16_t)0.316593376f,
+   (float16_t)-0.949045882f,  (float16_t)0.315137929f,
+   (float16_t)-0.949528181f,  (float16_t)0.313681740f,
+   (float16_t)-0.950008245f,  (float16_t)0.312224814f,
+   (float16_t)-0.950486074f,  (float16_t)0.310767153f,
+   (float16_t)-0.950961666f,  (float16_t)0.309308760f,
+   (float16_t)-0.951435021f,  (float16_t)0.307849640f,
+   (float16_t)-0.951906137f,  (float16_t)0.306389795f,
+   (float16_t)-0.952375013f,  (float16_t)0.304929230f,
+   (float16_t)-0.952841648f,  (float16_t)0.303467947f,
+   (float16_t)-0.953306040f,  (float16_t)0.302005949f,
+   (float16_t)-0.953768190f,  (float16_t)0.300543241f,
+   (float16_t)-0.954228095f,  (float16_t)0.299079826f,
+   (float16_t)-0.954685755f,  (float16_t)0.297615707f,
+   (float16_t)-0.955141168f,  (float16_t)0.296150888f,
+   (float16_t)-0.955594334f,  (float16_t)0.294685372f,
+   (float16_t)-0.956045251f,  (float16_t)0.293219163f,
+   (float16_t)-0.956493919f,  (float16_t)0.291752263f,
+   (float16_t)-0.956940336f,  (float16_t)0.290284677f,
+   (float16_t)-0.957384501f,  (float16_t)0.288816408f,
+   (float16_t)-0.957826413f,  (float16_t)0.287347460f,
+   (float16_t)-0.958266071f,  (float16_t)0.285877835f,
+   (float16_t)-0.958703475f,  (float16_t)0.284407537f,
+   (float16_t)-0.959138622f,  (float16_t)0.282936570f,
+   (float16_t)-0.959571513f,  (float16_t)0.281464938f,
+   (float16_t)-0.960002146f,  (float16_t)0.279992643f,
+   (float16_t)-0.960430519f,  (float16_t)0.278519689f,
+   (float16_t)-0.960856633f,  (float16_t)0.277046080f,
+   (float16_t)-0.961280486f,  (float16_t)0.275571819f,
+   (float16_t)-0.961702077f,  (float16_t)0.274096910f,
+   (float16_t)-0.962121404f,  (float16_t)0.272621355f,
+   (float16_t)-0.962538468f,  (float16_t)0.271145160f,
+   (float16_t)-0.962953267f,  (float16_t)0.269668326f,
+   (float16_t)-0.963365800f,  (float16_t)0.268190857f,
+   (float16_t)-0.963776066f,  (float16_t)0.266712757f,
+   (float16_t)-0.964184064f,  (float16_t)0.265234030f,
+   (float16_t)-0.964589793f,  (float16_t)0.263754679f,
+   (float16_t)-0.964993253f,  (float16_t)0.262274707f,
+   (float16_t)-0.965394442f,  (float16_t)0.260794118f,
+   (float16_t)-0.965793359f,  (float16_t)0.259312915f,
+   (float16_t)-0.966190003f,  (float16_t)0.257831102f,
+   (float16_t)-0.966584374f,  (float16_t)0.256348682f,
+   (float16_t)-0.966976471f,  (float16_t)0.254865660f,
+   (float16_t)-0.967366292f,  (float16_t)0.253382037f,
+   (float16_t)-0.967753837f,  (float16_t)0.251897818f,
+   (float16_t)-0.968139105f,  (float16_t)0.250413007f,
+   (float16_t)-0.968522094f,  (float16_t)0.248927606f,
+   (float16_t)-0.968902805f,  (float16_t)0.247441619f,
+   (float16_t)-0.969281235f,  (float16_t)0.245955050f,
+   (float16_t)-0.969657385f,  (float16_t)0.244467903f,
+   (float16_t)-0.970031253f,  (float16_t)0.242980180f,
+   (float16_t)-0.970402839f,  (float16_t)0.241491885f,
+   (float16_t)-0.970772141f,  (float16_t)0.240003022f,
+   (float16_t)-0.971139158f,  (float16_t)0.238513595f,
+   (float16_t)-0.971503891f,  (float16_t)0.237023606f,
+   (float16_t)-0.971866337f,  (float16_t)0.235533059f,
+   (float16_t)-0.972226497f,  (float16_t)0.234041959f,
+   (float16_t)-0.972584369f,  (float16_t)0.232550307f,
+   (float16_t)-0.972939952f,  (float16_t)0.231058108f,
+   (float16_t)-0.973293246f,  (float16_t)0.229565366f,
+   (float16_t)-0.973644250f,  (float16_t)0.228072083f,
+   (float16_t)-0.973992962f,  (float16_t)0.226578264f,
+   (float16_t)-0.974339383f,  (float16_t)0.225083911f,
+   (float16_t)-0.974683511f,  (float16_t)0.223589029f,
+   (float16_t)-0.975025345f,  (float16_t)0.222093621f,
+   (float16_t)-0.975364885f,  (float16_t)0.220597690f,
+   (float16_t)-0.975702130f,  (float16_t)0.219101240f,
+   (float16_t)-0.976037079f,  (float16_t)0.217604275f,
+   (float16_t)-0.976369731f,  (float16_t)0.216106797f,
+   (float16_t)-0.976700086f,  (float16_t)0.214608811f,
+   (float16_t)-0.977028143f,  (float16_t)0.213110320f,
+   (float16_t)-0.977353900f,  (float16_t)0.211611327f,
+   (float16_t)-0.977677358f,  (float16_t)0.210111837f,
+   (float16_t)-0.977998515f,  (float16_t)0.208611852f,
+   (float16_t)-0.978317371f,  (float16_t)0.207111376f,
+   (float16_t)-0.978633924f,  (float16_t)0.205610413f,
+   (float16_t)-0.978948175f,  (float16_t)0.204108966f,
+   (float16_t)-0.979260123f,  (float16_t)0.202607039f,
+   (float16_t)-0.979569766f,  (float16_t)0.201104635f,
+   (float16_t)-0.979877104f,  (float16_t)0.199601758f,
+   (float16_t)-0.980182136f,  (float16_t)0.198098411f,
+   (float16_t)-0.980484862f,  (float16_t)0.196594598f,
+   (float16_t)-0.980785280f,  (float16_t)0.195090322f,
+   (float16_t)-0.981083391f,  (float16_t)0.193585587f,
+   (float16_t)-0.981379193f,  (float16_t)0.192080397f,
+   (float16_t)-0.981672686f,  (float16_t)0.190574755f,
+   (float16_t)-0.981963869f,  (float16_t)0.189068664f,
+   (float16_t)-0.982252741f,  (float16_t)0.187562129f,
+   (float16_t)-0.982539302f,  (float16_t)0.186055152f,
+   (float16_t)-0.982823551f,  (float16_t)0.184547737f,
+   (float16_t)-0.983105487f,  (float16_t)0.183039888f,
+   (float16_t)-0.983385110f,  (float16_t)0.181531608f,
+   (float16_t)-0.983662419f,  (float16_t)0.180022901f,
+   (float16_t)-0.983937413f,  (float16_t)0.178513771f,
+   (float16_t)-0.984210092f,  (float16_t)0.177004220f,
+   (float16_t)-0.984480455f,  (float16_t)0.175494253f,
+   (float16_t)-0.984748502f,  (float16_t)0.173983873f,
+   (float16_t)-0.985014231f,  (float16_t)0.172473084f,
+   (float16_t)-0.985277642f,  (float16_t)0.170961889f,
+   (float16_t)-0.985538735f,  (float16_t)0.169450291f,
+   (float16_t)-0.985797509f,  (float16_t)0.167938295f,
+   (float16_t)-0.986053963f,  (float16_t)0.166425904f,
+   (float16_t)-0.986308097f,  (float16_t)0.164913120f,
+   (float16_t)-0.986559910f,  (float16_t)0.163399949f,
+   (float16_t)-0.986809402f,  (float16_t)0.161886394f,
+   (float16_t)-0.987056571f,  (float16_t)0.160372457f,
+   (float16_t)-0.987301418f,  (float16_t)0.158858143f,
+   (float16_t)-0.987543942f,  (float16_t)0.157343456f,
+   (float16_t)-0.987784142f,  (float16_t)0.155828398f,
+   (float16_t)-0.988022017f,  (float16_t)0.154312973f,
+   (float16_t)-0.988257568f,  (float16_t)0.152797185f,
+   (float16_t)-0.988490793f,  (float16_t)0.151281038f,
+   (float16_t)-0.988721692f,  (float16_t)0.149764535f,
+   (float16_t)-0.988950265f,  (float16_t)0.148247679f,
+   (float16_t)-0.989176510f,  (float16_t)0.146730474f,
+   (float16_t)-0.989400428f,  (float16_t)0.145212925f,
+   (float16_t)-0.989622017f,  (float16_t)0.143695033f,
+   (float16_t)-0.989841278f,  (float16_t)0.142176804f,
+   (float16_t)-0.990058210f,  (float16_t)0.140658239f,
+   (float16_t)-0.990272812f,  (float16_t)0.139139344f,
+   (float16_t)-0.990485084f,  (float16_t)0.137620122f,
+   (float16_t)-0.990695025f,  (float16_t)0.136100575f,
+   (float16_t)-0.990902635f,  (float16_t)0.134580709f,
+   (float16_t)-0.991107914f,  (float16_t)0.133060525f,
+   (float16_t)-0.991310860f,  (float16_t)0.131540029f,
+   (float16_t)-0.991511473f,  (float16_t)0.130019223f,
+   (float16_t)-0.991709754f,  (float16_t)0.128498111f,
+   (float16_t)-0.991905700f,  (float16_t)0.126976696f,
+   (float16_t)-0.992099313f,  (float16_t)0.125454983f,
+   (float16_t)-0.992290591f,  (float16_t)0.123932975f,
+   (float16_t)-0.992479535f,  (float16_t)0.122410675f,
+   (float16_t)-0.992666142f,  (float16_t)0.120888087f,
+   (float16_t)-0.992850414f,  (float16_t)0.119365215f,
+   (float16_t)-0.993032350f,  (float16_t)0.117842062f,
+   (float16_t)-0.993211949f,  (float16_t)0.116318631f,
+   (float16_t)-0.993389211f,  (float16_t)0.114794927f,
+   (float16_t)-0.993564136f,  (float16_t)0.113270952f,
+   (float16_t)-0.993736722f,  (float16_t)0.111746711f,
+   (float16_t)-0.993906970f,  (float16_t)0.110222207f,
+   (float16_t)-0.994074879f,  (float16_t)0.108697444f,
+   (float16_t)-0.994240449f,  (float16_t)0.107172425f,
+   (float16_t)-0.994403680f,  (float16_t)0.105647154f,
+   (float16_t)-0.994564571f,  (float16_t)0.104121634f,
+   (float16_t)-0.994723121f,  (float16_t)0.102595869f,
+   (float16_t)-0.994879331f,  (float16_t)0.101069863f,
+   (float16_t)-0.995033199f,  (float16_t)0.099543619f,
+   (float16_t)-0.995184727f,  (float16_t)0.098017140f,
+   (float16_t)-0.995333912f,  (float16_t)0.096490431f,
+   (float16_t)-0.995480755f,  (float16_t)0.094963495f,
+   (float16_t)-0.995625256f,  (float16_t)0.093436336f,
+   (float16_t)-0.995767414f,  (float16_t)0.091908956f,
+   (float16_t)-0.995907229f,  (float16_t)0.090381361f,
+   (float16_t)-0.996044701f,  (float16_t)0.088853553f,
+   (float16_t)-0.996179829f,  (float16_t)0.087325535f,
+   (float16_t)-0.996312612f,  (float16_t)0.085797312f,
+   (float16_t)-0.996443051f,  (float16_t)0.084268888f,
+   (float16_t)-0.996571146f,  (float16_t)0.082740265f,
+   (float16_t)-0.996696895f,  (float16_t)0.081211447f,
+   (float16_t)-0.996820299f,  (float16_t)0.079682438f,
+   (float16_t)-0.996941358f,  (float16_t)0.078153242f,
+   (float16_t)-0.997060070f,  (float16_t)0.076623861f,
+   (float16_t)-0.997176437f,  (float16_t)0.075094301f,
+   (float16_t)-0.997290457f,  (float16_t)0.073564564f,
+   (float16_t)-0.997402130f,  (float16_t)0.072034653f,
+   (float16_t)-0.997511456f,  (float16_t)0.070504573f,
+   (float16_t)-0.997618435f,  (float16_t)0.068974328f,
+   (float16_t)-0.997723067f,  (float16_t)0.067443920f,
+   (float16_t)-0.997825350f,  (float16_t)0.065913353f,
+   (float16_t)-0.997925286f,  (float16_t)0.064382631f,
+   (float16_t)-0.998022874f,  (float16_t)0.062851758f,
+   (float16_t)-0.998118113f,  (float16_t)0.061320736f,
+   (float16_t)-0.998211003f,  (float16_t)0.059789571f,
+   (float16_t)-0.998301545f,  (float16_t)0.058258265f,
+   (float16_t)-0.998389737f,  (float16_t)0.056726821f,
+   (float16_t)-0.998475581f,  (float16_t)0.055195244f,
+   (float16_t)-0.998559074f,  (float16_t)0.053663538f,
+   (float16_t)-0.998640218f,  (float16_t)0.052131705f,
+   (float16_t)-0.998719012f,  (float16_t)0.050599749f,
+   (float16_t)-0.998795456f,  (float16_t)0.049067674f,
+   (float16_t)-0.998869550f,  (float16_t)0.047535484f,
+   (float16_t)-0.998941293f,  (float16_t)0.046003182f,
+   (float16_t)-0.999010686f,  (float16_t)0.044470772f,
+   (float16_t)-0.999077728f,  (float16_t)0.042938257f,
+   (float16_t)-0.999142419f,  (float16_t)0.041405641f,
+   (float16_t)-0.999204759f,  (float16_t)0.039872928f,
+   (float16_t)-0.999264747f,  (float16_t)0.038340120f,
+   (float16_t)-0.999322385f,  (float16_t)0.036807223f,
+   (float16_t)-0.999377670f,  (float16_t)0.035274239f,
+   (float16_t)-0.999430605f,  (float16_t)0.033741172f,
+   (float16_t)-0.999481187f,  (float16_t)0.032208025f,
+   (float16_t)-0.999529418f,  (float16_t)0.030674803f,
+   (float16_t)-0.999575296f,  (float16_t)0.029141509f,
+   (float16_t)-0.999618822f,  (float16_t)0.027608146f,
+   (float16_t)-0.999659997f,  (float16_t)0.026074718f,
+   (float16_t)-0.999698819f,  (float16_t)0.024541229f,
+   (float16_t)-0.999735288f,  (float16_t)0.023007681f,
+   (float16_t)-0.999769405f,  (float16_t)0.021474080f,
+   (float16_t)-0.999801170f,  (float16_t)0.019940429f,
+   (float16_t)-0.999830582f,  (float16_t)0.018406730f,
+   (float16_t)-0.999857641f,  (float16_t)0.016872988f,
+   (float16_t)-0.999882347f,  (float16_t)0.015339206f,
+   (float16_t)-0.999904701f,  (float16_t)0.013805389f,
+   (float16_t)-0.999924702f,  (float16_t)0.012271538f,
+   (float16_t)-0.999942350f,  (float16_t)0.010737659f,
+   (float16_t)-0.999957645f,  (float16_t)0.009203755f,
+   (float16_t)-0.999970586f,  (float16_t)0.007669829f,
+   (float16_t)-0.999981175f,  (float16_t)0.006135885f,
+   (float16_t)-0.999989411f,  (float16_t)0.004601926f,
+   (float16_t)-0.999995294f,  (float16_t)0.003067957f,
+   (float16_t)-0.999998823f,  (float16_t)0.001533980f,
+   (float16_t)-1.000000000f,  (float16_t)0.000000000f,
+   (float16_t)-0.999998823f, (float16_t)-0.001533980f,
+   (float16_t)-0.999995294f, (float16_t)-0.003067957f,
+   (float16_t)-0.999989411f, (float16_t)-0.004601926f,
+   (float16_t)-0.999981175f, (float16_t)-0.006135885f,
+   (float16_t)-0.999970586f, (float16_t)-0.007669829f,
+   (float16_t)-0.999957645f, (float16_t)-0.009203755f,
+   (float16_t)-0.999942350f, (float16_t)-0.010737659f,
+   (float16_t)-0.999924702f, (float16_t)-0.012271538f,
+   (float16_t)-0.999904701f, (float16_t)-0.013805389f,
+   (float16_t)-0.999882347f, (float16_t)-0.015339206f,
+   (float16_t)-0.999857641f, (float16_t)-0.016872988f,
+   (float16_t)-0.999830582f, (float16_t)-0.018406730f,
+   (float16_t)-0.999801170f, (float16_t)-0.019940429f,
+   (float16_t)-0.999769405f, (float16_t)-0.021474080f,
+   (float16_t)-0.999735288f, (float16_t)-0.023007681f,
+   (float16_t)-0.999698819f, (float16_t)-0.024541229f,
+   (float16_t)-0.999659997f, (float16_t)-0.026074718f,
+   (float16_t)-0.999618822f, (float16_t)-0.027608146f,
+   (float16_t)-0.999575296f, (float16_t)-0.029141509f,
+   (float16_t)-0.999529418f, (float16_t)-0.030674803f,
+   (float16_t)-0.999481187f, (float16_t)-0.032208025f,
+   (float16_t)-0.999430605f, (float16_t)-0.033741172f,
+   (float16_t)-0.999377670f, (float16_t)-0.035274239f,
+   (float16_t)-0.999322385f, (float16_t)-0.036807223f,
+   (float16_t)-0.999264747f, (float16_t)-0.038340120f,
+   (float16_t)-0.999204759f, (float16_t)-0.039872928f,
+   (float16_t)-0.999142419f, (float16_t)-0.041405641f,
+   (float16_t)-0.999077728f, (float16_t)-0.042938257f,
+   (float16_t)-0.999010686f, (float16_t)-0.044470772f,
+   (float16_t)-0.998941293f, (float16_t)-0.046003182f,
+   (float16_t)-0.998869550f, (float16_t)-0.047535484f,
+   (float16_t)-0.998795456f, (float16_t)-0.049067674f,
+   (float16_t)-0.998719012f, (float16_t)-0.050599749f,
+   (float16_t)-0.998640218f, (float16_t)-0.052131705f,
+   (float16_t)-0.998559074f, (float16_t)-0.053663538f,
+   (float16_t)-0.998475581f, (float16_t)-0.055195244f,
+   (float16_t)-0.998389737f, (float16_t)-0.056726821f,
+   (float16_t)-0.998301545f, (float16_t)-0.058258265f,
+   (float16_t)-0.998211003f, (float16_t)-0.059789571f,
+   (float16_t)-0.998118113f, (float16_t)-0.061320736f,
+   (float16_t)-0.998022874f, (float16_t)-0.062851758f,
+   (float16_t)-0.997925286f, (float16_t)-0.064382631f,
+   (float16_t)-0.997825350f, (float16_t)-0.065913353f,
+   (float16_t)-0.997723067f, (float16_t)-0.067443920f,
+   (float16_t)-0.997618435f, (float16_t)-0.068974328f,
+   (float16_t)-0.997511456f, (float16_t)-0.070504573f,
+   (float16_t)-0.997402130f, (float16_t)-0.072034653f,
+   (float16_t)-0.997290457f, (float16_t)-0.073564564f,
+   (float16_t)-0.997176437f, (float16_t)-0.075094301f,
+   (float16_t)-0.997060070f, (float16_t)-0.076623861f,
+   (float16_t)-0.996941358f, (float16_t)-0.078153242f,
+   (float16_t)-0.996820299f, (float16_t)-0.079682438f,
+   (float16_t)-0.996696895f, (float16_t)-0.081211447f,
+   (float16_t)-0.996571146f, (float16_t)-0.082740265f,
+   (float16_t)-0.996443051f, (float16_t)-0.084268888f,
+   (float16_t)-0.996312612f, (float16_t)-0.085797312f,
+   (float16_t)-0.996179829f, (float16_t)-0.087325535f,
+   (float16_t)-0.996044701f, (float16_t)-0.088853553f,
+   (float16_t)-0.995907229f, (float16_t)-0.090381361f,
+   (float16_t)-0.995767414f, (float16_t)-0.091908956f,
+   (float16_t)-0.995625256f, (float16_t)-0.093436336f,
+   (float16_t)-0.995480755f, (float16_t)-0.094963495f,
+   (float16_t)-0.995333912f, (float16_t)-0.096490431f,
+   (float16_t)-0.995184727f, (float16_t)-0.098017140f,
+   (float16_t)-0.995033199f, (float16_t)-0.099543619f,
+   (float16_t)-0.994879331f, (float16_t)-0.101069863f,
+   (float16_t)-0.994723121f, (float16_t)-0.102595869f,
+   (float16_t)-0.994564571f, (float16_t)-0.104121634f,
+   (float16_t)-0.994403680f, (float16_t)-0.105647154f,
+   (float16_t)-0.994240449f, (float16_t)-0.107172425f,
+   (float16_t)-0.994074879f, (float16_t)-0.108697444f,
+   (float16_t)-0.993906970f, (float16_t)-0.110222207f,
+   (float16_t)-0.993736722f, (float16_t)-0.111746711f,
+   (float16_t)-0.993564136f, (float16_t)-0.113270952f,
+   (float16_t)-0.993389211f, (float16_t)-0.114794927f,
+   (float16_t)-0.993211949f, (float16_t)-0.116318631f,
+   (float16_t)-0.993032350f, (float16_t)-0.117842062f,
+   (float16_t)-0.992850414f, (float16_t)-0.119365215f,
+   (float16_t)-0.992666142f, (float16_t)-0.120888087f,
+   (float16_t)-0.992479535f, (float16_t)-0.122410675f,
+   (float16_t)-0.992290591f, (float16_t)-0.123932975f,
+   (float16_t)-0.992099313f, (float16_t)-0.125454983f,
+   (float16_t)-0.991905700f, (float16_t)-0.126976696f,
+   (float16_t)-0.991709754f, (float16_t)-0.128498111f,
+   (float16_t)-0.991511473f, (float16_t)-0.130019223f,
+   (float16_t)-0.991310860f, (float16_t)-0.131540029f,
+   (float16_t)-0.991107914f, (float16_t)-0.133060525f,
+   (float16_t)-0.990902635f, (float16_t)-0.134580709f,
+   (float16_t)-0.990695025f, (float16_t)-0.136100575f,
+   (float16_t)-0.990485084f, (float16_t)-0.137620122f,
+   (float16_t)-0.990272812f, (float16_t)-0.139139344f,
+   (float16_t)-0.990058210f, (float16_t)-0.140658239f,
+   (float16_t)-0.989841278f, (float16_t)-0.142176804f,
+   (float16_t)-0.989622017f, (float16_t)-0.143695033f,
+   (float16_t)-0.989400428f, (float16_t)-0.145212925f,
+   (float16_t)-0.989176510f, (float16_t)-0.146730474f,
+   (float16_t)-0.988950265f, (float16_t)-0.148247679f,
+   (float16_t)-0.988721692f, (float16_t)-0.149764535f,
+   (float16_t)-0.988490793f, (float16_t)-0.151281038f,
+   (float16_t)-0.988257568f, (float16_t)-0.152797185f,
+   (float16_t)-0.988022017f, (float16_t)-0.154312973f,
+   (float16_t)-0.987784142f, (float16_t)-0.155828398f,
+   (float16_t)-0.987543942f, (float16_t)-0.157343456f,
+   (float16_t)-0.987301418f, (float16_t)-0.158858143f,
+   (float16_t)-0.987056571f, (float16_t)-0.160372457f,
+   (float16_t)-0.986809402f, (float16_t)-0.161886394f,
+   (float16_t)-0.986559910f, (float16_t)-0.163399949f,
+   (float16_t)-0.986308097f, (float16_t)-0.164913120f,
+   (float16_t)-0.986053963f, (float16_t)-0.166425904f,
+   (float16_t)-0.985797509f, (float16_t)-0.167938295f,
+   (float16_t)-0.985538735f, (float16_t)-0.169450291f,
+   (float16_t)-0.985277642f, (float16_t)-0.170961889f,
+   (float16_t)-0.985014231f, (float16_t)-0.172473084f,
+   (float16_t)-0.984748502f, (float16_t)-0.173983873f,
+   (float16_t)-0.984480455f, (float16_t)-0.175494253f,
+   (float16_t)-0.984210092f, (float16_t)-0.177004220f,
+   (float16_t)-0.983937413f, (float16_t)-0.178513771f,
+   (float16_t)-0.983662419f, (float16_t)-0.180022901f,
+   (float16_t)-0.983385110f, (float16_t)-0.181531608f,
+   (float16_t)-0.983105487f, (float16_t)-0.183039888f,
+   (float16_t)-0.982823551f, (float16_t)-0.184547737f,
+   (float16_t)-0.982539302f, (float16_t)-0.186055152f,
+   (float16_t)-0.982252741f, (float16_t)-0.187562129f,
+   (float16_t)-0.981963869f, (float16_t)-0.189068664f,
+   (float16_t)-0.981672686f, (float16_t)-0.190574755f,
+   (float16_t)-0.981379193f, (float16_t)-0.192080397f,
+   (float16_t)-0.981083391f, (float16_t)-0.193585587f,
+   (float16_t)-0.980785280f, (float16_t)-0.195090322f,
+   (float16_t)-0.980484862f, (float16_t)-0.196594598f,
+   (float16_t)-0.980182136f, (float16_t)-0.198098411f,
+   (float16_t)-0.979877104f, (float16_t)-0.199601758f,
+   (float16_t)-0.979569766f, (float16_t)-0.201104635f,
+   (float16_t)-0.979260123f, (float16_t)-0.202607039f,
+   (float16_t)-0.978948175f, (float16_t)-0.204108966f,
+   (float16_t)-0.978633924f, (float16_t)-0.205610413f,
+   (float16_t)-0.978317371f, (float16_t)-0.207111376f,
+   (float16_t)-0.977998515f, (float16_t)-0.208611852f,
+   (float16_t)-0.977677358f, (float16_t)-0.210111837f,
+   (float16_t)-0.977353900f, (float16_t)-0.211611327f,
+   (float16_t)-0.977028143f, (float16_t)-0.213110320f,
+   (float16_t)-0.976700086f, (float16_t)-0.214608811f,
+   (float16_t)-0.976369731f, (float16_t)-0.216106797f,
+   (float16_t)-0.976037079f, (float16_t)-0.217604275f,
+   (float16_t)-0.975702130f, (float16_t)-0.219101240f,
+   (float16_t)-0.975364885f, (float16_t)-0.220597690f,
+   (float16_t)-0.975025345f, (float16_t)-0.222093621f,
+   (float16_t)-0.974683511f, (float16_t)-0.223589029f,
+   (float16_t)-0.974339383f, (float16_t)-0.225083911f,
+   (float16_t)-0.973992962f, (float16_t)-0.226578264f,
+   (float16_t)-0.973644250f, (float16_t)-0.228072083f,
+   (float16_t)-0.973293246f, (float16_t)-0.229565366f,
+   (float16_t)-0.972939952f, (float16_t)-0.231058108f,
+   (float16_t)-0.972584369f, (float16_t)-0.232550307f,
+   (float16_t)-0.972226497f, (float16_t)-0.234041959f,
+   (float16_t)-0.971866337f, (float16_t)-0.235533059f,
+   (float16_t)-0.971503891f, (float16_t)-0.237023606f,
+   (float16_t)-0.971139158f, (float16_t)-0.238513595f,
+   (float16_t)-0.970772141f, (float16_t)-0.240003022f,
+   (float16_t)-0.970402839f, (float16_t)-0.241491885f,
+   (float16_t)-0.970031253f, (float16_t)-0.242980180f,
+   (float16_t)-0.969657385f, (float16_t)-0.244467903f,
+   (float16_t)-0.969281235f, (float16_t)-0.245955050f,
+   (float16_t)-0.968902805f, (float16_t)-0.247441619f,
+   (float16_t)-0.968522094f, (float16_t)-0.248927606f,
+   (float16_t)-0.968139105f, (float16_t)-0.250413007f,
+   (float16_t)-0.967753837f, (float16_t)-0.251897818f,
+   (float16_t)-0.967366292f, (float16_t)-0.253382037f,
+   (float16_t)-0.966976471f, (float16_t)-0.254865660f,
+   (float16_t)-0.966584374f, (float16_t)-0.256348682f,
+   (float16_t)-0.966190003f, (float16_t)-0.257831102f,
+   (float16_t)-0.965793359f, (float16_t)-0.259312915f,
+   (float16_t)-0.965394442f, (float16_t)-0.260794118f,
+   (float16_t)-0.964993253f, (float16_t)-0.262274707f,
+   (float16_t)-0.964589793f, (float16_t)-0.263754679f,
+   (float16_t)-0.964184064f, (float16_t)-0.265234030f,
+   (float16_t)-0.963776066f, (float16_t)-0.266712757f,
+   (float16_t)-0.963365800f, (float16_t)-0.268190857f,
+   (float16_t)-0.962953267f, (float16_t)-0.269668326f,
+   (float16_t)-0.962538468f, (float16_t)-0.271145160f,
+   (float16_t)-0.962121404f, (float16_t)-0.272621355f,
+   (float16_t)-0.961702077f, (float16_t)-0.274096910f,
+   (float16_t)-0.961280486f, (float16_t)-0.275571819f,
+   (float16_t)-0.960856633f, (float16_t)-0.277046080f,
+   (float16_t)-0.960430519f, (float16_t)-0.278519689f,
+   (float16_t)-0.960002146f, (float16_t)-0.279992643f,
+   (float16_t)-0.959571513f, (float16_t)-0.281464938f,
+   (float16_t)-0.959138622f, (float16_t)-0.282936570f,
+   (float16_t)-0.958703475f, (float16_t)-0.284407537f,
+   (float16_t)-0.958266071f, (float16_t)-0.285877835f,
+   (float16_t)-0.957826413f, (float16_t)-0.287347460f,
+   (float16_t)-0.957384501f, (float16_t)-0.288816408f,
+   (float16_t)-0.956940336f, (float16_t)-0.290284677f,
+   (float16_t)-0.956493919f, (float16_t)-0.291752263f,
+   (float16_t)-0.956045251f, (float16_t)-0.293219163f,
+   (float16_t)-0.955594334f, (float16_t)-0.294685372f,
+   (float16_t)-0.955141168f, (float16_t)-0.296150888f,
+   (float16_t)-0.954685755f, (float16_t)-0.297615707f,
+   (float16_t)-0.954228095f, (float16_t)-0.299079826f,
+   (float16_t)-0.953768190f, (float16_t)-0.300543241f,
+   (float16_t)-0.953306040f, (float16_t)-0.302005949f,
+   (float16_t)-0.952841648f, (float16_t)-0.303467947f,
+   (float16_t)-0.952375013f, (float16_t)-0.304929230f,
+   (float16_t)-0.951906137f, (float16_t)-0.306389795f,
+   (float16_t)-0.951435021f, (float16_t)-0.307849640f,
+   (float16_t)-0.950961666f, (float16_t)-0.309308760f,
+   (float16_t)-0.950486074f, (float16_t)-0.310767153f,
+   (float16_t)-0.950008245f, (float16_t)-0.312224814f,
+   (float16_t)-0.949528181f, (float16_t)-0.313681740f,
+   (float16_t)-0.949045882f, (float16_t)-0.315137929f,
+   (float16_t)-0.948561350f, (float16_t)-0.316593376f,
+   (float16_t)-0.948074586f, (float16_t)-0.318048077f,
+   (float16_t)-0.947585591f, (float16_t)-0.319502031f,
+   (float16_t)-0.947094366f, (float16_t)-0.320955232f,
+   (float16_t)-0.946600913f, (float16_t)-0.322407679f,
+   (float16_t)-0.946105232f, (float16_t)-0.323859367f,
+   (float16_t)-0.945607325f, (float16_t)-0.325310292f,
+   (float16_t)-0.945107193f, (float16_t)-0.326760452f,
+   (float16_t)-0.944604837f, (float16_t)-0.328209844f,
+   (float16_t)-0.944100258f, (float16_t)-0.329658463f,
+   (float16_t)-0.943593458f, (float16_t)-0.331106306f,
+   (float16_t)-0.943084437f, (float16_t)-0.332553370f,
+   (float16_t)-0.942573198f, (float16_t)-0.333999651f,
+   (float16_t)-0.942059740f, (float16_t)-0.335445147f,
+   (float16_t)-0.941544065f, (float16_t)-0.336889853f,
+   (float16_t)-0.941026175f, (float16_t)-0.338333767f,
+   (float16_t)-0.940506071f, (float16_t)-0.339776884f,
+   (float16_t)-0.939983753f, (float16_t)-0.341219202f,
+   (float16_t)-0.939459224f, (float16_t)-0.342660717f,
+   (float16_t)-0.938932484f, (float16_t)-0.344101426f,
+   (float16_t)-0.938403534f, (float16_t)-0.345541325f,
+   (float16_t)-0.937872376f, (float16_t)-0.346980411f,
+   (float16_t)-0.937339012f, (float16_t)-0.348418680f,
+   (float16_t)-0.936803442f, (float16_t)-0.349856130f,
+   (float16_t)-0.936265667f, (float16_t)-0.351292756f,
+   (float16_t)-0.935725689f, (float16_t)-0.352728556f,
+   (float16_t)-0.935183510f, (float16_t)-0.354163525f,
+   (float16_t)-0.934639130f, (float16_t)-0.355597662f,
+   (float16_t)-0.934092550f, (float16_t)-0.357030961f,
+   (float16_t)-0.933543773f, (float16_t)-0.358463421f,
+   (float16_t)-0.932992799f, (float16_t)-0.359895037f,
+   (float16_t)-0.932439629f, (float16_t)-0.361325806f,
+   (float16_t)-0.931884266f, (float16_t)-0.362755724f,
+   (float16_t)-0.931326709f, (float16_t)-0.364184790f,
+   (float16_t)-0.930766961f, (float16_t)-0.365612998f,
+   (float16_t)-0.930205023f, (float16_t)-0.367040346f,
+   (float16_t)-0.929640896f, (float16_t)-0.368466830f,
+   (float16_t)-0.929074581f, (float16_t)-0.369892447f,
+   (float16_t)-0.928506080f, (float16_t)-0.371317194f,
+   (float16_t)-0.927935395f, (float16_t)-0.372741067f,
+   (float16_t)-0.927362526f, (float16_t)-0.374164063f,
+   (float16_t)-0.926787474f, (float16_t)-0.375586178f,
+   (float16_t)-0.926210242f, (float16_t)-0.377007410f,
+   (float16_t)-0.925630831f, (float16_t)-0.378427755f,
+   (float16_t)-0.925049241f, (float16_t)-0.379847209f,
+   (float16_t)-0.924465474f, (float16_t)-0.381265769f,
+   (float16_t)-0.923879533f, (float16_t)-0.382683432f,
+   (float16_t)-0.923291417f, (float16_t)-0.384100195f,
+   (float16_t)-0.922701128f, (float16_t)-0.385516054f,
+   (float16_t)-0.922108669f, (float16_t)-0.386931006f,
+   (float16_t)-0.921514039f, (float16_t)-0.388345047f,
+   (float16_t)-0.920917242f, (float16_t)-0.389758174f,
+   (float16_t)-0.920318277f, (float16_t)-0.391170384f,
+   (float16_t)-0.919717146f, (float16_t)-0.392581674f,
+   (float16_t)-0.919113852f, (float16_t)-0.393992040f,
+   (float16_t)-0.918508394f, (float16_t)-0.395401479f,
+   (float16_t)-0.917900776f, (float16_t)-0.396809987f,
+   (float16_t)-0.917290997f, (float16_t)-0.398217562f,
+   (float16_t)-0.916679060f, (float16_t)-0.399624200f,
+   (float16_t)-0.916064966f, (float16_t)-0.401029897f,
+   (float16_t)-0.915448716f, (float16_t)-0.402434651f,
+   (float16_t)-0.914830312f, (float16_t)-0.403838458f,
+   (float16_t)-0.914209756f, (float16_t)-0.405241314f,
+   (float16_t)-0.913587048f, (float16_t)-0.406643217f,
+   (float16_t)-0.912962190f, (float16_t)-0.408044163f,
+   (float16_t)-0.912335185f, (float16_t)-0.409444149f,
+   (float16_t)-0.911706032f, (float16_t)-0.410843171f,
+   (float16_t)-0.911074734f, (float16_t)-0.412241227f,
+   (float16_t)-0.910441292f, (float16_t)-0.413638312f,
+   (float16_t)-0.909805708f, (float16_t)-0.415034424f,
+   (float16_t)-0.909167983f, (float16_t)-0.416429560f,
+   (float16_t)-0.908528119f, (float16_t)-0.417823716f,
+   (float16_t)-0.907886116f, (float16_t)-0.419216888f,
+   (float16_t)-0.907241978f, (float16_t)-0.420609074f,
+   (float16_t)-0.906595705f, (float16_t)-0.422000271f,
+   (float16_t)-0.905947298f, (float16_t)-0.423390474f,
+   (float16_t)-0.905296759f, (float16_t)-0.424779681f,
+   (float16_t)-0.904644091f, (float16_t)-0.426167889f,
+   (float16_t)-0.903989293f, (float16_t)-0.427555093f,
+   (float16_t)-0.903332368f, (float16_t)-0.428941292f,
+   (float16_t)-0.902673318f, (float16_t)-0.430326481f,
+   (float16_t)-0.902012144f, (float16_t)-0.431710658f,
+   (float16_t)-0.901348847f, (float16_t)-0.433093819f,
+   (float16_t)-0.900683429f, (float16_t)-0.434475961f,
+   (float16_t)-0.900015892f, (float16_t)-0.435857080f,
+   (float16_t)-0.899346237f, (float16_t)-0.437237174f,
+   (float16_t)-0.898674466f, (float16_t)-0.438616239f,
+   (float16_t)-0.898000580f, (float16_t)-0.439994271f,
+   (float16_t)-0.897324581f, (float16_t)-0.441371269f,
+   (float16_t)-0.896646470f, (float16_t)-0.442747228f,
+   (float16_t)-0.895966250f, (float16_t)-0.444122145f,
+   (float16_t)-0.895283921f, (float16_t)-0.445496017f,
+   (float16_t)-0.894599486f, (float16_t)-0.446868840f,
+   (float16_t)-0.893912945f, (float16_t)-0.448240612f,
+   (float16_t)-0.893224301f, (float16_t)-0.449611330f,
+   (float16_t)-0.892533555f, (float16_t)-0.450980989f,
+   (float16_t)-0.891840709f, (float16_t)-0.452349587f,
+   (float16_t)-0.891145765f, (float16_t)-0.453717121f,
+   (float16_t)-0.890448723f, (float16_t)-0.455083587f,
+   (float16_t)-0.889749586f, (float16_t)-0.456448982f,
+   (float16_t)-0.889048356f, (float16_t)-0.457813304f,
+   (float16_t)-0.888345033f, (float16_t)-0.459176548f,
+   (float16_t)-0.887639620f, (float16_t)-0.460538711f,
+   (float16_t)-0.886932119f, (float16_t)-0.461899791f,
+   (float16_t)-0.886222530f, (float16_t)-0.463259784f,
+   (float16_t)-0.885510856f, (float16_t)-0.464618686f,
+   (float16_t)-0.884797098f, (float16_t)-0.465976496f,
+   (float16_t)-0.884081259f, (float16_t)-0.467333209f,
+   (float16_t)-0.883363339f, (float16_t)-0.468688822f,
+   (float16_t)-0.882643340f, (float16_t)-0.470043332f,
+   (float16_t)-0.881921264f, (float16_t)-0.471396737f,
+   (float16_t)-0.881197113f, (float16_t)-0.472749032f,
+   (float16_t)-0.880470889f, (float16_t)-0.474100215f,
+   (float16_t)-0.879742593f, (float16_t)-0.475450282f,
+   (float16_t)-0.879012226f, (float16_t)-0.476799230f,
+   (float16_t)-0.878279792f, (float16_t)-0.478147056f,
+   (float16_t)-0.877545290f, (float16_t)-0.479493758f,
+   (float16_t)-0.876808724f, (float16_t)-0.480839331f,
+   (float16_t)-0.876070094f, (float16_t)-0.482183772f,
+   (float16_t)-0.875329403f, (float16_t)-0.483527079f,
+   (float16_t)-0.874586652f, (float16_t)-0.484869248f,
+   (float16_t)-0.873841843f, (float16_t)-0.486210276f,
+   (float16_t)-0.873094978f, (float16_t)-0.487550160f,
+   (float16_t)-0.872346059f, (float16_t)-0.488888897f,
+   (float16_t)-0.871595087f, (float16_t)-0.490226483f,
+   (float16_t)-0.870842063f, (float16_t)-0.491562916f,
+   (float16_t)-0.870086991f, (float16_t)-0.492898192f,
+   (float16_t)-0.869329871f, (float16_t)-0.494232309f,
+   (float16_t)-0.868570706f, (float16_t)-0.495565262f,
+   (float16_t)-0.867809497f, (float16_t)-0.496897049f,
+   (float16_t)-0.867046246f, (float16_t)-0.498227667f,
+   (float16_t)-0.866280954f, (float16_t)-0.499557113f,
+   (float16_t)-0.865513624f, (float16_t)-0.500885383f,
+   (float16_t)-0.864744258f, (float16_t)-0.502212474f,
+   (float16_t)-0.863972856f, (float16_t)-0.503538384f,
+   (float16_t)-0.863199422f, (float16_t)-0.504863109f,
+   (float16_t)-0.862423956f, (float16_t)-0.506186645f,
+   (float16_t)-0.861646461f, (float16_t)-0.507508991f,
+   (float16_t)-0.860866939f, (float16_t)-0.508830143f,
+   (float16_t)-0.860085390f, (float16_t)-0.510150097f,
+   (float16_t)-0.859301818f, (float16_t)-0.511468850f,
+   (float16_t)-0.858516224f, (float16_t)-0.512786401f,
+   (float16_t)-0.857728610f, (float16_t)-0.514102744f,
+   (float16_t)-0.856938977f, (float16_t)-0.515417878f,
+   (float16_t)-0.856147328f, (float16_t)-0.516731799f,
+   (float16_t)-0.855353665f, (float16_t)-0.518044504f,
+   (float16_t)-0.854557988f, (float16_t)-0.519355990f,
+   (float16_t)-0.853760301f, (float16_t)-0.520666254f,
+   (float16_t)-0.852960605f, (float16_t)-0.521975293f,
+   (float16_t)-0.852158902f, (float16_t)-0.523283103f,
+   (float16_t)-0.851355193f, (float16_t)-0.524589683f,
+   (float16_t)-0.850549481f, (float16_t)-0.525895027f,
+   (float16_t)-0.849741768f, (float16_t)-0.527199135f,
+   (float16_t)-0.848932055f, (float16_t)-0.528502002f,
+   (float16_t)-0.848120345f, (float16_t)-0.529803625f,
+   (float16_t)-0.847306639f, (float16_t)-0.531104001f,
+   (float16_t)-0.846490939f, (float16_t)-0.532403128f,
+   (float16_t)-0.845673247f, (float16_t)-0.533701002f,
+   (float16_t)-0.844853565f, (float16_t)-0.534997620f,
+   (float16_t)-0.844031895f, (float16_t)-0.536292979f,
+   (float16_t)-0.843208240f, (float16_t)-0.537587076f,
+   (float16_t)-0.842382600f, (float16_t)-0.538879909f,
+   (float16_t)-0.841554977f, (float16_t)-0.540171473f,
+   (float16_t)-0.840725375f, (float16_t)-0.541461766f,
+   (float16_t)-0.839893794f, (float16_t)-0.542750785f,
+   (float16_t)-0.839060237f, (float16_t)-0.544038527f,
+   (float16_t)-0.838224706f, (float16_t)-0.545324988f,
+   (float16_t)-0.837387202f, (float16_t)-0.546610167f,
+   (float16_t)-0.836547727f, (float16_t)-0.547894059f,
+   (float16_t)-0.835706284f, (float16_t)-0.549176662f,
+   (float16_t)-0.834862875f, (float16_t)-0.550457973f,
+   (float16_t)-0.834017501f, (float16_t)-0.551737988f,
+   (float16_t)-0.833170165f, (float16_t)-0.553016706f,
+   (float16_t)-0.832320868f, (float16_t)-0.554294121f,
+   (float16_t)-0.831469612f, (float16_t)-0.555570233f,
+   (float16_t)-0.830616400f, (float16_t)-0.556845037f,
+   (float16_t)-0.829761234f, (float16_t)-0.558118531f,
+   (float16_t)-0.828904115f, (float16_t)-0.559390712f,
+   (float16_t)-0.828045045f, (float16_t)-0.560661576f,
+   (float16_t)-0.827184027f, (float16_t)-0.561931121f,
+   (float16_t)-0.826321063f, (float16_t)-0.563199344f,
+   (float16_t)-0.825456154f, (float16_t)-0.564466242f,
+   (float16_t)-0.824589303f, (float16_t)-0.565731811f,
+   (float16_t)-0.823720511f, (float16_t)-0.566996049f,
+   (float16_t)-0.822849781f, (float16_t)-0.568258953f,
+   (float16_t)-0.821977115f, (float16_t)-0.569520519f,
+   (float16_t)-0.821102515f, (float16_t)-0.570780746f,
+   (float16_t)-0.820225983f, (float16_t)-0.572039629f,
+   (float16_t)-0.819347520f, (float16_t)-0.573297167f,
+   (float16_t)-0.818467130f, (float16_t)-0.574553355f,
+   (float16_t)-0.817584813f, (float16_t)-0.575808191f,
+   (float16_t)-0.816700573f, (float16_t)-0.577061673f,
+   (float16_t)-0.815814411f, (float16_t)-0.578313796f,
+   (float16_t)-0.814926329f, (float16_t)-0.579564559f,
+   (float16_t)-0.814036330f, (float16_t)-0.580813958f,
+   (float16_t)-0.813144415f, (float16_t)-0.582061990f,
+   (float16_t)-0.812250587f, (float16_t)-0.583308653f,
+   (float16_t)-0.811354847f, (float16_t)-0.584553943f,
+   (float16_t)-0.810457198f, (float16_t)-0.585797857f,
+   (float16_t)-0.809557642f, (float16_t)-0.587040394f,
+   (float16_t)-0.808656182f, (float16_t)-0.588281548f,
+   (float16_t)-0.807752818f, (float16_t)-0.589521319f,
+   (float16_t)-0.806847554f, (float16_t)-0.590759702f,
+   (float16_t)-0.805940391f, (float16_t)-0.591996695f,
+   (float16_t)-0.805031331f, (float16_t)-0.593232295f,
+   (float16_t)-0.804120377f, (float16_t)-0.594466499f,
+   (float16_t)-0.803207531f, (float16_t)-0.595699304f,
+   (float16_t)-0.802292796f, (float16_t)-0.596930708f,
+   (float16_t)-0.801376172f, (float16_t)-0.598160707f,
+   (float16_t)-0.800457662f, (float16_t)-0.599389298f,
+   (float16_t)-0.799537269f, (float16_t)-0.600616479f,
+   (float16_t)-0.798614995f, (float16_t)-0.601842247f,
+   (float16_t)-0.797690841f, (float16_t)-0.603066599f,
+   (float16_t)-0.796764810f, (float16_t)-0.604289531f,
+   (float16_t)-0.795836905f, (float16_t)-0.605511041f,
+   (float16_t)-0.794907126f, (float16_t)-0.606731127f,
+   (float16_t)-0.793975478f, (float16_t)-0.607949785f,
+   (float16_t)-0.793041960f, (float16_t)-0.609167012f,
+   (float16_t)-0.792106577f, (float16_t)-0.610382806f,
+   (float16_t)-0.791169330f, (float16_t)-0.611597164f,
+   (float16_t)-0.790230221f, (float16_t)-0.612810082f,
+   (float16_t)-0.789289253f, (float16_t)-0.614021559f,
+   (float16_t)-0.788346428f, (float16_t)-0.615231591f,
+   (float16_t)-0.787401747f, (float16_t)-0.616440175f,
+   (float16_t)-0.786455214f, (float16_t)-0.617647308f,
+   (float16_t)-0.785506830f, (float16_t)-0.618852988f,
+   (float16_t)-0.784556597f, (float16_t)-0.620057212f,
+   (float16_t)-0.783604519f, (float16_t)-0.621259977f,
+   (float16_t)-0.782650596f, (float16_t)-0.622461279f,
+   (float16_t)-0.781694832f, (float16_t)-0.623661118f,
+   (float16_t)-0.780737229f, (float16_t)-0.624859488f,
+   (float16_t)-0.779777788f, (float16_t)-0.626056388f,
+   (float16_t)-0.778816512f, (float16_t)-0.627251815f,
+   (float16_t)-0.777853404f, (float16_t)-0.628445767f,
+   (float16_t)-0.776888466f, (float16_t)-0.629638239f,
+   (float16_t)-0.775921699f, (float16_t)-0.630829230f,
+   (float16_t)-0.774953107f, (float16_t)-0.632018736f,
+   (float16_t)-0.773982691f, (float16_t)-0.633206755f,
+   (float16_t)-0.773010453f, (float16_t)-0.634393284f,
+   (float16_t)-0.772036397f, (float16_t)-0.635578320f,
+   (float16_t)-0.771060524f, (float16_t)-0.636761861f,
+   (float16_t)-0.770082837f, (float16_t)-0.637943904f,
+   (float16_t)-0.769103338f, (float16_t)-0.639124445f,
+   (float16_t)-0.768122029f, (float16_t)-0.640303482f,
+   (float16_t)-0.767138912f, (float16_t)-0.641481013f,
+   (float16_t)-0.766153990f, (float16_t)-0.642657034f,
+   (float16_t)-0.765167266f, (float16_t)-0.643831543f,
+   (float16_t)-0.764178741f, (float16_t)-0.645004537f,
+   (float16_t)-0.763188417f, (float16_t)-0.646176013f,
+   (float16_t)-0.762196298f, (float16_t)-0.647345969f,
+   (float16_t)-0.761202385f, (float16_t)-0.648514401f,
+   (float16_t)-0.760206682f, (float16_t)-0.649681307f,
+   (float16_t)-0.759209189f, (float16_t)-0.650846685f,
+   (float16_t)-0.758209910f, (float16_t)-0.652010531f,
+   (float16_t)-0.757208847f, (float16_t)-0.653172843f,
+   (float16_t)-0.756206001f, (float16_t)-0.654333618f,
+   (float16_t)-0.755201377f, (float16_t)-0.655492853f,
+   (float16_t)-0.754194975f, (float16_t)-0.656650546f,
+   (float16_t)-0.753186799f, (float16_t)-0.657806693f,
+   (float16_t)-0.752176850f, (float16_t)-0.658961293f,
+   (float16_t)-0.751165132f, (float16_t)-0.660114342f,
+   (float16_t)-0.750151646f, (float16_t)-0.661265838f,
+   (float16_t)-0.749136395f, (float16_t)-0.662415778f,
+   (float16_t)-0.748119380f, (float16_t)-0.663564159f,
+   (float16_t)-0.747100606f, (float16_t)-0.664710978f,
+   (float16_t)-0.746080074f, (float16_t)-0.665856234f,
+   (float16_t)-0.745057785f, (float16_t)-0.666999922f,
+   (float16_t)-0.744033744f, (float16_t)-0.668142041f,
+   (float16_t)-0.743007952f, (float16_t)-0.669282588f,
+   (float16_t)-0.741980412f, (float16_t)-0.670421560f,
+   (float16_t)-0.740951125f, (float16_t)-0.671558955f,
+   (float16_t)-0.739920095f, (float16_t)-0.672694769f,
+   (float16_t)-0.738887324f, (float16_t)-0.673829000f,
+   (float16_t)-0.737852815f, (float16_t)-0.674961646f,
+   (float16_t)-0.736816569f, (float16_t)-0.676092704f,
+   (float16_t)-0.735778589f, (float16_t)-0.677222170f,
+   (float16_t)-0.734738878f, (float16_t)-0.678350043f,
+   (float16_t)-0.733697438f, (float16_t)-0.679476320f,
+   (float16_t)-0.732654272f, (float16_t)-0.680600998f,
+   (float16_t)-0.731609381f, (float16_t)-0.681724074f,
+   (float16_t)-0.730562769f, (float16_t)-0.682845546f,
+   (float16_t)-0.729514438f, (float16_t)-0.683965412f,
+   (float16_t)-0.728464390f, (float16_t)-0.685083668f,
+   (float16_t)-0.727412629f, (float16_t)-0.686200312f,
+   (float16_t)-0.726359155f, (float16_t)-0.687315341f,
+   (float16_t)-0.725303972f, (float16_t)-0.688428753f,
+   (float16_t)-0.724247083f, (float16_t)-0.689540545f,
+   (float16_t)-0.723188489f, (float16_t)-0.690650714f,
+   (float16_t)-0.722128194f, (float16_t)-0.691759258f,
+   (float16_t)-0.721066199f, (float16_t)-0.692866175f,
+   (float16_t)-0.720002508f, (float16_t)-0.693971461f,
+   (float16_t)-0.718937122f, (float16_t)-0.695075114f,
+   (float16_t)-0.717870045f, (float16_t)-0.696177131f,
+   (float16_t)-0.716801279f, (float16_t)-0.697277511f,
+   (float16_t)-0.715730825f, (float16_t)-0.698376249f,
+   (float16_t)-0.714658688f, (float16_t)-0.699473345f,
+   (float16_t)-0.713584869f, (float16_t)-0.700568794f,
+   (float16_t)-0.712509371f, (float16_t)-0.701662595f,
+   (float16_t)-0.711432196f, (float16_t)-0.702754744f,
+   (float16_t)-0.710353347f, (float16_t)-0.703845241f,
+   (float16_t)-0.709272826f, (float16_t)-0.704934080f,
+   (float16_t)-0.708190637f, (float16_t)-0.706021261f,
+   (float16_t)-0.707106781f, (float16_t)-0.707106781f,
+   (float16_t)-0.706021261f, (float16_t)-0.708190637f,
+   (float16_t)-0.704934080f, (float16_t)-0.709272826f,
+   (float16_t)-0.703845241f, (float16_t)-0.710353347f,
+   (float16_t)-0.702754744f, (float16_t)-0.711432196f,
+   (float16_t)-0.701662595f, (float16_t)-0.712509371f,
+   (float16_t)-0.700568794f, (float16_t)-0.713584869f,
+   (float16_t)-0.699473345f, (float16_t)-0.714658688f,
+   (float16_t)-0.698376249f, (float16_t)-0.715730825f,
+   (float16_t)-0.697277511f, (float16_t)-0.716801279f,
+   (float16_t)-0.696177131f, (float16_t)-0.717870045f,
+   (float16_t)-0.695075114f, (float16_t)-0.718937122f,
+   (float16_t)-0.693971461f, (float16_t)-0.720002508f,
+   (float16_t)-0.692866175f, (float16_t)-0.721066199f,
+   (float16_t)-0.691759258f, (float16_t)-0.722128194f,
+   (float16_t)-0.690650714f, (float16_t)-0.723188489f,
+   (float16_t)-0.689540545f, (float16_t)-0.724247083f,
+   (float16_t)-0.688428753f, (float16_t)-0.725303972f,
+   (float16_t)-0.687315341f, (float16_t)-0.726359155f,
+   (float16_t)-0.686200312f, (float16_t)-0.727412629f,
+   (float16_t)-0.685083668f, (float16_t)-0.728464390f,
+   (float16_t)-0.683965412f, (float16_t)-0.729514438f,
+   (float16_t)-0.682845546f, (float16_t)-0.730562769f,
+   (float16_t)-0.681724074f, (float16_t)-0.731609381f,
+   (float16_t)-0.680600998f, (float16_t)-0.732654272f,
+   (float16_t)-0.679476320f, (float16_t)-0.733697438f,
+   (float16_t)-0.678350043f, (float16_t)-0.734738878f,
+   (float16_t)-0.677222170f, (float16_t)-0.735778589f,
+   (float16_t)-0.676092704f, (float16_t)-0.736816569f,
+   (float16_t)-0.674961646f, (float16_t)-0.737852815f,
+   (float16_t)-0.673829000f, (float16_t)-0.738887324f,
+   (float16_t)-0.672694769f, (float16_t)-0.739920095f,
+   (float16_t)-0.671558955f, (float16_t)-0.740951125f,
+   (float16_t)-0.670421560f, (float16_t)-0.741980412f,
+   (float16_t)-0.669282588f, (float16_t)-0.743007952f,
+   (float16_t)-0.668142041f, (float16_t)-0.744033744f,
+   (float16_t)-0.666999922f, (float16_t)-0.745057785f,
+   (float16_t)-0.665856234f, (float16_t)-0.746080074f,
+   (float16_t)-0.664710978f, (float16_t)-0.747100606f,
+   (float16_t)-0.663564159f, (float16_t)-0.748119380f,
+   (float16_t)-0.662415778f, (float16_t)-0.749136395f,
+   (float16_t)-0.661265838f, (float16_t)-0.750151646f,
+   (float16_t)-0.660114342f, (float16_t)-0.751165132f,
+   (float16_t)-0.658961293f, (float16_t)-0.752176850f,
+   (float16_t)-0.657806693f, (float16_t)-0.753186799f,
+   (float16_t)-0.656650546f, (float16_t)-0.754194975f,
+   (float16_t)-0.655492853f, (float16_t)-0.755201377f,
+   (float16_t)-0.654333618f, (float16_t)-0.756206001f,
+   (float16_t)-0.653172843f, (float16_t)-0.757208847f,
+   (float16_t)-0.652010531f, (float16_t)-0.758209910f,
+   (float16_t)-0.650846685f, (float16_t)-0.759209189f,
+   (float16_t)-0.649681307f, (float16_t)-0.760206682f,
+   (float16_t)-0.648514401f, (float16_t)-0.761202385f,
+   (float16_t)-0.647345969f, (float16_t)-0.762196298f,
+   (float16_t)-0.646176013f, (float16_t)-0.763188417f,
+   (float16_t)-0.645004537f, (float16_t)-0.764178741f,
+   (float16_t)-0.643831543f, (float16_t)-0.765167266f,
+   (float16_t)-0.642657034f, (float16_t)-0.766153990f,
+   (float16_t)-0.641481013f, (float16_t)-0.767138912f,
+   (float16_t)-0.640303482f, (float16_t)-0.768122029f,
+   (float16_t)-0.639124445f, (float16_t)-0.769103338f,
+   (float16_t)-0.637943904f, (float16_t)-0.770082837f,
+   (float16_t)-0.636761861f, (float16_t)-0.771060524f,
+   (float16_t)-0.635578320f, (float16_t)-0.772036397f,
+   (float16_t)-0.634393284f, (float16_t)-0.773010453f,
+   (float16_t)-0.633206755f, (float16_t)-0.773982691f,
+   (float16_t)-0.632018736f, (float16_t)-0.774953107f,
+   (float16_t)-0.630829230f, (float16_t)-0.775921699f,
+   (float16_t)-0.629638239f, (float16_t)-0.776888466f,
+   (float16_t)-0.628445767f, (float16_t)-0.777853404f,
+   (float16_t)-0.627251815f, (float16_t)-0.778816512f,
+   (float16_t)-0.626056388f, (float16_t)-0.779777788f,
+   (float16_t)-0.624859488f, (float16_t)-0.780737229f,
+   (float16_t)-0.623661118f, (float16_t)-0.781694832f,
+   (float16_t)-0.622461279f, (float16_t)-0.782650596f,
+   (float16_t)-0.621259977f, (float16_t)-0.783604519f,
+   (float16_t)-0.620057212f, (float16_t)-0.784556597f,
+   (float16_t)-0.618852988f, (float16_t)-0.785506830f,
+   (float16_t)-0.617647308f, (float16_t)-0.786455214f,
+   (float16_t)-0.616440175f, (float16_t)-0.787401747f,
+   (float16_t)-0.615231591f, (float16_t)-0.788346428f,
+   (float16_t)-0.614021559f, (float16_t)-0.789289253f,
+   (float16_t)-0.612810082f, (float16_t)-0.790230221f,
+   (float16_t)-0.611597164f, (float16_t)-0.791169330f,
+   (float16_t)-0.610382806f, (float16_t)-0.792106577f,
+   (float16_t)-0.609167012f, (float16_t)-0.793041960f,
+   (float16_t)-0.607949785f, (float16_t)-0.793975478f,
+   (float16_t)-0.606731127f, (float16_t)-0.794907126f,
+   (float16_t)-0.605511041f, (float16_t)-0.795836905f,
+   (float16_t)-0.604289531f, (float16_t)-0.796764810f,
+   (float16_t)-0.603066599f, (float16_t)-0.797690841f,
+   (float16_t)-0.601842247f, (float16_t)-0.798614995f,
+   (float16_t)-0.600616479f, (float16_t)-0.799537269f,
+   (float16_t)-0.599389298f, (float16_t)-0.800457662f,
+   (float16_t)-0.598160707f, (float16_t)-0.801376172f,
+   (float16_t)-0.596930708f, (float16_t)-0.802292796f,
+   (float16_t)-0.595699304f, (float16_t)-0.803207531f,
+   (float16_t)-0.594466499f, (float16_t)-0.804120377f,
+   (float16_t)-0.593232295f, (float16_t)-0.805031331f,
+   (float16_t)-0.591996695f, (float16_t)-0.805940391f,
+   (float16_t)-0.590759702f, (float16_t)-0.806847554f,
+   (float16_t)-0.589521319f, (float16_t)-0.807752818f,
+   (float16_t)-0.588281548f, (float16_t)-0.808656182f,
+   (float16_t)-0.587040394f, (float16_t)-0.809557642f,
+   (float16_t)-0.585797857f, (float16_t)-0.810457198f,
+   (float16_t)-0.584553943f, (float16_t)-0.811354847f,
+   (float16_t)-0.583308653f, (float16_t)-0.812250587f,
+   (float16_t)-0.582061990f, (float16_t)-0.813144415f,
+   (float16_t)-0.580813958f, (float16_t)-0.814036330f,
+   (float16_t)-0.579564559f, (float16_t)-0.814926329f,
+   (float16_t)-0.578313796f, (float16_t)-0.815814411f,
+   (float16_t)-0.577061673f, (float16_t)-0.816700573f,
+   (float16_t)-0.575808191f, (float16_t)-0.817584813f,
+   (float16_t)-0.574553355f, (float16_t)-0.818467130f,
+   (float16_t)-0.573297167f, (float16_t)-0.819347520f,
+   (float16_t)-0.572039629f, (float16_t)-0.820225983f,
+   (float16_t)-0.570780746f, (float16_t)-0.821102515f,
+   (float16_t)-0.569520519f, (float16_t)-0.821977115f,
+   (float16_t)-0.568258953f, (float16_t)-0.822849781f,
+   (float16_t)-0.566996049f, (float16_t)-0.823720511f,
+   (float16_t)-0.565731811f, (float16_t)-0.824589303f,
+   (float16_t)-0.564466242f, (float16_t)-0.825456154f,
+   (float16_t)-0.563199344f, (float16_t)-0.826321063f,
+   (float16_t)-0.561931121f, (float16_t)-0.827184027f,
+   (float16_t)-0.560661576f, (float16_t)-0.828045045f,
+   (float16_t)-0.559390712f, (float16_t)-0.828904115f,
+   (float16_t)-0.558118531f, (float16_t)-0.829761234f,
+   (float16_t)-0.556845037f, (float16_t)-0.830616400f,
+   (float16_t)-0.555570233f, (float16_t)-0.831469612f,
+   (float16_t)-0.554294121f, (float16_t)-0.832320868f,
+   (float16_t)-0.553016706f, (float16_t)-0.833170165f,
+   (float16_t)-0.551737988f, (float16_t)-0.834017501f,
+   (float16_t)-0.550457973f, (float16_t)-0.834862875f,
+   (float16_t)-0.549176662f, (float16_t)-0.835706284f,
+   (float16_t)-0.547894059f, (float16_t)-0.836547727f,
+   (float16_t)-0.546610167f, (float16_t)-0.837387202f,
+   (float16_t)-0.545324988f, (float16_t)-0.838224706f,
+   (float16_t)-0.544038527f, (float16_t)-0.839060237f,
+   (float16_t)-0.542750785f, (float16_t)-0.839893794f,
+   (float16_t)-0.541461766f, (float16_t)-0.840725375f,
+   (float16_t)-0.540171473f, (float16_t)-0.841554977f,
+   (float16_t)-0.538879909f, (float16_t)-0.842382600f,
+   (float16_t)-0.537587076f, (float16_t)-0.843208240f,
+   (float16_t)-0.536292979f, (float16_t)-0.844031895f,
+   (float16_t)-0.534997620f, (float16_t)-0.844853565f,
+   (float16_t)-0.533701002f, (float16_t)-0.845673247f,
+   (float16_t)-0.532403128f, (float16_t)-0.846490939f,
+   (float16_t)-0.531104001f, (float16_t)-0.847306639f,
+   (float16_t)-0.529803625f, (float16_t)-0.848120345f,
+   (float16_t)-0.528502002f, (float16_t)-0.848932055f,
+   (float16_t)-0.527199135f, (float16_t)-0.849741768f,
+   (float16_t)-0.525895027f, (float16_t)-0.850549481f,
+   (float16_t)-0.524589683f, (float16_t)-0.851355193f,
+   (float16_t)-0.523283103f, (float16_t)-0.852158902f,
+   (float16_t)-0.521975293f, (float16_t)-0.852960605f,
+   (float16_t)-0.520666254f, (float16_t)-0.853760301f,
+   (float16_t)-0.519355990f, (float16_t)-0.854557988f,
+   (float16_t)-0.518044504f, (float16_t)-0.855353665f,
+   (float16_t)-0.516731799f, (float16_t)-0.856147328f,
+   (float16_t)-0.515417878f, (float16_t)-0.856938977f,
+   (float16_t)-0.514102744f, (float16_t)-0.857728610f,
+   (float16_t)-0.512786401f, (float16_t)-0.858516224f,
+   (float16_t)-0.511468850f, (float16_t)-0.859301818f,
+   (float16_t)-0.510150097f, (float16_t)-0.860085390f,
+   (float16_t)-0.508830143f, (float16_t)-0.860866939f,
+   (float16_t)-0.507508991f, (float16_t)-0.861646461f,
+   (float16_t)-0.506186645f, (float16_t)-0.862423956f,
+   (float16_t)-0.504863109f, (float16_t)-0.863199422f,
+   (float16_t)-0.503538384f, (float16_t)-0.863972856f,
+   (float16_t)-0.502212474f, (float16_t)-0.864744258f,
+   (float16_t)-0.500885383f, (float16_t)-0.865513624f,
+   (float16_t)-0.499557113f, (float16_t)-0.866280954f,
+   (float16_t)-0.498227667f, (float16_t)-0.867046246f,
+   (float16_t)-0.496897049f, (float16_t)-0.867809497f,
+   (float16_t)-0.495565262f, (float16_t)-0.868570706f,
+   (float16_t)-0.494232309f, (float16_t)-0.869329871f,
+   (float16_t)-0.492898192f, (float16_t)-0.870086991f,
+   (float16_t)-0.491562916f, (float16_t)-0.870842063f,
+   (float16_t)-0.490226483f, (float16_t)-0.871595087f,
+   (float16_t)-0.488888897f, (float16_t)-0.872346059f,
+   (float16_t)-0.487550160f, (float16_t)-0.873094978f,
+   (float16_t)-0.486210276f, (float16_t)-0.873841843f,
+   (float16_t)-0.484869248f, (float16_t)-0.874586652f,
+   (float16_t)-0.483527079f, (float16_t)-0.875329403f,
+   (float16_t)-0.482183772f, (float16_t)-0.876070094f,
+   (float16_t)-0.480839331f, (float16_t)-0.876808724f,
+   (float16_t)-0.479493758f, (float16_t)-0.877545290f,
+   (float16_t)-0.478147056f, (float16_t)-0.878279792f,
+   (float16_t)-0.476799230f, (float16_t)-0.879012226f,
+   (float16_t)-0.475450282f, (float16_t)-0.879742593f,
+   (float16_t)-0.474100215f, (float16_t)-0.880470889f,
+   (float16_t)-0.472749032f, (float16_t)-0.881197113f,
+   (float16_t)-0.471396737f, (float16_t)-0.881921264f,
+   (float16_t)-0.470043332f, (float16_t)-0.882643340f,
+   (float16_t)-0.468688822f, (float16_t)-0.883363339f,
+   (float16_t)-0.467333209f, (float16_t)-0.884081259f,
+   (float16_t)-0.465976496f, (float16_t)-0.884797098f,
+   (float16_t)-0.464618686f, (float16_t)-0.885510856f,
+   (float16_t)-0.463259784f, (float16_t)-0.886222530f,
+   (float16_t)-0.461899791f, (float16_t)-0.886932119f,
+   (float16_t)-0.460538711f, (float16_t)-0.887639620f,
+   (float16_t)-0.459176548f, (float16_t)-0.888345033f,
+   (float16_t)-0.457813304f, (float16_t)-0.889048356f,
+   (float16_t)-0.456448982f, (float16_t)-0.889749586f,
+   (float16_t)-0.455083587f, (float16_t)-0.890448723f,
+   (float16_t)-0.453717121f, (float16_t)-0.891145765f,
+   (float16_t)-0.452349587f, (float16_t)-0.891840709f,
+   (float16_t)-0.450980989f, (float16_t)-0.892533555f,
+   (float16_t)-0.449611330f, (float16_t)-0.893224301f,
+   (float16_t)-0.448240612f, (float16_t)-0.893912945f,
+   (float16_t)-0.446868840f, (float16_t)-0.894599486f,
+   (float16_t)-0.445496017f, (float16_t)-0.895283921f,
+   (float16_t)-0.444122145f, (float16_t)-0.895966250f,
+   (float16_t)-0.442747228f, (float16_t)-0.896646470f,
+   (float16_t)-0.441371269f, (float16_t)-0.897324581f,
+   (float16_t)-0.439994271f, (float16_t)-0.898000580f,
+   (float16_t)-0.438616239f, (float16_t)-0.898674466f,
+   (float16_t)-0.437237174f, (float16_t)-0.899346237f,
+   (float16_t)-0.435857080f, (float16_t)-0.900015892f,
+   (float16_t)-0.434475961f, (float16_t)-0.900683429f,
+   (float16_t)-0.433093819f, (float16_t)-0.901348847f,
+   (float16_t)-0.431710658f, (float16_t)-0.902012144f,
+   (float16_t)-0.430326481f, (float16_t)-0.902673318f,
+   (float16_t)-0.428941292f, (float16_t)-0.903332368f,
+   (float16_t)-0.427555093f, (float16_t)-0.903989293f,
+   (float16_t)-0.426167889f, (float16_t)-0.904644091f,
+   (float16_t)-0.424779681f, (float16_t)-0.905296759f,
+   (float16_t)-0.423390474f, (float16_t)-0.905947298f,
+   (float16_t)-0.422000271f, (float16_t)-0.906595705f,
+   (float16_t)-0.420609074f, (float16_t)-0.907241978f,
+   (float16_t)-0.419216888f, (float16_t)-0.907886116f,
+   (float16_t)-0.417823716f, (float16_t)-0.908528119f,
+   (float16_t)-0.416429560f, (float16_t)-0.909167983f,
+   (float16_t)-0.415034424f, (float16_t)-0.909805708f,
+   (float16_t)-0.413638312f, (float16_t)-0.910441292f,
+   (float16_t)-0.412241227f, (float16_t)-0.911074734f,
+   (float16_t)-0.410843171f, (float16_t)-0.911706032f,
+   (float16_t)-0.409444149f, (float16_t)-0.912335185f,
+   (float16_t)-0.408044163f, (float16_t)-0.912962190f,
+   (float16_t)-0.406643217f, (float16_t)-0.913587048f,
+   (float16_t)-0.405241314f, (float16_t)-0.914209756f,
+   (float16_t)-0.403838458f, (float16_t)-0.914830312f,
+   (float16_t)-0.402434651f, (float16_t)-0.915448716f,
+   (float16_t)-0.401029897f, (float16_t)-0.916064966f,
+   (float16_t)-0.399624200f, (float16_t)-0.916679060f,
+   (float16_t)-0.398217562f, (float16_t)-0.917290997f,
+   (float16_t)-0.396809987f, (float16_t)-0.917900776f,
+   (float16_t)-0.395401479f, (float16_t)-0.918508394f,
+   (float16_t)-0.393992040f, (float16_t)-0.919113852f,
+   (float16_t)-0.392581674f, (float16_t)-0.919717146f,
+   (float16_t)-0.391170384f, (float16_t)-0.920318277f,
+   (float16_t)-0.389758174f, (float16_t)-0.920917242f,
+   (float16_t)-0.388345047f, (float16_t)-0.921514039f,
+   (float16_t)-0.386931006f, (float16_t)-0.922108669f,
+   (float16_t)-0.385516054f, (float16_t)-0.922701128f,
+   (float16_t)-0.384100195f, (float16_t)-0.923291417f,
+   (float16_t)-0.382683432f, (float16_t)-0.923879533f,
+   (float16_t)-0.381265769f, (float16_t)-0.924465474f,
+   (float16_t)-0.379847209f, (float16_t)-0.925049241f,
+   (float16_t)-0.378427755f, (float16_t)-0.925630831f,
+   (float16_t)-0.377007410f, (float16_t)-0.926210242f,
+   (float16_t)-0.375586178f, (float16_t)-0.926787474f,
+   (float16_t)-0.374164063f, (float16_t)-0.927362526f,
+   (float16_t)-0.372741067f, (float16_t)-0.927935395f,
+   (float16_t)-0.371317194f, (float16_t)-0.928506080f,
+   (float16_t)-0.369892447f, (float16_t)-0.929074581f,
+   (float16_t)-0.368466830f, (float16_t)-0.929640896f,
+   (float16_t)-0.367040346f, (float16_t)-0.930205023f,
+   (float16_t)-0.365612998f, (float16_t)-0.930766961f,
+   (float16_t)-0.364184790f, (float16_t)-0.931326709f,
+   (float16_t)-0.362755724f, (float16_t)-0.931884266f,
+   (float16_t)-0.361325806f, (float16_t)-0.932439629f,
+   (float16_t)-0.359895037f, (float16_t)-0.932992799f,
+   (float16_t)-0.358463421f, (float16_t)-0.933543773f,
+   (float16_t)-0.357030961f, (float16_t)-0.934092550f,
+   (float16_t)-0.355597662f, (float16_t)-0.934639130f,
+   (float16_t)-0.354163525f, (float16_t)-0.935183510f,
+   (float16_t)-0.352728556f, (float16_t)-0.935725689f,
+   (float16_t)-0.351292756f, (float16_t)-0.936265667f,
+   (float16_t)-0.349856130f, (float16_t)-0.936803442f,
+   (float16_t)-0.348418680f, (float16_t)-0.937339012f,
+   (float16_t)-0.346980411f, (float16_t)-0.937872376f,
+   (float16_t)-0.345541325f, (float16_t)-0.938403534f,
+   (float16_t)-0.344101426f, (float16_t)-0.938932484f,
+   (float16_t)-0.342660717f, (float16_t)-0.939459224f,
+   (float16_t)-0.341219202f, (float16_t)-0.939983753f,
+   (float16_t)-0.339776884f, (float16_t)-0.940506071f,
+   (float16_t)-0.338333767f, (float16_t)-0.941026175f,
+   (float16_t)-0.336889853f, (float16_t)-0.941544065f,
+   (float16_t)-0.335445147f, (float16_t)-0.942059740f,
+   (float16_t)-0.333999651f, (float16_t)-0.942573198f,
+   (float16_t)-0.332553370f, (float16_t)-0.943084437f,
+   (float16_t)-0.331106306f, (float16_t)-0.943593458f,
+   (float16_t)-0.329658463f, (float16_t)-0.944100258f,
+   (float16_t)-0.328209844f, (float16_t)-0.944604837f,
+   (float16_t)-0.326760452f, (float16_t)-0.945107193f,
+   (float16_t)-0.325310292f, (float16_t)-0.945607325f,
+   (float16_t)-0.323859367f, (float16_t)-0.946105232f,
+   (float16_t)-0.322407679f, (float16_t)-0.946600913f,
+   (float16_t)-0.320955232f, (float16_t)-0.947094366f,
+   (float16_t)-0.319502031f, (float16_t)-0.947585591f,
+   (float16_t)-0.318048077f, (float16_t)-0.948074586f,
+   (float16_t)-0.316593376f, (float16_t)-0.948561350f,
+   (float16_t)-0.315137929f, (float16_t)-0.949045882f,
+   (float16_t)-0.313681740f, (float16_t)-0.949528181f,
+   (float16_t)-0.312224814f, (float16_t)-0.950008245f,
+   (float16_t)-0.310767153f, (float16_t)-0.950486074f,
+   (float16_t)-0.309308760f, (float16_t)-0.950961666f,
+   (float16_t)-0.307849640f, (float16_t)-0.951435021f,
+   (float16_t)-0.306389795f, (float16_t)-0.951906137f,
+   (float16_t)-0.304929230f, (float16_t)-0.952375013f,
+   (float16_t)-0.303467947f, (float16_t)-0.952841648f,
+   (float16_t)-0.302005949f, (float16_t)-0.953306040f,
+   (float16_t)-0.300543241f, (float16_t)-0.953768190f,
+   (float16_t)-0.299079826f, (float16_t)-0.954228095f,
+   (float16_t)-0.297615707f, (float16_t)-0.954685755f,
+   (float16_t)-0.296150888f, (float16_t)-0.955141168f,
+   (float16_t)-0.294685372f, (float16_t)-0.955594334f,
+   (float16_t)-0.293219163f, (float16_t)-0.956045251f,
+   (float16_t)-0.291752263f, (float16_t)-0.956493919f,
+   (float16_t)-0.290284677f, (float16_t)-0.956940336f,
+   (float16_t)-0.288816408f, (float16_t)-0.957384501f,
+   (float16_t)-0.287347460f, (float16_t)-0.957826413f,
+   (float16_t)-0.285877835f, (float16_t)-0.958266071f,
+   (float16_t)-0.284407537f, (float16_t)-0.958703475f,
+   (float16_t)-0.282936570f, (float16_t)-0.959138622f,
+   (float16_t)-0.281464938f, (float16_t)-0.959571513f,
+   (float16_t)-0.279992643f, (float16_t)-0.960002146f,
+   (float16_t)-0.278519689f, (float16_t)-0.960430519f,
+   (float16_t)-0.277046080f, (float16_t)-0.960856633f,
+   (float16_t)-0.275571819f, (float16_t)-0.961280486f,
+   (float16_t)-0.274096910f, (float16_t)-0.961702077f,
+   (float16_t)-0.272621355f, (float16_t)-0.962121404f,
+   (float16_t)-0.271145160f, (float16_t)-0.962538468f,
+   (float16_t)-0.269668326f, (float16_t)-0.962953267f,
+   (float16_t)-0.268190857f, (float16_t)-0.963365800f,
+   (float16_t)-0.266712757f, (float16_t)-0.963776066f,
+   (float16_t)-0.265234030f, (float16_t)-0.964184064f,
+   (float16_t)-0.263754679f, (float16_t)-0.964589793f,
+   (float16_t)-0.262274707f, (float16_t)-0.964993253f,
+   (float16_t)-0.260794118f, (float16_t)-0.965394442f,
+   (float16_t)-0.259312915f, (float16_t)-0.965793359f,
+   (float16_t)-0.257831102f, (float16_t)-0.966190003f,
+   (float16_t)-0.256348682f, (float16_t)-0.966584374f,
+   (float16_t)-0.254865660f, (float16_t)-0.966976471f,
+   (float16_t)-0.253382037f, (float16_t)-0.967366292f,
+   (float16_t)-0.251897818f, (float16_t)-0.967753837f,
+   (float16_t)-0.250413007f, (float16_t)-0.968139105f,
+   (float16_t)-0.248927606f, (float16_t)-0.968522094f,
+   (float16_t)-0.247441619f, (float16_t)-0.968902805f,
+   (float16_t)-0.245955050f, (float16_t)-0.969281235f,
+   (float16_t)-0.244467903f, (float16_t)-0.969657385f,
+   (float16_t)-0.242980180f, (float16_t)-0.970031253f,
+   (float16_t)-0.241491885f, (float16_t)-0.970402839f,
+   (float16_t)-0.240003022f, (float16_t)-0.970772141f,
+   (float16_t)-0.238513595f, (float16_t)-0.971139158f,
+   (float16_t)-0.237023606f, (float16_t)-0.971503891f,
+   (float16_t)-0.235533059f, (float16_t)-0.971866337f,
+   (float16_t)-0.234041959f, (float16_t)-0.972226497f,
+   (float16_t)-0.232550307f, (float16_t)-0.972584369f,
+   (float16_t)-0.231058108f, (float16_t)-0.972939952f,
+   (float16_t)-0.229565366f, (float16_t)-0.973293246f,
+   (float16_t)-0.228072083f, (float16_t)-0.973644250f,
+   (float16_t)-0.226578264f, (float16_t)-0.973992962f,
+   (float16_t)-0.225083911f, (float16_t)-0.974339383f,
+   (float16_t)-0.223589029f, (float16_t)-0.974683511f,
+   (float16_t)-0.222093621f, (float16_t)-0.975025345f,
+   (float16_t)-0.220597690f, (float16_t)-0.975364885f,
+   (float16_t)-0.219101240f, (float16_t)-0.975702130f,
+   (float16_t)-0.217604275f, (float16_t)-0.976037079f,
+   (float16_t)-0.216106797f, (float16_t)-0.976369731f,
+   (float16_t)-0.214608811f, (float16_t)-0.976700086f,
+   (float16_t)-0.213110320f, (float16_t)-0.977028143f,
+   (float16_t)-0.211611327f, (float16_t)-0.977353900f,
+   (float16_t)-0.210111837f, (float16_t)-0.977677358f,
+   (float16_t)-0.208611852f, (float16_t)-0.977998515f,
+   (float16_t)-0.207111376f, (float16_t)-0.978317371f,
+   (float16_t)-0.205610413f, (float16_t)-0.978633924f,
+   (float16_t)-0.204108966f, (float16_t)-0.978948175f,
+   (float16_t)-0.202607039f, (float16_t)-0.979260123f,
+   (float16_t)-0.201104635f, (float16_t)-0.979569766f,
+   (float16_t)-0.199601758f, (float16_t)-0.979877104f,
+   (float16_t)-0.198098411f, (float16_t)-0.980182136f,
+   (float16_t)-0.196594598f, (float16_t)-0.980484862f,
+   (float16_t)-0.195090322f, (float16_t)-0.980785280f,
+   (float16_t)-0.193585587f, (float16_t)-0.981083391f,
+   (float16_t)-0.192080397f, (float16_t)-0.981379193f,
+   (float16_t)-0.190574755f, (float16_t)-0.981672686f,
+   (float16_t)-0.189068664f, (float16_t)-0.981963869f,
+   (float16_t)-0.187562129f, (float16_t)-0.982252741f,
+   (float16_t)-0.186055152f, (float16_t)-0.982539302f,
+   (float16_t)-0.184547737f, (float16_t)-0.982823551f,
+   (float16_t)-0.183039888f, (float16_t)-0.983105487f,
+   (float16_t)-0.181531608f, (float16_t)-0.983385110f,
+   (float16_t)-0.180022901f, (float16_t)-0.983662419f,
+   (float16_t)-0.178513771f, (float16_t)-0.983937413f,
+   (float16_t)-0.177004220f, (float16_t)-0.984210092f,
+   (float16_t)-0.175494253f, (float16_t)-0.984480455f,
+   (float16_t)-0.173983873f, (float16_t)-0.984748502f,
+   (float16_t)-0.172473084f, (float16_t)-0.985014231f,
+   (float16_t)-0.170961889f, (float16_t)-0.985277642f,
+   (float16_t)-0.169450291f, (float16_t)-0.985538735f,
+   (float16_t)-0.167938295f, (float16_t)-0.985797509f,
+   (float16_t)-0.166425904f, (float16_t)-0.986053963f,
+   (float16_t)-0.164913120f, (float16_t)-0.986308097f,
+   (float16_t)-0.163399949f, (float16_t)-0.986559910f,
+   (float16_t)-0.161886394f, (float16_t)-0.986809402f,
+   (float16_t)-0.160372457f, (float16_t)-0.987056571f,
+   (float16_t)-0.158858143f, (float16_t)-0.987301418f,
+   (float16_t)-0.157343456f, (float16_t)-0.987543942f,
+   (float16_t)-0.155828398f, (float16_t)-0.987784142f,
+   (float16_t)-0.154312973f, (float16_t)-0.988022017f,
+   (float16_t)-0.152797185f, (float16_t)-0.988257568f,
+   (float16_t)-0.151281038f, (float16_t)-0.988490793f,
+   (float16_t)-0.149764535f, (float16_t)-0.988721692f,
+   (float16_t)-0.148247679f, (float16_t)-0.988950265f,
+   (float16_t)-0.146730474f, (float16_t)-0.989176510f,
+   (float16_t)-0.145212925f, (float16_t)-0.989400428f,
+   (float16_t)-0.143695033f, (float16_t)-0.989622017f,
+   (float16_t)-0.142176804f, (float16_t)-0.989841278f,
+   (float16_t)-0.140658239f, (float16_t)-0.990058210f,
+   (float16_t)-0.139139344f, (float16_t)-0.990272812f,
+   (float16_t)-0.137620122f, (float16_t)-0.990485084f,
+   (float16_t)-0.136100575f, (float16_t)-0.990695025f,
+   (float16_t)-0.134580709f, (float16_t)-0.990902635f,
+   (float16_t)-0.133060525f, (float16_t)-0.991107914f,
+   (float16_t)-0.131540029f, (float16_t)-0.991310860f,
+   (float16_t)-0.130019223f, (float16_t)-0.991511473f,
+   (float16_t)-0.128498111f, (float16_t)-0.991709754f,
+   (float16_t)-0.126976696f, (float16_t)-0.991905700f,
+   (float16_t)-0.125454983f, (float16_t)-0.992099313f,
+   (float16_t)-0.123932975f, (float16_t)-0.992290591f,
+   (float16_t)-0.122410675f, (float16_t)-0.992479535f,
+   (float16_t)-0.120888087f, (float16_t)-0.992666142f,
+   (float16_t)-0.119365215f, (float16_t)-0.992850414f,
+   (float16_t)-0.117842062f, (float16_t)-0.993032350f,
+   (float16_t)-0.116318631f, (float16_t)-0.993211949f,
+   (float16_t)-0.114794927f, (float16_t)-0.993389211f,
+   (float16_t)-0.113270952f, (float16_t)-0.993564136f,
+   (float16_t)-0.111746711f, (float16_t)-0.993736722f,
+   (float16_t)-0.110222207f, (float16_t)-0.993906970f,
+   (float16_t)-0.108697444f, (float16_t)-0.994074879f,
+   (float16_t)-0.107172425f, (float16_t)-0.994240449f,
+   (float16_t)-0.105647154f, (float16_t)-0.994403680f,
+   (float16_t)-0.104121634f, (float16_t)-0.994564571f,
+   (float16_t)-0.102595869f, (float16_t)-0.994723121f,
+   (float16_t)-0.101069863f, (float16_t)-0.994879331f,
+   (float16_t)-0.099543619f, (float16_t)-0.995033199f,
+   (float16_t)-0.098017140f, (float16_t)-0.995184727f,
+   (float16_t)-0.096490431f, (float16_t)-0.995333912f,
+   (float16_t)-0.094963495f, (float16_t)-0.995480755f,
+   (float16_t)-0.093436336f, (float16_t)-0.995625256f,
+   (float16_t)-0.091908956f, (float16_t)-0.995767414f,
+   (float16_t)-0.090381361f, (float16_t)-0.995907229f,
+   (float16_t)-0.088853553f, (float16_t)-0.996044701f,
+   (float16_t)-0.087325535f, (float16_t)-0.996179829f,
+   (float16_t)-0.085797312f, (float16_t)-0.996312612f,
+   (float16_t)-0.084268888f, (float16_t)-0.996443051f,
+   (float16_t)-0.082740265f, (float16_t)-0.996571146f,
+   (float16_t)-0.081211447f, (float16_t)-0.996696895f,
+   (float16_t)-0.079682438f, (float16_t)-0.996820299f,
+   (float16_t)-0.078153242f, (float16_t)-0.996941358f,
+   (float16_t)-0.076623861f, (float16_t)-0.997060070f,
+   (float16_t)-0.075094301f, (float16_t)-0.997176437f,
+   (float16_t)-0.073564564f, (float16_t)-0.997290457f,
+   (float16_t)-0.072034653f, (float16_t)-0.997402130f,
+   (float16_t)-0.070504573f, (float16_t)-0.997511456f,
+   (float16_t)-0.068974328f, (float16_t)-0.997618435f,
+   (float16_t)-0.067443920f, (float16_t)-0.997723067f,
+   (float16_t)-0.065913353f, (float16_t)-0.997825350f,
+   (float16_t)-0.064382631f, (float16_t)-0.997925286f,
+   (float16_t)-0.062851758f, (float16_t)-0.998022874f,
+   (float16_t)-0.061320736f, (float16_t)-0.998118113f,
+   (float16_t)-0.059789571f, (float16_t)-0.998211003f,
+   (float16_t)-0.058258265f, (float16_t)-0.998301545f,
+   (float16_t)-0.056726821f, (float16_t)-0.998389737f,
+   (float16_t)-0.055195244f, (float16_t)-0.998475581f,
+   (float16_t)-0.053663538f, (float16_t)-0.998559074f,
+   (float16_t)-0.052131705f, (float16_t)-0.998640218f,
+   (float16_t)-0.050599749f, (float16_t)-0.998719012f,
+   (float16_t)-0.049067674f, (float16_t)-0.998795456f,
+   (float16_t)-0.047535484f, (float16_t)-0.998869550f,
+   (float16_t)-0.046003182f, (float16_t)-0.998941293f,
+   (float16_t)-0.044470772f, (float16_t)-0.999010686f,
+   (float16_t)-0.042938257f, (float16_t)-0.999077728f,
+   (float16_t)-0.041405641f, (float16_t)-0.999142419f,
+   (float16_t)-0.039872928f, (float16_t)-0.999204759f,
+   (float16_t)-0.038340120f, (float16_t)-0.999264747f,
+   (float16_t)-0.036807223f, (float16_t)-0.999322385f,
+   (float16_t)-0.035274239f, (float16_t)-0.999377670f,
+   (float16_t)-0.033741172f, (float16_t)-0.999430605f,
+   (float16_t)-0.032208025f, (float16_t)-0.999481187f,
+   (float16_t)-0.030674803f, (float16_t)-0.999529418f,
+   (float16_t)-0.029141509f, (float16_t)-0.999575296f,
+   (float16_t)-0.027608146f, (float16_t)-0.999618822f,
+   (float16_t)-0.026074718f, (float16_t)-0.999659997f,
+   (float16_t)-0.024541229f, (float16_t)-0.999698819f,
+   (float16_t)-0.023007681f, (float16_t)-0.999735288f,
+   (float16_t)-0.021474080f, (float16_t)-0.999769405f,
+   (float16_t)-0.019940429f, (float16_t)-0.999801170f,
+   (float16_t)-0.018406730f, (float16_t)-0.999830582f,
+   (float16_t)-0.016872988f, (float16_t)-0.999857641f,
+   (float16_t)-0.015339206f, (float16_t)-0.999882347f,
+   (float16_t)-0.013805389f, (float16_t)-0.999904701f,
+   (float16_t)-0.012271538f, (float16_t)-0.999924702f,
+   (float16_t)-0.010737659f, (float16_t)-0.999942350f,
+   (float16_t)-0.009203755f, (float16_t)-0.999957645f,
+   (float16_t)-0.007669829f, (float16_t)-0.999970586f,
+   (float16_t)-0.006135885f, (float16_t)-0.999981175f,
+   (float16_t)-0.004601926f, (float16_t)-0.999989411f,
+   (float16_t)-0.003067957f, (float16_t)-0.999995294f,
+   (float16_t)-0.001533980f, (float16_t)-0.999998823f,
+   (float16_t)-0.000000000f, (float16_t)-1.000000000f,
+    (float16_t)0.001533980f, (float16_t)-0.999998823f,
+    (float16_t)0.003067957f, (float16_t)-0.999995294f,
+    (float16_t)0.004601926f, (float16_t)-0.999989411f,
+    (float16_t)0.006135885f, (float16_t)-0.999981175f,
+    (float16_t)0.007669829f, (float16_t)-0.999970586f,
+    (float16_t)0.009203755f, (float16_t)-0.999957645f,
+    (float16_t)0.010737659f, (float16_t)-0.999942350f,
+    (float16_t)0.012271538f, (float16_t)-0.999924702f,
+    (float16_t)0.013805389f, (float16_t)-0.999904701f,
+    (float16_t)0.015339206f, (float16_t)-0.999882347f,
+    (float16_t)0.016872988f, (float16_t)-0.999857641f,
+    (float16_t)0.018406730f, (float16_t)-0.999830582f,
+    (float16_t)0.019940429f, (float16_t)-0.999801170f,
+    (float16_t)0.021474080f, (float16_t)-0.999769405f,
+    (float16_t)0.023007681f, (float16_t)-0.999735288f,
+    (float16_t)0.024541229f, (float16_t)-0.999698819f,
+    (float16_t)0.026074718f, (float16_t)-0.999659997f,
+    (float16_t)0.027608146f, (float16_t)-0.999618822f,
+    (float16_t)0.029141509f, (float16_t)-0.999575296f,
+    (float16_t)0.030674803f, (float16_t)-0.999529418f,
+    (float16_t)0.032208025f, (float16_t)-0.999481187f,
+    (float16_t)0.033741172f, (float16_t)-0.999430605f,
+    (float16_t)0.035274239f, (float16_t)-0.999377670f,
+    (float16_t)0.036807223f, (float16_t)-0.999322385f,
+    (float16_t)0.038340120f, (float16_t)-0.999264747f,
+    (float16_t)0.039872928f, (float16_t)-0.999204759f,
+    (float16_t)0.041405641f, (float16_t)-0.999142419f,
+    (float16_t)0.042938257f, (float16_t)-0.999077728f,
+    (float16_t)0.044470772f, (float16_t)-0.999010686f,
+    (float16_t)0.046003182f, (float16_t)-0.998941293f,
+    (float16_t)0.047535484f, (float16_t)-0.998869550f,
+    (float16_t)0.049067674f, (float16_t)-0.998795456f,
+    (float16_t)0.050599749f, (float16_t)-0.998719012f,
+    (float16_t)0.052131705f, (float16_t)-0.998640218f,
+    (float16_t)0.053663538f, (float16_t)-0.998559074f,
+    (float16_t)0.055195244f, (float16_t)-0.998475581f,
+    (float16_t)0.056726821f, (float16_t)-0.998389737f,
+    (float16_t)0.058258265f, (float16_t)-0.998301545f,
+    (float16_t)0.059789571f, (float16_t)-0.998211003f,
+    (float16_t)0.061320736f, (float16_t)-0.998118113f,
+    (float16_t)0.062851758f, (float16_t)-0.998022874f,
+    (float16_t)0.064382631f, (float16_t)-0.997925286f,
+    (float16_t)0.065913353f, (float16_t)-0.997825350f,
+    (float16_t)0.067443920f, (float16_t)-0.997723067f,
+    (float16_t)0.068974328f, (float16_t)-0.997618435f,
+    (float16_t)0.070504573f, (float16_t)-0.997511456f,
+    (float16_t)0.072034653f, (float16_t)-0.997402130f,
+    (float16_t)0.073564564f, (float16_t)-0.997290457f,
+    (float16_t)0.075094301f, (float16_t)-0.997176437f,
+    (float16_t)0.076623861f, (float16_t)-0.997060070f,
+    (float16_t)0.078153242f, (float16_t)-0.996941358f,
+    (float16_t)0.079682438f, (float16_t)-0.996820299f,
+    (float16_t)0.081211447f, (float16_t)-0.996696895f,
+    (float16_t)0.082740265f, (float16_t)-0.996571146f,
+    (float16_t)0.084268888f, (float16_t)-0.996443051f,
+    (float16_t)0.085797312f, (float16_t)-0.996312612f,
+    (float16_t)0.087325535f, (float16_t)-0.996179829f,
+    (float16_t)0.088853553f, (float16_t)-0.996044701f,
+    (float16_t)0.090381361f, (float16_t)-0.995907229f,
+    (float16_t)0.091908956f, (float16_t)-0.995767414f,
+    (float16_t)0.093436336f, (float16_t)-0.995625256f,
+    (float16_t)0.094963495f, (float16_t)-0.995480755f,
+    (float16_t)0.096490431f, (float16_t)-0.995333912f,
+    (float16_t)0.098017140f, (float16_t)-0.995184727f,
+    (float16_t)0.099543619f, (float16_t)-0.995033199f,
+    (float16_t)0.101069863f, (float16_t)-0.994879331f,
+    (float16_t)0.102595869f, (float16_t)-0.994723121f,
+    (float16_t)0.104121634f, (float16_t)-0.994564571f,
+    (float16_t)0.105647154f, (float16_t)-0.994403680f,
+    (float16_t)0.107172425f, (float16_t)-0.994240449f,
+    (float16_t)0.108697444f, (float16_t)-0.994074879f,
+    (float16_t)0.110222207f, (float16_t)-0.993906970f,
+    (float16_t)0.111746711f, (float16_t)-0.993736722f,
+    (float16_t)0.113270952f, (float16_t)-0.993564136f,
+    (float16_t)0.114794927f, (float16_t)-0.993389211f,
+    (float16_t)0.116318631f, (float16_t)-0.993211949f,
+    (float16_t)0.117842062f, (float16_t)-0.993032350f,
+    (float16_t)0.119365215f, (float16_t)-0.992850414f,
+    (float16_t)0.120888087f, (float16_t)-0.992666142f,
+    (float16_t)0.122410675f, (float16_t)-0.992479535f,
+    (float16_t)0.123932975f, (float16_t)-0.992290591f,
+    (float16_t)0.125454983f, (float16_t)-0.992099313f,
+    (float16_t)0.126976696f, (float16_t)-0.991905700f,
+    (float16_t)0.128498111f, (float16_t)-0.991709754f,
+    (float16_t)0.130019223f, (float16_t)-0.991511473f,
+    (float16_t)0.131540029f, (float16_t)-0.991310860f,
+    (float16_t)0.133060525f, (float16_t)-0.991107914f,
+    (float16_t)0.134580709f, (float16_t)-0.990902635f,
+    (float16_t)0.136100575f, (float16_t)-0.990695025f,
+    (float16_t)0.137620122f, (float16_t)-0.990485084f,
+    (float16_t)0.139139344f, (float16_t)-0.990272812f,
+    (float16_t)0.140658239f, (float16_t)-0.990058210f,
+    (float16_t)0.142176804f, (float16_t)-0.989841278f,
+    (float16_t)0.143695033f, (float16_t)-0.989622017f,
+    (float16_t)0.145212925f, (float16_t)-0.989400428f,
+    (float16_t)0.146730474f, (float16_t)-0.989176510f,
+    (float16_t)0.148247679f, (float16_t)-0.988950265f,
+    (float16_t)0.149764535f, (float16_t)-0.988721692f,
+    (float16_t)0.151281038f, (float16_t)-0.988490793f,
+    (float16_t)0.152797185f, (float16_t)-0.988257568f,
+    (float16_t)0.154312973f, (float16_t)-0.988022017f,
+    (float16_t)0.155828398f, (float16_t)-0.987784142f,
+    (float16_t)0.157343456f, (float16_t)-0.987543942f,
+    (float16_t)0.158858143f, (float16_t)-0.987301418f,
+    (float16_t)0.160372457f, (float16_t)-0.987056571f,
+    (float16_t)0.161886394f, (float16_t)-0.986809402f,
+    (float16_t)0.163399949f, (float16_t)-0.986559910f,
+    (float16_t)0.164913120f, (float16_t)-0.986308097f,
+    (float16_t)0.166425904f, (float16_t)-0.986053963f,
+    (float16_t)0.167938295f, (float16_t)-0.985797509f,
+    (float16_t)0.169450291f, (float16_t)-0.985538735f,
+    (float16_t)0.170961889f, (float16_t)-0.985277642f,
+    (float16_t)0.172473084f, (float16_t)-0.985014231f,
+    (float16_t)0.173983873f, (float16_t)-0.984748502f,
+    (float16_t)0.175494253f, (float16_t)-0.984480455f,
+    (float16_t)0.177004220f, (float16_t)-0.984210092f,
+    (float16_t)0.178513771f, (float16_t)-0.983937413f,
+    (float16_t)0.180022901f, (float16_t)-0.983662419f,
+    (float16_t)0.181531608f, (float16_t)-0.983385110f,
+    (float16_t)0.183039888f, (float16_t)-0.983105487f,
+    (float16_t)0.184547737f, (float16_t)-0.982823551f,
+    (float16_t)0.186055152f, (float16_t)-0.982539302f,
+    (float16_t)0.187562129f, (float16_t)-0.982252741f,
+    (float16_t)0.189068664f, (float16_t)-0.981963869f,
+    (float16_t)0.190574755f, (float16_t)-0.981672686f,
+    (float16_t)0.192080397f, (float16_t)-0.981379193f,
+    (float16_t)0.193585587f, (float16_t)-0.981083391f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f,
+    (float16_t)0.196594598f, (float16_t)-0.980484862f,
+    (float16_t)0.198098411f, (float16_t)-0.980182136f,
+    (float16_t)0.199601758f, (float16_t)-0.979877104f,
+    (float16_t)0.201104635f, (float16_t)-0.979569766f,
+    (float16_t)0.202607039f, (float16_t)-0.979260123f,
+    (float16_t)0.204108966f, (float16_t)-0.978948175f,
+    (float16_t)0.205610413f, (float16_t)-0.978633924f,
+    (float16_t)0.207111376f, (float16_t)-0.978317371f,
+    (float16_t)0.208611852f, (float16_t)-0.977998515f,
+    (float16_t)0.210111837f, (float16_t)-0.977677358f,
+    (float16_t)0.211611327f, (float16_t)-0.977353900f,
+    (float16_t)0.213110320f, (float16_t)-0.977028143f,
+    (float16_t)0.214608811f, (float16_t)-0.976700086f,
+    (float16_t)0.216106797f, (float16_t)-0.976369731f,
+    (float16_t)0.217604275f, (float16_t)-0.976037079f,
+    (float16_t)0.219101240f, (float16_t)-0.975702130f,
+    (float16_t)0.220597690f, (float16_t)-0.975364885f,
+    (float16_t)0.222093621f, (float16_t)-0.975025345f,
+    (float16_t)0.223589029f, (float16_t)-0.974683511f,
+    (float16_t)0.225083911f, (float16_t)-0.974339383f,
+    (float16_t)0.226578264f, (float16_t)-0.973992962f,
+    (float16_t)0.228072083f, (float16_t)-0.973644250f,
+    (float16_t)0.229565366f, (float16_t)-0.973293246f,
+    (float16_t)0.231058108f, (float16_t)-0.972939952f,
+    (float16_t)0.232550307f, (float16_t)-0.972584369f,
+    (float16_t)0.234041959f, (float16_t)-0.972226497f,
+    (float16_t)0.235533059f, (float16_t)-0.971866337f,
+    (float16_t)0.237023606f, (float16_t)-0.971503891f,
+    (float16_t)0.238513595f, (float16_t)-0.971139158f,
+    (float16_t)0.240003022f, (float16_t)-0.970772141f,
+    (float16_t)0.241491885f, (float16_t)-0.970402839f,
+    (float16_t)0.242980180f, (float16_t)-0.970031253f,
+    (float16_t)0.244467903f, (float16_t)-0.969657385f,
+    (float16_t)0.245955050f, (float16_t)-0.969281235f,
+    (float16_t)0.247441619f, (float16_t)-0.968902805f,
+    (float16_t)0.248927606f, (float16_t)-0.968522094f,
+    (float16_t)0.250413007f, (float16_t)-0.968139105f,
+    (float16_t)0.251897818f, (float16_t)-0.967753837f,
+    (float16_t)0.253382037f, (float16_t)-0.967366292f,
+    (float16_t)0.254865660f, (float16_t)-0.966976471f,
+    (float16_t)0.256348682f, (float16_t)-0.966584374f,
+    (float16_t)0.257831102f, (float16_t)-0.966190003f,
+    (float16_t)0.259312915f, (float16_t)-0.965793359f,
+    (float16_t)0.260794118f, (float16_t)-0.965394442f,
+    (float16_t)0.262274707f, (float16_t)-0.964993253f,
+    (float16_t)0.263754679f, (float16_t)-0.964589793f,
+    (float16_t)0.265234030f, (float16_t)-0.964184064f,
+    (float16_t)0.266712757f, (float16_t)-0.963776066f,
+    (float16_t)0.268190857f, (float16_t)-0.963365800f,
+    (float16_t)0.269668326f, (float16_t)-0.962953267f,
+    (float16_t)0.271145160f, (float16_t)-0.962538468f,
+    (float16_t)0.272621355f, (float16_t)-0.962121404f,
+    (float16_t)0.274096910f, (float16_t)-0.961702077f,
+    (float16_t)0.275571819f, (float16_t)-0.961280486f,
+    (float16_t)0.277046080f, (float16_t)-0.960856633f,
+    (float16_t)0.278519689f, (float16_t)-0.960430519f,
+    (float16_t)0.279992643f, (float16_t)-0.960002146f,
+    (float16_t)0.281464938f, (float16_t)-0.959571513f,
+    (float16_t)0.282936570f, (float16_t)-0.959138622f,
+    (float16_t)0.284407537f, (float16_t)-0.958703475f,
+    (float16_t)0.285877835f, (float16_t)-0.958266071f,
+    (float16_t)0.287347460f, (float16_t)-0.957826413f,
+    (float16_t)0.288816408f, (float16_t)-0.957384501f,
+    (float16_t)0.290284677f, (float16_t)-0.956940336f,
+    (float16_t)0.291752263f, (float16_t)-0.956493919f,
+    (float16_t)0.293219163f, (float16_t)-0.956045251f,
+    (float16_t)0.294685372f, (float16_t)-0.955594334f,
+    (float16_t)0.296150888f, (float16_t)-0.955141168f,
+    (float16_t)0.297615707f, (float16_t)-0.954685755f,
+    (float16_t)0.299079826f, (float16_t)-0.954228095f,
+    (float16_t)0.300543241f, (float16_t)-0.953768190f,
+    (float16_t)0.302005949f, (float16_t)-0.953306040f,
+    (float16_t)0.303467947f, (float16_t)-0.952841648f,
+    (float16_t)0.304929230f, (float16_t)-0.952375013f,
+    (float16_t)0.306389795f, (float16_t)-0.951906137f,
+    (float16_t)0.307849640f, (float16_t)-0.951435021f,
+    (float16_t)0.309308760f, (float16_t)-0.950961666f,
+    (float16_t)0.310767153f, (float16_t)-0.950486074f,
+    (float16_t)0.312224814f, (float16_t)-0.950008245f,
+    (float16_t)0.313681740f, (float16_t)-0.949528181f,
+    (float16_t)0.315137929f, (float16_t)-0.949045882f,
+    (float16_t)0.316593376f, (float16_t)-0.948561350f,
+    (float16_t)0.318048077f, (float16_t)-0.948074586f,
+    (float16_t)0.319502031f, (float16_t)-0.947585591f,
+    (float16_t)0.320955232f, (float16_t)-0.947094366f,
+    (float16_t)0.322407679f, (float16_t)-0.946600913f,
+    (float16_t)0.323859367f, (float16_t)-0.946105232f,
+    (float16_t)0.325310292f, (float16_t)-0.945607325f,
+    (float16_t)0.326760452f, (float16_t)-0.945107193f,
+    (float16_t)0.328209844f, (float16_t)-0.944604837f,
+    (float16_t)0.329658463f, (float16_t)-0.944100258f,
+    (float16_t)0.331106306f, (float16_t)-0.943593458f,
+    (float16_t)0.332553370f, (float16_t)-0.943084437f,
+    (float16_t)0.333999651f, (float16_t)-0.942573198f,
+    (float16_t)0.335445147f, (float16_t)-0.942059740f,
+    (float16_t)0.336889853f, (float16_t)-0.941544065f,
+    (float16_t)0.338333767f, (float16_t)-0.941026175f,
+    (float16_t)0.339776884f, (float16_t)-0.940506071f,
+    (float16_t)0.341219202f, (float16_t)-0.939983753f,
+    (float16_t)0.342660717f, (float16_t)-0.939459224f,
+    (float16_t)0.344101426f, (float16_t)-0.938932484f,
+    (float16_t)0.345541325f, (float16_t)-0.938403534f,
+    (float16_t)0.346980411f, (float16_t)-0.937872376f,
+    (float16_t)0.348418680f, (float16_t)-0.937339012f,
+    (float16_t)0.349856130f, (float16_t)-0.936803442f,
+    (float16_t)0.351292756f, (float16_t)-0.936265667f,
+    (float16_t)0.352728556f, (float16_t)-0.935725689f,
+    (float16_t)0.354163525f, (float16_t)-0.935183510f,
+    (float16_t)0.355597662f, (float16_t)-0.934639130f,
+    (float16_t)0.357030961f, (float16_t)-0.934092550f,
+    (float16_t)0.358463421f, (float16_t)-0.933543773f,
+    (float16_t)0.359895037f, (float16_t)-0.932992799f,
+    (float16_t)0.361325806f, (float16_t)-0.932439629f,
+    (float16_t)0.362755724f, (float16_t)-0.931884266f,
+    (float16_t)0.364184790f, (float16_t)-0.931326709f,
+    (float16_t)0.365612998f, (float16_t)-0.930766961f,
+    (float16_t)0.367040346f, (float16_t)-0.930205023f,
+    (float16_t)0.368466830f, (float16_t)-0.929640896f,
+    (float16_t)0.369892447f, (float16_t)-0.929074581f,
+    (float16_t)0.371317194f, (float16_t)-0.928506080f,
+    (float16_t)0.372741067f, (float16_t)-0.927935395f,
+    (float16_t)0.374164063f, (float16_t)-0.927362526f,
+    (float16_t)0.375586178f, (float16_t)-0.926787474f,
+    (float16_t)0.377007410f, (float16_t)-0.926210242f,
+    (float16_t)0.378427755f, (float16_t)-0.925630831f,
+    (float16_t)0.379847209f, (float16_t)-0.925049241f,
+    (float16_t)0.381265769f, (float16_t)-0.924465474f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.384100195f, (float16_t)-0.923291417f,
+    (float16_t)0.385516054f, (float16_t)-0.922701128f,
+    (float16_t)0.386931006f, (float16_t)-0.922108669f,
+    (float16_t)0.388345047f, (float16_t)-0.921514039f,
+    (float16_t)0.389758174f, (float16_t)-0.920917242f,
+    (float16_t)0.391170384f, (float16_t)-0.920318277f,
+    (float16_t)0.392581674f, (float16_t)-0.919717146f,
+    (float16_t)0.393992040f, (float16_t)-0.919113852f,
+    (float16_t)0.395401479f, (float16_t)-0.918508394f,
+    (float16_t)0.396809987f, (float16_t)-0.917900776f,
+    (float16_t)0.398217562f, (float16_t)-0.917290997f,
+    (float16_t)0.399624200f, (float16_t)-0.916679060f,
+    (float16_t)0.401029897f, (float16_t)-0.916064966f,
+    (float16_t)0.402434651f, (float16_t)-0.915448716f,
+    (float16_t)0.403838458f, (float16_t)-0.914830312f,
+    (float16_t)0.405241314f, (float16_t)-0.914209756f,
+    (float16_t)0.406643217f, (float16_t)-0.913587048f,
+    (float16_t)0.408044163f, (float16_t)-0.912962190f,
+    (float16_t)0.409444149f, (float16_t)-0.912335185f,
+    (float16_t)0.410843171f, (float16_t)-0.911706032f,
+    (float16_t)0.412241227f, (float16_t)-0.911074734f,
+    (float16_t)0.413638312f, (float16_t)-0.910441292f,
+    (float16_t)0.415034424f, (float16_t)-0.909805708f,
+    (float16_t)0.416429560f, (float16_t)-0.909167983f,
+    (float16_t)0.417823716f, (float16_t)-0.908528119f,
+    (float16_t)0.419216888f, (float16_t)-0.907886116f,
+    (float16_t)0.420609074f, (float16_t)-0.907241978f,
+    (float16_t)0.422000271f, (float16_t)-0.906595705f,
+    (float16_t)0.423390474f, (float16_t)-0.905947298f,
+    (float16_t)0.424779681f, (float16_t)-0.905296759f,
+    (float16_t)0.426167889f, (float16_t)-0.904644091f,
+    (float16_t)0.427555093f, (float16_t)-0.903989293f,
+    (float16_t)0.428941292f, (float16_t)-0.903332368f,
+    (float16_t)0.430326481f, (float16_t)-0.902673318f,
+    (float16_t)0.431710658f, (float16_t)-0.902012144f,
+    (float16_t)0.433093819f, (float16_t)-0.901348847f,
+    (float16_t)0.434475961f, (float16_t)-0.900683429f,
+    (float16_t)0.435857080f, (float16_t)-0.900015892f,
+    (float16_t)0.437237174f, (float16_t)-0.899346237f,
+    (float16_t)0.438616239f, (float16_t)-0.898674466f,
+    (float16_t)0.439994271f, (float16_t)-0.898000580f,
+    (float16_t)0.441371269f, (float16_t)-0.897324581f,
+    (float16_t)0.442747228f, (float16_t)-0.896646470f,
+    (float16_t)0.444122145f, (float16_t)-0.895966250f,
+    (float16_t)0.445496017f, (float16_t)-0.895283921f,
+    (float16_t)0.446868840f, (float16_t)-0.894599486f,
+    (float16_t)0.448240612f, (float16_t)-0.893912945f,
+    (float16_t)0.449611330f, (float16_t)-0.893224301f,
+    (float16_t)0.450980989f, (float16_t)-0.892533555f,
+    (float16_t)0.452349587f, (float16_t)-0.891840709f,
+    (float16_t)0.453717121f, (float16_t)-0.891145765f,
+    (float16_t)0.455083587f, (float16_t)-0.890448723f,
+    (float16_t)0.456448982f, (float16_t)-0.889749586f,
+    (float16_t)0.457813304f, (float16_t)-0.889048356f,
+    (float16_t)0.459176548f, (float16_t)-0.888345033f,
+    (float16_t)0.460538711f, (float16_t)-0.887639620f,
+    (float16_t)0.461899791f, (float16_t)-0.886932119f,
+    (float16_t)0.463259784f, (float16_t)-0.886222530f,
+    (float16_t)0.464618686f, (float16_t)-0.885510856f,
+    (float16_t)0.465976496f, (float16_t)-0.884797098f,
+    (float16_t)0.467333209f, (float16_t)-0.884081259f,
+    (float16_t)0.468688822f, (float16_t)-0.883363339f,
+    (float16_t)0.470043332f, (float16_t)-0.882643340f,
+    (float16_t)0.471396737f, (float16_t)-0.881921264f,
+    (float16_t)0.472749032f, (float16_t)-0.881197113f,
+    (float16_t)0.474100215f, (float16_t)-0.880470889f,
+    (float16_t)0.475450282f, (float16_t)-0.879742593f,
+    (float16_t)0.476799230f, (float16_t)-0.879012226f,
+    (float16_t)0.478147056f, (float16_t)-0.878279792f,
+    (float16_t)0.479493758f, (float16_t)-0.877545290f,
+    (float16_t)0.480839331f, (float16_t)-0.876808724f,
+    (float16_t)0.482183772f, (float16_t)-0.876070094f,
+    (float16_t)0.483527079f, (float16_t)-0.875329403f,
+    (float16_t)0.484869248f, (float16_t)-0.874586652f,
+    (float16_t)0.486210276f, (float16_t)-0.873841843f,
+    (float16_t)0.487550160f, (float16_t)-0.873094978f,
+    (float16_t)0.488888897f, (float16_t)-0.872346059f,
+    (float16_t)0.490226483f, (float16_t)-0.871595087f,
+    (float16_t)0.491562916f, (float16_t)-0.870842063f,
+    (float16_t)0.492898192f, (float16_t)-0.870086991f,
+    (float16_t)0.494232309f, (float16_t)-0.869329871f,
+    (float16_t)0.495565262f, (float16_t)-0.868570706f,
+    (float16_t)0.496897049f, (float16_t)-0.867809497f,
+    (float16_t)0.498227667f, (float16_t)-0.867046246f,
+    (float16_t)0.499557113f, (float16_t)-0.866280954f,
+    (float16_t)0.500885383f, (float16_t)-0.865513624f,
+    (float16_t)0.502212474f, (float16_t)-0.864744258f,
+    (float16_t)0.503538384f, (float16_t)-0.863972856f,
+    (float16_t)0.504863109f, (float16_t)-0.863199422f,
+    (float16_t)0.506186645f, (float16_t)-0.862423956f,
+    (float16_t)0.507508991f, (float16_t)-0.861646461f,
+    (float16_t)0.508830143f, (float16_t)-0.860866939f,
+    (float16_t)0.510150097f, (float16_t)-0.860085390f,
+    (float16_t)0.511468850f, (float16_t)-0.859301818f,
+    (float16_t)0.512786401f, (float16_t)-0.858516224f,
+    (float16_t)0.514102744f, (float16_t)-0.857728610f,
+    (float16_t)0.515417878f, (float16_t)-0.856938977f,
+    (float16_t)0.516731799f, (float16_t)-0.856147328f,
+    (float16_t)0.518044504f, (float16_t)-0.855353665f,
+    (float16_t)0.519355990f, (float16_t)-0.854557988f,
+    (float16_t)0.520666254f, (float16_t)-0.853760301f,
+    (float16_t)0.521975293f, (float16_t)-0.852960605f,
+    (float16_t)0.523283103f, (float16_t)-0.852158902f,
+    (float16_t)0.524589683f, (float16_t)-0.851355193f,
+    (float16_t)0.525895027f, (float16_t)-0.850549481f,
+    (float16_t)0.527199135f, (float16_t)-0.849741768f,
+    (float16_t)0.528502002f, (float16_t)-0.848932055f,
+    (float16_t)0.529803625f, (float16_t)-0.848120345f,
+    (float16_t)0.531104001f, (float16_t)-0.847306639f,
+    (float16_t)0.532403128f, (float16_t)-0.846490939f,
+    (float16_t)0.533701002f, (float16_t)-0.845673247f,
+    (float16_t)0.534997620f, (float16_t)-0.844853565f,
+    (float16_t)0.536292979f, (float16_t)-0.844031895f,
+    (float16_t)0.537587076f, (float16_t)-0.843208240f,
+    (float16_t)0.538879909f, (float16_t)-0.842382600f,
+    (float16_t)0.540171473f, (float16_t)-0.841554977f,
+    (float16_t)0.541461766f, (float16_t)-0.840725375f,
+    (float16_t)0.542750785f, (float16_t)-0.839893794f,
+    (float16_t)0.544038527f, (float16_t)-0.839060237f,
+    (float16_t)0.545324988f, (float16_t)-0.838224706f,
+    (float16_t)0.546610167f, (float16_t)-0.837387202f,
+    (float16_t)0.547894059f, (float16_t)-0.836547727f,
+    (float16_t)0.549176662f, (float16_t)-0.835706284f,
+    (float16_t)0.550457973f, (float16_t)-0.834862875f,
+    (float16_t)0.551737988f, (float16_t)-0.834017501f,
+    (float16_t)0.553016706f, (float16_t)-0.833170165f,
+    (float16_t)0.554294121f, (float16_t)-0.832320868f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.556845037f, (float16_t)-0.830616400f,
+    (float16_t)0.558118531f, (float16_t)-0.829761234f,
+    (float16_t)0.559390712f, (float16_t)-0.828904115f,
+    (float16_t)0.560661576f, (float16_t)-0.828045045f,
+    (float16_t)0.561931121f, (float16_t)-0.827184027f,
+    (float16_t)0.563199344f, (float16_t)-0.826321063f,
+    (float16_t)0.564466242f, (float16_t)-0.825456154f,
+    (float16_t)0.565731811f, (float16_t)-0.824589303f,
+    (float16_t)0.566996049f, (float16_t)-0.823720511f,
+    (float16_t)0.568258953f, (float16_t)-0.822849781f,
+    (float16_t)0.569520519f, (float16_t)-0.821977115f,
+    (float16_t)0.570780746f, (float16_t)-0.821102515f,
+    (float16_t)0.572039629f, (float16_t)-0.820225983f,
+    (float16_t)0.573297167f, (float16_t)-0.819347520f,
+    (float16_t)0.574553355f, (float16_t)-0.818467130f,
+    (float16_t)0.575808191f, (float16_t)-0.817584813f,
+    (float16_t)0.577061673f, (float16_t)-0.816700573f,
+    (float16_t)0.578313796f, (float16_t)-0.815814411f,
+    (float16_t)0.579564559f, (float16_t)-0.814926329f,
+    (float16_t)0.580813958f, (float16_t)-0.814036330f,
+    (float16_t)0.582061990f, (float16_t)-0.813144415f,
+    (float16_t)0.583308653f, (float16_t)-0.812250587f,
+    (float16_t)0.584553943f, (float16_t)-0.811354847f,
+    (float16_t)0.585797857f, (float16_t)-0.810457198f,
+    (float16_t)0.587040394f, (float16_t)-0.809557642f,
+    (float16_t)0.588281548f, (float16_t)-0.808656182f,
+    (float16_t)0.589521319f, (float16_t)-0.807752818f,
+    (float16_t)0.590759702f, (float16_t)-0.806847554f,
+    (float16_t)0.591996695f, (float16_t)-0.805940391f,
+    (float16_t)0.593232295f, (float16_t)-0.805031331f,
+    (float16_t)0.594466499f, (float16_t)-0.804120377f,
+    (float16_t)0.595699304f, (float16_t)-0.803207531f,
+    (float16_t)0.596930708f, (float16_t)-0.802292796f,
+    (float16_t)0.598160707f, (float16_t)-0.801376172f,
+    (float16_t)0.599389298f, (float16_t)-0.800457662f,
+    (float16_t)0.600616479f, (float16_t)-0.799537269f,
+    (float16_t)0.601842247f, (float16_t)-0.798614995f,
+    (float16_t)0.603066599f, (float16_t)-0.797690841f,
+    (float16_t)0.604289531f, (float16_t)-0.796764810f,
+    (float16_t)0.605511041f, (float16_t)-0.795836905f,
+    (float16_t)0.606731127f, (float16_t)-0.794907126f,
+    (float16_t)0.607949785f, (float16_t)-0.793975478f,
+    (float16_t)0.609167012f, (float16_t)-0.793041960f,
+    (float16_t)0.610382806f, (float16_t)-0.792106577f,
+    (float16_t)0.611597164f, (float16_t)-0.791169330f,
+    (float16_t)0.612810082f, (float16_t)-0.790230221f,
+    (float16_t)0.614021559f, (float16_t)-0.789289253f,
+    (float16_t)0.615231591f, (float16_t)-0.788346428f,
+    (float16_t)0.616440175f, (float16_t)-0.787401747f,
+    (float16_t)0.617647308f, (float16_t)-0.786455214f,
+    (float16_t)0.618852988f, (float16_t)-0.785506830f,
+    (float16_t)0.620057212f, (float16_t)-0.784556597f,
+    (float16_t)0.621259977f, (float16_t)-0.783604519f,
+    (float16_t)0.622461279f, (float16_t)-0.782650596f,
+    (float16_t)0.623661118f, (float16_t)-0.781694832f,
+    (float16_t)0.624859488f, (float16_t)-0.780737229f,
+    (float16_t)0.626056388f, (float16_t)-0.779777788f,
+    (float16_t)0.627251815f, (float16_t)-0.778816512f,
+    (float16_t)0.628445767f, (float16_t)-0.777853404f,
+    (float16_t)0.629638239f, (float16_t)-0.776888466f,
+    (float16_t)0.630829230f, (float16_t)-0.775921699f,
+    (float16_t)0.632018736f, (float16_t)-0.774953107f,
+    (float16_t)0.633206755f, (float16_t)-0.773982691f,
+    (float16_t)0.634393284f, (float16_t)-0.773010453f,
+    (float16_t)0.635578320f, (float16_t)-0.772036397f,
+    (float16_t)0.636761861f, (float16_t)-0.771060524f,
+    (float16_t)0.637943904f, (float16_t)-0.770082837f,
+    (float16_t)0.639124445f, (float16_t)-0.769103338f,
+    (float16_t)0.640303482f, (float16_t)-0.768122029f,
+    (float16_t)0.641481013f, (float16_t)-0.767138912f,
+    (float16_t)0.642657034f, (float16_t)-0.766153990f,
+    (float16_t)0.643831543f, (float16_t)-0.765167266f,
+    (float16_t)0.645004537f, (float16_t)-0.764178741f,
+    (float16_t)0.646176013f, (float16_t)-0.763188417f,
+    (float16_t)0.647345969f, (float16_t)-0.762196298f,
+    (float16_t)0.648514401f, (float16_t)-0.761202385f,
+    (float16_t)0.649681307f, (float16_t)-0.760206682f,
+    (float16_t)0.650846685f, (float16_t)-0.759209189f,
+    (float16_t)0.652010531f, (float16_t)-0.758209910f,
+    (float16_t)0.653172843f, (float16_t)-0.757208847f,
+    (float16_t)0.654333618f, (float16_t)-0.756206001f,
+    (float16_t)0.655492853f, (float16_t)-0.755201377f,
+    (float16_t)0.656650546f, (float16_t)-0.754194975f,
+    (float16_t)0.657806693f, (float16_t)-0.753186799f,
+    (float16_t)0.658961293f, (float16_t)-0.752176850f,
+    (float16_t)0.660114342f, (float16_t)-0.751165132f,
+    (float16_t)0.661265838f, (float16_t)-0.750151646f,
+    (float16_t)0.662415778f, (float16_t)-0.749136395f,
+    (float16_t)0.663564159f, (float16_t)-0.748119380f,
+    (float16_t)0.664710978f, (float16_t)-0.747100606f,
+    (float16_t)0.665856234f, (float16_t)-0.746080074f,
+    (float16_t)0.666999922f, (float16_t)-0.745057785f,
+    (float16_t)0.668142041f, (float16_t)-0.744033744f,
+    (float16_t)0.669282588f, (float16_t)-0.743007952f,
+    (float16_t)0.670421560f, (float16_t)-0.741980412f,
+    (float16_t)0.671558955f, (float16_t)-0.740951125f,
+    (float16_t)0.672694769f, (float16_t)-0.739920095f,
+    (float16_t)0.673829000f, (float16_t)-0.738887324f,
+    (float16_t)0.674961646f, (float16_t)-0.737852815f,
+    (float16_t)0.676092704f, (float16_t)-0.736816569f,
+    (float16_t)0.677222170f, (float16_t)-0.735778589f,
+    (float16_t)0.678350043f, (float16_t)-0.734738878f,
+    (float16_t)0.679476320f, (float16_t)-0.733697438f,
+    (float16_t)0.680600998f, (float16_t)-0.732654272f,
+    (float16_t)0.681724074f, (float16_t)-0.731609381f,
+    (float16_t)0.682845546f, (float16_t)-0.730562769f,
+    (float16_t)0.683965412f, (float16_t)-0.729514438f,
+    (float16_t)0.685083668f, (float16_t)-0.728464390f,
+    (float16_t)0.686200312f, (float16_t)-0.727412629f,
+    (float16_t)0.687315341f, (float16_t)-0.726359155f,
+    (float16_t)0.688428753f, (float16_t)-0.725303972f,
+    (float16_t)0.689540545f, (float16_t)-0.724247083f,
+    (float16_t)0.690650714f, (float16_t)-0.723188489f,
+    (float16_t)0.691759258f, (float16_t)-0.722128194f,
+    (float16_t)0.692866175f, (float16_t)-0.721066199f,
+    (float16_t)0.693971461f, (float16_t)-0.720002508f,
+    (float16_t)0.695075114f, (float16_t)-0.718937122f,
+    (float16_t)0.696177131f, (float16_t)-0.717870045f,
+    (float16_t)0.697277511f, (float16_t)-0.716801279f,
+    (float16_t)0.698376249f, (float16_t)-0.715730825f,
+    (float16_t)0.699473345f, (float16_t)-0.714658688f,
+    (float16_t)0.700568794f, (float16_t)-0.713584869f,
+    (float16_t)0.701662595f, (float16_t)-0.712509371f,
+    (float16_t)0.702754744f, (float16_t)-0.711432196f,
+    (float16_t)0.703845241f, (float16_t)-0.710353347f,
+    (float16_t)0.704934080f, (float16_t)-0.709272826f,
+    (float16_t)0.706021261f, (float16_t)-0.708190637f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.708190637f, (float16_t)-0.706021261f,
+    (float16_t)0.709272826f, (float16_t)-0.704934080f,
+    (float16_t)0.710353347f, (float16_t)-0.703845241f,
+    (float16_t)0.711432196f, (float16_t)-0.702754744f,
+    (float16_t)0.712509371f, (float16_t)-0.701662595f,
+    (float16_t)0.713584869f, (float16_t)-0.700568794f,
+    (float16_t)0.714658688f, (float16_t)-0.699473345f,
+    (float16_t)0.715730825f, (float16_t)-0.698376249f,
+    (float16_t)0.716801279f, (float16_t)-0.697277511f,
+    (float16_t)0.717870045f, (float16_t)-0.696177131f,
+    (float16_t)0.718937122f, (float16_t)-0.695075114f,
+    (float16_t)0.720002508f, (float16_t)-0.693971461f,
+    (float16_t)0.721066199f, (float16_t)-0.692866175f,
+    (float16_t)0.722128194f, (float16_t)-0.691759258f,
+    (float16_t)0.723188489f, (float16_t)-0.690650714f,
+    (float16_t)0.724247083f, (float16_t)-0.689540545f,
+    (float16_t)0.725303972f, (float16_t)-0.688428753f,
+    (float16_t)0.726359155f, (float16_t)-0.687315341f,
+    (float16_t)0.727412629f, (float16_t)-0.686200312f,
+    (float16_t)0.728464390f, (float16_t)-0.685083668f,
+    (float16_t)0.729514438f, (float16_t)-0.683965412f,
+    (float16_t)0.730562769f, (float16_t)-0.682845546f,
+    (float16_t)0.731609381f, (float16_t)-0.681724074f,
+    (float16_t)0.732654272f, (float16_t)-0.680600998f,
+    (float16_t)0.733697438f, (float16_t)-0.679476320f,
+    (float16_t)0.734738878f, (float16_t)-0.678350043f,
+    (float16_t)0.735778589f, (float16_t)-0.677222170f,
+    (float16_t)0.736816569f, (float16_t)-0.676092704f,
+    (float16_t)0.737852815f, (float16_t)-0.674961646f,
+    (float16_t)0.738887324f, (float16_t)-0.673829000f,
+    (float16_t)0.739920095f, (float16_t)-0.672694769f,
+    (float16_t)0.740951125f, (float16_t)-0.671558955f,
+    (float16_t)0.741980412f, (float16_t)-0.670421560f,
+    (float16_t)0.743007952f, (float16_t)-0.669282588f,
+    (float16_t)0.744033744f, (float16_t)-0.668142041f,
+    (float16_t)0.745057785f, (float16_t)-0.666999922f,
+    (float16_t)0.746080074f, (float16_t)-0.665856234f,
+    (float16_t)0.747100606f, (float16_t)-0.664710978f,
+    (float16_t)0.748119380f, (float16_t)-0.663564159f,
+    (float16_t)0.749136395f, (float16_t)-0.662415778f,
+    (float16_t)0.750151646f, (float16_t)-0.661265838f,
+    (float16_t)0.751165132f, (float16_t)-0.660114342f,
+    (float16_t)0.752176850f, (float16_t)-0.658961293f,
+    (float16_t)0.753186799f, (float16_t)-0.657806693f,
+    (float16_t)0.754194975f, (float16_t)-0.656650546f,
+    (float16_t)0.755201377f, (float16_t)-0.655492853f,
+    (float16_t)0.756206001f, (float16_t)-0.654333618f,
+    (float16_t)0.757208847f, (float16_t)-0.653172843f,
+    (float16_t)0.758209910f, (float16_t)-0.652010531f,
+    (float16_t)0.759209189f, (float16_t)-0.650846685f,
+    (float16_t)0.760206682f, (float16_t)-0.649681307f,
+    (float16_t)0.761202385f, (float16_t)-0.648514401f,
+    (float16_t)0.762196298f, (float16_t)-0.647345969f,
+    (float16_t)0.763188417f, (float16_t)-0.646176013f,
+    (float16_t)0.764178741f, (float16_t)-0.645004537f,
+    (float16_t)0.765167266f, (float16_t)-0.643831543f,
+    (float16_t)0.766153990f, (float16_t)-0.642657034f,
+    (float16_t)0.767138912f, (float16_t)-0.641481013f,
+    (float16_t)0.768122029f, (float16_t)-0.640303482f,
+    (float16_t)0.769103338f, (float16_t)-0.639124445f,
+    (float16_t)0.770082837f, (float16_t)-0.637943904f,
+    (float16_t)0.771060524f, (float16_t)-0.636761861f,
+    (float16_t)0.772036397f, (float16_t)-0.635578320f,
+    (float16_t)0.773010453f, (float16_t)-0.634393284f,
+    (float16_t)0.773982691f, (float16_t)-0.633206755f,
+    (float16_t)0.774953107f, (float16_t)-0.632018736f,
+    (float16_t)0.775921699f, (float16_t)-0.630829230f,
+    (float16_t)0.776888466f, (float16_t)-0.629638239f,
+    (float16_t)0.777853404f, (float16_t)-0.628445767f,
+    (float16_t)0.778816512f, (float16_t)-0.627251815f,
+    (float16_t)0.779777788f, (float16_t)-0.626056388f,
+    (float16_t)0.780737229f, (float16_t)-0.624859488f,
+    (float16_t)0.781694832f, (float16_t)-0.623661118f,
+    (float16_t)0.782650596f, (float16_t)-0.622461279f,
+    (float16_t)0.783604519f, (float16_t)-0.621259977f,
+    (float16_t)0.784556597f, (float16_t)-0.620057212f,
+    (float16_t)0.785506830f, (float16_t)-0.618852988f,
+    (float16_t)0.786455214f, (float16_t)-0.617647308f,
+    (float16_t)0.787401747f, (float16_t)-0.616440175f,
+    (float16_t)0.788346428f, (float16_t)-0.615231591f,
+    (float16_t)0.789289253f, (float16_t)-0.614021559f,
+    (float16_t)0.790230221f, (float16_t)-0.612810082f,
+    (float16_t)0.791169330f, (float16_t)-0.611597164f,
+    (float16_t)0.792106577f, (float16_t)-0.610382806f,
+    (float16_t)0.793041960f, (float16_t)-0.609167012f,
+    (float16_t)0.793975478f, (float16_t)-0.607949785f,
+    (float16_t)0.794907126f, (float16_t)-0.606731127f,
+    (float16_t)0.795836905f, (float16_t)-0.605511041f,
+    (float16_t)0.796764810f, (float16_t)-0.604289531f,
+    (float16_t)0.797690841f, (float16_t)-0.603066599f,
+    (float16_t)0.798614995f, (float16_t)-0.601842247f,
+    (float16_t)0.799537269f, (float16_t)-0.600616479f,
+    (float16_t)0.800457662f, (float16_t)-0.599389298f,
+    (float16_t)0.801376172f, (float16_t)-0.598160707f,
+    (float16_t)0.802292796f, (float16_t)-0.596930708f,
+    (float16_t)0.803207531f, (float16_t)-0.595699304f,
+    (float16_t)0.804120377f, (float16_t)-0.594466499f,
+    (float16_t)0.805031331f, (float16_t)-0.593232295f,
+    (float16_t)0.805940391f, (float16_t)-0.591996695f,
+    (float16_t)0.806847554f, (float16_t)-0.590759702f,
+    (float16_t)0.807752818f, (float16_t)-0.589521319f,
+    (float16_t)0.808656182f, (float16_t)-0.588281548f,
+    (float16_t)0.809557642f, (float16_t)-0.587040394f,
+    (float16_t)0.810457198f, (float16_t)-0.585797857f,
+    (float16_t)0.811354847f, (float16_t)-0.584553943f,
+    (float16_t)0.812250587f, (float16_t)-0.583308653f,
+    (float16_t)0.813144415f, (float16_t)-0.582061990f,
+    (float16_t)0.814036330f, (float16_t)-0.580813958f,
+    (float16_t)0.814926329f, (float16_t)-0.579564559f,
+    (float16_t)0.815814411f, (float16_t)-0.578313796f,
+    (float16_t)0.816700573f, (float16_t)-0.577061673f,
+    (float16_t)0.817584813f, (float16_t)-0.575808191f,
+    (float16_t)0.818467130f, (float16_t)-0.574553355f,
+    (float16_t)0.819347520f, (float16_t)-0.573297167f,
+    (float16_t)0.820225983f, (float16_t)-0.572039629f,
+    (float16_t)0.821102515f, (float16_t)-0.570780746f,
+    (float16_t)0.821977115f, (float16_t)-0.569520519f,
+    (float16_t)0.822849781f, (float16_t)-0.568258953f,
+    (float16_t)0.823720511f, (float16_t)-0.566996049f,
+    (float16_t)0.824589303f, (float16_t)-0.565731811f,
+    (float16_t)0.825456154f, (float16_t)-0.564466242f,
+    (float16_t)0.826321063f, (float16_t)-0.563199344f,
+    (float16_t)0.827184027f, (float16_t)-0.561931121f,
+    (float16_t)0.828045045f, (float16_t)-0.560661576f,
+    (float16_t)0.828904115f, (float16_t)-0.559390712f,
+    (float16_t)0.829761234f, (float16_t)-0.558118531f,
+    (float16_t)0.830616400f, (float16_t)-0.556845037f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.832320868f, (float16_t)-0.554294121f,
+    (float16_t)0.833170165f, (float16_t)-0.553016706f,
+    (float16_t)0.834017501f, (float16_t)-0.551737988f,
+    (float16_t)0.834862875f, (float16_t)-0.550457973f,
+    (float16_t)0.835706284f, (float16_t)-0.549176662f,
+    (float16_t)0.836547727f, (float16_t)-0.547894059f,
+    (float16_t)0.837387202f, (float16_t)-0.546610167f,
+    (float16_t)0.838224706f, (float16_t)-0.545324988f,
+    (float16_t)0.839060237f, (float16_t)-0.544038527f,
+    (float16_t)0.839893794f, (float16_t)-0.542750785f,
+    (float16_t)0.840725375f, (float16_t)-0.541461766f,
+    (float16_t)0.841554977f, (float16_t)-0.540171473f,
+    (float16_t)0.842382600f, (float16_t)-0.538879909f,
+    (float16_t)0.843208240f, (float16_t)-0.537587076f,
+    (float16_t)0.844031895f, (float16_t)-0.536292979f,
+    (float16_t)0.844853565f, (float16_t)-0.534997620f,
+    (float16_t)0.845673247f, (float16_t)-0.533701002f,
+    (float16_t)0.846490939f, (float16_t)-0.532403128f,
+    (float16_t)0.847306639f, (float16_t)-0.531104001f,
+    (float16_t)0.848120345f, (float16_t)-0.529803625f,
+    (float16_t)0.848932055f, (float16_t)-0.528502002f,
+    (float16_t)0.849741768f, (float16_t)-0.527199135f,
+    (float16_t)0.850549481f, (float16_t)-0.525895027f,
+    (float16_t)0.851355193f, (float16_t)-0.524589683f,
+    (float16_t)0.852158902f, (float16_t)-0.523283103f,
+    (float16_t)0.852960605f, (float16_t)-0.521975293f,
+    (float16_t)0.853760301f, (float16_t)-0.520666254f,
+    (float16_t)0.854557988f, (float16_t)-0.519355990f,
+    (float16_t)0.855353665f, (float16_t)-0.518044504f,
+    (float16_t)0.856147328f, (float16_t)-0.516731799f,
+    (float16_t)0.856938977f, (float16_t)-0.515417878f,
+    (float16_t)0.857728610f, (float16_t)-0.514102744f,
+    (float16_t)0.858516224f, (float16_t)-0.512786401f,
+    (float16_t)0.859301818f, (float16_t)-0.511468850f,
+    (float16_t)0.860085390f, (float16_t)-0.510150097f,
+    (float16_t)0.860866939f, (float16_t)-0.508830143f,
+    (float16_t)0.861646461f, (float16_t)-0.507508991f,
+    (float16_t)0.862423956f, (float16_t)-0.506186645f,
+    (float16_t)0.863199422f, (float16_t)-0.504863109f,
+    (float16_t)0.863972856f, (float16_t)-0.503538384f,
+    (float16_t)0.864744258f, (float16_t)-0.502212474f,
+    (float16_t)0.865513624f, (float16_t)-0.500885383f,
+    (float16_t)0.866280954f, (float16_t)-0.499557113f,
+    (float16_t)0.867046246f, (float16_t)-0.498227667f,
+    (float16_t)0.867809497f, (float16_t)-0.496897049f,
+    (float16_t)0.868570706f, (float16_t)-0.495565262f,
+    (float16_t)0.869329871f, (float16_t)-0.494232309f,
+    (float16_t)0.870086991f, (float16_t)-0.492898192f,
+    (float16_t)0.870842063f, (float16_t)-0.491562916f,
+    (float16_t)0.871595087f, (float16_t)-0.490226483f,
+    (float16_t)0.872346059f, (float16_t)-0.488888897f,
+    (float16_t)0.873094978f, (float16_t)-0.487550160f,
+    (float16_t)0.873841843f, (float16_t)-0.486210276f,
+    (float16_t)0.874586652f, (float16_t)-0.484869248f,
+    (float16_t)0.875329403f, (float16_t)-0.483527079f,
+    (float16_t)0.876070094f, (float16_t)-0.482183772f,
+    (float16_t)0.876808724f, (float16_t)-0.480839331f,
+    (float16_t)0.877545290f, (float16_t)-0.479493758f,
+    (float16_t)0.878279792f, (float16_t)-0.478147056f,
+    (float16_t)0.879012226f, (float16_t)-0.476799230f,
+    (float16_t)0.879742593f, (float16_t)-0.475450282f,
+    (float16_t)0.880470889f, (float16_t)-0.474100215f,
+    (float16_t)0.881197113f, (float16_t)-0.472749032f,
+    (float16_t)0.881921264f, (float16_t)-0.471396737f,
+    (float16_t)0.882643340f, (float16_t)-0.470043332f,
+    (float16_t)0.883363339f, (float16_t)-0.468688822f,
+    (float16_t)0.884081259f, (float16_t)-0.467333209f,
+    (float16_t)0.884797098f, (float16_t)-0.465976496f,
+    (float16_t)0.885510856f, (float16_t)-0.464618686f,
+    (float16_t)0.886222530f, (float16_t)-0.463259784f,
+    (float16_t)0.886932119f, (float16_t)-0.461899791f,
+    (float16_t)0.887639620f, (float16_t)-0.460538711f,
+    (float16_t)0.888345033f, (float16_t)-0.459176548f,
+    (float16_t)0.889048356f, (float16_t)-0.457813304f,
+    (float16_t)0.889749586f, (float16_t)-0.456448982f,
+    (float16_t)0.890448723f, (float16_t)-0.455083587f,
+    (float16_t)0.891145765f, (float16_t)-0.453717121f,
+    (float16_t)0.891840709f, (float16_t)-0.452349587f,
+    (float16_t)0.892533555f, (float16_t)-0.450980989f,
+    (float16_t)0.893224301f, (float16_t)-0.449611330f,
+    (float16_t)0.893912945f, (float16_t)-0.448240612f,
+    (float16_t)0.894599486f, (float16_t)-0.446868840f,
+    (float16_t)0.895283921f, (float16_t)-0.445496017f,
+    (float16_t)0.895966250f, (float16_t)-0.444122145f,
+    (float16_t)0.896646470f, (float16_t)-0.442747228f,
+    (float16_t)0.897324581f, (float16_t)-0.441371269f,
+    (float16_t)0.898000580f, (float16_t)-0.439994271f,
+    (float16_t)0.898674466f, (float16_t)-0.438616239f,
+    (float16_t)0.899346237f, (float16_t)-0.437237174f,
+    (float16_t)0.900015892f, (float16_t)-0.435857080f,
+    (float16_t)0.900683429f, (float16_t)-0.434475961f,
+    (float16_t)0.901348847f, (float16_t)-0.433093819f,
+    (float16_t)0.902012144f, (float16_t)-0.431710658f,
+    (float16_t)0.902673318f, (float16_t)-0.430326481f,
+    (float16_t)0.903332368f, (float16_t)-0.428941292f,
+    (float16_t)0.903989293f, (float16_t)-0.427555093f,
+    (float16_t)0.904644091f, (float16_t)-0.426167889f,
+    (float16_t)0.905296759f, (float16_t)-0.424779681f,
+    (float16_t)0.905947298f, (float16_t)-0.423390474f,
+    (float16_t)0.906595705f, (float16_t)-0.422000271f,
+    (float16_t)0.907241978f, (float16_t)-0.420609074f,
+    (float16_t)0.907886116f, (float16_t)-0.419216888f,
+    (float16_t)0.908528119f, (float16_t)-0.417823716f,
+    (float16_t)0.909167983f, (float16_t)-0.416429560f,
+    (float16_t)0.909805708f, (float16_t)-0.415034424f,
+    (float16_t)0.910441292f, (float16_t)-0.413638312f,
+    (float16_t)0.911074734f, (float16_t)-0.412241227f,
+    (float16_t)0.911706032f, (float16_t)-0.410843171f,
+    (float16_t)0.912335185f, (float16_t)-0.409444149f,
+    (float16_t)0.912962190f, (float16_t)-0.408044163f,
+    (float16_t)0.913587048f, (float16_t)-0.406643217f,
+    (float16_t)0.914209756f, (float16_t)-0.405241314f,
+    (float16_t)0.914830312f, (float16_t)-0.403838458f,
+    (float16_t)0.915448716f, (float16_t)-0.402434651f,
+    (float16_t)0.916064966f, (float16_t)-0.401029897f,
+    (float16_t)0.916679060f, (float16_t)-0.399624200f,
+    (float16_t)0.917290997f, (float16_t)-0.398217562f,
+    (float16_t)0.917900776f, (float16_t)-0.396809987f,
+    (float16_t)0.918508394f, (float16_t)-0.395401479f,
+    (float16_t)0.919113852f, (float16_t)-0.393992040f,
+    (float16_t)0.919717146f, (float16_t)-0.392581674f,
+    (float16_t)0.920318277f, (float16_t)-0.391170384f,
+    (float16_t)0.920917242f, (float16_t)-0.389758174f,
+    (float16_t)0.921514039f, (float16_t)-0.388345047f,
+    (float16_t)0.922108669f, (float16_t)-0.386931006f,
+    (float16_t)0.922701128f, (float16_t)-0.385516054f,
+    (float16_t)0.923291417f, (float16_t)-0.384100195f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.924465474f, (float16_t)-0.381265769f,
+    (float16_t)0.925049241f, (float16_t)-0.379847209f,
+    (float16_t)0.925630831f, (float16_t)-0.378427755f,
+    (float16_t)0.926210242f, (float16_t)-0.377007410f,
+    (float16_t)0.926787474f, (float16_t)-0.375586178f,
+    (float16_t)0.927362526f, (float16_t)-0.374164063f,
+    (float16_t)0.927935395f, (float16_t)-0.372741067f,
+    (float16_t)0.928506080f, (float16_t)-0.371317194f,
+    (float16_t)0.929074581f, (float16_t)-0.369892447f,
+    (float16_t)0.929640896f, (float16_t)-0.368466830f,
+    (float16_t)0.930205023f, (float16_t)-0.367040346f,
+    (float16_t)0.930766961f, (float16_t)-0.365612998f,
+    (float16_t)0.931326709f, (float16_t)-0.364184790f,
+    (float16_t)0.931884266f, (float16_t)-0.362755724f,
+    (float16_t)0.932439629f, (float16_t)-0.361325806f,
+    (float16_t)0.932992799f, (float16_t)-0.359895037f,
+    (float16_t)0.933543773f, (float16_t)-0.358463421f,
+    (float16_t)0.934092550f, (float16_t)-0.357030961f,
+    (float16_t)0.934639130f, (float16_t)-0.355597662f,
+    (float16_t)0.935183510f, (float16_t)-0.354163525f,
+    (float16_t)0.935725689f, (float16_t)-0.352728556f,
+    (float16_t)0.936265667f, (float16_t)-0.351292756f,
+    (float16_t)0.936803442f, (float16_t)-0.349856130f,
+    (float16_t)0.937339012f, (float16_t)-0.348418680f,
+    (float16_t)0.937872376f, (float16_t)-0.346980411f,
+    (float16_t)0.938403534f, (float16_t)-0.345541325f,
+    (float16_t)0.938932484f, (float16_t)-0.344101426f,
+    (float16_t)0.939459224f, (float16_t)-0.342660717f,
+    (float16_t)0.939983753f, (float16_t)-0.341219202f,
+    (float16_t)0.940506071f, (float16_t)-0.339776884f,
+    (float16_t)0.941026175f, (float16_t)-0.338333767f,
+    (float16_t)0.941544065f, (float16_t)-0.336889853f,
+    (float16_t)0.942059740f, (float16_t)-0.335445147f,
+    (float16_t)0.942573198f, (float16_t)-0.333999651f,
+    (float16_t)0.943084437f, (float16_t)-0.332553370f,
+    (float16_t)0.943593458f, (float16_t)-0.331106306f,
+    (float16_t)0.944100258f, (float16_t)-0.329658463f,
+    (float16_t)0.944604837f, (float16_t)-0.328209844f,
+    (float16_t)0.945107193f, (float16_t)-0.326760452f,
+    (float16_t)0.945607325f, (float16_t)-0.325310292f,
+    (float16_t)0.946105232f, (float16_t)-0.323859367f,
+    (float16_t)0.946600913f, (float16_t)-0.322407679f,
+    (float16_t)0.947094366f, (float16_t)-0.320955232f,
+    (float16_t)0.947585591f, (float16_t)-0.319502031f,
+    (float16_t)0.948074586f, (float16_t)-0.318048077f,
+    (float16_t)0.948561350f, (float16_t)-0.316593376f,
+    (float16_t)0.949045882f, (float16_t)-0.315137929f,
+    (float16_t)0.949528181f, (float16_t)-0.313681740f,
+    (float16_t)0.950008245f, (float16_t)-0.312224814f,
+    (float16_t)0.950486074f, (float16_t)-0.310767153f,
+    (float16_t)0.950961666f, (float16_t)-0.309308760f,
+    (float16_t)0.951435021f, (float16_t)-0.307849640f,
+    (float16_t)0.951906137f, (float16_t)-0.306389795f,
+    (float16_t)0.952375013f, (float16_t)-0.304929230f,
+    (float16_t)0.952841648f, (float16_t)-0.303467947f,
+    (float16_t)0.953306040f, (float16_t)-0.302005949f,
+    (float16_t)0.953768190f, (float16_t)-0.300543241f,
+    (float16_t)0.954228095f, (float16_t)-0.299079826f,
+    (float16_t)0.954685755f, (float16_t)-0.297615707f,
+    (float16_t)0.955141168f, (float16_t)-0.296150888f,
+    (float16_t)0.955594334f, (float16_t)-0.294685372f,
+    (float16_t)0.956045251f, (float16_t)-0.293219163f,
+    (float16_t)0.956493919f, (float16_t)-0.291752263f,
+    (float16_t)0.956940336f, (float16_t)-0.290284677f,
+    (float16_t)0.957384501f, (float16_t)-0.288816408f,
+    (float16_t)0.957826413f, (float16_t)-0.287347460f,
+    (float16_t)0.958266071f, (float16_t)-0.285877835f,
+    (float16_t)0.958703475f, (float16_t)-0.284407537f,
+    (float16_t)0.959138622f, (float16_t)-0.282936570f,
+    (float16_t)0.959571513f, (float16_t)-0.281464938f,
+    (float16_t)0.960002146f, (float16_t)-0.279992643f,
+    (float16_t)0.960430519f, (float16_t)-0.278519689f,
+    (float16_t)0.960856633f, (float16_t)-0.277046080f,
+    (float16_t)0.961280486f, (float16_t)-0.275571819f,
+    (float16_t)0.961702077f, (float16_t)-0.274096910f,
+    (float16_t)0.962121404f, (float16_t)-0.272621355f,
+    (float16_t)0.962538468f, (float16_t)-0.271145160f,
+    (float16_t)0.962953267f, (float16_t)-0.269668326f,
+    (float16_t)0.963365800f, (float16_t)-0.268190857f,
+    (float16_t)0.963776066f, (float16_t)-0.266712757f,
+    (float16_t)0.964184064f, (float16_t)-0.265234030f,
+    (float16_t)0.964589793f, (float16_t)-0.263754679f,
+    (float16_t)0.964993253f, (float16_t)-0.262274707f,
+    (float16_t)0.965394442f, (float16_t)-0.260794118f,
+    (float16_t)0.965793359f, (float16_t)-0.259312915f,
+    (float16_t)0.966190003f, (float16_t)-0.257831102f,
+    (float16_t)0.966584374f, (float16_t)-0.256348682f,
+    (float16_t)0.966976471f, (float16_t)-0.254865660f,
+    (float16_t)0.967366292f, (float16_t)-0.253382037f,
+    (float16_t)0.967753837f, (float16_t)-0.251897818f,
+    (float16_t)0.968139105f, (float16_t)-0.250413007f,
+    (float16_t)0.968522094f, (float16_t)-0.248927606f,
+    (float16_t)0.968902805f, (float16_t)-0.247441619f,
+    (float16_t)0.969281235f, (float16_t)-0.245955050f,
+    (float16_t)0.969657385f, (float16_t)-0.244467903f,
+    (float16_t)0.970031253f, (float16_t)-0.242980180f,
+    (float16_t)0.970402839f, (float16_t)-0.241491885f,
+    (float16_t)0.970772141f, (float16_t)-0.240003022f,
+    (float16_t)0.971139158f, (float16_t)-0.238513595f,
+    (float16_t)0.971503891f, (float16_t)-0.237023606f,
+    (float16_t)0.971866337f, (float16_t)-0.235533059f,
+    (float16_t)0.972226497f, (float16_t)-0.234041959f,
+    (float16_t)0.972584369f, (float16_t)-0.232550307f,
+    (float16_t)0.972939952f, (float16_t)-0.231058108f,
+    (float16_t)0.973293246f, (float16_t)-0.229565366f,
+    (float16_t)0.973644250f, (float16_t)-0.228072083f,
+    (float16_t)0.973992962f, (float16_t)-0.226578264f,
+    (float16_t)0.974339383f, (float16_t)-0.225083911f,
+    (float16_t)0.974683511f, (float16_t)-0.223589029f,
+    (float16_t)0.975025345f, (float16_t)-0.222093621f,
+    (float16_t)0.975364885f, (float16_t)-0.220597690f,
+    (float16_t)0.975702130f, (float16_t)-0.219101240f,
+    (float16_t)0.976037079f, (float16_t)-0.217604275f,
+    (float16_t)0.976369731f, (float16_t)-0.216106797f,
+    (float16_t)0.976700086f, (float16_t)-0.214608811f,
+    (float16_t)0.977028143f, (float16_t)-0.213110320f,
+    (float16_t)0.977353900f, (float16_t)-0.211611327f,
+    (float16_t)0.977677358f, (float16_t)-0.210111837f,
+    (float16_t)0.977998515f, (float16_t)-0.208611852f,
+    (float16_t)0.978317371f, (float16_t)-0.207111376f,
+    (float16_t)0.978633924f, (float16_t)-0.205610413f,
+    (float16_t)0.978948175f, (float16_t)-0.204108966f,
+    (float16_t)0.979260123f, (float16_t)-0.202607039f,
+    (float16_t)0.979569766f, (float16_t)-0.201104635f,
+    (float16_t)0.979877104f, (float16_t)-0.199601758f,
+    (float16_t)0.980182136f, (float16_t)-0.198098411f,
+    (float16_t)0.980484862f, (float16_t)-0.196594598f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f,
+    (float16_t)0.981083391f, (float16_t)-0.193585587f,
+    (float16_t)0.981379193f, (float16_t)-0.192080397f,
+    (float16_t)0.981672686f, (float16_t)-0.190574755f,
+    (float16_t)0.981963869f, (float16_t)-0.189068664f,
+    (float16_t)0.982252741f, (float16_t)-0.187562129f,
+    (float16_t)0.982539302f, (float16_t)-0.186055152f,
+    (float16_t)0.982823551f, (float16_t)-0.184547737f,
+    (float16_t)0.983105487f, (float16_t)-0.183039888f,
+    (float16_t)0.983385110f, (float16_t)-0.181531608f,
+    (float16_t)0.983662419f, (float16_t)-0.180022901f,
+    (float16_t)0.983937413f, (float16_t)-0.178513771f,
+    (float16_t)0.984210092f, (float16_t)-0.177004220f,
+    (float16_t)0.984480455f, (float16_t)-0.175494253f,
+    (float16_t)0.984748502f, (float16_t)-0.173983873f,
+    (float16_t)0.985014231f, (float16_t)-0.172473084f,
+    (float16_t)0.985277642f, (float16_t)-0.170961889f,
+    (float16_t)0.985538735f, (float16_t)-0.169450291f,
+    (float16_t)0.985797509f, (float16_t)-0.167938295f,
+    (float16_t)0.986053963f, (float16_t)-0.166425904f,
+    (float16_t)0.986308097f, (float16_t)-0.164913120f,
+    (float16_t)0.986559910f, (float16_t)-0.163399949f,
+    (float16_t)0.986809402f, (float16_t)-0.161886394f,
+    (float16_t)0.987056571f, (float16_t)-0.160372457f,
+    (float16_t)0.987301418f, (float16_t)-0.158858143f,
+    (float16_t)0.987543942f, (float16_t)-0.157343456f,
+    (float16_t)0.987784142f, (float16_t)-0.155828398f,
+    (float16_t)0.988022017f, (float16_t)-0.154312973f,
+    (float16_t)0.988257568f, (float16_t)-0.152797185f,
+    (float16_t)0.988490793f, (float16_t)-0.151281038f,
+    (float16_t)0.988721692f, (float16_t)-0.149764535f,
+    (float16_t)0.988950265f, (float16_t)-0.148247679f,
+    (float16_t)0.989176510f, (float16_t)-0.146730474f,
+    (float16_t)0.989400428f, (float16_t)-0.145212925f,
+    (float16_t)0.989622017f, (float16_t)-0.143695033f,
+    (float16_t)0.989841278f, (float16_t)-0.142176804f,
+    (float16_t)0.990058210f, (float16_t)-0.140658239f,
+    (float16_t)0.990272812f, (float16_t)-0.139139344f,
+    (float16_t)0.990485084f, (float16_t)-0.137620122f,
+    (float16_t)0.990695025f, (float16_t)-0.136100575f,
+    (float16_t)0.990902635f, (float16_t)-0.134580709f,
+    (float16_t)0.991107914f, (float16_t)-0.133060525f,
+    (float16_t)0.991310860f, (float16_t)-0.131540029f,
+    (float16_t)0.991511473f, (float16_t)-0.130019223f,
+    (float16_t)0.991709754f, (float16_t)-0.128498111f,
+    (float16_t)0.991905700f, (float16_t)-0.126976696f,
+    (float16_t)0.992099313f, (float16_t)-0.125454983f,
+    (float16_t)0.992290591f, (float16_t)-0.123932975f,
+    (float16_t)0.992479535f, (float16_t)-0.122410675f,
+    (float16_t)0.992666142f, (float16_t)-0.120888087f,
+    (float16_t)0.992850414f, (float16_t)-0.119365215f,
+    (float16_t)0.993032350f, (float16_t)-0.117842062f,
+    (float16_t)0.993211949f, (float16_t)-0.116318631f,
+    (float16_t)0.993389211f, (float16_t)-0.114794927f,
+    (float16_t)0.993564136f, (float16_t)-0.113270952f,
+    (float16_t)0.993736722f, (float16_t)-0.111746711f,
+    (float16_t)0.993906970f, (float16_t)-0.110222207f,
+    (float16_t)0.994074879f, (float16_t)-0.108697444f,
+    (float16_t)0.994240449f, (float16_t)-0.107172425f,
+    (float16_t)0.994403680f, (float16_t)-0.105647154f,
+    (float16_t)0.994564571f, (float16_t)-0.104121634f,
+    (float16_t)0.994723121f, (float16_t)-0.102595869f,
+    (float16_t)0.994879331f, (float16_t)-0.101069863f,
+    (float16_t)0.995033199f, (float16_t)-0.099543619f,
+    (float16_t)0.995184727f, (float16_t)-0.098017140f,
+    (float16_t)0.995333912f, (float16_t)-0.096490431f,
+    (float16_t)0.995480755f, (float16_t)-0.094963495f,
+    (float16_t)0.995625256f, (float16_t)-0.093436336f,
+    (float16_t)0.995767414f, (float16_t)-0.091908956f,
+    (float16_t)0.995907229f, (float16_t)-0.090381361f,
+    (float16_t)0.996044701f, (float16_t)-0.088853553f,
+    (float16_t)0.996179829f, (float16_t)-0.087325535f,
+    (float16_t)0.996312612f, (float16_t)-0.085797312f,
+    (float16_t)0.996443051f, (float16_t)-0.084268888f,
+    (float16_t)0.996571146f, (float16_t)-0.082740265f,
+    (float16_t)0.996696895f, (float16_t)-0.081211447f,
+    (float16_t)0.996820299f, (float16_t)-0.079682438f,
+    (float16_t)0.996941358f, (float16_t)-0.078153242f,
+    (float16_t)0.997060070f, (float16_t)-0.076623861f,
+    (float16_t)0.997176437f, (float16_t)-0.075094301f,
+    (float16_t)0.997290457f, (float16_t)-0.073564564f,
+    (float16_t)0.997402130f, (float16_t)-0.072034653f,
+    (float16_t)0.997511456f, (float16_t)-0.070504573f,
+    (float16_t)0.997618435f, (float16_t)-0.068974328f,
+    (float16_t)0.997723067f, (float16_t)-0.067443920f,
+    (float16_t)0.997825350f, (float16_t)-0.065913353f,
+    (float16_t)0.997925286f, (float16_t)-0.064382631f,
+    (float16_t)0.998022874f, (float16_t)-0.062851758f,
+    (float16_t)0.998118113f, (float16_t)-0.061320736f,
+    (float16_t)0.998211003f, (float16_t)-0.059789571f,
+    (float16_t)0.998301545f, (float16_t)-0.058258265f,
+    (float16_t)0.998389737f, (float16_t)-0.056726821f,
+    (float16_t)0.998475581f, (float16_t)-0.055195244f,
+    (float16_t)0.998559074f, (float16_t)-0.053663538f,
+    (float16_t)0.998640218f, (float16_t)-0.052131705f,
+    (float16_t)0.998719012f, (float16_t)-0.050599749f,
+    (float16_t)0.998795456f, (float16_t)-0.049067674f,
+    (float16_t)0.998869550f, (float16_t)-0.047535484f,
+    (float16_t)0.998941293f, (float16_t)-0.046003182f,
+    (float16_t)0.999010686f, (float16_t)-0.044470772f,
+    (float16_t)0.999077728f, (float16_t)-0.042938257f,
+    (float16_t)0.999142419f, (float16_t)-0.041405641f,
+    (float16_t)0.999204759f, (float16_t)-0.039872928f,
+    (float16_t)0.999264747f, (float16_t)-0.038340120f,
+    (float16_t)0.999322385f, (float16_t)-0.036807223f,
+    (float16_t)0.999377670f, (float16_t)-0.035274239f,
+    (float16_t)0.999430605f, (float16_t)-0.033741172f,
+    (float16_t)0.999481187f, (float16_t)-0.032208025f,
+    (float16_t)0.999529418f, (float16_t)-0.030674803f,
+    (float16_t)0.999575296f, (float16_t)-0.029141509f,
+    (float16_t)0.999618822f, (float16_t)-0.027608146f,
+    (float16_t)0.999659997f, (float16_t)-0.026074718f,
+    (float16_t)0.999698819f, (float16_t)-0.024541229f,
+    (float16_t)0.999735288f, (float16_t)-0.023007681f,
+    (float16_t)0.999769405f, (float16_t)-0.021474080f,
+    (float16_t)0.999801170f, (float16_t)-0.019940429f,
+    (float16_t)0.999830582f, (float16_t)-0.018406730f,
+    (float16_t)0.999857641f, (float16_t)-0.016872988f,
+    (float16_t)0.999882347f, (float16_t)-0.015339206f,
+    (float16_t)0.999904701f, (float16_t)-0.013805389f,
+    (float16_t)0.999924702f, (float16_t)-0.012271538f,
+    (float16_t)0.999942350f, (float16_t)-0.010737659f,
+    (float16_t)0.999957645f, (float16_t)-0.009203755f,
+    (float16_t)0.999970586f, (float16_t)-0.007669829f,
+    (float16_t)0.999981175f, (float16_t)-0.006135885f,
+    (float16_t)0.999989411f, (float16_t)-0.004601926f,
+    (float16_t)0.999995294f, (float16_t)-0.003067957f,
+    (float16_t)0.999998823f, (float16_t)-0.001533980f
+};
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_32)
+/**
+  @par
+  Example code generating the float16_t RFFT twiddle factors of this table (L = 32):
+  @par
+  <pre>TW = exp(pi/2*i-2*pi*i*[0:L/2-1]/L).' </pre>
+  @par
+  The L/2 = 16 complex values are stored interleaved: real part first, then imaginary part.
+*/
+const float16_t twiddleCoefF16_rfft_32[32] = {
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f
+};
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_32) */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_64)
+const float16_t twiddleCoefF16_rfft_64[64] = { /* L = 64: L/2 = 32 complex values, interleaved (real, imag); TW = exp(pi/2*i-2*pi*i*[0:L/2-1]/L).' */
+    (float16_t)0.000000000000000f,  (float16_t)1.000000000000000f,
+    (float16_t)0.098017140329561f,  (float16_t)0.995184726672197f,
+    (float16_t)0.195090322016128f,  (float16_t)0.980785280403230f,
+    (float16_t)0.290284677254462f,  (float16_t)0.956940335732209f,
+    (float16_t)0.382683432365090f,  (float16_t)0.923879532511287f,
+    (float16_t)0.471396736825998f,  (float16_t)0.881921264348355f,
+    (float16_t)0.555570233019602f,  (float16_t)0.831469612302545f,
+    (float16_t)0.634393284163645f,  (float16_t)0.773010453362737f,
+    (float16_t)0.707106781186547f,  (float16_t)0.707106781186548f,
+    (float16_t)0.773010453362737f,  (float16_t)0.634393284163645f,
+    (float16_t)0.831469612302545f,  (float16_t)0.555570233019602f,
+    (float16_t)0.881921264348355f,  (float16_t)0.471396736825998f,
+    (float16_t)0.923879532511287f,  (float16_t)0.382683432365090f,
+    (float16_t)0.956940335732209f,  (float16_t)0.290284677254462f,
+    (float16_t)0.980785280403230f,  (float16_t)0.195090322016128f,
+    (float16_t)0.995184726672197f,  (float16_t)0.098017140329561f,
+    (float16_t)1.000000000000000f,  (float16_t)0.000000000000000f,
+    (float16_t)0.995184726672197f, (float16_t)-0.098017140329561f,
+    (float16_t)0.980785280403230f, (float16_t)-0.195090322016128f,
+    (float16_t)0.956940335732209f, (float16_t)-0.290284677254462f,
+    (float16_t)0.923879532511287f, (float16_t)-0.382683432365090f,
+    (float16_t)0.881921264348355f, (float16_t)-0.471396736825998f,
+    (float16_t)0.831469612302545f, (float16_t)-0.555570233019602f,
+    (float16_t)0.773010453362737f, (float16_t)-0.634393284163645f,
+    (float16_t)0.707106781186548f, (float16_t)-0.707106781186547f,
+    (float16_t)0.634393284163645f, (float16_t)-0.773010453362737f,
+    (float16_t)0.555570233019602f, (float16_t)-0.831469612302545f,
+    (float16_t)0.471396736825998f, (float16_t)-0.881921264348355f,
+    (float16_t)0.382683432365090f, (float16_t)-0.923879532511287f,
+    (float16_t)0.290284677254462f, (float16_t)-0.956940335732209f,
+    (float16_t)0.195090322016129f, (float16_t)-0.980785280403230f,
+    (float16_t)0.098017140329561f, (float16_t)-0.995184726672197f
+};
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_64) */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_128)
+const float16_t twiddleCoefF16_rfft_128[128] = { /* L = 128: L/2 = 64 complex values, interleaved (real, imag); TW = exp(pi/2*i-2*pi*i*[0:L/2-1]/L).' */
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+    (float16_t)0.049067674f,  (float16_t)0.998795456f,
+    (float16_t)0.098017140f,  (float16_t)0.995184727f,
+    (float16_t)0.146730474f,  (float16_t)0.989176510f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.242980180f,  (float16_t)0.970031253f,
+    (float16_t)0.290284677f,  (float16_t)0.956940336f,
+    (float16_t)0.336889853f,  (float16_t)0.941544065f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.427555093f,  (float16_t)0.903989293f,
+    (float16_t)0.471396737f,  (float16_t)0.881921264f,
+    (float16_t)0.514102744f,  (float16_t)0.857728610f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.595699304f,  (float16_t)0.803207531f,
+    (float16_t)0.634393284f,  (float16_t)0.773010453f,
+    (float16_t)0.671558955f,  (float16_t)0.740951125f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.740951125f,  (float16_t)0.671558955f,
+    (float16_t)0.773010453f,  (float16_t)0.634393284f,
+    (float16_t)0.803207531f,  (float16_t)0.595699304f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.857728610f,  (float16_t)0.514102744f,
+    (float16_t)0.881921264f,  (float16_t)0.471396737f,
+    (float16_t)0.903989293f,  (float16_t)0.427555093f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.941544065f,  (float16_t)0.336889853f,
+    (float16_t)0.956940336f,  (float16_t)0.290284677f,
+    (float16_t)0.970031253f,  (float16_t)0.242980180f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)0.989176510f,  (float16_t)0.146730474f,
+    (float16_t)0.995184727f,  (float16_t)0.098017140f,
+    (float16_t)0.998795456f,  (float16_t)0.049067674f,
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.998795456f, (float16_t)-0.049067674f,
+    (float16_t)0.995184727f, (float16_t)-0.098017140f,
+    (float16_t)0.989176510f, (float16_t)-0.146730474f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f,
+    (float16_t)0.970031253f, (float16_t)-0.242980180f,
+    (float16_t)0.956940336f, (float16_t)-0.290284677f,
+    (float16_t)0.941544065f, (float16_t)-0.336889853f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.903989293f, (float16_t)-0.427555093f,
+    (float16_t)0.881921264f, (float16_t)-0.471396737f,
+    (float16_t)0.857728610f, (float16_t)-0.514102744f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.803207531f, (float16_t)-0.595699304f,
+    (float16_t)0.773010453f, (float16_t)-0.634393284f,
+    (float16_t)0.740951125f, (float16_t)-0.671558955f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.671558955f, (float16_t)-0.740951125f,
+    (float16_t)0.634393284f, (float16_t)-0.773010453f,
+    (float16_t)0.595699304f, (float16_t)-0.803207531f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.514102744f, (float16_t)-0.857728610f,
+    (float16_t)0.471396737f, (float16_t)-0.881921264f,
+    (float16_t)0.427555093f, (float16_t)-0.903989293f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.336889853f, (float16_t)-0.941544065f,
+    (float16_t)0.290284677f, (float16_t)-0.956940336f,
+    (float16_t)0.242980180f, (float16_t)-0.970031253f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f,
+    (float16_t)0.146730474f, (float16_t)-0.989176510f,
+    (float16_t)0.098017140f, (float16_t)-0.995184727f,
+    (float16_t)0.049067674f, (float16_t)-0.998795456f
+};
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_128) */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_256)
+const float16_t twiddleCoefF16_rfft_256[256] = { /* L = 256: L/2 = 128 complex values, interleaved (real, imag); TW = exp(pi/2*i-2*pi*i*[0:L/2-1]/L).' */
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+    (float16_t)0.024541229f,  (float16_t)0.999698819f,
+    (float16_t)0.049067674f,  (float16_t)0.998795456f,
+    (float16_t)0.073564564f,  (float16_t)0.997290457f,
+    (float16_t)0.098017140f,  (float16_t)0.995184727f,
+    (float16_t)0.122410675f,  (float16_t)0.992479535f,
+    (float16_t)0.146730474f,  (float16_t)0.989176510f,
+    (float16_t)0.170961889f,  (float16_t)0.985277642f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.219101240f,  (float16_t)0.975702130f,
+    (float16_t)0.242980180f,  (float16_t)0.970031253f,
+    (float16_t)0.266712757f,  (float16_t)0.963776066f,
+    (float16_t)0.290284677f,  (float16_t)0.956940336f,
+    (float16_t)0.313681740f,  (float16_t)0.949528181f,
+    (float16_t)0.336889853f,  (float16_t)0.941544065f,
+    (float16_t)0.359895037f,  (float16_t)0.932992799f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.405241314f,  (float16_t)0.914209756f,
+    (float16_t)0.427555093f,  (float16_t)0.903989293f,
+    (float16_t)0.449611330f,  (float16_t)0.893224301f,
+    (float16_t)0.471396737f,  (float16_t)0.881921264f,
+    (float16_t)0.492898192f,  (float16_t)0.870086991f,
+    (float16_t)0.514102744f,  (float16_t)0.857728610f,
+    (float16_t)0.534997620f,  (float16_t)0.844853565f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.575808191f,  (float16_t)0.817584813f,
+    (float16_t)0.595699304f,  (float16_t)0.803207531f,
+    (float16_t)0.615231591f,  (float16_t)0.788346428f,
+    (float16_t)0.634393284f,  (float16_t)0.773010453f,
+    (float16_t)0.653172843f,  (float16_t)0.757208847f,
+    (float16_t)0.671558955f,  (float16_t)0.740951125f,
+    (float16_t)0.689540545f,  (float16_t)0.724247083f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.724247083f,  (float16_t)0.689540545f,
+    (float16_t)0.740951125f,  (float16_t)0.671558955f,
+    (float16_t)0.757208847f,  (float16_t)0.653172843f,
+    (float16_t)0.773010453f,  (float16_t)0.634393284f,
+    (float16_t)0.788346428f,  (float16_t)0.615231591f,
+    (float16_t)0.803207531f,  (float16_t)0.595699304f,
+    (float16_t)0.817584813f,  (float16_t)0.575808191f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.844853565f,  (float16_t)0.534997620f,
+    (float16_t)0.857728610f,  (float16_t)0.514102744f,
+    (float16_t)0.870086991f,  (float16_t)0.492898192f,
+    (float16_t)0.881921264f,  (float16_t)0.471396737f,
+    (float16_t)0.893224301f,  (float16_t)0.449611330f,
+    (float16_t)0.903989293f,  (float16_t)0.427555093f,
+    (float16_t)0.914209756f,  (float16_t)0.405241314f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.932992799f,  (float16_t)0.359895037f,
+    (float16_t)0.941544065f,  (float16_t)0.336889853f,
+    (float16_t)0.949528181f,  (float16_t)0.313681740f,
+    (float16_t)0.956940336f,  (float16_t)0.290284677f,
+    (float16_t)0.963776066f,  (float16_t)0.266712757f,
+    (float16_t)0.970031253f,  (float16_t)0.242980180f,
+    (float16_t)0.975702130f,  (float16_t)0.219101240f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)0.985277642f,  (float16_t)0.170961889f,
+    (float16_t)0.989176510f,  (float16_t)0.146730474f,
+    (float16_t)0.992479535f,  (float16_t)0.122410675f,
+    (float16_t)0.995184727f,  (float16_t)0.098017140f,
+    (float16_t)0.997290457f,  (float16_t)0.073564564f,
+    (float16_t)0.998795456f,  (float16_t)0.049067674f,
+    (float16_t)0.999698819f,  (float16_t)0.024541229f,
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.999698819f, (float16_t)-0.024541229f,
+    (float16_t)0.998795456f, (float16_t)-0.049067674f,
+    (float16_t)0.997290457f, (float16_t)-0.073564564f,
+    (float16_t)0.995184727f, (float16_t)-0.098017140f,
+    (float16_t)0.992479535f, (float16_t)-0.122410675f,
+    (float16_t)0.989176510f, (float16_t)-0.146730474f,
+    (float16_t)0.985277642f, (float16_t)-0.170961889f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f,
+    (float16_t)0.975702130f, (float16_t)-0.219101240f,
+    (float16_t)0.970031253f, (float16_t)-0.242980180f,
+    (float16_t)0.963776066f, (float16_t)-0.266712757f,
+    (float16_t)0.956940336f, (float16_t)-0.290284677f,
+    (float16_t)0.949528181f, (float16_t)-0.313681740f,
+    (float16_t)0.941544065f, (float16_t)-0.336889853f,
+    (float16_t)0.932992799f, (float16_t)-0.359895037f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.914209756f, (float16_t)-0.405241314f,
+    (float16_t)0.903989293f, (float16_t)-0.427555093f,
+    (float16_t)0.893224301f, (float16_t)-0.449611330f,
+    (float16_t)0.881921264f, (float16_t)-0.471396737f,
+    (float16_t)0.870086991f, (float16_t)-0.492898192f,
+    (float16_t)0.857728610f, (float16_t)-0.514102744f,
+    (float16_t)0.844853565f, (float16_t)-0.534997620f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.817584813f, (float16_t)-0.575808191f,
+    (float16_t)0.803207531f, (float16_t)-0.595699304f,
+    (float16_t)0.788346428f, (float16_t)-0.615231591f,
+    (float16_t)0.773010453f, (float16_t)-0.634393284f,
+    (float16_t)0.757208847f, (float16_t)-0.653172843f,
+    (float16_t)0.740951125f, (float16_t)-0.671558955f,
+    (float16_t)0.724247083f, (float16_t)-0.689540545f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.689540545f, (float16_t)-0.724247083f,
+    (float16_t)0.671558955f, (float16_t)-0.740951125f,
+    (float16_t)0.653172843f, (float16_t)-0.757208847f,
+    (float16_t)0.634393284f, (float16_t)-0.773010453f,
+    (float16_t)0.615231591f, (float16_t)-0.788346428f,
+    (float16_t)0.595699304f, (float16_t)-0.803207531f,
+    (float16_t)0.575808191f, (float16_t)-0.817584813f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.534997620f, (float16_t)-0.844853565f,
+    (float16_t)0.514102744f, (float16_t)-0.857728610f,
+    (float16_t)0.492898192f, (float16_t)-0.870086991f,
+    (float16_t)0.471396737f, (float16_t)-0.881921264f,
+    (float16_t)0.449611330f, (float16_t)-0.893224301f,
+    (float16_t)0.427555093f, (float16_t)-0.903989293f,
+    (float16_t)0.405241314f, (float16_t)-0.914209756f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.359895037f, (float16_t)-0.932992799f,
+    (float16_t)0.336889853f, (float16_t)-0.941544065f,
+    (float16_t)0.313681740f, (float16_t)-0.949528181f,
+    (float16_t)0.290284677f, (float16_t)-0.956940336f,
+    (float16_t)0.266712757f, (float16_t)-0.963776066f,
+    (float16_t)0.242980180f, (float16_t)-0.970031253f,
+    (float16_t)0.219101240f, (float16_t)-0.975702130f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f,
+    (float16_t)0.170961889f, (float16_t)-0.985277642f,
+    (float16_t)0.146730474f, (float16_t)-0.989176510f,
+    (float16_t)0.122410675f, (float16_t)-0.992479535f,
+    (float16_t)0.098017140f, (float16_t)-0.995184727f,
+    (float16_t)0.073564564f, (float16_t)-0.997290457f,
+    (float16_t)0.049067674f, (float16_t)-0.998795456f,
+    (float16_t)0.024541229f, (float16_t)-0.999698819f
+};
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_256) */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_512)
+const float16_t twiddleCoefF16_rfft_512[512] = {
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+    (float16_t)0.012271538f,  (float16_t)0.999924702f,
+    (float16_t)0.024541229f,  (float16_t)0.999698819f,
+    (float16_t)0.036807223f,  (float16_t)0.999322385f,
+    (float16_t)0.049067674f,  (float16_t)0.998795456f,
+    (float16_t)0.061320736f,  (float16_t)0.998118113f,
+    (float16_t)0.073564564f,  (float16_t)0.997290457f,
+    (float16_t)0.085797312f,  (float16_t)0.996312612f,
+    (float16_t)0.098017140f,  (float16_t)0.995184727f,
+    (float16_t)0.110222207f,  (float16_t)0.993906970f,
+    (float16_t)0.122410675f,  (float16_t)0.992479535f,
+    (float16_t)0.134580709f,  (float16_t)0.990902635f,
+    (float16_t)0.146730474f,  (float16_t)0.989176510f,
+    (float16_t)0.158858143f,  (float16_t)0.987301418f,
+    (float16_t)0.170961889f,  (float16_t)0.985277642f,
+    (float16_t)0.183039888f,  (float16_t)0.983105487f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.207111376f,  (float16_t)0.978317371f,
+    (float16_t)0.219101240f,  (float16_t)0.975702130f,
+    (float16_t)0.231058108f,  (float16_t)0.972939952f,
+    (float16_t)0.242980180f,  (float16_t)0.970031253f,
+    (float16_t)0.254865660f,  (float16_t)0.966976471f,
+    (float16_t)0.266712757f,  (float16_t)0.963776066f,
+    (float16_t)0.278519689f,  (float16_t)0.960430519f,
+    (float16_t)0.290284677f,  (float16_t)0.956940336f,
+    (float16_t)0.302005949f,  (float16_t)0.953306040f,
+    (float16_t)0.313681740f,  (float16_t)0.949528181f,
+    (float16_t)0.325310292f,  (float16_t)0.945607325f,
+    (float16_t)0.336889853f,  (float16_t)0.941544065f,
+    (float16_t)0.348418680f,  (float16_t)0.937339012f,
+    (float16_t)0.359895037f,  (float16_t)0.932992799f,
+    (float16_t)0.371317194f,  (float16_t)0.928506080f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.393992040f,  (float16_t)0.919113852f,
+    (float16_t)0.405241314f,  (float16_t)0.914209756f,
+    (float16_t)0.416429560f,  (float16_t)0.909167983f,
+    (float16_t)0.427555093f,  (float16_t)0.903989293f,
+    (float16_t)0.438616239f,  (float16_t)0.898674466f,
+    (float16_t)0.449611330f,  (float16_t)0.893224301f,
+    (float16_t)0.460538711f,  (float16_t)0.887639620f,
+    (float16_t)0.471396737f,  (float16_t)0.881921264f,
+    (float16_t)0.482183772f,  (float16_t)0.876070094f,
+    (float16_t)0.492898192f,  (float16_t)0.870086991f,
+    (float16_t)0.503538384f,  (float16_t)0.863972856f,
+    (float16_t)0.514102744f,  (float16_t)0.857728610f,
+    (float16_t)0.524589683f,  (float16_t)0.851355193f,
+    (float16_t)0.534997620f,  (float16_t)0.844853565f,
+    (float16_t)0.545324988f,  (float16_t)0.838224706f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.565731811f,  (float16_t)0.824589303f,
+    (float16_t)0.575808191f,  (float16_t)0.817584813f,
+    (float16_t)0.585797857f,  (float16_t)0.810457198f,
+    (float16_t)0.595699304f,  (float16_t)0.803207531f,
+    (float16_t)0.605511041f,  (float16_t)0.795836905f,
+    (float16_t)0.615231591f,  (float16_t)0.788346428f,
+    (float16_t)0.624859488f,  (float16_t)0.780737229f,
+    (float16_t)0.634393284f,  (float16_t)0.773010453f,
+    (float16_t)0.643831543f,  (float16_t)0.765167266f,
+    (float16_t)0.653172843f,  (float16_t)0.757208847f,
+    (float16_t)0.662415778f,  (float16_t)0.749136395f,
+    (float16_t)0.671558955f,  (float16_t)0.740951125f,
+    (float16_t)0.680600998f,  (float16_t)0.732654272f,
+    (float16_t)0.689540545f,  (float16_t)0.724247083f,
+    (float16_t)0.698376249f,  (float16_t)0.715730825f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.715730825f,  (float16_t)0.698376249f,
+    (float16_t)0.724247083f,  (float16_t)0.689540545f,
+    (float16_t)0.732654272f,  (float16_t)0.680600998f,
+    (float16_t)0.740951125f,  (float16_t)0.671558955f,
+    (float16_t)0.749136395f,  (float16_t)0.662415778f,
+    (float16_t)0.757208847f,  (float16_t)0.653172843f,
+    (float16_t)0.765167266f,  (float16_t)0.643831543f,
+    (float16_t)0.773010453f,  (float16_t)0.634393284f,
+    (float16_t)0.780737229f,  (float16_t)0.624859488f,
+    (float16_t)0.788346428f,  (float16_t)0.615231591f,
+    (float16_t)0.795836905f,  (float16_t)0.605511041f,
+    (float16_t)0.803207531f,  (float16_t)0.595699304f,
+    (float16_t)0.810457198f,  (float16_t)0.585797857f,
+    (float16_t)0.817584813f,  (float16_t)0.575808191f,
+    (float16_t)0.824589303f,  (float16_t)0.565731811f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.838224706f,  (float16_t)0.545324988f,
+    (float16_t)0.844853565f,  (float16_t)0.534997620f,
+    (float16_t)0.851355193f,  (float16_t)0.524589683f,
+    (float16_t)0.857728610f,  (float16_t)0.514102744f,
+    (float16_t)0.863972856f,  (float16_t)0.503538384f,
+    (float16_t)0.870086991f,  (float16_t)0.492898192f,
+    (float16_t)0.876070094f,  (float16_t)0.482183772f,
+    (float16_t)0.881921264f,  (float16_t)0.471396737f,
+    (float16_t)0.887639620f,  (float16_t)0.460538711f,
+    (float16_t)0.893224301f,  (float16_t)0.449611330f,
+    (float16_t)0.898674466f,  (float16_t)0.438616239f,
+    (float16_t)0.903989293f,  (float16_t)0.427555093f,
+    (float16_t)0.909167983f,  (float16_t)0.416429560f,
+    (float16_t)0.914209756f,  (float16_t)0.405241314f,
+    (float16_t)0.919113852f,  (float16_t)0.393992040f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.928506080f,  (float16_t)0.371317194f,
+    (float16_t)0.932992799f,  (float16_t)0.359895037f,
+    (float16_t)0.937339012f,  (float16_t)0.348418680f,
+    (float16_t)0.941544065f,  (float16_t)0.336889853f,
+    (float16_t)0.945607325f,  (float16_t)0.325310292f,
+    (float16_t)0.949528181f,  (float16_t)0.313681740f,
+    (float16_t)0.953306040f,  (float16_t)0.302005949f,
+    (float16_t)0.956940336f,  (float16_t)0.290284677f,
+    (float16_t)0.960430519f,  (float16_t)0.278519689f,
+    (float16_t)0.963776066f,  (float16_t)0.266712757f,
+    (float16_t)0.966976471f,  (float16_t)0.254865660f,
+    (float16_t)0.970031253f,  (float16_t)0.242980180f,
+    (float16_t)0.972939952f,  (float16_t)0.231058108f,
+    (float16_t)0.975702130f,  (float16_t)0.219101240f,
+    (float16_t)0.978317371f,  (float16_t)0.207111376f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)0.983105487f,  (float16_t)0.183039888f,
+    (float16_t)0.985277642f,  (float16_t)0.170961889f,
+    (float16_t)0.987301418f,  (float16_t)0.158858143f,
+    (float16_t)0.989176510f,  (float16_t)0.146730474f,
+    (float16_t)0.990902635f,  (float16_t)0.134580709f,
+    (float16_t)0.992479535f,  (float16_t)0.122410675f,
+    (float16_t)0.993906970f,  (float16_t)0.110222207f,
+    (float16_t)0.995184727f,  (float16_t)0.098017140f,
+    (float16_t)0.996312612f,  (float16_t)0.085797312f,
+    (float16_t)0.997290457f,  (float16_t)0.073564564f,
+    (float16_t)0.998118113f,  (float16_t)0.061320736f,
+    (float16_t)0.998795456f,  (float16_t)0.049067674f,
+    (float16_t)0.999322385f,  (float16_t)0.036807223f,
+    (float16_t)0.999698819f,  (float16_t)0.024541229f,
+    (float16_t)0.999924702f,  (float16_t)0.012271538f,
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.999924702f, (float16_t)-0.012271538f,
+    (float16_t)0.999698819f, (float16_t)-0.024541229f,
+    (float16_t)0.999322385f, (float16_t)-0.036807223f,
+    (float16_t)0.998795456f, (float16_t)-0.049067674f,
+    (float16_t)0.998118113f, (float16_t)-0.061320736f,
+    (float16_t)0.997290457f, (float16_t)-0.073564564f,
+    (float16_t)0.996312612f, (float16_t)-0.085797312f,
+    (float16_t)0.995184727f, (float16_t)-0.098017140f,
+    (float16_t)0.993906970f, (float16_t)-0.110222207f,
+    (float16_t)0.992479535f, (float16_t)-0.122410675f,
+    (float16_t)0.990902635f, (float16_t)-0.134580709f,
+    (float16_t)0.989176510f, (float16_t)-0.146730474f,
+    (float16_t)0.987301418f, (float16_t)-0.158858143f,
+    (float16_t)0.985277642f, (float16_t)-0.170961889f,
+    (float16_t)0.983105487f, (float16_t)-0.183039888f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f,
+    (float16_t)0.978317371f, (float16_t)-0.207111376f,
+    (float16_t)0.975702130f, (float16_t)-0.219101240f,
+    (float16_t)0.972939952f, (float16_t)-0.231058108f,
+    (float16_t)0.970031253f, (float16_t)-0.242980180f,
+    (float16_t)0.966976471f, (float16_t)-0.254865660f,
+    (float16_t)0.963776066f, (float16_t)-0.266712757f,
+    (float16_t)0.960430519f, (float16_t)-0.278519689f,
+    (float16_t)0.956940336f, (float16_t)-0.290284677f,
+    (float16_t)0.953306040f, (float16_t)-0.302005949f,
+    (float16_t)0.949528181f, (float16_t)-0.313681740f,
+    (float16_t)0.945607325f, (float16_t)-0.325310292f,
+    (float16_t)0.941544065f, (float16_t)-0.336889853f,
+    (float16_t)0.937339012f, (float16_t)-0.348418680f,
+    (float16_t)0.932992799f, (float16_t)-0.359895037f,
+    (float16_t)0.928506080f, (float16_t)-0.371317194f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.919113852f, (float16_t)-0.393992040f,
+    (float16_t)0.914209756f, (float16_t)-0.405241314f,
+    (float16_t)0.909167983f, (float16_t)-0.416429560f,
+    (float16_t)0.903989293f, (float16_t)-0.427555093f,
+    (float16_t)0.898674466f, (float16_t)-0.438616239f,
+    (float16_t)0.893224301f, (float16_t)-0.449611330f,
+    (float16_t)0.887639620f, (float16_t)-0.460538711f,
+    (float16_t)0.881921264f, (float16_t)-0.471396737f,
+    (float16_t)0.876070094f, (float16_t)-0.482183772f,
+    (float16_t)0.870086991f, (float16_t)-0.492898192f,
+    (float16_t)0.863972856f, (float16_t)-0.503538384f,
+    (float16_t)0.857728610f, (float16_t)-0.514102744f,
+    (float16_t)0.851355193f, (float16_t)-0.524589683f,
+    (float16_t)0.844853565f, (float16_t)-0.534997620f,
+    (float16_t)0.838224706f, (float16_t)-0.545324988f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.824589303f, (float16_t)-0.565731811f,
+    (float16_t)0.817584813f, (float16_t)-0.575808191f,
+    (float16_t)0.810457198f, (float16_t)-0.585797857f,
+    (float16_t)0.803207531f, (float16_t)-0.595699304f,
+    (float16_t)0.795836905f, (float16_t)-0.605511041f,
+    (float16_t)0.788346428f, (float16_t)-0.615231591f,
+    (float16_t)0.780737229f, (float16_t)-0.624859488f,
+    (float16_t)0.773010453f, (float16_t)-0.634393284f,
+    (float16_t)0.765167266f, (float16_t)-0.643831543f,
+    (float16_t)0.757208847f, (float16_t)-0.653172843f,
+    (float16_t)0.749136395f, (float16_t)-0.662415778f,
+    (float16_t)0.740951125f, (float16_t)-0.671558955f,
+    (float16_t)0.732654272f, (float16_t)-0.680600998f,
+    (float16_t)0.724247083f, (float16_t)-0.689540545f,
+    (float16_t)0.715730825f, (float16_t)-0.698376249f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.698376249f, (float16_t)-0.715730825f,
+    (float16_t)0.689540545f, (float16_t)-0.724247083f,
+    (float16_t)0.680600998f, (float16_t)-0.732654272f,
+    (float16_t)0.671558955f, (float16_t)-0.740951125f,
+    (float16_t)0.662415778f, (float16_t)-0.749136395f,
+    (float16_t)0.653172843f, (float16_t)-0.757208847f,
+    (float16_t)0.643831543f, (float16_t)-0.765167266f,
+    (float16_t)0.634393284f, (float16_t)-0.773010453f,
+    (float16_t)0.624859488f, (float16_t)-0.780737229f,
+    (float16_t)0.615231591f, (float16_t)-0.788346428f,
+    (float16_t)0.605511041f, (float16_t)-0.795836905f,
+    (float16_t)0.595699304f, (float16_t)-0.803207531f,
+    (float16_t)0.585797857f, (float16_t)-0.810457198f,
+    (float16_t)0.575808191f, (float16_t)-0.817584813f,
+    (float16_t)0.565731811f, (float16_t)-0.824589303f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.545324988f, (float16_t)-0.838224706f,
+    (float16_t)0.534997620f, (float16_t)-0.844853565f,
+    (float16_t)0.524589683f, (float16_t)-0.851355193f,
+    (float16_t)0.514102744f, (float16_t)-0.857728610f,
+    (float16_t)0.503538384f, (float16_t)-0.863972856f,
+    (float16_t)0.492898192f, (float16_t)-0.870086991f,
+    (float16_t)0.482183772f, (float16_t)-0.876070094f,
+    (float16_t)0.471396737f, (float16_t)-0.881921264f,
+    (float16_t)0.460538711f, (float16_t)-0.887639620f,
+    (float16_t)0.449611330f, (float16_t)-0.893224301f,
+    (float16_t)0.438616239f, (float16_t)-0.898674466f,
+    (float16_t)0.427555093f, (float16_t)-0.903989293f,
+    (float16_t)0.416429560f, (float16_t)-0.909167983f,
+    (float16_t)0.405241314f, (float16_t)-0.914209756f,
+    (float16_t)0.393992040f, (float16_t)-0.919113852f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.371317194f, (float16_t)-0.928506080f,
+    (float16_t)0.359895037f, (float16_t)-0.932992799f,
+    (float16_t)0.348418680f, (float16_t)-0.937339012f,
+    (float16_t)0.336889853f, (float16_t)-0.941544065f,
+    (float16_t)0.325310292f, (float16_t)-0.945607325f,
+    (float16_t)0.313681740f, (float16_t)-0.949528181f,
+    (float16_t)0.302005949f, (float16_t)-0.953306040f,
+    (float16_t)0.290284677f, (float16_t)-0.956940336f,
+    (float16_t)0.278519689f, (float16_t)-0.960430519f,
+    (float16_t)0.266712757f, (float16_t)-0.963776066f,
+    (float16_t)0.254865660f, (float16_t)-0.966976471f,
+    (float16_t)0.242980180f, (float16_t)-0.970031253f,
+    (float16_t)0.231058108f, (float16_t)-0.972939952f,
+    (float16_t)0.219101240f, (float16_t)-0.975702130f,
+    (float16_t)0.207111376f, (float16_t)-0.978317371f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f,
+    (float16_t)0.183039888f, (float16_t)-0.983105487f,
+    (float16_t)0.170961889f, (float16_t)-0.985277642f,
+    (float16_t)0.158858143f, (float16_t)-0.987301418f,
+    (float16_t)0.146730474f, (float16_t)-0.989176510f,
+    (float16_t)0.134580709f, (float16_t)-0.990902635f,
+    (float16_t)0.122410675f, (float16_t)-0.992479535f,
+    (float16_t)0.110222207f, (float16_t)-0.993906970f,
+    (float16_t)0.098017140f, (float16_t)-0.995184727f,
+    (float16_t)0.085797312f, (float16_t)-0.996312612f,
+    (float16_t)0.073564564f, (float16_t)-0.997290457f,
+    (float16_t)0.061320736f, (float16_t)-0.998118113f,
+    (float16_t)0.049067674f, (float16_t)-0.998795456f,
+    (float16_t)0.036807223f, (float16_t)-0.999322385f,
+    (float16_t)0.024541229f, (float16_t)-0.999698819f,
+    (float16_t)0.012271538f, (float16_t)-0.999924702f
+};
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_1024)
+const float16_t twiddleCoefF16_rfft_1024[1024] = { /* 512 rows of two values; row k matches sin(2*pi*k/1024), cos(2*pi*k/1024) for k = 0..511 (row 128 = 0.7071/0.7071, row 256 = 1/0). Literals are float constants cast to float16_t. NOTE(review): presumably the twiddles of the 1024-point F16 real FFT - confirm against the RFFT init code. */
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+    (float16_t)0.006135885f,  (float16_t)0.999981175f,
+    (float16_t)0.012271538f,  (float16_t)0.999924702f,
+    (float16_t)0.018406730f,  (float16_t)0.999830582f,
+    (float16_t)0.024541229f,  (float16_t)0.999698819f,
+    (float16_t)0.030674803f,  (float16_t)0.999529418f,
+    (float16_t)0.036807223f,  (float16_t)0.999322385f,
+    (float16_t)0.042938257f,  (float16_t)0.999077728f,
+    (float16_t)0.049067674f,  (float16_t)0.998795456f,
+    (float16_t)0.055195244f,  (float16_t)0.998475581f,
+    (float16_t)0.061320736f,  (float16_t)0.998118113f,
+    (float16_t)0.067443920f,  (float16_t)0.997723067f,
+    (float16_t)0.073564564f,  (float16_t)0.997290457f,
+    (float16_t)0.079682438f,  (float16_t)0.996820299f,
+    (float16_t)0.085797312f,  (float16_t)0.996312612f,
+    (float16_t)0.091908956f,  (float16_t)0.995767414f,
+    (float16_t)0.098017140f,  (float16_t)0.995184727f,
+    (float16_t)0.104121634f,  (float16_t)0.994564571f,
+    (float16_t)0.110222207f,  (float16_t)0.993906970f,
+    (float16_t)0.116318631f,  (float16_t)0.993211949f,
+    (float16_t)0.122410675f,  (float16_t)0.992479535f,
+    (float16_t)0.128498111f,  (float16_t)0.991709754f,
+    (float16_t)0.134580709f,  (float16_t)0.990902635f,
+    (float16_t)0.140658239f,  (float16_t)0.990058210f,
+    (float16_t)0.146730474f,  (float16_t)0.989176510f,
+    (float16_t)0.152797185f,  (float16_t)0.988257568f,
+    (float16_t)0.158858143f,  (float16_t)0.987301418f,
+    (float16_t)0.164913120f,  (float16_t)0.986308097f,
+    (float16_t)0.170961889f,  (float16_t)0.985277642f,
+    (float16_t)0.177004220f,  (float16_t)0.984210092f,
+    (float16_t)0.183039888f,  (float16_t)0.983105487f,
+    (float16_t)0.189068664f,  (float16_t)0.981963869f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.201104635f,  (float16_t)0.979569766f,
+    (float16_t)0.207111376f,  (float16_t)0.978317371f,
+    (float16_t)0.213110320f,  (float16_t)0.977028143f,
+    (float16_t)0.219101240f,  (float16_t)0.975702130f,
+    (float16_t)0.225083911f,  (float16_t)0.974339383f,
+    (float16_t)0.231058108f,  (float16_t)0.972939952f,
+    (float16_t)0.237023606f,  (float16_t)0.971503891f,
+    (float16_t)0.242980180f,  (float16_t)0.970031253f,
+    (float16_t)0.248927606f,  (float16_t)0.968522094f,
+    (float16_t)0.254865660f,  (float16_t)0.966976471f,
+    (float16_t)0.260794118f,  (float16_t)0.965394442f,
+    (float16_t)0.266712757f,  (float16_t)0.963776066f,
+    (float16_t)0.272621355f,  (float16_t)0.962121404f,
+    (float16_t)0.278519689f,  (float16_t)0.960430519f,
+    (float16_t)0.284407537f,  (float16_t)0.958703475f,
+    (float16_t)0.290284677f,  (float16_t)0.956940336f,
+    (float16_t)0.296150888f,  (float16_t)0.955141168f,
+    (float16_t)0.302005949f,  (float16_t)0.953306040f,
+    (float16_t)0.307849640f,  (float16_t)0.951435021f,
+    (float16_t)0.313681740f,  (float16_t)0.949528181f,
+    (float16_t)0.319502031f,  (float16_t)0.947585591f,
+    (float16_t)0.325310292f,  (float16_t)0.945607325f,
+    (float16_t)0.331106306f,  (float16_t)0.943593458f,
+    (float16_t)0.336889853f,  (float16_t)0.941544065f,
+    (float16_t)0.342660717f,  (float16_t)0.939459224f,
+    (float16_t)0.348418680f,  (float16_t)0.937339012f,
+    (float16_t)0.354163525f,  (float16_t)0.935183510f,
+    (float16_t)0.359895037f,  (float16_t)0.932992799f,
+    (float16_t)0.365612998f,  (float16_t)0.930766961f,
+    (float16_t)0.371317194f,  (float16_t)0.928506080f,
+    (float16_t)0.377007410f,  (float16_t)0.926210242f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.388345047f,  (float16_t)0.921514039f,
+    (float16_t)0.393992040f,  (float16_t)0.919113852f,
+    (float16_t)0.399624200f,  (float16_t)0.916679060f,
+    (float16_t)0.405241314f,  (float16_t)0.914209756f,
+    (float16_t)0.410843171f,  (float16_t)0.911706032f,
+    (float16_t)0.416429560f,  (float16_t)0.909167983f,
+    (float16_t)0.422000271f,  (float16_t)0.906595705f,
+    (float16_t)0.427555093f,  (float16_t)0.903989293f,
+    (float16_t)0.433093819f,  (float16_t)0.901348847f,
+    (float16_t)0.438616239f,  (float16_t)0.898674466f,
+    (float16_t)0.444122145f,  (float16_t)0.895966250f,
+    (float16_t)0.449611330f,  (float16_t)0.893224301f,
+    (float16_t)0.455083587f,  (float16_t)0.890448723f,
+    (float16_t)0.460538711f,  (float16_t)0.887639620f,
+    (float16_t)0.465976496f,  (float16_t)0.884797098f,
+    (float16_t)0.471396737f,  (float16_t)0.881921264f,
+    (float16_t)0.476799230f,  (float16_t)0.879012226f,
+    (float16_t)0.482183772f,  (float16_t)0.876070094f,
+    (float16_t)0.487550160f,  (float16_t)0.873094978f,
+    (float16_t)0.492898192f,  (float16_t)0.870086991f,
+    (float16_t)0.498227667f,  (float16_t)0.867046246f,
+    (float16_t)0.503538384f,  (float16_t)0.863972856f,
+    (float16_t)0.508830143f,  (float16_t)0.860866939f,
+    (float16_t)0.514102744f,  (float16_t)0.857728610f,
+    (float16_t)0.519355990f,  (float16_t)0.854557988f,
+    (float16_t)0.524589683f,  (float16_t)0.851355193f,
+    (float16_t)0.529803625f,  (float16_t)0.848120345f,
+    (float16_t)0.534997620f,  (float16_t)0.844853565f,
+    (float16_t)0.540171473f,  (float16_t)0.841554977f,
+    (float16_t)0.545324988f,  (float16_t)0.838224706f,
+    (float16_t)0.550457973f,  (float16_t)0.834862875f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.560661576f,  (float16_t)0.828045045f,
+    (float16_t)0.565731811f,  (float16_t)0.824589303f,
+    (float16_t)0.570780746f,  (float16_t)0.821102515f,
+    (float16_t)0.575808191f,  (float16_t)0.817584813f,
+    (float16_t)0.580813958f,  (float16_t)0.814036330f,
+    (float16_t)0.585797857f,  (float16_t)0.810457198f,
+    (float16_t)0.590759702f,  (float16_t)0.806847554f,
+    (float16_t)0.595699304f,  (float16_t)0.803207531f,
+    (float16_t)0.600616479f,  (float16_t)0.799537269f,
+    (float16_t)0.605511041f,  (float16_t)0.795836905f,
+    (float16_t)0.610382806f,  (float16_t)0.792106577f,
+    (float16_t)0.615231591f,  (float16_t)0.788346428f,
+    (float16_t)0.620057212f,  (float16_t)0.784556597f,
+    (float16_t)0.624859488f,  (float16_t)0.780737229f,
+    (float16_t)0.629638239f,  (float16_t)0.776888466f,
+    (float16_t)0.634393284f,  (float16_t)0.773010453f,
+    (float16_t)0.639124445f,  (float16_t)0.769103338f,
+    (float16_t)0.643831543f,  (float16_t)0.765167266f,
+    (float16_t)0.648514401f,  (float16_t)0.761202385f,
+    (float16_t)0.653172843f,  (float16_t)0.757208847f,
+    (float16_t)0.657806693f,  (float16_t)0.753186799f,
+    (float16_t)0.662415778f,  (float16_t)0.749136395f,
+    (float16_t)0.666999922f,  (float16_t)0.745057785f,
+    (float16_t)0.671558955f,  (float16_t)0.740951125f,
+    (float16_t)0.676092704f,  (float16_t)0.736816569f,
+    (float16_t)0.680600998f,  (float16_t)0.732654272f,
+    (float16_t)0.685083668f,  (float16_t)0.728464390f,
+    (float16_t)0.689540545f,  (float16_t)0.724247083f,
+    (float16_t)0.693971461f,  (float16_t)0.720002508f,
+    (float16_t)0.698376249f,  (float16_t)0.715730825f,
+    (float16_t)0.702754744f,  (float16_t)0.711432196f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.711432196f,  (float16_t)0.702754744f,
+    (float16_t)0.715730825f,  (float16_t)0.698376249f,
+    (float16_t)0.720002508f,  (float16_t)0.693971461f,
+    (float16_t)0.724247083f,  (float16_t)0.689540545f,
+    (float16_t)0.728464390f,  (float16_t)0.685083668f,
+    (float16_t)0.732654272f,  (float16_t)0.680600998f,
+    (float16_t)0.736816569f,  (float16_t)0.676092704f,
+    (float16_t)0.740951125f,  (float16_t)0.671558955f,
+    (float16_t)0.745057785f,  (float16_t)0.666999922f,
+    (float16_t)0.749136395f,  (float16_t)0.662415778f,
+    (float16_t)0.753186799f,  (float16_t)0.657806693f,
+    (float16_t)0.757208847f,  (float16_t)0.653172843f,
+    (float16_t)0.761202385f,  (float16_t)0.648514401f,
+    (float16_t)0.765167266f,  (float16_t)0.643831543f,
+    (float16_t)0.769103338f,  (float16_t)0.639124445f,
+    (float16_t)0.773010453f,  (float16_t)0.634393284f,
+    (float16_t)0.776888466f,  (float16_t)0.629638239f,
+    (float16_t)0.780737229f,  (float16_t)0.624859488f,
+    (float16_t)0.784556597f,  (float16_t)0.620057212f,
+    (float16_t)0.788346428f,  (float16_t)0.615231591f,
+    (float16_t)0.792106577f,  (float16_t)0.610382806f,
+    (float16_t)0.795836905f,  (float16_t)0.605511041f,
+    (float16_t)0.799537269f,  (float16_t)0.600616479f,
+    (float16_t)0.803207531f,  (float16_t)0.595699304f,
+    (float16_t)0.806847554f,  (float16_t)0.590759702f,
+    (float16_t)0.810457198f,  (float16_t)0.585797857f,
+    (float16_t)0.814036330f,  (float16_t)0.580813958f,
+    (float16_t)0.817584813f,  (float16_t)0.575808191f,
+    (float16_t)0.821102515f,  (float16_t)0.570780746f,
+    (float16_t)0.824589303f,  (float16_t)0.565731811f,
+    (float16_t)0.828045045f,  (float16_t)0.560661576f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.834862875f,  (float16_t)0.550457973f,
+    (float16_t)0.838224706f,  (float16_t)0.545324988f,
+    (float16_t)0.841554977f,  (float16_t)0.540171473f,
+    (float16_t)0.844853565f,  (float16_t)0.534997620f,
+    (float16_t)0.848120345f,  (float16_t)0.529803625f,
+    (float16_t)0.851355193f,  (float16_t)0.524589683f,
+    (float16_t)0.854557988f,  (float16_t)0.519355990f,
+    (float16_t)0.857728610f,  (float16_t)0.514102744f,
+    (float16_t)0.860866939f,  (float16_t)0.508830143f,
+    (float16_t)0.863972856f,  (float16_t)0.503538384f,
+    (float16_t)0.867046246f,  (float16_t)0.498227667f,
+    (float16_t)0.870086991f,  (float16_t)0.492898192f,
+    (float16_t)0.873094978f,  (float16_t)0.487550160f,
+    (float16_t)0.876070094f,  (float16_t)0.482183772f,
+    (float16_t)0.879012226f,  (float16_t)0.476799230f,
+    (float16_t)0.881921264f,  (float16_t)0.471396737f,
+    (float16_t)0.884797098f,  (float16_t)0.465976496f,
+    (float16_t)0.887639620f,  (float16_t)0.460538711f,
+    (float16_t)0.890448723f,  (float16_t)0.455083587f,
+    (float16_t)0.893224301f,  (float16_t)0.449611330f,
+    (float16_t)0.895966250f,  (float16_t)0.444122145f,
+    (float16_t)0.898674466f,  (float16_t)0.438616239f,
+    (float16_t)0.901348847f,  (float16_t)0.433093819f,
+    (float16_t)0.903989293f,  (float16_t)0.427555093f,
+    (float16_t)0.906595705f,  (float16_t)0.422000271f,
+    (float16_t)0.909167983f,  (float16_t)0.416429560f,
+    (float16_t)0.911706032f,  (float16_t)0.410843171f,
+    (float16_t)0.914209756f,  (float16_t)0.405241314f,
+    (float16_t)0.916679060f,  (float16_t)0.399624200f,
+    (float16_t)0.919113852f,  (float16_t)0.393992040f,
+    (float16_t)0.921514039f,  (float16_t)0.388345047f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.926210242f,  (float16_t)0.377007410f,
+    (float16_t)0.928506080f,  (float16_t)0.371317194f,
+    (float16_t)0.930766961f,  (float16_t)0.365612998f,
+    (float16_t)0.932992799f,  (float16_t)0.359895037f,
+    (float16_t)0.935183510f,  (float16_t)0.354163525f,
+    (float16_t)0.937339012f,  (float16_t)0.348418680f,
+    (float16_t)0.939459224f,  (float16_t)0.342660717f,
+    (float16_t)0.941544065f,  (float16_t)0.336889853f,
+    (float16_t)0.943593458f,  (float16_t)0.331106306f,
+    (float16_t)0.945607325f,  (float16_t)0.325310292f,
+    (float16_t)0.947585591f,  (float16_t)0.319502031f,
+    (float16_t)0.949528181f,  (float16_t)0.313681740f,
+    (float16_t)0.951435021f,  (float16_t)0.307849640f,
+    (float16_t)0.953306040f,  (float16_t)0.302005949f,
+    (float16_t)0.955141168f,  (float16_t)0.296150888f,
+    (float16_t)0.956940336f,  (float16_t)0.290284677f,
+    (float16_t)0.958703475f,  (float16_t)0.284407537f,
+    (float16_t)0.960430519f,  (float16_t)0.278519689f,
+    (float16_t)0.962121404f,  (float16_t)0.272621355f,
+    (float16_t)0.963776066f,  (float16_t)0.266712757f,
+    (float16_t)0.965394442f,  (float16_t)0.260794118f,
+    (float16_t)0.966976471f,  (float16_t)0.254865660f,
+    (float16_t)0.968522094f,  (float16_t)0.248927606f,
+    (float16_t)0.970031253f,  (float16_t)0.242980180f,
+    (float16_t)0.971503891f,  (float16_t)0.237023606f,
+    (float16_t)0.972939952f,  (float16_t)0.231058108f,
+    (float16_t)0.974339383f,  (float16_t)0.225083911f,
+    (float16_t)0.975702130f,  (float16_t)0.219101240f,
+    (float16_t)0.977028143f,  (float16_t)0.213110320f,
+    (float16_t)0.978317371f,  (float16_t)0.207111376f,
+    (float16_t)0.979569766f,  (float16_t)0.201104635f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)0.981963869f,  (float16_t)0.189068664f,
+    (float16_t)0.983105487f,  (float16_t)0.183039888f,
+    (float16_t)0.984210092f,  (float16_t)0.177004220f,
+    (float16_t)0.985277642f,  (float16_t)0.170961889f,
+    (float16_t)0.986308097f,  (float16_t)0.164913120f,
+    (float16_t)0.987301418f,  (float16_t)0.158858143f,
+    (float16_t)0.988257568f,  (float16_t)0.152797185f,
+    (float16_t)0.989176510f,  (float16_t)0.146730474f,
+    (float16_t)0.990058210f,  (float16_t)0.140658239f,
+    (float16_t)0.990902635f,  (float16_t)0.134580709f,
+    (float16_t)0.991709754f,  (float16_t)0.128498111f,
+    (float16_t)0.992479535f,  (float16_t)0.122410675f,
+    (float16_t)0.993211949f,  (float16_t)0.116318631f,
+    (float16_t)0.993906970f,  (float16_t)0.110222207f,
+    (float16_t)0.994564571f,  (float16_t)0.104121634f,
+    (float16_t)0.995184727f,  (float16_t)0.098017140f,
+    (float16_t)0.995767414f,  (float16_t)0.091908956f,
+    (float16_t)0.996312612f,  (float16_t)0.085797312f,
+    (float16_t)0.996820299f,  (float16_t)0.079682438f,
+    (float16_t)0.997290457f,  (float16_t)0.073564564f,
+    (float16_t)0.997723067f,  (float16_t)0.067443920f,
+    (float16_t)0.998118113f,  (float16_t)0.061320736f,
+    (float16_t)0.998475581f,  (float16_t)0.055195244f,
+    (float16_t)0.998795456f,  (float16_t)0.049067674f,
+    (float16_t)0.999077728f,  (float16_t)0.042938257f,
+    (float16_t)0.999322385f,  (float16_t)0.036807223f,
+    (float16_t)0.999529418f,  (float16_t)0.030674803f,
+    (float16_t)0.999698819f,  (float16_t)0.024541229f,
+    (float16_t)0.999830582f,  (float16_t)0.018406730f,
+    (float16_t)0.999924702f,  (float16_t)0.012271538f,
+    (float16_t)0.999981175f,  (float16_t)0.006135885f,
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.999981175f, (float16_t)-0.006135885f,
+    (float16_t)0.999924702f, (float16_t)-0.012271538f,
+    (float16_t)0.999830582f, (float16_t)-0.018406730f,
+    (float16_t)0.999698819f, (float16_t)-0.024541229f,
+    (float16_t)0.999529418f, (float16_t)-0.030674803f,
+    (float16_t)0.999322385f, (float16_t)-0.036807223f,
+    (float16_t)0.999077728f, (float16_t)-0.042938257f,
+    (float16_t)0.998795456f, (float16_t)-0.049067674f,
+    (float16_t)0.998475581f, (float16_t)-0.055195244f,
+    (float16_t)0.998118113f, (float16_t)-0.061320736f,
+    (float16_t)0.997723067f, (float16_t)-0.067443920f,
+    (float16_t)0.997290457f, (float16_t)-0.073564564f,
+    (float16_t)0.996820299f, (float16_t)-0.079682438f,
+    (float16_t)0.996312612f, (float16_t)-0.085797312f,
+    (float16_t)0.995767414f, (float16_t)-0.091908956f,
+    (float16_t)0.995184727f, (float16_t)-0.098017140f,
+    (float16_t)0.994564571f, (float16_t)-0.104121634f,
+    (float16_t)0.993906970f, (float16_t)-0.110222207f,
+    (float16_t)0.993211949f, (float16_t)-0.116318631f,
+    (float16_t)0.992479535f, (float16_t)-0.122410675f,
+    (float16_t)0.991709754f, (float16_t)-0.128498111f,
+    (float16_t)0.990902635f, (float16_t)-0.134580709f,
+    (float16_t)0.990058210f, (float16_t)-0.140658239f,
+    (float16_t)0.989176510f, (float16_t)-0.146730474f,
+    (float16_t)0.988257568f, (float16_t)-0.152797185f,
+    (float16_t)0.987301418f, (float16_t)-0.158858143f,
+    (float16_t)0.986308097f, (float16_t)-0.164913120f,
+    (float16_t)0.985277642f, (float16_t)-0.170961889f,
+    (float16_t)0.984210092f, (float16_t)-0.177004220f,
+    (float16_t)0.983105487f, (float16_t)-0.183039888f,
+    (float16_t)0.981963869f, (float16_t)-0.189068664f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f,
+    (float16_t)0.979569766f, (float16_t)-0.201104635f,
+    (float16_t)0.978317371f, (float16_t)-0.207111376f,
+    (float16_t)0.977028143f, (float16_t)-0.213110320f,
+    (float16_t)0.975702130f, (float16_t)-0.219101240f,
+    (float16_t)0.974339383f, (float16_t)-0.225083911f,
+    (float16_t)0.972939952f, (float16_t)-0.231058108f,
+    (float16_t)0.971503891f, (float16_t)-0.237023606f,
+    (float16_t)0.970031253f, (float16_t)-0.242980180f,
+    (float16_t)0.968522094f, (float16_t)-0.248927606f,
+    (float16_t)0.966976471f, (float16_t)-0.254865660f,
+    (float16_t)0.965394442f, (float16_t)-0.260794118f,
+    (float16_t)0.963776066f, (float16_t)-0.266712757f,
+    (float16_t)0.962121404f, (float16_t)-0.272621355f,
+    (float16_t)0.960430519f, (float16_t)-0.278519689f,
+    (float16_t)0.958703475f, (float16_t)-0.284407537f,
+    (float16_t)0.956940336f, (float16_t)-0.290284677f,
+    (float16_t)0.955141168f, (float16_t)-0.296150888f,
+    (float16_t)0.953306040f, (float16_t)-0.302005949f,
+    (float16_t)0.951435021f, (float16_t)-0.307849640f,
+    (float16_t)0.949528181f, (float16_t)-0.313681740f,
+    (float16_t)0.947585591f, (float16_t)-0.319502031f,
+    (float16_t)0.945607325f, (float16_t)-0.325310292f,
+    (float16_t)0.943593458f, (float16_t)-0.331106306f,
+    (float16_t)0.941544065f, (float16_t)-0.336889853f,
+    (float16_t)0.939459224f, (float16_t)-0.342660717f,
+    (float16_t)0.937339012f, (float16_t)-0.348418680f,
+    (float16_t)0.935183510f, (float16_t)-0.354163525f,
+    (float16_t)0.932992799f, (float16_t)-0.359895037f,
+    (float16_t)0.930766961f, (float16_t)-0.365612998f,
+    (float16_t)0.928506080f, (float16_t)-0.371317194f,
+    (float16_t)0.926210242f, (float16_t)-0.377007410f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.921514039f, (float16_t)-0.388345047f,
+    (float16_t)0.919113852f, (float16_t)-0.393992040f,
+    (float16_t)0.916679060f, (float16_t)-0.399624200f,
+    (float16_t)0.914209756f, (float16_t)-0.405241314f,
+    (float16_t)0.911706032f, (float16_t)-0.410843171f,
+    (float16_t)0.909167983f, (float16_t)-0.416429560f,
+    (float16_t)0.906595705f, (float16_t)-0.422000271f,
+    (float16_t)0.903989293f, (float16_t)-0.427555093f,
+    (float16_t)0.901348847f, (float16_t)-0.433093819f,
+    (float16_t)0.898674466f, (float16_t)-0.438616239f,
+    (float16_t)0.895966250f, (float16_t)-0.444122145f,
+    (float16_t)0.893224301f, (float16_t)-0.449611330f,
+    (float16_t)0.890448723f, (float16_t)-0.455083587f,
+    (float16_t)0.887639620f, (float16_t)-0.460538711f,
+    (float16_t)0.884797098f, (float16_t)-0.465976496f,
+    (float16_t)0.881921264f, (float16_t)-0.471396737f,
+    (float16_t)0.879012226f, (float16_t)-0.476799230f,
+    (float16_t)0.876070094f, (float16_t)-0.482183772f,
+    (float16_t)0.873094978f, (float16_t)-0.487550160f,
+    (float16_t)0.870086991f, (float16_t)-0.492898192f,
+    (float16_t)0.867046246f, (float16_t)-0.498227667f,
+    (float16_t)0.863972856f, (float16_t)-0.503538384f,
+    (float16_t)0.860866939f, (float16_t)-0.508830143f,
+    (float16_t)0.857728610f, (float16_t)-0.514102744f,
+    (float16_t)0.854557988f, (float16_t)-0.519355990f,
+    (float16_t)0.851355193f, (float16_t)-0.524589683f,
+    (float16_t)0.848120345f, (float16_t)-0.529803625f,
+    (float16_t)0.844853565f, (float16_t)-0.534997620f,
+    (float16_t)0.841554977f, (float16_t)-0.540171473f,
+    (float16_t)0.838224706f, (float16_t)-0.545324988f,
+    (float16_t)0.834862875f, (float16_t)-0.550457973f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.828045045f, (float16_t)-0.560661576f,
+    (float16_t)0.824589303f, (float16_t)-0.565731811f,
+    (float16_t)0.821102515f, (float16_t)-0.570780746f,
+    (float16_t)0.817584813f, (float16_t)-0.575808191f,
+    (float16_t)0.814036330f, (float16_t)-0.580813958f,
+    (float16_t)0.810457198f, (float16_t)-0.585797857f,
+    (float16_t)0.806847554f, (float16_t)-0.590759702f,
+    (float16_t)0.803207531f, (float16_t)-0.595699304f,
+    (float16_t)0.799537269f, (float16_t)-0.600616479f,
+    (float16_t)0.795836905f, (float16_t)-0.605511041f,
+    (float16_t)0.792106577f, (float16_t)-0.610382806f,
+    (float16_t)0.788346428f, (float16_t)-0.615231591f,
+    (float16_t)0.784556597f, (float16_t)-0.620057212f,
+    (float16_t)0.780737229f, (float16_t)-0.624859488f,
+    (float16_t)0.776888466f, (float16_t)-0.629638239f,
+    (float16_t)0.773010453f, (float16_t)-0.634393284f,
+    (float16_t)0.769103338f, (float16_t)-0.639124445f,
+    (float16_t)0.765167266f, (float16_t)-0.643831543f,
+    (float16_t)0.761202385f, (float16_t)-0.648514401f,
+    (float16_t)0.757208847f, (float16_t)-0.653172843f,
+    (float16_t)0.753186799f, (float16_t)-0.657806693f,
+    (float16_t)0.749136395f, (float16_t)-0.662415778f,
+    (float16_t)0.745057785f, (float16_t)-0.666999922f,
+    (float16_t)0.740951125f, (float16_t)-0.671558955f,
+    (float16_t)0.736816569f, (float16_t)-0.676092704f,
+    (float16_t)0.732654272f, (float16_t)-0.680600998f,
+    (float16_t)0.728464390f, (float16_t)-0.685083668f,
+    (float16_t)0.724247083f, (float16_t)-0.689540545f,
+    (float16_t)0.720002508f, (float16_t)-0.693971461f,
+    (float16_t)0.715730825f, (float16_t)-0.698376249f,
+    (float16_t)0.711432196f, (float16_t)-0.702754744f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.702754744f, (float16_t)-0.711432196f,
+    (float16_t)0.698376249f, (float16_t)-0.715730825f,
+    (float16_t)0.693971461f, (float16_t)-0.720002508f,
+    (float16_t)0.689540545f, (float16_t)-0.724247083f,
+    (float16_t)0.685083668f, (float16_t)-0.728464390f,
+    (float16_t)0.680600998f, (float16_t)-0.732654272f,
+    (float16_t)0.676092704f, (float16_t)-0.736816569f,
+    (float16_t)0.671558955f, (float16_t)-0.740951125f,
+    (float16_t)0.666999922f, (float16_t)-0.745057785f,
+    (float16_t)0.662415778f, (float16_t)-0.749136395f,
+    (float16_t)0.657806693f, (float16_t)-0.753186799f,
+    (float16_t)0.653172843f, (float16_t)-0.757208847f,
+    (float16_t)0.648514401f, (float16_t)-0.761202385f,
+    (float16_t)0.643831543f, (float16_t)-0.765167266f,
+    (float16_t)0.639124445f, (float16_t)-0.769103338f,
+    (float16_t)0.634393284f, (float16_t)-0.773010453f,
+    (float16_t)0.629638239f, (float16_t)-0.776888466f,
+    (float16_t)0.624859488f, (float16_t)-0.780737229f,
+    (float16_t)0.620057212f, (float16_t)-0.784556597f,
+    (float16_t)0.615231591f, (float16_t)-0.788346428f,
+    (float16_t)0.610382806f, (float16_t)-0.792106577f,
+    (float16_t)0.605511041f, (float16_t)-0.795836905f,
+    (float16_t)0.600616479f, (float16_t)-0.799537269f,
+    (float16_t)0.595699304f, (float16_t)-0.803207531f,
+    (float16_t)0.590759702f, (float16_t)-0.806847554f,
+    (float16_t)0.585797857f, (float16_t)-0.810457198f,
+    (float16_t)0.580813958f, (float16_t)-0.814036330f,
+    (float16_t)0.575808191f, (float16_t)-0.817584813f,
+    (float16_t)0.570780746f, (float16_t)-0.821102515f,
+    (float16_t)0.565731811f, (float16_t)-0.824589303f,
+    (float16_t)0.560661576f, (float16_t)-0.828045045f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.550457973f, (float16_t)-0.834862875f,
+    (float16_t)0.545324988f, (float16_t)-0.838224706f,
+    (float16_t)0.540171473f, (float16_t)-0.841554977f,
+    (float16_t)0.534997620f, (float16_t)-0.844853565f,
+    (float16_t)0.529803625f, (float16_t)-0.848120345f,
+    (float16_t)0.524589683f, (float16_t)-0.851355193f,
+    (float16_t)0.519355990f, (float16_t)-0.854557988f,
+    (float16_t)0.514102744f, (float16_t)-0.857728610f,
+    (float16_t)0.508830143f, (float16_t)-0.860866939f,
+    (float16_t)0.503538384f, (float16_t)-0.863972856f,
+    (float16_t)0.498227667f, (float16_t)-0.867046246f,
+    (float16_t)0.492898192f, (float16_t)-0.870086991f,
+    (float16_t)0.487550160f, (float16_t)-0.873094978f,
+    (float16_t)0.482183772f, (float16_t)-0.876070094f,
+    (float16_t)0.476799230f, (float16_t)-0.879012226f,
+    (float16_t)0.471396737f, (float16_t)-0.881921264f,
+    (float16_t)0.465976496f, (float16_t)-0.884797098f,
+    (float16_t)0.460538711f, (float16_t)-0.887639620f,
+    (float16_t)0.455083587f, (float16_t)-0.890448723f,
+    (float16_t)0.449611330f, (float16_t)-0.893224301f,
+    (float16_t)0.444122145f, (float16_t)-0.895966250f,
+    (float16_t)0.438616239f, (float16_t)-0.898674466f,
+    (float16_t)0.433093819f, (float16_t)-0.901348847f,
+    (float16_t)0.427555093f, (float16_t)-0.903989293f,
+    (float16_t)0.422000271f, (float16_t)-0.906595705f,
+    (float16_t)0.416429560f, (float16_t)-0.909167983f,
+    (float16_t)0.410843171f, (float16_t)-0.911706032f,
+    (float16_t)0.405241314f, (float16_t)-0.914209756f,
+    (float16_t)0.399624200f, (float16_t)-0.916679060f,
+    (float16_t)0.393992040f, (float16_t)-0.919113852f,
+    (float16_t)0.388345047f, (float16_t)-0.921514039f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.377007410f, (float16_t)-0.926210242f,
+    (float16_t)0.371317194f, (float16_t)-0.928506080f,
+    (float16_t)0.365612998f, (float16_t)-0.930766961f,
+    (float16_t)0.359895037f, (float16_t)-0.932992799f,
+    (float16_t)0.354163525f, (float16_t)-0.935183510f,
+    (float16_t)0.348418680f, (float16_t)-0.937339012f,
+    (float16_t)0.342660717f, (float16_t)-0.939459224f,
+    (float16_t)0.336889853f, (float16_t)-0.941544065f,
+    (float16_t)0.331106306f, (float16_t)-0.943593458f,
+    (float16_t)0.325310292f, (float16_t)-0.945607325f,
+    (float16_t)0.319502031f, (float16_t)-0.947585591f,
+    (float16_t)0.313681740f, (float16_t)-0.949528181f,
+    (float16_t)0.307849640f, (float16_t)-0.951435021f,
+    (float16_t)0.302005949f, (float16_t)-0.953306040f,
+    (float16_t)0.296150888f, (float16_t)-0.955141168f,
+    (float16_t)0.290284677f, (float16_t)-0.956940336f,
+    (float16_t)0.284407537f, (float16_t)-0.958703475f,
+    (float16_t)0.278519689f, (float16_t)-0.960430519f,
+    (float16_t)0.272621355f, (float16_t)-0.962121404f,
+    (float16_t)0.266712757f, (float16_t)-0.963776066f,
+    (float16_t)0.260794118f, (float16_t)-0.965394442f,
+    (float16_t)0.254865660f, (float16_t)-0.966976471f,
+    (float16_t)0.248927606f, (float16_t)-0.968522094f,
+    (float16_t)0.242980180f, (float16_t)-0.970031253f,
+    (float16_t)0.237023606f, (float16_t)-0.971503891f,
+    (float16_t)0.231058108f, (float16_t)-0.972939952f,
+    (float16_t)0.225083911f, (float16_t)-0.974339383f,
+    (float16_t)0.219101240f, (float16_t)-0.975702130f,
+    (float16_t)0.213110320f, (float16_t)-0.977028143f,
+    (float16_t)0.207111376f, (float16_t)-0.978317371f,
+    (float16_t)0.201104635f, (float16_t)-0.979569766f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f,
+    (float16_t)0.189068664f, (float16_t)-0.981963869f,
+    (float16_t)0.183039888f, (float16_t)-0.983105487f,
+    (float16_t)0.177004220f, (float16_t)-0.984210092f,
+    (float16_t)0.170961889f, (float16_t)-0.985277642f,
+    (float16_t)0.164913120f, (float16_t)-0.986308097f,
+    (float16_t)0.158858143f, (float16_t)-0.987301418f,
+    (float16_t)0.152797185f, (float16_t)-0.988257568f,
+    (float16_t)0.146730474f, (float16_t)-0.989176510f,
+    (float16_t)0.140658239f, (float16_t)-0.990058210f,
+    (float16_t)0.134580709f, (float16_t)-0.990902635f,
+    (float16_t)0.128498111f, (float16_t)-0.991709754f,
+    (float16_t)0.122410675f, (float16_t)-0.992479535f,
+    (float16_t)0.116318631f, (float16_t)-0.993211949f,
+    (float16_t)0.110222207f, (float16_t)-0.993906970f,
+    (float16_t)0.104121634f, (float16_t)-0.994564571f,
+    (float16_t)0.098017140f, (float16_t)-0.995184727f,
+    (float16_t)0.091908956f, (float16_t)-0.995767414f,
+    (float16_t)0.085797312f, (float16_t)-0.996312612f,
+    (float16_t)0.079682438f, (float16_t)-0.996820299f,
+    (float16_t)0.073564564f, (float16_t)-0.997290457f,
+    (float16_t)0.067443920f, (float16_t)-0.997723067f,
+    (float16_t)0.061320736f, (float16_t)-0.998118113f,
+    (float16_t)0.055195244f, (float16_t)-0.998475581f,
+    (float16_t)0.049067674f, (float16_t)-0.998795456f,
+    (float16_t)0.042938257f, (float16_t)-0.999077728f,
+    (float16_t)0.036807223f, (float16_t)-0.999322385f,
+    (float16_t)0.030674803f, (float16_t)-0.999529418f,
+    (float16_t)0.024541229f, (float16_t)-0.999698819f,
+    (float16_t)0.018406730f, (float16_t)-0.999830582f,
+    (float16_t)0.012271538f, (float16_t)-0.999924702f,
+    (float16_t)0.006135885f, (float16_t)-0.999981175f
+};
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_1024) */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_2048)
+const float16_t twiddleCoefF16_rfft_2048[2048] = {
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+    (float16_t)0.003067957f,  (float16_t)0.999995294f,
+    (float16_t)0.006135885f,  (float16_t)0.999981175f,
+    (float16_t)0.009203755f,  (float16_t)0.999957645f,
+    (float16_t)0.012271538f,  (float16_t)0.999924702f,
+    (float16_t)0.015339206f,  (float16_t)0.999882347f,
+    (float16_t)0.018406730f,  (float16_t)0.999830582f,
+    (float16_t)0.021474080f,  (float16_t)0.999769405f,
+    (float16_t)0.024541229f,  (float16_t)0.999698819f,
+    (float16_t)0.027608146f,  (float16_t)0.999618822f,
+    (float16_t)0.030674803f,  (float16_t)0.999529418f,
+    (float16_t)0.033741172f,  (float16_t)0.999430605f,
+    (float16_t)0.036807223f,  (float16_t)0.999322385f,
+    (float16_t)0.039872928f,  (float16_t)0.999204759f,
+    (float16_t)0.042938257f,  (float16_t)0.999077728f,
+    (float16_t)0.046003182f,  (float16_t)0.998941293f,
+    (float16_t)0.049067674f,  (float16_t)0.998795456f,
+    (float16_t)0.052131705f,  (float16_t)0.998640218f,
+    (float16_t)0.055195244f,  (float16_t)0.998475581f,
+    (float16_t)0.058258265f,  (float16_t)0.998301545f,
+    (float16_t)0.061320736f,  (float16_t)0.998118113f,
+    (float16_t)0.064382631f,  (float16_t)0.997925286f,
+    (float16_t)0.067443920f,  (float16_t)0.997723067f,
+    (float16_t)0.070504573f,  (float16_t)0.997511456f,
+    (float16_t)0.073564564f,  (float16_t)0.997290457f,
+    (float16_t)0.076623861f,  (float16_t)0.997060070f,
+    (float16_t)0.079682438f,  (float16_t)0.996820299f,
+    (float16_t)0.082740265f,  (float16_t)0.996571146f,
+    (float16_t)0.085797312f,  (float16_t)0.996312612f,
+    (float16_t)0.088853553f,  (float16_t)0.996044701f,
+    (float16_t)0.091908956f,  (float16_t)0.995767414f,
+    (float16_t)0.094963495f,  (float16_t)0.995480755f,
+    (float16_t)0.098017140f,  (float16_t)0.995184727f,
+    (float16_t)0.101069863f,  (float16_t)0.994879331f,
+    (float16_t)0.104121634f,  (float16_t)0.994564571f,
+    (float16_t)0.107172425f,  (float16_t)0.994240449f,
+    (float16_t)0.110222207f,  (float16_t)0.993906970f,
+    (float16_t)0.113270952f,  (float16_t)0.993564136f,
+    (float16_t)0.116318631f,  (float16_t)0.993211949f,
+    (float16_t)0.119365215f,  (float16_t)0.992850414f,
+    (float16_t)0.122410675f,  (float16_t)0.992479535f,
+    (float16_t)0.125454983f,  (float16_t)0.992099313f,
+    (float16_t)0.128498111f,  (float16_t)0.991709754f,
+    (float16_t)0.131540029f,  (float16_t)0.991310860f,
+    (float16_t)0.134580709f,  (float16_t)0.990902635f,
+    (float16_t)0.137620122f,  (float16_t)0.990485084f,
+    (float16_t)0.140658239f,  (float16_t)0.990058210f,
+    (float16_t)0.143695033f,  (float16_t)0.989622017f,
+    (float16_t)0.146730474f,  (float16_t)0.989176510f,
+    (float16_t)0.149764535f,  (float16_t)0.988721692f,
+    (float16_t)0.152797185f,  (float16_t)0.988257568f,
+    (float16_t)0.155828398f,  (float16_t)0.987784142f,
+    (float16_t)0.158858143f,  (float16_t)0.987301418f,
+    (float16_t)0.161886394f,  (float16_t)0.986809402f,
+    (float16_t)0.164913120f,  (float16_t)0.986308097f,
+    (float16_t)0.167938295f,  (float16_t)0.985797509f,
+    (float16_t)0.170961889f,  (float16_t)0.985277642f,
+    (float16_t)0.173983873f,  (float16_t)0.984748502f,
+    (float16_t)0.177004220f,  (float16_t)0.984210092f,
+    (float16_t)0.180022901f,  (float16_t)0.983662419f,
+    (float16_t)0.183039888f,  (float16_t)0.983105487f,
+    (float16_t)0.186055152f,  (float16_t)0.982539302f,
+    (float16_t)0.189068664f,  (float16_t)0.981963869f,
+    (float16_t)0.192080397f,  (float16_t)0.981379193f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.198098411f,  (float16_t)0.980182136f,
+    (float16_t)0.201104635f,  (float16_t)0.979569766f,
+    (float16_t)0.204108966f,  (float16_t)0.978948175f,
+    (float16_t)0.207111376f,  (float16_t)0.978317371f,
+    (float16_t)0.210111837f,  (float16_t)0.977677358f,
+    (float16_t)0.213110320f,  (float16_t)0.977028143f,
+    (float16_t)0.216106797f,  (float16_t)0.976369731f,
+    (float16_t)0.219101240f,  (float16_t)0.975702130f,
+    (float16_t)0.222093621f,  (float16_t)0.975025345f,
+    (float16_t)0.225083911f,  (float16_t)0.974339383f,
+    (float16_t)0.228072083f,  (float16_t)0.973644250f,
+    (float16_t)0.231058108f,  (float16_t)0.972939952f,
+    (float16_t)0.234041959f,  (float16_t)0.972226497f,
+    (float16_t)0.237023606f,  (float16_t)0.971503891f,
+    (float16_t)0.240003022f,  (float16_t)0.970772141f,
+    (float16_t)0.242980180f,  (float16_t)0.970031253f,
+    (float16_t)0.245955050f,  (float16_t)0.969281235f,
+    (float16_t)0.248927606f,  (float16_t)0.968522094f,
+    (float16_t)0.251897818f,  (float16_t)0.967753837f,
+    (float16_t)0.254865660f,  (float16_t)0.966976471f,
+    (float16_t)0.257831102f,  (float16_t)0.966190003f,
+    (float16_t)0.260794118f,  (float16_t)0.965394442f,
+    (float16_t)0.263754679f,  (float16_t)0.964589793f,
+    (float16_t)0.266712757f,  (float16_t)0.963776066f,
+    (float16_t)0.269668326f,  (float16_t)0.962953267f,
+    (float16_t)0.272621355f,  (float16_t)0.962121404f,
+    (float16_t)0.275571819f,  (float16_t)0.961280486f,
+    (float16_t)0.278519689f,  (float16_t)0.960430519f,
+    (float16_t)0.281464938f,  (float16_t)0.959571513f,
+    (float16_t)0.284407537f,  (float16_t)0.958703475f,
+    (float16_t)0.287347460f,  (float16_t)0.957826413f,
+    (float16_t)0.290284677f,  (float16_t)0.956940336f,
+    (float16_t)0.293219163f,  (float16_t)0.956045251f,
+    (float16_t)0.296150888f,  (float16_t)0.955141168f,
+    (float16_t)0.299079826f,  (float16_t)0.954228095f,
+    (float16_t)0.302005949f,  (float16_t)0.953306040f,
+    (float16_t)0.304929230f,  (float16_t)0.952375013f,
+    (float16_t)0.307849640f,  (float16_t)0.951435021f,
+    (float16_t)0.310767153f,  (float16_t)0.950486074f,
+    (float16_t)0.313681740f,  (float16_t)0.949528181f,
+    (float16_t)0.316593376f,  (float16_t)0.948561350f,
+    (float16_t)0.319502031f,  (float16_t)0.947585591f,
+    (float16_t)0.322407679f,  (float16_t)0.946600913f,
+    (float16_t)0.325310292f,  (float16_t)0.945607325f,
+    (float16_t)0.328209844f,  (float16_t)0.944604837f,
+    (float16_t)0.331106306f,  (float16_t)0.943593458f,
+    (float16_t)0.333999651f,  (float16_t)0.942573198f,
+    (float16_t)0.336889853f,  (float16_t)0.941544065f,
+    (float16_t)0.339776884f,  (float16_t)0.940506071f,
+    (float16_t)0.342660717f,  (float16_t)0.939459224f,
+    (float16_t)0.345541325f,  (float16_t)0.938403534f,
+    (float16_t)0.348418680f,  (float16_t)0.937339012f,
+    (float16_t)0.351292756f,  (float16_t)0.936265667f,
+    (float16_t)0.354163525f,  (float16_t)0.935183510f,
+    (float16_t)0.357030961f,  (float16_t)0.934092550f,
+    (float16_t)0.359895037f,  (float16_t)0.932992799f,
+    (float16_t)0.362755724f,  (float16_t)0.931884266f,
+    (float16_t)0.365612998f,  (float16_t)0.930766961f,
+    (float16_t)0.368466830f,  (float16_t)0.929640896f,
+    (float16_t)0.371317194f,  (float16_t)0.928506080f,
+    (float16_t)0.374164063f,  (float16_t)0.927362526f,
+    (float16_t)0.377007410f,  (float16_t)0.926210242f,
+    (float16_t)0.379847209f,  (float16_t)0.925049241f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.385516054f,  (float16_t)0.922701128f,
+    (float16_t)0.388345047f,  (float16_t)0.921514039f,
+    (float16_t)0.391170384f,  (float16_t)0.920318277f,
+    (float16_t)0.393992040f,  (float16_t)0.919113852f,
+    (float16_t)0.396809987f,  (float16_t)0.917900776f,
+    (float16_t)0.399624200f,  (float16_t)0.916679060f,
+    (float16_t)0.402434651f,  (float16_t)0.915448716f,
+    (float16_t)0.405241314f,  (float16_t)0.914209756f,
+    (float16_t)0.408044163f,  (float16_t)0.912962190f,
+    (float16_t)0.410843171f,  (float16_t)0.911706032f,
+    (float16_t)0.413638312f,  (float16_t)0.910441292f,
+    (float16_t)0.416429560f,  (float16_t)0.909167983f,
+    (float16_t)0.419216888f,  (float16_t)0.907886116f,
+    (float16_t)0.422000271f,  (float16_t)0.906595705f,
+    (float16_t)0.424779681f,  (float16_t)0.905296759f,
+    (float16_t)0.427555093f,  (float16_t)0.903989293f,
+    (float16_t)0.430326481f,  (float16_t)0.902673318f,
+    (float16_t)0.433093819f,  (float16_t)0.901348847f,
+    (float16_t)0.435857080f,  (float16_t)0.900015892f,
+    (float16_t)0.438616239f,  (float16_t)0.898674466f,
+    (float16_t)0.441371269f,  (float16_t)0.897324581f,
+    (float16_t)0.444122145f,  (float16_t)0.895966250f,
+    (float16_t)0.446868840f,  (float16_t)0.894599486f,
+    (float16_t)0.449611330f,  (float16_t)0.893224301f,
+    (float16_t)0.452349587f,  (float16_t)0.891840709f,
+    (float16_t)0.455083587f,  (float16_t)0.890448723f,
+    (float16_t)0.457813304f,  (float16_t)0.889048356f,
+    (float16_t)0.460538711f,  (float16_t)0.887639620f,
+    (float16_t)0.463259784f,  (float16_t)0.886222530f,
+    (float16_t)0.465976496f,  (float16_t)0.884797098f,
+    (float16_t)0.468688822f,  (float16_t)0.883363339f,
+    (float16_t)0.471396737f,  (float16_t)0.881921264f,
+    (float16_t)0.474100215f,  (float16_t)0.880470889f,
+    (float16_t)0.476799230f,  (float16_t)0.879012226f,
+    (float16_t)0.479493758f,  (float16_t)0.877545290f,
+    (float16_t)0.482183772f,  (float16_t)0.876070094f,
+    (float16_t)0.484869248f,  (float16_t)0.874586652f,
+    (float16_t)0.487550160f,  (float16_t)0.873094978f,
+    (float16_t)0.490226483f,  (float16_t)0.871595087f,
+    (float16_t)0.492898192f,  (float16_t)0.870086991f,
+    (float16_t)0.495565262f,  (float16_t)0.868570706f,
+    (float16_t)0.498227667f,  (float16_t)0.867046246f,
+    (float16_t)0.500885383f,  (float16_t)0.865513624f,
+    (float16_t)0.503538384f,  (float16_t)0.863972856f,
+    (float16_t)0.506186645f,  (float16_t)0.862423956f,
+    (float16_t)0.508830143f,  (float16_t)0.860866939f,
+    (float16_t)0.511468850f,  (float16_t)0.859301818f,
+    (float16_t)0.514102744f,  (float16_t)0.857728610f,
+    (float16_t)0.516731799f,  (float16_t)0.856147328f,
+    (float16_t)0.519355990f,  (float16_t)0.854557988f,
+    (float16_t)0.521975293f,  (float16_t)0.852960605f,
+    (float16_t)0.524589683f,  (float16_t)0.851355193f,
+    (float16_t)0.527199135f,  (float16_t)0.849741768f,
+    (float16_t)0.529803625f,  (float16_t)0.848120345f,
+    (float16_t)0.532403128f,  (float16_t)0.846490939f,
+    (float16_t)0.534997620f,  (float16_t)0.844853565f,
+    (float16_t)0.537587076f,  (float16_t)0.843208240f,
+    (float16_t)0.540171473f,  (float16_t)0.841554977f,
+    (float16_t)0.542750785f,  (float16_t)0.839893794f,
+    (float16_t)0.545324988f,  (float16_t)0.838224706f,
+    (float16_t)0.547894059f,  (float16_t)0.836547727f,
+    (float16_t)0.550457973f,  (float16_t)0.834862875f,
+    (float16_t)0.553016706f,  (float16_t)0.833170165f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.558118531f,  (float16_t)0.829761234f,
+    (float16_t)0.560661576f,  (float16_t)0.828045045f,
+    (float16_t)0.563199344f,  (float16_t)0.826321063f,
+    (float16_t)0.565731811f,  (float16_t)0.824589303f,
+    (float16_t)0.568258953f,  (float16_t)0.822849781f,
+    (float16_t)0.570780746f,  (float16_t)0.821102515f,
+    (float16_t)0.573297167f,  (float16_t)0.819347520f,
+    (float16_t)0.575808191f,  (float16_t)0.817584813f,
+    (float16_t)0.578313796f,  (float16_t)0.815814411f,
+    (float16_t)0.580813958f,  (float16_t)0.814036330f,
+    (float16_t)0.583308653f,  (float16_t)0.812250587f,
+    (float16_t)0.585797857f,  (float16_t)0.810457198f,
+    (float16_t)0.588281548f,  (float16_t)0.808656182f,
+    (float16_t)0.590759702f,  (float16_t)0.806847554f,
+    (float16_t)0.593232295f,  (float16_t)0.805031331f,
+    (float16_t)0.595699304f,  (float16_t)0.803207531f,
+    (float16_t)0.598160707f,  (float16_t)0.801376172f,
+    (float16_t)0.600616479f,  (float16_t)0.799537269f,
+    (float16_t)0.603066599f,  (float16_t)0.797690841f,
+    (float16_t)0.605511041f,  (float16_t)0.795836905f,
+    (float16_t)0.607949785f,  (float16_t)0.793975478f,
+    (float16_t)0.610382806f,  (float16_t)0.792106577f,
+    (float16_t)0.612810082f,  (float16_t)0.790230221f,
+    (float16_t)0.615231591f,  (float16_t)0.788346428f,
+    (float16_t)0.617647308f,  (float16_t)0.786455214f,
+    (float16_t)0.620057212f,  (float16_t)0.784556597f,
+    (float16_t)0.622461279f,  (float16_t)0.782650596f,
+    (float16_t)0.624859488f,  (float16_t)0.780737229f,
+    (float16_t)0.627251815f,  (float16_t)0.778816512f,
+    (float16_t)0.629638239f,  (float16_t)0.776888466f,
+    (float16_t)0.632018736f,  (float16_t)0.774953107f,
+    (float16_t)0.634393284f,  (float16_t)0.773010453f,
+    (float16_t)0.636761861f,  (float16_t)0.771060524f,
+    (float16_t)0.639124445f,  (float16_t)0.769103338f,
+    (float16_t)0.641481013f,  (float16_t)0.767138912f,
+    (float16_t)0.643831543f,  (float16_t)0.765167266f,
+    (float16_t)0.646176013f,  (float16_t)0.763188417f,
+    (float16_t)0.648514401f,  (float16_t)0.761202385f,
+    (float16_t)0.650846685f,  (float16_t)0.759209189f,
+    (float16_t)0.653172843f,  (float16_t)0.757208847f,
+    (float16_t)0.655492853f,  (float16_t)0.755201377f,
+    (float16_t)0.657806693f,  (float16_t)0.753186799f,
+    (float16_t)0.660114342f,  (float16_t)0.751165132f,
+    (float16_t)0.662415778f,  (float16_t)0.749136395f,
+    (float16_t)0.664710978f,  (float16_t)0.747100606f,
+    (float16_t)0.666999922f,  (float16_t)0.745057785f,
+    (float16_t)0.669282588f,  (float16_t)0.743007952f,
+    (float16_t)0.671558955f,  (float16_t)0.740951125f,
+    (float16_t)0.673829000f,  (float16_t)0.738887324f,
+    (float16_t)0.676092704f,  (float16_t)0.736816569f,
+    (float16_t)0.678350043f,  (float16_t)0.734738878f,
+    (float16_t)0.680600998f,  (float16_t)0.732654272f,
+    (float16_t)0.682845546f,  (float16_t)0.730562769f,
+    (float16_t)0.685083668f,  (float16_t)0.728464390f,
+    (float16_t)0.687315341f,  (float16_t)0.726359155f,
+    (float16_t)0.689540545f,  (float16_t)0.724247083f,
+    (float16_t)0.691759258f,  (float16_t)0.722128194f,
+    (float16_t)0.693971461f,  (float16_t)0.720002508f,
+    (float16_t)0.696177131f,  (float16_t)0.717870045f,
+    (float16_t)0.698376249f,  (float16_t)0.715730825f,
+    (float16_t)0.700568794f,  (float16_t)0.713584869f,
+    (float16_t)0.702754744f,  (float16_t)0.711432196f,
+    (float16_t)0.704934080f,  (float16_t)0.709272826f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.709272826f,  (float16_t)0.704934080f,
+    (float16_t)0.711432196f,  (float16_t)0.702754744f,
+    (float16_t)0.713584869f,  (float16_t)0.700568794f,
+    (float16_t)0.715730825f,  (float16_t)0.698376249f,
+    (float16_t)0.717870045f,  (float16_t)0.696177131f,
+    (float16_t)0.720002508f,  (float16_t)0.693971461f,
+    (float16_t)0.722128194f,  (float16_t)0.691759258f,
+    (float16_t)0.724247083f,  (float16_t)0.689540545f,
+    (float16_t)0.726359155f,  (float16_t)0.687315341f,
+    (float16_t)0.728464390f,  (float16_t)0.685083668f,
+    (float16_t)0.730562769f,  (float16_t)0.682845546f,
+    (float16_t)0.732654272f,  (float16_t)0.680600998f,
+    (float16_t)0.734738878f,  (float16_t)0.678350043f,
+    (float16_t)0.736816569f,  (float16_t)0.676092704f,
+    (float16_t)0.738887324f,  (float16_t)0.673829000f,
+    (float16_t)0.740951125f,  (float16_t)0.671558955f,
+    (float16_t)0.743007952f,  (float16_t)0.669282588f,
+    (float16_t)0.745057785f,  (float16_t)0.666999922f,
+    (float16_t)0.747100606f,  (float16_t)0.664710978f,
+    (float16_t)0.749136395f,  (float16_t)0.662415778f,
+    (float16_t)0.751165132f,  (float16_t)0.660114342f,
+    (float16_t)0.753186799f,  (float16_t)0.657806693f,
+    (float16_t)0.755201377f,  (float16_t)0.655492853f,
+    (float16_t)0.757208847f,  (float16_t)0.653172843f,
+    (float16_t)0.759209189f,  (float16_t)0.650846685f,
+    (float16_t)0.761202385f,  (float16_t)0.648514401f,
+    (float16_t)0.763188417f,  (float16_t)0.646176013f,
+    (float16_t)0.765167266f,  (float16_t)0.643831543f,
+    (float16_t)0.767138912f,  (float16_t)0.641481013f,
+    (float16_t)0.769103338f,  (float16_t)0.639124445f,
+    (float16_t)0.771060524f,  (float16_t)0.636761861f,
+    (float16_t)0.773010453f,  (float16_t)0.634393284f,
+    (float16_t)0.774953107f,  (float16_t)0.632018736f,
+    (float16_t)0.776888466f,  (float16_t)0.629638239f,
+    (float16_t)0.778816512f,  (float16_t)0.627251815f,
+    (float16_t)0.780737229f,  (float16_t)0.624859488f,
+    (float16_t)0.782650596f,  (float16_t)0.622461279f,
+    (float16_t)0.784556597f,  (float16_t)0.620057212f,
+    (float16_t)0.786455214f,  (float16_t)0.617647308f,
+    (float16_t)0.788346428f,  (float16_t)0.615231591f,
+    (float16_t)0.790230221f,  (float16_t)0.612810082f,
+    (float16_t)0.792106577f,  (float16_t)0.610382806f,
+    (float16_t)0.793975478f,  (float16_t)0.607949785f,
+    (float16_t)0.795836905f,  (float16_t)0.605511041f,
+    (float16_t)0.797690841f,  (float16_t)0.603066599f,
+    (float16_t)0.799537269f,  (float16_t)0.600616479f,
+    (float16_t)0.801376172f,  (float16_t)0.598160707f,
+    (float16_t)0.803207531f,  (float16_t)0.595699304f,
+    (float16_t)0.805031331f,  (float16_t)0.593232295f,
+    (float16_t)0.806847554f,  (float16_t)0.590759702f,
+    (float16_t)0.808656182f,  (float16_t)0.588281548f,
+    (float16_t)0.810457198f,  (float16_t)0.585797857f,
+    (float16_t)0.812250587f,  (float16_t)0.583308653f,
+    (float16_t)0.814036330f,  (float16_t)0.580813958f,
+    (float16_t)0.815814411f,  (float16_t)0.578313796f,
+    (float16_t)0.817584813f,  (float16_t)0.575808191f,
+    (float16_t)0.819347520f,  (float16_t)0.573297167f,
+    (float16_t)0.821102515f,  (float16_t)0.570780746f,
+    (float16_t)0.822849781f,  (float16_t)0.568258953f,
+    (float16_t)0.824589303f,  (float16_t)0.565731811f,
+    (float16_t)0.826321063f,  (float16_t)0.563199344f,
+    (float16_t)0.828045045f,  (float16_t)0.560661576f,
+    (float16_t)0.829761234f,  (float16_t)0.558118531f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.833170165f,  (float16_t)0.553016706f,
+    (float16_t)0.834862875f,  (float16_t)0.550457973f,
+    (float16_t)0.836547727f,  (float16_t)0.547894059f,
+    (float16_t)0.838224706f,  (float16_t)0.545324988f,
+    (float16_t)0.839893794f,  (float16_t)0.542750785f,
+    (float16_t)0.841554977f,  (float16_t)0.540171473f,
+    (float16_t)0.843208240f,  (float16_t)0.537587076f,
+    (float16_t)0.844853565f,  (float16_t)0.534997620f,
+    (float16_t)0.846490939f,  (float16_t)0.532403128f,
+    (float16_t)0.848120345f,  (float16_t)0.529803625f,
+    (float16_t)0.849741768f,  (float16_t)0.527199135f,
+    (float16_t)0.851355193f,  (float16_t)0.524589683f,
+    (float16_t)0.852960605f,  (float16_t)0.521975293f,
+    (float16_t)0.854557988f,  (float16_t)0.519355990f,
+    (float16_t)0.856147328f,  (float16_t)0.516731799f,
+    (float16_t)0.857728610f,  (float16_t)0.514102744f,
+    (float16_t)0.859301818f,  (float16_t)0.511468850f,
+    (float16_t)0.860866939f,  (float16_t)0.508830143f,
+    (float16_t)0.862423956f,  (float16_t)0.506186645f,
+    (float16_t)0.863972856f,  (float16_t)0.503538384f,
+    (float16_t)0.865513624f,  (float16_t)0.500885383f,
+    (float16_t)0.867046246f,  (float16_t)0.498227667f,
+    (float16_t)0.868570706f,  (float16_t)0.495565262f,
+    (float16_t)0.870086991f,  (float16_t)0.492898192f,
+    (float16_t)0.871595087f,  (float16_t)0.490226483f,
+    (float16_t)0.873094978f,  (float16_t)0.487550160f,
+    (float16_t)0.874586652f,  (float16_t)0.484869248f,
+    (float16_t)0.876070094f,  (float16_t)0.482183772f,
+    (float16_t)0.877545290f,  (float16_t)0.479493758f,
+    (float16_t)0.879012226f,  (float16_t)0.476799230f,
+    (float16_t)0.880470889f,  (float16_t)0.474100215f,
+    (float16_t)0.881921264f,  (float16_t)0.471396737f,
+    (float16_t)0.883363339f,  (float16_t)0.468688822f,
+    (float16_t)0.884797098f,  (float16_t)0.465976496f,
+    (float16_t)0.886222530f,  (float16_t)0.463259784f,
+    (float16_t)0.887639620f,  (float16_t)0.460538711f,
+    (float16_t)0.889048356f,  (float16_t)0.457813304f,
+    (float16_t)0.890448723f,  (float16_t)0.455083587f,
+    (float16_t)0.891840709f,  (float16_t)0.452349587f,
+    (float16_t)0.893224301f,  (float16_t)0.449611330f,
+    (float16_t)0.894599486f,  (float16_t)0.446868840f,
+    (float16_t)0.895966250f,  (float16_t)0.444122145f,
+    (float16_t)0.897324581f,  (float16_t)0.441371269f,
+    (float16_t)0.898674466f,  (float16_t)0.438616239f,
+    (float16_t)0.900015892f,  (float16_t)0.435857080f,
+    (float16_t)0.901348847f,  (float16_t)0.433093819f,
+    (float16_t)0.902673318f,  (float16_t)0.430326481f,
+    (float16_t)0.903989293f,  (float16_t)0.427555093f,
+    (float16_t)0.905296759f,  (float16_t)0.424779681f,
+    (float16_t)0.906595705f,  (float16_t)0.422000271f,
+    (float16_t)0.907886116f,  (float16_t)0.419216888f,
+    (float16_t)0.909167983f,  (float16_t)0.416429560f,
+    (float16_t)0.910441292f,  (float16_t)0.413638312f,
+    (float16_t)0.911706032f,  (float16_t)0.410843171f,
+    (float16_t)0.912962190f,  (float16_t)0.408044163f,
+    (float16_t)0.914209756f,  (float16_t)0.405241314f,
+    (float16_t)0.915448716f,  (float16_t)0.402434651f,
+    (float16_t)0.916679060f,  (float16_t)0.399624200f,
+    (float16_t)0.917900776f,  (float16_t)0.396809987f,
+    (float16_t)0.919113852f,  (float16_t)0.393992040f,
+    (float16_t)0.920318277f,  (float16_t)0.391170384f,
+    (float16_t)0.921514039f,  (float16_t)0.388345047f,
+    (float16_t)0.922701128f,  (float16_t)0.385516054f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.925049241f,  (float16_t)0.379847209f,
+    (float16_t)0.926210242f,  (float16_t)0.377007410f,
+    (float16_t)0.927362526f,  (float16_t)0.374164063f,
+    (float16_t)0.928506080f,  (float16_t)0.371317194f,
+    (float16_t)0.929640896f,  (float16_t)0.368466830f,
+    (float16_t)0.930766961f,  (float16_t)0.365612998f,
+    (float16_t)0.931884266f,  (float16_t)0.362755724f,
+    (float16_t)0.932992799f,  (float16_t)0.359895037f,
+    (float16_t)0.934092550f,  (float16_t)0.357030961f,
+    (float16_t)0.935183510f,  (float16_t)0.354163525f,
+    (float16_t)0.936265667f,  (float16_t)0.351292756f,
+    (float16_t)0.937339012f,  (float16_t)0.348418680f,
+    (float16_t)0.938403534f,  (float16_t)0.345541325f,
+    (float16_t)0.939459224f,  (float16_t)0.342660717f,
+    (float16_t)0.940506071f,  (float16_t)0.339776884f,
+    (float16_t)0.941544065f,  (float16_t)0.336889853f,
+    (float16_t)0.942573198f,  (float16_t)0.333999651f,
+    (float16_t)0.943593458f,  (float16_t)0.331106306f,
+    (float16_t)0.944604837f,  (float16_t)0.328209844f,
+    (float16_t)0.945607325f,  (float16_t)0.325310292f,
+    (float16_t)0.946600913f,  (float16_t)0.322407679f,
+    (float16_t)0.947585591f,  (float16_t)0.319502031f,
+    (float16_t)0.948561350f,  (float16_t)0.316593376f,
+    (float16_t)0.949528181f,  (float16_t)0.313681740f,
+    (float16_t)0.950486074f,  (float16_t)0.310767153f,
+    (float16_t)0.951435021f,  (float16_t)0.307849640f,
+    (float16_t)0.952375013f,  (float16_t)0.304929230f,
+    (float16_t)0.953306040f,  (float16_t)0.302005949f,
+    (float16_t)0.954228095f,  (float16_t)0.299079826f,
+    (float16_t)0.955141168f,  (float16_t)0.296150888f,
+    (float16_t)0.956045251f,  (float16_t)0.293219163f,
+    (float16_t)0.956940336f,  (float16_t)0.290284677f,
+    (float16_t)0.957826413f,  (float16_t)0.287347460f,
+    (float16_t)0.958703475f,  (float16_t)0.284407537f,
+    (float16_t)0.959571513f,  (float16_t)0.281464938f,
+    (float16_t)0.960430519f,  (float16_t)0.278519689f,
+    (float16_t)0.961280486f,  (float16_t)0.275571819f,
+    (float16_t)0.962121404f,  (float16_t)0.272621355f,
+    (float16_t)0.962953267f,  (float16_t)0.269668326f,
+    (float16_t)0.963776066f,  (float16_t)0.266712757f,
+    (float16_t)0.964589793f,  (float16_t)0.263754679f,
+    (float16_t)0.965394442f,  (float16_t)0.260794118f,
+    (float16_t)0.966190003f,  (float16_t)0.257831102f,
+    (float16_t)0.966976471f,  (float16_t)0.254865660f,
+    (float16_t)0.967753837f,  (float16_t)0.251897818f,
+    (float16_t)0.968522094f,  (float16_t)0.248927606f,
+    (float16_t)0.969281235f,  (float16_t)0.245955050f,
+    (float16_t)0.970031253f,  (float16_t)0.242980180f,
+    (float16_t)0.970772141f,  (float16_t)0.240003022f,
+    (float16_t)0.971503891f,  (float16_t)0.237023606f,
+    (float16_t)0.972226497f,  (float16_t)0.234041959f,
+    (float16_t)0.972939952f,  (float16_t)0.231058108f,
+    (float16_t)0.973644250f,  (float16_t)0.228072083f,
+    (float16_t)0.974339383f,  (float16_t)0.225083911f,
+    (float16_t)0.975025345f,  (float16_t)0.222093621f,
+    (float16_t)0.975702130f,  (float16_t)0.219101240f,
+    (float16_t)0.976369731f,  (float16_t)0.216106797f,
+    (float16_t)0.977028143f,  (float16_t)0.213110320f,
+    (float16_t)0.977677358f,  (float16_t)0.210111837f,
+    (float16_t)0.978317371f,  (float16_t)0.207111376f,
+    (float16_t)0.978948175f,  (float16_t)0.204108966f,
+    (float16_t)0.979569766f,  (float16_t)0.201104635f,
+    (float16_t)0.980182136f,  (float16_t)0.198098411f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)0.981379193f,  (float16_t)0.192080397f,
+    (float16_t)0.981963869f,  (float16_t)0.189068664f,
+    (float16_t)0.982539302f,  (float16_t)0.186055152f,
+    (float16_t)0.983105487f,  (float16_t)0.183039888f,
+    (float16_t)0.983662419f,  (float16_t)0.180022901f,
+    (float16_t)0.984210092f,  (float16_t)0.177004220f,
+    (float16_t)0.984748502f,  (float16_t)0.173983873f,
+    (float16_t)0.985277642f,  (float16_t)0.170961889f,
+    (float16_t)0.985797509f,  (float16_t)0.167938295f,
+    (float16_t)0.986308097f,  (float16_t)0.164913120f,
+    (float16_t)0.986809402f,  (float16_t)0.161886394f,
+    (float16_t)0.987301418f,  (float16_t)0.158858143f,
+    (float16_t)0.987784142f,  (float16_t)0.155828398f,
+    (float16_t)0.988257568f,  (float16_t)0.152797185f,
+    (float16_t)0.988721692f,  (float16_t)0.149764535f,
+    (float16_t)0.989176510f,  (float16_t)0.146730474f,
+    (float16_t)0.989622017f,  (float16_t)0.143695033f,
+    (float16_t)0.990058210f,  (float16_t)0.140658239f,
+    (float16_t)0.990485084f,  (float16_t)0.137620122f,
+    (float16_t)0.990902635f,  (float16_t)0.134580709f,
+    (float16_t)0.991310860f,  (float16_t)0.131540029f,
+    (float16_t)0.991709754f,  (float16_t)0.128498111f,
+    (float16_t)0.992099313f,  (float16_t)0.125454983f,
+    (float16_t)0.992479535f,  (float16_t)0.122410675f,
+    (float16_t)0.992850414f,  (float16_t)0.119365215f,
+    (float16_t)0.993211949f,  (float16_t)0.116318631f,
+    (float16_t)0.993564136f,  (float16_t)0.113270952f,
+    (float16_t)0.993906970f,  (float16_t)0.110222207f,
+    (float16_t)0.994240449f,  (float16_t)0.107172425f,
+    (float16_t)0.994564571f,  (float16_t)0.104121634f,
+    (float16_t)0.994879331f,  (float16_t)0.101069863f,
+    (float16_t)0.995184727f,  (float16_t)0.098017140f,
+    (float16_t)0.995480755f,  (float16_t)0.094963495f,
+    (float16_t)0.995767414f,  (float16_t)0.091908956f,
+    (float16_t)0.996044701f,  (float16_t)0.088853553f,
+    (float16_t)0.996312612f,  (float16_t)0.085797312f,
+    (float16_t)0.996571146f,  (float16_t)0.082740265f,
+    (float16_t)0.996820299f,  (float16_t)0.079682438f,
+    (float16_t)0.997060070f,  (float16_t)0.076623861f,
+    (float16_t)0.997290457f,  (float16_t)0.073564564f,
+    (float16_t)0.997511456f,  (float16_t)0.070504573f,
+    (float16_t)0.997723067f,  (float16_t)0.067443920f,
+    (float16_t)0.997925286f,  (float16_t)0.064382631f,
+    (float16_t)0.998118113f,  (float16_t)0.061320736f,
+    (float16_t)0.998301545f,  (float16_t)0.058258265f,
+    (float16_t)0.998475581f,  (float16_t)0.055195244f,
+    (float16_t)0.998640218f,  (float16_t)0.052131705f,
+    (float16_t)0.998795456f,  (float16_t)0.049067674f,
+    (float16_t)0.998941293f,  (float16_t)0.046003182f,
+    (float16_t)0.999077728f,  (float16_t)0.042938257f,
+    (float16_t)0.999204759f,  (float16_t)0.039872928f,
+    (float16_t)0.999322385f,  (float16_t)0.036807223f,
+    (float16_t)0.999430605f,  (float16_t)0.033741172f,
+    (float16_t)0.999529418f,  (float16_t)0.030674803f,
+    (float16_t)0.999618822f,  (float16_t)0.027608146f,
+    (float16_t)0.999698819f,  (float16_t)0.024541229f,
+    (float16_t)0.999769405f,  (float16_t)0.021474080f,
+    (float16_t)0.999830582f,  (float16_t)0.018406730f,
+    (float16_t)0.999882347f,  (float16_t)0.015339206f,
+    (float16_t)0.999924702f,  (float16_t)0.012271538f,
+    (float16_t)0.999957645f,  (float16_t)0.009203755f,
+    (float16_t)0.999981175f,  (float16_t)0.006135885f,
+    (float16_t)0.999995294f,  (float16_t)0.003067957f,
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.999995294f, (float16_t)-0.003067957f,
+    (float16_t)0.999981175f, (float16_t)-0.006135885f,
+    (float16_t)0.999957645f, (float16_t)-0.009203755f,
+    (float16_t)0.999924702f, (float16_t)-0.012271538f,
+    (float16_t)0.999882347f, (float16_t)-0.015339206f,
+    (float16_t)0.999830582f, (float16_t)-0.018406730f,
+    (float16_t)0.999769405f, (float16_t)-0.021474080f,
+    (float16_t)0.999698819f, (float16_t)-0.024541229f,
+    (float16_t)0.999618822f, (float16_t)-0.027608146f,
+    (float16_t)0.999529418f, (float16_t)-0.030674803f,
+    (float16_t)0.999430605f, (float16_t)-0.033741172f,
+    (float16_t)0.999322385f, (float16_t)-0.036807223f,
+    (float16_t)0.999204759f, (float16_t)-0.039872928f,
+    (float16_t)0.999077728f, (float16_t)-0.042938257f,
+    (float16_t)0.998941293f, (float16_t)-0.046003182f,
+    (float16_t)0.998795456f, (float16_t)-0.049067674f,
+    (float16_t)0.998640218f, (float16_t)-0.052131705f,
+    (float16_t)0.998475581f, (float16_t)-0.055195244f,
+    (float16_t)0.998301545f, (float16_t)-0.058258265f,
+    (float16_t)0.998118113f, (float16_t)-0.061320736f,
+    (float16_t)0.997925286f, (float16_t)-0.064382631f,
+    (float16_t)0.997723067f, (float16_t)-0.067443920f,
+    (float16_t)0.997511456f, (float16_t)-0.070504573f,
+    (float16_t)0.997290457f, (float16_t)-0.073564564f,
+    (float16_t)0.997060070f, (float16_t)-0.076623861f,
+    (float16_t)0.996820299f, (float16_t)-0.079682438f,
+    (float16_t)0.996571146f, (float16_t)-0.082740265f,
+    (float16_t)0.996312612f, (float16_t)-0.085797312f,
+    (float16_t)0.996044701f, (float16_t)-0.088853553f,
+    (float16_t)0.995767414f, (float16_t)-0.091908956f,
+    (float16_t)0.995480755f, (float16_t)-0.094963495f,
+    (float16_t)0.995184727f, (float16_t)-0.098017140f,
+    (float16_t)0.994879331f, (float16_t)-0.101069863f,
+    (float16_t)0.994564571f, (float16_t)-0.104121634f,
+    (float16_t)0.994240449f, (float16_t)-0.107172425f,
+    (float16_t)0.993906970f, (float16_t)-0.110222207f,
+    (float16_t)0.993564136f, (float16_t)-0.113270952f,
+    (float16_t)0.993211949f, (float16_t)-0.116318631f,
+    (float16_t)0.992850414f, (float16_t)-0.119365215f,
+    (float16_t)0.992479535f, (float16_t)-0.122410675f,
+    (float16_t)0.992099313f, (float16_t)-0.125454983f,
+    (float16_t)0.991709754f, (float16_t)-0.128498111f,
+    (float16_t)0.991310860f, (float16_t)-0.131540029f,
+    (float16_t)0.990902635f, (float16_t)-0.134580709f,
+    (float16_t)0.990485084f, (float16_t)-0.137620122f,
+    (float16_t)0.990058210f, (float16_t)-0.140658239f,
+    (float16_t)0.989622017f, (float16_t)-0.143695033f,
+    (float16_t)0.989176510f, (float16_t)-0.146730474f,
+    (float16_t)0.988721692f, (float16_t)-0.149764535f,
+    (float16_t)0.988257568f, (float16_t)-0.152797185f,
+    (float16_t)0.987784142f, (float16_t)-0.155828398f,
+    (float16_t)0.987301418f, (float16_t)-0.158858143f,
+    (float16_t)0.986809402f, (float16_t)-0.161886394f,
+    (float16_t)0.986308097f, (float16_t)-0.164913120f,
+    (float16_t)0.985797509f, (float16_t)-0.167938295f,
+    (float16_t)0.985277642f, (float16_t)-0.170961889f,
+    (float16_t)0.984748502f, (float16_t)-0.173983873f,
+    (float16_t)0.984210092f, (float16_t)-0.177004220f,
+    (float16_t)0.983662419f, (float16_t)-0.180022901f,
+    (float16_t)0.983105487f, (float16_t)-0.183039888f,
+    (float16_t)0.982539302f, (float16_t)-0.186055152f,
+    (float16_t)0.981963869f, (float16_t)-0.189068664f,
+    (float16_t)0.981379193f, (float16_t)-0.192080397f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f,
+    (float16_t)0.980182136f, (float16_t)-0.198098411f,
+    (float16_t)0.979569766f, (float16_t)-0.201104635f,
+    (float16_t)0.978948175f, (float16_t)-0.204108966f,
+    (float16_t)0.978317371f, (float16_t)-0.207111376f,
+    (float16_t)0.977677358f, (float16_t)-0.210111837f,
+    (float16_t)0.977028143f, (float16_t)-0.213110320f,
+    (float16_t)0.976369731f, (float16_t)-0.216106797f,
+    (float16_t)0.975702130f, (float16_t)-0.219101240f,
+    (float16_t)0.975025345f, (float16_t)-0.222093621f,
+    (float16_t)0.974339383f, (float16_t)-0.225083911f,
+    (float16_t)0.973644250f, (float16_t)-0.228072083f,
+    (float16_t)0.972939952f, (float16_t)-0.231058108f,
+    (float16_t)0.972226497f, (float16_t)-0.234041959f,
+    (float16_t)0.971503891f, (float16_t)-0.237023606f,
+    (float16_t)0.970772141f, (float16_t)-0.240003022f,
+    (float16_t)0.970031253f, (float16_t)-0.242980180f,
+    (float16_t)0.969281235f, (float16_t)-0.245955050f,
+    (float16_t)0.968522094f, (float16_t)-0.248927606f,
+    (float16_t)0.967753837f, (float16_t)-0.251897818f,
+    (float16_t)0.966976471f, (float16_t)-0.254865660f,
+    (float16_t)0.966190003f, (float16_t)-0.257831102f,
+    (float16_t)0.965394442f, (float16_t)-0.260794118f,
+    (float16_t)0.964589793f, (float16_t)-0.263754679f,
+    (float16_t)0.963776066f, (float16_t)-0.266712757f,
+    (float16_t)0.962953267f, (float16_t)-0.269668326f,
+    (float16_t)0.962121404f, (float16_t)-0.272621355f,
+    (float16_t)0.961280486f, (float16_t)-0.275571819f,
+    (float16_t)0.960430519f, (float16_t)-0.278519689f,
+    (float16_t)0.959571513f, (float16_t)-0.281464938f,
+    (float16_t)0.958703475f, (float16_t)-0.284407537f,
+    (float16_t)0.957826413f, (float16_t)-0.287347460f,
+    (float16_t)0.956940336f, (float16_t)-0.290284677f,
+    (float16_t)0.956045251f, (float16_t)-0.293219163f,
+    (float16_t)0.955141168f, (float16_t)-0.296150888f,
+    (float16_t)0.954228095f, (float16_t)-0.299079826f,
+    (float16_t)0.953306040f, (float16_t)-0.302005949f,
+    (float16_t)0.952375013f, (float16_t)-0.304929230f,
+    (float16_t)0.951435021f, (float16_t)-0.307849640f,
+    (float16_t)0.950486074f, (float16_t)-0.310767153f,
+    (float16_t)0.949528181f, (float16_t)-0.313681740f,
+    (float16_t)0.948561350f, (float16_t)-0.316593376f,
+    (float16_t)0.947585591f, (float16_t)-0.319502031f,
+    (float16_t)0.946600913f, (float16_t)-0.322407679f,
+    (float16_t)0.945607325f, (float16_t)-0.325310292f,
+    (float16_t)0.944604837f, (float16_t)-0.328209844f,
+    (float16_t)0.943593458f, (float16_t)-0.331106306f,
+    (float16_t)0.942573198f, (float16_t)-0.333999651f,
+    (float16_t)0.941544065f, (float16_t)-0.336889853f,
+    (float16_t)0.940506071f, (float16_t)-0.339776884f,
+    (float16_t)0.939459224f, (float16_t)-0.342660717f,
+    (float16_t)0.938403534f, (float16_t)-0.345541325f,
+    (float16_t)0.937339012f, (float16_t)-0.348418680f,
+    (float16_t)0.936265667f, (float16_t)-0.351292756f,
+    (float16_t)0.935183510f, (float16_t)-0.354163525f,
+    (float16_t)0.934092550f, (float16_t)-0.357030961f,
+    (float16_t)0.932992799f, (float16_t)-0.359895037f,
+    (float16_t)0.931884266f, (float16_t)-0.362755724f,
+    (float16_t)0.930766961f, (float16_t)-0.365612998f,
+    (float16_t)0.929640896f, (float16_t)-0.368466830f,
+    (float16_t)0.928506080f, (float16_t)-0.371317194f,
+    (float16_t)0.927362526f, (float16_t)-0.374164063f,
+    (float16_t)0.926210242f, (float16_t)-0.377007410f,
+    (float16_t)0.925049241f, (float16_t)-0.379847209f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.922701128f, (float16_t)-0.385516054f,
+    (float16_t)0.921514039f, (float16_t)-0.388345047f,
+    (float16_t)0.920318277f, (float16_t)-0.391170384f,
+    (float16_t)0.919113852f, (float16_t)-0.393992040f,
+    (float16_t)0.917900776f, (float16_t)-0.396809987f,
+    (float16_t)0.916679060f, (float16_t)-0.399624200f,
+    (float16_t)0.915448716f, (float16_t)-0.402434651f,
+    (float16_t)0.914209756f, (float16_t)-0.405241314f,
+    (float16_t)0.912962190f, (float16_t)-0.408044163f,
+    (float16_t)0.911706032f, (float16_t)-0.410843171f,
+    (float16_t)0.910441292f, (float16_t)-0.413638312f,
+    (float16_t)0.909167983f, (float16_t)-0.416429560f,
+    (float16_t)0.907886116f, (float16_t)-0.419216888f,
+    (float16_t)0.906595705f, (float16_t)-0.422000271f,
+    (float16_t)0.905296759f, (float16_t)-0.424779681f,
+    (float16_t)0.903989293f, (float16_t)-0.427555093f,
+    (float16_t)0.902673318f, (float16_t)-0.430326481f,
+    (float16_t)0.901348847f, (float16_t)-0.433093819f,
+    (float16_t)0.900015892f, (float16_t)-0.435857080f,
+    (float16_t)0.898674466f, (float16_t)-0.438616239f,
+    (float16_t)0.897324581f, (float16_t)-0.441371269f,
+    (float16_t)0.895966250f, (float16_t)-0.444122145f,
+    (float16_t)0.894599486f, (float16_t)-0.446868840f,
+    (float16_t)0.893224301f, (float16_t)-0.449611330f,
+    (float16_t)0.891840709f, (float16_t)-0.452349587f,
+    (float16_t)0.890448723f, (float16_t)-0.455083587f,
+    (float16_t)0.889048356f, (float16_t)-0.457813304f,
+    (float16_t)0.887639620f, (float16_t)-0.460538711f,
+    (float16_t)0.886222530f, (float16_t)-0.463259784f,
+    (float16_t)0.884797098f, (float16_t)-0.465976496f,
+    (float16_t)0.883363339f, (float16_t)-0.468688822f,
+    (float16_t)0.881921264f, (float16_t)-0.471396737f,
+    (float16_t)0.880470889f, (float16_t)-0.474100215f,
+    (float16_t)0.879012226f, (float16_t)-0.476799230f,
+    (float16_t)0.877545290f, (float16_t)-0.479493758f,
+    (float16_t)0.876070094f, (float16_t)-0.482183772f,
+    (float16_t)0.874586652f, (float16_t)-0.484869248f,
+    (float16_t)0.873094978f, (float16_t)-0.487550160f,
+    (float16_t)0.871595087f, (float16_t)-0.490226483f,
+    (float16_t)0.870086991f, (float16_t)-0.492898192f,
+    (float16_t)0.868570706f, (float16_t)-0.495565262f,
+    (float16_t)0.867046246f, (float16_t)-0.498227667f,
+    (float16_t)0.865513624f, (float16_t)-0.500885383f,
+    (float16_t)0.863972856f, (float16_t)-0.503538384f,
+    (float16_t)0.862423956f, (float16_t)-0.506186645f,
+    (float16_t)0.860866939f, (float16_t)-0.508830143f,
+    (float16_t)0.859301818f, (float16_t)-0.511468850f,
+    (float16_t)0.857728610f, (float16_t)-0.514102744f,
+    (float16_t)0.856147328f, (float16_t)-0.516731799f,
+    (float16_t)0.854557988f, (float16_t)-0.519355990f,
+    (float16_t)0.852960605f, (float16_t)-0.521975293f,
+    (float16_t)0.851355193f, (float16_t)-0.524589683f,
+    (float16_t)0.849741768f, (float16_t)-0.527199135f,
+    (float16_t)0.848120345f, (float16_t)-0.529803625f,
+    (float16_t)0.846490939f, (float16_t)-0.532403128f,
+    (float16_t)0.844853565f, (float16_t)-0.534997620f,
+    (float16_t)0.843208240f, (float16_t)-0.537587076f,
+    (float16_t)0.841554977f, (float16_t)-0.540171473f,
+    (float16_t)0.839893794f, (float16_t)-0.542750785f,
+    (float16_t)0.838224706f, (float16_t)-0.545324988f,
+    (float16_t)0.836547727f, (float16_t)-0.547894059f,
+    (float16_t)0.834862875f, (float16_t)-0.550457973f,
+    (float16_t)0.833170165f, (float16_t)-0.553016706f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.829761234f, (float16_t)-0.558118531f,
+    (float16_t)0.828045045f, (float16_t)-0.560661576f,
+    (float16_t)0.826321063f, (float16_t)-0.563199344f,
+    (float16_t)0.824589303f, (float16_t)-0.565731811f,
+    (float16_t)0.822849781f, (float16_t)-0.568258953f,
+    (float16_t)0.821102515f, (float16_t)-0.570780746f,
+    (float16_t)0.819347520f, (float16_t)-0.573297167f,
+    (float16_t)0.817584813f, (float16_t)-0.575808191f,
+    (float16_t)0.815814411f, (float16_t)-0.578313796f,
+    (float16_t)0.814036330f, (float16_t)-0.580813958f,
+    (float16_t)0.812250587f, (float16_t)-0.583308653f,
+    (float16_t)0.810457198f, (float16_t)-0.585797857f,
+    (float16_t)0.808656182f, (float16_t)-0.588281548f,
+    (float16_t)0.806847554f, (float16_t)-0.590759702f,
+    (float16_t)0.805031331f, (float16_t)-0.593232295f,
+    (float16_t)0.803207531f, (float16_t)-0.595699304f,
+    (float16_t)0.801376172f, (float16_t)-0.598160707f,
+    (float16_t)0.799537269f, (float16_t)-0.600616479f,
+    (float16_t)0.797690841f, (float16_t)-0.603066599f,
+    (float16_t)0.795836905f, (float16_t)-0.605511041f,
+    (float16_t)0.793975478f, (float16_t)-0.607949785f,
+    (float16_t)0.792106577f, (float16_t)-0.610382806f,
+    (float16_t)0.790230221f, (float16_t)-0.612810082f,
+    (float16_t)0.788346428f, (float16_t)-0.615231591f,
+    (float16_t)0.786455214f, (float16_t)-0.617647308f,
+    (float16_t)0.784556597f, (float16_t)-0.620057212f,
+    (float16_t)0.782650596f, (float16_t)-0.622461279f,
+    (float16_t)0.780737229f, (float16_t)-0.624859488f,
+    (float16_t)0.778816512f, (float16_t)-0.627251815f,
+    (float16_t)0.776888466f, (float16_t)-0.629638239f,
+    (float16_t)0.774953107f, (float16_t)-0.632018736f,
+    (float16_t)0.773010453f, (float16_t)-0.634393284f,
+    (float16_t)0.771060524f, (float16_t)-0.636761861f,
+    (float16_t)0.769103338f, (float16_t)-0.639124445f,
+    (float16_t)0.767138912f, (float16_t)-0.641481013f,
+    (float16_t)0.765167266f, (float16_t)-0.643831543f,
+    (float16_t)0.763188417f, (float16_t)-0.646176013f,
+    (float16_t)0.761202385f, (float16_t)-0.648514401f,
+    (float16_t)0.759209189f, (float16_t)-0.650846685f,
+    (float16_t)0.757208847f, (float16_t)-0.653172843f,
+    (float16_t)0.755201377f, (float16_t)-0.655492853f,
+    (float16_t)0.753186799f, (float16_t)-0.657806693f,
+    (float16_t)0.751165132f, (float16_t)-0.660114342f,
+    (float16_t)0.749136395f, (float16_t)-0.662415778f,
+    (float16_t)0.747100606f, (float16_t)-0.664710978f,
+    (float16_t)0.745057785f, (float16_t)-0.666999922f,
+    (float16_t)0.743007952f, (float16_t)-0.669282588f,
+    (float16_t)0.740951125f, (float16_t)-0.671558955f,
+    (float16_t)0.738887324f, (float16_t)-0.673829000f,
+    (float16_t)0.736816569f, (float16_t)-0.676092704f,
+    (float16_t)0.734738878f, (float16_t)-0.678350043f,
+    (float16_t)0.732654272f, (float16_t)-0.680600998f,
+    (float16_t)0.730562769f, (float16_t)-0.682845546f,
+    (float16_t)0.728464390f, (float16_t)-0.685083668f,
+    (float16_t)0.726359155f, (float16_t)-0.687315341f,
+    (float16_t)0.724247083f, (float16_t)-0.689540545f,
+    (float16_t)0.722128194f, (float16_t)-0.691759258f,
+    (float16_t)0.720002508f, (float16_t)-0.693971461f,
+    (float16_t)0.717870045f, (float16_t)-0.696177131f,
+    (float16_t)0.715730825f, (float16_t)-0.698376249f,
+    (float16_t)0.713584869f, (float16_t)-0.700568794f,
+    (float16_t)0.711432196f, (float16_t)-0.702754744f,
+    (float16_t)0.709272826f, (float16_t)-0.704934080f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.704934080f, (float16_t)-0.709272826f,
+    (float16_t)0.702754744f, (float16_t)-0.711432196f,
+    (float16_t)0.700568794f, (float16_t)-0.713584869f,
+    (float16_t)0.698376249f, (float16_t)-0.715730825f,
+    (float16_t)0.696177131f, (float16_t)-0.717870045f,
+    (float16_t)0.693971461f, (float16_t)-0.720002508f,
+    (float16_t)0.691759258f, (float16_t)-0.722128194f,
+    (float16_t)0.689540545f, (float16_t)-0.724247083f,
+    (float16_t)0.687315341f, (float16_t)-0.726359155f,
+    (float16_t)0.685083668f, (float16_t)-0.728464390f,
+    (float16_t)0.682845546f, (float16_t)-0.730562769f,
+    (float16_t)0.680600998f, (float16_t)-0.732654272f,
+    (float16_t)0.678350043f, (float16_t)-0.734738878f,
+    (float16_t)0.676092704f, (float16_t)-0.736816569f,
+    (float16_t)0.673829000f, (float16_t)-0.738887324f,
+    (float16_t)0.671558955f, (float16_t)-0.740951125f,
+    (float16_t)0.669282588f, (float16_t)-0.743007952f,
+    (float16_t)0.666999922f, (float16_t)-0.745057785f,
+    (float16_t)0.664710978f, (float16_t)-0.747100606f,
+    (float16_t)0.662415778f, (float16_t)-0.749136395f,
+    (float16_t)0.660114342f, (float16_t)-0.751165132f,
+    (float16_t)0.657806693f, (float16_t)-0.753186799f,
+    (float16_t)0.655492853f, (float16_t)-0.755201377f,
+    (float16_t)0.653172843f, (float16_t)-0.757208847f,
+    (float16_t)0.650846685f, (float16_t)-0.759209189f,
+    (float16_t)0.648514401f, (float16_t)-0.761202385f,
+    (float16_t)0.646176013f, (float16_t)-0.763188417f,
+    (float16_t)0.643831543f, (float16_t)-0.765167266f,
+    (float16_t)0.641481013f, (float16_t)-0.767138912f,
+    (float16_t)0.639124445f, (float16_t)-0.769103338f,
+    (float16_t)0.636761861f, (float16_t)-0.771060524f,
+    (float16_t)0.634393284f, (float16_t)-0.773010453f,
+    (float16_t)0.632018736f, (float16_t)-0.774953107f,
+    (float16_t)0.629638239f, (float16_t)-0.776888466f,
+    (float16_t)0.627251815f, (float16_t)-0.778816512f,
+    (float16_t)0.624859488f, (float16_t)-0.780737229f,
+    (float16_t)0.622461279f, (float16_t)-0.782650596f,
+    (float16_t)0.620057212f, (float16_t)-0.784556597f,
+    (float16_t)0.617647308f, (float16_t)-0.786455214f,
+    (float16_t)0.615231591f, (float16_t)-0.788346428f,
+    (float16_t)0.612810082f, (float16_t)-0.790230221f,
+    (float16_t)0.610382806f, (float16_t)-0.792106577f,
+    (float16_t)0.607949785f, (float16_t)-0.793975478f,
+    (float16_t)0.605511041f, (float16_t)-0.795836905f,
+    (float16_t)0.603066599f, (float16_t)-0.797690841f,
+    (float16_t)0.600616479f, (float16_t)-0.799537269f,
+    (float16_t)0.598160707f, (float16_t)-0.801376172f,
+    (float16_t)0.595699304f, (float16_t)-0.803207531f,
+    (float16_t)0.593232295f, (float16_t)-0.805031331f,
+    (float16_t)0.590759702f, (float16_t)-0.806847554f,
+    (float16_t)0.588281548f, (float16_t)-0.808656182f,
+    (float16_t)0.585797857f, (float16_t)-0.810457198f,
+    (float16_t)0.583308653f, (float16_t)-0.812250587f,
+    (float16_t)0.580813958f, (float16_t)-0.814036330f,
+    (float16_t)0.578313796f, (float16_t)-0.815814411f,
+    (float16_t)0.575808191f, (float16_t)-0.817584813f,
+    (float16_t)0.573297167f, (float16_t)-0.819347520f,
+    (float16_t)0.570780746f, (float16_t)-0.821102515f,
+    (float16_t)0.568258953f, (float16_t)-0.822849781f,
+    (float16_t)0.565731811f, (float16_t)-0.824589303f,
+    (float16_t)0.563199344f, (float16_t)-0.826321063f,
+    (float16_t)0.560661576f, (float16_t)-0.828045045f,
+    (float16_t)0.558118531f, (float16_t)-0.829761234f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.553016706f, (float16_t)-0.833170165f,
+    (float16_t)0.550457973f, (float16_t)-0.834862875f,
+    (float16_t)0.547894059f, (float16_t)-0.836547727f,
+    (float16_t)0.545324988f, (float16_t)-0.838224706f,
+    (float16_t)0.542750785f, (float16_t)-0.839893794f,
+    (float16_t)0.540171473f, (float16_t)-0.841554977f,
+    (float16_t)0.537587076f, (float16_t)-0.843208240f,
+    (float16_t)0.534997620f, (float16_t)-0.844853565f,
+    (float16_t)0.532403128f, (float16_t)-0.846490939f,
+    (float16_t)0.529803625f, (float16_t)-0.848120345f,
+    (float16_t)0.527199135f, (float16_t)-0.849741768f,
+    (float16_t)0.524589683f, (float16_t)-0.851355193f,
+    (float16_t)0.521975293f, (float16_t)-0.852960605f,
+    (float16_t)0.519355990f, (float16_t)-0.854557988f,
+    (float16_t)0.516731799f, (float16_t)-0.856147328f,
+    (float16_t)0.514102744f, (float16_t)-0.857728610f,
+    (float16_t)0.511468850f, (float16_t)-0.859301818f,
+    (float16_t)0.508830143f, (float16_t)-0.860866939f,
+    (float16_t)0.506186645f, (float16_t)-0.862423956f,
+    (float16_t)0.503538384f, (float16_t)-0.863972856f,
+    (float16_t)0.500885383f, (float16_t)-0.865513624f,
+    (float16_t)0.498227667f, (float16_t)-0.867046246f,
+    (float16_t)0.495565262f, (float16_t)-0.868570706f,
+    (float16_t)0.492898192f, (float16_t)-0.870086991f,
+    (float16_t)0.490226483f, (float16_t)-0.871595087f,
+    (float16_t)0.487550160f, (float16_t)-0.873094978f,
+    (float16_t)0.484869248f, (float16_t)-0.874586652f,
+    (float16_t)0.482183772f, (float16_t)-0.876070094f,
+    (float16_t)0.479493758f, (float16_t)-0.877545290f,
+    (float16_t)0.476799230f, (float16_t)-0.879012226f,
+    (float16_t)0.474100215f, (float16_t)-0.880470889f,
+    (float16_t)0.471396737f, (float16_t)-0.881921264f,
+    (float16_t)0.468688822f, (float16_t)-0.883363339f,
+    (float16_t)0.465976496f, (float16_t)-0.884797098f,
+    (float16_t)0.463259784f, (float16_t)-0.886222530f,
+    (float16_t)0.460538711f, (float16_t)-0.887639620f,
+    (float16_t)0.457813304f, (float16_t)-0.889048356f,
+    (float16_t)0.455083587f, (float16_t)-0.890448723f,
+    (float16_t)0.452349587f, (float16_t)-0.891840709f,
+    (float16_t)0.449611330f, (float16_t)-0.893224301f,
+    (float16_t)0.446868840f, (float16_t)-0.894599486f,
+    (float16_t)0.444122145f, (float16_t)-0.895966250f,
+    (float16_t)0.441371269f, (float16_t)-0.897324581f,
+    (float16_t)0.438616239f, (float16_t)-0.898674466f,
+    (float16_t)0.435857080f, (float16_t)-0.900015892f,
+    (float16_t)0.433093819f, (float16_t)-0.901348847f,
+    (float16_t)0.430326481f, (float16_t)-0.902673318f,
+    (float16_t)0.427555093f, (float16_t)-0.903989293f,
+    (float16_t)0.424779681f, (float16_t)-0.905296759f,
+    (float16_t)0.422000271f, (float16_t)-0.906595705f,
+    (float16_t)0.419216888f, (float16_t)-0.907886116f,
+    (float16_t)0.416429560f, (float16_t)-0.909167983f,
+    (float16_t)0.413638312f, (float16_t)-0.910441292f,
+    (float16_t)0.410843171f, (float16_t)-0.911706032f,
+    (float16_t)0.408044163f, (float16_t)-0.912962190f,
+    (float16_t)0.405241314f, (float16_t)-0.914209756f,
+    (float16_t)0.402434651f, (float16_t)-0.915448716f,
+    (float16_t)0.399624200f, (float16_t)-0.916679060f,
+    (float16_t)0.396809987f, (float16_t)-0.917900776f,
+    (float16_t)0.393992040f, (float16_t)-0.919113852f,
+    (float16_t)0.391170384f, (float16_t)-0.920318277f,
+    (float16_t)0.388345047f, (float16_t)-0.921514039f,
+    (float16_t)0.385516054f, (float16_t)-0.922701128f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.379847209f, (float16_t)-0.925049241f,
+    (float16_t)0.377007410f, (float16_t)-0.926210242f,
+    (float16_t)0.374164063f, (float16_t)-0.927362526f,
+    (float16_t)0.371317194f, (float16_t)-0.928506080f,
+    (float16_t)0.368466830f, (float16_t)-0.929640896f,
+    (float16_t)0.365612998f, (float16_t)-0.930766961f,
+    (float16_t)0.362755724f, (float16_t)-0.931884266f,
+    (float16_t)0.359895037f, (float16_t)-0.932992799f,
+    (float16_t)0.357030961f, (float16_t)-0.934092550f,
+    (float16_t)0.354163525f, (float16_t)-0.935183510f,
+    (float16_t)0.351292756f, (float16_t)-0.936265667f,
+    (float16_t)0.348418680f, (float16_t)-0.937339012f,
+    (float16_t)0.345541325f, (float16_t)-0.938403534f,
+    (float16_t)0.342660717f, (float16_t)-0.939459224f,
+    (float16_t)0.339776884f, (float16_t)-0.940506071f,
+    (float16_t)0.336889853f, (float16_t)-0.941544065f,
+    (float16_t)0.333999651f, (float16_t)-0.942573198f,
+    (float16_t)0.331106306f, (float16_t)-0.943593458f,
+    (float16_t)0.328209844f, (float16_t)-0.944604837f,
+    (float16_t)0.325310292f, (float16_t)-0.945607325f,
+    (float16_t)0.322407679f, (float16_t)-0.946600913f,
+    (float16_t)0.319502031f, (float16_t)-0.947585591f,
+    (float16_t)0.316593376f, (float16_t)-0.948561350f,
+    (float16_t)0.313681740f, (float16_t)-0.949528181f,
+    (float16_t)0.310767153f, (float16_t)-0.950486074f,
+    (float16_t)0.307849640f, (float16_t)-0.951435021f,
+    (float16_t)0.304929230f, (float16_t)-0.952375013f,
+    (float16_t)0.302005949f, (float16_t)-0.953306040f,
+    (float16_t)0.299079826f, (float16_t)-0.954228095f,
+    (float16_t)0.296150888f, (float16_t)-0.955141168f,
+    (float16_t)0.293219163f, (float16_t)-0.956045251f,
+    (float16_t)0.290284677f, (float16_t)-0.956940336f,
+    (float16_t)0.287347460f, (float16_t)-0.957826413f,
+    (float16_t)0.284407537f, (float16_t)-0.958703475f,
+    (float16_t)0.281464938f, (float16_t)-0.959571513f,
+    (float16_t)0.278519689f, (float16_t)-0.960430519f,
+    (float16_t)0.275571819f, (float16_t)-0.961280486f,
+    (float16_t)0.272621355f, (float16_t)-0.962121404f,
+    (float16_t)0.269668326f, (float16_t)-0.962953267f,
+    (float16_t)0.266712757f, (float16_t)-0.963776066f,
+    (float16_t)0.263754679f, (float16_t)-0.964589793f,
+    (float16_t)0.260794118f, (float16_t)-0.965394442f,
+    (float16_t)0.257831102f, (float16_t)-0.966190003f,
+    (float16_t)0.254865660f, (float16_t)-0.966976471f,
+    (float16_t)0.251897818f, (float16_t)-0.967753837f,
+    (float16_t)0.248927606f, (float16_t)-0.968522094f,
+    (float16_t)0.245955050f, (float16_t)-0.969281235f,
+    (float16_t)0.242980180f, (float16_t)-0.970031253f,
+    (float16_t)0.240003022f, (float16_t)-0.970772141f,
+    (float16_t)0.237023606f, (float16_t)-0.971503891f,
+    (float16_t)0.234041959f, (float16_t)-0.972226497f,
+    (float16_t)0.231058108f, (float16_t)-0.972939952f,
+    (float16_t)0.228072083f, (float16_t)-0.973644250f,
+    (float16_t)0.225083911f, (float16_t)-0.974339383f,
+    (float16_t)0.222093621f, (float16_t)-0.975025345f,
+    (float16_t)0.219101240f, (float16_t)-0.975702130f,
+    (float16_t)0.216106797f, (float16_t)-0.976369731f,
+    (float16_t)0.213110320f, (float16_t)-0.977028143f,
+    (float16_t)0.210111837f, (float16_t)-0.977677358f,
+    (float16_t)0.207111376f, (float16_t)-0.978317371f,
+    (float16_t)0.204108966f, (float16_t)-0.978948175f,
+    (float16_t)0.201104635f, (float16_t)-0.979569766f,
+    (float16_t)0.198098411f, (float16_t)-0.980182136f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f,
+    (float16_t)0.192080397f, (float16_t)-0.981379193f,
+    (float16_t)0.189068664f, (float16_t)-0.981963869f,
+    (float16_t)0.186055152f, (float16_t)-0.982539302f,
+    (float16_t)0.183039888f, (float16_t)-0.983105487f,
+    (float16_t)0.180022901f, (float16_t)-0.983662419f,
+    (float16_t)0.177004220f, (float16_t)-0.984210092f,
+    (float16_t)0.173983873f, (float16_t)-0.984748502f,
+    (float16_t)0.170961889f, (float16_t)-0.985277642f,
+    (float16_t)0.167938295f, (float16_t)-0.985797509f,
+    (float16_t)0.164913120f, (float16_t)-0.986308097f,
+    (float16_t)0.161886394f, (float16_t)-0.986809402f,
+    (float16_t)0.158858143f, (float16_t)-0.987301418f,
+    (float16_t)0.155828398f, (float16_t)-0.987784142f,
+    (float16_t)0.152797185f, (float16_t)-0.988257568f,
+    (float16_t)0.149764535f, (float16_t)-0.988721692f,
+    (float16_t)0.146730474f, (float16_t)-0.989176510f,
+    (float16_t)0.143695033f, (float16_t)-0.989622017f,
+    (float16_t)0.140658239f, (float16_t)-0.990058210f,
+    (float16_t)0.137620122f, (float16_t)-0.990485084f,
+    (float16_t)0.134580709f, (float16_t)-0.990902635f,
+    (float16_t)0.131540029f, (float16_t)-0.991310860f,
+    (float16_t)0.128498111f, (float16_t)-0.991709754f,
+    (float16_t)0.125454983f, (float16_t)-0.992099313f,
+    (float16_t)0.122410675f, (float16_t)-0.992479535f,
+    (float16_t)0.119365215f, (float16_t)-0.992850414f,
+    (float16_t)0.116318631f, (float16_t)-0.993211949f,
+    (float16_t)0.113270952f, (float16_t)-0.993564136f,
+    (float16_t)0.110222207f, (float16_t)-0.993906970f,
+    (float16_t)0.107172425f, (float16_t)-0.994240449f,
+    (float16_t)0.104121634f, (float16_t)-0.994564571f,
+    (float16_t)0.101069863f, (float16_t)-0.994879331f,
+    (float16_t)0.098017140f, (float16_t)-0.995184727f,
+    (float16_t)0.094963495f, (float16_t)-0.995480755f,
+    (float16_t)0.091908956f, (float16_t)-0.995767414f,
+    (float16_t)0.088853553f, (float16_t)-0.996044701f,
+    (float16_t)0.085797312f, (float16_t)-0.996312612f,
+    (float16_t)0.082740265f, (float16_t)-0.996571146f,
+    (float16_t)0.079682438f, (float16_t)-0.996820299f,
+    (float16_t)0.076623861f, (float16_t)-0.997060070f,
+    (float16_t)0.073564564f, (float16_t)-0.997290457f,
+    (float16_t)0.070504573f, (float16_t)-0.997511456f,
+    (float16_t)0.067443920f, (float16_t)-0.997723067f,
+    (float16_t)0.064382631f, (float16_t)-0.997925286f,
+    (float16_t)0.061320736f, (float16_t)-0.998118113f,
+    (float16_t)0.058258265f, (float16_t)-0.998301545f,
+    (float16_t)0.055195244f, (float16_t)-0.998475581f,
+    (float16_t)0.052131705f, (float16_t)-0.998640218f,
+    (float16_t)0.049067674f, (float16_t)-0.998795456f,
+    (float16_t)0.046003182f, (float16_t)-0.998941293f,
+    (float16_t)0.042938257f, (float16_t)-0.999077728f,
+    (float16_t)0.039872928f, (float16_t)-0.999204759f,
+    (float16_t)0.036807223f, (float16_t)-0.999322385f,
+    (float16_t)0.033741172f, (float16_t)-0.999430605f,
+    (float16_t)0.030674803f, (float16_t)-0.999529418f,
+    (float16_t)0.027608146f, (float16_t)-0.999618822f,
+    (float16_t)0.024541229f, (float16_t)-0.999698819f,
+    (float16_t)0.021474080f, (float16_t)-0.999769405f,
+    (float16_t)0.018406730f, (float16_t)-0.999830582f,
+    (float16_t)0.015339206f, (float16_t)-0.999882347f,
+    (float16_t)0.012271538f, (float16_t)-0.999924702f,
+    (float16_t)0.009203755f, (float16_t)-0.999957645f,
+    (float16_t)0.006135885f, (float16_t)-0.999981175f,
+    (float16_t)0.003067957f, (float16_t)-0.999995294f
+};
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_4096)
+const float16_t twiddleCoefF16_rfft_4096[4096] = {
+    (float16_t)0.000000000f,  (float16_t)1.000000000f,
+    (float16_t)0.001533980f,  (float16_t)0.999998823f,
+    (float16_t)0.003067957f,  (float16_t)0.999995294f,
+    (float16_t)0.004601926f,  (float16_t)0.999989411f,
+    (float16_t)0.006135885f,  (float16_t)0.999981175f,
+    (float16_t)0.007669829f,  (float16_t)0.999970586f,
+    (float16_t)0.009203755f,  (float16_t)0.999957645f,
+    (float16_t)0.010737659f,  (float16_t)0.999942350f,
+    (float16_t)0.012271538f,  (float16_t)0.999924702f,
+    (float16_t)0.013805389f,  (float16_t)0.999904701f,
+    (float16_t)0.015339206f,  (float16_t)0.999882347f,
+    (float16_t)0.016872988f,  (float16_t)0.999857641f,
+    (float16_t)0.018406730f,  (float16_t)0.999830582f,
+    (float16_t)0.019940429f,  (float16_t)0.999801170f,
+    (float16_t)0.021474080f,  (float16_t)0.999769405f,
+    (float16_t)0.023007681f,  (float16_t)0.999735288f,
+    (float16_t)0.024541229f,  (float16_t)0.999698819f,
+    (float16_t)0.026074718f,  (float16_t)0.999659997f,
+    (float16_t)0.027608146f,  (float16_t)0.999618822f,
+    (float16_t)0.029141509f,  (float16_t)0.999575296f,
+    (float16_t)0.030674803f,  (float16_t)0.999529418f,
+    (float16_t)0.032208025f,  (float16_t)0.999481187f,
+    (float16_t)0.033741172f,  (float16_t)0.999430605f,
+    (float16_t)0.035274239f,  (float16_t)0.999377670f,
+    (float16_t)0.036807223f,  (float16_t)0.999322385f,
+    (float16_t)0.038340120f,  (float16_t)0.999264747f,
+    (float16_t)0.039872928f,  (float16_t)0.999204759f,
+    (float16_t)0.041405641f,  (float16_t)0.999142419f,
+    (float16_t)0.042938257f,  (float16_t)0.999077728f,
+    (float16_t)0.044470772f,  (float16_t)0.999010686f,
+    (float16_t)0.046003182f,  (float16_t)0.998941293f,
+    (float16_t)0.047535484f,  (float16_t)0.998869550f,
+    (float16_t)0.049067674f,  (float16_t)0.998795456f,
+    (float16_t)0.050599749f,  (float16_t)0.998719012f,
+    (float16_t)0.052131705f,  (float16_t)0.998640218f,
+    (float16_t)0.053663538f,  (float16_t)0.998559074f,
+    (float16_t)0.055195244f,  (float16_t)0.998475581f,
+    (float16_t)0.056726821f,  (float16_t)0.998389737f,
+    (float16_t)0.058258265f,  (float16_t)0.998301545f,
+    (float16_t)0.059789571f,  (float16_t)0.998211003f,
+    (float16_t)0.061320736f,  (float16_t)0.998118113f,
+    (float16_t)0.062851758f,  (float16_t)0.998022874f,
+    (float16_t)0.064382631f,  (float16_t)0.997925286f,
+    (float16_t)0.065913353f,  (float16_t)0.997825350f,
+    (float16_t)0.067443920f,  (float16_t)0.997723067f,
+    (float16_t)0.068974328f,  (float16_t)0.997618435f,
+    (float16_t)0.070504573f,  (float16_t)0.997511456f,
+    (float16_t)0.072034653f,  (float16_t)0.997402130f,
+    (float16_t)0.073564564f,  (float16_t)0.997290457f,
+    (float16_t)0.075094301f,  (float16_t)0.997176437f,
+    (float16_t)0.076623861f,  (float16_t)0.997060070f,
+    (float16_t)0.078153242f,  (float16_t)0.996941358f,
+    (float16_t)0.079682438f,  (float16_t)0.996820299f,
+    (float16_t)0.081211447f,  (float16_t)0.996696895f,
+    (float16_t)0.082740265f,  (float16_t)0.996571146f,
+    (float16_t)0.084268888f,  (float16_t)0.996443051f,
+    (float16_t)0.085797312f,  (float16_t)0.996312612f,
+    (float16_t)0.087325535f,  (float16_t)0.996179829f,
+    (float16_t)0.088853553f,  (float16_t)0.996044701f,
+    (float16_t)0.090381361f,  (float16_t)0.995907229f,
+    (float16_t)0.091908956f,  (float16_t)0.995767414f,
+    (float16_t)0.093436336f,  (float16_t)0.995625256f,
+    (float16_t)0.094963495f,  (float16_t)0.995480755f,
+    (float16_t)0.096490431f,  (float16_t)0.995333912f,
+    (float16_t)0.098017140f,  (float16_t)0.995184727f,
+    (float16_t)0.099543619f,  (float16_t)0.995033199f,
+    (float16_t)0.101069863f,  (float16_t)0.994879331f,
+    (float16_t)0.102595869f,  (float16_t)0.994723121f,
+    (float16_t)0.104121634f,  (float16_t)0.994564571f,
+    (float16_t)0.105647154f,  (float16_t)0.994403680f,
+    (float16_t)0.107172425f,  (float16_t)0.994240449f,
+    (float16_t)0.108697444f,  (float16_t)0.994074879f,
+    (float16_t)0.110222207f,  (float16_t)0.993906970f,
+    (float16_t)0.111746711f,  (float16_t)0.993736722f,
+    (float16_t)0.113270952f,  (float16_t)0.993564136f,
+    (float16_t)0.114794927f,  (float16_t)0.993389211f,
+    (float16_t)0.116318631f,  (float16_t)0.993211949f,
+    (float16_t)0.117842062f,  (float16_t)0.993032350f,
+    (float16_t)0.119365215f,  (float16_t)0.992850414f,
+    (float16_t)0.120888087f,  (float16_t)0.992666142f,
+    (float16_t)0.122410675f,  (float16_t)0.992479535f,
+    (float16_t)0.123932975f,  (float16_t)0.992290591f,
+    (float16_t)0.125454983f,  (float16_t)0.992099313f,
+    (float16_t)0.126976696f,  (float16_t)0.991905700f,
+    (float16_t)0.128498111f,  (float16_t)0.991709754f,
+    (float16_t)0.130019223f,  (float16_t)0.991511473f,
+    (float16_t)0.131540029f,  (float16_t)0.991310860f,
+    (float16_t)0.133060525f,  (float16_t)0.991107914f,
+    (float16_t)0.134580709f,  (float16_t)0.990902635f,
+    (float16_t)0.136100575f,  (float16_t)0.990695025f,
+    (float16_t)0.137620122f,  (float16_t)0.990485084f,
+    (float16_t)0.139139344f,  (float16_t)0.990272812f,
+    (float16_t)0.140658239f,  (float16_t)0.990058210f,
+    (float16_t)0.142176804f,  (float16_t)0.989841278f,
+    (float16_t)0.143695033f,  (float16_t)0.989622017f,
+    (float16_t)0.145212925f,  (float16_t)0.989400428f,
+    (float16_t)0.146730474f,  (float16_t)0.989176510f,
+    (float16_t)0.148247679f,  (float16_t)0.988950265f,
+    (float16_t)0.149764535f,  (float16_t)0.988721692f,
+    (float16_t)0.151281038f,  (float16_t)0.988490793f,
+    (float16_t)0.152797185f,  (float16_t)0.988257568f,
+    (float16_t)0.154312973f,  (float16_t)0.988022017f,
+    (float16_t)0.155828398f,  (float16_t)0.987784142f,
+    (float16_t)0.157343456f,  (float16_t)0.987543942f,
+    (float16_t)0.158858143f,  (float16_t)0.987301418f,
+    (float16_t)0.160372457f,  (float16_t)0.987056571f,
+    (float16_t)0.161886394f,  (float16_t)0.986809402f,
+    (float16_t)0.163399949f,  (float16_t)0.986559910f,
+    (float16_t)0.164913120f,  (float16_t)0.986308097f,
+    (float16_t)0.166425904f,  (float16_t)0.986053963f,
+    (float16_t)0.167938295f,  (float16_t)0.985797509f,
+    (float16_t)0.169450291f,  (float16_t)0.985538735f,
+    (float16_t)0.170961889f,  (float16_t)0.985277642f,
+    (float16_t)0.172473084f,  (float16_t)0.985014231f,
+    (float16_t)0.173983873f,  (float16_t)0.984748502f,
+    (float16_t)0.175494253f,  (float16_t)0.984480455f,
+    (float16_t)0.177004220f,  (float16_t)0.984210092f,
+    (float16_t)0.178513771f,  (float16_t)0.983937413f,
+    (float16_t)0.180022901f,  (float16_t)0.983662419f,
+    (float16_t)0.181531608f,  (float16_t)0.983385110f,
+    (float16_t)0.183039888f,  (float16_t)0.983105487f,
+    (float16_t)0.184547737f,  (float16_t)0.982823551f,
+    (float16_t)0.186055152f,  (float16_t)0.982539302f,
+    (float16_t)0.187562129f,  (float16_t)0.982252741f,
+    (float16_t)0.189068664f,  (float16_t)0.981963869f,
+    (float16_t)0.190574755f,  (float16_t)0.981672686f,
+    (float16_t)0.192080397f,  (float16_t)0.981379193f,
+    (float16_t)0.193585587f,  (float16_t)0.981083391f,
+    (float16_t)0.195090322f,  (float16_t)0.980785280f,
+    (float16_t)0.196594598f,  (float16_t)0.980484862f,
+    (float16_t)0.198098411f,  (float16_t)0.980182136f,
+    (float16_t)0.199601758f,  (float16_t)0.979877104f,
+    (float16_t)0.201104635f,  (float16_t)0.979569766f,
+    (float16_t)0.202607039f,  (float16_t)0.979260123f,
+    (float16_t)0.204108966f,  (float16_t)0.978948175f,
+    (float16_t)0.205610413f,  (float16_t)0.978633924f,
+    (float16_t)0.207111376f,  (float16_t)0.978317371f,
+    (float16_t)0.208611852f,  (float16_t)0.977998515f,
+    (float16_t)0.210111837f,  (float16_t)0.977677358f,
+    (float16_t)0.211611327f,  (float16_t)0.977353900f,
+    (float16_t)0.213110320f,  (float16_t)0.977028143f,
+    (float16_t)0.214608811f,  (float16_t)0.976700086f,
+    (float16_t)0.216106797f,  (float16_t)0.976369731f,
+    (float16_t)0.217604275f,  (float16_t)0.976037079f,
+    (float16_t)0.219101240f,  (float16_t)0.975702130f,
+    (float16_t)0.220597690f,  (float16_t)0.975364885f,
+    (float16_t)0.222093621f,  (float16_t)0.975025345f,
+    (float16_t)0.223589029f,  (float16_t)0.974683511f,
+    (float16_t)0.225083911f,  (float16_t)0.974339383f,
+    (float16_t)0.226578264f,  (float16_t)0.973992962f,
+    (float16_t)0.228072083f,  (float16_t)0.973644250f,
+    (float16_t)0.229565366f,  (float16_t)0.973293246f,
+    (float16_t)0.231058108f,  (float16_t)0.972939952f,
+    (float16_t)0.232550307f,  (float16_t)0.972584369f,
+    (float16_t)0.234041959f,  (float16_t)0.972226497f,
+    (float16_t)0.235533059f,  (float16_t)0.971866337f,
+    (float16_t)0.237023606f,  (float16_t)0.971503891f,
+    (float16_t)0.238513595f,  (float16_t)0.971139158f,
+    (float16_t)0.240003022f,  (float16_t)0.970772141f,
+    (float16_t)0.241491885f,  (float16_t)0.970402839f,
+    (float16_t)0.242980180f,  (float16_t)0.970031253f,
+    (float16_t)0.244467903f,  (float16_t)0.969657385f,
+    (float16_t)0.245955050f,  (float16_t)0.969281235f,
+    (float16_t)0.247441619f,  (float16_t)0.968902805f,
+    (float16_t)0.248927606f,  (float16_t)0.968522094f,
+    (float16_t)0.250413007f,  (float16_t)0.968139105f,
+    (float16_t)0.251897818f,  (float16_t)0.967753837f,
+    (float16_t)0.253382037f,  (float16_t)0.967366292f,
+    (float16_t)0.254865660f,  (float16_t)0.966976471f,
+    (float16_t)0.256348682f,  (float16_t)0.966584374f,
+    (float16_t)0.257831102f,  (float16_t)0.966190003f,
+    (float16_t)0.259312915f,  (float16_t)0.965793359f,
+    (float16_t)0.260794118f,  (float16_t)0.965394442f,
+    (float16_t)0.262274707f,  (float16_t)0.964993253f,
+    (float16_t)0.263754679f,  (float16_t)0.964589793f,
+    (float16_t)0.265234030f,  (float16_t)0.964184064f,
+    (float16_t)0.266712757f,  (float16_t)0.963776066f,
+    (float16_t)0.268190857f,  (float16_t)0.963365800f,
+    (float16_t)0.269668326f,  (float16_t)0.962953267f,
+    (float16_t)0.271145160f,  (float16_t)0.962538468f,
+    (float16_t)0.272621355f,  (float16_t)0.962121404f,
+    (float16_t)0.274096910f,  (float16_t)0.961702077f,
+    (float16_t)0.275571819f,  (float16_t)0.961280486f,
+    (float16_t)0.277046080f,  (float16_t)0.960856633f,
+    (float16_t)0.278519689f,  (float16_t)0.960430519f,
+    (float16_t)0.279992643f,  (float16_t)0.960002146f,
+    (float16_t)0.281464938f,  (float16_t)0.959571513f,
+    (float16_t)0.282936570f,  (float16_t)0.959138622f,
+    (float16_t)0.284407537f,  (float16_t)0.958703475f,
+    (float16_t)0.285877835f,  (float16_t)0.958266071f,
+    (float16_t)0.287347460f,  (float16_t)0.957826413f,
+    (float16_t)0.288816408f,  (float16_t)0.957384501f,
+    (float16_t)0.290284677f,  (float16_t)0.956940336f,
+    (float16_t)0.291752263f,  (float16_t)0.956493919f,
+    (float16_t)0.293219163f,  (float16_t)0.956045251f,
+    (float16_t)0.294685372f,  (float16_t)0.955594334f,
+    (float16_t)0.296150888f,  (float16_t)0.955141168f,
+    (float16_t)0.297615707f,  (float16_t)0.954685755f,
+    (float16_t)0.299079826f,  (float16_t)0.954228095f,
+    (float16_t)0.300543241f,  (float16_t)0.953768190f,
+    (float16_t)0.302005949f,  (float16_t)0.953306040f,
+    (float16_t)0.303467947f,  (float16_t)0.952841648f,
+    (float16_t)0.304929230f,  (float16_t)0.952375013f,
+    (float16_t)0.306389795f,  (float16_t)0.951906137f,
+    (float16_t)0.307849640f,  (float16_t)0.951435021f,
+    (float16_t)0.309308760f,  (float16_t)0.950961666f,
+    (float16_t)0.310767153f,  (float16_t)0.950486074f,
+    (float16_t)0.312224814f,  (float16_t)0.950008245f,
+    (float16_t)0.313681740f,  (float16_t)0.949528181f,
+    (float16_t)0.315137929f,  (float16_t)0.949045882f,
+    (float16_t)0.316593376f,  (float16_t)0.948561350f,
+    (float16_t)0.318048077f,  (float16_t)0.948074586f,
+    (float16_t)0.319502031f,  (float16_t)0.947585591f,
+    (float16_t)0.320955232f,  (float16_t)0.947094366f,
+    (float16_t)0.322407679f,  (float16_t)0.946600913f,
+    (float16_t)0.323859367f,  (float16_t)0.946105232f,
+    (float16_t)0.325310292f,  (float16_t)0.945607325f,
+    (float16_t)0.326760452f,  (float16_t)0.945107193f,
+    (float16_t)0.328209844f,  (float16_t)0.944604837f,
+    (float16_t)0.329658463f,  (float16_t)0.944100258f,
+    (float16_t)0.331106306f,  (float16_t)0.943593458f,
+    (float16_t)0.332553370f,  (float16_t)0.943084437f,
+    (float16_t)0.333999651f,  (float16_t)0.942573198f,
+    (float16_t)0.335445147f,  (float16_t)0.942059740f,
+    (float16_t)0.336889853f,  (float16_t)0.941544065f,
+    (float16_t)0.338333767f,  (float16_t)0.941026175f,
+    (float16_t)0.339776884f,  (float16_t)0.940506071f,
+    (float16_t)0.341219202f,  (float16_t)0.939983753f,
+    (float16_t)0.342660717f,  (float16_t)0.939459224f,
+    (float16_t)0.344101426f,  (float16_t)0.938932484f,
+    (float16_t)0.345541325f,  (float16_t)0.938403534f,
+    (float16_t)0.346980411f,  (float16_t)0.937872376f,
+    (float16_t)0.348418680f,  (float16_t)0.937339012f,
+    (float16_t)0.349856130f,  (float16_t)0.936803442f,
+    (float16_t)0.351292756f,  (float16_t)0.936265667f,
+    (float16_t)0.352728556f,  (float16_t)0.935725689f,
+    (float16_t)0.354163525f,  (float16_t)0.935183510f,
+    (float16_t)0.355597662f,  (float16_t)0.934639130f,
+    (float16_t)0.357030961f,  (float16_t)0.934092550f,
+    (float16_t)0.358463421f,  (float16_t)0.933543773f,
+    (float16_t)0.359895037f,  (float16_t)0.932992799f,
+    (float16_t)0.361325806f,  (float16_t)0.932439629f,
+    (float16_t)0.362755724f,  (float16_t)0.931884266f,
+    (float16_t)0.364184790f,  (float16_t)0.931326709f,
+    (float16_t)0.365612998f,  (float16_t)0.930766961f,
+    (float16_t)0.367040346f,  (float16_t)0.930205023f,
+    (float16_t)0.368466830f,  (float16_t)0.929640896f,
+    (float16_t)0.369892447f,  (float16_t)0.929074581f,
+    (float16_t)0.371317194f,  (float16_t)0.928506080f,
+    (float16_t)0.372741067f,  (float16_t)0.927935395f,
+    (float16_t)0.374164063f,  (float16_t)0.927362526f,
+    (float16_t)0.375586178f,  (float16_t)0.926787474f,
+    (float16_t)0.377007410f,  (float16_t)0.926210242f,
+    (float16_t)0.378427755f,  (float16_t)0.925630831f,
+    (float16_t)0.379847209f,  (float16_t)0.925049241f,
+    (float16_t)0.381265769f,  (float16_t)0.924465474f,
+    (float16_t)0.382683432f,  (float16_t)0.923879533f,
+    (float16_t)0.384100195f,  (float16_t)0.923291417f,
+    (float16_t)0.385516054f,  (float16_t)0.922701128f,
+    (float16_t)0.386931006f,  (float16_t)0.922108669f,
+    (float16_t)0.388345047f,  (float16_t)0.921514039f,
+    (float16_t)0.389758174f,  (float16_t)0.920917242f,
+    (float16_t)0.391170384f,  (float16_t)0.920318277f,
+    (float16_t)0.392581674f,  (float16_t)0.919717146f,
+    (float16_t)0.393992040f,  (float16_t)0.919113852f,
+    (float16_t)0.395401479f,  (float16_t)0.918508394f,
+    (float16_t)0.396809987f,  (float16_t)0.917900776f,
+    (float16_t)0.398217562f,  (float16_t)0.917290997f,
+    (float16_t)0.399624200f,  (float16_t)0.916679060f,
+    (float16_t)0.401029897f,  (float16_t)0.916064966f,
+    (float16_t)0.402434651f,  (float16_t)0.915448716f,
+    (float16_t)0.403838458f,  (float16_t)0.914830312f,
+    (float16_t)0.405241314f,  (float16_t)0.914209756f,
+    (float16_t)0.406643217f,  (float16_t)0.913587048f,
+    (float16_t)0.408044163f,  (float16_t)0.912962190f,
+    (float16_t)0.409444149f,  (float16_t)0.912335185f,
+    (float16_t)0.410843171f,  (float16_t)0.911706032f,
+    (float16_t)0.412241227f,  (float16_t)0.911074734f,
+    (float16_t)0.413638312f,  (float16_t)0.910441292f,
+    (float16_t)0.415034424f,  (float16_t)0.909805708f,
+    (float16_t)0.416429560f,  (float16_t)0.909167983f,
+    (float16_t)0.417823716f,  (float16_t)0.908528119f,
+    (float16_t)0.419216888f,  (float16_t)0.907886116f,
+    (float16_t)0.420609074f,  (float16_t)0.907241978f,
+    (float16_t)0.422000271f,  (float16_t)0.906595705f,
+    (float16_t)0.423390474f,  (float16_t)0.905947298f,
+    (float16_t)0.424779681f,  (float16_t)0.905296759f,
+    (float16_t)0.426167889f,  (float16_t)0.904644091f,
+    (float16_t)0.427555093f,  (float16_t)0.903989293f,
+    (float16_t)0.428941292f,  (float16_t)0.903332368f,
+    (float16_t)0.430326481f,  (float16_t)0.902673318f,
+    (float16_t)0.431710658f,  (float16_t)0.902012144f,
+    (float16_t)0.433093819f,  (float16_t)0.901348847f,
+    (float16_t)0.434475961f,  (float16_t)0.900683429f,
+    (float16_t)0.435857080f,  (float16_t)0.900015892f,
+    (float16_t)0.437237174f,  (float16_t)0.899346237f,
+    (float16_t)0.438616239f,  (float16_t)0.898674466f,
+    (float16_t)0.439994271f,  (float16_t)0.898000580f,
+    (float16_t)0.441371269f,  (float16_t)0.897324581f,
+    (float16_t)0.442747228f,  (float16_t)0.896646470f,
+    (float16_t)0.444122145f,  (float16_t)0.895966250f,
+    (float16_t)0.445496017f,  (float16_t)0.895283921f,
+    (float16_t)0.446868840f,  (float16_t)0.894599486f,
+    (float16_t)0.448240612f,  (float16_t)0.893912945f,
+    (float16_t)0.449611330f,  (float16_t)0.893224301f,
+    (float16_t)0.450980989f,  (float16_t)0.892533555f,
+    (float16_t)0.452349587f,  (float16_t)0.891840709f,
+    (float16_t)0.453717121f,  (float16_t)0.891145765f,
+    (float16_t)0.455083587f,  (float16_t)0.890448723f,
+    (float16_t)0.456448982f,  (float16_t)0.889749586f,
+    (float16_t)0.457813304f,  (float16_t)0.889048356f,
+    (float16_t)0.459176548f,  (float16_t)0.888345033f,
+    (float16_t)0.460538711f,  (float16_t)0.887639620f,
+    (float16_t)0.461899791f,  (float16_t)0.886932119f,
+    (float16_t)0.463259784f,  (float16_t)0.886222530f,
+    (float16_t)0.464618686f,  (float16_t)0.885510856f,
+    (float16_t)0.465976496f,  (float16_t)0.884797098f,
+    (float16_t)0.467333209f,  (float16_t)0.884081259f,
+    (float16_t)0.468688822f,  (float16_t)0.883363339f,
+    (float16_t)0.470043332f,  (float16_t)0.882643340f,
+    (float16_t)0.471396737f,  (float16_t)0.881921264f,
+    (float16_t)0.472749032f,  (float16_t)0.881197113f,
+    (float16_t)0.474100215f,  (float16_t)0.880470889f,
+    (float16_t)0.475450282f,  (float16_t)0.879742593f,
+    (float16_t)0.476799230f,  (float16_t)0.879012226f,
+    (float16_t)0.478147056f,  (float16_t)0.878279792f,
+    (float16_t)0.479493758f,  (float16_t)0.877545290f,
+    (float16_t)0.480839331f,  (float16_t)0.876808724f,
+    (float16_t)0.482183772f,  (float16_t)0.876070094f,
+    (float16_t)0.483527079f,  (float16_t)0.875329403f,
+    (float16_t)0.484869248f,  (float16_t)0.874586652f,
+    (float16_t)0.486210276f,  (float16_t)0.873841843f,
+    (float16_t)0.487550160f,  (float16_t)0.873094978f,
+    (float16_t)0.488888897f,  (float16_t)0.872346059f,
+    (float16_t)0.490226483f,  (float16_t)0.871595087f,
+    (float16_t)0.491562916f,  (float16_t)0.870842063f,
+    (float16_t)0.492898192f,  (float16_t)0.870086991f,
+    (float16_t)0.494232309f,  (float16_t)0.869329871f,
+    (float16_t)0.495565262f,  (float16_t)0.868570706f,
+    (float16_t)0.496897049f,  (float16_t)0.867809497f,
+    (float16_t)0.498227667f,  (float16_t)0.867046246f,
+    (float16_t)0.499557113f,  (float16_t)0.866280954f,
+    (float16_t)0.500885383f,  (float16_t)0.865513624f,
+    (float16_t)0.502212474f,  (float16_t)0.864744258f,
+    (float16_t)0.503538384f,  (float16_t)0.863972856f,
+    (float16_t)0.504863109f,  (float16_t)0.863199422f,
+    (float16_t)0.506186645f,  (float16_t)0.862423956f,
+    (float16_t)0.507508991f,  (float16_t)0.861646461f,
+    (float16_t)0.508830143f,  (float16_t)0.860866939f,
+    (float16_t)0.510150097f,  (float16_t)0.860085390f,
+    (float16_t)0.511468850f,  (float16_t)0.859301818f,
+    (float16_t)0.512786401f,  (float16_t)0.858516224f,
+    (float16_t)0.514102744f,  (float16_t)0.857728610f,
+    (float16_t)0.515417878f,  (float16_t)0.856938977f,
+    (float16_t)0.516731799f,  (float16_t)0.856147328f,
+    (float16_t)0.518044504f,  (float16_t)0.855353665f,
+    (float16_t)0.519355990f,  (float16_t)0.854557988f,
+    (float16_t)0.520666254f,  (float16_t)0.853760301f,
+    (float16_t)0.521975293f,  (float16_t)0.852960605f,
+    (float16_t)0.523283103f,  (float16_t)0.852158902f,
+    (float16_t)0.524589683f,  (float16_t)0.851355193f,
+    (float16_t)0.525895027f,  (float16_t)0.850549481f,
+    (float16_t)0.527199135f,  (float16_t)0.849741768f,
+    (float16_t)0.528502002f,  (float16_t)0.848932055f,
+    (float16_t)0.529803625f,  (float16_t)0.848120345f,
+    (float16_t)0.531104001f,  (float16_t)0.847306639f,
+    (float16_t)0.532403128f,  (float16_t)0.846490939f,
+    (float16_t)0.533701002f,  (float16_t)0.845673247f,
+    (float16_t)0.534997620f,  (float16_t)0.844853565f,
+    (float16_t)0.536292979f,  (float16_t)0.844031895f,
+    (float16_t)0.537587076f,  (float16_t)0.843208240f,
+    (float16_t)0.538879909f,  (float16_t)0.842382600f,
+    (float16_t)0.540171473f,  (float16_t)0.841554977f,
+    (float16_t)0.541461766f,  (float16_t)0.840725375f,
+    (float16_t)0.542750785f,  (float16_t)0.839893794f,
+    (float16_t)0.544038527f,  (float16_t)0.839060237f,
+    (float16_t)0.545324988f,  (float16_t)0.838224706f,
+    (float16_t)0.546610167f,  (float16_t)0.837387202f,
+    (float16_t)0.547894059f,  (float16_t)0.836547727f,
+    (float16_t)0.549176662f,  (float16_t)0.835706284f,
+    (float16_t)0.550457973f,  (float16_t)0.834862875f,
+    (float16_t)0.551737988f,  (float16_t)0.834017501f,
+    (float16_t)0.553016706f,  (float16_t)0.833170165f,
+    (float16_t)0.554294121f,  (float16_t)0.832320868f,
+    (float16_t)0.555570233f,  (float16_t)0.831469612f,
+    (float16_t)0.556845037f,  (float16_t)0.830616400f,
+    (float16_t)0.558118531f,  (float16_t)0.829761234f,
+    (float16_t)0.559390712f,  (float16_t)0.828904115f,
+    (float16_t)0.560661576f,  (float16_t)0.828045045f,
+    (float16_t)0.561931121f,  (float16_t)0.827184027f,
+    (float16_t)0.563199344f,  (float16_t)0.826321063f,
+    (float16_t)0.564466242f,  (float16_t)0.825456154f,
+    (float16_t)0.565731811f,  (float16_t)0.824589303f,
+    (float16_t)0.566996049f,  (float16_t)0.823720511f,
+    (float16_t)0.568258953f,  (float16_t)0.822849781f,
+    (float16_t)0.569520519f,  (float16_t)0.821977115f,
+    (float16_t)0.570780746f,  (float16_t)0.821102515f,
+    (float16_t)0.572039629f,  (float16_t)0.820225983f,
+    (float16_t)0.573297167f,  (float16_t)0.819347520f,
+    (float16_t)0.574553355f,  (float16_t)0.818467130f,
+    (float16_t)0.575808191f,  (float16_t)0.817584813f,
+    (float16_t)0.577061673f,  (float16_t)0.816700573f,
+    (float16_t)0.578313796f,  (float16_t)0.815814411f,
+    (float16_t)0.579564559f,  (float16_t)0.814926329f,
+    (float16_t)0.580813958f,  (float16_t)0.814036330f,
+    (float16_t)0.582061990f,  (float16_t)0.813144415f,
+    (float16_t)0.583308653f,  (float16_t)0.812250587f,
+    (float16_t)0.584553943f,  (float16_t)0.811354847f,
+    (float16_t)0.585797857f,  (float16_t)0.810457198f,
+    (float16_t)0.587040394f,  (float16_t)0.809557642f,
+    (float16_t)0.588281548f,  (float16_t)0.808656182f,
+    (float16_t)0.589521319f,  (float16_t)0.807752818f,
+    (float16_t)0.590759702f,  (float16_t)0.806847554f,
+    (float16_t)0.591996695f,  (float16_t)0.805940391f,
+    (float16_t)0.593232295f,  (float16_t)0.805031331f,
+    (float16_t)0.594466499f,  (float16_t)0.804120377f,
+    (float16_t)0.595699304f,  (float16_t)0.803207531f,
+    (float16_t)0.596930708f,  (float16_t)0.802292796f,
+    (float16_t)0.598160707f,  (float16_t)0.801376172f,
+    (float16_t)0.599389298f,  (float16_t)0.800457662f,
+    (float16_t)0.600616479f,  (float16_t)0.799537269f,
+    (float16_t)0.601842247f,  (float16_t)0.798614995f,
+    (float16_t)0.603066599f,  (float16_t)0.797690841f,
+    (float16_t)0.604289531f,  (float16_t)0.796764810f,
+    (float16_t)0.605511041f,  (float16_t)0.795836905f,
+    (float16_t)0.606731127f,  (float16_t)0.794907126f,
+    (float16_t)0.607949785f,  (float16_t)0.793975478f,
+    (float16_t)0.609167012f,  (float16_t)0.793041960f,
+    (float16_t)0.610382806f,  (float16_t)0.792106577f,
+    (float16_t)0.611597164f,  (float16_t)0.791169330f,
+    (float16_t)0.612810082f,  (float16_t)0.790230221f,
+    (float16_t)0.614021559f,  (float16_t)0.789289253f,
+    (float16_t)0.615231591f,  (float16_t)0.788346428f,
+    (float16_t)0.616440175f,  (float16_t)0.787401747f,
+    (float16_t)0.617647308f,  (float16_t)0.786455214f,
+    (float16_t)0.618852988f,  (float16_t)0.785506830f,
+    (float16_t)0.620057212f,  (float16_t)0.784556597f,
+    (float16_t)0.621259977f,  (float16_t)0.783604519f,
+    (float16_t)0.622461279f,  (float16_t)0.782650596f,
+    (float16_t)0.623661118f,  (float16_t)0.781694832f,
+    (float16_t)0.624859488f,  (float16_t)0.780737229f,
+    (float16_t)0.626056388f,  (float16_t)0.779777788f,
+    (float16_t)0.627251815f,  (float16_t)0.778816512f,
+    (float16_t)0.628445767f,  (float16_t)0.777853404f,
+    (float16_t)0.629638239f,  (float16_t)0.776888466f,
+    (float16_t)0.630829230f,  (float16_t)0.775921699f,
+    (float16_t)0.632018736f,  (float16_t)0.774953107f,
+    (float16_t)0.633206755f,  (float16_t)0.773982691f,
+    (float16_t)0.634393284f,  (float16_t)0.773010453f,
+    (float16_t)0.635578320f,  (float16_t)0.772036397f,
+    (float16_t)0.636761861f,  (float16_t)0.771060524f,
+    (float16_t)0.637943904f,  (float16_t)0.770082837f,
+    (float16_t)0.639124445f,  (float16_t)0.769103338f,
+    (float16_t)0.640303482f,  (float16_t)0.768122029f,
+    (float16_t)0.641481013f,  (float16_t)0.767138912f,
+    (float16_t)0.642657034f,  (float16_t)0.766153990f,
+    (float16_t)0.643831543f,  (float16_t)0.765167266f,
+    (float16_t)0.645004537f,  (float16_t)0.764178741f,
+    (float16_t)0.646176013f,  (float16_t)0.763188417f,
+    (float16_t)0.647345969f,  (float16_t)0.762196298f,
+    (float16_t)0.648514401f,  (float16_t)0.761202385f,
+    (float16_t)0.649681307f,  (float16_t)0.760206682f,
+    (float16_t)0.650846685f,  (float16_t)0.759209189f,
+    (float16_t)0.652010531f,  (float16_t)0.758209910f,
+    (float16_t)0.653172843f,  (float16_t)0.757208847f,
+    (float16_t)0.654333618f,  (float16_t)0.756206001f,
+    (float16_t)0.655492853f,  (float16_t)0.755201377f,
+    (float16_t)0.656650546f,  (float16_t)0.754194975f,
+    (float16_t)0.657806693f,  (float16_t)0.753186799f,
+    (float16_t)0.658961293f,  (float16_t)0.752176850f,
+    (float16_t)0.660114342f,  (float16_t)0.751165132f,
+    (float16_t)0.661265838f,  (float16_t)0.750151646f,
+    (float16_t)0.662415778f,  (float16_t)0.749136395f,
+    (float16_t)0.663564159f,  (float16_t)0.748119380f,
+    (float16_t)0.664710978f,  (float16_t)0.747100606f,
+    (float16_t)0.665856234f,  (float16_t)0.746080074f,
+    (float16_t)0.666999922f,  (float16_t)0.745057785f,
+    (float16_t)0.668142041f,  (float16_t)0.744033744f,
+    (float16_t)0.669282588f,  (float16_t)0.743007952f,
+    (float16_t)0.670421560f,  (float16_t)0.741980412f,
+    (float16_t)0.671558955f,  (float16_t)0.740951125f,
+    (float16_t)0.672694769f,  (float16_t)0.739920095f,
+    (float16_t)0.673829000f,  (float16_t)0.738887324f,
+    (float16_t)0.674961646f,  (float16_t)0.737852815f,
+    (float16_t)0.676092704f,  (float16_t)0.736816569f,
+    (float16_t)0.677222170f,  (float16_t)0.735778589f,
+    (float16_t)0.678350043f,  (float16_t)0.734738878f,
+    (float16_t)0.679476320f,  (float16_t)0.733697438f,
+    (float16_t)0.680600998f,  (float16_t)0.732654272f,
+    (float16_t)0.681724074f,  (float16_t)0.731609381f,
+    (float16_t)0.682845546f,  (float16_t)0.730562769f,
+    (float16_t)0.683965412f,  (float16_t)0.729514438f,
+    (float16_t)0.685083668f,  (float16_t)0.728464390f,
+    (float16_t)0.686200312f,  (float16_t)0.727412629f,
+    (float16_t)0.687315341f,  (float16_t)0.726359155f,
+    (float16_t)0.688428753f,  (float16_t)0.725303972f,
+    (float16_t)0.689540545f,  (float16_t)0.724247083f,
+    (float16_t)0.690650714f,  (float16_t)0.723188489f,
+    (float16_t)0.691759258f,  (float16_t)0.722128194f,
+    (float16_t)0.692866175f,  (float16_t)0.721066199f,
+    (float16_t)0.693971461f,  (float16_t)0.720002508f,
+    (float16_t)0.695075114f,  (float16_t)0.718937122f,
+    (float16_t)0.696177131f,  (float16_t)0.717870045f,
+    (float16_t)0.697277511f,  (float16_t)0.716801279f,
+    (float16_t)0.698376249f,  (float16_t)0.715730825f,
+    (float16_t)0.699473345f,  (float16_t)0.714658688f,
+    (float16_t)0.700568794f,  (float16_t)0.713584869f,
+    (float16_t)0.701662595f,  (float16_t)0.712509371f,
+    (float16_t)0.702754744f,  (float16_t)0.711432196f,
+    (float16_t)0.703845241f,  (float16_t)0.710353347f,
+    (float16_t)0.704934080f,  (float16_t)0.709272826f,
+    (float16_t)0.706021261f,  (float16_t)0.708190637f,
+    (float16_t)0.707106781f,  (float16_t)0.707106781f,
+    (float16_t)0.708190637f,  (float16_t)0.706021261f,
+    (float16_t)0.709272826f,  (float16_t)0.704934080f,
+    (float16_t)0.710353347f,  (float16_t)0.703845241f,
+    (float16_t)0.711432196f,  (float16_t)0.702754744f,
+    (float16_t)0.712509371f,  (float16_t)0.701662595f,
+    (float16_t)0.713584869f,  (float16_t)0.700568794f,
+    (float16_t)0.714658688f,  (float16_t)0.699473345f,
+    (float16_t)0.715730825f,  (float16_t)0.698376249f,
+    (float16_t)0.716801279f,  (float16_t)0.697277511f,
+    (float16_t)0.717870045f,  (float16_t)0.696177131f,
+    (float16_t)0.718937122f,  (float16_t)0.695075114f,
+    (float16_t)0.720002508f,  (float16_t)0.693971461f,
+    (float16_t)0.721066199f,  (float16_t)0.692866175f,
+    (float16_t)0.722128194f,  (float16_t)0.691759258f,
+    (float16_t)0.723188489f,  (float16_t)0.690650714f,
+    (float16_t)0.724247083f,  (float16_t)0.689540545f,
+    (float16_t)0.725303972f,  (float16_t)0.688428753f,
+    (float16_t)0.726359155f,  (float16_t)0.687315341f,
+    (float16_t)0.727412629f,  (float16_t)0.686200312f,
+    (float16_t)0.728464390f,  (float16_t)0.685083668f,
+    (float16_t)0.729514438f,  (float16_t)0.683965412f,
+    (float16_t)0.730562769f,  (float16_t)0.682845546f,
+    (float16_t)0.731609381f,  (float16_t)0.681724074f,
+    (float16_t)0.732654272f,  (float16_t)0.680600998f,
+    (float16_t)0.733697438f,  (float16_t)0.679476320f,
+    (float16_t)0.734738878f,  (float16_t)0.678350043f,
+    (float16_t)0.735778589f,  (float16_t)0.677222170f,
+    (float16_t)0.736816569f,  (float16_t)0.676092704f,
+    (float16_t)0.737852815f,  (float16_t)0.674961646f,
+    (float16_t)0.738887324f,  (float16_t)0.673829000f,
+    (float16_t)0.739920095f,  (float16_t)0.672694769f,
+    (float16_t)0.740951125f,  (float16_t)0.671558955f,
+    (float16_t)0.741980412f,  (float16_t)0.670421560f,
+    (float16_t)0.743007952f,  (float16_t)0.669282588f,
+    (float16_t)0.744033744f,  (float16_t)0.668142041f,
+    (float16_t)0.745057785f,  (float16_t)0.666999922f,
+    (float16_t)0.746080074f,  (float16_t)0.665856234f,
+    (float16_t)0.747100606f,  (float16_t)0.664710978f,
+    (float16_t)0.748119380f,  (float16_t)0.663564159f,
+    (float16_t)0.749136395f,  (float16_t)0.662415778f,
+    (float16_t)0.750151646f,  (float16_t)0.661265838f,
+    (float16_t)0.751165132f,  (float16_t)0.660114342f,
+    (float16_t)0.752176850f,  (float16_t)0.658961293f,
+    (float16_t)0.753186799f,  (float16_t)0.657806693f,
+    (float16_t)0.754194975f,  (float16_t)0.656650546f,
+    (float16_t)0.755201377f,  (float16_t)0.655492853f,
+    (float16_t)0.756206001f,  (float16_t)0.654333618f,
+    (float16_t)0.757208847f,  (float16_t)0.653172843f,
+    (float16_t)0.758209910f,  (float16_t)0.652010531f,
+    (float16_t)0.759209189f,  (float16_t)0.650846685f,
+    (float16_t)0.760206682f,  (float16_t)0.649681307f,
+    (float16_t)0.761202385f,  (float16_t)0.648514401f,
+    (float16_t)0.762196298f,  (float16_t)0.647345969f,
+    (float16_t)0.763188417f,  (float16_t)0.646176013f,
+    (float16_t)0.764178741f,  (float16_t)0.645004537f,
+    (float16_t)0.765167266f,  (float16_t)0.643831543f,
+    (float16_t)0.766153990f,  (float16_t)0.642657034f,
+    (float16_t)0.767138912f,  (float16_t)0.641481013f,
+    (float16_t)0.768122029f,  (float16_t)0.640303482f,
+    (float16_t)0.769103338f,  (float16_t)0.639124445f,
+    (float16_t)0.770082837f,  (float16_t)0.637943904f,
+    (float16_t)0.771060524f,  (float16_t)0.636761861f,
+    (float16_t)0.772036397f,  (float16_t)0.635578320f,
+    (float16_t)0.773010453f,  (float16_t)0.634393284f,
+    (float16_t)0.773982691f,  (float16_t)0.633206755f,
+    (float16_t)0.774953107f,  (float16_t)0.632018736f,
+    (float16_t)0.775921699f,  (float16_t)0.630829230f,
+    (float16_t)0.776888466f,  (float16_t)0.629638239f,
+    (float16_t)0.777853404f,  (float16_t)0.628445767f,
+    (float16_t)0.778816512f,  (float16_t)0.627251815f,
+    (float16_t)0.779777788f,  (float16_t)0.626056388f,
+    (float16_t)0.780737229f,  (float16_t)0.624859488f,
+    (float16_t)0.781694832f,  (float16_t)0.623661118f,
+    (float16_t)0.782650596f,  (float16_t)0.622461279f,
+    (float16_t)0.783604519f,  (float16_t)0.621259977f,
+    (float16_t)0.784556597f,  (float16_t)0.620057212f,
+    (float16_t)0.785506830f,  (float16_t)0.618852988f,
+    (float16_t)0.786455214f,  (float16_t)0.617647308f,
+    (float16_t)0.787401747f,  (float16_t)0.616440175f,
+    (float16_t)0.788346428f,  (float16_t)0.615231591f,
+    (float16_t)0.789289253f,  (float16_t)0.614021559f,
+    (float16_t)0.790230221f,  (float16_t)0.612810082f,
+    (float16_t)0.791169330f,  (float16_t)0.611597164f,
+    (float16_t)0.792106577f,  (float16_t)0.610382806f,
+    (float16_t)0.793041960f,  (float16_t)0.609167012f,
+    (float16_t)0.793975478f,  (float16_t)0.607949785f,
+    (float16_t)0.794907126f,  (float16_t)0.606731127f,
+    (float16_t)0.795836905f,  (float16_t)0.605511041f,
+    (float16_t)0.796764810f,  (float16_t)0.604289531f,
+    (float16_t)0.797690841f,  (float16_t)0.603066599f,
+    (float16_t)0.798614995f,  (float16_t)0.601842247f,
+    (float16_t)0.799537269f,  (float16_t)0.600616479f,
+    (float16_t)0.800457662f,  (float16_t)0.599389298f,
+    (float16_t)0.801376172f,  (float16_t)0.598160707f,
+    (float16_t)0.802292796f,  (float16_t)0.596930708f,
+    (float16_t)0.803207531f,  (float16_t)0.595699304f,
+    (float16_t)0.804120377f,  (float16_t)0.594466499f,
+    (float16_t)0.805031331f,  (float16_t)0.593232295f,
+    (float16_t)0.805940391f,  (float16_t)0.591996695f,
+    (float16_t)0.806847554f,  (float16_t)0.590759702f,
+    (float16_t)0.807752818f,  (float16_t)0.589521319f,
+    (float16_t)0.808656182f,  (float16_t)0.588281548f,
+    (float16_t)0.809557642f,  (float16_t)0.587040394f,
+    (float16_t)0.810457198f,  (float16_t)0.585797857f,
+    (float16_t)0.811354847f,  (float16_t)0.584553943f,
+    (float16_t)0.812250587f,  (float16_t)0.583308653f,
+    (float16_t)0.813144415f,  (float16_t)0.582061990f,
+    (float16_t)0.814036330f,  (float16_t)0.580813958f,
+    (float16_t)0.814926329f,  (float16_t)0.579564559f,
+    (float16_t)0.815814411f,  (float16_t)0.578313796f,
+    (float16_t)0.816700573f,  (float16_t)0.577061673f,
+    (float16_t)0.817584813f,  (float16_t)0.575808191f,
+    (float16_t)0.818467130f,  (float16_t)0.574553355f,
+    (float16_t)0.819347520f,  (float16_t)0.573297167f,
+    (float16_t)0.820225983f,  (float16_t)0.572039629f,
+    (float16_t)0.821102515f,  (float16_t)0.570780746f,
+    (float16_t)0.821977115f,  (float16_t)0.569520519f,
+    (float16_t)0.822849781f,  (float16_t)0.568258953f,
+    (float16_t)0.823720511f,  (float16_t)0.566996049f,
+    (float16_t)0.824589303f,  (float16_t)0.565731811f,
+    (float16_t)0.825456154f,  (float16_t)0.564466242f,
+    (float16_t)0.826321063f,  (float16_t)0.563199344f,
+    (float16_t)0.827184027f,  (float16_t)0.561931121f,
+    (float16_t)0.828045045f,  (float16_t)0.560661576f,
+    (float16_t)0.828904115f,  (float16_t)0.559390712f,
+    (float16_t)0.829761234f,  (float16_t)0.558118531f,
+    (float16_t)0.830616400f,  (float16_t)0.556845037f,
+    (float16_t)0.831469612f,  (float16_t)0.555570233f,
+    (float16_t)0.832320868f,  (float16_t)0.554294121f,
+    (float16_t)0.833170165f,  (float16_t)0.553016706f,
+    (float16_t)0.834017501f,  (float16_t)0.551737988f,
+    (float16_t)0.834862875f,  (float16_t)0.550457973f,
+    (float16_t)0.835706284f,  (float16_t)0.549176662f,
+    (float16_t)0.836547727f,  (float16_t)0.547894059f,
+    (float16_t)0.837387202f,  (float16_t)0.546610167f,
+    (float16_t)0.838224706f,  (float16_t)0.545324988f,
+    (float16_t)0.839060237f,  (float16_t)0.544038527f,
+    (float16_t)0.839893794f,  (float16_t)0.542750785f,
+    (float16_t)0.840725375f,  (float16_t)0.541461766f,
+    (float16_t)0.841554977f,  (float16_t)0.540171473f,
+    (float16_t)0.842382600f,  (float16_t)0.538879909f,
+    (float16_t)0.843208240f,  (float16_t)0.537587076f,
+    (float16_t)0.844031895f,  (float16_t)0.536292979f,
+    (float16_t)0.844853565f,  (float16_t)0.534997620f,
+    (float16_t)0.845673247f,  (float16_t)0.533701002f,
+    (float16_t)0.846490939f,  (float16_t)0.532403128f,
+    (float16_t)0.847306639f,  (float16_t)0.531104001f,
+    (float16_t)0.848120345f,  (float16_t)0.529803625f,
+    (float16_t)0.848932055f,  (float16_t)0.528502002f,
+    (float16_t)0.849741768f,  (float16_t)0.527199135f,
+    (float16_t)0.850549481f,  (float16_t)0.525895027f,
+    (float16_t)0.851355193f,  (float16_t)0.524589683f,
+    (float16_t)0.852158902f,  (float16_t)0.523283103f,
+    (float16_t)0.852960605f,  (float16_t)0.521975293f,
+    (float16_t)0.853760301f,  (float16_t)0.520666254f,
+    (float16_t)0.854557988f,  (float16_t)0.519355990f,
+    (float16_t)0.855353665f,  (float16_t)0.518044504f,
+    (float16_t)0.856147328f,  (float16_t)0.516731799f,
+    (float16_t)0.856938977f,  (float16_t)0.515417878f,
+    (float16_t)0.857728610f,  (float16_t)0.514102744f,
+    (float16_t)0.858516224f,  (float16_t)0.512786401f,
+    (float16_t)0.859301818f,  (float16_t)0.511468850f,
+    (float16_t)0.860085390f,  (float16_t)0.510150097f,
+    (float16_t)0.860866939f,  (float16_t)0.508830143f,
+    (float16_t)0.861646461f,  (float16_t)0.507508991f,
+    (float16_t)0.862423956f,  (float16_t)0.506186645f,
+    (float16_t)0.863199422f,  (float16_t)0.504863109f,
+    (float16_t)0.863972856f,  (float16_t)0.503538384f,
+    (float16_t)0.864744258f,  (float16_t)0.502212474f,
+    (float16_t)0.865513624f,  (float16_t)0.500885383f,
+    (float16_t)0.866280954f,  (float16_t)0.499557113f,
+    (float16_t)0.867046246f,  (float16_t)0.498227667f,
+    (float16_t)0.867809497f,  (float16_t)0.496897049f,
+    (float16_t)0.868570706f,  (float16_t)0.495565262f,
+    (float16_t)0.869329871f,  (float16_t)0.494232309f,
+    (float16_t)0.870086991f,  (float16_t)0.492898192f,
+    (float16_t)0.870842063f,  (float16_t)0.491562916f,
+    (float16_t)0.871595087f,  (float16_t)0.490226483f,
+    (float16_t)0.872346059f,  (float16_t)0.488888897f,
+    (float16_t)0.873094978f,  (float16_t)0.487550160f,
+    (float16_t)0.873841843f,  (float16_t)0.486210276f,
+    (float16_t)0.874586652f,  (float16_t)0.484869248f,
+    (float16_t)0.875329403f,  (float16_t)0.483527079f,
+    (float16_t)0.876070094f,  (float16_t)0.482183772f,
+    (float16_t)0.876808724f,  (float16_t)0.480839331f,
+    (float16_t)0.877545290f,  (float16_t)0.479493758f,
+    (float16_t)0.878279792f,  (float16_t)0.478147056f,
+    (float16_t)0.879012226f,  (float16_t)0.476799230f,
+    (float16_t)0.879742593f,  (float16_t)0.475450282f,
+    (float16_t)0.880470889f,  (float16_t)0.474100215f,
+    (float16_t)0.881197113f,  (float16_t)0.472749032f,
+    (float16_t)0.881921264f,  (float16_t)0.471396737f,
+    (float16_t)0.882643340f,  (float16_t)0.470043332f,
+    (float16_t)0.883363339f,  (float16_t)0.468688822f,
+    (float16_t)0.884081259f,  (float16_t)0.467333209f,
+    (float16_t)0.884797098f,  (float16_t)0.465976496f,
+    (float16_t)0.885510856f,  (float16_t)0.464618686f,
+    (float16_t)0.886222530f,  (float16_t)0.463259784f,
+    (float16_t)0.886932119f,  (float16_t)0.461899791f,
+    (float16_t)0.887639620f,  (float16_t)0.460538711f,
+    (float16_t)0.888345033f,  (float16_t)0.459176548f,
+    (float16_t)0.889048356f,  (float16_t)0.457813304f,
+    (float16_t)0.889749586f,  (float16_t)0.456448982f,
+    (float16_t)0.890448723f,  (float16_t)0.455083587f,
+    (float16_t)0.891145765f,  (float16_t)0.453717121f,
+    (float16_t)0.891840709f,  (float16_t)0.452349587f,
+    (float16_t)0.892533555f,  (float16_t)0.450980989f,
+    (float16_t)0.893224301f,  (float16_t)0.449611330f,
+    (float16_t)0.893912945f,  (float16_t)0.448240612f,
+    (float16_t)0.894599486f,  (float16_t)0.446868840f,
+    (float16_t)0.895283921f,  (float16_t)0.445496017f,
+    (float16_t)0.895966250f,  (float16_t)0.444122145f,
+    (float16_t)0.896646470f,  (float16_t)0.442747228f,
+    (float16_t)0.897324581f,  (float16_t)0.441371269f,
+    (float16_t)0.898000580f,  (float16_t)0.439994271f,
+    (float16_t)0.898674466f,  (float16_t)0.438616239f,
+    (float16_t)0.899346237f,  (float16_t)0.437237174f,
+    (float16_t)0.900015892f,  (float16_t)0.435857080f,
+    (float16_t)0.900683429f,  (float16_t)0.434475961f,
+    (float16_t)0.901348847f,  (float16_t)0.433093819f,
+    (float16_t)0.902012144f,  (float16_t)0.431710658f,
+    (float16_t)0.902673318f,  (float16_t)0.430326481f,
+    (float16_t)0.903332368f,  (float16_t)0.428941292f,
+    (float16_t)0.903989293f,  (float16_t)0.427555093f,
+    (float16_t)0.904644091f,  (float16_t)0.426167889f,
+    (float16_t)0.905296759f,  (float16_t)0.424779681f,
+    (float16_t)0.905947298f,  (float16_t)0.423390474f,
+    (float16_t)0.906595705f,  (float16_t)0.422000271f,
+    (float16_t)0.907241978f,  (float16_t)0.420609074f,
+    (float16_t)0.907886116f,  (float16_t)0.419216888f,
+    (float16_t)0.908528119f,  (float16_t)0.417823716f,
+    (float16_t)0.909167983f,  (float16_t)0.416429560f,
+    (float16_t)0.909805708f,  (float16_t)0.415034424f,
+    (float16_t)0.910441292f,  (float16_t)0.413638312f,
+    (float16_t)0.911074734f,  (float16_t)0.412241227f,
+    (float16_t)0.911706032f,  (float16_t)0.410843171f,
+    (float16_t)0.912335185f,  (float16_t)0.409444149f,
+    (float16_t)0.912962190f,  (float16_t)0.408044163f,
+    (float16_t)0.913587048f,  (float16_t)0.406643217f,
+    (float16_t)0.914209756f,  (float16_t)0.405241314f,
+    (float16_t)0.914830312f,  (float16_t)0.403838458f,
+    (float16_t)0.915448716f,  (float16_t)0.402434651f,
+    (float16_t)0.916064966f,  (float16_t)0.401029897f,
+    (float16_t)0.916679060f,  (float16_t)0.399624200f,
+    (float16_t)0.917290997f,  (float16_t)0.398217562f,
+    (float16_t)0.917900776f,  (float16_t)0.396809987f,
+    (float16_t)0.918508394f,  (float16_t)0.395401479f,
+    (float16_t)0.919113852f,  (float16_t)0.393992040f,
+    (float16_t)0.919717146f,  (float16_t)0.392581674f,
+    (float16_t)0.920318277f,  (float16_t)0.391170384f,
+    (float16_t)0.920917242f,  (float16_t)0.389758174f,
+    (float16_t)0.921514039f,  (float16_t)0.388345047f,
+    (float16_t)0.922108669f,  (float16_t)0.386931006f,
+    (float16_t)0.922701128f,  (float16_t)0.385516054f,
+    (float16_t)0.923291417f,  (float16_t)0.384100195f,
+    (float16_t)0.923879533f,  (float16_t)0.382683432f,
+    (float16_t)0.924465474f,  (float16_t)0.381265769f,
+    (float16_t)0.925049241f,  (float16_t)0.379847209f,
+    (float16_t)0.925630831f,  (float16_t)0.378427755f,
+    (float16_t)0.926210242f,  (float16_t)0.377007410f,
+    (float16_t)0.926787474f,  (float16_t)0.375586178f,
+    (float16_t)0.927362526f,  (float16_t)0.374164063f,
+    (float16_t)0.927935395f,  (float16_t)0.372741067f,
+    (float16_t)0.928506080f,  (float16_t)0.371317194f,
+    (float16_t)0.929074581f,  (float16_t)0.369892447f,
+    (float16_t)0.929640896f,  (float16_t)0.368466830f,
+    (float16_t)0.930205023f,  (float16_t)0.367040346f,
+    (float16_t)0.930766961f,  (float16_t)0.365612998f,
+    (float16_t)0.931326709f,  (float16_t)0.364184790f,
+    (float16_t)0.931884266f,  (float16_t)0.362755724f,
+    (float16_t)0.932439629f,  (float16_t)0.361325806f,
+    (float16_t)0.932992799f,  (float16_t)0.359895037f,
+    (float16_t)0.933543773f,  (float16_t)0.358463421f,
+    (float16_t)0.934092550f,  (float16_t)0.357030961f,
+    (float16_t)0.934639130f,  (float16_t)0.355597662f,
+    (float16_t)0.935183510f,  (float16_t)0.354163525f,
+    (float16_t)0.935725689f,  (float16_t)0.352728556f,
+    (float16_t)0.936265667f,  (float16_t)0.351292756f,
+    (float16_t)0.936803442f,  (float16_t)0.349856130f,
+    (float16_t)0.937339012f,  (float16_t)0.348418680f,
+    (float16_t)0.937872376f,  (float16_t)0.346980411f,
+    (float16_t)0.938403534f,  (float16_t)0.345541325f,
+    (float16_t)0.938932484f,  (float16_t)0.344101426f,
+    (float16_t)0.939459224f,  (float16_t)0.342660717f,
+    (float16_t)0.939983753f,  (float16_t)0.341219202f,
+    (float16_t)0.940506071f,  (float16_t)0.339776884f,
+    (float16_t)0.941026175f,  (float16_t)0.338333767f,
+    (float16_t)0.941544065f,  (float16_t)0.336889853f,
+    (float16_t)0.942059740f,  (float16_t)0.335445147f,
+    (float16_t)0.942573198f,  (float16_t)0.333999651f,
+    (float16_t)0.943084437f,  (float16_t)0.332553370f,
+    (float16_t)0.943593458f,  (float16_t)0.331106306f,
+    (float16_t)0.944100258f,  (float16_t)0.329658463f,
+    (float16_t)0.944604837f,  (float16_t)0.328209844f,
+    (float16_t)0.945107193f,  (float16_t)0.326760452f,
+    (float16_t)0.945607325f,  (float16_t)0.325310292f,
+    (float16_t)0.946105232f,  (float16_t)0.323859367f,
+    (float16_t)0.946600913f,  (float16_t)0.322407679f,
+    (float16_t)0.947094366f,  (float16_t)0.320955232f,
+    (float16_t)0.947585591f,  (float16_t)0.319502031f,
+    (float16_t)0.948074586f,  (float16_t)0.318048077f,
+    (float16_t)0.948561350f,  (float16_t)0.316593376f,
+    (float16_t)0.949045882f,  (float16_t)0.315137929f,
+    (float16_t)0.949528181f,  (float16_t)0.313681740f,
+    (float16_t)0.950008245f,  (float16_t)0.312224814f,
+    (float16_t)0.950486074f,  (float16_t)0.310767153f,
+    (float16_t)0.950961666f,  (float16_t)0.309308760f,
+    (float16_t)0.951435021f,  (float16_t)0.307849640f,
+    (float16_t)0.951906137f,  (float16_t)0.306389795f,
+    (float16_t)0.952375013f,  (float16_t)0.304929230f,
+    (float16_t)0.952841648f,  (float16_t)0.303467947f,
+    (float16_t)0.953306040f,  (float16_t)0.302005949f,
+    (float16_t)0.953768190f,  (float16_t)0.300543241f,
+    (float16_t)0.954228095f,  (float16_t)0.299079826f,
+    (float16_t)0.954685755f,  (float16_t)0.297615707f,
+    (float16_t)0.955141168f,  (float16_t)0.296150888f,
+    (float16_t)0.955594334f,  (float16_t)0.294685372f,
+    (float16_t)0.956045251f,  (float16_t)0.293219163f,
+    (float16_t)0.956493919f,  (float16_t)0.291752263f,
+    (float16_t)0.956940336f,  (float16_t)0.290284677f,
+    (float16_t)0.957384501f,  (float16_t)0.288816408f,
+    (float16_t)0.957826413f,  (float16_t)0.287347460f,
+    (float16_t)0.958266071f,  (float16_t)0.285877835f,
+    (float16_t)0.958703475f,  (float16_t)0.284407537f,
+    (float16_t)0.959138622f,  (float16_t)0.282936570f,
+    (float16_t)0.959571513f,  (float16_t)0.281464938f,
+    (float16_t)0.960002146f,  (float16_t)0.279992643f,
+    (float16_t)0.960430519f,  (float16_t)0.278519689f,
+    (float16_t)0.960856633f,  (float16_t)0.277046080f,
+    (float16_t)0.961280486f,  (float16_t)0.275571819f,
+    (float16_t)0.961702077f,  (float16_t)0.274096910f,
+    (float16_t)0.962121404f,  (float16_t)0.272621355f,
+    (float16_t)0.962538468f,  (float16_t)0.271145160f,
+    (float16_t)0.962953267f,  (float16_t)0.269668326f,
+    (float16_t)0.963365800f,  (float16_t)0.268190857f,
+    (float16_t)0.963776066f,  (float16_t)0.266712757f,
+    (float16_t)0.964184064f,  (float16_t)0.265234030f,
+    (float16_t)0.964589793f,  (float16_t)0.263754679f,
+    (float16_t)0.964993253f,  (float16_t)0.262274707f,
+    (float16_t)0.965394442f,  (float16_t)0.260794118f,
+    (float16_t)0.965793359f,  (float16_t)0.259312915f,
+    (float16_t)0.966190003f,  (float16_t)0.257831102f,
+    (float16_t)0.966584374f,  (float16_t)0.256348682f,
+    (float16_t)0.966976471f,  (float16_t)0.254865660f,
+    (float16_t)0.967366292f,  (float16_t)0.253382037f,
+    (float16_t)0.967753837f,  (float16_t)0.251897818f,
+    (float16_t)0.968139105f,  (float16_t)0.250413007f,
+    (float16_t)0.968522094f,  (float16_t)0.248927606f,
+    (float16_t)0.968902805f,  (float16_t)0.247441619f,
+    (float16_t)0.969281235f,  (float16_t)0.245955050f,
+    (float16_t)0.969657385f,  (float16_t)0.244467903f,
+    (float16_t)0.970031253f,  (float16_t)0.242980180f,
+    (float16_t)0.970402839f,  (float16_t)0.241491885f,
+    (float16_t)0.970772141f,  (float16_t)0.240003022f,
+    (float16_t)0.971139158f,  (float16_t)0.238513595f,
+    (float16_t)0.971503891f,  (float16_t)0.237023606f,
+    (float16_t)0.971866337f,  (float16_t)0.235533059f,
+    (float16_t)0.972226497f,  (float16_t)0.234041959f,
+    (float16_t)0.972584369f,  (float16_t)0.232550307f,
+    (float16_t)0.972939952f,  (float16_t)0.231058108f,
+    (float16_t)0.973293246f,  (float16_t)0.229565366f,
+    (float16_t)0.973644250f,  (float16_t)0.228072083f,
+    (float16_t)0.973992962f,  (float16_t)0.226578264f,
+    (float16_t)0.974339383f,  (float16_t)0.225083911f,
+    (float16_t)0.974683511f,  (float16_t)0.223589029f,
+    (float16_t)0.975025345f,  (float16_t)0.222093621f,
+    (float16_t)0.975364885f,  (float16_t)0.220597690f,
+    (float16_t)0.975702130f,  (float16_t)0.219101240f,
+    (float16_t)0.976037079f,  (float16_t)0.217604275f,
+    (float16_t)0.976369731f,  (float16_t)0.216106797f,
+    (float16_t)0.976700086f,  (float16_t)0.214608811f,
+    (float16_t)0.977028143f,  (float16_t)0.213110320f,
+    (float16_t)0.977353900f,  (float16_t)0.211611327f,
+    (float16_t)0.977677358f,  (float16_t)0.210111837f,
+    (float16_t)0.977998515f,  (float16_t)0.208611852f,
+    (float16_t)0.978317371f,  (float16_t)0.207111376f,
+    (float16_t)0.978633924f,  (float16_t)0.205610413f,
+    (float16_t)0.978948175f,  (float16_t)0.204108966f,
+    (float16_t)0.979260123f,  (float16_t)0.202607039f,
+    (float16_t)0.979569766f,  (float16_t)0.201104635f,
+    (float16_t)0.979877104f,  (float16_t)0.199601758f,
+    (float16_t)0.980182136f,  (float16_t)0.198098411f,
+    (float16_t)0.980484862f,  (float16_t)0.196594598f,
+    (float16_t)0.980785280f,  (float16_t)0.195090322f,
+    (float16_t)0.981083391f,  (float16_t)0.193585587f,
+    (float16_t)0.981379193f,  (float16_t)0.192080397f,
+    (float16_t)0.981672686f,  (float16_t)0.190574755f,
+    (float16_t)0.981963869f,  (float16_t)0.189068664f,
+    (float16_t)0.982252741f,  (float16_t)0.187562129f,
+    (float16_t)0.982539302f,  (float16_t)0.186055152f,
+    (float16_t)0.982823551f,  (float16_t)0.184547737f,
+    (float16_t)0.983105487f,  (float16_t)0.183039888f,
+    (float16_t)0.983385110f,  (float16_t)0.181531608f,
+    (float16_t)0.983662419f,  (float16_t)0.180022901f,
+    (float16_t)0.983937413f,  (float16_t)0.178513771f,
+    (float16_t)0.984210092f,  (float16_t)0.177004220f,
+    (float16_t)0.984480455f,  (float16_t)0.175494253f,
+    (float16_t)0.984748502f,  (float16_t)0.173983873f,
+    (float16_t)0.985014231f,  (float16_t)0.172473084f,
+    (float16_t)0.985277642f,  (float16_t)0.170961889f,
+    (float16_t)0.985538735f,  (float16_t)0.169450291f,
+    (float16_t)0.985797509f,  (float16_t)0.167938295f,
+    (float16_t)0.986053963f,  (float16_t)0.166425904f,
+    (float16_t)0.986308097f,  (float16_t)0.164913120f,
+    (float16_t)0.986559910f,  (float16_t)0.163399949f,
+    (float16_t)0.986809402f,  (float16_t)0.161886394f,
+    (float16_t)0.987056571f,  (float16_t)0.160372457f,
+    (float16_t)0.987301418f,  (float16_t)0.158858143f,
+    (float16_t)0.987543942f,  (float16_t)0.157343456f,
+    (float16_t)0.987784142f,  (float16_t)0.155828398f,
+    (float16_t)0.988022017f,  (float16_t)0.154312973f,
+    (float16_t)0.988257568f,  (float16_t)0.152797185f,
+    (float16_t)0.988490793f,  (float16_t)0.151281038f,
+    (float16_t)0.988721692f,  (float16_t)0.149764535f,
+    (float16_t)0.988950265f,  (float16_t)0.148247679f,
+    (float16_t)0.989176510f,  (float16_t)0.146730474f,
+    (float16_t)0.989400428f,  (float16_t)0.145212925f,
+    (float16_t)0.989622017f,  (float16_t)0.143695033f,
+    (float16_t)0.989841278f,  (float16_t)0.142176804f,
+    (float16_t)0.990058210f,  (float16_t)0.140658239f,
+    (float16_t)0.990272812f,  (float16_t)0.139139344f,
+    (float16_t)0.990485084f,  (float16_t)0.137620122f,
+    (float16_t)0.990695025f,  (float16_t)0.136100575f,
+    (float16_t)0.990902635f,  (float16_t)0.134580709f,
+    (float16_t)0.991107914f,  (float16_t)0.133060525f,
+    (float16_t)0.991310860f,  (float16_t)0.131540029f,
+    (float16_t)0.991511473f,  (float16_t)0.130019223f,
+    (float16_t)0.991709754f,  (float16_t)0.128498111f,
+    (float16_t)0.991905700f,  (float16_t)0.126976696f,
+    (float16_t)0.992099313f,  (float16_t)0.125454983f,
+    (float16_t)0.992290591f,  (float16_t)0.123932975f,
+    (float16_t)0.992479535f,  (float16_t)0.122410675f,
+    (float16_t)0.992666142f,  (float16_t)0.120888087f,
+    (float16_t)0.992850414f,  (float16_t)0.119365215f,
+    (float16_t)0.993032350f,  (float16_t)0.117842062f,
+    (float16_t)0.993211949f,  (float16_t)0.116318631f,
+    (float16_t)0.993389211f,  (float16_t)0.114794927f,
+    (float16_t)0.993564136f,  (float16_t)0.113270952f,
+    (float16_t)0.993736722f,  (float16_t)0.111746711f,
+    (float16_t)0.993906970f,  (float16_t)0.110222207f,
+    (float16_t)0.994074879f,  (float16_t)0.108697444f,
+    (float16_t)0.994240449f,  (float16_t)0.107172425f,
+    (float16_t)0.994403680f,  (float16_t)0.105647154f,
+    (float16_t)0.994564571f,  (float16_t)0.104121634f,
+    (float16_t)0.994723121f,  (float16_t)0.102595869f,
+    (float16_t)0.994879331f,  (float16_t)0.101069863f,
+    (float16_t)0.995033199f,  (float16_t)0.099543619f,
+    (float16_t)0.995184727f,  (float16_t)0.098017140f,
+    (float16_t)0.995333912f,  (float16_t)0.096490431f,
+    (float16_t)0.995480755f,  (float16_t)0.094963495f,
+    (float16_t)0.995625256f,  (float16_t)0.093436336f,
+    (float16_t)0.995767414f,  (float16_t)0.091908956f,
+    (float16_t)0.995907229f,  (float16_t)0.090381361f,
+    (float16_t)0.996044701f,  (float16_t)0.088853553f,
+    (float16_t)0.996179829f,  (float16_t)0.087325535f,
+    (float16_t)0.996312612f,  (float16_t)0.085797312f,
+    (float16_t)0.996443051f,  (float16_t)0.084268888f,
+    (float16_t)0.996571146f,  (float16_t)0.082740265f,
+    (float16_t)0.996696895f,  (float16_t)0.081211447f,
+    (float16_t)0.996820299f,  (float16_t)0.079682438f,
+    (float16_t)0.996941358f,  (float16_t)0.078153242f,
+    (float16_t)0.997060070f,  (float16_t)0.076623861f,
+    (float16_t)0.997176437f,  (float16_t)0.075094301f,
+    (float16_t)0.997290457f,  (float16_t)0.073564564f,
+    (float16_t)0.997402130f,  (float16_t)0.072034653f,
+    (float16_t)0.997511456f,  (float16_t)0.070504573f,
+    (float16_t)0.997618435f,  (float16_t)0.068974328f,
+    (float16_t)0.997723067f,  (float16_t)0.067443920f,
+    (float16_t)0.997825350f,  (float16_t)0.065913353f,
+    (float16_t)0.997925286f,  (float16_t)0.064382631f,
+    (float16_t)0.998022874f,  (float16_t)0.062851758f,
+    (float16_t)0.998118113f,  (float16_t)0.061320736f,
+    (float16_t)0.998211003f,  (float16_t)0.059789571f,
+    (float16_t)0.998301545f,  (float16_t)0.058258265f,
+    (float16_t)0.998389737f,  (float16_t)0.056726821f,
+    (float16_t)0.998475581f,  (float16_t)0.055195244f,
+    (float16_t)0.998559074f,  (float16_t)0.053663538f,
+    (float16_t)0.998640218f,  (float16_t)0.052131705f,
+    (float16_t)0.998719012f,  (float16_t)0.050599749f,
+    (float16_t)0.998795456f,  (float16_t)0.049067674f,
+    (float16_t)0.998869550f,  (float16_t)0.047535484f,
+    (float16_t)0.998941293f,  (float16_t)0.046003182f,
+    (float16_t)0.999010686f,  (float16_t)0.044470772f,
+    (float16_t)0.999077728f,  (float16_t)0.042938257f,
+    (float16_t)0.999142419f,  (float16_t)0.041405641f,
+    (float16_t)0.999204759f,  (float16_t)0.039872928f,
+    (float16_t)0.999264747f,  (float16_t)0.038340120f,
+    (float16_t)0.999322385f,  (float16_t)0.036807223f,
+    (float16_t)0.999377670f,  (float16_t)0.035274239f,
+    (float16_t)0.999430605f,  (float16_t)0.033741172f,
+    (float16_t)0.999481187f,  (float16_t)0.032208025f,
+    (float16_t)0.999529418f,  (float16_t)0.030674803f,
+    (float16_t)0.999575296f,  (float16_t)0.029141509f,
+    (float16_t)0.999618822f,  (float16_t)0.027608146f,
+    (float16_t)0.999659997f,  (float16_t)0.026074718f,
+    (float16_t)0.999698819f,  (float16_t)0.024541229f,
+    (float16_t)0.999735288f,  (float16_t)0.023007681f,
+    (float16_t)0.999769405f,  (float16_t)0.021474080f,
+    (float16_t)0.999801170f,  (float16_t)0.019940429f,
+    (float16_t)0.999830582f,  (float16_t)0.018406730f,
+    (float16_t)0.999857641f,  (float16_t)0.016872988f,
+    (float16_t)0.999882347f,  (float16_t)0.015339206f,
+    (float16_t)0.999904701f,  (float16_t)0.013805389f,
+    (float16_t)0.999924702f,  (float16_t)0.012271538f,
+    (float16_t)0.999942350f,  (float16_t)0.010737659f,
+    (float16_t)0.999957645f,  (float16_t)0.009203755f,
+    (float16_t)0.999970586f,  (float16_t)0.007669829f,
+    (float16_t)0.999981175f,  (float16_t)0.006135885f,
+    (float16_t)0.999989411f,  (float16_t)0.004601926f,
+    (float16_t)0.999995294f,  (float16_t)0.003067957f,
+    (float16_t)0.999998823f,  (float16_t)0.001533980f,
+    (float16_t)1.000000000f,  (float16_t)0.000000000f,
+    (float16_t)0.999998823f, (float16_t)-0.001533980f,
+    (float16_t)0.999995294f, (float16_t)-0.003067957f,
+    (float16_t)0.999989411f, (float16_t)-0.004601926f,
+    (float16_t)0.999981175f, (float16_t)-0.006135885f,
+    (float16_t)0.999970586f, (float16_t)-0.007669829f,
+    (float16_t)0.999957645f, (float16_t)-0.009203755f,
+    (float16_t)0.999942350f, (float16_t)-0.010737659f,
+    (float16_t)0.999924702f, (float16_t)-0.012271538f,
+    (float16_t)0.999904701f, (float16_t)-0.013805389f,
+    (float16_t)0.999882347f, (float16_t)-0.015339206f,
+    (float16_t)0.999857641f, (float16_t)-0.016872988f,
+    (float16_t)0.999830582f, (float16_t)-0.018406730f,
+    (float16_t)0.999801170f, (float16_t)-0.019940429f,
+    (float16_t)0.999769405f, (float16_t)-0.021474080f,
+    (float16_t)0.999735288f, (float16_t)-0.023007681f,
+    (float16_t)0.999698819f, (float16_t)-0.024541229f,
+    (float16_t)0.999659997f, (float16_t)-0.026074718f,
+    (float16_t)0.999618822f, (float16_t)-0.027608146f,
+    (float16_t)0.999575296f, (float16_t)-0.029141509f,
+    (float16_t)0.999529418f, (float16_t)-0.030674803f,
+    (float16_t)0.999481187f, (float16_t)-0.032208025f,
+    (float16_t)0.999430605f, (float16_t)-0.033741172f,
+    (float16_t)0.999377670f, (float16_t)-0.035274239f,
+    (float16_t)0.999322385f, (float16_t)-0.036807223f,
+    (float16_t)0.999264747f, (float16_t)-0.038340120f,
+    (float16_t)0.999204759f, (float16_t)-0.039872928f,
+    (float16_t)0.999142419f, (float16_t)-0.041405641f,
+    (float16_t)0.999077728f, (float16_t)-0.042938257f,
+    (float16_t)0.999010686f, (float16_t)-0.044470772f,
+    (float16_t)0.998941293f, (float16_t)-0.046003182f,
+    (float16_t)0.998869550f, (float16_t)-0.047535484f,
+    (float16_t)0.998795456f, (float16_t)-0.049067674f,
+    (float16_t)0.998719012f, (float16_t)-0.050599749f,
+    (float16_t)0.998640218f, (float16_t)-0.052131705f,
+    (float16_t)0.998559074f, (float16_t)-0.053663538f,
+    (float16_t)0.998475581f, (float16_t)-0.055195244f,
+    (float16_t)0.998389737f, (float16_t)-0.056726821f,
+    (float16_t)0.998301545f, (float16_t)-0.058258265f,
+    (float16_t)0.998211003f, (float16_t)-0.059789571f,
+    (float16_t)0.998118113f, (float16_t)-0.061320736f,
+    (float16_t)0.998022874f, (float16_t)-0.062851758f,
+    (float16_t)0.997925286f, (float16_t)-0.064382631f,
+    (float16_t)0.997825350f, (float16_t)-0.065913353f,
+    (float16_t)0.997723067f, (float16_t)-0.067443920f,
+    (float16_t)0.997618435f, (float16_t)-0.068974328f,
+    (float16_t)0.997511456f, (float16_t)-0.070504573f,
+    (float16_t)0.997402130f, (float16_t)-0.072034653f,
+    (float16_t)0.997290457f, (float16_t)-0.073564564f,
+    (float16_t)0.997176437f, (float16_t)-0.075094301f,
+    (float16_t)0.997060070f, (float16_t)-0.076623861f,
+    (float16_t)0.996941358f, (float16_t)-0.078153242f,
+    (float16_t)0.996820299f, (float16_t)-0.079682438f,
+    (float16_t)0.996696895f, (float16_t)-0.081211447f,
+    (float16_t)0.996571146f, (float16_t)-0.082740265f,
+    (float16_t)0.996443051f, (float16_t)-0.084268888f,
+    (float16_t)0.996312612f, (float16_t)-0.085797312f,
+    (float16_t)0.996179829f, (float16_t)-0.087325535f,
+    (float16_t)0.996044701f, (float16_t)-0.088853553f,
+    (float16_t)0.995907229f, (float16_t)-0.090381361f,
+    (float16_t)0.995767414f, (float16_t)-0.091908956f,
+    (float16_t)0.995625256f, (float16_t)-0.093436336f,
+    (float16_t)0.995480755f, (float16_t)-0.094963495f,
+    (float16_t)0.995333912f, (float16_t)-0.096490431f,
+    (float16_t)0.995184727f, (float16_t)-0.098017140f,
+    (float16_t)0.995033199f, (float16_t)-0.099543619f,
+    (float16_t)0.994879331f, (float16_t)-0.101069863f,
+    (float16_t)0.994723121f, (float16_t)-0.102595869f,
+    (float16_t)0.994564571f, (float16_t)-0.104121634f,
+    (float16_t)0.994403680f, (float16_t)-0.105647154f,
+    (float16_t)0.994240449f, (float16_t)-0.107172425f,
+    (float16_t)0.994074879f, (float16_t)-0.108697444f,
+    (float16_t)0.993906970f, (float16_t)-0.110222207f,
+    (float16_t)0.993736722f, (float16_t)-0.111746711f,
+    (float16_t)0.993564136f, (float16_t)-0.113270952f,
+    (float16_t)0.993389211f, (float16_t)-0.114794927f,
+    (float16_t)0.993211949f, (float16_t)-0.116318631f,
+    (float16_t)0.993032350f, (float16_t)-0.117842062f,
+    (float16_t)0.992850414f, (float16_t)-0.119365215f,
+    (float16_t)0.992666142f, (float16_t)-0.120888087f,
+    (float16_t)0.992479535f, (float16_t)-0.122410675f,
+    (float16_t)0.992290591f, (float16_t)-0.123932975f,
+    (float16_t)0.992099313f, (float16_t)-0.125454983f,
+    (float16_t)0.991905700f, (float16_t)-0.126976696f,
+    (float16_t)0.991709754f, (float16_t)-0.128498111f,
+    (float16_t)0.991511473f, (float16_t)-0.130019223f,
+    (float16_t)0.991310860f, (float16_t)-0.131540029f,
+    (float16_t)0.991107914f, (float16_t)-0.133060525f,
+    (float16_t)0.990902635f, (float16_t)-0.134580709f,
+    (float16_t)0.990695025f, (float16_t)-0.136100575f,
+    (float16_t)0.990485084f, (float16_t)-0.137620122f,
+    (float16_t)0.990272812f, (float16_t)-0.139139344f,
+    (float16_t)0.990058210f, (float16_t)-0.140658239f,
+    (float16_t)0.989841278f, (float16_t)-0.142176804f,
+    (float16_t)0.989622017f, (float16_t)-0.143695033f,
+    (float16_t)0.989400428f, (float16_t)-0.145212925f,
+    (float16_t)0.989176510f, (float16_t)-0.146730474f,
+    (float16_t)0.988950265f, (float16_t)-0.148247679f,
+    (float16_t)0.988721692f, (float16_t)-0.149764535f,
+    (float16_t)0.988490793f, (float16_t)-0.151281038f,
+    (float16_t)0.988257568f, (float16_t)-0.152797185f,
+    (float16_t)0.988022017f, (float16_t)-0.154312973f,
+    (float16_t)0.987784142f, (float16_t)-0.155828398f,
+    (float16_t)0.987543942f, (float16_t)-0.157343456f,
+    (float16_t)0.987301418f, (float16_t)-0.158858143f,
+    (float16_t)0.987056571f, (float16_t)-0.160372457f,
+    (float16_t)0.986809402f, (float16_t)-0.161886394f,
+    (float16_t)0.986559910f, (float16_t)-0.163399949f,
+    (float16_t)0.986308097f, (float16_t)-0.164913120f,
+    (float16_t)0.986053963f, (float16_t)-0.166425904f,
+    (float16_t)0.985797509f, (float16_t)-0.167938295f,
+    (float16_t)0.985538735f, (float16_t)-0.169450291f,
+    (float16_t)0.985277642f, (float16_t)-0.170961889f,
+    (float16_t)0.985014231f, (float16_t)-0.172473084f,
+    (float16_t)0.984748502f, (float16_t)-0.173983873f,
+    (float16_t)0.984480455f, (float16_t)-0.175494253f,
+    (float16_t)0.984210092f, (float16_t)-0.177004220f,
+    (float16_t)0.983937413f, (float16_t)-0.178513771f,
+    (float16_t)0.983662419f, (float16_t)-0.180022901f,
+    (float16_t)0.983385110f, (float16_t)-0.181531608f,
+    (float16_t)0.983105487f, (float16_t)-0.183039888f,
+    (float16_t)0.982823551f, (float16_t)-0.184547737f,
+    (float16_t)0.982539302f, (float16_t)-0.186055152f,
+    (float16_t)0.982252741f, (float16_t)-0.187562129f,
+    (float16_t)0.981963869f, (float16_t)-0.189068664f,
+    (float16_t)0.981672686f, (float16_t)-0.190574755f,
+    (float16_t)0.981379193f, (float16_t)-0.192080397f,
+    (float16_t)0.981083391f, (float16_t)-0.193585587f,
+    (float16_t)0.980785280f, (float16_t)-0.195090322f,
+    (float16_t)0.980484862f, (float16_t)-0.196594598f,
+    (float16_t)0.980182136f, (float16_t)-0.198098411f,
+    (float16_t)0.979877104f, (float16_t)-0.199601758f,
+    (float16_t)0.979569766f, (float16_t)-0.201104635f,
+    (float16_t)0.979260123f, (float16_t)-0.202607039f,
+    (float16_t)0.978948175f, (float16_t)-0.204108966f,
+    (float16_t)0.978633924f, (float16_t)-0.205610413f,
+    (float16_t)0.978317371f, (float16_t)-0.207111376f,
+    (float16_t)0.977998515f, (float16_t)-0.208611852f,
+    (float16_t)0.977677358f, (float16_t)-0.210111837f,
+    (float16_t)0.977353900f, (float16_t)-0.211611327f,
+    (float16_t)0.977028143f, (float16_t)-0.213110320f,
+    (float16_t)0.976700086f, (float16_t)-0.214608811f,
+    (float16_t)0.976369731f, (float16_t)-0.216106797f,
+    (float16_t)0.976037079f, (float16_t)-0.217604275f,
+    (float16_t)0.975702130f, (float16_t)-0.219101240f,
+    (float16_t)0.975364885f, (float16_t)-0.220597690f,
+    (float16_t)0.975025345f, (float16_t)-0.222093621f,
+    (float16_t)0.974683511f, (float16_t)-0.223589029f,
+    (float16_t)0.974339383f, (float16_t)-0.225083911f,
+    (float16_t)0.973992962f, (float16_t)-0.226578264f,
+    (float16_t)0.973644250f, (float16_t)-0.228072083f,
+    (float16_t)0.973293246f, (float16_t)-0.229565366f,
+    (float16_t)0.972939952f, (float16_t)-0.231058108f,
+    (float16_t)0.972584369f, (float16_t)-0.232550307f,
+    (float16_t)0.972226497f, (float16_t)-0.234041959f,
+    (float16_t)0.971866337f, (float16_t)-0.235533059f,
+    (float16_t)0.971503891f, (float16_t)-0.237023606f,
+    (float16_t)0.971139158f, (float16_t)-0.238513595f,
+    (float16_t)0.970772141f, (float16_t)-0.240003022f,
+    (float16_t)0.970402839f, (float16_t)-0.241491885f,
+    (float16_t)0.970031253f, (float16_t)-0.242980180f,
+    (float16_t)0.969657385f, (float16_t)-0.244467903f,
+    (float16_t)0.969281235f, (float16_t)-0.245955050f,
+    (float16_t)0.968902805f, (float16_t)-0.247441619f,
+    (float16_t)0.968522094f, (float16_t)-0.248927606f,
+    (float16_t)0.968139105f, (float16_t)-0.250413007f,
+    (float16_t)0.967753837f, (float16_t)-0.251897818f,
+    (float16_t)0.967366292f, (float16_t)-0.253382037f,
+    (float16_t)0.966976471f, (float16_t)-0.254865660f,
+    (float16_t)0.966584374f, (float16_t)-0.256348682f,
+    (float16_t)0.966190003f, (float16_t)-0.257831102f,
+    (float16_t)0.965793359f, (float16_t)-0.259312915f,
+    (float16_t)0.965394442f, (float16_t)-0.260794118f,
+    (float16_t)0.964993253f, (float16_t)-0.262274707f,
+    (float16_t)0.964589793f, (float16_t)-0.263754679f,
+    (float16_t)0.964184064f, (float16_t)-0.265234030f,
+    (float16_t)0.963776066f, (float16_t)-0.266712757f,
+    (float16_t)0.963365800f, (float16_t)-0.268190857f,
+    (float16_t)0.962953267f, (float16_t)-0.269668326f,
+    (float16_t)0.962538468f, (float16_t)-0.271145160f,
+    (float16_t)0.962121404f, (float16_t)-0.272621355f,
+    (float16_t)0.961702077f, (float16_t)-0.274096910f,
+    (float16_t)0.961280486f, (float16_t)-0.275571819f,
+    (float16_t)0.960856633f, (float16_t)-0.277046080f,
+    (float16_t)0.960430519f, (float16_t)-0.278519689f,
+    (float16_t)0.960002146f, (float16_t)-0.279992643f,
+    (float16_t)0.959571513f, (float16_t)-0.281464938f,
+    (float16_t)0.959138622f, (float16_t)-0.282936570f,
+    (float16_t)0.958703475f, (float16_t)-0.284407537f,
+    (float16_t)0.958266071f, (float16_t)-0.285877835f,
+    (float16_t)0.957826413f, (float16_t)-0.287347460f,
+    (float16_t)0.957384501f, (float16_t)-0.288816408f,
+    (float16_t)0.956940336f, (float16_t)-0.290284677f,
+    (float16_t)0.956493919f, (float16_t)-0.291752263f,
+    (float16_t)0.956045251f, (float16_t)-0.293219163f,
+    (float16_t)0.955594334f, (float16_t)-0.294685372f,
+    (float16_t)0.955141168f, (float16_t)-0.296150888f,
+    (float16_t)0.954685755f, (float16_t)-0.297615707f,
+    (float16_t)0.954228095f, (float16_t)-0.299079826f,
+    (float16_t)0.953768190f, (float16_t)-0.300543241f,
+    (float16_t)0.953306040f, (float16_t)-0.302005949f,
+    (float16_t)0.952841648f, (float16_t)-0.303467947f,
+    (float16_t)0.952375013f, (float16_t)-0.304929230f,
+    (float16_t)0.951906137f, (float16_t)-0.306389795f,
+    (float16_t)0.951435021f, (float16_t)-0.307849640f,
+    (float16_t)0.950961666f, (float16_t)-0.309308760f,
+    (float16_t)0.950486074f, (float16_t)-0.310767153f,
+    (float16_t)0.950008245f, (float16_t)-0.312224814f,
+    (float16_t)0.949528181f, (float16_t)-0.313681740f,
+    (float16_t)0.949045882f, (float16_t)-0.315137929f,
+    (float16_t)0.948561350f, (float16_t)-0.316593376f,
+    (float16_t)0.948074586f, (float16_t)-0.318048077f,
+    (float16_t)0.947585591f, (float16_t)-0.319502031f,
+    (float16_t)0.947094366f, (float16_t)-0.320955232f,
+    (float16_t)0.946600913f, (float16_t)-0.322407679f,
+    (float16_t)0.946105232f, (float16_t)-0.323859367f,
+    (float16_t)0.945607325f, (float16_t)-0.325310292f,
+    (float16_t)0.945107193f, (float16_t)-0.326760452f,
+    (float16_t)0.944604837f, (float16_t)-0.328209844f,
+    (float16_t)0.944100258f, (float16_t)-0.329658463f,
+    (float16_t)0.943593458f, (float16_t)-0.331106306f,
+    (float16_t)0.943084437f, (float16_t)-0.332553370f,
+    (float16_t)0.942573198f, (float16_t)-0.333999651f,
+    (float16_t)0.942059740f, (float16_t)-0.335445147f,
+    (float16_t)0.941544065f, (float16_t)-0.336889853f,
+    (float16_t)0.941026175f, (float16_t)-0.338333767f,
+    (float16_t)0.940506071f, (float16_t)-0.339776884f,
+    (float16_t)0.939983753f, (float16_t)-0.341219202f,
+    (float16_t)0.939459224f, (float16_t)-0.342660717f,
+    (float16_t)0.938932484f, (float16_t)-0.344101426f,
+    (float16_t)0.938403534f, (float16_t)-0.345541325f,
+    (float16_t)0.937872376f, (float16_t)-0.346980411f,
+    (float16_t)0.937339012f, (float16_t)-0.348418680f,
+    (float16_t)0.936803442f, (float16_t)-0.349856130f,
+    (float16_t)0.936265667f, (float16_t)-0.351292756f,
+    (float16_t)0.935725689f, (float16_t)-0.352728556f,
+    (float16_t)0.935183510f, (float16_t)-0.354163525f,
+    (float16_t)0.934639130f, (float16_t)-0.355597662f,
+    (float16_t)0.934092550f, (float16_t)-0.357030961f,
+    (float16_t)0.933543773f, (float16_t)-0.358463421f,
+    (float16_t)0.932992799f, (float16_t)-0.359895037f,
+    (float16_t)0.932439629f, (float16_t)-0.361325806f,
+    (float16_t)0.931884266f, (float16_t)-0.362755724f,
+    (float16_t)0.931326709f, (float16_t)-0.364184790f,
+    (float16_t)0.930766961f, (float16_t)-0.365612998f,
+    (float16_t)0.930205023f, (float16_t)-0.367040346f,
+    (float16_t)0.929640896f, (float16_t)-0.368466830f,
+    (float16_t)0.929074581f, (float16_t)-0.369892447f,
+    (float16_t)0.928506080f, (float16_t)-0.371317194f,
+    (float16_t)0.927935395f, (float16_t)-0.372741067f,
+    (float16_t)0.927362526f, (float16_t)-0.374164063f,
+    (float16_t)0.926787474f, (float16_t)-0.375586178f,
+    (float16_t)0.926210242f, (float16_t)-0.377007410f,
+    (float16_t)0.925630831f, (float16_t)-0.378427755f,
+    (float16_t)0.925049241f, (float16_t)-0.379847209f,
+    (float16_t)0.924465474f, (float16_t)-0.381265769f,
+    (float16_t)0.923879533f, (float16_t)-0.382683432f,
+    (float16_t)0.923291417f, (float16_t)-0.384100195f,
+    (float16_t)0.922701128f, (float16_t)-0.385516054f,
+    (float16_t)0.922108669f, (float16_t)-0.386931006f,
+    (float16_t)0.921514039f, (float16_t)-0.388345047f,
+    (float16_t)0.920917242f, (float16_t)-0.389758174f,
+    (float16_t)0.920318277f, (float16_t)-0.391170384f,
+    (float16_t)0.919717146f, (float16_t)-0.392581674f,
+    (float16_t)0.919113852f, (float16_t)-0.393992040f,
+    (float16_t)0.918508394f, (float16_t)-0.395401479f,
+    (float16_t)0.917900776f, (float16_t)-0.396809987f,
+    (float16_t)0.917290997f, (float16_t)-0.398217562f,
+    (float16_t)0.916679060f, (float16_t)-0.399624200f,
+    (float16_t)0.916064966f, (float16_t)-0.401029897f,
+    (float16_t)0.915448716f, (float16_t)-0.402434651f,
+    (float16_t)0.914830312f, (float16_t)-0.403838458f,
+    (float16_t)0.914209756f, (float16_t)-0.405241314f,
+    (float16_t)0.913587048f, (float16_t)-0.406643217f,
+    (float16_t)0.912962190f, (float16_t)-0.408044163f,
+    (float16_t)0.912335185f, (float16_t)-0.409444149f,
+    (float16_t)0.911706032f, (float16_t)-0.410843171f,
+    (float16_t)0.911074734f, (float16_t)-0.412241227f,
+    (float16_t)0.910441292f, (float16_t)-0.413638312f,
+    (float16_t)0.909805708f, (float16_t)-0.415034424f,
+    (float16_t)0.909167983f, (float16_t)-0.416429560f,
+    (float16_t)0.908528119f, (float16_t)-0.417823716f,
+    (float16_t)0.907886116f, (float16_t)-0.419216888f,
+    (float16_t)0.907241978f, (float16_t)-0.420609074f,
+    (float16_t)0.906595705f, (float16_t)-0.422000271f,
+    (float16_t)0.905947298f, (float16_t)-0.423390474f,
+    (float16_t)0.905296759f, (float16_t)-0.424779681f,
+    (float16_t)0.904644091f, (float16_t)-0.426167889f,
+    (float16_t)0.903989293f, (float16_t)-0.427555093f,
+    (float16_t)0.903332368f, (float16_t)-0.428941292f,
+    (float16_t)0.902673318f, (float16_t)-0.430326481f,
+    (float16_t)0.902012144f, (float16_t)-0.431710658f,
+    (float16_t)0.901348847f, (float16_t)-0.433093819f,
+    (float16_t)0.900683429f, (float16_t)-0.434475961f,
+    (float16_t)0.900015892f, (float16_t)-0.435857080f,
+    (float16_t)0.899346237f, (float16_t)-0.437237174f,
+    (float16_t)0.898674466f, (float16_t)-0.438616239f,
+    (float16_t)0.898000580f, (float16_t)-0.439994271f,
+    (float16_t)0.897324581f, (float16_t)-0.441371269f,
+    (float16_t)0.896646470f, (float16_t)-0.442747228f,
+    (float16_t)0.895966250f, (float16_t)-0.444122145f,
+    (float16_t)0.895283921f, (float16_t)-0.445496017f,
+    (float16_t)0.894599486f, (float16_t)-0.446868840f,
+    (float16_t)0.893912945f, (float16_t)-0.448240612f,
+    (float16_t)0.893224301f, (float16_t)-0.449611330f,
+    (float16_t)0.892533555f, (float16_t)-0.450980989f,
+    (float16_t)0.891840709f, (float16_t)-0.452349587f,
+    (float16_t)0.891145765f, (float16_t)-0.453717121f,
+    (float16_t)0.890448723f, (float16_t)-0.455083587f,
+    (float16_t)0.889749586f, (float16_t)-0.456448982f,
+    (float16_t)0.889048356f, (float16_t)-0.457813304f,
+    (float16_t)0.888345033f, (float16_t)-0.459176548f,
+    (float16_t)0.887639620f, (float16_t)-0.460538711f,
+    (float16_t)0.886932119f, (float16_t)-0.461899791f,
+    (float16_t)0.886222530f, (float16_t)-0.463259784f,
+    (float16_t)0.885510856f, (float16_t)-0.464618686f,
+    (float16_t)0.884797098f, (float16_t)-0.465976496f,
+    (float16_t)0.884081259f, (float16_t)-0.467333209f,
+    (float16_t)0.883363339f, (float16_t)-0.468688822f,
+    (float16_t)0.882643340f, (float16_t)-0.470043332f,
+    (float16_t)0.881921264f, (float16_t)-0.471396737f,
+    (float16_t)0.881197113f, (float16_t)-0.472749032f,
+    (float16_t)0.880470889f, (float16_t)-0.474100215f,
+    (float16_t)0.879742593f, (float16_t)-0.475450282f,
+    (float16_t)0.879012226f, (float16_t)-0.476799230f,
+    (float16_t)0.878279792f, (float16_t)-0.478147056f,
+    (float16_t)0.877545290f, (float16_t)-0.479493758f,
+    (float16_t)0.876808724f, (float16_t)-0.480839331f,
+    (float16_t)0.876070094f, (float16_t)-0.482183772f,
+    (float16_t)0.875329403f, (float16_t)-0.483527079f,
+    (float16_t)0.874586652f, (float16_t)-0.484869248f,
+    (float16_t)0.873841843f, (float16_t)-0.486210276f,
+    (float16_t)0.873094978f, (float16_t)-0.487550160f,
+    (float16_t)0.872346059f, (float16_t)-0.488888897f,
+    (float16_t)0.871595087f, (float16_t)-0.490226483f,
+    (float16_t)0.870842063f, (float16_t)-0.491562916f,
+    (float16_t)0.870086991f, (float16_t)-0.492898192f,
+    (float16_t)0.869329871f, (float16_t)-0.494232309f,
+    (float16_t)0.868570706f, (float16_t)-0.495565262f,
+    (float16_t)0.867809497f, (float16_t)-0.496897049f,
+    (float16_t)0.867046246f, (float16_t)-0.498227667f,
+    (float16_t)0.866280954f, (float16_t)-0.499557113f,
+    (float16_t)0.865513624f, (float16_t)-0.500885383f,
+    (float16_t)0.864744258f, (float16_t)-0.502212474f,
+    (float16_t)0.863972856f, (float16_t)-0.503538384f,
+    (float16_t)0.863199422f, (float16_t)-0.504863109f,
+    (float16_t)0.862423956f, (float16_t)-0.506186645f,
+    (float16_t)0.861646461f, (float16_t)-0.507508991f,
+    (float16_t)0.860866939f, (float16_t)-0.508830143f,
+    (float16_t)0.860085390f, (float16_t)-0.510150097f,
+    (float16_t)0.859301818f, (float16_t)-0.511468850f,
+    (float16_t)0.858516224f, (float16_t)-0.512786401f,
+    (float16_t)0.857728610f, (float16_t)-0.514102744f,
+    (float16_t)0.856938977f, (float16_t)-0.515417878f,
+    (float16_t)0.856147328f, (float16_t)-0.516731799f,
+    (float16_t)0.855353665f, (float16_t)-0.518044504f,
+    (float16_t)0.854557988f, (float16_t)-0.519355990f,
+    (float16_t)0.853760301f, (float16_t)-0.520666254f,
+    (float16_t)0.852960605f, (float16_t)-0.521975293f,
+    (float16_t)0.852158902f, (float16_t)-0.523283103f,
+    (float16_t)0.851355193f, (float16_t)-0.524589683f,
+    (float16_t)0.850549481f, (float16_t)-0.525895027f,
+    (float16_t)0.849741768f, (float16_t)-0.527199135f,
+    (float16_t)0.848932055f, (float16_t)-0.528502002f,
+    (float16_t)0.848120345f, (float16_t)-0.529803625f,
+    (float16_t)0.847306639f, (float16_t)-0.531104001f,
+    (float16_t)0.846490939f, (float16_t)-0.532403128f,
+    (float16_t)0.845673247f, (float16_t)-0.533701002f,
+    (float16_t)0.844853565f, (float16_t)-0.534997620f,
+    (float16_t)0.844031895f, (float16_t)-0.536292979f,
+    (float16_t)0.843208240f, (float16_t)-0.537587076f,
+    (float16_t)0.842382600f, (float16_t)-0.538879909f,
+    (float16_t)0.841554977f, (float16_t)-0.540171473f,
+    (float16_t)0.840725375f, (float16_t)-0.541461766f,
+    (float16_t)0.839893794f, (float16_t)-0.542750785f,
+    (float16_t)0.839060237f, (float16_t)-0.544038527f,
+    (float16_t)0.838224706f, (float16_t)-0.545324988f,
+    (float16_t)0.837387202f, (float16_t)-0.546610167f,
+    (float16_t)0.836547727f, (float16_t)-0.547894059f,
+    (float16_t)0.835706284f, (float16_t)-0.549176662f,
+    (float16_t)0.834862875f, (float16_t)-0.550457973f,
+    (float16_t)0.834017501f, (float16_t)-0.551737988f,
+    (float16_t)0.833170165f, (float16_t)-0.553016706f,
+    (float16_t)0.832320868f, (float16_t)-0.554294121f,
+    (float16_t)0.831469612f, (float16_t)-0.555570233f,
+    (float16_t)0.830616400f, (float16_t)-0.556845037f,
+    (float16_t)0.829761234f, (float16_t)-0.558118531f,
+    (float16_t)0.828904115f, (float16_t)-0.559390712f,
+    (float16_t)0.828045045f, (float16_t)-0.560661576f,
+    (float16_t)0.827184027f, (float16_t)-0.561931121f,
+    (float16_t)0.826321063f, (float16_t)-0.563199344f,
+    (float16_t)0.825456154f, (float16_t)-0.564466242f,
+    (float16_t)0.824589303f, (float16_t)-0.565731811f,
+    (float16_t)0.823720511f, (float16_t)-0.566996049f,
+    (float16_t)0.822849781f, (float16_t)-0.568258953f,
+    (float16_t)0.821977115f, (float16_t)-0.569520519f,
+    (float16_t)0.821102515f, (float16_t)-0.570780746f,
+    (float16_t)0.820225983f, (float16_t)-0.572039629f,
+    (float16_t)0.819347520f, (float16_t)-0.573297167f,
+    (float16_t)0.818467130f, (float16_t)-0.574553355f,
+    (float16_t)0.817584813f, (float16_t)-0.575808191f,
+    (float16_t)0.816700573f, (float16_t)-0.577061673f,
+    (float16_t)0.815814411f, (float16_t)-0.578313796f,
+    (float16_t)0.814926329f, (float16_t)-0.579564559f,
+    (float16_t)0.814036330f, (float16_t)-0.580813958f,
+    (float16_t)0.813144415f, (float16_t)-0.582061990f,
+    (float16_t)0.812250587f, (float16_t)-0.583308653f,
+    (float16_t)0.811354847f, (float16_t)-0.584553943f,
+    (float16_t)0.810457198f, (float16_t)-0.585797857f,
+    (float16_t)0.809557642f, (float16_t)-0.587040394f,
+    (float16_t)0.808656182f, (float16_t)-0.588281548f,
+    (float16_t)0.807752818f, (float16_t)-0.589521319f,
+    (float16_t)0.806847554f, (float16_t)-0.590759702f,
+    (float16_t)0.805940391f, (float16_t)-0.591996695f,
+    (float16_t)0.805031331f, (float16_t)-0.593232295f,
+    (float16_t)0.804120377f, (float16_t)-0.594466499f,
+    (float16_t)0.803207531f, (float16_t)-0.595699304f,
+    (float16_t)0.802292796f, (float16_t)-0.596930708f,
+    (float16_t)0.801376172f, (float16_t)-0.598160707f,
+    (float16_t)0.800457662f, (float16_t)-0.599389298f,
+    (float16_t)0.799537269f, (float16_t)-0.600616479f,
+    (float16_t)0.798614995f, (float16_t)-0.601842247f,
+    (float16_t)0.797690841f, (float16_t)-0.603066599f,
+    (float16_t)0.796764810f, (float16_t)-0.604289531f,
+    (float16_t)0.795836905f, (float16_t)-0.605511041f,
+    (float16_t)0.794907126f, (float16_t)-0.606731127f,
+    (float16_t)0.793975478f, (float16_t)-0.607949785f,
+    (float16_t)0.793041960f, (float16_t)-0.609167012f,
+    (float16_t)0.792106577f, (float16_t)-0.610382806f,
+    (float16_t)0.791169330f, (float16_t)-0.611597164f,
+    (float16_t)0.790230221f, (float16_t)-0.612810082f,
+    (float16_t)0.789289253f, (float16_t)-0.614021559f,
+    (float16_t)0.788346428f, (float16_t)-0.615231591f,
+    (float16_t)0.787401747f, (float16_t)-0.616440175f,
+    (float16_t)0.786455214f, (float16_t)-0.617647308f,
+    (float16_t)0.785506830f, (float16_t)-0.618852988f,
+    (float16_t)0.784556597f, (float16_t)-0.620057212f,
+    (float16_t)0.783604519f, (float16_t)-0.621259977f,
+    (float16_t)0.782650596f, (float16_t)-0.622461279f,
+    (float16_t)0.781694832f, (float16_t)-0.623661118f,
+    (float16_t)0.780737229f, (float16_t)-0.624859488f,
+    (float16_t)0.779777788f, (float16_t)-0.626056388f,
+    (float16_t)0.778816512f, (float16_t)-0.627251815f,
+    (float16_t)0.777853404f, (float16_t)-0.628445767f,
+    (float16_t)0.776888466f, (float16_t)-0.629638239f,
+    (float16_t)0.775921699f, (float16_t)-0.630829230f,
+    (float16_t)0.774953107f, (float16_t)-0.632018736f,
+    (float16_t)0.773982691f, (float16_t)-0.633206755f,
+    (float16_t)0.773010453f, (float16_t)-0.634393284f,
+    (float16_t)0.772036397f, (float16_t)-0.635578320f,
+    (float16_t)0.771060524f, (float16_t)-0.636761861f,
+    (float16_t)0.770082837f, (float16_t)-0.637943904f,
+    (float16_t)0.769103338f, (float16_t)-0.639124445f,
+    (float16_t)0.768122029f, (float16_t)-0.640303482f,
+    (float16_t)0.767138912f, (float16_t)-0.641481013f,
+    (float16_t)0.766153990f, (float16_t)-0.642657034f,
+    (float16_t)0.765167266f, (float16_t)-0.643831543f,
+    (float16_t)0.764178741f, (float16_t)-0.645004537f,
+    (float16_t)0.763188417f, (float16_t)-0.646176013f,
+    (float16_t)0.762196298f, (float16_t)-0.647345969f,
+    (float16_t)0.761202385f, (float16_t)-0.648514401f,
+    (float16_t)0.760206682f, (float16_t)-0.649681307f,
+    (float16_t)0.759209189f, (float16_t)-0.650846685f,
+    (float16_t)0.758209910f, (float16_t)-0.652010531f,
+    (float16_t)0.757208847f, (float16_t)-0.653172843f,
+    (float16_t)0.756206001f, (float16_t)-0.654333618f,
+    (float16_t)0.755201377f, (float16_t)-0.655492853f,
+    (float16_t)0.754194975f, (float16_t)-0.656650546f,
+    (float16_t)0.753186799f, (float16_t)-0.657806693f,
+    (float16_t)0.752176850f, (float16_t)-0.658961293f,
+    (float16_t)0.751165132f, (float16_t)-0.660114342f,
+    (float16_t)0.750151646f, (float16_t)-0.661265838f,
+    (float16_t)0.749136395f, (float16_t)-0.662415778f,
+    (float16_t)0.748119380f, (float16_t)-0.663564159f,
+    (float16_t)0.747100606f, (float16_t)-0.664710978f,
+    (float16_t)0.746080074f, (float16_t)-0.665856234f,
+    (float16_t)0.745057785f, (float16_t)-0.666999922f,
+    (float16_t)0.744033744f, (float16_t)-0.668142041f,
+    (float16_t)0.743007952f, (float16_t)-0.669282588f,
+    (float16_t)0.741980412f, (float16_t)-0.670421560f,
+    (float16_t)0.740951125f, (float16_t)-0.671558955f,
+    (float16_t)0.739920095f, (float16_t)-0.672694769f,
+    (float16_t)0.738887324f, (float16_t)-0.673829000f,
+    (float16_t)0.737852815f, (float16_t)-0.674961646f,
+    (float16_t)0.736816569f, (float16_t)-0.676092704f,
+    (float16_t)0.735778589f, (float16_t)-0.677222170f,
+    (float16_t)0.734738878f, (float16_t)-0.678350043f,
+    (float16_t)0.733697438f, (float16_t)-0.679476320f,
+    (float16_t)0.732654272f, (float16_t)-0.680600998f,
+    (float16_t)0.731609381f, (float16_t)-0.681724074f,
+    (float16_t)0.730562769f, (float16_t)-0.682845546f,
+    (float16_t)0.729514438f, (float16_t)-0.683965412f,
+    (float16_t)0.728464390f, (float16_t)-0.685083668f,
+    (float16_t)0.727412629f, (float16_t)-0.686200312f,
+    (float16_t)0.726359155f, (float16_t)-0.687315341f,
+    (float16_t)0.725303972f, (float16_t)-0.688428753f,
+    (float16_t)0.724247083f, (float16_t)-0.689540545f,
+    (float16_t)0.723188489f, (float16_t)-0.690650714f,
+    (float16_t)0.722128194f, (float16_t)-0.691759258f,
+    (float16_t)0.721066199f, (float16_t)-0.692866175f,
+    (float16_t)0.720002508f, (float16_t)-0.693971461f,
+    (float16_t)0.718937122f, (float16_t)-0.695075114f,
+    (float16_t)0.717870045f, (float16_t)-0.696177131f,
+    (float16_t)0.716801279f, (float16_t)-0.697277511f,
+    (float16_t)0.715730825f, (float16_t)-0.698376249f,
+    (float16_t)0.714658688f, (float16_t)-0.699473345f,
+    (float16_t)0.713584869f, (float16_t)-0.700568794f,
+    (float16_t)0.712509371f, (float16_t)-0.701662595f,
+    (float16_t)0.711432196f, (float16_t)-0.702754744f,
+    (float16_t)0.710353347f, (float16_t)-0.703845241f,
+    (float16_t)0.709272826f, (float16_t)-0.704934080f,
+    (float16_t)0.708190637f, (float16_t)-0.706021261f,
+    (float16_t)0.707106781f, (float16_t)-0.707106781f,
+    (float16_t)0.706021261f, (float16_t)-0.708190637f,
+    (float16_t)0.704934080f, (float16_t)-0.709272826f,
+    (float16_t)0.703845241f, (float16_t)-0.710353347f,
+    (float16_t)0.702754744f, (float16_t)-0.711432196f,
+    (float16_t)0.701662595f, (float16_t)-0.712509371f,
+    (float16_t)0.700568794f, (float16_t)-0.713584869f,
+    (float16_t)0.699473345f, (float16_t)-0.714658688f,
+    (float16_t)0.698376249f, (float16_t)-0.715730825f,
+    (float16_t)0.697277511f, (float16_t)-0.716801279f,
+    (float16_t)0.696177131f, (float16_t)-0.717870045f,
+    (float16_t)0.695075114f, (float16_t)-0.718937122f,
+    (float16_t)0.693971461f, (float16_t)-0.720002508f,
+    (float16_t)0.692866175f, (float16_t)-0.721066199f,
+    (float16_t)0.691759258f, (float16_t)-0.722128194f,
+    (float16_t)0.690650714f, (float16_t)-0.723188489f,
+    (float16_t)0.689540545f, (float16_t)-0.724247083f,
+    (float16_t)0.688428753f, (float16_t)-0.725303972f,
+    (float16_t)0.687315341f, (float16_t)-0.726359155f,
+    (float16_t)0.686200312f, (float16_t)-0.727412629f,
+    (float16_t)0.685083668f, (float16_t)-0.728464390f,
+    (float16_t)0.683965412f, (float16_t)-0.729514438f,
+    (float16_t)0.682845546f, (float16_t)-0.730562769f,
+    (float16_t)0.681724074f, (float16_t)-0.731609381f,
+    (float16_t)0.680600998f, (float16_t)-0.732654272f,
+    (float16_t)0.679476320f, (float16_t)-0.733697438f,
+    (float16_t)0.678350043f, (float16_t)-0.734738878f,
+    (float16_t)0.677222170f, (float16_t)-0.735778589f,
+    (float16_t)0.676092704f, (float16_t)-0.736816569f,
+    (float16_t)0.674961646f, (float16_t)-0.737852815f,
+    (float16_t)0.673829000f, (float16_t)-0.738887324f,
+    (float16_t)0.672694769f, (float16_t)-0.739920095f,
+    (float16_t)0.671558955f, (float16_t)-0.740951125f,
+    (float16_t)0.670421560f, (float16_t)-0.741980412f,
+    (float16_t)0.669282588f, (float16_t)-0.743007952f,
+    (float16_t)0.668142041f, (float16_t)-0.744033744f,
+    (float16_t)0.666999922f, (float16_t)-0.745057785f,
+    (float16_t)0.665856234f, (float16_t)-0.746080074f,
+    (float16_t)0.664710978f, (float16_t)-0.747100606f,
+    (float16_t)0.663564159f, (float16_t)-0.748119380f,
+    (float16_t)0.662415778f, (float16_t)-0.749136395f,
+    (float16_t)0.661265838f, (float16_t)-0.750151646f,
+    (float16_t)0.660114342f, (float16_t)-0.751165132f,
+    (float16_t)0.658961293f, (float16_t)-0.752176850f,
+    (float16_t)0.657806693f, (float16_t)-0.753186799f,
+    (float16_t)0.656650546f, (float16_t)-0.754194975f,
+    (float16_t)0.655492853f, (float16_t)-0.755201377f,
+    (float16_t)0.654333618f, (float16_t)-0.756206001f,
+    (float16_t)0.653172843f, (float16_t)-0.757208847f,
+    (float16_t)0.652010531f, (float16_t)-0.758209910f,
+    (float16_t)0.650846685f, (float16_t)-0.759209189f,
+    (float16_t)0.649681307f, (float16_t)-0.760206682f,
+    (float16_t)0.648514401f, (float16_t)-0.761202385f,
+    (float16_t)0.647345969f, (float16_t)-0.762196298f,
+    (float16_t)0.646176013f, (float16_t)-0.763188417f,
+    (float16_t)0.645004537f, (float16_t)-0.764178741f,
+    (float16_t)0.643831543f, (float16_t)-0.765167266f,
+    (float16_t)0.642657034f, (float16_t)-0.766153990f,
+    (float16_t)0.641481013f, (float16_t)-0.767138912f,
+    (float16_t)0.640303482f, (float16_t)-0.768122029f,
+    (float16_t)0.639124445f, (float16_t)-0.769103338f,
+    (float16_t)0.637943904f, (float16_t)-0.770082837f,
+    (float16_t)0.636761861f, (float16_t)-0.771060524f,
+    (float16_t)0.635578320f, (float16_t)-0.772036397f,
+    (float16_t)0.634393284f, (float16_t)-0.773010453f,
+    (float16_t)0.633206755f, (float16_t)-0.773982691f,
+    (float16_t)0.632018736f, (float16_t)-0.774953107f,
+    (float16_t)0.630829230f, (float16_t)-0.775921699f,
+    (float16_t)0.629638239f, (float16_t)-0.776888466f,
+    (float16_t)0.628445767f, (float16_t)-0.777853404f,
+    (float16_t)0.627251815f, (float16_t)-0.778816512f,
+    (float16_t)0.626056388f, (float16_t)-0.779777788f,
+    (float16_t)0.624859488f, (float16_t)-0.780737229f,
+    (float16_t)0.623661118f, (float16_t)-0.781694832f,
+    (float16_t)0.622461279f, (float16_t)-0.782650596f,
+    (float16_t)0.621259977f, (float16_t)-0.783604519f,
+    (float16_t)0.620057212f, (float16_t)-0.784556597f,
+    (float16_t)0.618852988f, (float16_t)-0.785506830f,
+    (float16_t)0.617647308f, (float16_t)-0.786455214f,
+    (float16_t)0.616440175f, (float16_t)-0.787401747f,
+    (float16_t)0.615231591f, (float16_t)-0.788346428f,
+    (float16_t)0.614021559f, (float16_t)-0.789289253f,
+    (float16_t)0.612810082f, (float16_t)-0.790230221f,
+    (float16_t)0.611597164f, (float16_t)-0.791169330f,
+    (float16_t)0.610382806f, (float16_t)-0.792106577f,
+    (float16_t)0.609167012f, (float16_t)-0.793041960f,
+    (float16_t)0.607949785f, (float16_t)-0.793975478f,
+    (float16_t)0.606731127f, (float16_t)-0.794907126f,
+    (float16_t)0.605511041f, (float16_t)-0.795836905f,
+    (float16_t)0.604289531f, (float16_t)-0.796764810f,
+    (float16_t)0.603066599f, (float16_t)-0.797690841f,
+    (float16_t)0.601842247f, (float16_t)-0.798614995f,
+    (float16_t)0.600616479f, (float16_t)-0.799537269f,
+    (float16_t)0.599389298f, (float16_t)-0.800457662f,
+    (float16_t)0.598160707f, (float16_t)-0.801376172f,
+    (float16_t)0.596930708f, (float16_t)-0.802292796f,
+    (float16_t)0.595699304f, (float16_t)-0.803207531f,
+    (float16_t)0.594466499f, (float16_t)-0.804120377f,
+    (float16_t)0.593232295f, (float16_t)-0.805031331f,
+    (float16_t)0.591996695f, (float16_t)-0.805940391f,
+    (float16_t)0.590759702f, (float16_t)-0.806847554f,
+    (float16_t)0.589521319f, (float16_t)-0.807752818f,
+    (float16_t)0.588281548f, (float16_t)-0.808656182f,
+    (float16_t)0.587040394f, (float16_t)-0.809557642f,
+    (float16_t)0.585797857f, (float16_t)-0.810457198f,
+    (float16_t)0.584553943f, (float16_t)-0.811354847f,
+    (float16_t)0.583308653f, (float16_t)-0.812250587f,
+    (float16_t)0.582061990f, (float16_t)-0.813144415f,
+    (float16_t)0.580813958f, (float16_t)-0.814036330f,
+    (float16_t)0.579564559f, (float16_t)-0.814926329f,
+    (float16_t)0.578313796f, (float16_t)-0.815814411f,
+    (float16_t)0.577061673f, (float16_t)-0.816700573f,
+    (float16_t)0.575808191f, (float16_t)-0.817584813f,
+    (float16_t)0.574553355f, (float16_t)-0.818467130f,
+    (float16_t)0.573297167f, (float16_t)-0.819347520f,
+    (float16_t)0.572039629f, (float16_t)-0.820225983f,
+    (float16_t)0.570780746f, (float16_t)-0.821102515f,
+    (float16_t)0.569520519f, (float16_t)-0.821977115f,
+    (float16_t)0.568258953f, (float16_t)-0.822849781f,
+    (float16_t)0.566996049f, (float16_t)-0.823720511f,
+    (float16_t)0.565731811f, (float16_t)-0.824589303f,
+    (float16_t)0.564466242f, (float16_t)-0.825456154f,
+    (float16_t)0.563199344f, (float16_t)-0.826321063f,
+    (float16_t)0.561931121f, (float16_t)-0.827184027f,
+    (float16_t)0.560661576f, (float16_t)-0.828045045f,
+    (float16_t)0.559390712f, (float16_t)-0.828904115f,
+    (float16_t)0.558118531f, (float16_t)-0.829761234f,
+    (float16_t)0.556845037f, (float16_t)-0.830616400f,
+    (float16_t)0.555570233f, (float16_t)-0.831469612f,
+    (float16_t)0.554294121f, (float16_t)-0.832320868f,
+    (float16_t)0.553016706f, (float16_t)-0.833170165f,
+    (float16_t)0.551737988f, (float16_t)-0.834017501f,
+    (float16_t)0.550457973f, (float16_t)-0.834862875f,
+    (float16_t)0.549176662f, (float16_t)-0.835706284f,
+    (float16_t)0.547894059f, (float16_t)-0.836547727f,
+    (float16_t)0.546610167f, (float16_t)-0.837387202f,
+    (float16_t)0.545324988f, (float16_t)-0.838224706f,
+    (float16_t)0.544038527f, (float16_t)-0.839060237f,
+    (float16_t)0.542750785f, (float16_t)-0.839893794f,
+    (float16_t)0.541461766f, (float16_t)-0.840725375f,
+    (float16_t)0.540171473f, (float16_t)-0.841554977f,
+    (float16_t)0.538879909f, (float16_t)-0.842382600f,
+    (float16_t)0.537587076f, (float16_t)-0.843208240f,
+    (float16_t)0.536292979f, (float16_t)-0.844031895f,
+    (float16_t)0.534997620f, (float16_t)-0.844853565f,
+    (float16_t)0.533701002f, (float16_t)-0.845673247f,
+    (float16_t)0.532403128f, (float16_t)-0.846490939f,
+    (float16_t)0.531104001f, (float16_t)-0.847306639f,
+    (float16_t)0.529803625f, (float16_t)-0.848120345f,
+    (float16_t)0.528502002f, (float16_t)-0.848932055f,
+    (float16_t)0.527199135f, (float16_t)-0.849741768f,
+    (float16_t)0.525895027f, (float16_t)-0.850549481f,
+    (float16_t)0.524589683f, (float16_t)-0.851355193f,
+    (float16_t)0.523283103f, (float16_t)-0.852158902f,
+    (float16_t)0.521975293f, (float16_t)-0.852960605f,
+    (float16_t)0.520666254f, (float16_t)-0.853760301f,
+    (float16_t)0.519355990f, (float16_t)-0.854557988f,
+    (float16_t)0.518044504f, (float16_t)-0.855353665f,
+    (float16_t)0.516731799f, (float16_t)-0.856147328f,
+    (float16_t)0.515417878f, (float16_t)-0.856938977f,
+    (float16_t)0.514102744f, (float16_t)-0.857728610f,
+    (float16_t)0.512786401f, (float16_t)-0.858516224f,
+    (float16_t)0.511468850f, (float16_t)-0.859301818f,
+    (float16_t)0.510150097f, (float16_t)-0.860085390f,
+    (float16_t)0.508830143f, (float16_t)-0.860866939f,
+    (float16_t)0.507508991f, (float16_t)-0.861646461f,
+    (float16_t)0.506186645f, (float16_t)-0.862423956f,
+    (float16_t)0.504863109f, (float16_t)-0.863199422f,
+    (float16_t)0.503538384f, (float16_t)-0.863972856f,
+    (float16_t)0.502212474f, (float16_t)-0.864744258f,
+    (float16_t)0.500885383f, (float16_t)-0.865513624f,
+    (float16_t)0.499557113f, (float16_t)-0.866280954f,
+    (float16_t)0.498227667f, (float16_t)-0.867046246f,
+    (float16_t)0.496897049f, (float16_t)-0.867809497f,
+    (float16_t)0.495565262f, (float16_t)-0.868570706f,
+    (float16_t)0.494232309f, (float16_t)-0.869329871f,
+    (float16_t)0.492898192f, (float16_t)-0.870086991f,
+    (float16_t)0.491562916f, (float16_t)-0.870842063f,
+    (float16_t)0.490226483f, (float16_t)-0.871595087f,
+    (float16_t)0.488888897f, (float16_t)-0.872346059f,
+    (float16_t)0.487550160f, (float16_t)-0.873094978f,
+    (float16_t)0.486210276f, (float16_t)-0.873841843f,
+    (float16_t)0.484869248f, (float16_t)-0.874586652f,
+    (float16_t)0.483527079f, (float16_t)-0.875329403f,
+    (float16_t)0.482183772f, (float16_t)-0.876070094f,
+    (float16_t)0.480839331f, (float16_t)-0.876808724f,
+    (float16_t)0.479493758f, (float16_t)-0.877545290f,
+    (float16_t)0.478147056f, (float16_t)-0.878279792f,
+    (float16_t)0.476799230f, (float16_t)-0.879012226f,
+    (float16_t)0.475450282f, (float16_t)-0.879742593f,
+    (float16_t)0.474100215f, (float16_t)-0.880470889f,
+    (float16_t)0.472749032f, (float16_t)-0.881197113f,
+    (float16_t)0.471396737f, (float16_t)-0.881921264f,
+    (float16_t)0.470043332f, (float16_t)-0.882643340f,
+    (float16_t)0.468688822f, (float16_t)-0.883363339f,
+    (float16_t)0.467333209f, (float16_t)-0.884081259f,
+    (float16_t)0.465976496f, (float16_t)-0.884797098f,
+    (float16_t)0.464618686f, (float16_t)-0.885510856f,
+    (float16_t)0.463259784f, (float16_t)-0.886222530f,
+    (float16_t)0.461899791f, (float16_t)-0.886932119f,
+    (float16_t)0.460538711f, (float16_t)-0.887639620f,
+    (float16_t)0.459176548f, (float16_t)-0.888345033f,
+    (float16_t)0.457813304f, (float16_t)-0.889048356f,
+    (float16_t)0.456448982f, (float16_t)-0.889749586f,
+    (float16_t)0.455083587f, (float16_t)-0.890448723f,
+    (float16_t)0.453717121f, (float16_t)-0.891145765f,
+    (float16_t)0.452349587f, (float16_t)-0.891840709f,
+    (float16_t)0.450980989f, (float16_t)-0.892533555f,
+    (float16_t)0.449611330f, (float16_t)-0.893224301f,
+    (float16_t)0.448240612f, (float16_t)-0.893912945f,
+    (float16_t)0.446868840f, (float16_t)-0.894599486f,
+    (float16_t)0.445496017f, (float16_t)-0.895283921f,
+    (float16_t)0.444122145f, (float16_t)-0.895966250f,
+    (float16_t)0.442747228f, (float16_t)-0.896646470f,
+    (float16_t)0.441371269f, (float16_t)-0.897324581f,
+    (float16_t)0.439994271f, (float16_t)-0.898000580f,
+    (float16_t)0.438616239f, (float16_t)-0.898674466f,
+    (float16_t)0.437237174f, (float16_t)-0.899346237f,
+    (float16_t)0.435857080f, (float16_t)-0.900015892f,
+    (float16_t)0.434475961f, (float16_t)-0.900683429f,
+    (float16_t)0.433093819f, (float16_t)-0.901348847f,
+    (float16_t)0.431710658f, (float16_t)-0.902012144f,
+    (float16_t)0.430326481f, (float16_t)-0.902673318f,
+    (float16_t)0.428941292f, (float16_t)-0.903332368f,
+    (float16_t)0.427555093f, (float16_t)-0.903989293f,
+    (float16_t)0.426167889f, (float16_t)-0.904644091f,
+    (float16_t)0.424779681f, (float16_t)-0.905296759f,
+    (float16_t)0.423390474f, (float16_t)-0.905947298f,
+    (float16_t)0.422000271f, (float16_t)-0.906595705f,
+    (float16_t)0.420609074f, (float16_t)-0.907241978f,
+    (float16_t)0.419216888f, (float16_t)-0.907886116f,
+    (float16_t)0.417823716f, (float16_t)-0.908528119f,
+    (float16_t)0.416429560f, (float16_t)-0.909167983f,
+    (float16_t)0.415034424f, (float16_t)-0.909805708f,
+    (float16_t)0.413638312f, (float16_t)-0.910441292f,
+    (float16_t)0.412241227f, (float16_t)-0.911074734f,
+    (float16_t)0.410843171f, (float16_t)-0.911706032f,
+    (float16_t)0.409444149f, (float16_t)-0.912335185f,
+    (float16_t)0.408044163f, (float16_t)-0.912962190f,
+    (float16_t)0.406643217f, (float16_t)-0.913587048f,
+    (float16_t)0.405241314f, (float16_t)-0.914209756f,
+    (float16_t)0.403838458f, (float16_t)-0.914830312f,
+    (float16_t)0.402434651f, (float16_t)-0.915448716f,
+    (float16_t)0.401029897f, (float16_t)-0.916064966f,
+    (float16_t)0.399624200f, (float16_t)-0.916679060f,
+    (float16_t)0.398217562f, (float16_t)-0.917290997f,
+    (float16_t)0.396809987f, (float16_t)-0.917900776f,
+    (float16_t)0.395401479f, (float16_t)-0.918508394f,
+    (float16_t)0.393992040f, (float16_t)-0.919113852f,
+    (float16_t)0.392581674f, (float16_t)-0.919717146f,
+    (float16_t)0.391170384f, (float16_t)-0.920318277f,
+    (float16_t)0.389758174f, (float16_t)-0.920917242f,
+    (float16_t)0.388345047f, (float16_t)-0.921514039f,
+    (float16_t)0.386931006f, (float16_t)-0.922108669f,
+    (float16_t)0.385516054f, (float16_t)-0.922701128f,
+    (float16_t)0.384100195f, (float16_t)-0.923291417f,
+    (float16_t)0.382683432f, (float16_t)-0.923879533f,
+    (float16_t)0.381265769f, (float16_t)-0.924465474f,
+    (float16_t)0.379847209f, (float16_t)-0.925049241f,
+    (float16_t)0.378427755f, (float16_t)-0.925630831f,
+    (float16_t)0.377007410f, (float16_t)-0.926210242f,
+    (float16_t)0.375586178f, (float16_t)-0.926787474f,
+    (float16_t)0.374164063f, (float16_t)-0.927362526f,
+    (float16_t)0.372741067f, (float16_t)-0.927935395f,
+    (float16_t)0.371317194f, (float16_t)-0.928506080f,
+    (float16_t)0.369892447f, (float16_t)-0.929074581f,
+    (float16_t)0.368466830f, (float16_t)-0.929640896f,
+    (float16_t)0.367040346f, (float16_t)-0.930205023f,
+    (float16_t)0.365612998f, (float16_t)-0.930766961f,
+    (float16_t)0.364184790f, (float16_t)-0.931326709f,
+    (float16_t)0.362755724f, (float16_t)-0.931884266f,
+    (float16_t)0.361325806f, (float16_t)-0.932439629f,
+    (float16_t)0.359895037f, (float16_t)-0.932992799f,
+    (float16_t)0.358463421f, (float16_t)-0.933543773f,
+    (float16_t)0.357030961f, (float16_t)-0.934092550f,
+    (float16_t)0.355597662f, (float16_t)-0.934639130f,
+    (float16_t)0.354163525f, (float16_t)-0.935183510f,
+    (float16_t)0.352728556f, (float16_t)-0.935725689f,
+    (float16_t)0.351292756f, (float16_t)-0.936265667f,
+    (float16_t)0.349856130f, (float16_t)-0.936803442f,
+    (float16_t)0.348418680f, (float16_t)-0.937339012f,
+    (float16_t)0.346980411f, (float16_t)-0.937872376f,
+    (float16_t)0.345541325f, (float16_t)-0.938403534f,
+    (float16_t)0.344101426f, (float16_t)-0.938932484f,
+    (float16_t)0.342660717f, (float16_t)-0.939459224f,
+    (float16_t)0.341219202f, (float16_t)-0.939983753f,
+    (float16_t)0.339776884f, (float16_t)-0.940506071f,
+    (float16_t)0.338333767f, (float16_t)-0.941026175f,
+    (float16_t)0.336889853f, (float16_t)-0.941544065f,
+    (float16_t)0.335445147f, (float16_t)-0.942059740f,
+    (float16_t)0.333999651f, (float16_t)-0.942573198f,
+    (float16_t)0.332553370f, (float16_t)-0.943084437f,
+    (float16_t)0.331106306f, (float16_t)-0.943593458f,
+    (float16_t)0.329658463f, (float16_t)-0.944100258f,
+    (float16_t)0.328209844f, (float16_t)-0.944604837f,
+    (float16_t)0.326760452f, (float16_t)-0.945107193f,
+    (float16_t)0.325310292f, (float16_t)-0.945607325f,
+    (float16_t)0.323859367f, (float16_t)-0.946105232f,
+    (float16_t)0.322407679f, (float16_t)-0.946600913f,
+    (float16_t)0.320955232f, (float16_t)-0.947094366f,
+    (float16_t)0.319502031f, (float16_t)-0.947585591f,
+    (float16_t)0.318048077f, (float16_t)-0.948074586f,
+    (float16_t)0.316593376f, (float16_t)-0.948561350f,
+    (float16_t)0.315137929f, (float16_t)-0.949045882f,
+    (float16_t)0.313681740f, (float16_t)-0.949528181f,
+    (float16_t)0.312224814f, (float16_t)-0.950008245f,
+    (float16_t)0.310767153f, (float16_t)-0.950486074f,
+    (float16_t)0.309308760f, (float16_t)-0.950961666f,
+    (float16_t)0.307849640f, (float16_t)-0.951435021f,
+    (float16_t)0.306389795f, (float16_t)-0.951906137f,
+    (float16_t)0.304929230f, (float16_t)-0.952375013f,
+    (float16_t)0.303467947f, (float16_t)-0.952841648f,
+    (float16_t)0.302005949f, (float16_t)-0.953306040f,
+    (float16_t)0.300543241f, (float16_t)-0.953768190f,
+    (float16_t)0.299079826f, (float16_t)-0.954228095f,
+    (float16_t)0.297615707f, (float16_t)-0.954685755f,
+    (float16_t)0.296150888f, (float16_t)-0.955141168f,
+    (float16_t)0.294685372f, (float16_t)-0.955594334f,
+    (float16_t)0.293219163f, (float16_t)-0.956045251f,
+    (float16_t)0.291752263f, (float16_t)-0.956493919f,
+    (float16_t)0.290284677f, (float16_t)-0.956940336f,
+    (float16_t)0.288816408f, (float16_t)-0.957384501f,
+    (float16_t)0.287347460f, (float16_t)-0.957826413f,
+    (float16_t)0.285877835f, (float16_t)-0.958266071f,
+    (float16_t)0.284407537f, (float16_t)-0.958703475f,
+    (float16_t)0.282936570f, (float16_t)-0.959138622f,
+    (float16_t)0.281464938f, (float16_t)-0.959571513f,
+    (float16_t)0.279992643f, (float16_t)-0.960002146f,
+    (float16_t)0.278519689f, (float16_t)-0.960430519f,
+    (float16_t)0.277046080f, (float16_t)-0.960856633f,
+    (float16_t)0.275571819f, (float16_t)-0.961280486f,
+    (float16_t)0.274096910f, (float16_t)-0.961702077f,
+    (float16_t)0.272621355f, (float16_t)-0.962121404f,
+    (float16_t)0.271145160f, (float16_t)-0.962538468f,
+    (float16_t)0.269668326f, (float16_t)-0.962953267f,
+    (float16_t)0.268190857f, (float16_t)-0.963365800f,
+    (float16_t)0.266712757f, (float16_t)-0.963776066f,
+    (float16_t)0.265234030f, (float16_t)-0.964184064f,
+    (float16_t)0.263754679f, (float16_t)-0.964589793f,
+    (float16_t)0.262274707f, (float16_t)-0.964993253f,
+    (float16_t)0.260794118f, (float16_t)-0.965394442f,
+    (float16_t)0.259312915f, (float16_t)-0.965793359f,
+    (float16_t)0.257831102f, (float16_t)-0.966190003f,
+    (float16_t)0.256348682f, (float16_t)-0.966584374f,
+    (float16_t)0.254865660f, (float16_t)-0.966976471f,
+    (float16_t)0.253382037f, (float16_t)-0.967366292f,
+    (float16_t)0.251897818f, (float16_t)-0.967753837f,
+    (float16_t)0.250413007f, (float16_t)-0.968139105f,
+    (float16_t)0.248927606f, (float16_t)-0.968522094f,
+    (float16_t)0.247441619f, (float16_t)-0.968902805f,
+    (float16_t)0.245955050f, (float16_t)-0.969281235f,
+    (float16_t)0.244467903f, (float16_t)-0.969657385f,
+    (float16_t)0.242980180f, (float16_t)-0.970031253f,
+    (float16_t)0.241491885f, (float16_t)-0.970402839f,
+    (float16_t)0.240003022f, (float16_t)-0.970772141f,
+    (float16_t)0.238513595f, (float16_t)-0.971139158f,
+    (float16_t)0.237023606f, (float16_t)-0.971503891f,
+    (float16_t)0.235533059f, (float16_t)-0.971866337f,
+    (float16_t)0.234041959f, (float16_t)-0.972226497f,
+    (float16_t)0.232550307f, (float16_t)-0.972584369f,
+    (float16_t)0.231058108f, (float16_t)-0.972939952f,
+    (float16_t)0.229565366f, (float16_t)-0.973293246f,
+    (float16_t)0.228072083f, (float16_t)-0.973644250f,
+    (float16_t)0.226578264f, (float16_t)-0.973992962f,
+    (float16_t)0.225083911f, (float16_t)-0.974339383f,
+    (float16_t)0.223589029f, (float16_t)-0.974683511f,
+    (float16_t)0.222093621f, (float16_t)-0.975025345f,
+    (float16_t)0.220597690f, (float16_t)-0.975364885f,
+    (float16_t)0.219101240f, (float16_t)-0.975702130f,
+    (float16_t)0.217604275f, (float16_t)-0.976037079f,
+    (float16_t)0.216106797f, (float16_t)-0.976369731f,
+    (float16_t)0.214608811f, (float16_t)-0.976700086f,
+    (float16_t)0.213110320f, (float16_t)-0.977028143f,
+    (float16_t)0.211611327f, (float16_t)-0.977353900f,
+    (float16_t)0.210111837f, (float16_t)-0.977677358f,
+    (float16_t)0.208611852f, (float16_t)-0.977998515f,
+    (float16_t)0.207111376f, (float16_t)-0.978317371f,
+    (float16_t)0.205610413f, (float16_t)-0.978633924f,
+    (float16_t)0.204108966f, (float16_t)-0.978948175f,
+    (float16_t)0.202607039f, (float16_t)-0.979260123f,
+    (float16_t)0.201104635f, (float16_t)-0.979569766f,
+    (float16_t)0.199601758f, (float16_t)-0.979877104f,
+    (float16_t)0.198098411f, (float16_t)-0.980182136f,
+    (float16_t)0.196594598f, (float16_t)-0.980484862f,
+    (float16_t)0.195090322f, (float16_t)-0.980785280f,
+    (float16_t)0.193585587f, (float16_t)-0.981083391f,
+    (float16_t)0.192080397f, (float16_t)-0.981379193f,
+    (float16_t)0.190574755f, (float16_t)-0.981672686f,
+    (float16_t)0.189068664f, (float16_t)-0.981963869f,
+    (float16_t)0.187562129f, (float16_t)-0.982252741f,
+    (float16_t)0.186055152f, (float16_t)-0.982539302f,
+    (float16_t)0.184547737f, (float16_t)-0.982823551f,
+    (float16_t)0.183039888f, (float16_t)-0.983105487f,
+    (float16_t)0.181531608f, (float16_t)-0.983385110f,
+    (float16_t)0.180022901f, (float16_t)-0.983662419f,
+    (float16_t)0.178513771f, (float16_t)-0.983937413f,
+    (float16_t)0.177004220f, (float16_t)-0.984210092f,
+    (float16_t)0.175494253f, (float16_t)-0.984480455f,
+    (float16_t)0.173983873f, (float16_t)-0.984748502f,
+    (float16_t)0.172473084f, (float16_t)-0.985014231f,
+    (float16_t)0.170961889f, (float16_t)-0.985277642f,
+    (float16_t)0.169450291f, (float16_t)-0.985538735f,
+    (float16_t)0.167938295f, (float16_t)-0.985797509f,
+    (float16_t)0.166425904f, (float16_t)-0.986053963f,
+    (float16_t)0.164913120f, (float16_t)-0.986308097f,
+    (float16_t)0.163399949f, (float16_t)-0.986559910f,
+    (float16_t)0.161886394f, (float16_t)-0.986809402f,
+    (float16_t)0.160372457f, (float16_t)-0.987056571f,
+    (float16_t)0.158858143f, (float16_t)-0.987301418f,
+    (float16_t)0.157343456f, (float16_t)-0.987543942f,
+    (float16_t)0.155828398f, (float16_t)-0.987784142f,
+    (float16_t)0.154312973f, (float16_t)-0.988022017f,
+    (float16_t)0.152797185f, (float16_t)-0.988257568f,
+    (float16_t)0.151281038f, (float16_t)-0.988490793f,
+    (float16_t)0.149764535f, (float16_t)-0.988721692f,
+    (float16_t)0.148247679f, (float16_t)-0.988950265f,
+    (float16_t)0.146730474f, (float16_t)-0.989176510f,
+    (float16_t)0.145212925f, (float16_t)-0.989400428f,
+    (float16_t)0.143695033f, (float16_t)-0.989622017f,
+    (float16_t)0.142176804f, (float16_t)-0.989841278f,
+    (float16_t)0.140658239f, (float16_t)-0.990058210f,
+    (float16_t)0.139139344f, (float16_t)-0.990272812f,
+    (float16_t)0.137620122f, (float16_t)-0.990485084f,
+    (float16_t)0.136100575f, (float16_t)-0.990695025f,
+    (float16_t)0.134580709f, (float16_t)-0.990902635f,
+    (float16_t)0.133060525f, (float16_t)-0.991107914f,
+    (float16_t)0.131540029f, (float16_t)-0.991310860f,
+    (float16_t)0.130019223f, (float16_t)-0.991511473f,
+    (float16_t)0.128498111f, (float16_t)-0.991709754f,
+    (float16_t)0.126976696f, (float16_t)-0.991905700f,
+    (float16_t)0.125454983f, (float16_t)-0.992099313f,
+    (float16_t)0.123932975f, (float16_t)-0.992290591f,
+    (float16_t)0.122410675f, (float16_t)-0.992479535f,
+    (float16_t)0.120888087f, (float16_t)-0.992666142f,
+    (float16_t)0.119365215f, (float16_t)-0.992850414f,
+    (float16_t)0.117842062f, (float16_t)-0.993032350f,
+    (float16_t)0.116318631f, (float16_t)-0.993211949f,
+    (float16_t)0.114794927f, (float16_t)-0.993389211f,
+    (float16_t)0.113270952f, (float16_t)-0.993564136f,
+    (float16_t)0.111746711f, (float16_t)-0.993736722f,
+    (float16_t)0.110222207f, (float16_t)-0.993906970f,
+    (float16_t)0.108697444f, (float16_t)-0.994074879f,
+    (float16_t)0.107172425f, (float16_t)-0.994240449f,
+    (float16_t)0.105647154f, (float16_t)-0.994403680f,
+    (float16_t)0.104121634f, (float16_t)-0.994564571f,
+    (float16_t)0.102595869f, (float16_t)-0.994723121f,
+    (float16_t)0.101069863f, (float16_t)-0.994879331f,
+    (float16_t)0.099543619f, (float16_t)-0.995033199f,
+    (float16_t)0.098017140f, (float16_t)-0.995184727f,
+    (float16_t)0.096490431f, (float16_t)-0.995333912f,
+    (float16_t)0.094963495f, (float16_t)-0.995480755f,
+    (float16_t)0.093436336f, (float16_t)-0.995625256f,
+    (float16_t)0.091908956f, (float16_t)-0.995767414f,
+    (float16_t)0.090381361f, (float16_t)-0.995907229f,
+    (float16_t)0.088853553f, (float16_t)-0.996044701f,
+    (float16_t)0.087325535f, (float16_t)-0.996179829f,
+    (float16_t)0.085797312f, (float16_t)-0.996312612f,
+    (float16_t)0.084268888f, (float16_t)-0.996443051f,
+    (float16_t)0.082740265f, (float16_t)-0.996571146f,
+    (float16_t)0.081211447f, (float16_t)-0.996696895f,
+    (float16_t)0.079682438f, (float16_t)-0.996820299f,
+    (float16_t)0.078153242f, (float16_t)-0.996941358f,
+    (float16_t)0.076623861f, (float16_t)-0.997060070f,
+    (float16_t)0.075094301f, (float16_t)-0.997176437f,
+    (float16_t)0.073564564f, (float16_t)-0.997290457f,
+    (float16_t)0.072034653f, (float16_t)-0.997402130f,
+    (float16_t)0.070504573f, (float16_t)-0.997511456f,
+    (float16_t)0.068974328f, (float16_t)-0.997618435f,
+    (float16_t)0.067443920f, (float16_t)-0.997723067f,
+    (float16_t)0.065913353f, (float16_t)-0.997825350f,
+    (float16_t)0.064382631f, (float16_t)-0.997925286f,
+    (float16_t)0.062851758f, (float16_t)-0.998022874f,
+    (float16_t)0.061320736f, (float16_t)-0.998118113f,
+    (float16_t)0.059789571f, (float16_t)-0.998211003f,
+    (float16_t)0.058258265f, (float16_t)-0.998301545f,
+    (float16_t)0.056726821f, (float16_t)-0.998389737f,
+    (float16_t)0.055195244f, (float16_t)-0.998475581f,
+    (float16_t)0.053663538f, (float16_t)-0.998559074f,
+    (float16_t)0.052131705f, (float16_t)-0.998640218f,
+    (float16_t)0.050599749f, (float16_t)-0.998719012f,
+    (float16_t)0.049067674f, (float16_t)-0.998795456f,
+    (float16_t)0.047535484f, (float16_t)-0.998869550f,
+    (float16_t)0.046003182f, (float16_t)-0.998941293f,
+    (float16_t)0.044470772f, (float16_t)-0.999010686f,
+    (float16_t)0.042938257f, (float16_t)-0.999077728f,
+    (float16_t)0.041405641f, (float16_t)-0.999142419f,
+    (float16_t)0.039872928f, (float16_t)-0.999204759f,
+    (float16_t)0.038340120f, (float16_t)-0.999264747f,
+    (float16_t)0.036807223f, (float16_t)-0.999322385f,
+    (float16_t)0.035274239f, (float16_t)-0.999377670f,
+    (float16_t)0.033741172f, (float16_t)-0.999430605f,
+    (float16_t)0.032208025f, (float16_t)-0.999481187f,
+    (float16_t)0.030674803f, (float16_t)-0.999529418f,
+    (float16_t)0.029141509f, (float16_t)-0.999575296f,
+    (float16_t)0.027608146f, (float16_t)-0.999618822f,
+    (float16_t)0.026074718f, (float16_t)-0.999659997f,
+    (float16_t)0.024541229f, (float16_t)-0.999698819f,
+    (float16_t)0.023007681f, (float16_t)-0.999735288f,
+    (float16_t)0.021474080f, (float16_t)-0.999769405f,
+    (float16_t)0.019940429f, (float16_t)-0.999801170f,
+    (float16_t)0.018406730f, (float16_t)-0.999830582f,
+    (float16_t)0.016872988f, (float16_t)-0.999857641f,
+    (float16_t)0.015339206f, (float16_t)-0.999882347f,
+    (float16_t)0.013805389f, (float16_t)-0.999904701f,
+    (float16_t)0.012271538f, (float16_t)-0.999924702f,
+    (float16_t)0.010737659f, (float16_t)-0.999942350f,
+    (float16_t)0.009203755f, (float16_t)-0.999957645f,
+    (float16_t)0.007669829f, (float16_t)-0.999970586f,
+    (float16_t)0.006135885f, (float16_t)-0.999981175f,
+    (float16_t)0.004601926f, (float16_t)-0.999989411f,
+    (float16_t)0.003067957f, (float16_t)-0.999995294f,
+    (float16_t)0.001533980f, (float16_t)-0.999998823f
+};
+
+#endif /* if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALL_TABLES) */
+
+#endif /*!defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)*/
+
+#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
+const float16_t exp_tab_f16[8] = { /* 8 float16 constants; going by the name, presumably polynomial coefficients for a vectorized exp() - confirm against the code that reads this table */
+    (1.f16),                    /* exactly 1 */
+    (0.0416598916054f16),       /* close to 1/24 */
+    (0.500000596046f16),        /* close to 1/2 */
+    (0.00138889f16),            /* close to 1/720 */
+    (1.00000011921f16),         /* close to 1 */
+    (0.00833693705499f16),      /* close to 1/120 */
+    (0.166665703058f16),        /* close to 1/6 */
+    (0.000195780929062f16),     /* close to 1/5040; NOTE(review): values look like degree 0,4,2,6,1,5,3,7 terms, i.e. an interleaved order - confirm */
+};
+
+const float16_t __logf_lut_f16[8] = { /* 8 float16 constants labelled p0..p7 below; note they are stored in the order p0,p4,p2,p6,p1,p5,p3,p7, not p0..p7 (presumably log() polynomial coefficients, going by the name - confirm) */
+    -2.295614848256274f16,         /*p0*/
+    -2.470711633419806f16,         /*p4*/
+    -5.686926051100417f16,         /*p2*/
+    -0.165253547131978f16,         /*p6*/
+    +5.175912446351073f16,         /*p1*/
+    +0.844006986174912f16,         /*p5*/
+    +4.584458825456749f16,         /*p3*/
+    +0.014127821926000f16          /*p7*/
+};
+
+#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
+#endif /* Not ARM AC5 */
+
+
+/**
+  @} end of CFFT_CIFFT group
+*/
diff --git a/CMSIS/DSP/Source/CommonTables/arm_const_structs.c b/CMSIS/DSP/Source/CommonTables/arm_const_structs.c
index 5d3f7aae58c7f60502f10fdc2cf79bc871adcf79..aff3a13052bd75c6c88f6de524a31eeb5ba53803 100644
--- a/CMSIS/DSP/Source/CommonTables/arm_const_structs.c
+++ b/CMSIS/DSP/Source/CommonTables/arm_const_structs.c
@@ -4,13 +4,13 @@
  * Description:  Constant structs that are initialized for user convenience.
  *               For example, some can be given as arguments to the arm_cfft_f32() or arm_rfft_f32() functions.
  *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -27,7 +27,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "arm_math_types.h"
 #include "arm_const_structs.h"
 
 /*
@@ -94,15 +94,6 @@ const arm_cfft_instance_f64 arm_cfft_sR_f64_len4096 = {
 /* Floating-point structs */
 #if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE)
 
-/* 
-
-Those structures cannot be used to initialize the MVE version of the FFT F32 instances.
-So they are not compiled when MVE is defined.
-
-For the MVE version, the new arm_cfft_init_f32 must be used.
-
-
-*/
 
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len16 = {
@@ -163,7 +154,7 @@ const arm_cfft_instance_f32 arm_cfft_sR_f32_len4096 = {
 
 /* Fixed-point structs */
 
-#if !defined(ARM_MATH_MVEI)
+#if !defined(ARM_MATH_MVEI) || defined(ARM_MATH_AUTOVECTORIZE)
 
 /* 
 
@@ -357,7 +348,7 @@ const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len4096 = {
 
 #if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE)
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len32 = {
   { 16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE_16_TABLE_LENGTH },
   32U,
@@ -365,7 +356,7 @@ const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len32 = {
 };
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len64 = {
    { 32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH },
   64U,
@@ -373,7 +364,7 @@ const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len64 = {
 };
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len128 = {
   { 64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH },
   128U,
@@ -381,7 +372,7 @@ const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len128 = {
 };
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len256 = {
   { 128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH },
   256U,
@@ -389,7 +380,7 @@ const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len256 = {
 };
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len512 = {
   { 256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH },
   512U,
@@ -397,7 +388,7 @@ const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len512 = {
 };
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len1024 = {
   { 512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH },
   1024U,
@@ -405,7 +396,7 @@ const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len1024 = {
 };
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len2048 = {
   { 1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH },
   2048U,
@@ -413,7 +404,7 @@ const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len2048 = {
 };
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
 const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len4096 = {
   { 2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH },
   4096U,
@@ -426,7 +417,7 @@ const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len4096 = {
 /* Fixed-point structs */
 /* q31_t */
 
-#if !defined(ARM_MATH_MVEI)
+#if !defined(ARM_MATH_MVEI) || defined(ARM_MATH_AUTOVECTORIZE)
 
 /* 
 
diff --git a/CMSIS/DSP/Source/CommonTables/arm_const_structs_f16.c b/CMSIS/DSP/Source/CommonTables/arm_const_structs_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..c0a20e5bc488c069ade7f7d30ed69bee750a19a9
--- /dev/null
+++ b/CMSIS/DSP/Source/CommonTables/arm_const_structs_f16.c
@@ -0,0 +1,120 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_const_structs_f16.c
+ * Description:  Constant structs that are initialized for user convenience.
+ *               For example, some can be given as arguments to the arm_cfft_f16() function.
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math_types_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include "arm_const_structs_f16.h"
+
+
+/*
+ALLOW TABLE is true when config table is enabled and the Transform folder is included
+for compilation.
+*/
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+
+/* Floating-point structs */
+#if !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE)
+
+
+/* 
+
+These structures cannot be used to initialize the MVE version of the FFT F16 instances.
+So they are not compiled when MVE is defined.
+
+For the MVE version, the new arm_cfft_init_f16 must be used.
+
+
+*/
+
+#if !defined(__CC_ARM)
+ 
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
+const arm_cfft_instance_f16 arm_cfft_sR_f16_len16 = { /* Constant f16 CFFT instance for length 16. */
+  16, twiddleCoefF16_16, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH /* positional; presumably FFT length, twiddle table, bit-reversal table, bit-reversal table length - confirm against the struct declaration */
+}; /* NOTE(review): the guard tests ARM_TABLE_BITREVIDX_FLT_16 but the initializer uses armBitRevIndexTable_fixed_16 - confirm this macro is the one that enables that table. */
+#endif /* arm_cfft_sR_f16_len16 */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
+const arm_cfft_instance_f16 arm_cfft_sR_f16_len32 = { /* Constant f16 CFFT instance for length 32. */
+  32, twiddleCoefF16_32, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH /* positional; presumably FFT length, twiddle table, bit-reversal table, bit-reversal table length - confirm against the struct declaration */
+}; /* NOTE(review): the guard tests ARM_TABLE_BITREVIDX_FLT_32 but the initializer uses armBitRevIndexTable_fixed_32 - confirm this macro is the one that enables that table. */
+#endif /* arm_cfft_sR_f16_len32 */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
+const arm_cfft_instance_f16 arm_cfft_sR_f16_len64 = { /* Constant f16 CFFT instance for length 64. */
+  64, twiddleCoefF16_64, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH /* positional; presumably FFT length, twiddle table, bit-reversal table, bit-reversal table length - confirm against the struct declaration */
+}; /* NOTE(review): the guard tests ARM_TABLE_BITREVIDX_FLT_64 but the initializer uses armBitRevIndexTable_fixed_64 - confirm this macro is the one that enables that table. */
+#endif /* arm_cfft_sR_f16_len64 */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
+const arm_cfft_instance_f16 arm_cfft_sR_f16_len128 = { /* Constant f16 CFFT instance for length 128. */
+  128, twiddleCoefF16_128, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH /* positional; presumably FFT length, twiddle table, bit-reversal table, bit-reversal table length - confirm against the struct declaration */
+}; /* NOTE(review): the guard tests ARM_TABLE_BITREVIDX_FLT_128 but the initializer uses armBitRevIndexTable_fixed_128 - confirm this macro is the one that enables that table. */
+#endif /* arm_cfft_sR_f16_len128 */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
+const arm_cfft_instance_f16 arm_cfft_sR_f16_len256 = { /* Constant f16 CFFT instance for length 256. */
+  256, twiddleCoefF16_256, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH /* positional; presumably FFT length, twiddle table, bit-reversal table, bit-reversal table length - confirm against the struct declaration */
+}; /* NOTE(review): the guard tests ARM_TABLE_BITREVIDX_FLT_256 but the initializer uses armBitRevIndexTable_fixed_256 - confirm this macro is the one that enables that table. */
+#endif /* arm_cfft_sR_f16_len256 */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
+const arm_cfft_instance_f16 arm_cfft_sR_f16_len512 = { /* Constant f16 CFFT instance for length 512. */
+  512, twiddleCoefF16_512, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH /* positional; presumably FFT length, twiddle table, bit-reversal table, bit-reversal table length - confirm against the struct declaration */
+}; /* NOTE(review): the guard tests ARM_TABLE_BITREVIDX_FLT_512 but the initializer uses armBitRevIndexTable_fixed_512 - confirm this macro is the one that enables that table. */
+#endif /* arm_cfft_sR_f16_len512 */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
+const arm_cfft_instance_f16 arm_cfft_sR_f16_len1024 = { /* Constant f16 CFFT instance for length 1024. */
+  1024, twiddleCoefF16_1024, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH /* positional; presumably FFT length, twiddle table, bit-reversal table, bit-reversal table length - confirm against the struct declaration */
+}; /* NOTE(review): the guard tests ARM_TABLE_BITREVIDX_FLT_1024 but the initializer uses armBitRevIndexTable_fixed_1024 - confirm this macro is the one that enables that table. */
+#endif /* arm_cfft_sR_f16_len1024 */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
+const arm_cfft_instance_f16 arm_cfft_sR_f16_len2048 = { /* Constant f16 CFFT instance for length 2048. */
+  2048, twiddleCoefF16_2048, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH /* positional; presumably FFT length, twiddle table, bit-reversal table, bit-reversal table length - confirm against the struct declaration */
+}; /* NOTE(review): the guard tests ARM_TABLE_BITREVIDX_FLT_2048 but the initializer uses armBitRevIndexTable_fixed_2048 - confirm this macro is the one that enables that table. */
+#endif /* arm_cfft_sR_f16_len2048 */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
+const arm_cfft_instance_f16 arm_cfft_sR_f16_len4096 = { /* Constant f16 CFFT instance for length 4096. */
+  4096, twiddleCoefF16_4096, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH /* positional; presumably FFT length, twiddle table, bit-reversal table, bit-reversal table length - confirm against the struct declaration */
+}; /* NOTE(review): the guard tests ARM_TABLE_BITREVIDX_FLT_4096 but the initializer uses armBitRevIndexTable_fixed_4096 - confirm this macro is the one that enables that table. */
+#endif /* arm_cfft_sR_f16_len4096 */
+#endif /* !defined(__CC_ARM) */
+
+#endif /* !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
+
+#endif /* defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/CMSIS/DSP/Source/CommonTables/arm_mve_tables.c b/CMSIS/DSP/Source/CommonTables/arm_mve_tables.c
index c3d60e068b3aca388e2af55f671dc990a9403294..82b710e0a12e13b74a6be3197a4dc6ea889bb3b4 100644
--- a/CMSIS/DSP/Source/CommonTables/arm_mve_tables.c
+++ b/CMSIS/DSP/Source/CommonTables/arm_mve_tables.c
@@ -4,13 +4,14 @@
  * Description:  common tables like fft twiddle factors, Bitreverse, reciprocal etc
  *               used for MVE implementation only
  *
- * $Date:        08. January 2020
- * $Revision:    V1.7.0
+ * $Date:        23 April 2021
  *
- * Target Processor: Cortex-M cores
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -28,7 +29,7 @@
  */
 
  
-#include "arm_math.h"
+#include "arm_math_types.h"
 
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 
@@ -3764,9 +3765,8 @@ float32_t rearranged_twiddle_stride3_4096_f32[2728]={
 #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
-#include "arm_math.h"
 
-#if defined(ARM_MATH_MVEI) 
+#if defined(ARM_MATH_MVEI)  && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
 
@@ -5430,9 +5430,8 @@ q31_t rearranged_twiddle_stride3_4096_q31[2728]={
 #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
 #endif /* defined(ARM_MATH_MVEI)  */
 
-#include "arm_math.h"
 
-#if defined(ARM_MATH_MVEI) 
+#if defined(ARM_MATH_MVEI)  && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
 
@@ -6526,17 +6525,6 @@ q15_t rearranged_twiddle_stride3_4096_q15[2728]={
 
 
 
-#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
-#endif /* defined(ARM_MATH_MVEI)  */
-
-#include "arm_math.h"
-
-#if defined(ARM_MATH_MVEI) 
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-
-
 #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
 #endif /* defined(ARM_MATH_MVEI)  */
 
diff --git a/CMSIS/DSP/Source/CommonTables/arm_mve_tables_f16.c b/CMSIS/DSP/Source/CommonTables/arm_mve_tables_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..e28e5b4692988314209491d8a96e7c57cfd24d6e
--- /dev/null
+++ b/CMSIS/DSP/Source/CommonTables/arm_mve_tables_f16.c
@@ -0,0 +1,5575 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mve_tables_f16.c
+ * Description:  common tables like fft twiddle factors, Bitreverse, reciprocal etc
+ *               used for MVE implementation only
+ *
+ * $Date:        23 April 2021
+ *
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math_types_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_16) || defined(ARM_TABLE_TWIDDLECOEF_F16_32)
+
+uint32_t rearranged_twiddle_tab_stride1_arr_16_f16[2]={
+0,0,};
+
+uint32_t rearranged_twiddle_tab_stride2_arr_16_f16[2]={
+0,0,};
+
+uint32_t rearranged_twiddle_tab_stride3_arr_16_f16[2]={
+0,0,};
+
+float16_t rearranged_twiddle_stride1_16_f16[8]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,};
+
+float16_t rearranged_twiddle_stride2_16_f16[8]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,};
+
+float16_t rearranged_twiddle_stride3_16_f16[8]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,};
+
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_64) || defined(ARM_TABLE_TWIDDLECOEF_F16_128)
+
+uint32_t rearranged_twiddle_tab_stride1_arr_64_f16[3]={
+0,32,0,};
+
+uint32_t rearranged_twiddle_tab_stride2_arr_64_f16[3]={
+0,32,0,};
+
+uint32_t rearranged_twiddle_tab_stride3_arr_64_f16[3]={
+0,32,0,};
+
+float16_t rearranged_twiddle_stride1_64_f16[40]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,};
+
+float16_t rearranged_twiddle_stride2_64_f16[40]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.38268343236508972627f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.55557023301960195560f,(float16_t)0.83146961230254534669f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.83146961230254534669f,(float16_t)0.55557023301960217765f,
+(float16_t)-0.92387953251128673848f,(float16_t)0.38268343236508989280f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,};
+
+float16_t rearranged_twiddle_stride3_64_f16[40]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.99518472667219692873f,(float16_t)-0.09801714032956058975f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,
+(float16_t)-0.77301045336273710440f,(float16_t)-0.63439328416364526575f,
+(float16_t)-0.55557023301960217765f,(float16_t)-0.83146961230254523567f,
+(float16_t)-0.29028467725446244208f,(float16_t)-0.95694033573220882438f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,};
+
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_256) || defined(ARM_TABLE_TWIDDLECOEF_F16_512)
+
+uint32_t rearranged_twiddle_tab_stride1_arr_256_f16[4]={
+0,128,160,0,};
+
+uint32_t rearranged_twiddle_tab_stride2_arr_256_f16[4]={
+0,128,160,0,};
+
+uint32_t rearranged_twiddle_tab_stride3_arr_256_f16[4]={
+0,128,160,0,};
+
+float16_t rearranged_twiddle_stride1_256_f16[168]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99969881869620424997f,(float16_t)0.02454122852291228812f,
+(float16_t)0.99879545620517240501f,(float16_t)0.04906767432741801493f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.99247953459870996706f,(float16_t)0.12241067519921619566f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98527764238894122162f,(float16_t)0.17096188876030121717f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.97003125319454397424f,(float16_t)0.24298017990326387094f,
+(float16_t)0.96377606579543984022f,(float16_t)0.26671275747489836538f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.94952818059303667475f,(float16_t)0.31368174039889151761f,
+(float16_t)0.94154406518302080631f,(float16_t)0.33688985339222005111f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.91420975570353069095f,(float16_t)0.40524131400498986100f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.89322430119551532446f,(float16_t)0.44961132965460653965f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.85772861000027211809f,(float16_t)0.51410274419322166128f,
+(float16_t)0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.81758481315158371139f,(float16_t)0.57580819141784533866f,
+(float16_t)0.80320753148064494287f,(float16_t)0.59569930449243335691f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.75720884650648456748f,(float16_t)0.65317284295377675551f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.72424708295146700276f,(float16_t)0.68954054473706682948f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.67155895484701833009f,(float16_t)0.74095112535495910588f,
+(float16_t)0.65317284295377686654f,(float16_t)0.75720884650648456748f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.61523159058062681925f,(float16_t)0.78834642762660622761f,
+(float16_t)0.59569930449243346793f,(float16_t)0.80320753148064483184f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.53499761988709726435f,(float16_t)0.84485356524970700587f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.49289819222978409341f,(float16_t)0.87008699110871134952f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.42755509343028219593f,(float16_t)0.90398929312344333820f,
+(float16_t)0.40524131400498986100f,(float16_t)0.91420975570353069095f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.35989503653498827740f,(float16_t)0.93299279883473884567f,
+(float16_t)0.33688985339222005111f,(float16_t)0.94154406518302080631f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.26671275747489842090f,(float16_t)0.96377606579543984022f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.21910124015686976984f,(float16_t)0.97570213003852857003f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.14673047445536174793f,(float16_t)0.98917650996478101444f,
+(float16_t)0.12241067519921627893f,(float16_t)0.99247953459870996706f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.07356456359966745406f,(float16_t)0.99729045667869020697f,
+(float16_t)0.04906767432741812596f,(float16_t)0.99879545620517240501f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,};
+
+float16_t rearranged_twiddle_stride2_256_f16[168]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99879545620517240501f,(float16_t)0.04906767432741801493f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.97003125319454397424f,(float16_t)0.24298017990326387094f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.94154406518302080631f,(float16_t)0.33688985339222005111f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.85772861000027211809f,(float16_t)0.51410274419322166128f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.80320753148064494287f,(float16_t)0.59569930449243335691f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.67155895484701833009f,(float16_t)0.74095112535495910588f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.59569930449243346793f,(float16_t)0.80320753148064483184f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.42755509343028219593f,(float16_t)0.90398929312344333820f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.33688985339222005111f,(float16_t)0.94154406518302080631f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.14673047445536174793f,(float16_t)0.98917650996478101444f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.04906767432741812596f,(float16_t)0.99879545620517240501f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.04906767432741800800f,(float16_t)0.99879545620517240501f,
+(float16_t)-0.09801714032956064526f,(float16_t)0.99518472667219692873f,
+(float16_t)-0.14673047445536163691f,(float16_t)0.98917650996478101444f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.24298017990326387094f,(float16_t)0.97003125319454397424f,
+(float16_t)-0.29028467725446216452f,(float16_t)0.95694033573220893540f,
+(float16_t)-0.33688985339221994009f,(float16_t)0.94154406518302080631f,
+(float16_t)-0.38268343236508972627f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.42755509343028186287f,(float16_t)0.90398929312344344922f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.51410274419322155026f,(float16_t)0.85772861000027211809f,
+(float16_t)-0.55557023301960195560f,(float16_t)0.83146961230254534669f,
+(float16_t)-0.59569930449243335691f,(float16_t)0.80320753148064494287f,
+(float16_t)-0.63439328416364537677f,(float16_t)0.77301045336273710440f,
+(float16_t)-0.67155895484701844111f,(float16_t)0.74095112535495899486f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.74095112535495888384f,(float16_t)0.67155895484701855214f,
+(float16_t)-0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)-0.80320753148064483184f,(float16_t)0.59569930449243346793f,
+(float16_t)-0.83146961230254534669f,(float16_t)0.55557023301960217765f,
+(float16_t)-0.85772861000027200706f,(float16_t)0.51410274419322177231f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.90398929312344333820f,(float16_t)0.42755509343028202940f,
+(float16_t)-0.92387953251128673848f,(float16_t)0.38268343236508989280f,
+(float16_t)-0.94154406518302069529f,(float16_t)0.33688985339222032867f,
+(float16_t)-0.95694033573220882438f,(float16_t)0.29028467725446238656f,
+(float16_t)-0.97003125319454397424f,(float16_t)0.24298017990326406523f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.98917650996478101444f,(float16_t)0.14673047445536180344f,
+(float16_t)-0.99518472667219681771f,(float16_t)0.09801714032956082567f,
+(float16_t)-0.99879545620517240501f,(float16_t)0.04906767432741796636f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.38268343236508972627f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.55557023301960195560f,(float16_t)0.83146961230254534669f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.83146961230254534669f,(float16_t)0.55557023301960217765f,
+(float16_t)-0.92387953251128673848f,(float16_t)0.38268343236508989280f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,};
+
+float16_t rearranged_twiddle_stride3_256_f16[168]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)-0.04906767432741800800f,(float16_t)0.99879545620517240501f,
+(float16_t)-0.12241067519921615403f,(float16_t)0.99247953459870996706f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.26671275747489830987f,(float16_t)0.96377606579543984022f,
+(float16_t)-0.33688985339221994009f,(float16_t)0.94154406518302080631f,
+(float16_t)-0.40524131400498974998f,(float16_t)0.91420975570353069095f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.53499761988709704230f,(float16_t)0.84485356524970722791f,
+(float16_t)-0.59569930449243335691f,(float16_t)0.80320753148064494287f,
+(float16_t)-0.65317284295377653347f,(float16_t)0.75720884650648467851f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.75720884650648467851f,(float16_t)0.65317284295377664449f,
+(float16_t)-0.80320753148064483184f,(float16_t)0.59569930449243346793f,
+(float16_t)-0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.91420975570353069095f,(float16_t)0.40524131400498991651f,
+(float16_t)-0.94154406518302069529f,(float16_t)0.33688985339222032867f,
+(float16_t)-0.96377606579543984022f,(float16_t)0.26671275747489847641f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.99247953459870996706f,(float16_t)0.12241067519921634832f,
+(float16_t)-0.99879545620517240501f,(float16_t)0.04906767432741796636f,
+(float16_t)-0.99969881869620424997f,(float16_t)-0.02454122852291207996f,
+(float16_t)-0.99518472667219692873f,(float16_t)-0.09801714032956058975f,
+(float16_t)-0.98527764238894133264f,(float16_t)-0.17096188876030096737f,
+(float16_t)-0.97003125319454397424f,(float16_t)-0.24298017990326381543f,
+(float16_t)-0.94952818059303678577f,(float16_t)-0.31368174039889118454f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,
+(float16_t)-0.89322430119551532446f,(float16_t)-0.44961132965460665067f,
+(float16_t)-0.85772861000027211809f,(float16_t)-0.51410274419322155026f,
+(float16_t)-0.81758481315158371139f,(float16_t)-0.57580819141784533866f,
+(float16_t)-0.77301045336273710440f,(float16_t)-0.63439328416364526575f,
+(float16_t)-0.72424708295146700276f,(float16_t)-0.68954054473706682948f,
+(float16_t)-0.67155895484701866316f,(float16_t)-0.74095112535495888384f,
+(float16_t)-0.61523159058062726334f,(float16_t)-0.78834642762660589455f,
+(float16_t)-0.55557023301960217765f,(float16_t)-0.83146961230254523567f,
+(float16_t)-0.49289819222978420443f,(float16_t)-0.87008699110871134952f,
+(float16_t)-0.42755509343028247349f,(float16_t)-0.90398929312344311615f,
+(float16_t)-0.35989503653498794433f,(float16_t)-0.93299279883473895669f,
+(float16_t)-0.29028467725446244208f,(float16_t)-0.95694033573220882438f,
+(float16_t)-0.21910124015687010290f,(float16_t)-0.97570213003852845901f,
+(float16_t)-0.14673047445536230304f,(float16_t)-0.98917650996478090342f,
+(float16_t)-0.07356456359966735692f,(float16_t)-0.99729045667869020697f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.99518472667219692873f,(float16_t)-0.09801714032956058975f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,
+(float16_t)-0.77301045336273710440f,(float16_t)-0.63439328416364526575f,
+(float16_t)-0.55557023301960217765f,(float16_t)-0.83146961230254523567f,
+(float16_t)-0.29028467725446244208f,(float16_t)-0.95694033573220882438f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,};
+
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_1024) || defined(ARM_TABLE_TWIDDLECOEF_F16_2048)
+
+uint32_t rearranged_twiddle_tab_stride1_arr_1024_f16[5]={
+0,512,640,672,0,};
+
+uint32_t rearranged_twiddle_tab_stride2_arr_1024_f16[5]={
+0,512,640,672,0,};
+
+uint32_t rearranged_twiddle_tab_stride3_arr_1024_f16[5]={
+0,512,640,672,0,};
+
+float16_t rearranged_twiddle_stride1_1024_f16[680]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99998117528260110909f,(float16_t)0.00613588464915447527f,
+(float16_t)0.99992470183914450299f,(float16_t)0.01227153828571992539f,
+(float16_t)0.99983058179582340319f,(float16_t)0.01840672990580482019f,
+(float16_t)0.99969881869620424997f,(float16_t)0.02454122852291228812f,
+(float16_t)0.99952941750109314256f,(float16_t)0.03067480317663662595f,
+(float16_t)0.99932238458834954375f,(float16_t)0.03680722294135883171f,
+(float16_t)0.99907772775264536147f,(float16_t)0.04293825693494082024f,
+(float16_t)0.99879545620517240501f,(float16_t)0.04906767432741801493f,
+(float16_t)0.99847558057329477421f,(float16_t)0.05519524434968993420f,
+(float16_t)0.99811811290014917919f,(float16_t)0.06132073630220857829f,
+(float16_t)0.99772306664419163624f,(float16_t)0.06744391956366405094f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.99682029929116566791f,(float16_t)0.07968243797143012563f,
+(float16_t)0.99631261218277800129f,(float16_t)0.08579731234443989385f,
+(float16_t)0.99576741446765981713f,(float16_t)0.09190895649713272386f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.99456457073425541537f,(float16_t)0.10412163387205458642f,
+(float16_t)0.99390697000235606051f,(float16_t)0.11022220729388305938f,
+(float16_t)0.99321194923479450001f,(float16_t)0.11631863091190475235f,
+(float16_t)0.99247953459870996706f,(float16_t)0.12241067519921619566f,
+(float16_t)0.99170975366909952520f,(float16_t)0.12849811079379316880f,
+(float16_t)0.99090263542778000971f,(float16_t)0.13458070850712616773f,
+(float16_t)0.99005821026229712256f,(float16_t)0.14065823933284921088f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98825756773074946437f,(float16_t)0.15279718525844343535f,
+(float16_t)0.98730141815785843473f,(float16_t)0.15885814333386144570f,
+(float16_t)0.98630809724459866938f,(float16_t)0.16491312048996989437f,
+(float16_t)0.98527764238894122162f,(float16_t)0.17096188876030121717f,
+(float16_t)0.98421009238692902521f,(float16_t)0.17700422041214874946f,
+(float16_t)0.98310548743121628501f,(float16_t)0.18303988795514095078f,
+(float16_t)0.98196386910955524296f,(float16_t)0.18906866414980619262f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.97956976568544051887f,(float16_t)0.20110463484209190055f,
+(float16_t)0.97831737071962765473f,(float16_t)0.20711137619221856032f,
+(float16_t)0.97702814265775439484f,(float16_t)0.21311031991609136194f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.97433938278557585821f,(float16_t)0.22508391135979283204f,
+(float16_t)0.97293995220556017678f,(float16_t)0.23105810828067110951f,
+(float16_t)0.97150389098625178352f,(float16_t)0.23702360599436719801f,
+(float16_t)0.97003125319454397424f,(float16_t)0.24298017990326387094f,
+(float16_t)0.96852209427441737777f,(float16_t)0.24892760574572014853f,
+(float16_t)0.96697647104485207059f,(float16_t)0.25486565960451457169f,
+(float16_t)0.96539444169768939830f,(float16_t)0.26079411791527551401f,
+(float16_t)0.96377606579543984022f,(float16_t)0.26671275747489836538f,
+(float16_t)0.96212140426904158019f,(float16_t)0.27262135544994897662f,
+(float16_t)0.96043051941556578655f,(float16_t)0.27851968938505305973f,
+(float16_t)0.95870347489587159906f,(float16_t)0.28440753721127187692f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.95514116830577078243f,(float16_t)0.29615088824362378883f,
+(float16_t)0.95330604035419386211f,(float16_t)0.30200594931922808417f,
+(float16_t)0.95143502096900833820f,(float16_t)0.30784964004153486661f,
+(float16_t)0.94952818059303667475f,(float16_t)0.31368174039889151761f,
+(float16_t)0.94758559101774109124f,(float16_t)0.31950203081601569188f,
+(float16_t)0.94560732538052127971f,(float16_t)0.32531029216226292622f,
+(float16_t)0.94359345816196038559f,(float16_t)0.33110630575987642921f,
+(float16_t)0.94154406518302080631f,(float16_t)0.33688985339222005111f,
+(float16_t)0.93945922360218991898f,(float16_t)0.34266071731199437833f,
+(float16_t)0.93733901191257495977f,(float16_t)0.34841868024943456472f,
+(float16_t)0.93518350993894761025f,(float16_t)0.35416352542049034380f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.93076696107898371224f,(float16_t)0.36561299780477385379f,
+(float16_t)0.92850608047321558924f,(float16_t)0.37131719395183754306f,
+(float16_t)0.92621024213831137928f,(float16_t)0.37700741021641825945f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.92151403934204190183f,(float16_t)0.38834504669882624617f,
+(float16_t)0.91911385169005777040f,(float16_t)0.39399204006104809883f,
+(float16_t)0.91667905992104270485f,(float16_t)0.39962419984564678810f,
+(float16_t)0.91420975570353069095f,(float16_t)0.40524131400498986100f,
+(float16_t)0.91170603200542987832f,(float16_t)0.41084317105790391089f,
+(float16_t)0.90916798309052238025f,(float16_t)0.41642956009763715253f,
+(float16_t)0.90659570451491533483f,(float16_t)0.42200027079979968159f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.90134884704602202810f,(float16_t)0.43309381885315195726f,
+(float16_t)0.89867446569395381673f,(float16_t)0.43861623853852765853f,
+(float16_t)0.89596624975618521791f,(float16_t)0.44412214457042920035f,
+(float16_t)0.89322430119551532446f,(float16_t)0.44961132965460653965f,
+(float16_t)0.89044872324475787817f,(float16_t)0.45508358712634383592f,
+(float16_t)0.88763962040285393496f,(float16_t)0.46053871095824000514f,
+(float16_t)0.88479709843093778954f,(float16_t)0.46597649576796618121f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.87901222642863352519f,(float16_t)0.47679923006332208812f,
+(float16_t)0.87607009419540660122f,(float16_t)0.48218377207912271887f,
+(float16_t)0.87309497841829009079f,(float16_t)0.48755016014843599592f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.86704624551569264845f,(float16_t)0.49822766697278181303f,
+(float16_t)0.86397285612158669643f,(float16_t)0.50353838372571757542f,
+(float16_t)0.86086693863776730939f,(float16_t)0.50883014254310698909f,
+(float16_t)0.85772861000027211809f,(float16_t)0.51410274419322166128f,
+(float16_t)0.85455798836540053376f,(float16_t)0.51935599016558964269f,
+(float16_t)0.85135519310526519554f,(float16_t)0.52458968267846894928f,
+(float16_t)0.84812034480329723252f,(float16_t)0.52980362468629460526f,
+(float16_t)0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)0.84155497743689844370f,(float16_t)0.54017147272989285423f,
+(float16_t)0.83822470555483807875f,(float16_t)0.54532498842204646383f,
+(float16_t)0.83486287498638001026f,(float16_t)0.55045797293660481131f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.82804504525775579626f,(float16_t)0.56066157619733603124f,
+(float16_t)0.82458930278502529099f,(float16_t)0.56573181078361312046f,
+(float16_t)0.82110251499110464835f,(float16_t)0.57078074588696725566f,
+(float16_t)0.81758481315158371139f,(float16_t)0.57580819141784533866f,
+(float16_t)0.81403632970594841378f,(float16_t)0.58081395809576452649f,
+(float16_t)0.81045719825259476821f,(float16_t)0.58579785745643886408f,
+(float16_t)0.80684755354379933401f,(float16_t)0.59075970185887416442f,
+(float16_t)0.80320753148064494287f,(float16_t)0.59569930449243335691f,
+(float16_t)0.79953726910790501314f,(float16_t)0.60061647938386897305f,
+(float16_t)0.79583690460888356633f,(float16_t)0.60551104140432554512f,
+(float16_t)0.79210657730021238887f,(float16_t)0.61038280627630947528f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.78455659715557524159f,(float16_t)0.62005721176328909561f,
+(float16_t)0.78073722857209448822f,(float16_t)0.62485948814238634341f,
+(float16_t)0.77688846567323244230f,(float16_t)0.62963823891492698426f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.76910333764557969882f,(float16_t)0.63912444486377573138f,
+(float16_t)0.76516726562245895860f,(float16_t)0.64383154288979138613f,
+(float16_t)0.76120238548426177871f,(float16_t)0.64851440102211244110f,
+(float16_t)0.75720884650648456748f,(float16_t)0.65317284295377675551f,
+(float16_t)0.75318679904361252042f,(float16_t)0.65780669329707863735f,
+(float16_t)0.74913639452345937020f,(float16_t)0.66241577759017178373f,
+(float16_t)0.74505778544146594733f,(float16_t)0.66699992230363747137f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.73681656887736979300f,(float16_t)0.67609270357531592310f,
+(float16_t)0.73265427167241281570f,(float16_t)0.68060099779545302212f,
+(float16_t)0.72846439044822519637f,(float16_t)0.68508366777270035541f,
+(float16_t)0.72424708295146700276f,(float16_t)0.68954054473706682948f,
+(float16_t)0.72000250796138165477f,(float16_t)0.69397146088965389055f,
+(float16_t)0.71573082528381870571f,(float16_t)0.69837624940897280457f,
+(float16_t)0.71143219574521643356f,(float16_t)0.70275474445722529993f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.70275474445722529993f,(float16_t)0.71143219574521643356f,
+(float16_t)0.69837624940897291559f,(float16_t)0.71573082528381859468f,
+(float16_t)0.69397146088965400157f,(float16_t)0.72000250796138165477f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.68508366777270035541f,(float16_t)0.72846439044822519637f,
+(float16_t)0.68060099779545302212f,(float16_t)0.73265427167241281570f,
+(float16_t)0.67609270357531603413f,(float16_t)0.73681656887736979300f,
+(float16_t)0.67155895484701833009f,(float16_t)0.74095112535495910588f,
+(float16_t)0.66699992230363747137f,(float16_t)0.74505778544146594733f,
+(float16_t)0.66241577759017178373f,(float16_t)0.74913639452345925918f,
+(float16_t)0.65780669329707874837f,(float16_t)0.75318679904361252042f,
+(float16_t)0.65317284295377686654f,(float16_t)0.75720884650648456748f,
+(float16_t)0.64851440102211255212f,(float16_t)0.76120238548426177871f,
+(float16_t)0.64383154288979149715f,(float16_t)0.76516726562245895860f,
+(float16_t)0.63912444486377573138f,(float16_t)0.76910333764557958780f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.62963823891492709528f,(float16_t)0.77688846567323244230f,
+(float16_t)0.62485948814238645443f,(float16_t)0.78073722857209448822f,
+(float16_t)0.62005721176328920663f,(float16_t)0.78455659715557524159f,
+(float16_t)0.61523159058062681925f,(float16_t)0.78834642762660622761f,
+(float16_t)0.61038280627630947528f,(float16_t)0.79210657730021227785f,
+(float16_t)0.60551104140432554512f,(float16_t)0.79583690460888345530f,
+(float16_t)0.60061647938386897305f,(float16_t)0.79953726910790501314f,
+(float16_t)0.59569930449243346793f,(float16_t)0.80320753148064483184f,
+(float16_t)0.59075970185887427544f,(float16_t)0.80684755354379922299f,
+(float16_t)0.58579785745643886408f,(float16_t)0.81045719825259476821f,
+(float16_t)0.58081395809576452649f,(float16_t)0.81403632970594830276f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.57078074588696736669f,(float16_t)0.82110251499110464835f,
+(float16_t)0.56573181078361323149f,(float16_t)0.82458930278502529099f,
+(float16_t)0.56066157619733603124f,(float16_t)0.82804504525775579626f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.55045797293660481131f,(float16_t)0.83486287498638001026f,
+(float16_t)0.54532498842204646383f,(float16_t)0.83822470555483796772f,
+(float16_t)0.54017147272989296525f,(float16_t)0.84155497743689833268f,
+(float16_t)0.53499761988709726435f,(float16_t)0.84485356524970700587f,
+(float16_t)0.52980362468629482731f,(float16_t)0.84812034480329712149f,
+(float16_t)0.52458968267846883826f,(float16_t)0.85135519310526519554f,
+(float16_t)0.51935599016558953167f,(float16_t)0.85455798836540053376f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.50883014254310698909f,(float16_t)0.86086693863776730939f,
+(float16_t)0.50353838372571757542f,(float16_t)0.86397285612158669643f,
+(float16_t)0.49822766697278186854f,(float16_t)0.86704624551569264845f,
+(float16_t)0.49289819222978409341f,(float16_t)0.87008699110871134952f,
+(float16_t)0.48755016014843605143f,(float16_t)0.87309497841829009079f,
+(float16_t)0.48218377207912282989f,(float16_t)0.87607009419540660122f,
+(float16_t)0.47679923006332225466f,(float16_t)0.87901222642863341417f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.46597649576796612569f,(float16_t)0.88479709843093778954f,
+(float16_t)0.46053871095824000514f,(float16_t)0.88763962040285393496f,
+(float16_t)0.45508358712634383592f,(float16_t)0.89044872324475787817f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.44412214457042925586f,(float16_t)0.89596624975618510689f,
+(float16_t)0.43861623853852771404f,(float16_t)0.89867446569395381673f,
+(float16_t)0.43309381885315201277f,(float16_t)0.90134884704602202810f,
+(float16_t)0.42755509343028219593f,(float16_t)0.90398929312344333820f,
+(float16_t)0.42200027079979979261f,(float16_t)0.90659570451491533483f,
+(float16_t)0.41642956009763731906f,(float16_t)0.90916798309052226923f,
+(float16_t)0.41084317105790391089f,(float16_t)0.91170603200542987832f,
+(float16_t)0.40524131400498986100f,(float16_t)0.91420975570353069095f,
+(float16_t)0.39962419984564678810f,(float16_t)0.91667905992104270485f,
+(float16_t)0.39399204006104809883f,(float16_t)0.91911385169005777040f,
+(float16_t)0.38834504669882630168f,(float16_t)0.92151403934204190183f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.37700741021641831496f,(float16_t)0.92621024213831126826f,
+(float16_t)0.37131719395183759858f,(float16_t)0.92850608047321558924f,
+(float16_t)0.36561299780477396482f,(float16_t)0.93076696107898371224f,
+(float16_t)0.35989503653498827740f,(float16_t)0.93299279883473884567f,
+(float16_t)0.35416352542049051033f,(float16_t)0.93518350993894749923f,
+(float16_t)0.34841868024943450921f,(float16_t)0.93733901191257495977f,
+(float16_t)0.34266071731199437833f,(float16_t)0.93945922360218991898f,
+(float16_t)0.33688985339222005111f,(float16_t)0.94154406518302080631f,
+(float16_t)0.33110630575987642921f,(float16_t)0.94359345816196038559f,
+(float16_t)0.32531029216226298173f,(float16_t)0.94560732538052127971f,
+(float16_t)0.31950203081601574739f,(float16_t)0.94758559101774109124f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.30784964004153497763f,(float16_t)0.95143502096900833820f,
+(float16_t)0.30200594931922819519f,(float16_t)0.95330604035419375109f,
+(float16_t)0.29615088824362395536f,(float16_t)0.95514116830577067141f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.28440753721127182141f,(float16_t)0.95870347489587159906f,
+(float16_t)0.27851968938505305973f,(float16_t)0.96043051941556578655f,
+(float16_t)0.27262135544994897662f,(float16_t)0.96212140426904158019f,
+(float16_t)0.26671275747489842090f,(float16_t)0.96377606579543984022f,
+(float16_t)0.26079411791527556952f,(float16_t)0.96539444169768939830f,
+(float16_t)0.25486565960451462720f,(float16_t)0.96697647104485207059f,
+(float16_t)0.24892760574572025956f,(float16_t)0.96852209427441726675f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.23702360599436733679f,(float16_t)0.97150389098625178352f,
+(float16_t)0.23105810828067127605f,(float16_t)0.97293995220556006576f,
+(float16_t)0.22508391135979277653f,(float16_t)0.97433938278557585821f,
+(float16_t)0.21910124015686976984f,(float16_t)0.97570213003852857003f,
+(float16_t)0.21311031991609136194f,(float16_t)0.97702814265775439484f,
+(float16_t)0.20711137619221856032f,(float16_t)0.97831737071962765473f,
+(float16_t)0.20110463484209195606f,(float16_t)0.97956976568544051887f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.18906866414980627589f,(float16_t)0.98196386910955524296f,
+(float16_t)0.18303988795514106180f,(float16_t)0.98310548743121628501f,
+(float16_t)0.17700422041214886049f,(float16_t)0.98421009238692902521f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.16491312048997008866f,(float16_t)0.98630809724459866938f,
+(float16_t)0.15885814333386139019f,(float16_t)0.98730141815785843473f,
+(float16_t)0.15279718525844340760f,(float16_t)0.98825756773074946437f,
+(float16_t)0.14673047445536174793f,(float16_t)0.98917650996478101444f,
+(float16_t)0.14065823933284923863f,(float16_t)0.99005821026229712256f,
+(float16_t)0.13458070850712622324f,(float16_t)0.99090263542778000971f,
+(float16_t)0.12849811079379322432f,(float16_t)0.99170975366909952520f,
+(float16_t)0.12241067519921627893f,(float16_t)0.99247953459870996706f,
+(float16_t)0.11631863091190487725f,(float16_t)0.99321194923479450001f,
+(float16_t)0.11022220729388318428f,(float16_t)0.99390697000235606051f,
+(float16_t)0.10412163387205472520f,(float16_t)0.99456457073425541537f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.09190895649713269611f,(float16_t)0.99576741446765981713f,
+(float16_t)0.08579731234443987997f,(float16_t)0.99631261218277800129f,
+(float16_t)0.07968243797143012563f,(float16_t)0.99682029929116566791f,
+(float16_t)0.07356456359966745406f,(float16_t)0.99729045667869020697f,
+(float16_t)0.06744391956366410645f,(float16_t)0.99772306664419163624f,
+(float16_t)0.06132073630220864768f,(float16_t)0.99811811290014917919f,
+(float16_t)0.05519524434969003135f,(float16_t)0.99847558057329477421f,
+(float16_t)0.04906767432741812596f,(float16_t)0.99879545620517240501f,
+(float16_t)0.04293825693494095902f,(float16_t)0.99907772775264536147f,
+(float16_t)0.03680722294135899131f,(float16_t)0.99932238458834954375f,
+(float16_t)0.03067480317663658085f,(float16_t)0.99952941750109314256f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)0.01840672990580482019f,(float16_t)0.99983058179582340319f,
+(float16_t)0.01227153828571994447f,(float16_t)0.99992470183914450299f,
+(float16_t)0.00613588464915451517f,(float16_t)0.99998117528260110909f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99969881869620424997f,(float16_t)0.02454122852291228812f,
+(float16_t)0.99879545620517240501f,(float16_t)0.04906767432741801493f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.99247953459870996706f,(float16_t)0.12241067519921619566f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98527764238894122162f,(float16_t)0.17096188876030121717f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.97003125319454397424f,(float16_t)0.24298017990326387094f,
+(float16_t)0.96377606579543984022f,(float16_t)0.26671275747489836538f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.94952818059303667475f,(float16_t)0.31368174039889151761f,
+(float16_t)0.94154406518302080631f,(float16_t)0.33688985339222005111f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.91420975570353069095f,(float16_t)0.40524131400498986100f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.89322430119551532446f,(float16_t)0.44961132965460653965f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.85772861000027211809f,(float16_t)0.51410274419322166128f,
+(float16_t)0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.81758481315158371139f,(float16_t)0.57580819141784533866f,
+(float16_t)0.80320753148064494287f,(float16_t)0.59569930449243335691f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.75720884650648456748f,(float16_t)0.65317284295377675551f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.72424708295146700276f,(float16_t)0.68954054473706682948f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.67155895484701833009f,(float16_t)0.74095112535495910588f,
+(float16_t)0.65317284295377686654f,(float16_t)0.75720884650648456748f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.61523159058062681925f,(float16_t)0.78834642762660622761f,
+(float16_t)0.59569930449243346793f,(float16_t)0.80320753148064483184f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.53499761988709726435f,(float16_t)0.84485356524970700587f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.49289819222978409341f,(float16_t)0.87008699110871134952f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.42755509343028219593f,(float16_t)0.90398929312344333820f,
+(float16_t)0.40524131400498986100f,(float16_t)0.91420975570353069095f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.35989503653498827740f,(float16_t)0.93299279883473884567f,
+(float16_t)0.33688985339222005111f,(float16_t)0.94154406518302080631f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.26671275747489842090f,(float16_t)0.96377606579543984022f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.21910124015686976984f,(float16_t)0.97570213003852857003f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.14673047445536174793f,(float16_t)0.98917650996478101444f,
+(float16_t)0.12241067519921627893f,(float16_t)0.99247953459870996706f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.07356456359966745406f,(float16_t)0.99729045667869020697f,
+(float16_t)0.04906767432741812596f,(float16_t)0.99879545620517240501f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,};
+
+float16_t rearranged_twiddle_stride2_1024_f16[680]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99992470183914450299f,(float16_t)0.01227153828571992539f,
+(float16_t)0.99969881869620424997f,(float16_t)0.02454122852291228812f,
+(float16_t)0.99932238458834954375f,(float16_t)0.03680722294135883171f,
+(float16_t)0.99879545620517240501f,(float16_t)0.04906767432741801493f,
+(float16_t)0.99811811290014917919f,(float16_t)0.06132073630220857829f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.99631261218277800129f,(float16_t)0.08579731234443989385f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.99390697000235606051f,(float16_t)0.11022220729388305938f,
+(float16_t)0.99247953459870996706f,(float16_t)0.12241067519921619566f,
+(float16_t)0.99090263542778000971f,(float16_t)0.13458070850712616773f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98730141815785843473f,(float16_t)0.15885814333386144570f,
+(float16_t)0.98527764238894122162f,(float16_t)0.17096188876030121717f,
+(float16_t)0.98310548743121628501f,(float16_t)0.18303988795514095078f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.97831737071962765473f,(float16_t)0.20711137619221856032f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.97293995220556017678f,(float16_t)0.23105810828067110951f,
+(float16_t)0.97003125319454397424f,(float16_t)0.24298017990326387094f,
+(float16_t)0.96697647104485207059f,(float16_t)0.25486565960451457169f,
+(float16_t)0.96377606579543984022f,(float16_t)0.26671275747489836538f,
+(float16_t)0.96043051941556578655f,(float16_t)0.27851968938505305973f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.95330604035419386211f,(float16_t)0.30200594931922808417f,
+(float16_t)0.94952818059303667475f,(float16_t)0.31368174039889151761f,
+(float16_t)0.94560732538052127971f,(float16_t)0.32531029216226292622f,
+(float16_t)0.94154406518302080631f,(float16_t)0.33688985339222005111f,
+(float16_t)0.93733901191257495977f,(float16_t)0.34841868024943456472f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.92850608047321558924f,(float16_t)0.37131719395183754306f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.91911385169005777040f,(float16_t)0.39399204006104809883f,
+(float16_t)0.91420975570353069095f,(float16_t)0.40524131400498986100f,
+(float16_t)0.90916798309052238025f,(float16_t)0.41642956009763715253f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.89867446569395381673f,(float16_t)0.43861623853852765853f,
+(float16_t)0.89322430119551532446f,(float16_t)0.44961132965460653965f,
+(float16_t)0.88763962040285393496f,(float16_t)0.46053871095824000514f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.87607009419540660122f,(float16_t)0.48218377207912271887f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.86397285612158669643f,(float16_t)0.50353838372571757542f,
+(float16_t)0.85772861000027211809f,(float16_t)0.51410274419322166128f,
+(float16_t)0.85135519310526519554f,(float16_t)0.52458968267846894928f,
+(float16_t)0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)0.83822470555483807875f,(float16_t)0.54532498842204646383f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.82458930278502529099f,(float16_t)0.56573181078361312046f,
+(float16_t)0.81758481315158371139f,(float16_t)0.57580819141784533866f,
+(float16_t)0.81045719825259476821f,(float16_t)0.58579785745643886408f,
+(float16_t)0.80320753148064494287f,(float16_t)0.59569930449243335691f,
+(float16_t)0.79583690460888356633f,(float16_t)0.60551104140432554512f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.78073722857209448822f,(float16_t)0.62485948814238634341f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.76516726562245895860f,(float16_t)0.64383154288979138613f,
+(float16_t)0.75720884650648456748f,(float16_t)0.65317284295377675551f,
+(float16_t)0.74913639452345937020f,(float16_t)0.66241577759017178373f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.73265427167241281570f,(float16_t)0.68060099779545302212f,
+(float16_t)0.72424708295146700276f,(float16_t)0.68954054473706682948f,
+(float16_t)0.71573082528381870571f,(float16_t)0.69837624940897280457f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.69837624940897291559f,(float16_t)0.71573082528381859468f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.68060099779545302212f,(float16_t)0.73265427167241281570f,
+(float16_t)0.67155895484701833009f,(float16_t)0.74095112535495910588f,
+(float16_t)0.66241577759017178373f,(float16_t)0.74913639452345925918f,
+(float16_t)0.65317284295377686654f,(float16_t)0.75720884650648456748f,
+(float16_t)0.64383154288979149715f,(float16_t)0.76516726562245895860f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.62485948814238645443f,(float16_t)0.78073722857209448822f,
+(float16_t)0.61523159058062681925f,(float16_t)0.78834642762660622761f,
+(float16_t)0.60551104140432554512f,(float16_t)0.79583690460888345530f,
+(float16_t)0.59569930449243346793f,(float16_t)0.80320753148064483184f,
+(float16_t)0.58579785745643886408f,(float16_t)0.81045719825259476821f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.56573181078361323149f,(float16_t)0.82458930278502529099f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.54532498842204646383f,(float16_t)0.83822470555483796772f,
+(float16_t)0.53499761988709726435f,(float16_t)0.84485356524970700587f,
+(float16_t)0.52458968267846883826f,(float16_t)0.85135519310526519554f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.50353838372571757542f,(float16_t)0.86397285612158669643f,
+(float16_t)0.49289819222978409341f,(float16_t)0.87008699110871134952f,
+(float16_t)0.48218377207912282989f,(float16_t)0.87607009419540660122f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.46053871095824000514f,(float16_t)0.88763962040285393496f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.43861623853852771404f,(float16_t)0.89867446569395381673f,
+(float16_t)0.42755509343028219593f,(float16_t)0.90398929312344333820f,
+(float16_t)0.41642956009763731906f,(float16_t)0.90916798309052226923f,
+(float16_t)0.40524131400498986100f,(float16_t)0.91420975570353069095f,
+(float16_t)0.39399204006104809883f,(float16_t)0.91911385169005777040f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.37131719395183759858f,(float16_t)0.92850608047321558924f,
+(float16_t)0.35989503653498827740f,(float16_t)0.93299279883473884567f,
+(float16_t)0.34841868024943450921f,(float16_t)0.93733901191257495977f,
+(float16_t)0.33688985339222005111f,(float16_t)0.94154406518302080631f,
+(float16_t)0.32531029216226298173f,(float16_t)0.94560732538052127971f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.30200594931922819519f,(float16_t)0.95330604035419375109f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.27851968938505305973f,(float16_t)0.96043051941556578655f,
+(float16_t)0.26671275747489842090f,(float16_t)0.96377606579543984022f,
+(float16_t)0.25486565960451462720f,(float16_t)0.96697647104485207059f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.23105810828067127605f,(float16_t)0.97293995220556006576f,
+(float16_t)0.21910124015686976984f,(float16_t)0.97570213003852857003f,
+(float16_t)0.20711137619221856032f,(float16_t)0.97831737071962765473f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.18303988795514106180f,(float16_t)0.98310548743121628501f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.15885814333386139019f,(float16_t)0.98730141815785843473f,
+(float16_t)0.14673047445536174793f,(float16_t)0.98917650996478101444f,
+(float16_t)0.13458070850712622324f,(float16_t)0.99090263542778000971f,
+(float16_t)0.12241067519921627893f,(float16_t)0.99247953459870996706f,
+(float16_t)0.11022220729388318428f,(float16_t)0.99390697000235606051f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.08579731234443987997f,(float16_t)0.99631261218277800129f,
+(float16_t)0.07356456359966745406f,(float16_t)0.99729045667869020697f,
+(float16_t)0.06132073630220864768f,(float16_t)0.99811811290014917919f,
+(float16_t)0.04906767432741812596f,(float16_t)0.99879545620517240501f,
+(float16_t)0.03680722294135899131f,(float16_t)0.99932238458834954375f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)0.01227153828571994447f,(float16_t)0.99992470183914450299f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.01227153828571982304f,(float16_t)0.99992470183914450299f,
+(float16_t)-0.02454122852291214241f,(float16_t)0.99969881869620424997f,
+(float16_t)-0.03680722294135886641f,(float16_t)0.99932238458834954375f,
+(float16_t)-0.04906767432741800800f,(float16_t)0.99879545620517240501f,
+(float16_t)-0.06132073630220852972f,(float16_t)0.99811811290014917919f,
+(float16_t)-0.07356456359966732916f,(float16_t)0.99729045667869020697f,
+(float16_t)-0.08579731234443975507f,(float16_t)0.99631261218277800129f,
+(float16_t)-0.09801714032956064526f,(float16_t)0.99518472667219692873f,
+(float16_t)-0.11022220729388305938f,(float16_t)0.99390697000235606051f,
+(float16_t)-0.12241067519921615403f,(float16_t)0.99247953459870996706f,
+(float16_t)-0.13458070850712611222f,(float16_t)0.99090263542778000971f,
+(float16_t)-0.14673047445536163691f,(float16_t)0.98917650996478101444f,
+(float16_t)-0.15885814333386127917f,(float16_t)0.98730141815785843473f,
+(float16_t)-0.17096188876030124493f,(float16_t)0.98527764238894122162f,
+(float16_t)-0.18303988795514092303f,(float16_t)0.98310548743121628501f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.20711137619221844930f,(float16_t)0.97831737071962765473f,
+(float16_t)-0.21910124015686965881f,(float16_t)0.97570213003852857003f,
+(float16_t)-0.23105810828067113727f,(float16_t)0.97293995220556017678f,
+(float16_t)-0.24298017990326387094f,(float16_t)0.97003125319454397424f,
+(float16_t)-0.25486565960451451618f,(float16_t)0.96697647104485207059f,
+(float16_t)-0.26671275747489830987f,(float16_t)0.96377606579543984022f,
+(float16_t)-0.27851968938505294870f,(float16_t)0.96043051941556589757f,
+(float16_t)-0.29028467725446216452f,(float16_t)0.95694033573220893540f,
+(float16_t)-0.30200594931922808417f,(float16_t)0.95330604035419386211f,
+(float16_t)-0.31368174039889140658f,(float16_t)0.94952818059303667475f,
+(float16_t)-0.32531029216226287071f,(float16_t)0.94560732538052139073f,
+(float16_t)-0.33688985339221994009f,(float16_t)0.94154406518302080631f,
+(float16_t)-0.34841868024943439819f,(float16_t)0.93733901191257495977f,
+(float16_t)-0.35989503653498816638f,(float16_t)0.93299279883473884567f,
+(float16_t)-0.37131719395183748755f,(float16_t)0.92850608047321558924f,
+(float16_t)-0.38268343236508972627f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.39399204006104798781f,(float16_t)0.91911385169005777040f,
+(float16_t)-0.40524131400498974998f,(float16_t)0.91420975570353069095f,
+(float16_t)-0.41642956009763698599f,(float16_t)0.90916798309052249127f,
+(float16_t)-0.42755509343028186287f,(float16_t)0.90398929312344344922f,
+(float16_t)-0.43861623853852738097f,(float16_t)0.89867446569395392775f,
+(float16_t)-0.44961132965460670619f,(float16_t)0.89322430119551521344f,
+(float16_t)-0.46053871095824006066f,(float16_t)0.88763962040285393496f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.48218377207912271887f,(float16_t)0.87607009419540660122f,
+(float16_t)-0.49289819222978398239f,(float16_t)0.87008699110871146054f,
+(float16_t)-0.50353838372571746440f,(float16_t)0.86397285612158680745f,
+(float16_t)-0.51410274419322155026f,(float16_t)0.85772861000027211809f,
+(float16_t)-0.52458968267846872724f,(float16_t)0.85135519310526519554f,
+(float16_t)-0.53499761988709704230f,(float16_t)0.84485356524970722791f,
+(float16_t)-0.54532498842204624179f,(float16_t)0.83822470555483818977f,
+(float16_t)-0.55557023301960195560f,(float16_t)0.83146961230254534669f,
+(float16_t)-0.56573181078361323149f,(float16_t)0.82458930278502517996f,
+(float16_t)-0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)-0.58579785745643886408f,(float16_t)0.81045719825259476821f,
+(float16_t)-0.59569930449243335691f,(float16_t)0.80320753148064494287f,
+(float16_t)-0.60551104140432543410f,(float16_t)0.79583690460888356633f,
+(float16_t)-0.61523159058062670823f,(float16_t)0.78834642762660633863f,
+(float16_t)-0.62485948814238623239f,(float16_t)0.78073722857209459924f,
+(float16_t)-0.63439328416364537677f,(float16_t)0.77301045336273710440f,
+(float16_t)-0.64383154288979127511f,(float16_t)0.76516726562245906962f,
+(float16_t)-0.65317284295377653347f,(float16_t)0.75720884650648467851f,
+(float16_t)-0.66241577759017189475f,(float16_t)0.74913639452345925918f,
+(float16_t)-0.67155895484701844111f,(float16_t)0.74095112535495899486f,
+(float16_t)-0.68060099779545302212f,(float16_t)0.73265427167241281570f,
+(float16_t)-0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)-0.69837624940897280457f,(float16_t)0.71573082528381870571f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.71573082528381859468f,(float16_t)0.69837624940897291559f,
+(float16_t)-0.72424708295146678072f,(float16_t)0.68954054473706705153f,
+(float16_t)-0.73265427167241270467f,(float16_t)0.68060099779545324417f,
+(float16_t)-0.74095112535495888384f,(float16_t)0.67155895484701855214f,
+(float16_t)-0.74913639452345914815f,(float16_t)0.66241577759017200577f,
+(float16_t)-0.75720884650648467851f,(float16_t)0.65317284295377664449f,
+(float16_t)-0.76516726562245895860f,(float16_t)0.64383154288979138613f,
+(float16_t)-0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)-0.78073722857209448822f,(float16_t)0.62485948814238634341f,
+(float16_t)-0.78834642762660622761f,(float16_t)0.61523159058062693028f,
+(float16_t)-0.79583690460888345530f,(float16_t)0.60551104140432565615f,
+(float16_t)-0.80320753148064483184f,(float16_t)0.59569930449243346793f,
+(float16_t)-0.81045719825259465718f,(float16_t)0.58579785745643897510f,
+(float16_t)-0.81758481315158360037f,(float16_t)0.57580819141784544968f,
+(float16_t)-0.82458930278502506894f,(float16_t)0.56573181078361345353f,
+(float16_t)-0.83146961230254534669f,(float16_t)0.55557023301960217765f,
+(float16_t)-0.83822470555483807875f,(float16_t)0.54532498842204635281f,
+(float16_t)-0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)-0.85135519310526519554f,(float16_t)0.52458968267846894928f,
+(float16_t)-0.85772861000027200706f,(float16_t)0.51410274419322177231f,
+(float16_t)-0.86397285612158669643f,(float16_t)0.50353838372571757542f,
+(float16_t)-0.87008699110871134952f,(float16_t)0.49289819222978414892f,
+(float16_t)-0.87607009419540649020f,(float16_t)0.48218377207912288540f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.88763962040285382393f,(float16_t)0.46053871095824022719f,
+(float16_t)-0.89322430119551521344f,(float16_t)0.44961132965460687272f,
+(float16_t)-0.89867446569395392775f,(float16_t)0.43861623853852754751f,
+(float16_t)-0.90398929312344333820f,(float16_t)0.42755509343028202940f,
+(float16_t)-0.90916798309052238025f,(float16_t)0.41642956009763715253f,
+(float16_t)-0.91420975570353069095f,(float16_t)0.40524131400498991651f,
+(float16_t)-0.91911385169005777040f,(float16_t)0.39399204006104815434f,
+(float16_t)-0.92387953251128673848f,(float16_t)0.38268343236508989280f,
+(float16_t)-0.92850608047321547822f,(float16_t)0.37131719395183770960f,
+(float16_t)-0.93299279883473884567f,(float16_t)0.35989503653498833291f,
+(float16_t)-0.93733901191257484875f,(float16_t)0.34841868024943478677f,
+(float16_t)-0.94154406518302069529f,(float16_t)0.33688985339222032867f,
+(float16_t)-0.94560732538052116869f,(float16_t)0.32531029216226325929f,
+(float16_t)-0.94952818059303667475f,(float16_t)0.31368174039889140658f,
+(float16_t)-0.95330604035419386211f,(float16_t)0.30200594931922802866f,
+(float16_t)-0.95694033573220882438f,(float16_t)0.29028467725446238656f,
+(float16_t)-0.96043051941556578655f,(float16_t)0.27851968938505317075f,
+(float16_t)-0.96377606579543984022f,(float16_t)0.26671275747489847641f,
+(float16_t)-0.96697647104485207059f,(float16_t)0.25486565960451468271f,
+(float16_t)-0.97003125319454397424f,(float16_t)0.24298017990326406523f,
+(float16_t)-0.97293995220556006576f,(float16_t)0.23105810828067133156f,
+(float16_t)-0.97570213003852845901f,(float16_t)0.21910124015687004739f,
+(float16_t)-0.97831737071962754371f,(float16_t)0.20711137619221883788f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.98310548743121628501f,(float16_t)0.18303988795514089527f,
+(float16_t)-0.98527764238894122162f,(float16_t)0.17096188876030121717f,
+(float16_t)-0.98730141815785843473f,(float16_t)0.15885814333386147346f,
+(float16_t)-0.98917650996478101444f,(float16_t)0.14673047445536180344f,
+(float16_t)-0.99090263542778000971f,(float16_t)0.13458070850712627875f,
+(float16_t)-0.99247953459870996706f,(float16_t)0.12241067519921634832f,
+(float16_t)-0.99390697000235606051f,(float16_t)0.11022220729388323979f,
+(float16_t)-0.99518472667219681771f,(float16_t)0.09801714032956082567f,
+(float16_t)-0.99631261218277800129f,(float16_t)0.08579731234444015753f,
+(float16_t)-0.99729045667869020697f,(float16_t)0.07356456359966773162f,
+(float16_t)-0.99811811290014917919f,(float16_t)0.06132073630220848809f,
+(float16_t)-0.99879545620517240501f,(float16_t)0.04906767432741796636f,
+(float16_t)-0.99932238458834954375f,(float16_t)0.03680722294135883171f,
+(float16_t)-0.99969881869620424997f,(float16_t)0.02454122852291232629f,
+(float16_t)-0.99992470183914450299f,(float16_t)0.01227153828572000692f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99879545620517240501f,(float16_t)0.04906767432741801493f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.97003125319454397424f,(float16_t)0.24298017990326387094f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.94154406518302080631f,(float16_t)0.33688985339222005111f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.85772861000027211809f,(float16_t)0.51410274419322166128f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.80320753148064494287f,(float16_t)0.59569930449243335691f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.67155895484701833009f,(float16_t)0.74095112535495910588f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.59569930449243346793f,(float16_t)0.80320753148064483184f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.42755509343028219593f,(float16_t)0.90398929312344333820f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.33688985339222005111f,(float16_t)0.94154406518302080631f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.14673047445536174793f,(float16_t)0.98917650996478101444f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.04906767432741812596f,(float16_t)0.99879545620517240501f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.04906767432741800800f,(float16_t)0.99879545620517240501f,
+(float16_t)-0.09801714032956064526f,(float16_t)0.99518472667219692873f,
+(float16_t)-0.14673047445536163691f,(float16_t)0.98917650996478101444f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.24298017990326387094f,(float16_t)0.97003125319454397424f,
+(float16_t)-0.29028467725446216452f,(float16_t)0.95694033573220893540f,
+(float16_t)-0.33688985339221994009f,(float16_t)0.94154406518302080631f,
+(float16_t)-0.38268343236508972627f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.42755509343028186287f,(float16_t)0.90398929312344344922f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.51410274419322155026f,(float16_t)0.85772861000027211809f,
+(float16_t)-0.55557023301960195560f,(float16_t)0.83146961230254534669f,
+(float16_t)-0.59569930449243335691f,(float16_t)0.80320753148064494287f,
+(float16_t)-0.63439328416364537677f,(float16_t)0.77301045336273710440f,
+(float16_t)-0.67155895484701844111f,(float16_t)0.74095112535495899486f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.74095112535495888384f,(float16_t)0.67155895484701855214f,
+(float16_t)-0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)-0.80320753148064483184f,(float16_t)0.59569930449243346793f,
+(float16_t)-0.83146961230254534669f,(float16_t)0.55557023301960217765f,
+(float16_t)-0.85772861000027200706f,(float16_t)0.51410274419322177231f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.90398929312344333820f,(float16_t)0.42755509343028202940f,
+(float16_t)-0.92387953251128673848f,(float16_t)0.38268343236508989280f,
+(float16_t)-0.94154406518302069529f,(float16_t)0.33688985339222032867f,
+(float16_t)-0.95694033573220882438f,(float16_t)0.29028467725446238656f,
+(float16_t)-0.97003125319454397424f,(float16_t)0.24298017990326406523f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.98917650996478101444f,(float16_t)0.14673047445536180344f,
+(float16_t)-0.99518472667219681771f,(float16_t)0.09801714032956082567f,
+(float16_t)-0.99879545620517240501f,(float16_t)0.04906767432741796636f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.38268343236508972627f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.55557023301960195560f,(float16_t)0.83146961230254534669f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.83146961230254534669f,(float16_t)0.55557023301960217765f,
+(float16_t)-0.92387953251128673848f,(float16_t)0.38268343236508989280f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,};
+
+float16_t rearranged_twiddle_stride3_1024_f16[680]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99983058179582340319f,(float16_t)0.01840672990580482019f,
+(float16_t)0.99932238458834954375f,(float16_t)0.03680722294135883171f,
+(float16_t)0.99847558057329477421f,(float16_t)0.05519524434968993420f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.99576741446765981713f,(float16_t)0.09190895649713272386f,
+(float16_t)0.99390697000235606051f,(float16_t)0.11022220729388305938f,
+(float16_t)0.99170975366909952520f,(float16_t)0.12849811079379316880f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98630809724459866938f,(float16_t)0.16491312048996989437f,
+(float16_t)0.98310548743121628501f,(float16_t)0.18303988795514095078f,
+(float16_t)0.97956976568544051887f,(float16_t)0.20110463484209190055f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.97150389098625178352f,(float16_t)0.23702360599436719801f,
+(float16_t)0.96697647104485207059f,(float16_t)0.25486565960451457169f,
+(float16_t)0.96212140426904158019f,(float16_t)0.27262135544994897662f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.95143502096900833820f,(float16_t)0.30784964004153486661f,
+(float16_t)0.94560732538052127971f,(float16_t)0.32531029216226292622f,
+(float16_t)0.93945922360218991898f,(float16_t)0.34266071731199437833f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.92621024213831137928f,(float16_t)0.37700741021641825945f,
+(float16_t)0.91911385169005777040f,(float16_t)0.39399204006104809883f,
+(float16_t)0.91170603200542987832f,(float16_t)0.41084317105790391089f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.89596624975618521791f,(float16_t)0.44412214457042920035f,
+(float16_t)0.88763962040285393496f,(float16_t)0.46053871095824000514f,
+(float16_t)0.87901222642863352519f,(float16_t)0.47679923006332208812f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.86086693863776730939f,(float16_t)0.50883014254310698909f,
+(float16_t)0.85135519310526519554f,(float16_t)0.52458968267846894928f,
+(float16_t)0.84155497743689844370f,(float16_t)0.54017147272989285423f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.82110251499110464835f,(float16_t)0.57078074588696725566f,
+(float16_t)0.81045719825259476821f,(float16_t)0.58579785745643886408f,
+(float16_t)0.79953726910790501314f,(float16_t)0.60061647938386897305f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.77688846567323244230f,(float16_t)0.62963823891492698426f,
+(float16_t)0.76516726562245895860f,(float16_t)0.64383154288979138613f,
+(float16_t)0.75318679904361252042f,(float16_t)0.65780669329707863735f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.72846439044822519637f,(float16_t)0.68508366777270035541f,
+(float16_t)0.71573082528381870571f,(float16_t)0.69837624940897280457f,
+(float16_t)0.70275474445722529993f,(float16_t)0.71143219574521643356f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.67609270357531603413f,(float16_t)0.73681656887736979300f,
+(float16_t)0.66241577759017178373f,(float16_t)0.74913639452345925918f,
+(float16_t)0.64851440102211255212f,(float16_t)0.76120238548426177871f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.62005721176328920663f,(float16_t)0.78455659715557524159f,
+(float16_t)0.60551104140432554512f,(float16_t)0.79583690460888345530f,
+(float16_t)0.59075970185887427544f,(float16_t)0.80684755354379922299f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.56066157619733603124f,(float16_t)0.82804504525775579626f,
+(float16_t)0.54532498842204646383f,(float16_t)0.83822470555483796772f,
+(float16_t)0.52980362468629482731f,(float16_t)0.84812034480329712149f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.49822766697278186854f,(float16_t)0.86704624551569264845f,
+(float16_t)0.48218377207912282989f,(float16_t)0.87607009419540660122f,
+(float16_t)0.46597649576796612569f,(float16_t)0.88479709843093778954f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.43309381885315201277f,(float16_t)0.90134884704602202810f,
+(float16_t)0.41642956009763731906f,(float16_t)0.90916798309052226923f,
+(float16_t)0.39962419984564678810f,(float16_t)0.91667905992104270485f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.36561299780477396482f,(float16_t)0.93076696107898371224f,
+(float16_t)0.34841868024943450921f,(float16_t)0.93733901191257495977f,
+(float16_t)0.33110630575987642921f,(float16_t)0.94359345816196038559f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.29615088824362395536f,(float16_t)0.95514116830577067141f,
+(float16_t)0.27851968938505305973f,(float16_t)0.96043051941556578655f,
+(float16_t)0.26079411791527556952f,(float16_t)0.96539444169768939830f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.22508391135979277653f,(float16_t)0.97433938278557585821f,
+(float16_t)0.20711137619221856032f,(float16_t)0.97831737071962765473f,
+(float16_t)0.18906866414980627589f,(float16_t)0.98196386910955524296f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.15279718525844340760f,(float16_t)0.98825756773074946437f,
+(float16_t)0.13458070850712622324f,(float16_t)0.99090263542778000971f,
+(float16_t)0.11631863091190487725f,(float16_t)0.99321194923479450001f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.07968243797143012563f,(float16_t)0.99682029929116566791f,
+(float16_t)0.06132073630220864768f,(float16_t)0.99811811290014917919f,
+(float16_t)0.04293825693494095902f,(float16_t)0.99907772775264536147f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)0.00613588464915451517f,(float16_t)0.99998117528260110909f,
+(float16_t)-0.01227153828571982304f,(float16_t)0.99992470183914450299f,
+(float16_t)-0.03067480317663645942f,(float16_t)0.99952941750109314256f,
+(float16_t)-0.04906767432741800800f,(float16_t)0.99879545620517240501f,
+(float16_t)-0.06744391956366398155f,(float16_t)0.99772306664419163624f,
+(float16_t)-0.08579731234443975507f,(float16_t)0.99631261218277800129f,
+(float16_t)-0.10412163387205460030f,(float16_t)0.99456457073425541537f,
+(float16_t)-0.12241067519921615403f,(float16_t)0.99247953459870996706f,
+(float16_t)-0.14065823933284912761f,(float16_t)0.99005821026229712256f,
+(float16_t)-0.15885814333386127917f,(float16_t)0.98730141815785843473f,
+(float16_t)-0.17700422041214874946f,(float16_t)0.98421009238692902521f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.21311031991609125091f,(float16_t)0.97702814265775439484f,
+(float16_t)-0.23105810828067113727f,(float16_t)0.97293995220556017678f,
+(float16_t)-0.24892760574572012078f,(float16_t)0.96852209427441737777f,
+(float16_t)-0.26671275747489830987f,(float16_t)0.96377606579543984022f,
+(float16_t)-0.28440753721127171039f,(float16_t)0.95870347489587159906f,
+(float16_t)-0.30200594931922808417f,(float16_t)0.95330604035419386211f,
+(float16_t)-0.31950203081601563637f,(float16_t)0.94758559101774120226f,
+(float16_t)-0.33688985339221994009f,(float16_t)0.94154406518302080631f,
+(float16_t)-0.35416352542049039931f,(float16_t)0.93518350993894761025f,
+(float16_t)-0.37131719395183748755f,(float16_t)0.92850608047321558924f,
+(float16_t)-0.38834504669882619066f,(float16_t)0.92151403934204201285f,
+(float16_t)-0.40524131400498974998f,(float16_t)0.91420975570353069095f,
+(float16_t)-0.42200027079979968159f,(float16_t)0.90659570451491533483f,
+(float16_t)-0.43861623853852738097f,(float16_t)0.89867446569395392775f,
+(float16_t)-0.45508358712634372489f,(float16_t)0.89044872324475798919f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.48755016014843571837f,(float16_t)0.87309497841829020182f,
+(float16_t)-0.50353838372571746440f,(float16_t)0.86397285612158680745f,
+(float16_t)-0.51935599016558964269f,(float16_t)0.85455798836540053376f,
+(float16_t)-0.53499761988709704230f,(float16_t)0.84485356524970722791f,
+(float16_t)-0.55045797293660470029f,(float16_t)0.83486287498638012128f,
+(float16_t)-0.56573181078361323149f,(float16_t)0.82458930278502517996f,
+(float16_t)-0.58081395809576441547f,(float16_t)0.81403632970594852480f,
+(float16_t)-0.59569930449243335691f,(float16_t)0.80320753148064494287f,
+(float16_t)-0.61038280627630958630f,(float16_t)0.79210657730021227785f,
+(float16_t)-0.62485948814238623239f,(float16_t)0.78073722857209459924f,
+(float16_t)-0.63912444486377573138f,(float16_t)0.76910333764557958780f,
+(float16_t)-0.65317284295377653347f,(float16_t)0.75720884650648467851f,
+(float16_t)-0.66699992230363736034f,(float16_t)0.74505778544146605835f,
+(float16_t)-0.68060099779545302212f,(float16_t)0.73265427167241281570f,
+(float16_t)-0.69397146088965377952f,(float16_t)0.72000250796138176579f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.72000250796138165477f,(float16_t)0.69397146088965389055f,
+(float16_t)-0.73265427167241270467f,(float16_t)0.68060099779545324417f,
+(float16_t)-0.74505778544146594733f,(float16_t)0.66699992230363758239f,
+(float16_t)-0.75720884650648467851f,(float16_t)0.65317284295377664449f,
+(float16_t)-0.76910333764557947678f,(float16_t)0.63912444486377584241f,
+(float16_t)-0.78073722857209448822f,(float16_t)0.62485948814238634341f,
+(float16_t)-0.79210657730021216683f,(float16_t)0.61038280627630969732f,
+(float16_t)-0.80320753148064483184f,(float16_t)0.59569930449243346793f,
+(float16_t)-0.81403632970594841378f,(float16_t)0.58081395809576452649f,
+(float16_t)-0.82458930278502506894f,(float16_t)0.56573181078361345353f,
+(float16_t)-0.83486287498638001026f,(float16_t)0.55045797293660492233f,
+(float16_t)-0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)-0.85455798836540042274f,(float16_t)0.51935599016558975372f,
+(float16_t)-0.86397285612158669643f,(float16_t)0.50353838372571757542f,
+(float16_t)-0.87309497841829009079f,(float16_t)0.48755016014843588490f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.89044872324475787817f,(float16_t)0.45508358712634389143f,
+(float16_t)-0.89867446569395392775f,(float16_t)0.43861623853852754751f,
+(float16_t)-0.90659570451491533483f,(float16_t)0.42200027079979984812f,
+(float16_t)-0.91420975570353069095f,(float16_t)0.40524131400498991651f,
+(float16_t)-0.92151403934204179080f,(float16_t)0.38834504669882657923f,
+(float16_t)-0.92850608047321547822f,(float16_t)0.37131719395183770960f,
+(float16_t)-0.93518350993894761025f,(float16_t)0.35416352542049039931f,
+(float16_t)-0.94154406518302069529f,(float16_t)0.33688985339222032867f,
+(float16_t)-0.94758559101774109124f,(float16_t)0.31950203081601580291f,
+(float16_t)-0.95330604035419386211f,(float16_t)0.30200594931922802866f,
+(float16_t)-0.95870347489587148804f,(float16_t)0.28440753721127209896f,
+(float16_t)-0.96377606579543984022f,(float16_t)0.26671275747489847641f,
+(float16_t)-0.96852209427441737777f,(float16_t)0.24892760574572009302f,
+(float16_t)-0.97293995220556006576f,(float16_t)0.23105810828067133156f,
+(float16_t)-0.97702814265775439484f,(float16_t)0.21311031991609141745f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.98421009238692902521f,(float16_t)0.17700422041214894375f,
+(float16_t)-0.98730141815785843473f,(float16_t)0.15885814333386147346f,
+(float16_t)-0.99005821026229701154f,(float16_t)0.14065823933284954395f,
+(float16_t)-0.99247953459870996706f,(float16_t)0.12241067519921634832f,
+(float16_t)-0.99456457073425541537f,(float16_t)0.10412163387205457254f,
+(float16_t)-0.99631261218277800129f,(float16_t)0.08579731234444015753f,
+(float16_t)-0.99772306664419163624f,(float16_t)0.06744391956366417584f,
+(float16_t)-0.99879545620517240501f,(float16_t)0.04906767432741796636f,
+(float16_t)-0.99952941750109314256f,(float16_t)0.03067480317663686534f,
+(float16_t)-0.99992470183914450299f,(float16_t)0.01227153828572000692f,
+(float16_t)-0.99998117528260110909f,(float16_t)-0.00613588464915455420f,
+(float16_t)-0.99969881869620424997f,(float16_t)-0.02454122852291207996f,
+(float16_t)-0.99907772775264536147f,(float16_t)-0.04293825693494077861f,
+(float16_t)-0.99811811290014917919f,(float16_t)-0.06132073630220824523f,
+(float16_t)-0.99682029929116577893f,(float16_t)-0.07968243797142994522f,
+(float16_t)-0.99518472667219692873f,(float16_t)-0.09801714032956058975f,
+(float16_t)-0.99321194923479461103f,(float16_t)-0.11631863091190447479f,
+(float16_t)-0.99090263542778000971f,(float16_t)-0.13458070850712605671f,
+(float16_t)-0.98825756773074946437f,(float16_t)-0.15279718525844343535f,
+(float16_t)-0.98527764238894133264f,(float16_t)-0.17096188876030096737f,
+(float16_t)-0.98196386910955524296f,(float16_t)-0.18906866414980610935f,
+(float16_t)-0.97831737071962765473f,(float16_t)-0.20711137619221858808f,
+(float16_t)-0.97433938278557585821f,(float16_t)-0.22508391135979261000f,
+(float16_t)-0.97003125319454397424f,(float16_t)-0.24298017990326381543f,
+(float16_t)-0.96539444169768939830f,(float16_t)-0.26079411791527562503f,
+(float16_t)-0.96043051941556589757f,(float16_t)-0.27851968938505289319f,
+(float16_t)-0.95514116830577078243f,(float16_t)-0.29615088824362378883f,
+(float16_t)-0.94952818059303678577f,(float16_t)-0.31368174039889118454f,
+(float16_t)-0.94359345816196038559f,(float16_t)-0.33110630575987626267f,
+(float16_t)-0.93733901191257495977f,(float16_t)-0.34841868024943456472f,
+(float16_t)-0.93076696107898382326f,(float16_t)-0.36561299780477357624f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,
+(float16_t)-0.91667905992104270485f,(float16_t)-0.39962419984564684361f,
+(float16_t)-0.90916798309052249127f,(float16_t)-0.41642956009763693048f,
+(float16_t)-0.90134884704602202810f,(float16_t)-0.43309381885315184624f,
+(float16_t)-0.89322430119551532446f,(float16_t)-0.44961132965460665067f,
+(float16_t)-0.88479709843093790056f,(float16_t)-0.46597649576796595916f,
+(float16_t)-0.87607009419540660122f,(float16_t)-0.48218377207912266336f,
+(float16_t)-0.86704624551569287050f,(float16_t)-0.49822766697278153547f,
+(float16_t)-0.85772861000027211809f,(float16_t)-0.51410274419322155026f,
+(float16_t)-0.84812034480329723252f,(float16_t)-0.52980362468629460526f,
+(float16_t)-0.83822470555483818977f,(float16_t)-0.54532498842204613076f,
+(float16_t)-0.82804504525775590729f,(float16_t)-0.56066157619733592021f,
+(float16_t)-0.81758481315158371139f,(float16_t)-0.57580819141784533866f,
+(float16_t)-0.80684755354379944503f,(float16_t)-0.59075970185887394237f,
+(float16_t)-0.79583690460888356633f,(float16_t)-0.60551104140432543410f,
+(float16_t)-0.78455659715557524159f,(float16_t)-0.62005721176328920663f,
+(float16_t)-0.77301045336273710440f,(float16_t)-0.63439328416364526575f,
+(float16_t)-0.76120238548426188974f,(float16_t)-0.64851440102211233008f,
+(float16_t)-0.74913639452345925918f,(float16_t)-0.66241577759017178373f,
+(float16_t)-0.73681656887737001504f,(float16_t)-0.67609270357531581208f,
+(float16_t)-0.72424708295146700276f,(float16_t)-0.68954054473706682948f,
+(float16_t)-0.71143219574521665560f,(float16_t)-0.70275474445722507788f,
+(float16_t)-0.69837624940897302661f,(float16_t)-0.71573082528381848366f,
+(float16_t)-0.68508366777270035541f,(float16_t)-0.72846439044822519637f,
+(float16_t)-0.67155895484701866316f,(float16_t)-0.74095112535495888384f,
+(float16_t)-0.65780669329707874837f,(float16_t)-0.75318679904361240940f,
+(float16_t)-0.64383154288979149715f,(float16_t)-0.76516726562245895860f,
+(float16_t)-0.62963823891492687324f,(float16_t)-0.77688846567323255332f,
+(float16_t)-0.61523159058062726334f,(float16_t)-0.78834642762660589455f,
+(float16_t)-0.60061647938386930612f,(float16_t)-0.79953726910790479110f,
+(float16_t)-0.58579785745643908612f,(float16_t)-0.81045719825259465718f,
+(float16_t)-0.57078074588696736669f,(float16_t)-0.82110251499110464835f,
+(float16_t)-0.55557023301960217765f,(float16_t)-0.83146961230254523567f,
+(float16_t)-0.54017147272989274320f,(float16_t)-0.84155497743689855472f,
+(float16_t)-0.52458968267846928235f,(float16_t)-0.85135519310526486247f,
+(float16_t)-0.50883014254310732216f,(float16_t)-0.86086693863776708735f,
+(float16_t)-0.49289819222978420443f,(float16_t)-0.87008699110871134952f,
+(float16_t)-0.47679923006332214364f,(float16_t)-0.87901222642863341417f,
+(float16_t)-0.46053871095823989412f,(float16_t)-0.88763962040285404598f,
+(float16_t)-0.44412214457042975546f,(float16_t)-0.89596624975618488484f,
+(float16_t)-0.42755509343028247349f,(float16_t)-0.90398929312344311615f,
+(float16_t)-0.41084317105790418845f,(float16_t)-0.91170603200542976730f,
+(float16_t)-0.39399204006104820985f,(float16_t)-0.91911385169005765938f,
+(float16_t)-0.37700741021641820394f,(float16_t)-0.92621024213831137928f,
+(float16_t)-0.35989503653498794433f,(float16_t)-0.93299279883473895669f,
+(float16_t)-0.34266071731199487793f,(float16_t)-0.93945922360218969693f,
+(float16_t)-0.32531029216226331480f,(float16_t)-0.94560732538052116869f,
+(float16_t)-0.30784964004153508865f,(float16_t)-0.95143502096900833820f,
+(float16_t)-0.29028467725446244208f,(float16_t)-0.95694033573220882438f,
+(float16_t)-0.27262135544994886560f,(float16_t)-0.96212140426904158019f,
+(float16_t)-0.25486565960451434965f,(float16_t)-0.96697647104485218161f,
+(float16_t)-0.23702360599436766986f,(float16_t)-0.97150389098625167250f,
+(float16_t)-0.21910124015687010290f,(float16_t)-0.97570213003852845901f,
+(float16_t)-0.20110463484209206708f,(float16_t)-0.97956976568544051887f,
+(float16_t)-0.18303988795514095078f,(float16_t)-0.98310548743121628501f,
+(float16_t)-0.16491312048996975559f,(float16_t)-0.98630809724459866938f,
+(float16_t)-0.14673047445536230304f,(float16_t)-0.98917650996478090342f,
+(float16_t)-0.12849811079379358514f,(float16_t)-0.99170975366909952520f,
+(float16_t)-0.11022220729388330918f,(float16_t)-0.99390697000235606051f,
+(float16_t)-0.09190895649713282101f,(float16_t)-0.99576741446765981713f,
+(float16_t)-0.07356456359966735692f,(float16_t)-0.99729045667869020697f,
+(float16_t)-0.05519524434968971216f,(float16_t)-0.99847558057329477421f,
+(float16_t)-0.03680722294135933131f,(float16_t)-0.99932238458834943273f,
+(float16_t)-0.01840672990580516366f,(float16_t)-0.99983058179582340319f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)-0.04906767432741800800f,(float16_t)0.99879545620517240501f,
+(float16_t)-0.12241067519921615403f,(float16_t)0.99247953459870996706f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.26671275747489830987f,(float16_t)0.96377606579543984022f,
+(float16_t)-0.33688985339221994009f,(float16_t)0.94154406518302080631f,
+(float16_t)-0.40524131400498974998f,(float16_t)0.91420975570353069095f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.53499761988709704230f,(float16_t)0.84485356524970722791f,
+(float16_t)-0.59569930449243335691f,(float16_t)0.80320753148064494287f,
+(float16_t)-0.65317284295377653347f,(float16_t)0.75720884650648467851f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.75720884650648467851f,(float16_t)0.65317284295377664449f,
+(float16_t)-0.80320753148064483184f,(float16_t)0.59569930449243346793f,
+(float16_t)-0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.91420975570353069095f,(float16_t)0.40524131400498991651f,
+(float16_t)-0.94154406518302069529f,(float16_t)0.33688985339222032867f,
+(float16_t)-0.96377606579543984022f,(float16_t)0.26671275747489847641f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.99247953459870996706f,(float16_t)0.12241067519921634832f,
+(float16_t)-0.99879545620517240501f,(float16_t)0.04906767432741796636f,
+(float16_t)-0.99969881869620424997f,(float16_t)-0.02454122852291207996f,
+(float16_t)-0.99518472667219692873f,(float16_t)-0.09801714032956058975f,
+(float16_t)-0.98527764238894133264f,(float16_t)-0.17096188876030096737f,
+(float16_t)-0.97003125319454397424f,(float16_t)-0.24298017990326381543f,
+(float16_t)-0.94952818059303678577f,(float16_t)-0.31368174039889118454f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,
+(float16_t)-0.89322430119551532446f,(float16_t)-0.44961132965460665067f,
+(float16_t)-0.85772861000027211809f,(float16_t)-0.51410274419322155026f,
+(float16_t)-0.81758481315158371139f,(float16_t)-0.57580819141784533866f,
+(float16_t)-0.77301045336273710440f,(float16_t)-0.63439328416364526575f,
+(float16_t)-0.72424708295146700276f,(float16_t)-0.68954054473706682948f,
+(float16_t)-0.67155895484701866316f,(float16_t)-0.74095112535495888384f,
+(float16_t)-0.61523159058062726334f,(float16_t)-0.78834642762660589455f,
+(float16_t)-0.55557023301960217765f,(float16_t)-0.83146961230254523567f,
+(float16_t)-0.49289819222978420443f,(float16_t)-0.87008699110871134952f,
+(float16_t)-0.42755509343028247349f,(float16_t)-0.90398929312344311615f,
+(float16_t)-0.35989503653498794433f,(float16_t)-0.93299279883473895669f,
+(float16_t)-0.29028467725446244208f,(float16_t)-0.95694033573220882438f,
+(float16_t)-0.21910124015687010290f,(float16_t)-0.97570213003852845901f,
+(float16_t)-0.14673047445536230304f,(float16_t)-0.98917650996478090342f,
+(float16_t)-0.07356456359966735692f,(float16_t)-0.99729045667869020697f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.99518472667219692873f,(float16_t)-0.09801714032956058975f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,
+(float16_t)-0.77301045336273710440f,(float16_t)-0.63439328416364526575f,
+(float16_t)-0.55557023301960217765f,(float16_t)-0.83146961230254523567f,
+(float16_t)-0.29028467725446244208f,(float16_t)-0.95694033573220882438f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,};
+
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_4096) || defined(ARM_TABLE_TWIDDLECOEF_F16_8192)
+
+uint32_t rearranged_twiddle_tab_stride1_arr_4096_f16[6]={
+0,2048,2560,2688,2720,0,};
+
+uint32_t rearranged_twiddle_tab_stride2_arr_4096_f16[6]={
+0,2048,2560,2688,2720,0,};
+
+uint32_t rearranged_twiddle_tab_stride3_arr_4096_f16[6]={
+0,2048,2560,2688,2720,0,};
+
+float16_t rearranged_twiddle_stride1_4096_f16[2728]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99999882345170187925f,(float16_t)0.00153398018628476550f,
+(float16_t)0.99999529380957619118f,(float16_t)0.00306795676296597614f,
+(float16_t)0.99998941108192840321f,(float16_t)0.00460192612044857050f,
+(float16_t)0.99998117528260110909f,(float16_t)0.00613588464915447527f,
+(float16_t)0.99997058643097413988f,(float16_t)0.00766982873953109701f,
+(float16_t)0.99995764455196389786f,(float16_t)0.00920375478205981944f,
+(float16_t)0.99994234967602391162f,(float16_t)0.01073765916726449055f,
+(float16_t)0.99992470183914450299f,(float16_t)0.01227153828571992539f,
+(float16_t)0.99990470108285289808f,(float16_t)0.01380538852806039059f,
+(float16_t)0.99988234745421256111f,(float16_t)0.01533920628498810015f,
+(float16_t)0.99985764100582386060f,(float16_t)0.01687298794728171042f,
+(float16_t)0.99983058179582340319f,(float16_t)0.01840672990580482019f,
+(float16_t)0.99980116988788425569f,(float16_t)0.01994042855151444138f,
+(float16_t)0.99976940535121527898f,(float16_t)0.02147408027546950787f,
+(float16_t)0.99973528826056168306f,(float16_t)0.02300768146883936868f,
+(float16_t)0.99969881869620424997f,(float16_t)0.02454122852291228812f,
+(float16_t)0.99965999674395922270f,(float16_t)0.02607471782910390085f,
+(float16_t)0.99961882249517863830f,(float16_t)0.02760814577896573974f,
+(float16_t)0.99957529604674921764f,(float16_t)0.02914150876419372219f,
+(float16_t)0.99952941750109314256f,(float16_t)0.03067480317663662595f,
+(float16_t)0.99948118696616694567f,(float16_t)0.03220802540830458582f,
+(float16_t)0.99943060455546173237f,(float16_t)0.03374117185137757990f,
+(float16_t)0.99937767038800284780f,(float16_t)0.03527423889821394709f,
+(float16_t)0.99932238458834954375f,(float16_t)0.03680722294135883171f,
+(float16_t)0.99926474728659442359f,(float16_t)0.03834012037355269409f,
+(float16_t)0.99920475861836388631f,(float16_t)0.03987292758773981066f,
+(float16_t)0.99914241872481690532f,(float16_t)0.04140564097707673946f,
+(float16_t)0.99907772775264536147f,(float16_t)0.04293825693494082024f,
+(float16_t)0.99901068585407337697f,(float16_t)0.04447077185493866769f,
+(float16_t)0.99894129318685687124f,(float16_t)0.04600318213091462299f,
+(float16_t)0.99886954991428356099f,(float16_t)0.04753548415695930257f,
+(float16_t)0.99879545620517240501f,(float16_t)0.04906767432741801493f,
+(float16_t)0.99871901223387293811f,(float16_t)0.05059974903689928166f,
+(float16_t)0.99864021818026527111f,(float16_t)0.05213170468028332366f,
+(float16_t)0.99855907422975931365f,(float16_t)0.05366353765273051968f,
+(float16_t)0.99847558057329477421f,(float16_t)0.05519524434968993420f,
+(float16_t)0.99838973740734016094f,(float16_t)0.05672682116690774823f,
+(float16_t)0.99830154493389289261f,(float16_t)0.05825826450043575244f,
+(float16_t)0.99821100336047818846f,(float16_t)0.05978957074663986820f,
+(float16_t)0.99811811290014917919f,(float16_t)0.06132073630220857829f,
+(float16_t)0.99802287377148624081f,(float16_t)0.06285175756416140624f,
+(float16_t)0.99792528619859599548f,(float16_t)0.06438263092985746505f,
+(float16_t)0.99782535041111164453f,(float16_t)0.06591335279700380467f,
+(float16_t)0.99772306664419163624f,(float16_t)0.06744391956366405094f,
+(float16_t)0.99761843513851955478f,(float16_t)0.06897432762826674613f,
+(float16_t)0.99751145614030345410f,(float16_t)0.07050457338961385600f,
+(float16_t)0.99740212990127530279f,(float16_t)0.07203465324688933247f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.99717643673532618820f,(float16_t)0.07509430084792130533f,
+(float16_t)0.99706007033948296225f,(float16_t)0.07662386139203149205f,
+(float16_t)0.99694135776498216117f,(float16_t)0.07815324163279423197f,
+(float16_t)0.99682029929116566791f,(float16_t)0.07968243797143012563f,
+(float16_t)0.99669689520289606044f,(float16_t)0.08121144680959244133f,
+(float16_t)0.99657114579055483539f,(float16_t)0.08274026454937569164f,
+(float16_t)0.99644305135004263008f,(float16_t)0.08426888759332407108f,
+(float16_t)0.99631261218277800129f,(float16_t)0.08579731234443989385f,
+(float16_t)0.99617982859569698117f,(float16_t)0.08732553520619205922f,
+(float16_t)0.99604470090125196702f,(float16_t)0.08885355258252460031f,
+(float16_t)0.99590722941741172125f,(float16_t)0.09038136087786498296f,
+(float16_t)0.99576741446765981713f,(float16_t)0.09190895649713272386f,
+(float16_t)0.99562525638099430569f,(float16_t)0.09343633584574778661f,
+(float16_t)0.99548075549192693856f,(float16_t)0.09496349532963899165f,
+(float16_t)0.99533391214048227980f,(float16_t)0.09649043135525259274f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.99503319943811863180f,(float16_t)0.09954361866006931903f,
+(float16_t)0.99487933079480561638f,(float16_t)0.10106986275482782167f,
+(float16_t)0.99472312110432570265f,(float16_t)0.10259586902243628126f,
+(float16_t)0.99456457073425541537f,(float16_t)0.10412163387205458642f,
+(float16_t)0.99440368005767909576f,(float16_t)0.10564715371341061589f,
+(float16_t)0.99424044945318790223f,(float16_t)0.10717242495680884273f,
+(float16_t)0.99407487930487936634f,(float16_t)0.10869744401313871651f,
+(float16_t)0.99390697000235606051f,(float16_t)0.11022220729388305938f,
+(float16_t)0.99373672194072459884f,(float16_t)0.11174671121112658700f,
+(float16_t)0.99356413552059530403f,(float16_t)0.11327095217756434631f,
+(float16_t)0.99338921114808065305f,(float16_t)0.11479492660651008373f,
+(float16_t)0.99321194923479450001f,(float16_t)0.11631863091190475235f,
+(float16_t)0.99303235019785141002f,(float16_t)0.11784206150832497728f,
+(float16_t)0.99285041445986510489f,(float16_t)0.11936521481099135467f,
+(float16_t)0.99266614244894801899f,(float16_t)0.12088808723577708359f,
+(float16_t)0.99247953459870996706f,(float16_t)0.12241067519921619566f,
+(float16_t)0.99229059134825736699f,(float16_t)0.12393297511851215920f,
+(float16_t)0.99209931314219179654f,(float16_t)0.12545498341154623367f,
+(float16_t)0.99190570043060932726f,(float16_t)0.12697669649688586579f,
+(float16_t)0.99170975366909952520f,(float16_t)0.12849811079379316880f,
+(float16_t)0.99151147331874389668f,(float16_t)0.13001922272223334631f,
+(float16_t)0.99131085984611544415f,(float16_t)0.13154002870288311611f,
+(float16_t)0.99110791372327688986f,(float16_t)0.13306052515713906459f,
+(float16_t)0.99090263542778000971f,(float16_t)0.13458070850712616773f,
+(float16_t)0.99069502544266463406f,(float16_t)0.13610057517570620100f,
+(float16_t)0.99048508425645709341f,(float16_t)0.13762012158648603832f,
+(float16_t)0.99027281236316910817f,(float16_t)0.13913934416382620074f,
+(float16_t)0.99005821026229712256f,(float16_t)0.14065823933284921088f,
+(float16_t)0.98984127845882052821f,(float16_t)0.14217680351944803063f,
+(float16_t)0.98962201746320088702f,(float16_t)0.14369503315029447110f,
+(float16_t)0.98940042779138037687f,(float16_t)0.14521292465284746376f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98895026451030298986f,(float16_t)0.14824767898689603096f,
+(float16_t)0.98872169196032377858f,(float16_t)0.14976453467732150915f,
+(float16_t)0.98849079285269658701f,(float16_t)0.15128103795733022219f,
+(float16_t)0.98825756773074946437f,(float16_t)0.15279718525844343535f,
+(float16_t)0.98802201714328352633f,(float16_t)0.15431297301302010494f,
+(float16_t)0.98778414164457217783f,(float16_t)0.15582839765426523271f,
+(float16_t)0.98754394179435922574f,(float16_t)0.15734345561623824805f,
+(float16_t)0.98730141815785843473f,(float16_t)0.15885814333386144570f,
+(float16_t)0.98705657130575097380f,(float16_t)0.16037245724292828464f,
+(float16_t)0.98680940181418552726f,(float16_t)0.16188639378011182579f,
+(float16_t)0.98655991026477540817f,(float16_t)0.16339994938297322524f,
+(float16_t)0.98630809724459866938f,(float16_t)0.16491312048996989437f,
+(float16_t)0.98605396334619543897f,(float16_t)0.16642590354046410406f,
+(float16_t)0.98579750916756747614f,(float16_t)0.16793829497473117263f,
+(float16_t)0.98553873531217606185f,(float16_t)0.16945029123396795900f,
+(float16_t)0.98527764238894122162f,(float16_t)0.17096188876030121717f,
+(float16_t)0.98501423101223983814f,(float16_t)0.17247308399679595059f,
+(float16_t)0.98474850180190420801f,(float16_t)0.17398387338746382214f,
+(float16_t)0.98448045538322093151f,(float16_t)0.17549425337727142526f,
+(float16_t)0.98421009238692902521f,(float16_t)0.17700422041214874946f,
+(float16_t)0.98393741344921892278f,(float16_t)0.17851377093899750692f,
+(float16_t)0.98366241921173025453f,(float16_t)0.18002290140569951471f,
+(float16_t)0.98338511032155118130f,(float16_t)0.18153160826112496595f,
+(float16_t)0.98310548743121628501f,(float16_t)0.18303988795514095078f,
+(float16_t)0.98282355119870523641f,(float16_t)0.18454773693861961648f,
+(float16_t)0.98253930228744124076f,(float16_t)0.18605515166344663291f,
+(float16_t)0.98225274136628937249f,(float16_t)0.18756212858252960252f,
+(float16_t)0.98196386910955524296f,(float16_t)0.18906866414980619262f,
+(float16_t)0.98167268619698311305f,(float16_t)0.19057475482025273972f,
+(float16_t)0.98137919331375456089f,(float16_t)0.19208039704989243734f,
+(float16_t)0.98108339115048670553f,(float16_t)0.19358558729580360724f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.98048486177346938497f,(float16_t)0.19659459767008022335f,
+(float16_t)0.98018213596811742949f,(float16_t)0.19809841071795356027f,
+(float16_t)0.97987710369951763756f,(float16_t)0.19960175762113097075f,
+(float16_t)0.97956976568544051887f,(float16_t)0.20110463484209190055f,
+(float16_t)0.97926012264908202098f,(float16_t)0.20260703884442113343f,
+(float16_t)0.97894817531906219710f,(float16_t)0.20410896609281686809f,
+(float16_t)0.97863392442942320759f,(float16_t)0.20561041305309923910f,
+(float16_t)0.97831737071962765473f,(float16_t)0.20711137619221856032f,
+(float16_t)0.97799851493455713936f,(float16_t)0.20861185197826348503f,
+(float16_t)0.97767735782450992943f,(float16_t)0.21011183688046961016f,
+(float16_t)0.97735390014519996082f,(float16_t)0.21161132736922755315f,
+(float16_t)0.97702814265775439484f,(float16_t)0.21311031991609136194f,
+(float16_t)0.97670008612871184184f,(float16_t)0.21460881099378675829f,
+(float16_t)0.97636973133002114000f,(float16_t)0.21610679707621952006f,
+(float16_t)0.97603707903903902388f,(float16_t)0.21760427463848364127f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.97536488511665697665f,(float16_t)0.22059769010887350649f,
+(float16_t)0.97502534506699412020f,(float16_t)0.22209362097320350937f,
+(float16_t)0.97468351068851066810f,(float16_t)0.22358902922978998729f,
+(float16_t)0.97433938278557585821f,(float16_t)0.22508391135979283204f,
+(float16_t)0.97399296216795583359f,(float16_t)0.22657826384561000066f,
+(float16_t)0.97364424965081197705f,(float16_t)0.22807208317088573102f,
+(float16_t)0.97329324605469824672f,(float16_t)0.22956536582051886852f,
+(float16_t)0.97293995220556017678f,(float16_t)0.23105810828067110951f,
+(float16_t)0.97258436893473221296f,(float16_t)0.23255030703877524467f,
+(float16_t)0.97222649707893626925f,(float16_t)0.23404195858354343018f,
+(float16_t)0.97186633748027939639f,(float16_t)0.23553305940497548665f,
+(float16_t)0.97150389098625178352f,(float16_t)0.23702360599436719801f,
+(float16_t)0.97113915844972509284f,(float16_t)0.23851359484431841618f,
+(float16_t)0.97077214072895035013f,(float16_t)0.24000302244874149871f,
+(float16_t)0.97040283868755550234f,(float16_t)0.24149188530286933019f,
+(float16_t)0.97003125319454397424f,(float16_t)0.24298017990326387094f,
+(float16_t)0.96965738512429244800f,(float16_t)0.24446790274782415064f,
+(float16_t)0.96928123535654853171f,(float16_t)0.24595505033579459497f,
+(float16_t)0.96890280477642887202f,(float16_t)0.24744161916777326904f,
+(float16_t)0.96852209427441737777f,(float16_t)0.24892760574572014853f,
+(float16_t)0.96813910474636244441f,(float16_t)0.25041300657296522436f,
+(float16_t)0.96775383709347551076f,(float16_t)0.25189781815421696809f,
+(float16_t)0.96736629222232850545f,(float16_t)0.25338203699557015902f,
+(float16_t)0.96697647104485207059f,(float16_t)0.25486565960451457169f,
+(float16_t)0.96658437447833311928f,(float16_t)0.25634868248994291395f,
+(float16_t)0.96619000344541250413f,(float16_t)0.25783110216215898713f,
+(float16_t)0.96579335887408368500f,(float16_t)0.25931291513288623474f,
+(float16_t)0.96539444169768939830f,(float16_t)0.26079411791527551401f,
+(float16_t)0.96499325285492032478f,(float16_t)0.26227470702391358914f,
+(float16_t)0.96458979328981275803f,(float16_t)0.26375467897483134694f,
+(float16_t)0.96418406395174582890f,(float16_t)0.26523403028551179039f,
+(float16_t)0.96377606579543984022f,(float16_t)0.26671275747489836538f,
+(float16_t)0.96336579978095404631f,(float16_t)0.26819085706340317632f,
+(float16_t)0.96295326687368387741f,(float16_t)0.26966832557291509076f,
+(float16_t)0.96253846804435916340f,(float16_t)0.27114515952680801059f,
+(float16_t)0.96212140426904158019f,(float16_t)0.27262135544994897662f,
+(float16_t)0.96170207652912254037f,(float16_t)0.27409690986870638429f,
+(float16_t)0.96128048581132063966f,(float16_t)0.27557181931095814376f,
+(float16_t)0.96085663310767965850f,(float16_t)0.27704608030609989555f,
+(float16_t)0.96043051941556578655f,(float16_t)0.27851968938505305973f,
+(float16_t)0.96000214573766595727f,(float16_t)0.27999264308027321801f,
+(float16_t)0.95957151308198451733f,(float16_t)0.28146493792575794091f,
+(float16_t)0.95913862246184189431f,(float16_t)0.28293657045705539188f,
+(float16_t)0.95870347489587159906f,(float16_t)0.28440753721127187692f,
+(float16_t)0.95826607140801767226f,(float16_t)0.28587783472708061527f,
+(float16_t)0.95782641302753290802f,(float16_t)0.28734745954472951102f,
+(float16_t)0.95738450078897585627f,(float16_t)0.28881640820604947972f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.95649391890239510161f,(float16_t)0.29175226323498926195f,
+(float16_t)0.95604525134999640557f,(float16_t)0.29321916269425862822f,
+(float16_t)0.95559433413077110586f,(float16_t)0.29468537218051432669f,
+(float16_t)0.95514116830577078243f,(float16_t)0.29615088824362378883f,
+(float16_t)0.95468575494133833814f,(float16_t)0.29761570743508619641f,
+(float16_t)0.95422809510910566733f,(float16_t)0.29907982630804047508f,
+(float16_t)0.95376818988599032512f,(float16_t)0.30054324141727345454f,
+(float16_t)0.95330604035419386211f,(float16_t)0.30200594931922808417f,
+(float16_t)0.95284164760119871573f,(float16_t)0.30346794657201131562f,
+(float16_t)0.95237501271976587880f,(float16_t)0.30492922973540237397f,
+(float16_t)0.95190613680793234597f,(float16_t)0.30638979537086091787f,
+(float16_t)0.95143502096900833820f,(float16_t)0.30784964004153486661f,
+(float16_t)0.95096166631157508231f,(float16_t)0.30930876031226872680f,
+(float16_t)0.95048607394948170235f,(float16_t)0.31076715274961147495f,
+(float16_t)0.95000824500184299914f,(float16_t)0.31222481392182488413f,
+(float16_t)0.94952818059303667475f,(float16_t)0.31368174039889151761f,
+(float16_t)0.94904588185270055689f,(float16_t)0.31513792875252244485f,
+(float16_t)0.94856134991573026749f,(float16_t)0.31659337555616584581f,
+(float16_t)0.94807458592227622507f,(float16_t)0.31804807738501494896f,
+(float16_t)0.94758559101774109124f,(float16_t)0.31950203081601569188f,
+(float16_t)0.94709436635277721717f,(float16_t)0.32095523242787521445f,
+(float16_t)0.94660091308328353499f,(float16_t)0.32240767880106985244f,
+(float16_t)0.94610523237040344835f,(float16_t)0.32385936651785285356f,
+(float16_t)0.94560732538052127971f,(float16_t)0.32531029216226292622f,
+(float16_t)0.94510719328526060501f,(float16_t)0.32676045232013173347f,
+(float16_t)0.94460483726148025685f,(float16_t)0.32820984357909249729f,
+(float16_t)0.94410025849127265918f,(float16_t)0.32965846252858749255f,
+(float16_t)0.94359345816196038559f,(float16_t)0.33110630575987642921f,
+(float16_t)0.94308443746609349478f,(float16_t)0.33255336986604422389f,
+(float16_t)0.94257319760144686605f,(float16_t)0.33399965144200938205f,
+(float16_t)0.94205973977101731265f,(float16_t)0.33544514708453160301f,
+(float16_t)0.94154406518302080631f,(float16_t)0.33688985339222005111f,
+(float16_t)0.94102617505088925753f,(float16_t)0.33833376696554112728f,
+(float16_t)0.94050607059326829518f,(float16_t)0.33977688440682685123f,
+(float16_t)0.93998375303401404679f,(float16_t)0.34121920232028235542f,
+(float16_t)0.93945922360218991898f,(float16_t)0.34266071731199437833f,
+(float16_t)0.93893248353206459900f,(float16_t)0.34410142598993881391f,
+(float16_t)0.93840353406310805795f,(float16_t)0.34554132496398909380f,
+(float16_t)0.93787237643998988545f,(float16_t)0.34698041084592368133f,
+(float16_t)0.93733901191257495977f,(float16_t)0.34841868024943456472f,
+(float16_t)0.93680344173592156043f,(float16_t)0.34985612979013491763f,
+(float16_t)0.93626566717027825959f,(float16_t)0.35129275608556709276f,
+(float16_t)0.93572568948108036935f,(float16_t)0.35272855575521072646f,
+(float16_t)0.93518350993894761025f,(float16_t)0.35416352542049034380f,
+(float16_t)0.93463912981968078064f,(float16_t)0.35559766170478385172f,
+(float16_t)0.93409255040425887007f,(float16_t)0.35703096123342997759f,
+(float16_t)0.93354377297883617270f,(float16_t)0.35846342063373654030f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.93243962926846235550f,(float16_t)0.36132580556845428355f,
+(float16_t)0.93188426558166814750f,(float16_t)0.36275572436739722537f,
+(float16_t)0.93132670908118042608f,(float16_t)0.36418478956707989180f,
+(float16_t)0.93076696107898371224f,(float16_t)0.36561299780477385379f,
+(float16_t)0.93020502289221906889f,(float16_t)0.36704034571976718038f,
+(float16_t)0.92964089584318121418f,(float16_t)0.36846682995337232125f,
+(float16_t)0.92907458125931585702f,(float16_t)0.36989244714893410038f,
+(float16_t)0.92850608047321558924f,(float16_t)0.37131719395183754306f,
+(float16_t)0.92793539482261788720f,(float16_t)0.37274106700951575855f,
+(float16_t)0.92736252565040111495f,(float16_t)0.37416406297145793358f,
+(float16_t)0.92678747430458174872f,(float16_t)0.37558617848921721505f,
+(float16_t)0.92621024213831137928f,(float16_t)0.37700741021641825945f,
+(float16_t)0.92563083050987271516f,(float16_t)0.37842775480876555960f,
+(float16_t)0.92504924078267758425f,(float16_t)0.37984720892405116066f,
+(float16_t)0.92446547432526260391f,(float16_t)0.38126576922216237620f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.92329141671952763559f,(float16_t)0.38410019501693504207f,
+(float16_t)0.92270112833387862850f,(float16_t)0.38551605384391884890f,
+(float16_t)0.92210866874334518339f,(float16_t)0.38693100551438858181f,
+(float16_t)0.92151403934204190183f,(float16_t)0.38834504669882624617f,
+(float16_t)0.92091724152918941204f,(float16_t)0.38975817406985641123f,
+(float16_t)0.92031827670911059425f,(float16_t)0.39117038430225387069f,
+(float16_t)0.91971714629122736095f,(float16_t)0.39258167407295146978f,
+(float16_t)0.91911385169005777040f,(float16_t)0.39399204006104809883f,
+(float16_t)0.91850839432521225181f,(float16_t)0.39540147894781635385f,
+(float16_t)0.91790077562139049672f,(float16_t)0.39680998741671030805f,
+(float16_t)0.91729099700837790632f,(float16_t)0.39821756215337356100f,
+(float16_t)0.91667905992104270485f,(float16_t)0.39962419984564678810f,
+(float16_t)0.91606496579933172075f,(float16_t)0.40102989718357562321f,
+(float16_t)0.91544871608826783316f,(float16_t)0.40243465085941843018f,
+(float16_t)0.91483031223794619713f,(float16_t)0.40383845756765407442f,
+(float16_t)0.91420975570353069095f,(float16_t)0.40524131400498986100f,
+(float16_t)0.91358704794525080750f,(float16_t)0.40664321687036902864f,
+(float16_t)0.91296219042839821256f,(float16_t)0.40804416286497868782f,
+(float16_t)0.91233518462332274801f,(float16_t)0.40944414869225759235f,
+(float16_t)0.91170603200542987832f,(float16_t)0.41084317105790391089f,
+(float16_t)0.91107473405517636067f,(float16_t)0.41224122666988288755f,
+(float16_t)0.91044129225806724737f,(float16_t)0.41363831223843450235f,
+(float16_t)0.90980570810465222209f,(float16_t)0.41503442447608163146f,
+(float16_t)0.90916798309052238025f,(float16_t)0.41642956009763715253f,
+(float16_t)0.90852811871630612117f,(float16_t)0.41782371582021227141f,
+(float16_t)0.90788611648766626150f,(float16_t)0.41921688836322390515f,
+(float16_t)0.90724197791529581636f,(float16_t)0.42060907444840250902f,
+(float16_t)0.90659570451491533483f,(float16_t)0.42200027079979968159f,
+(float16_t)0.90594729780726845902f,(float16_t)0.42339047414379604728f,
+(float16_t)0.90529675931811881551f,(float16_t)0.42477968120910880589f,
+(float16_t)0.90464409057824624050f,(float16_t)0.42616788872679961520f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.90333236849451181705f,(float16_t)0.42894129205532949278f,
+(float16_t)0.90267331823725882600f,(float16_t)0.43032648134008261165f,
+(float16_t)0.90201214390249317976f,(float16_t)0.43171065802505725895f,
+(float16_t)0.90134884704602202810f,(float16_t)0.43309381885315195726f,
+(float16_t)0.90068342922864685907f,(float16_t)0.43447596056965565037f,
+(float16_t)0.90001589201616016833f,(float16_t)0.43585707992225547480f,
+(float16_t)0.89934623697934157338f,(float16_t)0.43723717366104408732f,
+(float16_t)0.89867446569395381673f,(float16_t)0.43861623853852765853f,
+(float16_t)0.89800057974073987932f,(float16_t)0.43999427130963325583f,
+(float16_t)0.89732458070541831763f,(float16_t)0.44137126873171667052f,
+(float16_t)0.89664647017868015499f,(float16_t)0.44274722756457002282f,
+(float16_t)0.89596624975618521791f,(float16_t)0.44412214457042920035f,
+(float16_t)0.89528392103855758410f,(float16_t)0.44549601651398174074f,
+(float16_t)0.89459948563138269595f,(float16_t)0.44686884016237415906f,
+(float16_t)0.89391294514520325265f,(float16_t)0.44824061228521988598f,
+(float16_t)0.89322430119551532446f,(float16_t)0.44961132965460653965f,
+(float16_t)0.89253355540276457791f,(float16_t)0.45098098904510386387f,
+(float16_t)0.89184070939234272313f,(float16_t)0.45234958723377088896f,
+(float16_t)0.89114576479458318392f,(float16_t)0.45371712100016386993f,
+(float16_t)0.89044872324475787817f,(float16_t)0.45508358712634383592f,
+(float16_t)0.88974958638307277692f,(float16_t)0.45644898239688391772f,
+(float16_t)0.88904835585466457371f,(float16_t)0.45781330359887717485f,
+(float16_t)0.88834503330959635470f,(float16_t)0.45917654752194408951f,
+(float16_t)0.88763962040285393496f,(float16_t)0.46053871095824000514f,
+(float16_t)0.88693211879434219469f,(float16_t)0.46189979070246273141f,
+(float16_t)0.88622253014888063838f,(float16_t)0.46325978355186014923f,
+(float16_t)0.88551085613619995307f,(float16_t)0.46461868630623781584f,
+(float16_t)0.88479709843093778954f,(float16_t)0.46597649576796618121f,
+(float16_t)0.88408125871263498752f,(float16_t)0.46733320874198841510f,
+(float16_t)0.88336333866573157891f,(float16_t)0.46868882203582790114f,
+(float16_t)0.88264333997956279099f,(float16_t)0.47004333245959561971f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.88119711347122209322f,(float16_t)0.47274903195034279069f,
+(float16_t)0.88047088905216075450f,(float16_t)0.47410021465054996703f,
+(float16_t)0.87974259280004740713f,(float16_t)0.47545028174715586733f,
+(float16_t)0.87901222642863352519f,(float16_t)0.47679923006332208812f,
+(float16_t)0.87827979165654157523f,(float16_t)0.47814705642484300885f,
+(float16_t)0.87754529020726135258f,(float16_t)0.47949375766015295275f,
+(float16_t)0.87680872380914565145f,(float16_t)0.48083933060033395845f,
+(float16_t)0.87607009419540660122f,(float16_t)0.48218377207912271887f,
+(float16_t)0.87532940310411089246f,(float16_t)0.48352707893291868579f,
+(float16_t)0.87458665227817611321f,(float16_t)0.48486924800079106435f,
+(float16_t)0.87384184346536686316f,(float16_t)0.48621027612448641797f,
+(float16_t)0.87309497841829009079f,(float16_t)0.48755016014843599592f,
+(float16_t)0.87234605889439154058f,(float16_t)0.48888889691976317176f,
+(float16_t)0.87159508665595097909f,(float16_t)0.49022648328829115938f,
+(float16_t)0.87084206347007897531f,(float16_t)0.49156291610654989643f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.86932987134860684186f,(float16_t)0.49423230851595967295f,
+(float16_t)0.86857070597134089507f,(float16_t)0.49556526182577254058f,
+(float16_t)0.86780949676330332299f,(float16_t)0.49689704902265446895f,
+(float16_t)0.86704624551569264845f,(float16_t)0.49822766697278181303f,
+(float16_t)0.86628095402451299467f,(float16_t)0.49955711254508183838f,
+(float16_t)0.86551362409056908920f,(float16_t)0.50088538261124071482f,
+(float16_t)0.86474425751946237817f,(float16_t)0.50221247404571078832f,
+(float16_t)0.86397285612158669643f,(float16_t)0.50353838372571757542f,
+(float16_t)0.86319942171212415971f,(float16_t)0.50486310853126759035f,
+(float16_t)0.86242395611104050168f,(float16_t)0.50618664534515522835f,
+(float16_t)0.86164646114308129921f,(float16_t)0.50750899105297087033f,
+(float16_t)0.86086693863776730939f,(float16_t)0.50883014254310698909f,
+(float16_t)0.86008539042939013974f,(float16_t)0.51015009670676680908f,
+(float16_t)0.85930181835700847337f,(float16_t)0.51146885043797030157f,
+(float16_t)0.85851622426444273994f,(float16_t)0.51278640063356295542f,
+(float16_t)0.85772861000027211809f,(float16_t)0.51410274419322166128f,
+(float16_t)0.85693897741782876221f,(float16_t)0.51541787801946292724f,
+(float16_t)0.85614732837519447184f,(float16_t)0.51673179901764987321f,
+(float16_t)0.85535366473519602870f,(float16_t)0.51804450409599933636f,
+(float16_t)0.85455798836540053376f,(float16_t)0.51935599016558964269f,
+(float16_t)0.85376030113811141042f,(float16_t)0.52066625414036715735f,
+(float16_t)0.85296060493036363059f,(float16_t)0.52197529293715438925f,
+(float16_t)0.85215890162391982887f,(float16_t)0.52328310347565643035f,
+(float16_t)0.85135519310526519554f,(float16_t)0.52458968267846894928f,
+(float16_t)0.85054948126560347976f,(float16_t)0.52589502747108463065f,
+(float16_t)0.84974176800085254868f,(float16_t)0.52719913478190127964f,
+(float16_t)0.84893205521163961347f,(float16_t)0.52850200154222848337f,
+(float16_t)0.84812034480329723252f,(float16_t)0.52980362468629460526f,
+(float16_t)0.84730663868585831544f,(float16_t)0.53110400115125500076f,
+(float16_t)0.84649093877405212627f,(float16_t)0.53240312787719790144f,
+(float16_t)0.84567324698729906540f,(float16_t)0.53370100180715296379f,
+(float16_t)0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)0.84403189549006640835f,(float16_t)0.53629297906596318235f,
+(float16_t)0.84320823964184543620f,(float16_t)0.53758707629564539410f,
+(float16_t)0.84238259964318584760f,(float16_t)0.53887990853100842248f,
+(float16_t)0.84155497743689844370f,(float16_t)0.54017147272989285423f,
+(float16_t)0.84072537497045807253f,(float16_t)0.54146176585312344454f,
+(float16_t)0.83989379419599952126f,(float16_t)0.54275078486451588944f,
+(float16_t)0.83906023707031274217f,(float16_t)0.54403852673088382019f,
+(float16_t)0.83822470555483807875f,(float16_t)0.54532498842204646383f,
+(float16_t)0.83738720161566193578f,(float16_t)0.54661016691083486041f,
+(float16_t)0.83654772722351200542f,(float16_t)0.54789405917310018967f,
+(float16_t)0.83570628435375260423f,(float16_t)0.54917666218771965525f,
+(float16_t)0.83486287498638001026f,(float16_t)0.55045797293660481131f,
+(float16_t)0.83401750110601813315f,(float16_t)0.55173798840470733573f,
+(float16_t)0.83317016470191318511f,(float16_t)0.55301670558002746780f,
+(float16_t)0.83232086776792968408f,(float16_t)0.55429412145362000341f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.83061640030884631436f,(float16_t)0.55684503727516010407f,
+(float16_t)0.82976123379452304540f,(float16_t)0.55811853122055610221f,
+(float16_t)0.82890411477186487499f,(float16_t)0.55939071185913613604f,
+(float16_t)0.82804504525775579626f,(float16_t)0.56066157619733603124f,
+(float16_t)0.82718402727366913130f,(float16_t)0.56193112124468935775f,
+(float16_t)0.82632106284566353427f,(float16_t)0.56319934401383409117f,
+(float16_t)0.82545615400437755138f,(float16_t)0.56446624152051938506f,
+(float16_t)0.82458930278502529099f,(float16_t)0.56573181078361312046f,
+(float16_t)0.82372051122739142759f,(float16_t)0.56699604882510867832f,
+(float16_t)0.82284978137582642788f,(float16_t)0.56825895267013148970f,
+(float16_t)0.82197711527924155472f,(float16_t)0.56952051934694714053f,
+(float16_t)0.82110251499110464835f,(float16_t)0.57078074588696725566f,
+(float16_t)0.82022598256943468620f,(float16_t)0.57203962932475704850f,
+(float16_t)0.81934752007679700903f,(float16_t)0.57329716669804220430f,
+(float16_t)0.81846712958029865792f,(float16_t)0.57455335504771576360f,
+(float16_t)0.81758481315158371139f,(float16_t)0.57580819141784533866f,
+(float16_t)0.81670057286682784525f,(float16_t)0.57706167285567944170f,
+(float16_t)0.81581441080673378075f,(float16_t)0.57831379641165558958f,
+(float16_t)0.81492632905652662156f,(float16_t)0.57956455913940563285f,
+(float16_t)0.81403632970594841378f,(float16_t)0.58081395809576452649f,
+(float16_t)0.81314441484925359394f,(float16_t)0.58206199034077543697f,
+(float16_t)0.81225058658520399302f,(float16_t)0.58330865293769829094f,
+(float16_t)0.81135484701706372945f,(float16_t)0.58455394295301532637f,
+(float16_t)0.81045719825259476821f,(float16_t)0.58579785745643886408f,
+(float16_t)0.80955764240405125864f,(float16_t)0.58704039352091796911f,
+(float16_t)0.80865618158817498262f,(float16_t)0.58828154822264522306f,
+(float16_t)0.80775281792619035848f,(float16_t)0.58952131864106394055f,
+(float16_t)0.80684755354379933401f,(float16_t)0.59075970185887416442f,
+(float16_t)0.80594039057117627944f,(float16_t)0.59199669496204099239f,
+(float16_t)0.80503133114296365758f,(float16_t)0.59323229503979979516f,
+(float16_t)0.80412037739826569549f,(float16_t)0.59446649918466443197f,
+(float16_t)0.80320753148064494287f,(float16_t)0.59569930449243335691f,
+(float16_t)0.80229279553811572168f,(float16_t)0.59693070806219639124f,
+(float16_t)0.80137617172314024039f,(float16_t)0.59816070699634238395f,
+(float16_t)0.80045766219262282082f,(float16_t)0.59938929840056454079f,
+(float16_t)0.79953726910790501314f,(float16_t)0.60061647938386897305f,
+(float16_t)0.79861499463476093297f,(float16_t)0.60184224705858002658f,
+(float16_t)0.79769084094339115509f,(float16_t)0.60306659854034816437f,
+(float16_t)0.79676481020841882774f,(float16_t)0.60428953094815596181f,
+(float16_t)0.79583690460888356633f,(float16_t)0.60551104140432554512f,
+(float16_t)0.79490712632823701256f,(float16_t)0.60673112703452447558f,
+(float16_t)0.79397547755433717231f,(float16_t)0.60794978496777363208f,
+(float16_t)0.79304196047944364167f,(float16_t)0.60916701233645320634f,
+(float16_t)0.79210657730021238887f,(float16_t)0.61038280627630947528f,
+(float16_t)0.79116933021769020318f,(float16_t)0.61159716392646190641f,
+(float16_t)0.79023022143731003197f,(float16_t)0.61281008242940970820f,
+(float16_t)0.78928925316888565167f,(float16_t)0.61402155893103849138f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.78740174702903142911f,(float16_t)0.61644017453085364622f,
+(float16_t)0.78645521359908576731f,(float16_t)0.61764730793780386886f,
+(float16_t)0.78550682956405393220f,(float16_t)0.61885298796097631957f,
+(float16_t)0.78455659715557524159f,(float16_t)0.62005721176328909561f,
+(float16_t)0.78360451860963820092f,(float16_t)0.62125997651108755271f,
+(float16_t)0.78265059616657572938f,(float16_t)0.62246127937414996723f,
+(float16_t)0.78169483207105938671f,(float16_t)0.62366111752569453053f,
+(float16_t)0.78073722857209448822f,(float16_t)0.62485948814238634341f,
+(float16_t)0.77977778792301455368f,(float16_t)0.62605638840434352232f,
+(float16_t)0.77881651238147597827f,(float16_t)0.62725181549514408275f,
+(float16_t)0.77785340420945314754f,(float16_t)0.62844576660183271155f,
+(float16_t)0.77688846567323244230f,(float16_t)0.62963823891492698426f,
+(float16_t)0.77592169904340768660f,(float16_t)0.63082922962842447046f,
+(float16_t)0.77495310659487393057f,(float16_t)0.63201873593980906207f,
+(float16_t)0.77398269060682289844f,(float16_t)0.63320675505005719064f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.77203639715038452351f,(float16_t)0.63557832048855611440f,
+(float16_t)0.77106052426181381776f,(float16_t)0.63676186123628419899f,
+(float16_t)0.77008283699334789674f,(float16_t)0.63794390362184405507f,
+(float16_t)0.76910333764557969882f,(float16_t)0.63912444486377573138f,
+(float16_t)0.76812202852336541881f,(float16_t)0.64030348218415167327f,
+(float16_t)0.76713891193582040007f,(float16_t)0.64148101280858305095f,
+(float16_t)0.76615399019631291733f,(float16_t)0.64265703396622686494f,
+(float16_t)0.76516726562245895860f,(float16_t)0.64383154288979138613f,
+(float16_t)0.76417874053611667406f,(float16_t)0.64500453681554392737f,
+(float16_t)0.76318841726338138010f,(float16_t)0.64617601298331628357f,
+(float16_t)0.76219629813457900891f,(float16_t)0.64734596863651205911f,
+(float16_t)0.76120238548426177871f,(float16_t)0.64851440102211244110f,
+(float16_t)0.76020668165120242055f,(float16_t)0.64968130739068319368f,
+(float16_t)0.75920918897838796102f,(float16_t)0.65084668499638087535f,
+(float16_t)0.75820990981301528144f,(float16_t)0.65201053109695950027f,
+(float16_t)0.75720884650648456748f,(float16_t)0.65317284295377675551f,
+(float16_t)0.75620600141439453523f,(float16_t)0.65433361783180044036f,
+(float16_t)0.75520137689653654700f,(float16_t)0.65549285299961534967f,
+(float16_t)0.75419497531688917125f,(float16_t)0.65665054572942893607f,
+(float16_t)0.75318679904361252042f,(float16_t)0.65780669329707863735f,
+(float16_t)0.75217685044904269986f,(float16_t)0.65896129298203731661f,
+(float16_t)0.75116513190968636771f,(float16_t)0.66011434206742047870f,
+(float16_t)0.75015164580621507273f,(float16_t)0.66126583783999226540f,
+(float16_t)0.74913639452345937020f,(float16_t)0.66241577759017178373f,
+(float16_t)0.74811938045040360379f,(float16_t)0.66356415861203976725f,
+(float16_t)0.74710060598018013245f,(float16_t)0.66471097820334479334f,
+(float16_t)0.74608007351006377927f,(float16_t)0.66585623366550972246f,
+(float16_t)0.74505778544146594733f,(float16_t)0.66699992230363747137f,
+(float16_t)0.74403374417992929057f,(float16_t)0.66814204142651845153f,
+(float16_t)0.74300795213512171866f,(float16_t)0.66928258834663600929f,
+(float16_t)0.74198041172083106787f,(float16_t)0.67042156038017308717f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.73992009545951620275f,(float16_t)0.67269476907077285777f,
+(float16_t)0.73888732446061511361f,(float16_t)0.67382900037875603783f,
+(float16_t)0.73785281478846598269f,(float16_t)0.67496164610201192513f,
+(float16_t)0.73681656887736979300f,(float16_t)0.67609270357531592310f,
+(float16_t)0.73577858916571359238f,(float16_t)0.67722217013718033485f,
+(float16_t)0.73473887809596349907f,(float16_t)0.67835004312986146857f,
+(float16_t)0.73369743811466026084f,(float16_t)0.67947631989936496666f,
+(float16_t)0.73265427167241281570f,(float16_t)0.68060099779545302212f,
+(float16_t)0.73160938122389262972f,(float16_t)0.68172407417164970767f,
+(float16_t)0.73056276922782759087f,(float16_t)0.68284554638524808112f,
+(float16_t)0.72951443814699701296f,(float16_t)0.68396541179731540350f,
+(float16_t)0.72846439044822519637f,(float16_t)0.68508366777270035541f,
+(float16_t)0.72741262860237576593f,(float16_t)0.68620031168003858824f,
+(float16_t)0.72635915508434600873f,(float16_t)0.68731534089175905233f,
+(float16_t)0.72530397237306076796f,(float16_t)0.68842875278409043638f,
+(float16_t)0.72424708295146700276f,(float16_t)0.68954054473706682948f,
+(float16_t)0.72318848930652745999f,(float16_t)0.69065071413453460458f,
+(float16_t)0.72212819392921534511f,(float16_t)0.69175925836415774750f,
+(float16_t)0.72106619931450810501f,(float16_t)0.69286617481742462932f,
+(float16_t)0.72000250796138165477f,(float16_t)0.69397146088965389055f,
+(float16_t)0.71893712237280449351f,(float16_t)0.69507511398000088043f,
+(float16_t)0.71787004505573170920f,(float16_t)0.69617713149146298601f,
+(float16_t)0.71680127852109953857f,(float16_t)0.69727751083088651551f,
+(float16_t)0.71573082528381870571f,(float16_t)0.69837624940897280457f,
+(float16_t)0.71465868786276909308f,(float16_t)0.69947334464028376733f,
+(float16_t)0.71358486878079352422f,(float16_t)0.70056879394324833576f,
+(float16_t)0.71250937056469243469f,(float16_t)0.70166259474016845488f,
+(float16_t)0.71143219574521643356f,(float16_t)0.70275474445722529993f,
+(float16_t)0.71035334685706241764f,(float16_t)0.70384524052448493858f,
+(float16_t)0.70927282643886568891f,(float16_t)0.70493408037590488124f,
+(float16_t)0.70819063703319540259f,(float16_t)0.70602126144933974317f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.70602126144933974317f,(float16_t)0.70819063703319540259f,
+(float16_t)0.70493408037590499227f,(float16_t)0.70927282643886568891f,
+(float16_t)0.70384524052448493858f,(float16_t)0.71035334685706241764f,
+(float16_t)0.70275474445722529993f,(float16_t)0.71143219574521643356f,
+(float16_t)0.70166259474016845488f,(float16_t)0.71250937056469232367f,
+(float16_t)0.70056879394324844679f,(float16_t)0.71358486878079352422f,
+(float16_t)0.69947334464028376733f,(float16_t)0.71465868786276909308f,
+(float16_t)0.69837624940897291559f,(float16_t)0.71573082528381859468f,
+(float16_t)0.69727751083088662654f,(float16_t)0.71680127852109942754f,
+(float16_t)0.69617713149146298601f,(float16_t)0.71787004505573170920f,
+(float16_t)0.69507511398000088043f,(float16_t)0.71893712237280438249f,
+(float16_t)0.69397146088965400157f,(float16_t)0.72000250796138165477f,
+(float16_t)0.69286617481742474034f,(float16_t)0.72106619931450810501f,
+(float16_t)0.69175925836415774750f,(float16_t)0.72212819392921534511f,
+(float16_t)0.69065071413453460458f,(float16_t)0.72318848930652734897f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.68842875278409043638f,(float16_t)0.72530397237306076796f,
+(float16_t)0.68731534089175905233f,(float16_t)0.72635915508434600873f,
+(float16_t)0.68620031168003858824f,(float16_t)0.72741262860237576593f,
+(float16_t)0.68508366777270035541f,(float16_t)0.72846439044822519637f,
+(float16_t)0.68396541179731551452f,(float16_t)0.72951443814699690193f,
+(float16_t)0.68284554638524808112f,(float16_t)0.73056276922782759087f,
+(float16_t)0.68172407417164981869f,(float16_t)0.73160938122389262972f,
+(float16_t)0.68060099779545302212f,(float16_t)0.73265427167241281570f,
+(float16_t)0.67947631989936496666f,(float16_t)0.73369743811466026084f,
+(float16_t)0.67835004312986146857f,(float16_t)0.73473887809596349907f,
+(float16_t)0.67722217013718044587f,(float16_t)0.73577858916571348136f,
+(float16_t)0.67609270357531603413f,(float16_t)0.73681656887736979300f,
+(float16_t)0.67496164610201203615f,(float16_t)0.73785281478846598269f,
+(float16_t)0.67382900037875614885f,(float16_t)0.73888732446061511361f,
+(float16_t)0.67269476907077296879f,(float16_t)0.73992009545951609173f,
+(float16_t)0.67155895484701833009f,(float16_t)0.74095112535495910588f,
+(float16_t)0.67042156038017308717f,(float16_t)0.74198041172083095685f,
+(float16_t)0.66928258834663600929f,(float16_t)0.74300795213512171866f,
+(float16_t)0.66814204142651856255f,(float16_t)0.74403374417992929057f,
+(float16_t)0.66699992230363747137f,(float16_t)0.74505778544146594733f,
+(float16_t)0.66585623366550972246f,(float16_t)0.74608007351006366825f,
+(float16_t)0.66471097820334490436f,(float16_t)0.74710060598018013245f,
+(float16_t)0.66356415861203987827f,(float16_t)0.74811938045040349277f,
+(float16_t)0.66241577759017178373f,(float16_t)0.74913639452345925918f,
+(float16_t)0.66126583783999226540f,(float16_t)0.75015164580621496171f,
+(float16_t)0.66011434206742047870f,(float16_t)0.75116513190968636771f,
+(float16_t)0.65896129298203731661f,(float16_t)0.75217685044904269986f,
+(float16_t)0.65780669329707874837f,(float16_t)0.75318679904361252042f,
+(float16_t)0.65665054572942904709f,(float16_t)0.75419497531688917125f,
+(float16_t)0.65549285299961546070f,(float16_t)0.75520137689653654700f,
+(float16_t)0.65433361783180055138f,(float16_t)0.75620600141439453523f,
+(float16_t)0.65317284295377686654f,(float16_t)0.75720884650648456748f,
+(float16_t)0.65201053109695950027f,(float16_t)0.75820990981301528144f,
+(float16_t)0.65084668499638098638f,(float16_t)0.75920918897838796102f,
+(float16_t)0.64968130739068319368f,(float16_t)0.76020668165120242055f,
+(float16_t)0.64851440102211255212f,(float16_t)0.76120238548426177871f,
+(float16_t)0.64734596863651205911f,(float16_t)0.76219629813457889789f,
+(float16_t)0.64617601298331639459f,(float16_t)0.76318841726338126907f,
+(float16_t)0.64500453681554403840f,(float16_t)0.76417874053611667406f,
+(float16_t)0.64383154288979149715f,(float16_t)0.76516726562245895860f,
+(float16_t)0.64265703396622686494f,(float16_t)0.76615399019631280630f,
+(float16_t)0.64148101280858316198f,(float16_t)0.76713891193582040007f,
+(float16_t)0.64030348218415167327f,(float16_t)0.76812202852336530778f,
+(float16_t)0.63912444486377573138f,(float16_t)0.76910333764557958780f,
+(float16_t)0.63794390362184416610f,(float16_t)0.77008283699334789674f,
+(float16_t)0.63676186123628419899f,(float16_t)0.77106052426181381776f,
+(float16_t)0.63557832048855622542f,(float16_t)0.77203639715038441249f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.63320675505005719064f,(float16_t)0.77398269060682278742f,
+(float16_t)0.63201873593980906207f,(float16_t)0.77495310659487381955f,
+(float16_t)0.63082922962842458148f,(float16_t)0.77592169904340757558f,
+(float16_t)0.62963823891492709528f,(float16_t)0.77688846567323244230f,
+(float16_t)0.62844576660183271155f,(float16_t)0.77785340420945303652f,
+(float16_t)0.62725181549514419377f,(float16_t)0.77881651238147586724f,
+(float16_t)0.62605638840434352232f,(float16_t)0.77977778792301444266f,
+(float16_t)0.62485948814238645443f,(float16_t)0.78073722857209448822f,
+(float16_t)0.62366111752569464155f,(float16_t)0.78169483207105938671f,
+(float16_t)0.62246127937415007825f,(float16_t)0.78265059616657572938f,
+(float16_t)0.62125997651108766373f,(float16_t)0.78360451860963820092f,
+(float16_t)0.62005721176328920663f,(float16_t)0.78455659715557524159f,
+(float16_t)0.61885298796097631957f,(float16_t)0.78550682956405393220f,
+(float16_t)0.61764730793780397988f,(float16_t)0.78645521359908576731f,
+(float16_t)0.61644017453085364622f,(float16_t)0.78740174702903131809f,
+(float16_t)0.61523159058062681925f,(float16_t)0.78834642762660622761f,
+(float16_t)0.61402155893103849138f,(float16_t)0.78928925316888565167f,
+(float16_t)0.61281008242940970820f,(float16_t)0.79023022143731003197f,
+(float16_t)0.61159716392646201744f,(float16_t)0.79116933021769009216f,
+(float16_t)0.61038280627630947528f,(float16_t)0.79210657730021227785f,
+(float16_t)0.60916701233645320634f,(float16_t)0.79304196047944364167f,
+(float16_t)0.60794978496777374311f,(float16_t)0.79397547755433717231f,
+(float16_t)0.60673112703452447558f,(float16_t)0.79490712632823701256f,
+(float16_t)0.60551104140432554512f,(float16_t)0.79583690460888345530f,
+(float16_t)0.60428953094815607283f,(float16_t)0.79676481020841871672f,
+(float16_t)0.60306659854034827539f,(float16_t)0.79769084094339104407f,
+(float16_t)0.60184224705858002658f,(float16_t)0.79861499463476082195f,
+(float16_t)0.60061647938386897305f,(float16_t)0.79953726910790501314f,
+(float16_t)0.59938929840056454079f,(float16_t)0.80045766219262270980f,
+(float16_t)0.59816070699634238395f,(float16_t)0.80137617172314012937f,
+(float16_t)0.59693070806219650226f,(float16_t)0.80229279553811572168f,
+(float16_t)0.59569930449243346793f,(float16_t)0.80320753148064483184f,
+(float16_t)0.59446649918466454299f,(float16_t)0.80412037739826569549f,
+(float16_t)0.59323229503979979516f,(float16_t)0.80503133114296365758f,
+(float16_t)0.59199669496204099239f,(float16_t)0.80594039057117627944f,
+(float16_t)0.59075970185887427544f,(float16_t)0.80684755354379922299f,
+(float16_t)0.58952131864106394055f,(float16_t)0.80775281792619024746f,
+(float16_t)0.58828154822264533408f,(float16_t)0.80865618158817498262f,
+(float16_t)0.58704039352091808013f,(float16_t)0.80955764240405125864f,
+(float16_t)0.58579785745643886408f,(float16_t)0.81045719825259476821f,
+(float16_t)0.58455394295301532637f,(float16_t)0.81135484701706372945f,
+(float16_t)0.58330865293769829094f,(float16_t)0.81225058658520388200f,
+(float16_t)0.58206199034077554799f,(float16_t)0.81314441484925359394f,
+(float16_t)0.58081395809576452649f,(float16_t)0.81403632970594830276f,
+(float16_t)0.57956455913940574387f,(float16_t)0.81492632905652662156f,
+(float16_t)0.57831379641165558958f,(float16_t)0.81581441080673378075f,
+(float16_t)0.57706167285567955272f,(float16_t)0.81670057286682784525f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.57455335504771576360f,(float16_t)0.81846712958029865792f,
+(float16_t)0.57329716669804231532f,(float16_t)0.81934752007679689800f,
+(float16_t)0.57203962932475704850f,(float16_t)0.82022598256943468620f,
+(float16_t)0.57078074588696736669f,(float16_t)0.82110251499110464835f,
+(float16_t)0.56952051934694725155f,(float16_t)0.82197711527924155472f,
+(float16_t)0.56825895267013148970f,(float16_t)0.82284978137582631685f,
+(float16_t)0.56699604882510867832f,(float16_t)0.82372051122739131657f,
+(float16_t)0.56573181078361323149f,(float16_t)0.82458930278502529099f,
+(float16_t)0.56446624152051949608f,(float16_t)0.82545615400437744036f,
+(float16_t)0.56319934401383409117f,(float16_t)0.82632106284566353427f,
+(float16_t)0.56193112124468946877f,(float16_t)0.82718402727366913130f,
+(float16_t)0.56066157619733603124f,(float16_t)0.82804504525775579626f,
+(float16_t)0.55939071185913613604f,(float16_t)0.82890411477186487499f,
+(float16_t)0.55811853122055610221f,(float16_t)0.82976123379452304540f,
+(float16_t)0.55684503727516010407f,(float16_t)0.83061640030884620334f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.55429412145362011444f,(float16_t)0.83232086776792968408f,
+(float16_t)0.55301670558002757883f,(float16_t)0.83317016470191318511f,
+(float16_t)0.55173798840470744675f,(float16_t)0.83401750110601813315f,
+(float16_t)0.55045797293660481131f,(float16_t)0.83486287498638001026f,
+(float16_t)0.54917666218771976627f,(float16_t)0.83570628435375260423f,
+(float16_t)0.54789405917310018967f,(float16_t)0.83654772722351189440f,
+(float16_t)0.54661016691083486041f,(float16_t)0.83738720161566193578f,
+(float16_t)0.54532498842204646383f,(float16_t)0.83822470555483796772f,
+(float16_t)0.54403852673088393122f,(float16_t)0.83906023707031263115f,
+(float16_t)0.54275078486451600046f,(float16_t)0.83989379419599941023f,
+(float16_t)0.54146176585312355556f,(float16_t)0.84072537497045796151f,
+(float16_t)0.54017147272989296525f,(float16_t)0.84155497743689833268f,
+(float16_t)0.53887990853100842248f,(float16_t)0.84238259964318584760f,
+(float16_t)0.53758707629564550512f,(float16_t)0.84320823964184543620f,
+(float16_t)0.53629297906596318235f,(float16_t)0.84403189549006640835f,
+(float16_t)0.53499761988709726435f,(float16_t)0.84485356524970700587f,
+(float16_t)0.53370100180715296379f,(float16_t)0.84567324698729906540f,
+(float16_t)0.53240312787719801246f,(float16_t)0.84649093877405212627f,
+(float16_t)0.53110400115125500076f,(float16_t)0.84730663868585831544f,
+(float16_t)0.52980362468629482731f,(float16_t)0.84812034480329712149f,
+(float16_t)0.52850200154222848337f,(float16_t)0.84893205521163961347f,
+(float16_t)0.52719913478190139067f,(float16_t)0.84974176800085243766f,
+(float16_t)0.52589502747108474168f,(float16_t)0.85054948126560336874f,
+(float16_t)0.52458968267846883826f,(float16_t)0.85135519310526519554f,
+(float16_t)0.52328310347565643035f,(float16_t)0.85215890162391982887f,
+(float16_t)0.52197529293715438925f,(float16_t)0.85296060493036363059f,
+(float16_t)0.52066625414036726838f,(float16_t)0.85376030113811129940f,
+(float16_t)0.51935599016558953167f,(float16_t)0.85455798836540053376f,
+(float16_t)0.51804450409599933636f,(float16_t)0.85535366473519602870f,
+(float16_t)0.51673179901764998423f,(float16_t)0.85614732837519447184f,
+(float16_t)0.51541787801946314929f,(float16_t)0.85693897741782865118f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.51278640063356306644f,(float16_t)0.85851622426444273994f,
+(float16_t)0.51146885043797052361f,(float16_t)0.85930181835700836235f,
+(float16_t)0.51015009670676669806f,(float16_t)0.86008539042939025077f,
+(float16_t)0.50883014254310698909f,(float16_t)0.86086693863776730939f,
+(float16_t)0.50750899105297087033f,(float16_t)0.86164646114308129921f,
+(float16_t)0.50618664534515533937f,(float16_t)0.86242395611104050168f,
+(float16_t)0.50486310853126747933f,(float16_t)0.86319942171212415971f,
+(float16_t)0.50353838372571757542f,(float16_t)0.86397285612158669643f,
+(float16_t)0.50221247404571089934f,(float16_t)0.86474425751946237817f,
+(float16_t)0.50088538261124093687f,(float16_t)0.86551362409056897818f,
+(float16_t)0.49955711254508183838f,(float16_t)0.86628095402451299467f,
+(float16_t)0.49822766697278186854f,(float16_t)0.86704624551569264845f,
+(float16_t)0.49689704902265463549f,(float16_t)0.86780949676330321196f,
+(float16_t)0.49556526182577248507f,(float16_t)0.86857070597134089507f,
+(float16_t)0.49423230851595972846f,(float16_t)0.86932987134860673084f,
+(float16_t)0.49289819222978409341f,(float16_t)0.87008699110871134952f,
+(float16_t)0.49156291610655006297f,(float16_t)0.87084206347007886428f,
+(float16_t)0.49022648328829110387f,(float16_t)0.87159508665595109012f,
+(float16_t)0.48888889691976322727f,(float16_t)0.87234605889439142956f,
+(float16_t)0.48755016014843605143f,(float16_t)0.87309497841829009079f,
+(float16_t)0.48621027612448652899f,(float16_t)0.87384184346536675214f,
+(float16_t)0.48486924800079111986f,(float16_t)0.87458665227817611321f,
+(float16_t)0.48352707893291874131f,(float16_t)0.87532940310411078144f,
+(float16_t)0.48218377207912282989f,(float16_t)0.87607009419540660122f,
+(float16_t)0.48083933060033390294f,(float16_t)0.87680872380914576247f,
+(float16_t)0.47949375766015300826f,(float16_t)0.87754529020726124156f,
+(float16_t)0.47814705642484311987f,(float16_t)0.87827979165654146421f,
+(float16_t)0.47679923006332225466f,(float16_t)0.87901222642863341417f,
+(float16_t)0.47545028174715586733f,(float16_t)0.87974259280004740713f,
+(float16_t)0.47410021465055002254f,(float16_t)0.88047088905216075450f,
+(float16_t)0.47274903195034290171f,(float16_t)0.88119711347122198219f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.47004333245959561971f,(float16_t)0.88264333997956279099f,
+(float16_t)0.46868882203582795665f,(float16_t)0.88336333866573157891f,
+(float16_t)0.46733320874198852612f,(float16_t)0.88408125871263498752f,
+(float16_t)0.46597649576796612569f,(float16_t)0.88479709843093778954f,
+(float16_t)0.46461868630623781584f,(float16_t)0.88551085613619995307f,
+(float16_t)0.46325978355186026025f,(float16_t)0.88622253014888063838f,
+(float16_t)0.46189979070246284243f,(float16_t)0.88693211879434208367f,
+(float16_t)0.46053871095824000514f,(float16_t)0.88763962040285393496f,
+(float16_t)0.45917654752194414502f,(float16_t)0.88834503330959635470f,
+(float16_t)0.45781330359887728587f,(float16_t)0.88904835585466457371f,
+(float16_t)0.45644898239688386221f,(float16_t)0.88974958638307288794f,
+(float16_t)0.45508358712634383592f,(float16_t)0.89044872324475787817f,
+(float16_t)0.45371712100016392544f,(float16_t)0.89114576479458318392f,
+(float16_t)0.45234958723377099998f,(float16_t)0.89184070939234272313f,
+(float16_t)0.45098098904510380835f,(float16_t)0.89253355540276468894f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.44824061228521999700f,(float16_t)0.89391294514520325265f,
+(float16_t)0.44686884016237432560f,(float16_t)0.89459948563138258493f,
+(float16_t)0.44549601651398174074f,(float16_t)0.89528392103855758410f,
+(float16_t)0.44412214457042925586f,(float16_t)0.89596624975618510689f,
+(float16_t)0.44274722756457013384f,(float16_t)0.89664647017868015499f,
+(float16_t)0.44137126873171661501f,(float16_t)0.89732458070541831763f,
+(float16_t)0.43999427130963325583f,(float16_t)0.89800057974073987932f,
+(float16_t)0.43861623853852771404f,(float16_t)0.89867446569395381673f,
+(float16_t)0.43723717366104419835f,(float16_t)0.89934623697934146236f,
+(float16_t)0.43585707992225547480f,(float16_t)0.90001589201616027935f,
+(float16_t)0.43447596056965570588f,(float16_t)0.90068342922864685907f,
+(float16_t)0.43309381885315201277f,(float16_t)0.90134884704602202810f,
+(float16_t)0.43171065802505736997f,(float16_t)0.90201214390249306874f,
+(float16_t)0.43032648134008261165f,(float16_t)0.90267331823725882600f,
+(float16_t)0.42894129205532954829f,(float16_t)0.90333236849451181705f,
+(float16_t)0.42755509343028219593f,(float16_t)0.90398929312344333820f,
+(float16_t)0.42616788872679961520f,(float16_t)0.90464409057824624050f,
+(float16_t)0.42477968120910880589f,(float16_t)0.90529675931811881551f,
+(float16_t)0.42339047414379610279f,(float16_t)0.90594729780726845902f,
+(float16_t)0.42200027079979979261f,(float16_t)0.90659570451491533483f,
+(float16_t)0.42060907444840250902f,(float16_t)0.90724197791529592738f,
+(float16_t)0.41921688836322396066f,(float16_t)0.90788611648766626150f,
+(float16_t)0.41782371582021238243f,(float16_t)0.90852811871630612117f,
+(float16_t)0.41642956009763731906f,(float16_t)0.90916798309052226923f,
+(float16_t)0.41503442447608163146f,(float16_t)0.90980570810465222209f,
+(float16_t)0.41363831223843455787f,(float16_t)0.91044129225806713634f,
+(float16_t)0.41224122666988299857f,(float16_t)0.91107473405517624965f,
+(float16_t)0.41084317105790391089f,(float16_t)0.91170603200542987832f,
+(float16_t)0.40944414869225764786f,(float16_t)0.91233518462332274801f,
+(float16_t)0.40804416286497874333f,(float16_t)0.91296219042839810154f,
+(float16_t)0.40664321687036913966f,(float16_t)0.91358704794525080750f,
+(float16_t)0.40524131400498986100f,(float16_t)0.91420975570353069095f,
+(float16_t)0.40383845756765412993f,(float16_t)0.91483031223794608611f,
+(float16_t)0.40243465085941854120f,(float16_t)0.91544871608826783316f,
+(float16_t)0.40102989718357578974f,(float16_t)0.91606496579933160973f,
+(float16_t)0.39962419984564678810f,(float16_t)0.91667905992104270485f,
+(float16_t)0.39821756215337361651f,(float16_t)0.91729099700837790632f,
+(float16_t)0.39680998741671041907f,(float16_t)0.91790077562139038569f,
+(float16_t)0.39540147894781629834f,(float16_t)0.91850839432521225181f,
+(float16_t)0.39399204006104809883f,(float16_t)0.91911385169005777040f,
+(float16_t)0.39258167407295152529f,(float16_t)0.91971714629122736095f,
+(float16_t)0.39117038430225398171f,(float16_t)0.92031827670911048322f,
+(float16_t)0.38975817406985641123f,(float16_t)0.92091724152918941204f,
+(float16_t)0.38834504669882630168f,(float16_t)0.92151403934204190183f,
+(float16_t)0.38693100551438869283f,(float16_t)0.92210866874334507237f,
+(float16_t)0.38551605384391901543f,(float16_t)0.92270112833387851747f,
+(float16_t)0.38410019501693504207f,(float16_t)0.92329141671952763559f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.38126576922216248722f,(float16_t)0.92446547432526260391f,
+(float16_t)0.37984720892405110515f,(float16_t)0.92504924078267758425f,
+(float16_t)0.37842775480876561511f,(float16_t)0.92563083050987271516f,
+(float16_t)0.37700741021641831496f,(float16_t)0.92621024213831126826f,
+(float16_t)0.37558617848921732607f,(float16_t)0.92678747430458174872f,
+(float16_t)0.37416406297145798909f,(float16_t)0.92736252565040111495f,
+(float16_t)0.37274106700951581406f,(float16_t)0.92793539482261788720f,
+(float16_t)0.37131719395183759858f,(float16_t)0.92850608047321558924f,
+(float16_t)0.36989244714893426691f,(float16_t)0.92907458125931574600f,
+(float16_t)0.36846682995337232125f,(float16_t)0.92964089584318121418f,
+(float16_t)0.36704034571976723589f,(float16_t)0.93020502289221906889f,
+(float16_t)0.36561299780477396482f,(float16_t)0.93076696107898371224f,
+(float16_t)0.36418478956707983629f,(float16_t)0.93132670908118042608f,
+(float16_t)0.36275572436739722537f,(float16_t)0.93188426558166814750f,
+(float16_t)0.36132580556845433906f,(float16_t)0.93243962926846235550f,
+(float16_t)0.35989503653498827740f,(float16_t)0.93299279883473884567f,
+(float16_t)0.35846342063373654030f,(float16_t)0.93354377297883617270f,
+(float16_t)0.35703096123343003310f,(float16_t)0.93409255040425887007f,
+(float16_t)0.35559766170478396274f,(float16_t)0.93463912981968078064f,
+(float16_t)0.35416352542049051033f,(float16_t)0.93518350993894749923f,
+(float16_t)0.35272855575521072646f,(float16_t)0.93572568948108036935f,
+(float16_t)0.35129275608556714827f,(float16_t)0.93626566717027825959f,
+(float16_t)0.34985612979013502866f,(float16_t)0.93680344173592156043f,
+(float16_t)0.34841868024943450921f,(float16_t)0.93733901191257495977f,
+(float16_t)0.34698041084592368133f,(float16_t)0.93787237643998988545f,
+(float16_t)0.34554132496398914931f,(float16_t)0.93840353406310805795f,
+(float16_t)0.34410142598993898044f,(float16_t)0.93893248353206448797f,
+(float16_t)0.34266071731199437833f,(float16_t)0.93945922360218991898f,
+(float16_t)0.34121920232028241093f,(float16_t)0.93998375303401393577f,
+(float16_t)0.33977688440682696225f,(float16_t)0.94050607059326829518f,
+(float16_t)0.33833376696554129381f,(float16_t)0.94102617505088925753f,
+(float16_t)0.33688985339222005111f,(float16_t)0.94154406518302080631f,
+(float16_t)0.33544514708453165852f,(float16_t)0.94205973977101731265f,
+(float16_t)0.33399965144200949307f,(float16_t)0.94257319760144686605f,
+(float16_t)0.33255336986604422389f,(float16_t)0.94308443746609349478f,
+(float16_t)0.33110630575987642921f,(float16_t)0.94359345816196038559f,
+(float16_t)0.32965846252858754806f,(float16_t)0.94410025849127265918f,
+(float16_t)0.32820984357909266382f,(float16_t)0.94460483726148025685f,
+(float16_t)0.32676045232013178898f,(float16_t)0.94510719328526060501f,
+(float16_t)0.32531029216226298173f,(float16_t)0.94560732538052127971f,
+(float16_t)0.32385936651785296458f,(float16_t)0.94610523237040333733f,
+(float16_t)0.32240767880107001897f,(float16_t)0.94660091308328353499f,
+(float16_t)0.32095523242787521445f,(float16_t)0.94709436635277721717f,
+(float16_t)0.31950203081601574739f,(float16_t)0.94758559101774109124f,
+(float16_t)0.31804807738501505998f,(float16_t)0.94807458592227622507f,
+(float16_t)0.31659337555616584581f,(float16_t)0.94856134991573026749f,
+(float16_t)0.31513792875252244485f,(float16_t)0.94904588185270055689f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.31222481392182505067f,(float16_t)0.95000824500184299914f,
+(float16_t)0.31076715274961147495f,(float16_t)0.95048607394948170235f,
+(float16_t)0.30930876031226878231f,(float16_t)0.95096166631157508231f,
+(float16_t)0.30784964004153497763f,(float16_t)0.95143502096900833820f,
+(float16_t)0.30638979537086108440f,(float16_t)0.95190613680793223494f,
+(float16_t)0.30492922973540242948f,(float16_t)0.95237501271976587880f,
+(float16_t)0.30346794657201137113f,(float16_t)0.95284164760119871573f,
+(float16_t)0.30200594931922819519f,(float16_t)0.95330604035419375109f,
+(float16_t)0.30054324141727339903f,(float16_t)0.95376818988599032512f,
+(float16_t)0.29907982630804047508f,(float16_t)0.95422809510910566733f,
+(float16_t)0.29761570743508630743f,(float16_t)0.95468575494133833814f,
+(float16_t)0.29615088824362395536f,(float16_t)0.95514116830577067141f,
+(float16_t)0.29468537218051432669f,(float16_t)0.95559433413077110586f,
+(float16_t)0.29321916269425868373f,(float16_t)0.95604525134999640557f,
+(float16_t)0.29175226323498937298f,(float16_t)0.95649391890239499059f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.28881640820604947972f,(float16_t)0.95738450078897585627f,
+(float16_t)0.28734745954472956653f,(float16_t)0.95782641302753290802f,
+(float16_t)0.28587783472708072630f,(float16_t)0.95826607140801767226f,
+(float16_t)0.28440753721127182141f,(float16_t)0.95870347489587159906f,
+(float16_t)0.28293657045705539188f,(float16_t)0.95913862246184189431f,
+(float16_t)0.28146493792575805193f,(float16_t)0.95957151308198451733f,
+(float16_t)0.27999264308027338455f,(float16_t)0.96000214573766584625f,
+(float16_t)0.27851968938505305973f,(float16_t)0.96043051941556578655f,
+(float16_t)0.27704608030609995106f,(float16_t)0.96085663310767965850f,
+(float16_t)0.27557181931095825478f,(float16_t)0.96128048581132063966f,
+(float16_t)0.27409690986870632878f,(float16_t)0.96170207652912254037f,
+(float16_t)0.27262135544994897662f,(float16_t)0.96212140426904158019f,
+(float16_t)0.27114515952680806610f,(float16_t)0.96253846804435916340f,
+(float16_t)0.26966832557291520178f,(float16_t)0.96295326687368387741f,
+(float16_t)0.26819085706340317632f,(float16_t)0.96336579978095404631f,
+(float16_t)0.26671275747489842090f,(float16_t)0.96377606579543984022f,
+(float16_t)0.26523403028551190141f,(float16_t)0.96418406395174571788f,
+(float16_t)0.26375467897483151347f,(float16_t)0.96458979328981264700f,
+(float16_t)0.26227470702391358914f,(float16_t)0.96499325285492032478f,
+(float16_t)0.26079411791527556952f,(float16_t)0.96539444169768939830f,
+(float16_t)0.25931291513288634576f,(float16_t)0.96579335887408357397f,
+(float16_t)0.25783110216215893162f,(float16_t)0.96619000344541261516f,
+(float16_t)0.25634868248994291395f,(float16_t)0.96658437447833311928f,
+(float16_t)0.25486565960451462720f,(float16_t)0.96697647104485207059f,
+(float16_t)0.25338203699557027004f,(float16_t)0.96736629222232850545f,
+(float16_t)0.25189781815421691258f,(float16_t)0.96775383709347551076f,
+(float16_t)0.25041300657296527987f,(float16_t)0.96813910474636244441f,
+(float16_t)0.24892760574572025956f,(float16_t)0.96852209427441726675f,
+(float16_t)0.24744161916777343557f,(float16_t)0.96890280477642887202f,
+(float16_t)0.24595505033579459497f,(float16_t)0.96928123535654853171f,
+(float16_t)0.24446790274782420616f,(float16_t)0.96965738512429244800f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.24149188530286930243f,(float16_t)0.97040283868755550234f,
+(float16_t)0.24000302244874149871f,(float16_t)0.97077214072895035013f,
+(float16_t)0.23851359484431849944f,(float16_t)0.97113915844972509284f,
+(float16_t)0.23702360599436733679f,(float16_t)0.97150389098625178352f,
+(float16_t)0.23553305940497545889f,(float16_t)0.97186633748027939639f,
+(float16_t)0.23404195858354345794f,(float16_t)0.97222649707893626925f,
+(float16_t)0.23255030703877532794f,(float16_t)0.97258436893473221296f,
+(float16_t)0.23105810828067127605f,(float16_t)0.97293995220556006576f,
+(float16_t)0.22956536582051886852f,(float16_t)0.97329324605469824672f,
+(float16_t)0.22807208317088578653f,(float16_t)0.97364424965081186603f,
+(float16_t)0.22657826384561011168f,(float16_t)0.97399296216795583359f,
+(float16_t)0.22508391135979277653f,(float16_t)0.97433938278557585821f,
+(float16_t)0.22358902922979001504f,(float16_t)0.97468351068851066810f,
+(float16_t)0.22209362097320359264f,(float16_t)0.97502534506699412020f,
+(float16_t)0.22059769010887364526f,(float16_t)0.97536488511665686563f,
+(float16_t)0.21910124015686976984f,(float16_t)0.97570213003852857003f,
+(float16_t)0.21760427463848366902f,(float16_t)0.97603707903903902388f,
+(float16_t)0.21610679707621960333f,(float16_t)0.97636973133002114000f,
+(float16_t)0.21460881099378692483f,(float16_t)0.97670008612871184184f,
+(float16_t)0.21311031991609136194f,(float16_t)0.97702814265775439484f,
+(float16_t)0.21161132736922760866f,(float16_t)0.97735390014519996082f,
+(float16_t)0.21011183688046972118f,(float16_t)0.97767735782450992943f,
+(float16_t)0.20861185197826345727f,(float16_t)0.97799851493455713936f,
+(float16_t)0.20711137619221856032f,(float16_t)0.97831737071962765473f,
+(float16_t)0.20561041305309932237f,(float16_t)0.97863392442942309657f,
+(float16_t)0.20410896609281700687f,(float16_t)0.97894817531906219710f,
+(float16_t)0.20260703884442110567f,(float16_t)0.97926012264908202098f,
+(float16_t)0.20110463484209195606f,(float16_t)0.97956976568544051887f,
+(float16_t)0.19960175762113105402f,(float16_t)0.97987710369951763756f,
+(float16_t)0.19809841071795372680f,(float16_t)0.98018213596811731847f,
+(float16_t)0.19659459767008022335f,(float16_t)0.98048486177346938497f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.19358558729580374602f,(float16_t)0.98108339115048659451f,
+(float16_t)0.19208039704989238183f,(float16_t)0.98137919331375456089f,
+(float16_t)0.19057475482025279523f,(float16_t)0.98167268619698311305f,
+(float16_t)0.18906866414980627589f,(float16_t)0.98196386910955524296f,
+(float16_t)0.18756212858252974129f,(float16_t)0.98225274136628937249f,
+(float16_t)0.18605515166344663291f,(float16_t)0.98253930228744124076f,
+(float16_t)0.18454773693861964423f,(float16_t)0.98282355119870523641f,
+(float16_t)0.18303988795514106180f,(float16_t)0.98310548743121628501f,
+(float16_t)0.18153160826112513249f,(float16_t)0.98338511032155118130f,
+(float16_t)0.18002290140569951471f,(float16_t)0.98366241921173025453f,
+(float16_t)0.17851377093899759019f,(float16_t)0.98393741344921892278f,
+(float16_t)0.17700422041214886049f,(float16_t)0.98421009238692902521f,
+(float16_t)0.17549425337727139751f,(float16_t)0.98448045538322093151f,
+(float16_t)0.17398387338746384989f,(float16_t)0.98474850180190420801f,
+(float16_t)0.17247308399679603386f,(float16_t)0.98501423101223983814f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.16945029123396793125f,(float16_t)0.98553873531217606185f,
+(float16_t)0.16793829497473122814f,(float16_t)0.98579750916756736512f,
+(float16_t)0.16642590354046421508f,(float16_t)0.98605396334619543897f,
+(float16_t)0.16491312048997008866f,(float16_t)0.98630809724459866938f,
+(float16_t)0.16339994938297322524f,(float16_t)0.98655991026477540817f,
+(float16_t)0.16188639378011188130f,(float16_t)0.98680940181418541624f,
+(float16_t)0.16037245724292839566f,(float16_t)0.98705657130575097380f,
+(float16_t)0.15885814333386139019f,(float16_t)0.98730141815785843473f,
+(float16_t)0.15734345561623827581f,(float16_t)0.98754394179435922574f,
+(float16_t)0.15582839765426531597f,(float16_t)0.98778414164457217783f,
+(float16_t)0.15431297301302024372f,(float16_t)0.98802201714328352633f,
+(float16_t)0.15279718525844340760f,(float16_t)0.98825756773074946437f,
+(float16_t)0.15128103795733024994f,(float16_t)0.98849079285269658701f,
+(float16_t)0.14976453467732162017f,(float16_t)0.98872169196032377858f,
+(float16_t)0.14824767898689619749f,(float16_t)0.98895026451030298986f,
+(float16_t)0.14673047445536174793f,(float16_t)0.98917650996478101444f,
+(float16_t)0.14521292465284751927f,(float16_t)0.98940042779138037687f,
+(float16_t)0.14369503315029458212f,(float16_t)0.98962201746320077600f,
+(float16_t)0.14217680351944800288f,(float16_t)0.98984127845882052821f,
+(float16_t)0.14065823933284923863f,(float16_t)0.99005821026229712256f,
+(float16_t)0.13913934416382628401f,(float16_t)0.99027281236316910817f,
+(float16_t)0.13762012158648617710f,(float16_t)0.99048508425645698239f,
+(float16_t)0.13610057517570620100f,(float16_t)0.99069502544266463406f,
+(float16_t)0.13458070850712622324f,(float16_t)0.99090263542778000971f,
+(float16_t)0.13306052515713917561f,(float16_t)0.99110791372327677884f,
+(float16_t)0.13154002870288328264f,(float16_t)0.99131085984611544415f,
+(float16_t)0.13001922272223334631f,(float16_t)0.99151147331874389668f,
+(float16_t)0.12849811079379322432f,(float16_t)0.99170975366909952520f,
+(float16_t)0.12697669649688597682f,(float16_t)0.99190570043060932726f,
+(float16_t)0.12545498341154620592f,(float16_t)0.99209931314219179654f,
+(float16_t)0.12393297511851220083f,(float16_t)0.99229059134825736699f,
+(float16_t)0.12241067519921627893f,(float16_t)0.99247953459870996706f,
+(float16_t)0.12088808723577722237f,(float16_t)0.99266614244894801899f,
+(float16_t)0.11936521481099135467f,(float16_t)0.99285041445986510489f,
+(float16_t)0.11784206150832501891f,(float16_t)0.99303235019785141002f,
+(float16_t)0.11631863091190487725f,(float16_t)0.99321194923479450001f,
+(float16_t)0.11479492660651025027f,(float16_t)0.99338921114808065305f,
+(float16_t)0.11327095217756436019f,(float16_t)0.99356413552059530403f,
+(float16_t)0.11174671121112665639f,(float16_t)0.99373672194072459884f,
+(float16_t)0.11022220729388318428f,(float16_t)0.99390697000235606051f,
+(float16_t)0.10869744401313867488f,(float16_t)0.99407487930487936634f,
+(float16_t)0.10717242495680887049f,(float16_t)0.99424044945318790223f,
+(float16_t)0.10564715371341069916f,(float16_t)0.99440368005767909576f,
+(float16_t)0.10412163387205472520f,(float16_t)0.99456457073425541537f,
+(float16_t)0.10259586902243628126f,(float16_t)0.99472312110432570265f,
+(float16_t)0.10106986275482787718f,(float16_t)0.99487933079480561638f,
+(float16_t)0.09954361866006944393f,(float16_t)0.99503319943811863180f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.09649043135525260662f,(float16_t)0.99533391214048227980f,
+(float16_t)0.09496349532963906104f,(float16_t)0.99548075549192693856f,
+(float16_t)0.09343633584574791151f,(float16_t)0.99562525638099430569f,
+(float16_t)0.09190895649713269611f,(float16_t)0.99576741446765981713f,
+(float16_t)0.09038136087786501072f,(float16_t)0.99590722941741172125f,
+(float16_t)0.08885355258252468358f,(float16_t)0.99604470090125196702f,
+(float16_t)0.08732553520619222576f,(float16_t)0.99617982859569687015f,
+(float16_t)0.08579731234443987997f,(float16_t)0.99631261218277800129f,
+(float16_t)0.08426888759332412659f,(float16_t)0.99644305135004263008f,
+(float16_t)0.08274026454937580266f,(float16_t)0.99657114579055483539f,
+(float16_t)0.08121144680959238582f,(float16_t)0.99669689520289606044f,
+(float16_t)0.07968243797143012563f,(float16_t)0.99682029929116566791f,
+(float16_t)0.07815324163279431524f,(float16_t)0.99694135776498216117f,
+(float16_t)0.07662386139203161695f,(float16_t)0.99706007033948296225f,
+(float16_t)0.07509430084792129145f,(float16_t)0.99717643673532618820f,
+(float16_t)0.07356456359966745406f,(float16_t)0.99729045667869020697f,
+(float16_t)0.07203465324688941573f,(float16_t)0.99740212990127530279f,
+(float16_t)0.07050457338961400866f,(float16_t)0.99751145614030345410f,
+(float16_t)0.06897432762826673225f,(float16_t)0.99761843513851955478f,
+(float16_t)0.06744391956366410645f,(float16_t)0.99772306664419163624f,
+(float16_t)0.06591335279700392957f,(float16_t)0.99782535041111164453f,
+(float16_t)0.06438263092985740954f,(float16_t)0.99792528619859599548f,
+(float16_t)0.06285175756416142012f,(float16_t)0.99802287377148624081f,
+(float16_t)0.06132073630220864768f,(float16_t)0.99811811290014917919f,
+(float16_t)0.05978957074664000698f,(float16_t)0.99821100336047818846f,
+(float16_t)0.05825826450043573163f,(float16_t)0.99830154493389289261f,
+(float16_t)0.05672682116690778292f,(float16_t)0.99838973740734016094f,
+(float16_t)0.05519524434969003135f,(float16_t)0.99847558057329477421f,
+(float16_t)0.05366353765273067927f,(float16_t)0.99855907422975931365f,
+(float16_t)0.05213170468028331672f,(float16_t)0.99864021818026527111f,
+(float16_t)0.05059974903689933717f,(float16_t)0.99871901223387293811f,
+(float16_t)0.04906767432741812596f,(float16_t)0.99879545620517240501f,
+(float16_t)0.04753548415695926094f,(float16_t)0.99886954991428356099f,
+(float16_t)0.04600318213091464381f,(float16_t)0.99894129318685687124f,
+(float16_t)0.04447077185493874402f,(float16_t)0.99901068585407337697f,
+(float16_t)0.04293825693494095902f,(float16_t)0.99907772775264536147f,
+(float16_t)0.04140564097707671171f,(float16_t)0.99914241872481690532f,
+(float16_t)0.03987292758773984536f,(float16_t)0.99920475861836388631f,
+(float16_t)0.03834012037355279123f,(float16_t)0.99926474728659442359f,
+(float16_t)0.03680722294135899131f,(float16_t)0.99932238458834954375f,
+(float16_t)0.03527423889821394709f,(float16_t)0.99937767038800284780f,
+(float16_t)0.03374117185137764235f,(float16_t)0.99943060455546173237f,
+(float16_t)0.03220802540830470378f,(float16_t)0.99948118696616694567f,
+(float16_t)0.03067480317663658085f,(float16_t)0.99952941750109314256f,
+(float16_t)0.02914150876419373953f,(float16_t)0.99957529604674921764f,
+(float16_t)0.02760814577896581953f,(float16_t)0.99961882249517863830f,
+(float16_t)0.02607471782910403962f,(float16_t)0.99965999674395922270f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)0.02300768146883941032f,(float16_t)0.99973528826056168306f,
+(float16_t)0.02147408027546960502f,(float16_t)0.99976940535121527898f,
+(float16_t)0.01994042855151459750f,(float16_t)0.99980116988788425569f,
+(float16_t)0.01840672990580482019f,(float16_t)0.99983058179582340319f,
+(float16_t)0.01687298794728177287f,(float16_t)0.99985764100582386060f,
+(float16_t)0.01533920628498821985f,(float16_t)0.99988234745421256111f,
+(float16_t)0.01380538852806034895f,(float16_t)0.99990470108285289808f,
+(float16_t)0.01227153828571994447f,(float16_t)0.99992470183914450299f,
+(float16_t)0.01073765916726457208f,(float16_t)0.99994234967602391162f,
+(float16_t)0.00920375478205995995f,(float16_t)0.99995764455196389786f,
+(float16_t)0.00766982873953107706f,(float16_t)0.99997058643097413988f,
+(float16_t)0.00613588464915451517f,(float16_t)0.99998117528260110909f,
+(float16_t)0.00460192612044867198f,(float16_t)0.99998941108192840321f,
+(float16_t)0.00306795676296613791f,(float16_t)0.99999529380957619118f,
+(float16_t)0.00153398018628476615f,(float16_t)0.99999882345170187925f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99998117528260110909f,(float16_t)0.00613588464915447527f,
+(float16_t)0.99992470183914450299f,(float16_t)0.01227153828571992539f,
+(float16_t)0.99983058179582340319f,(float16_t)0.01840672990580482019f,
+(float16_t)0.99969881869620424997f,(float16_t)0.02454122852291228812f,
+(float16_t)0.99952941750109314256f,(float16_t)0.03067480317663662595f,
+(float16_t)0.99932238458834954375f,(float16_t)0.03680722294135883171f,
+(float16_t)0.99907772775264536147f,(float16_t)0.04293825693494082024f,
+(float16_t)0.99879545620517240501f,(float16_t)0.04906767432741801493f,
+(float16_t)0.99847558057329477421f,(float16_t)0.05519524434968993420f,
+(float16_t)0.99811811290014917919f,(float16_t)0.06132073630220857829f,
+(float16_t)0.99772306664419163624f,(float16_t)0.06744391956366405094f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.99682029929116566791f,(float16_t)0.07968243797143012563f,
+(float16_t)0.99631261218277800129f,(float16_t)0.08579731234443989385f,
+(float16_t)0.99576741446765981713f,(float16_t)0.09190895649713272386f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.99456457073425541537f,(float16_t)0.10412163387205458642f,
+(float16_t)0.99390697000235606051f,(float16_t)0.11022220729388305938f,
+(float16_t)0.99321194923479450001f,(float16_t)0.11631863091190475235f,
+(float16_t)0.99247953459870996706f,(float16_t)0.12241067519921619566f,
+(float16_t)0.99170975366909952520f,(float16_t)0.12849811079379316880f,
+(float16_t)0.99090263542778000971f,(float16_t)0.13458070850712616773f,
+(float16_t)0.99005821026229712256f,(float16_t)0.14065823933284921088f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98825756773074946437f,(float16_t)0.15279718525844343535f,
+(float16_t)0.98730141815785843473f,(float16_t)0.15885814333386144570f,
+(float16_t)0.98630809724459866938f,(float16_t)0.16491312048996989437f,
+(float16_t)0.98527764238894122162f,(float16_t)0.17096188876030121717f,
+(float16_t)0.98421009238692902521f,(float16_t)0.17700422041214874946f,
+(float16_t)0.98310548743121628501f,(float16_t)0.18303988795514095078f,
+(float16_t)0.98196386910955524296f,(float16_t)0.18906866414980619262f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.97956976568544051887f,(float16_t)0.20110463484209190055f,
+(float16_t)0.97831737071962765473f,(float16_t)0.20711137619221856032f,
+(float16_t)0.97702814265775439484f,(float16_t)0.21311031991609136194f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.97433938278557585821f,(float16_t)0.22508391135979283204f,
+(float16_t)0.97293995220556017678f,(float16_t)0.23105810828067110951f,
+(float16_t)0.97150389098625178352f,(float16_t)0.23702360599436719801f,
+(float16_t)0.97003125319454397424f,(float16_t)0.24298017990326387094f,
+(float16_t)0.96852209427441737777f,(float16_t)0.24892760574572014853f,
+(float16_t)0.96697647104485207059f,(float16_t)0.25486565960451457169f,
+(float16_t)0.96539444169768939830f,(float16_t)0.26079411791527551401f,
+(float16_t)0.96377606579543984022f,(float16_t)0.26671275747489836538f,
+(float16_t)0.96212140426904158019f,(float16_t)0.27262135544994897662f,
+(float16_t)0.96043051941556578655f,(float16_t)0.27851968938505305973f,
+(float16_t)0.95870347489587159906f,(float16_t)0.28440753721127187692f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.95514116830577078243f,(float16_t)0.29615088824362378883f,
+(float16_t)0.95330604035419386211f,(float16_t)0.30200594931922808417f,
+(float16_t)0.95143502096900833820f,(float16_t)0.30784964004153486661f,
+(float16_t)0.94952818059303667475f,(float16_t)0.31368174039889151761f,
+(float16_t)0.94758559101774109124f,(float16_t)0.31950203081601569188f,
+(float16_t)0.94560732538052127971f,(float16_t)0.32531029216226292622f,
+(float16_t)0.94359345816196038559f,(float16_t)0.33110630575987642921f,
+(float16_t)0.94154406518302080631f,(float16_t)0.33688985339222005111f,
+(float16_t)0.93945922360218991898f,(float16_t)0.34266071731199437833f,
+(float16_t)0.93733901191257495977f,(float16_t)0.34841868024943456472f,
+(float16_t)0.93518350993894761025f,(float16_t)0.35416352542049034380f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.93076696107898371224f,(float16_t)0.36561299780477385379f,
+(float16_t)0.92850608047321558924f,(float16_t)0.37131719395183754306f,
+(float16_t)0.92621024213831137928f,(float16_t)0.37700741021641825945f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.92151403934204190183f,(float16_t)0.38834504669882624617f,
+(float16_t)0.91911385169005777040f,(float16_t)0.39399204006104809883f,
+(float16_t)0.91667905992104270485f,(float16_t)0.39962419984564678810f,
+(float16_t)0.91420975570353069095f,(float16_t)0.40524131400498986100f,
+(float16_t)0.91170603200542987832f,(float16_t)0.41084317105790391089f,
+(float16_t)0.90916798309052238025f,(float16_t)0.41642956009763715253f,
+(float16_t)0.90659570451491533483f,(float16_t)0.42200027079979968159f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.90134884704602202810f,(float16_t)0.43309381885315195726f,
+(float16_t)0.89867446569395381673f,(float16_t)0.43861623853852765853f,
+(float16_t)0.89596624975618521791f,(float16_t)0.44412214457042920035f,
+(float16_t)0.89322430119551532446f,(float16_t)0.44961132965460653965f,
+(float16_t)0.89044872324475787817f,(float16_t)0.45508358712634383592f,
+(float16_t)0.88763962040285393496f,(float16_t)0.46053871095824000514f,
+(float16_t)0.88479709843093778954f,(float16_t)0.46597649576796618121f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.87901222642863352519f,(float16_t)0.47679923006332208812f,
+(float16_t)0.87607009419540660122f,(float16_t)0.48218377207912271887f,
+(float16_t)0.87309497841829009079f,(float16_t)0.48755016014843599592f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.86704624551569264845f,(float16_t)0.49822766697278181303f,
+(float16_t)0.86397285612158669643f,(float16_t)0.50353838372571757542f,
+(float16_t)0.86086693863776730939f,(float16_t)0.50883014254310698909f,
+(float16_t)0.85772861000027211809f,(float16_t)0.51410274419322166128f,
+(float16_t)0.85455798836540053376f,(float16_t)0.51935599016558964269f,
+(float16_t)0.85135519310526519554f,(float16_t)0.52458968267846894928f,
+(float16_t)0.84812034480329723252f,(float16_t)0.52980362468629460526f,
+(float16_t)0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)0.84155497743689844370f,(float16_t)0.54017147272989285423f,
+(float16_t)0.83822470555483807875f,(float16_t)0.54532498842204646383f,
+(float16_t)0.83486287498638001026f,(float16_t)0.55045797293660481131f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.82804504525775579626f,(float16_t)0.56066157619733603124f,
+(float16_t)0.82458930278502529099f,(float16_t)0.56573181078361312046f,
+(float16_t)0.82110251499110464835f,(float16_t)0.57078074588696725566f,
+(float16_t)0.81758481315158371139f,(float16_t)0.57580819141784533866f,
+(float16_t)0.81403632970594841378f,(float16_t)0.58081395809576452649f,
+(float16_t)0.81045719825259476821f,(float16_t)0.58579785745643886408f,
+(float16_t)0.80684755354379933401f,(float16_t)0.59075970185887416442f,
+(float16_t)0.80320753148064494287f,(float16_t)0.59569930449243335691f,
+(float16_t)0.79953726910790501314f,(float16_t)0.60061647938386897305f,
+(float16_t)0.79583690460888356633f,(float16_t)0.60551104140432554512f,
+(float16_t)0.79210657730021238887f,(float16_t)0.61038280627630947528f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.78455659715557524159f,(float16_t)0.62005721176328909561f,
+(float16_t)0.78073722857209448822f,(float16_t)0.62485948814238634341f,
+(float16_t)0.77688846567323244230f,(float16_t)0.62963823891492698426f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.76910333764557969882f,(float16_t)0.63912444486377573138f,
+(float16_t)0.76516726562245895860f,(float16_t)0.64383154288979138613f,
+(float16_t)0.76120238548426177871f,(float16_t)0.64851440102211244110f,
+(float16_t)0.75720884650648456748f,(float16_t)0.65317284295377675551f,
+(float16_t)0.75318679904361252042f,(float16_t)0.65780669329707863735f,
+(float16_t)0.74913639452345937020f,(float16_t)0.66241577759017178373f,
+(float16_t)0.74505778544146594733f,(float16_t)0.66699992230363747137f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.73681656887736979300f,(float16_t)0.67609270357531592310f,
+(float16_t)0.73265427167241281570f,(float16_t)0.68060099779545302212f,
+(float16_t)0.72846439044822519637f,(float16_t)0.68508366777270035541f,
+(float16_t)0.72424708295146700276f,(float16_t)0.68954054473706682948f,
+(float16_t)0.72000250796138165477f,(float16_t)0.69397146088965389055f,
+(float16_t)0.71573082528381870571f,(float16_t)0.69837624940897280457f,
+(float16_t)0.71143219574521643356f,(float16_t)0.70275474445722529993f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.70275474445722529993f,(float16_t)0.71143219574521643356f,
+(float16_t)0.69837624940897291559f,(float16_t)0.71573082528381859468f,
+(float16_t)0.69397146088965400157f,(float16_t)0.72000250796138165477f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.68508366777270035541f,(float16_t)0.72846439044822519637f,
+(float16_t)0.68060099779545302212f,(float16_t)0.73265427167241281570f,
+(float16_t)0.67609270357531603413f,(float16_t)0.73681656887736979300f,
+(float16_t)0.67155895484701833009f,(float16_t)0.74095112535495910588f,
+(float16_t)0.66699992230363747137f,(float16_t)0.74505778544146594733f,
+(float16_t)0.66241577759017178373f,(float16_t)0.74913639452345925918f,
+(float16_t)0.65780669329707874837f,(float16_t)0.75318679904361252042f,
+(float16_t)0.65317284295377686654f,(float16_t)0.75720884650648456748f,
+(float16_t)0.64851440102211255212f,(float16_t)0.76120238548426177871f,
+(float16_t)0.64383154288979149715f,(float16_t)0.76516726562245895860f,
+(float16_t)0.63912444486377573138f,(float16_t)0.76910333764557958780f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.62963823891492709528f,(float16_t)0.77688846567323244230f,
+(float16_t)0.62485948814238645443f,(float16_t)0.78073722857209448822f,
+(float16_t)0.62005721176328920663f,(float16_t)0.78455659715557524159f,
+(float16_t)0.61523159058062681925f,(float16_t)0.78834642762660622761f,
+(float16_t)0.61038280627630947528f,(float16_t)0.79210657730021227785f,
+(float16_t)0.60551104140432554512f,(float16_t)0.79583690460888345530f,
+(float16_t)0.60061647938386897305f,(float16_t)0.79953726910790501314f,
+(float16_t)0.59569930449243346793f,(float16_t)0.80320753148064483184f,
+(float16_t)0.59075970185887427544f,(float16_t)0.80684755354379922299f,
+(float16_t)0.58579785745643886408f,(float16_t)0.81045719825259476821f,
+(float16_t)0.58081395809576452649f,(float16_t)0.81403632970594830276f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.57078074588696736669f,(float16_t)0.82110251499110464835f,
+(float16_t)0.56573181078361323149f,(float16_t)0.82458930278502529099f,
+(float16_t)0.56066157619733603124f,(float16_t)0.82804504525775579626f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.55045797293660481131f,(float16_t)0.83486287498638001026f,
+(float16_t)0.54532498842204646383f,(float16_t)0.83822470555483796772f,
+(float16_t)0.54017147272989296525f,(float16_t)0.84155497743689833268f,
+(float16_t)0.53499761988709726435f,(float16_t)0.84485356524970700587f,
+(float16_t)0.52980362468629482731f,(float16_t)0.84812034480329712149f,
+(float16_t)0.52458968267846883826f,(float16_t)0.85135519310526519554f,
+(float16_t)0.51935599016558953167f,(float16_t)0.85455798836540053376f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.50883014254310698909f,(float16_t)0.86086693863776730939f,
+(float16_t)0.50353838372571757542f,(float16_t)0.86397285612158669643f,
+(float16_t)0.49822766697278186854f,(float16_t)0.86704624551569264845f,
+(float16_t)0.49289819222978409341f,(float16_t)0.87008699110871134952f,
+(float16_t)0.48755016014843605143f,(float16_t)0.87309497841829009079f,
+(float16_t)0.48218377207912282989f,(float16_t)0.87607009419540660122f,
+(float16_t)0.47679923006332225466f,(float16_t)0.87901222642863341417f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.46597649576796612569f,(float16_t)0.88479709843093778954f,
+(float16_t)0.46053871095824000514f,(float16_t)0.88763962040285393496f,
+(float16_t)0.45508358712634383592f,(float16_t)0.89044872324475787817f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.44412214457042925586f,(float16_t)0.89596624975618510689f,
+(float16_t)0.43861623853852771404f,(float16_t)0.89867446569395381673f,
+(float16_t)0.43309381885315201277f,(float16_t)0.90134884704602202810f,
+(float16_t)0.42755509343028219593f,(float16_t)0.90398929312344333820f,
+(float16_t)0.42200027079979979261f,(float16_t)0.90659570451491533483f,
+(float16_t)0.41642956009763731906f,(float16_t)0.90916798309052226923f,
+(float16_t)0.41084317105790391089f,(float16_t)0.91170603200542987832f,
+(float16_t)0.40524131400498986100f,(float16_t)0.91420975570353069095f,
+(float16_t)0.39962419984564678810f,(float16_t)0.91667905992104270485f,
+(float16_t)0.39399204006104809883f,(float16_t)0.91911385169005777040f,
+(float16_t)0.38834504669882630168f,(float16_t)0.92151403934204190183f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.37700741021641831496f,(float16_t)0.92621024213831126826f,
+(float16_t)0.37131719395183759858f,(float16_t)0.92850608047321558924f,
+(float16_t)0.36561299780477396482f,(float16_t)0.93076696107898371224f,
+(float16_t)0.35989503653498827740f,(float16_t)0.93299279883473884567f,
+(float16_t)0.35416352542049051033f,(float16_t)0.93518350993894749923f,
+(float16_t)0.34841868024943450921f,(float16_t)0.93733901191257495977f,
+(float16_t)0.34266071731199437833f,(float16_t)0.93945922360218991898f,
+(float16_t)0.33688985339222005111f,(float16_t)0.94154406518302080631f,
+(float16_t)0.33110630575987642921f,(float16_t)0.94359345816196038559f,
+(float16_t)0.32531029216226298173f,(float16_t)0.94560732538052127971f,
+(float16_t)0.31950203081601574739f,(float16_t)0.94758559101774109124f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.30784964004153497763f,(float16_t)0.95143502096900833820f,
+(float16_t)0.30200594931922819519f,(float16_t)0.95330604035419375109f,
+(float16_t)0.29615088824362395536f,(float16_t)0.95514116830577067141f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.28440753721127182141f,(float16_t)0.95870347489587159906f,
+(float16_t)0.27851968938505305973f,(float16_t)0.96043051941556578655f,
+(float16_t)0.27262135544994897662f,(float16_t)0.96212140426904158019f,
+(float16_t)0.26671275747489842090f,(float16_t)0.96377606579543984022f,
+(float16_t)0.26079411791527556952f,(float16_t)0.96539444169768939830f,
+(float16_t)0.25486565960451462720f,(float16_t)0.96697647104485207059f,
+(float16_t)0.24892760574572025956f,(float16_t)0.96852209427441726675f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.23702360599436733679f,(float16_t)0.97150389098625178352f,
+(float16_t)0.23105810828067127605f,(float16_t)0.97293995220556006576f,
+(float16_t)0.22508391135979277653f,(float16_t)0.97433938278557585821f,
+(float16_t)0.21910124015686976984f,(float16_t)0.97570213003852857003f,
+(float16_t)0.21311031991609136194f,(float16_t)0.97702814265775439484f,
+(float16_t)0.20711137619221856032f,(float16_t)0.97831737071962765473f,
+(float16_t)0.20110463484209195606f,(float16_t)0.97956976568544051887f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.18906866414980627589f,(float16_t)0.98196386910955524296f,
+(float16_t)0.18303988795514106180f,(float16_t)0.98310548743121628501f,
+(float16_t)0.17700422041214886049f,(float16_t)0.98421009238692902521f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.16491312048997008866f,(float16_t)0.98630809724459866938f,
+(float16_t)0.15885814333386139019f,(float16_t)0.98730141815785843473f,
+(float16_t)0.15279718525844340760f,(float16_t)0.98825756773074946437f,
+(float16_t)0.14673047445536174793f,(float16_t)0.98917650996478101444f,
+(float16_t)0.14065823933284923863f,(float16_t)0.99005821026229712256f,
+(float16_t)0.13458070850712622324f,(float16_t)0.99090263542778000971f,
+(float16_t)0.12849811079379322432f,(float16_t)0.99170975366909952520f,
+(float16_t)0.12241067519921627893f,(float16_t)0.99247953459870996706f,
+(float16_t)0.11631863091190487725f,(float16_t)0.99321194923479450001f,
+(float16_t)0.11022220729388318428f,(float16_t)0.99390697000235606051f,
+(float16_t)0.10412163387205472520f,(float16_t)0.99456457073425541537f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.09190895649713269611f,(float16_t)0.99576741446765981713f,
+(float16_t)0.08579731234443987997f,(float16_t)0.99631261218277800129f,
+(float16_t)0.07968243797143012563f,(float16_t)0.99682029929116566791f,
+(float16_t)0.07356456359966745406f,(float16_t)0.99729045667869020697f,
+(float16_t)0.06744391956366410645f,(float16_t)0.99772306664419163624f,
+(float16_t)0.06132073630220864768f,(float16_t)0.99811811290014917919f,
+(float16_t)0.05519524434969003135f,(float16_t)0.99847558057329477421f,
+(float16_t)0.04906767432741812596f,(float16_t)0.99879545620517240501f,
+(float16_t)0.04293825693494095902f,(float16_t)0.99907772775264536147f,
+(float16_t)0.03680722294135899131f,(float16_t)0.99932238458834954375f,
+(float16_t)0.03067480317663658085f,(float16_t)0.99952941750109314256f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)0.01840672990580482019f,(float16_t)0.99983058179582340319f,
+(float16_t)0.01227153828571994447f,(float16_t)0.99992470183914450299f,
+(float16_t)0.00613588464915451517f,(float16_t)0.99998117528260110909f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99969881869620424997f,(float16_t)0.02454122852291228812f,
+(float16_t)0.99879545620517240501f,(float16_t)0.04906767432741801493f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.99247953459870996706f,(float16_t)0.12241067519921619566f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98527764238894122162f,(float16_t)0.17096188876030121717f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.97003125319454397424f,(float16_t)0.24298017990326387094f,
+(float16_t)0.96377606579543984022f,(float16_t)0.26671275747489836538f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.94952818059303667475f,(float16_t)0.31368174039889151761f,
+(float16_t)0.94154406518302080631f,(float16_t)0.33688985339222005111f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.91420975570353069095f,(float16_t)0.40524131400498986100f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.89322430119551532446f,(float16_t)0.44961132965460653965f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.85772861000027211809f,(float16_t)0.51410274419322166128f,
+(float16_t)0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.81758481315158371139f,(float16_t)0.57580819141784533866f,
+(float16_t)0.80320753148064494287f,(float16_t)0.59569930449243335691f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.75720884650648456748f,(float16_t)0.65317284295377675551f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.72424708295146700276f,(float16_t)0.68954054473706682948f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.67155895484701833009f,(float16_t)0.74095112535495910588f,
+(float16_t)0.65317284295377686654f,(float16_t)0.75720884650648456748f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.61523159058062681925f,(float16_t)0.78834642762660622761f,
+(float16_t)0.59569930449243346793f,(float16_t)0.80320753148064483184f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.53499761988709726435f,(float16_t)0.84485356524970700587f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.49289819222978409341f,(float16_t)0.87008699110871134952f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.42755509343028219593f,(float16_t)0.90398929312344333820f,
+(float16_t)0.40524131400498986100f,(float16_t)0.91420975570353069095f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.35989503653498827740f,(float16_t)0.93299279883473884567f,
+(float16_t)0.33688985339222005111f,(float16_t)0.94154406518302080631f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.26671275747489842090f,(float16_t)0.96377606579543984022f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.21910124015686976984f,(float16_t)0.97570213003852857003f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.14673047445536174793f,(float16_t)0.98917650996478101444f,
+(float16_t)0.12241067519921627893f,(float16_t)0.99247953459870996706f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.07356456359966745406f,(float16_t)0.99729045667869020697f,
+(float16_t)0.04906767432741812596f,(float16_t)0.99879545620517240501f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,};
+
+float16_t rearranged_twiddle_stride2_4096_f16[2728]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99999529380957619118f,(float16_t)0.00306795676296597614f,
+(float16_t)0.99998117528260110909f,(float16_t)0.00613588464915447527f,
+(float16_t)0.99995764455196389786f,(float16_t)0.00920375478205981944f,
+(float16_t)0.99992470183914450299f,(float16_t)0.01227153828571992539f,
+(float16_t)0.99988234745421256111f,(float16_t)0.01533920628498810015f,
+(float16_t)0.99983058179582340319f,(float16_t)0.01840672990580482019f,
+(float16_t)0.99976940535121527898f,(float16_t)0.02147408027546950787f,
+(float16_t)0.99969881869620424997f,(float16_t)0.02454122852291228812f,
+(float16_t)0.99961882249517863830f,(float16_t)0.02760814577896573974f,
+(float16_t)0.99952941750109314256f,(float16_t)0.03067480317663662595f,
+(float16_t)0.99943060455546173237f,(float16_t)0.03374117185137757990f,
+(float16_t)0.99932238458834954375f,(float16_t)0.03680722294135883171f,
+(float16_t)0.99920475861836388631f,(float16_t)0.03987292758773981066f,
+(float16_t)0.99907772775264536147f,(float16_t)0.04293825693494082024f,
+(float16_t)0.99894129318685687124f,(float16_t)0.04600318213091462299f,
+(float16_t)0.99879545620517240501f,(float16_t)0.04906767432741801493f,
+(float16_t)0.99864021818026527111f,(float16_t)0.05213170468028332366f,
+(float16_t)0.99847558057329477421f,(float16_t)0.05519524434968993420f,
+(float16_t)0.99830154493389289261f,(float16_t)0.05825826450043575244f,
+(float16_t)0.99811811290014917919f,(float16_t)0.06132073630220857829f,
+(float16_t)0.99792528619859599548f,(float16_t)0.06438263092985746505f,
+(float16_t)0.99772306664419163624f,(float16_t)0.06744391956366405094f,
+(float16_t)0.99751145614030345410f,(float16_t)0.07050457338961385600f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.99706007033948296225f,(float16_t)0.07662386139203149205f,
+(float16_t)0.99682029929116566791f,(float16_t)0.07968243797143012563f,
+(float16_t)0.99657114579055483539f,(float16_t)0.08274026454937569164f,
+(float16_t)0.99631261218277800129f,(float16_t)0.08579731234443989385f,
+(float16_t)0.99604470090125196702f,(float16_t)0.08885355258252460031f,
+(float16_t)0.99576741446765981713f,(float16_t)0.09190895649713272386f,
+(float16_t)0.99548075549192693856f,(float16_t)0.09496349532963899165f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.99487933079480561638f,(float16_t)0.10106986275482782167f,
+(float16_t)0.99456457073425541537f,(float16_t)0.10412163387205458642f,
+(float16_t)0.99424044945318790223f,(float16_t)0.10717242495680884273f,
+(float16_t)0.99390697000235606051f,(float16_t)0.11022220729388305938f,
+(float16_t)0.99356413552059530403f,(float16_t)0.11327095217756434631f,
+(float16_t)0.99321194923479450001f,(float16_t)0.11631863091190475235f,
+(float16_t)0.99285041445986510489f,(float16_t)0.11936521481099135467f,
+(float16_t)0.99247953459870996706f,(float16_t)0.12241067519921619566f,
+(float16_t)0.99209931314219179654f,(float16_t)0.12545498341154623367f,
+(float16_t)0.99170975366909952520f,(float16_t)0.12849811079379316880f,
+(float16_t)0.99131085984611544415f,(float16_t)0.13154002870288311611f,
+(float16_t)0.99090263542778000971f,(float16_t)0.13458070850712616773f,
+(float16_t)0.99048508425645709341f,(float16_t)0.13762012158648603832f,
+(float16_t)0.99005821026229712256f,(float16_t)0.14065823933284921088f,
+(float16_t)0.98962201746320088702f,(float16_t)0.14369503315029447110f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98872169196032377858f,(float16_t)0.14976453467732150915f,
+(float16_t)0.98825756773074946437f,(float16_t)0.15279718525844343535f,
+(float16_t)0.98778414164457217783f,(float16_t)0.15582839765426523271f,
+(float16_t)0.98730141815785843473f,(float16_t)0.15885814333386144570f,
+(float16_t)0.98680940181418552726f,(float16_t)0.16188639378011182579f,
+(float16_t)0.98630809724459866938f,(float16_t)0.16491312048996989437f,
+(float16_t)0.98579750916756747614f,(float16_t)0.16793829497473117263f,
+(float16_t)0.98527764238894122162f,(float16_t)0.17096188876030121717f,
+(float16_t)0.98474850180190420801f,(float16_t)0.17398387338746382214f,
+(float16_t)0.98421009238692902521f,(float16_t)0.17700422041214874946f,
+(float16_t)0.98366241921173025453f,(float16_t)0.18002290140569951471f,
+(float16_t)0.98310548743121628501f,(float16_t)0.18303988795514095078f,
+(float16_t)0.98253930228744124076f,(float16_t)0.18605515166344663291f,
+(float16_t)0.98196386910955524296f,(float16_t)0.18906866414980619262f,
+(float16_t)0.98137919331375456089f,(float16_t)0.19208039704989243734f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.98018213596811742949f,(float16_t)0.19809841071795356027f,
+(float16_t)0.97956976568544051887f,(float16_t)0.20110463484209190055f,
+(float16_t)0.97894817531906219710f,(float16_t)0.20410896609281686809f,
+(float16_t)0.97831737071962765473f,(float16_t)0.20711137619221856032f,
+(float16_t)0.97767735782450992943f,(float16_t)0.21011183688046961016f,
+(float16_t)0.97702814265775439484f,(float16_t)0.21311031991609136194f,
+(float16_t)0.97636973133002114000f,(float16_t)0.21610679707621952006f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.97502534506699412020f,(float16_t)0.22209362097320350937f,
+(float16_t)0.97433938278557585821f,(float16_t)0.22508391135979283204f,
+(float16_t)0.97364424965081197705f,(float16_t)0.22807208317088573102f,
+(float16_t)0.97293995220556017678f,(float16_t)0.23105810828067110951f,
+(float16_t)0.97222649707893626925f,(float16_t)0.23404195858354343018f,
+(float16_t)0.97150389098625178352f,(float16_t)0.23702360599436719801f,
+(float16_t)0.97077214072895035013f,(float16_t)0.24000302244874149871f,
+(float16_t)0.97003125319454397424f,(float16_t)0.24298017990326387094f,
+(float16_t)0.96928123535654853171f,(float16_t)0.24595505033579459497f,
+(float16_t)0.96852209427441737777f,(float16_t)0.24892760574572014853f,
+(float16_t)0.96775383709347551076f,(float16_t)0.25189781815421696809f,
+(float16_t)0.96697647104485207059f,(float16_t)0.25486565960451457169f,
+(float16_t)0.96619000344541250413f,(float16_t)0.25783110216215898713f,
+(float16_t)0.96539444169768939830f,(float16_t)0.26079411791527551401f,
+(float16_t)0.96458979328981275803f,(float16_t)0.26375467897483134694f,
+(float16_t)0.96377606579543984022f,(float16_t)0.26671275747489836538f,
+(float16_t)0.96295326687368387741f,(float16_t)0.26966832557291509076f,
+(float16_t)0.96212140426904158019f,(float16_t)0.27262135544994897662f,
+(float16_t)0.96128048581132063966f,(float16_t)0.27557181931095814376f,
+(float16_t)0.96043051941556578655f,(float16_t)0.27851968938505305973f,
+(float16_t)0.95957151308198451733f,(float16_t)0.28146493792575794091f,
+(float16_t)0.95870347489587159906f,(float16_t)0.28440753721127187692f,
+(float16_t)0.95782641302753290802f,(float16_t)0.28734745954472951102f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.95604525134999640557f,(float16_t)0.29321916269425862822f,
+(float16_t)0.95514116830577078243f,(float16_t)0.29615088824362378883f,
+(float16_t)0.95422809510910566733f,(float16_t)0.29907982630804047508f,
+(float16_t)0.95330604035419386211f,(float16_t)0.30200594931922808417f,
+(float16_t)0.95237501271976587880f,(float16_t)0.30492922973540237397f,
+(float16_t)0.95143502096900833820f,(float16_t)0.30784964004153486661f,
+(float16_t)0.95048607394948170235f,(float16_t)0.31076715274961147495f,
+(float16_t)0.94952818059303667475f,(float16_t)0.31368174039889151761f,
+(float16_t)0.94856134991573026749f,(float16_t)0.31659337555616584581f,
+(float16_t)0.94758559101774109124f,(float16_t)0.31950203081601569188f,
+(float16_t)0.94660091308328353499f,(float16_t)0.32240767880106985244f,
+(float16_t)0.94560732538052127971f,(float16_t)0.32531029216226292622f,
+(float16_t)0.94460483726148025685f,(float16_t)0.32820984357909249729f,
+(float16_t)0.94359345816196038559f,(float16_t)0.33110630575987642921f,
+(float16_t)0.94257319760144686605f,(float16_t)0.33399965144200938205f,
+(float16_t)0.94154406518302080631f,(float16_t)0.33688985339222005111f,
+(float16_t)0.94050607059326829518f,(float16_t)0.33977688440682685123f,
+(float16_t)0.93945922360218991898f,(float16_t)0.34266071731199437833f,
+(float16_t)0.93840353406310805795f,(float16_t)0.34554132496398909380f,
+(float16_t)0.93733901191257495977f,(float16_t)0.34841868024943456472f,
+(float16_t)0.93626566717027825959f,(float16_t)0.35129275608556709276f,
+(float16_t)0.93518350993894761025f,(float16_t)0.35416352542049034380f,
+(float16_t)0.93409255040425887007f,(float16_t)0.35703096123342997759f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.93188426558166814750f,(float16_t)0.36275572436739722537f,
+(float16_t)0.93076696107898371224f,(float16_t)0.36561299780477385379f,
+(float16_t)0.92964089584318121418f,(float16_t)0.36846682995337232125f,
+(float16_t)0.92850608047321558924f,(float16_t)0.37131719395183754306f,
+(float16_t)0.92736252565040111495f,(float16_t)0.37416406297145793358f,
+(float16_t)0.92621024213831137928f,(float16_t)0.37700741021641825945f,
+(float16_t)0.92504924078267758425f,(float16_t)0.37984720892405116066f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.92270112833387862850f,(float16_t)0.38551605384391884890f,
+(float16_t)0.92151403934204190183f,(float16_t)0.38834504669882624617f,
+(float16_t)0.92031827670911059425f,(float16_t)0.39117038430225387069f,
+(float16_t)0.91911385169005777040f,(float16_t)0.39399204006104809883f,
+(float16_t)0.91790077562139049672f,(float16_t)0.39680998741671030805f,
+(float16_t)0.91667905992104270485f,(float16_t)0.39962419984564678810f,
+(float16_t)0.91544871608826783316f,(float16_t)0.40243465085941843018f,
+(float16_t)0.91420975570353069095f,(float16_t)0.40524131400498986100f,
+(float16_t)0.91296219042839821256f,(float16_t)0.40804416286497868782f,
+(float16_t)0.91170603200542987832f,(float16_t)0.41084317105790391089f,
+(float16_t)0.91044129225806724737f,(float16_t)0.41363831223843450235f,
+(float16_t)0.90916798309052238025f,(float16_t)0.41642956009763715253f,
+(float16_t)0.90788611648766626150f,(float16_t)0.41921688836322390515f,
+(float16_t)0.90659570451491533483f,(float16_t)0.42200027079979968159f,
+(float16_t)0.90529675931811881551f,(float16_t)0.42477968120910880589f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.90267331823725882600f,(float16_t)0.43032648134008261165f,
+(float16_t)0.90134884704602202810f,(float16_t)0.43309381885315195726f,
+(float16_t)0.90001589201616016833f,(float16_t)0.43585707992225547480f,
+(float16_t)0.89867446569395381673f,(float16_t)0.43861623853852765853f,
+(float16_t)0.89732458070541831763f,(float16_t)0.44137126873171667052f,
+(float16_t)0.89596624975618521791f,(float16_t)0.44412214457042920035f,
+(float16_t)0.89459948563138269595f,(float16_t)0.44686884016237415906f,
+(float16_t)0.89322430119551532446f,(float16_t)0.44961132965460653965f,
+(float16_t)0.89184070939234272313f,(float16_t)0.45234958723377088896f,
+(float16_t)0.89044872324475787817f,(float16_t)0.45508358712634383592f,
+(float16_t)0.88904835585466457371f,(float16_t)0.45781330359887717485f,
+(float16_t)0.88763962040285393496f,(float16_t)0.46053871095824000514f,
+(float16_t)0.88622253014888063838f,(float16_t)0.46325978355186014923f,
+(float16_t)0.88479709843093778954f,(float16_t)0.46597649576796618121f,
+(float16_t)0.88336333866573157891f,(float16_t)0.46868882203582790114f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.88047088905216075450f,(float16_t)0.47410021465054996703f,
+(float16_t)0.87901222642863352519f,(float16_t)0.47679923006332208812f,
+(float16_t)0.87754529020726135258f,(float16_t)0.47949375766015295275f,
+(float16_t)0.87607009419540660122f,(float16_t)0.48218377207912271887f,
+(float16_t)0.87458665227817611321f,(float16_t)0.48486924800079106435f,
+(float16_t)0.87309497841829009079f,(float16_t)0.48755016014843599592f,
+(float16_t)0.87159508665595097909f,(float16_t)0.49022648328829115938f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.86857070597134089507f,(float16_t)0.49556526182577254058f,
+(float16_t)0.86704624551569264845f,(float16_t)0.49822766697278181303f,
+(float16_t)0.86551362409056908920f,(float16_t)0.50088538261124071482f,
+(float16_t)0.86397285612158669643f,(float16_t)0.50353838372571757542f,
+(float16_t)0.86242395611104050168f,(float16_t)0.50618664534515522835f,
+(float16_t)0.86086693863776730939f,(float16_t)0.50883014254310698909f,
+(float16_t)0.85930181835700847337f,(float16_t)0.51146885043797030157f,
+(float16_t)0.85772861000027211809f,(float16_t)0.51410274419322166128f,
+(float16_t)0.85614732837519447184f,(float16_t)0.51673179901764987321f,
+(float16_t)0.85455798836540053376f,(float16_t)0.51935599016558964269f,
+(float16_t)0.85296060493036363059f,(float16_t)0.52197529293715438925f,
+(float16_t)0.85135519310526519554f,(float16_t)0.52458968267846894928f,
+(float16_t)0.84974176800085254868f,(float16_t)0.52719913478190127964f,
+(float16_t)0.84812034480329723252f,(float16_t)0.52980362468629460526f,
+(float16_t)0.84649093877405212627f,(float16_t)0.53240312787719790144f,
+(float16_t)0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)0.84320823964184543620f,(float16_t)0.53758707629564539410f,
+(float16_t)0.84155497743689844370f,(float16_t)0.54017147272989285423f,
+(float16_t)0.83989379419599952126f,(float16_t)0.54275078486451588944f,
+(float16_t)0.83822470555483807875f,(float16_t)0.54532498842204646383f,
+(float16_t)0.83654772722351200542f,(float16_t)0.54789405917310018967f,
+(float16_t)0.83486287498638001026f,(float16_t)0.55045797293660481131f,
+(float16_t)0.83317016470191318511f,(float16_t)0.55301670558002746780f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.82976123379452304540f,(float16_t)0.55811853122055610221f,
+(float16_t)0.82804504525775579626f,(float16_t)0.56066157619733603124f,
+(float16_t)0.82632106284566353427f,(float16_t)0.56319934401383409117f,
+(float16_t)0.82458930278502529099f,(float16_t)0.56573181078361312046f,
+(float16_t)0.82284978137582642788f,(float16_t)0.56825895267013148970f,
+(float16_t)0.82110251499110464835f,(float16_t)0.57078074588696725566f,
+(float16_t)0.81934752007679700903f,(float16_t)0.57329716669804220430f,
+(float16_t)0.81758481315158371139f,(float16_t)0.57580819141784533866f,
+(float16_t)0.81581441080673378075f,(float16_t)0.57831379641165558958f,
+(float16_t)0.81403632970594841378f,(float16_t)0.58081395809576452649f,
+(float16_t)0.81225058658520399302f,(float16_t)0.58330865293769829094f,
+(float16_t)0.81045719825259476821f,(float16_t)0.58579785745643886408f,
+(float16_t)0.80865618158817498262f,(float16_t)0.58828154822264522306f,
+(float16_t)0.80684755354379933401f,(float16_t)0.59075970185887416442f,
+(float16_t)0.80503133114296365758f,(float16_t)0.59323229503979979516f,
+(float16_t)0.80320753148064494287f,(float16_t)0.59569930449243335691f,
+(float16_t)0.80137617172314024039f,(float16_t)0.59816070699634238395f,
+(float16_t)0.79953726910790501314f,(float16_t)0.60061647938386897305f,
+(float16_t)0.79769084094339115509f,(float16_t)0.60306659854034816437f,
+(float16_t)0.79583690460888356633f,(float16_t)0.60551104140432554512f,
+(float16_t)0.79397547755433717231f,(float16_t)0.60794978496777363208f,
+(float16_t)0.79210657730021238887f,(float16_t)0.61038280627630947528f,
+(float16_t)0.79023022143731003197f,(float16_t)0.61281008242940970820f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.78645521359908576731f,(float16_t)0.61764730793780386886f,
+(float16_t)0.78455659715557524159f,(float16_t)0.62005721176328909561f,
+(float16_t)0.78265059616657572938f,(float16_t)0.62246127937414996723f,
+(float16_t)0.78073722857209448822f,(float16_t)0.62485948814238634341f,
+(float16_t)0.77881651238147597827f,(float16_t)0.62725181549514408275f,
+(float16_t)0.77688846567323244230f,(float16_t)0.62963823891492698426f,
+(float16_t)0.77495310659487393057f,(float16_t)0.63201873593980906207f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.77106052426181381776f,(float16_t)0.63676186123628419899f,
+(float16_t)0.76910333764557969882f,(float16_t)0.63912444486377573138f,
+(float16_t)0.76713891193582040007f,(float16_t)0.64148101280858305095f,
+(float16_t)0.76516726562245895860f,(float16_t)0.64383154288979138613f,
+(float16_t)0.76318841726338138010f,(float16_t)0.64617601298331628357f,
+(float16_t)0.76120238548426177871f,(float16_t)0.64851440102211244110f,
+(float16_t)0.75920918897838796102f,(float16_t)0.65084668499638087535f,
+(float16_t)0.75720884650648456748f,(float16_t)0.65317284295377675551f,
+(float16_t)0.75520137689653654700f,(float16_t)0.65549285299961534967f,
+(float16_t)0.75318679904361252042f,(float16_t)0.65780669329707863735f,
+(float16_t)0.75116513190968636771f,(float16_t)0.66011434206742047870f,
+(float16_t)0.74913639452345937020f,(float16_t)0.66241577759017178373f,
+(float16_t)0.74710060598018013245f,(float16_t)0.66471097820334479334f,
+(float16_t)0.74505778544146594733f,(float16_t)0.66699992230363747137f,
+(float16_t)0.74300795213512171866f,(float16_t)0.66928258834663600929f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.73888732446061511361f,(float16_t)0.67382900037875603783f,
+(float16_t)0.73681656887736979300f,(float16_t)0.67609270357531592310f,
+(float16_t)0.73473887809596349907f,(float16_t)0.67835004312986146857f,
+(float16_t)0.73265427167241281570f,(float16_t)0.68060099779545302212f,
+(float16_t)0.73056276922782759087f,(float16_t)0.68284554638524808112f,
+(float16_t)0.72846439044822519637f,(float16_t)0.68508366777270035541f,
+(float16_t)0.72635915508434600873f,(float16_t)0.68731534089175905233f,
+(float16_t)0.72424708295146700276f,(float16_t)0.68954054473706682948f,
+(float16_t)0.72212819392921534511f,(float16_t)0.69175925836415774750f,
+(float16_t)0.72000250796138165477f,(float16_t)0.69397146088965389055f,
+(float16_t)0.71787004505573170920f,(float16_t)0.69617713149146298601f,
+(float16_t)0.71573082528381870571f,(float16_t)0.69837624940897280457f,
+(float16_t)0.71358486878079352422f,(float16_t)0.70056879394324833576f,
+(float16_t)0.71143219574521643356f,(float16_t)0.70275474445722529993f,
+(float16_t)0.70927282643886568891f,(float16_t)0.70493408037590488124f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.70493408037590499227f,(float16_t)0.70927282643886568891f,
+(float16_t)0.70275474445722529993f,(float16_t)0.71143219574521643356f,
+(float16_t)0.70056879394324844679f,(float16_t)0.71358486878079352422f,
+(float16_t)0.69837624940897291559f,(float16_t)0.71573082528381859468f,
+(float16_t)0.69617713149146298601f,(float16_t)0.71787004505573170920f,
+(float16_t)0.69397146088965400157f,(float16_t)0.72000250796138165477f,
+(float16_t)0.69175925836415774750f,(float16_t)0.72212819392921534511f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.68731534089175905233f,(float16_t)0.72635915508434600873f,
+(float16_t)0.68508366777270035541f,(float16_t)0.72846439044822519637f,
+(float16_t)0.68284554638524808112f,(float16_t)0.73056276922782759087f,
+(float16_t)0.68060099779545302212f,(float16_t)0.73265427167241281570f,
+(float16_t)0.67835004312986146857f,(float16_t)0.73473887809596349907f,
+(float16_t)0.67609270357531603413f,(float16_t)0.73681656887736979300f,
+(float16_t)0.67382900037875614885f,(float16_t)0.73888732446061511361f,
+(float16_t)0.67155895484701833009f,(float16_t)0.74095112535495910588f,
+(float16_t)0.66928258834663600929f,(float16_t)0.74300795213512171866f,
+(float16_t)0.66699992230363747137f,(float16_t)0.74505778544146594733f,
+(float16_t)0.66471097820334490436f,(float16_t)0.74710060598018013245f,
+(float16_t)0.66241577759017178373f,(float16_t)0.74913639452345925918f,
+(float16_t)0.66011434206742047870f,(float16_t)0.75116513190968636771f,
+(float16_t)0.65780669329707874837f,(float16_t)0.75318679904361252042f,
+(float16_t)0.65549285299961546070f,(float16_t)0.75520137689653654700f,
+(float16_t)0.65317284295377686654f,(float16_t)0.75720884650648456748f,
+(float16_t)0.65084668499638098638f,(float16_t)0.75920918897838796102f,
+(float16_t)0.64851440102211255212f,(float16_t)0.76120238548426177871f,
+(float16_t)0.64617601298331639459f,(float16_t)0.76318841726338126907f,
+(float16_t)0.64383154288979149715f,(float16_t)0.76516726562245895860f,
+(float16_t)0.64148101280858316198f,(float16_t)0.76713891193582040007f,
+(float16_t)0.63912444486377573138f,(float16_t)0.76910333764557958780f,
+(float16_t)0.63676186123628419899f,(float16_t)0.77106052426181381776f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.63201873593980906207f,(float16_t)0.77495310659487381955f,
+(float16_t)0.62963823891492709528f,(float16_t)0.77688846567323244230f,
+(float16_t)0.62725181549514419377f,(float16_t)0.77881651238147586724f,
+(float16_t)0.62485948814238645443f,(float16_t)0.78073722857209448822f,
+(float16_t)0.62246127937415007825f,(float16_t)0.78265059616657572938f,
+(float16_t)0.62005721176328920663f,(float16_t)0.78455659715557524159f,
+(float16_t)0.61764730793780397988f,(float16_t)0.78645521359908576731f,
+(float16_t)0.61523159058062681925f,(float16_t)0.78834642762660622761f,
+(float16_t)0.61281008242940970820f,(float16_t)0.79023022143731003197f,
+(float16_t)0.61038280627630947528f,(float16_t)0.79210657730021227785f,
+(float16_t)0.60794978496777374311f,(float16_t)0.79397547755433717231f,
+(float16_t)0.60551104140432554512f,(float16_t)0.79583690460888345530f,
+(float16_t)0.60306659854034827539f,(float16_t)0.79769084094339104407f,
+(float16_t)0.60061647938386897305f,(float16_t)0.79953726910790501314f,
+(float16_t)0.59816070699634238395f,(float16_t)0.80137617172314012937f,
+(float16_t)0.59569930449243346793f,(float16_t)0.80320753148064483184f,
+(float16_t)0.59323229503979979516f,(float16_t)0.80503133114296365758f,
+(float16_t)0.59075970185887427544f,(float16_t)0.80684755354379922299f,
+(float16_t)0.58828154822264533408f,(float16_t)0.80865618158817498262f,
+(float16_t)0.58579785745643886408f,(float16_t)0.81045719825259476821f,
+(float16_t)0.58330865293769829094f,(float16_t)0.81225058658520388200f,
+(float16_t)0.58081395809576452649f,(float16_t)0.81403632970594830276f,
+(float16_t)0.57831379641165558958f,(float16_t)0.81581441080673378075f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.57329716669804231532f,(float16_t)0.81934752007679689800f,
+(float16_t)0.57078074588696736669f,(float16_t)0.82110251499110464835f,
+(float16_t)0.56825895267013148970f,(float16_t)0.82284978137582631685f,
+(float16_t)0.56573181078361323149f,(float16_t)0.82458930278502529099f,
+(float16_t)0.56319934401383409117f,(float16_t)0.82632106284566353427f,
+(float16_t)0.56066157619733603124f,(float16_t)0.82804504525775579626f,
+(float16_t)0.55811853122055610221f,(float16_t)0.82976123379452304540f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.55301670558002757883f,(float16_t)0.83317016470191318511f,
+(float16_t)0.55045797293660481131f,(float16_t)0.83486287498638001026f,
+(float16_t)0.54789405917310018967f,(float16_t)0.83654772722351189440f,
+(float16_t)0.54532498842204646383f,(float16_t)0.83822470555483796772f,
+(float16_t)0.54275078486451600046f,(float16_t)0.83989379419599941023f,
+(float16_t)0.54017147272989296525f,(float16_t)0.84155497743689833268f,
+(float16_t)0.53758707629564550512f,(float16_t)0.84320823964184543620f,
+(float16_t)0.53499761988709726435f,(float16_t)0.84485356524970700587f,
+(float16_t)0.53240312787719801246f,(float16_t)0.84649093877405212627f,
+(float16_t)0.52980362468629482731f,(float16_t)0.84812034480329712149f,
+(float16_t)0.52719913478190139067f,(float16_t)0.84974176800085243766f,
+(float16_t)0.52458968267846883826f,(float16_t)0.85135519310526519554f,
+(float16_t)0.52197529293715438925f,(float16_t)0.85296060493036363059f,
+(float16_t)0.51935599016558953167f,(float16_t)0.85455798836540053376f,
+(float16_t)0.51673179901764998423f,(float16_t)0.85614732837519447184f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.51146885043797052361f,(float16_t)0.85930181835700836235f,
+(float16_t)0.50883014254310698909f,(float16_t)0.86086693863776730939f,
+(float16_t)0.50618664534515533937f,(float16_t)0.86242395611104050168f,
+(float16_t)0.50353838372571757542f,(float16_t)0.86397285612158669643f,
+(float16_t)0.50088538261124093687f,(float16_t)0.86551362409056897818f,
+(float16_t)0.49822766697278186854f,(float16_t)0.86704624551569264845f,
+(float16_t)0.49556526182577248507f,(float16_t)0.86857070597134089507f,
+(float16_t)0.49289819222978409341f,(float16_t)0.87008699110871134952f,
+(float16_t)0.49022648328829110387f,(float16_t)0.87159508665595109012f,
+(float16_t)0.48755016014843605143f,(float16_t)0.87309497841829009079f,
+(float16_t)0.48486924800079111986f,(float16_t)0.87458665227817611321f,
+(float16_t)0.48218377207912282989f,(float16_t)0.87607009419540660122f,
+(float16_t)0.47949375766015300826f,(float16_t)0.87754529020726124156f,
+(float16_t)0.47679923006332225466f,(float16_t)0.87901222642863341417f,
+(float16_t)0.47410021465055002254f,(float16_t)0.88047088905216075450f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.46868882203582795665f,(float16_t)0.88336333866573157891f,
+(float16_t)0.46597649576796612569f,(float16_t)0.88479709843093778954f,
+(float16_t)0.46325978355186026025f,(float16_t)0.88622253014888063838f,
+(float16_t)0.46053871095824000514f,(float16_t)0.88763962040285393496f,
+(float16_t)0.45781330359887728587f,(float16_t)0.88904835585466457371f,
+(float16_t)0.45508358712634383592f,(float16_t)0.89044872324475787817f,
+(float16_t)0.45234958723377099998f,(float16_t)0.89184070939234272313f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.44686884016237432560f,(float16_t)0.89459948563138258493f,
+(float16_t)0.44412214457042925586f,(float16_t)0.89596624975618510689f,
+(float16_t)0.44137126873171661501f,(float16_t)0.89732458070541831763f,
+(float16_t)0.43861623853852771404f,(float16_t)0.89867446569395381673f,
+(float16_t)0.43585707992225547480f,(float16_t)0.90001589201616027935f,
+(float16_t)0.43309381885315201277f,(float16_t)0.90134884704602202810f,
+(float16_t)0.43032648134008261165f,(float16_t)0.90267331823725882600f,
+(float16_t)0.42755509343028219593f,(float16_t)0.90398929312344333820f,
+(float16_t)0.42477968120910880589f,(float16_t)0.90529675931811881551f,
+(float16_t)0.42200027079979979261f,(float16_t)0.90659570451491533483f,
+(float16_t)0.41921688836322396066f,(float16_t)0.90788611648766626150f,
+(float16_t)0.41642956009763731906f,(float16_t)0.90916798309052226923f,
+(float16_t)0.41363831223843455787f,(float16_t)0.91044129225806713634f,
+(float16_t)0.41084317105790391089f,(float16_t)0.91170603200542987832f,
+(float16_t)0.40804416286497874333f,(float16_t)0.91296219042839810154f,
+(float16_t)0.40524131400498986100f,(float16_t)0.91420975570353069095f,
+(float16_t)0.40243465085941854120f,(float16_t)0.91544871608826783316f,
+(float16_t)0.39962419984564678810f,(float16_t)0.91667905992104270485f,
+(float16_t)0.39680998741671041907f,(float16_t)0.91790077562139038569f,
+(float16_t)0.39399204006104809883f,(float16_t)0.91911385169005777040f,
+(float16_t)0.39117038430225398171f,(float16_t)0.92031827670911048322f,
+(float16_t)0.38834504669882630168f,(float16_t)0.92151403934204190183f,
+(float16_t)0.38551605384391901543f,(float16_t)0.92270112833387851747f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.37984720892405110515f,(float16_t)0.92504924078267758425f,
+(float16_t)0.37700741021641831496f,(float16_t)0.92621024213831126826f,
+(float16_t)0.37416406297145798909f,(float16_t)0.92736252565040111495f,
+(float16_t)0.37131719395183759858f,(float16_t)0.92850608047321558924f,
+(float16_t)0.36846682995337232125f,(float16_t)0.92964089584318121418f,
+(float16_t)0.36561299780477396482f,(float16_t)0.93076696107898371224f,
+(float16_t)0.36275572436739722537f,(float16_t)0.93188426558166814750f,
+(float16_t)0.35989503653498827740f,(float16_t)0.93299279883473884567f,
+(float16_t)0.35703096123343003310f,(float16_t)0.93409255040425887007f,
+(float16_t)0.35416352542049051033f,(float16_t)0.93518350993894749923f,
+(float16_t)0.35129275608556714827f,(float16_t)0.93626566717027825959f,
+(float16_t)0.34841868024943450921f,(float16_t)0.93733901191257495977f,
+(float16_t)0.34554132496398914931f,(float16_t)0.93840353406310805795f,
+(float16_t)0.34266071731199437833f,(float16_t)0.93945922360218991898f,
+(float16_t)0.33977688440682696225f,(float16_t)0.94050607059326829518f,
+(float16_t)0.33688985339222005111f,(float16_t)0.94154406518302080631f,
+(float16_t)0.33399965144200949307f,(float16_t)0.94257319760144686605f,
+(float16_t)0.33110630575987642921f,(float16_t)0.94359345816196038559f,
+(float16_t)0.32820984357909266382f,(float16_t)0.94460483726148025685f,
+(float16_t)0.32531029216226298173f,(float16_t)0.94560732538052127971f,
+(float16_t)0.32240767880107001897f,(float16_t)0.94660091308328353499f,
+(float16_t)0.31950203081601574739f,(float16_t)0.94758559101774109124f,
+(float16_t)0.31659337555616584581f,(float16_t)0.94856134991573026749f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.31076715274961147495f,(float16_t)0.95048607394948170235f,
+(float16_t)0.30784964004153497763f,(float16_t)0.95143502096900833820f,
+(float16_t)0.30492922973540242948f,(float16_t)0.95237501271976587880f,
+(float16_t)0.30200594931922819519f,(float16_t)0.95330604035419375109f,
+(float16_t)0.29907982630804047508f,(float16_t)0.95422809510910566733f,
+(float16_t)0.29615088824362395536f,(float16_t)0.95514116830577067141f,
+(float16_t)0.29321916269425868373f,(float16_t)0.95604525134999640557f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.28734745954472956653f,(float16_t)0.95782641302753290802f,
+(float16_t)0.28440753721127182141f,(float16_t)0.95870347489587159906f,
+(float16_t)0.28146493792575805193f,(float16_t)0.95957151308198451733f,
+(float16_t)0.27851968938505305973f,(float16_t)0.96043051941556578655f,
+(float16_t)0.27557181931095825478f,(float16_t)0.96128048581132063966f,
+(float16_t)0.27262135544994897662f,(float16_t)0.96212140426904158019f,
+(float16_t)0.26966832557291520178f,(float16_t)0.96295326687368387741f,
+(float16_t)0.26671275747489842090f,(float16_t)0.96377606579543984022f,
+(float16_t)0.26375467897483151347f,(float16_t)0.96458979328981264700f,
+(float16_t)0.26079411791527556952f,(float16_t)0.96539444169768939830f,
+(float16_t)0.25783110216215893162f,(float16_t)0.96619000344541261516f,
+(float16_t)0.25486565960451462720f,(float16_t)0.96697647104485207059f,
+(float16_t)0.25189781815421691258f,(float16_t)0.96775383709347551076f,
+(float16_t)0.24892760574572025956f,(float16_t)0.96852209427441726675f,
+(float16_t)0.24595505033579459497f,(float16_t)0.96928123535654853171f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.24000302244874149871f,(float16_t)0.97077214072895035013f,
+(float16_t)0.23702360599436733679f,(float16_t)0.97150389098625178352f,
+(float16_t)0.23404195858354345794f,(float16_t)0.97222649707893626925f,
+(float16_t)0.23105810828067127605f,(float16_t)0.97293995220556006576f,
+(float16_t)0.22807208317088578653f,(float16_t)0.97364424965081186603f,
+(float16_t)0.22508391135979277653f,(float16_t)0.97433938278557585821f,
+(float16_t)0.22209362097320359264f,(float16_t)0.97502534506699412020f,
+(float16_t)0.21910124015686976984f,(float16_t)0.97570213003852857003f,
+(float16_t)0.21610679707621960333f,(float16_t)0.97636973133002114000f,
+(float16_t)0.21311031991609136194f,(float16_t)0.97702814265775439484f,
+(float16_t)0.21011183688046972118f,(float16_t)0.97767735782450992943f,
+(float16_t)0.20711137619221856032f,(float16_t)0.97831737071962765473f,
+(float16_t)0.20410896609281700687f,(float16_t)0.97894817531906219710f,
+(float16_t)0.20110463484209195606f,(float16_t)0.97956976568544051887f,
+(float16_t)0.19809841071795372680f,(float16_t)0.98018213596811731847f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.19208039704989238183f,(float16_t)0.98137919331375456089f,
+(float16_t)0.18906866414980627589f,(float16_t)0.98196386910955524296f,
+(float16_t)0.18605515166344663291f,(float16_t)0.98253930228744124076f,
+(float16_t)0.18303988795514106180f,(float16_t)0.98310548743121628501f,
+(float16_t)0.18002290140569951471f,(float16_t)0.98366241921173025453f,
+(float16_t)0.17700422041214886049f,(float16_t)0.98421009238692902521f,
+(float16_t)0.17398387338746384989f,(float16_t)0.98474850180190420801f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.16793829497473122814f,(float16_t)0.98579750916756736512f,
+(float16_t)0.16491312048997008866f,(float16_t)0.98630809724459866938f,
+(float16_t)0.16188639378011188130f,(float16_t)0.98680940181418541624f,
+(float16_t)0.15885814333386139019f,(float16_t)0.98730141815785843473f,
+(float16_t)0.15582839765426531597f,(float16_t)0.98778414164457217783f,
+(float16_t)0.15279718525844340760f,(float16_t)0.98825756773074946437f,
+(float16_t)0.14976453467732162017f,(float16_t)0.98872169196032377858f,
+(float16_t)0.14673047445536174793f,(float16_t)0.98917650996478101444f,
+(float16_t)0.14369503315029458212f,(float16_t)0.98962201746320077600f,
+(float16_t)0.14065823933284923863f,(float16_t)0.99005821026229712256f,
+(float16_t)0.13762012158648617710f,(float16_t)0.99048508425645698239f,
+(float16_t)0.13458070850712622324f,(float16_t)0.99090263542778000971f,
+(float16_t)0.13154002870288328264f,(float16_t)0.99131085984611544415f,
+(float16_t)0.12849811079379322432f,(float16_t)0.99170975366909952520f,
+(float16_t)0.12545498341154620592f,(float16_t)0.99209931314219179654f,
+(float16_t)0.12241067519921627893f,(float16_t)0.99247953459870996706f,
+(float16_t)0.11936521481099135467f,(float16_t)0.99285041445986510489f,
+(float16_t)0.11631863091190487725f,(float16_t)0.99321194923479450001f,
+(float16_t)0.11327095217756436019f,(float16_t)0.99356413552059530403f,
+(float16_t)0.11022220729388318428f,(float16_t)0.99390697000235606051f,
+(float16_t)0.10717242495680887049f,(float16_t)0.99424044945318790223f,
+(float16_t)0.10412163387205472520f,(float16_t)0.99456457073425541537f,
+(float16_t)0.10106986275482787718f,(float16_t)0.99487933079480561638f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.09496349532963906104f,(float16_t)0.99548075549192693856f,
+(float16_t)0.09190895649713269611f,(float16_t)0.99576741446765981713f,
+(float16_t)0.08885355258252468358f,(float16_t)0.99604470090125196702f,
+(float16_t)0.08579731234443987997f,(float16_t)0.99631261218277800129f,
+(float16_t)0.08274026454937580266f,(float16_t)0.99657114579055483539f,
+(float16_t)0.07968243797143012563f,(float16_t)0.99682029929116566791f,
+(float16_t)0.07662386139203161695f,(float16_t)0.99706007033948296225f,
+(float16_t)0.07356456359966745406f,(float16_t)0.99729045667869020697f,
+(float16_t)0.07050457338961400866f,(float16_t)0.99751145614030345410f,
+(float16_t)0.06744391956366410645f,(float16_t)0.99772306664419163624f,
+(float16_t)0.06438263092985740954f,(float16_t)0.99792528619859599548f,
+(float16_t)0.06132073630220864768f,(float16_t)0.99811811290014917919f,
+(float16_t)0.05825826450043573163f,(float16_t)0.99830154493389289261f,
+(float16_t)0.05519524434969003135f,(float16_t)0.99847558057329477421f,
+(float16_t)0.05213170468028331672f,(float16_t)0.99864021818026527111f,
+(float16_t)0.04906767432741812596f,(float16_t)0.99879545620517240501f,
+(float16_t)0.04600318213091464381f,(float16_t)0.99894129318685687124f,
+(float16_t)0.04293825693494095902f,(float16_t)0.99907772775264536147f,
+(float16_t)0.03987292758773984536f,(float16_t)0.99920475861836388631f,
+(float16_t)0.03680722294135899131f,(float16_t)0.99932238458834954375f,
+(float16_t)0.03374117185137764235f,(float16_t)0.99943060455546173237f,
+(float16_t)0.03067480317663658085f,(float16_t)0.99952941750109314256f,
+(float16_t)0.02760814577896581953f,(float16_t)0.99961882249517863830f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)0.02147408027546960502f,(float16_t)0.99976940535121527898f,
+(float16_t)0.01840672990580482019f,(float16_t)0.99983058179582340319f,
+(float16_t)0.01533920628498821985f,(float16_t)0.99988234745421256111f,
+(float16_t)0.01227153828571994447f,(float16_t)0.99992470183914450299f,
+(float16_t)0.00920375478205995995f,(float16_t)0.99995764455196389786f,
+(float16_t)0.00613588464915451517f,(float16_t)0.99998117528260110909f,
+(float16_t)0.00306795676296613791f,(float16_t)0.99999529380957619118f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.00306795676296601561f,(float16_t)0.99999529380957619118f,
+(float16_t)-0.00613588464915439287f,(float16_t)0.99998117528260110909f,
+(float16_t)-0.00920375478205983678f,(float16_t)0.99995764455196389786f,
+(float16_t)-0.01227153828571982304f,(float16_t)0.99992470183914450299f,
+(float16_t)-0.01533920628498809842f,(float16_t)0.99988234745421256111f,
+(float16_t)-0.01840672990580469529f,(float16_t)0.99983058179582340319f,
+(float16_t)-0.02147408027546948359f,(float16_t)0.99976940535121527898f,
+(float16_t)-0.02454122852291214241f,(float16_t)0.99969881869620424997f,
+(float16_t)-0.02760814577896569810f,(float16_t)0.99961882249517863830f,
+(float16_t)-0.03067480317663645942f,(float16_t)0.99952941750109314256f,
+(float16_t)-0.03374117185137751745f,(float16_t)0.99943060455546173237f,
+(float16_t)-0.03680722294135886641f,(float16_t)0.99932238458834954375f,
+(float16_t)-0.03987292758773972740f,(float16_t)0.99920475861836388631f,
+(float16_t)-0.04293825693494083412f,(float16_t)0.99907772775264536147f,
+(float16_t)-0.04600318213091451891f,(float16_t)0.99894129318685687124f,
+(float16_t)-0.04906767432741800800f,(float16_t)0.99879545620517240501f,
+(float16_t)-0.05213170468028319182f,(float16_t)0.99864021818026527111f,
+(float16_t)-0.05519524434968991339f,(float16_t)0.99847558057329477421f,
+(float16_t)-0.05825826450043560673f,(float16_t)0.99830154493389289261f,
+(float16_t)-0.06132073630220852972f,(float16_t)0.99811811290014917919f,
+(float16_t)-0.06438263092985728464f,(float16_t)0.99792528619859599548f,
+(float16_t)-0.06744391956366398155f,(float16_t)0.99772306664419163624f,
+(float16_t)-0.07050457338961389764f,(float16_t)0.99751145614030345410f,
+(float16_t)-0.07356456359966732916f,(float16_t)0.99729045667869020697f,
+(float16_t)-0.07662386139203150592f,(float16_t)0.99706007033948296225f,
+(float16_t)-0.07968243797143001461f,(float16_t)0.99682029929116577893f,
+(float16_t)-0.08274026454937567776f,(float16_t)0.99657114579055483539f,
+(float16_t)-0.08579731234443975507f,(float16_t)0.99631261218277800129f,
+(float16_t)-0.08885355258252455868f,(float16_t)0.99604470090125196702f,
+(float16_t)-0.09190895649713257121f,(float16_t)0.99576741446765981713f,
+(float16_t)-0.09496349532963895002f,(float16_t)0.99548075549192693856f,
+(float16_t)-0.09801714032956064526f,(float16_t)0.99518472667219692873f,
+(float16_t)-0.10106986275482775228f,(float16_t)0.99487933079480561638f,
+(float16_t)-0.10412163387205460030f,(float16_t)0.99456457073425541537f,
+(float16_t)-0.10717242495680875947f,(float16_t)0.99424044945318790223f,
+(float16_t)-0.11022220729388305938f,(float16_t)0.99390697000235606051f,
+(float16_t)-0.11327095217756423529f,(float16_t)0.99356413552059530403f,
+(float16_t)-0.11631863091190475235f,(float16_t)0.99321194923479450001f,
+(float16_t)-0.11936521481099122977f,(float16_t)0.99285041445986510489f,
+(float16_t)-0.12241067519921615403f,(float16_t)0.99247953459870996706f,
+(float16_t)-0.12545498341154606714f,(float16_t)0.99209931314219179654f,
+(float16_t)-0.12849811079379311329f,(float16_t)0.99170975366909952520f,
+(float16_t)-0.13154002870288314386f,(float16_t)0.99131085984611544415f,
+(float16_t)-0.13458070850712611222f,(float16_t)0.99090263542778000971f,
+(float16_t)-0.13762012158648606608f,(float16_t)0.99048508425645698239f,
+(float16_t)-0.14065823933284912761f,(float16_t)0.99005821026229712256f,
+(float16_t)-0.14369503315029444335f,(float16_t)0.98962201746320088702f,
+(float16_t)-0.14673047445536163691f,(float16_t)0.98917650996478101444f,
+(float16_t)-0.14976453467732150915f,(float16_t)0.98872169196032377858f,
+(float16_t)-0.15279718525844329657f,(float16_t)0.98825756773074946437f,
+(float16_t)-0.15582839765426520495f,(float16_t)0.98778414164457217783f,
+(float16_t)-0.15885814333386127917f,(float16_t)0.98730141815785843473f,
+(float16_t)-0.16188639378011177028f,(float16_t)0.98680940181418552726f,
+(float16_t)-0.16491312048996994988f,(float16_t)0.98630809724459866938f,
+(float16_t)-0.16793829497473108936f,(float16_t)0.98579750916756747614f,
+(float16_t)-0.17096188876030124493f,(float16_t)0.98527764238894122162f,
+(float16_t)-0.17398387338746371111f,(float16_t)0.98474850180190420801f,
+(float16_t)-0.17700422041214874946f,(float16_t)0.98421009238692902521f,
+(float16_t)-0.18002290140569940369f,(float16_t)0.98366241921173025453f,
+(float16_t)-0.18303988795514092303f,(float16_t)0.98310548743121628501f,
+(float16_t)-0.18605515166344649414f,(float16_t)0.98253930228744124076f,
+(float16_t)-0.18906866414980616486f,(float16_t)0.98196386910955524296f,
+(float16_t)-0.19208039704989227081f,(float16_t)0.98137919331375456089f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.19809841071795361578f,(float16_t)0.98018213596811742949f,
+(float16_t)-0.20110463484209181728f,(float16_t)0.97956976568544051887f,
+(float16_t)-0.20410896609281689584f,(float16_t)0.97894817531906219710f,
+(float16_t)-0.20711137619221844930f,(float16_t)0.97831737071962765473f,
+(float16_t)-0.21011183688046961016f,(float16_t)0.97767735782450992943f,
+(float16_t)-0.21311031991609125091f,(float16_t)0.97702814265775439484f,
+(float16_t)-0.21610679707621949230f,(float16_t)0.97636973133002114000f,
+(float16_t)-0.21910124015686965881f,(float16_t)0.97570213003852857003f,
+(float16_t)-0.22209362097320348162f,(float16_t)0.97502534506699412020f,
+(float16_t)-0.22508391135979266551f,(float16_t)0.97433938278557585821f,
+(float16_t)-0.22807208317088567551f,(float16_t)0.97364424965081197705f,
+(float16_t)-0.23105810828067113727f,(float16_t)0.97293995220556017678f,
+(float16_t)-0.23404195858354331916f,(float16_t)0.97222649707893638027f,
+(float16_t)-0.23702360599436722577f,(float16_t)0.97150389098625178352f,
+(float16_t)-0.24000302244874138768f,(float16_t)0.97077214072895035013f,
+(float16_t)-0.24298017990326387094f,(float16_t)0.97003125319454397424f,
+(float16_t)-0.24595505033579448395f,(float16_t)0.96928123535654853171f,
+(float16_t)-0.24892760574572012078f,(float16_t)0.96852209427441737777f,
+(float16_t)-0.25189781815421680156f,(float16_t)0.96775383709347551076f,
+(float16_t)-0.25486565960451451618f,(float16_t)0.96697647104485207059f,
+(float16_t)-0.25783110216215882060f,(float16_t)0.96619000344541261516f,
+(float16_t)-0.26079411791527545850f,(float16_t)0.96539444169768939830f,
+(float16_t)-0.26375467897483140245f,(float16_t)0.96458979328981275803f,
+(float16_t)-0.26671275747489830987f,(float16_t)0.96377606579543984022f,
+(float16_t)-0.26966832557291509076f,(float16_t)0.96295326687368387741f,
+(float16_t)-0.27262135544994886560f,(float16_t)0.96212140426904158019f,
+(float16_t)-0.27557181931095814376f,(float16_t)0.96128048581132063966f,
+(float16_t)-0.27851968938505294870f,(float16_t)0.96043051941556589757f,
+(float16_t)-0.28146493792575794091f,(float16_t)0.95957151308198451733f,
+(float16_t)-0.28440753721127171039f,(float16_t)0.95870347489587159906f,
+(float16_t)-0.28734745954472945551f,(float16_t)0.95782641302753290802f,
+(float16_t)-0.29028467725446216452f,(float16_t)0.95694033573220893540f,
+(float16_t)-0.29321916269425857271f,(float16_t)0.95604525134999651659f,
+(float16_t)-0.29615088824362384434f,(float16_t)0.95514116830577067141f,
+(float16_t)-0.29907982630804036406f,(float16_t)0.95422809510910566733f,
+(float16_t)-0.30200594931922808417f,(float16_t)0.95330604035419386211f,
+(float16_t)-0.30492922973540226295f,(float16_t)0.95237501271976587880f,
+(float16_t)-0.30784964004153486661f,(float16_t)0.95143502096900833820f,
+(float16_t)-0.31076715274961136393f,(float16_t)0.95048607394948181337f,
+(float16_t)-0.31368174039889140658f,(float16_t)0.94952818059303667475f,
+(float16_t)-0.31659337555616573479f,(float16_t)0.94856134991573037851f,
+(float16_t)-0.31950203081601563637f,(float16_t)0.94758559101774120226f,
+(float16_t)-0.32240767880106985244f,(float16_t)0.94660091308328353499f,
+(float16_t)-0.32531029216226287071f,(float16_t)0.94560732538052139073f,
+(float16_t)-0.32820984357909255280f,(float16_t)0.94460483726148025685f,
+(float16_t)-0.33110630575987631818f,(float16_t)0.94359345816196038559f,
+(float16_t)-0.33399965144200938205f,(float16_t)0.94257319760144686605f,
+(float16_t)-0.33688985339221994009f,(float16_t)0.94154406518302080631f,
+(float16_t)-0.33977688440682685123f,(float16_t)0.94050607059326829518f,
+(float16_t)-0.34266071731199426731f,(float16_t)0.93945922360218991898f,
+(float16_t)-0.34554132496398903829f,(float16_t)0.93840353406310816897f,
+(float16_t)-0.34841868024943439819f,(float16_t)0.93733901191257495977f,
+(float16_t)-0.35129275608556703725f,(float16_t)0.93626566717027825959f,
+(float16_t)-0.35416352542049039931f,(float16_t)0.93518350993894761025f,
+(float16_t)-0.35703096123342992207f,(float16_t)0.93409255040425898109f,
+(float16_t)-0.35989503653498816638f,(float16_t)0.93299279883473884567f,
+(float16_t)-0.36275572436739711435f,(float16_t)0.93188426558166814750f,
+(float16_t)-0.36561299780477385379f,(float16_t)0.93076696107898371224f,
+(float16_t)-0.36846682995337221023f,(float16_t)0.92964089584318132520f,
+(float16_t)-0.37131719395183748755f,(float16_t)0.92850608047321558924f,
+(float16_t)-0.37416406297145787807f,(float16_t)0.92736252565040111495f,
+(float16_t)-0.37700741021641820394f,(float16_t)0.92621024213831137928f,
+(float16_t)-0.37984720892405099413f,(float16_t)0.92504924078267769527f,
+(float16_t)-0.38268343236508972627f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.38551605384391890441f,(float16_t)0.92270112833387851747f,
+(float16_t)-0.38834504669882619066f,(float16_t)0.92151403934204201285f,
+(float16_t)-0.39117038430225387069f,(float16_t)0.92031827670911059425f,
+(float16_t)-0.39399204006104798781f,(float16_t)0.91911385169005777040f,
+(float16_t)-0.39680998741671030805f,(float16_t)0.91790077562139049672f,
+(float16_t)-0.39962419984564667708f,(float16_t)0.91667905992104270485f,
+(float16_t)-0.40243465085941843018f,(float16_t)0.91544871608826783316f,
+(float16_t)-0.40524131400498974998f,(float16_t)0.91420975570353069095f,
+(float16_t)-0.40804416286497863231f,(float16_t)0.91296219042839821256f,
+(float16_t)-0.41084317105790379987f,(float16_t)0.91170603200542987832f,
+(float16_t)-0.41363831223843450235f,(float16_t)0.91044129225806724737f,
+(float16_t)-0.41642956009763698599f,(float16_t)0.90916798309052249127f,
+(float16_t)-0.41921688836322407168f,(float16_t)0.90788611648766615048f,
+(float16_t)-0.42200027079979968159f,(float16_t)0.90659570451491533483f,
+(float16_t)-0.42477968120910869487f,(float16_t)0.90529675931811881551f,
+(float16_t)-0.42755509343028186287f,(float16_t)0.90398929312344344922f,
+(float16_t)-0.43032648134008272267f,(float16_t)0.90267331823725871498f,
+(float16_t)-0.43309381885315190175f,(float16_t)0.90134884704602202810f,
+(float16_t)-0.43585707992225536378f,(float16_t)0.90001589201616027935f,
+(float16_t)-0.43861623853852738097f,(float16_t)0.89867446569395392775f,
+(float16_t)-0.44137126873171672603f,(float16_t)0.89732458070541831763f,
+(float16_t)-0.44412214457042914484f,(float16_t)0.89596624975618521791f,
+(float16_t)-0.44686884016237399253f,(float16_t)0.89459948563138280697f,
+(float16_t)-0.44961132965460670619f,(float16_t)0.89322430119551521344f,
+(float16_t)-0.45234958723377088896f,(float16_t)0.89184070939234272313f,
+(float16_t)-0.45508358712634372489f,(float16_t)0.89044872324475798919f,
+(float16_t)-0.45781330359887700832f,(float16_t)0.88904835585466468473f,
+(float16_t)-0.46053871095824006066f,(float16_t)0.88763962040285393496f,
+(float16_t)-0.46325978355186014923f,(float16_t)0.88622253014888063838f,
+(float16_t)-0.46597649576796601467f,(float16_t)0.88479709843093790056f,
+(float16_t)-0.46868882203582767909f,(float16_t)0.88336333866573168994f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.47410021465054991152f,(float16_t)0.88047088905216086552f,
+(float16_t)-0.47679923006332192159f,(float16_t)0.87901222642863352519f,
+(float16_t)-0.47949375766015311928f,(float16_t)0.87754529020726124156f,
+(float16_t)-0.48218377207912271887f,(float16_t)0.87607009419540660122f,
+(float16_t)-0.48486924800079100883f,(float16_t)0.87458665227817622423f,
+(float16_t)-0.48755016014843571837f,(float16_t)0.87309497841829020182f,
+(float16_t)-0.49022648328829121489f,(float16_t)0.87159508665595097909f,
+(float16_t)-0.49289819222978398239f,(float16_t)0.87008699110871146054f,
+(float16_t)-0.49556526182577237405f,(float16_t)0.86857070597134100609f,
+(float16_t)-0.49822766697278159098f,(float16_t)0.86704624551569275948f,
+(float16_t)-0.50088538261124082585f,(float16_t)0.86551362409056908920f,
+(float16_t)-0.50353838372571746440f,(float16_t)0.86397285612158680745f,
+(float16_t)-0.50618664534515511733f,(float16_t)0.86242395611104061270f,
+(float16_t)-0.50883014254310710012f,(float16_t)0.86086693863776719837f,
+(float16_t)-0.51146885043797041259f,(float16_t)0.85930181835700847337f,
+(float16_t)-0.51410274419322155026f,(float16_t)0.85772861000027211809f,
+(float16_t)-0.51673179901764965116f,(float16_t)0.85614732837519458286f,
+(float16_t)-0.51935599016558964269f,(float16_t)0.85455798836540053376f,
+(float16_t)-0.52197529293715427823f,(float16_t)0.85296060493036374162f,
+(float16_t)-0.52458968267846872724f,(float16_t)0.85135519310526519554f,
+(float16_t)-0.52719913478190105760f,(float16_t)0.84974176800085265970f,
+(float16_t)-0.52980362468629471628f,(float16_t)0.84812034480329723252f,
+(float16_t)-0.53240312787719790144f,(float16_t)0.84649093877405212627f,
+(float16_t)-0.53499761988709704230f,(float16_t)0.84485356524970722791f,
+(float16_t)-0.53758707629564561614f,(float16_t)0.84320823964184532517f,
+(float16_t)-0.54017147272989285423f,(float16_t)0.84155497743689844370f,
+(float16_t)-0.54275078486451577842f,(float16_t)0.83989379419599952126f,
+(float16_t)-0.54532498842204624179f,(float16_t)0.83822470555483818977f,
+(float16_t)-0.54789405917310018967f,(float16_t)0.83654772722351200542f,
+(float16_t)-0.55045797293660470029f,(float16_t)0.83486287498638012128f,
+(float16_t)-0.55301670558002735678f,(float16_t)0.83317016470191329613f,
+(float16_t)-0.55557023301960195560f,(float16_t)0.83146961230254534669f,
+(float16_t)-0.55811853122055610221f,(float16_t)0.82976123379452304540f,
+(float16_t)-0.56066157619733592021f,(float16_t)0.82804504525775579626f,
+(float16_t)-0.56319934401383386913f,(float16_t)0.82632106284566364529f,
+(float16_t)-0.56573181078361323149f,(float16_t)0.82458930278502517996f,
+(float16_t)-0.56825895267013148970f,(float16_t)0.82284978137582631685f,
+(float16_t)-0.57078074588696714464f,(float16_t)0.82110251499110475937f,
+(float16_t)-0.57329716669804198226f,(float16_t)0.81934752007679712005f,
+(float16_t)-0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)-0.57831379641165547856f,(float16_t)0.81581441080673378075f,
+(float16_t)-0.58081395809576441547f,(float16_t)0.81403632970594852480f,
+(float16_t)-0.58330865293769840196f,(float16_t)0.81225058658520388200f,
+(float16_t)-0.58579785745643886408f,(float16_t)0.81045719825259476821f,
+(float16_t)-0.58828154822264522306f,(float16_t)0.80865618158817509364f,
+(float16_t)-0.59075970185887405339f,(float16_t)0.80684755354379944503f,
+(float16_t)-0.59323229503979990618f,(float16_t)0.80503133114296354655f,
+(float16_t)-0.59569930449243335691f,(float16_t)0.80320753148064494287f,
+(float16_t)-0.59816070699634216190f,(float16_t)0.80137617172314024039f,
+(float16_t)-0.60061647938386875101f,(float16_t)0.79953726910790523519f,
+(float16_t)-0.60306659854034827539f,(float16_t)0.79769084094339104407f,
+(float16_t)-0.60551104140432543410f,(float16_t)0.79583690460888356633f,
+(float16_t)-0.60794978496777352106f,(float16_t)0.79397547755433728334f,
+(float16_t)-0.61038280627630958630f,(float16_t)0.79210657730021227785f,
+(float16_t)-0.61281008242940970820f,(float16_t)0.79023022143731003197f,
+(float16_t)-0.61523159058062670823f,(float16_t)0.78834642762660633863f,
+(float16_t)-0.61764730793780375784f,(float16_t)0.78645521359908587833f,
+(float16_t)-0.62005721176328920663f,(float16_t)0.78455659715557513056f,
+(float16_t)-0.62246127937414996723f,(float16_t)0.78265059616657572938f,
+(float16_t)-0.62485948814238623239f,(float16_t)0.78073722857209459924f,
+(float16_t)-0.62725181549514386070f,(float16_t)0.77881651238147608929f,
+(float16_t)-0.62963823891492709528f,(float16_t)0.77688846567323244230f,
+(float16_t)-0.63201873593980895105f,(float16_t)0.77495310659487393057f,
+(float16_t)-0.63439328416364537677f,(float16_t)0.77301045336273710440f,
+(float16_t)-0.63676186123628431002f,(float16_t)0.77106052426181370674f,
+(float16_t)-0.63912444486377573138f,(float16_t)0.76910333764557958780f,
+(float16_t)-0.64148101280858305095f,(float16_t)0.76713891193582040007f,
+(float16_t)-0.64383154288979127511f,(float16_t)0.76516726562245906962f,
+(float16_t)-0.64617601298331639459f,(float16_t)0.76318841726338115805f,
+(float16_t)-0.64851440102211244110f,(float16_t)0.76120238548426188974f,
+(float16_t)-0.65084668499638076433f,(float16_t)0.75920918897838807204f,
+(float16_t)-0.65317284295377653347f,(float16_t)0.75720884650648467851f,
+(float16_t)-0.65549285299961546070f,(float16_t)0.75520137689653643598f,
+(float16_t)-0.65780669329707852633f,(float16_t)0.75318679904361252042f,
+(float16_t)-0.66011434206742036768f,(float16_t)0.75116513190968658975f,
+(float16_t)-0.66241577759017189475f,(float16_t)0.74913639452345925918f,
+(float16_t)-0.66471097820334490436f,(float16_t)0.74710060598018013245f,
+(float16_t)-0.66699992230363736034f,(float16_t)0.74505778544146605835f,
+(float16_t)-0.66928258834663589827f,(float16_t)0.74300795213512182968f,
+(float16_t)-0.67155895484701844111f,(float16_t)0.74095112535495899486f,
+(float16_t)-0.67382900037875603783f,(float16_t)0.73888732446061522463f,
+(float16_t)-0.67609270357531581208f,(float16_t)0.73681656887737001504f,
+(float16_t)-0.67835004312986124653f,(float16_t)0.73473887809596372112f,
+(float16_t)-0.68060099779545302212f,(float16_t)0.73265427167241281570f,
+(float16_t)-0.68284554638524797010f,(float16_t)0.73056276922782759087f,
+(float16_t)-0.68508366777270024439f,(float16_t)0.72846439044822530740f,
+(float16_t)-0.68731534089175916336f,(float16_t)0.72635915508434589771f,
+(float16_t)-0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)-0.69175925836415763648f,(float16_t)0.72212819392921545614f,
+(float16_t)-0.69397146088965377952f,(float16_t)0.72000250796138176579f,
+(float16_t)-0.69617713149146298601f,(float16_t)0.71787004505573170920f,
+(float16_t)-0.69837624940897280457f,(float16_t)0.71573082528381870571f,
+(float16_t)-0.70056879394324822474f,(float16_t)0.71358486878079363525f,
+(float16_t)-0.70275474445722507788f,(float16_t)0.71143219574521665560f,
+(float16_t)-0.70493408037590488124f,(float16_t)0.70927282643886557789f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.70927282643886546687f,(float16_t)0.70493408037590510329f,
+(float16_t)-0.71143219574521654458f,(float16_t)0.70275474445722518890f,
+(float16_t)-0.71358486878079352422f,(float16_t)0.70056879394324833576f,
+(float16_t)-0.71573082528381859468f,(float16_t)0.69837624940897291559f,
+(float16_t)-0.71787004505573159818f,(float16_t)0.69617713149146309703f,
+(float16_t)-0.72000250796138165477f,(float16_t)0.69397146088965389055f,
+(float16_t)-0.72212819392921523409f,(float16_t)0.69175925836415785852f,
+(float16_t)-0.72424708295146678072f,(float16_t)0.68954054473706705153f,
+(float16_t)-0.72635915508434578669f,(float16_t)0.68731534089175927438f,
+(float16_t)-0.72846439044822519637f,(float16_t)0.68508366777270035541f,
+(float16_t)-0.73056276922782747985f,(float16_t)0.68284554638524808112f,
+(float16_t)-0.73265427167241270467f,(float16_t)0.68060099779545324417f,
+(float16_t)-0.73473887809596349907f,(float16_t)0.67835004312986135755f,
+(float16_t)-0.73681656887736979300f,(float16_t)0.67609270357531592310f,
+(float16_t)-0.73888732446061511361f,(float16_t)0.67382900037875614885f,
+(float16_t)-0.74095112535495888384f,(float16_t)0.67155895484701855214f,
+(float16_t)-0.74300795213512171866f,(float16_t)0.66928258834663600929f,
+(float16_t)-0.74505778544146594733f,(float16_t)0.66699992230363758239f,
+(float16_t)-0.74710060598018002143f,(float16_t)0.66471097820334501538f,
+(float16_t)-0.74913639452345914815f,(float16_t)0.66241577759017200577f,
+(float16_t)-0.75116513190968636771f,(float16_t)0.66011434206742047870f,
+(float16_t)-0.75318679904361240940f,(float16_t)0.65780669329707874837f,
+(float16_t)-0.75520137689653643598f,(float16_t)0.65549285299961557172f,
+(float16_t)-0.75720884650648467851f,(float16_t)0.65317284295377664449f,
+(float16_t)-0.75920918897838796102f,(float16_t)0.65084668499638098638f,
+(float16_t)-0.76120238548426166769f,(float16_t)0.64851440102211255212f,
+(float16_t)-0.76318841726338115805f,(float16_t)0.64617601298331661663f,
+(float16_t)-0.76516726562245895860f,(float16_t)0.64383154288979138613f,
+(float16_t)-0.76713891193582040007f,(float16_t)0.64148101280858316198f,
+(float16_t)-0.76910333764557947678f,(float16_t)0.63912444486377584241f,
+(float16_t)-0.77106052426181359571f,(float16_t)0.63676186123628442104f,
+(float16_t)-0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)-0.77495310659487381955f,(float16_t)0.63201873593980906207f,
+(float16_t)-0.77688846567323233128f,(float16_t)0.62963823891492720630f,
+(float16_t)-0.77881651238147597827f,(float16_t)0.62725181549514408275f,
+(float16_t)-0.78073722857209448822f,(float16_t)0.62485948814238634341f,
+(float16_t)-0.78265059616657561836f,(float16_t)0.62246127937415007825f,
+(float16_t)-0.78455659715557501954f,(float16_t)0.62005721176328942867f,
+(float16_t)-0.78645521359908576731f,(float16_t)0.61764730793780386886f,
+(float16_t)-0.78834642762660622761f,(float16_t)0.61523159058062693028f,
+(float16_t)-0.79023022143730992095f,(float16_t)0.61281008242940981923f,
+(float16_t)-0.79210657730021216683f,(float16_t)0.61038280627630969732f,
+(float16_t)-0.79397547755433717231f,(float16_t)0.60794978496777363208f,
+(float16_t)-0.79583690460888345530f,(float16_t)0.60551104140432565615f,
+(float16_t)-0.79769084094339093305f,(float16_t)0.60306659854034838641f,
+(float16_t)-0.79953726910790512417f,(float16_t)0.60061647938386886203f,
+(float16_t)-0.80137617172314024039f,(float16_t)0.59816070699634238395f,
+(float16_t)-0.80320753148064483184f,(float16_t)0.59569930449243346793f,
+(float16_t)-0.80503133114296343553f,(float16_t)0.59323229503980001720f,
+(float16_t)-0.80684755354379933401f,(float16_t)0.59075970185887416442f,
+(float16_t)-0.80865618158817498262f,(float16_t)0.58828154822264533408f,
+(float16_t)-0.81045719825259465718f,(float16_t)0.58579785745643897510f,
+(float16_t)-0.81225058658520377097f,(float16_t)0.58330865293769851299f,
+(float16_t)-0.81403632970594841378f,(float16_t)0.58081395809576452649f,
+(float16_t)-0.81581441080673378075f,(float16_t)0.57831379641165570060f,
+(float16_t)-0.81758481315158360037f,(float16_t)0.57580819141784544968f,
+(float16_t)-0.81934752007679700903f,(float16_t)0.57329716669804209328f,
+(float16_t)-0.82110251499110464835f,(float16_t)0.57078074588696725566f,
+(float16_t)-0.82284978137582620583f,(float16_t)0.56825895267013171175f,
+(float16_t)-0.82458930278502506894f,(float16_t)0.56573181078361345353f,
+(float16_t)-0.82632106284566353427f,(float16_t)0.56319934401383409117f,
+(float16_t)-0.82804504525775568524f,(float16_t)0.56066157619733614226f,
+(float16_t)-0.82976123379452293438f,(float16_t)0.55811853122055632426f,
+(float16_t)-0.83146961230254534669f,(float16_t)0.55557023301960217765f,
+(float16_t)-0.83317016470191318511f,(float16_t)0.55301670558002746780f,
+(float16_t)-0.83486287498638001026f,(float16_t)0.55045797293660492233f,
+(float16_t)-0.83654772722351189440f,(float16_t)0.54789405917310041172f,
+(float16_t)-0.83822470555483807875f,(float16_t)0.54532498842204635281f,
+(float16_t)-0.83989379419599952126f,(float16_t)0.54275078486451588944f,
+(float16_t)-0.84155497743689833268f,(float16_t)0.54017147272989296525f,
+(float16_t)-0.84320823964184532517f,(float16_t)0.53758707629564572716f,
+(float16_t)-0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)-0.84649093877405201525f,(float16_t)0.53240312787719801246f,
+(float16_t)-0.84812034480329712149f,(float16_t)0.52980362468629482731f,
+(float16_t)-0.84974176800085254868f,(float16_t)0.52719913478190127964f,
+(float16_t)-0.85135519310526519554f,(float16_t)0.52458968267846894928f,
+(float16_t)-0.85296060493036363059f,(float16_t)0.52197529293715438925f,
+(float16_t)-0.85455798836540042274f,(float16_t)0.51935599016558975372f,
+(float16_t)-0.85614732837519447184f,(float16_t)0.51673179901764976218f,
+(float16_t)-0.85772861000027200706f,(float16_t)0.51410274419322177231f,
+(float16_t)-0.85930181835700836235f,(float16_t)0.51146885043797052361f,
+(float16_t)-0.86086693863776719837f,(float16_t)0.50883014254310732216f,
+(float16_t)-0.86242395611104050168f,(float16_t)0.50618664534515522835f,
+(float16_t)-0.86397285612158669643f,(float16_t)0.50353838372571757542f,
+(float16_t)-0.86551362409056897818f,(float16_t)0.50088538261124093687f,
+(float16_t)-0.86704624551569264845f,(float16_t)0.49822766697278175752f,
+(float16_t)-0.86857070597134089507f,(float16_t)0.49556526182577254058f,
+(float16_t)-0.87008699110871134952f,(float16_t)0.49289819222978414892f,
+(float16_t)-0.87159508665595086807f,(float16_t)0.49022648328829138142f,
+(float16_t)-0.87309497841829009079f,(float16_t)0.48755016014843588490f,
+(float16_t)-0.87458665227817611321f,(float16_t)0.48486924800079111986f,
+(float16_t)-0.87607009419540649020f,(float16_t)0.48218377207912288540f,
+(float16_t)-0.87754529020726113053f,(float16_t)0.47949375766015328582f,
+(float16_t)-0.87901222642863352519f,(float16_t)0.47679923006332208812f,
+(float16_t)-0.88047088905216075450f,(float16_t)0.47410021465055007805f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.88336333866573168994f,(float16_t)0.46868882203582784562f,
+(float16_t)-0.88479709843093778954f,(float16_t)0.46597649576796618121f,
+(float16_t)-0.88622253014888052736f,(float16_t)0.46325978355186031576f,
+(float16_t)-0.88763962040285382393f,(float16_t)0.46053871095824022719f,
+(float16_t)-0.88904835585466457371f,(float16_t)0.45781330359887717485f,
+(float16_t)-0.89044872324475787817f,(float16_t)0.45508358712634389143f,
+(float16_t)-0.89184070939234261211f,(float16_t)0.45234958723377105549f,
+(float16_t)-0.89322430119551521344f,(float16_t)0.44961132965460687272f,
+(float16_t)-0.89459948563138269595f,(float16_t)0.44686884016237415906f,
+(float16_t)-0.89596624975618510689f,(float16_t)0.44412214457042931137f,
+(float16_t)-0.89732458070541820661f,(float16_t)0.44137126873171689256f,
+(float16_t)-0.89867446569395392775f,(float16_t)0.43861623853852754751f,
+(float16_t)-0.90001589201616016833f,(float16_t)0.43585707992225553031f,
+(float16_t)-0.90134884704602191707f,(float16_t)0.43309381885315206828f,
+(float16_t)-0.90267331823725871498f,(float16_t)0.43032648134008288920f,
+(float16_t)-0.90398929312344333820f,(float16_t)0.42755509343028202940f,
+(float16_t)-0.90529675931811870448f,(float16_t)0.42477968120910886141f,
+(float16_t)-0.90659570451491533483f,(float16_t)0.42200027079979984812f,
+(float16_t)-0.90788611648766603945f,(float16_t)0.41921688836322423821f,
+(float16_t)-0.90916798309052238025f,(float16_t)0.41642956009763715253f,
+(float16_t)-0.91044129225806713634f,(float16_t)0.41363831223843466889f,
+(float16_t)-0.91170603200542976730f,(float16_t)0.41084317105790413294f,
+(float16_t)-0.91296219042839821256f,(float16_t)0.40804416286497857680f,
+(float16_t)-0.91420975570353069095f,(float16_t)0.40524131400498991651f,
+(float16_t)-0.91544871608826772214f,(float16_t)0.40243465085941859671f,
+(float16_t)-0.91667905992104259383f,(float16_t)0.39962419984564706565f,
+(float16_t)-0.91790077562139049672f,(float16_t)0.39680998741671025254f,
+(float16_t)-0.91911385169005777040f,(float16_t)0.39399204006104815434f,
+(float16_t)-0.92031827670911048322f,(float16_t)0.39117038430225403722f,
+(float16_t)-0.92151403934204179080f,(float16_t)0.38834504669882657923f,
+(float16_t)-0.92270112833387862850f,(float16_t)0.38551605384391884890f,
+(float16_t)-0.92387953251128673848f,(float16_t)0.38268343236508989280f,
+(float16_t)-0.92504924078267747323f,(float16_t)0.37984720892405138271f,
+(float16_t)-0.92621024213831137928f,(float16_t)0.37700741021641814843f,
+(float16_t)-0.92736252565040111495f,(float16_t)0.37416406297145804460f,
+(float16_t)-0.92850608047321547822f,(float16_t)0.37131719395183770960f,
+(float16_t)-0.92964089584318121418f,(float16_t)0.36846682995337259880f,
+(float16_t)-0.93076696107898371224f,(float16_t)0.36561299780477379828f,
+(float16_t)-0.93188426558166803648f,(float16_t)0.36275572436739728088f,
+(float16_t)-0.93299279883473884567f,(float16_t)0.35989503653498833291f,
+(float16_t)-0.93409255040425875904f,(float16_t)0.35703096123343031065f,
+(float16_t)-0.93518350993894761025f,(float16_t)0.35416352542049039931f,
+(float16_t)-0.93626566717027825959f,(float16_t)0.35129275608556720378f,
+(float16_t)-0.93733901191257484875f,(float16_t)0.34841868024943478677f,
+(float16_t)-0.93840353406310816897f,(float16_t)0.34554132496398898278f,
+(float16_t)-0.93945922360218991898f,(float16_t)0.34266071731199443384f,
+(float16_t)-0.94050607059326829518f,(float16_t)0.33977688440682701776f,
+(float16_t)-0.94154406518302069529f,(float16_t)0.33688985339222032867f,
+(float16_t)-0.94257319760144686605f,(float16_t)0.33399965144200938205f,
+(float16_t)-0.94359345816196038559f,(float16_t)0.33110630575987648472f,
+(float16_t)-0.94460483726148014583f,(float16_t)0.32820984357909271933f,
+(float16_t)-0.94560732538052116869f,(float16_t)0.32531029216226325929f,
+(float16_t)-0.94660091308328353499f,(float16_t)0.32240767880106985244f,
+(float16_t)-0.94758559101774109124f,(float16_t)0.31950203081601580291f,
+(float16_t)-0.94856134991573026749f,(float16_t)0.31659337555616606785f,
+(float16_t)-0.94952818059303667475f,(float16_t)0.31368174039889140658f,
+(float16_t)-0.95048607394948170235f,(float16_t)0.31076715274961153046f,
+(float16_t)-0.95143502096900833820f,(float16_t)0.30784964004153503314f,
+(float16_t)-0.95237501271976576778f,(float16_t)0.30492922973540265152f,
+(float16_t)-0.95330604035419386211f,(float16_t)0.30200594931922802866f,
+(float16_t)-0.95422809510910555630f,(float16_t)0.29907982630804053059f,
+(float16_t)-0.95514116830577067141f,(float16_t)0.29615088824362401088f,
+(float16_t)-0.95604525134999629454f,(float16_t)0.29321916269425896129f,
+(float16_t)-0.95694033573220882438f,(float16_t)0.29028467725446238656f,
+(float16_t)-0.95782641302753290802f,(float16_t)0.28734745954472962204f,
+(float16_t)-0.95870347489587148804f,(float16_t)0.28440753721127209896f,
+(float16_t)-0.95957151308198451733f,(float16_t)0.28146493792575788540f,
+(float16_t)-0.96043051941556578655f,(float16_t)0.27851968938505317075f,
+(float16_t)-0.96128048581132063966f,(float16_t)0.27557181931095831029f,
+(float16_t)-0.96212140426904146917f,(float16_t)0.27262135544994925418f,
+(float16_t)-0.96295326687368387741f,(float16_t)0.26966832557291509076f,
+(float16_t)-0.96377606579543984022f,(float16_t)0.26671275747489847641f,
+(float16_t)-0.96458979328981264700f,(float16_t)0.26375467897483156898f,
+(float16_t)-0.96539444169768928727f,(float16_t)0.26079411791527584707f,
+(float16_t)-0.96619000344541250413f,(float16_t)0.25783110216215898713f,
+(float16_t)-0.96697647104485207059f,(float16_t)0.25486565960451468271f,
+(float16_t)-0.96775383709347539973f,(float16_t)0.25189781815421719013f,
+(float16_t)-0.96852209427441737777f,(float16_t)0.24892760574572009302f,
+(float16_t)-0.96928123535654842069f,(float16_t)0.24595505033579465048f,
+(float16_t)-0.97003125319454397424f,(float16_t)0.24298017990326406523f,
+(float16_t)-0.97077214072895023911f,(float16_t)0.24000302244874177626f,
+(float16_t)-0.97150389098625178352f,(float16_t)0.23702360599436717026f,
+(float16_t)-0.97222649707893626925f,(float16_t)0.23404195858354351345f,
+(float16_t)-0.97293995220556006576f,(float16_t)0.23105810828067133156f,
+(float16_t)-0.97364424965081186603f,(float16_t)0.22807208317088606409f,
+(float16_t)-0.97433938278557585821f,(float16_t)0.22508391135979283204f,
+(float16_t)-0.97502534506699412020f,(float16_t)0.22209362097320364815f,
+(float16_t)-0.97570213003852845901f,(float16_t)0.21910124015687004739f,
+(float16_t)-0.97636973133002114000f,(float16_t)0.21610679707621943679f,
+(float16_t)-0.97702814265775439484f,(float16_t)0.21311031991609141745f,
+(float16_t)-0.97767735782450992943f,(float16_t)0.21011183688046980444f,
+(float16_t)-0.97831737071962754371f,(float16_t)0.20711137619221883788f,
+(float16_t)-0.97894817531906219710f,(float16_t)0.20410896609281684033f,
+(float16_t)-0.97956976568544051887f,(float16_t)0.20110463484209201157f,
+(float16_t)-0.98018213596811731847f,(float16_t)0.19809841071795381007f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.98137919331375456089f,(float16_t)0.19208039704989246510f,
+(float16_t)-0.98196386910955524296f,(float16_t)0.18906866414980635915f,
+(float16_t)-0.98253930228744124076f,(float16_t)0.18605515166344691047f,
+(float16_t)-0.98310548743121628501f,(float16_t)0.18303988795514089527f,
+(float16_t)-0.98366241921173025453f,(float16_t)0.18002290140569957022f,
+(float16_t)-0.98421009238692902521f,(float16_t)0.17700422041214894375f,
+(float16_t)-0.98474850180190420801f,(float16_t)0.17398387338746412745f,
+(float16_t)-0.98527764238894122162f,(float16_t)0.17096188876030121717f,
+(float16_t)-0.98579750916756736512f,(float16_t)0.16793829497473128365f,
+(float16_t)-0.98630809724459855836f,(float16_t)0.16491312048997014417f,
+(float16_t)-0.98680940181418552726f,(float16_t)0.16188639378011174252f,
+(float16_t)-0.98730141815785843473f,(float16_t)0.15885814333386147346f,
+(float16_t)-0.98778414164457217783f,(float16_t)0.15582839765426537149f,
+(float16_t)-0.98825756773074946437f,(float16_t)0.15279718525844368515f,
+(float16_t)-0.98872169196032377858f,(float16_t)0.14976453467732145364f,
+(float16_t)-0.98917650996478101444f,(float16_t)0.14673047445536180344f,
+(float16_t)-0.98962201746320077600f,(float16_t)0.14369503315029463764f,
+(float16_t)-0.99005821026229701154f,(float16_t)0.14065823933284954395f,
+(float16_t)-0.99048508425645709341f,(float16_t)0.13762012158648603832f,
+(float16_t)-0.99090263542778000971f,(float16_t)0.13458070850712627875f,
+(float16_t)-0.99131085984611544415f,(float16_t)0.13154002870288333815f,
+(float16_t)-0.99170975366909952520f,(float16_t)0.12849811079379308554f,
+(float16_t)-0.99209931314219179654f,(float16_t)0.12545498341154626143f,
+(float16_t)-0.99247953459870996706f,(float16_t)0.12241067519921634832f,
+(float16_t)-0.99285041445986510489f,(float16_t)0.11936521481099163222f,
+(float16_t)-0.99321194923479450001f,(float16_t)0.11631863091190471071f,
+(float16_t)-0.99356413552059530403f,(float16_t)0.11327095217756441570f,
+(float16_t)-0.99390697000235606051f,(float16_t)0.11022220729388323979f,
+(float16_t)-0.99424044945318790223f,(float16_t)0.10717242495680916192f,
+(float16_t)-0.99456457073425541537f,(float16_t)0.10412163387205457254f,
+(float16_t)-0.99487933079480561638f,(float16_t)0.10106986275482793269f,
+(float16_t)-0.99518472667219681771f,(float16_t)0.09801714032956082567f,
+(float16_t)-0.99548075549192693856f,(float16_t)0.09496349532963890838f,
+(float16_t)-0.99576741446765981713f,(float16_t)0.09190895649713275162f,
+(float16_t)-0.99604470090125196702f,(float16_t)0.08885355258252475297f,
+(float16_t)-0.99631261218277800129f,(float16_t)0.08579731234444015753f,
+(float16_t)-0.99657114579055483539f,(float16_t)0.08274026454937563613f,
+(float16_t)-0.99682029929116566791f,(float16_t)0.07968243797143019502f,
+(float16_t)-0.99706007033948296225f,(float16_t)0.07662386139203168633f,
+(float16_t)-0.99729045667869020697f,(float16_t)0.07356456359966773162f,
+(float16_t)-0.99751145614030345410f,(float16_t)0.07050457338961385600f,
+(float16_t)-0.99772306664419163624f,(float16_t)0.06744391956366417584f,
+(float16_t)-0.99792528619859599548f,(float16_t)0.06438263092985770097f,
+(float16_t)-0.99811811290014917919f,(float16_t)0.06132073630220848809f,
+(float16_t)-0.99830154493389289261f,(float16_t)0.05825826450043579408f,
+(float16_t)-0.99847558057329477421f,(float16_t)0.05519524434969009380f,
+(float16_t)-0.99864021818026516009f,(float16_t)0.05213170468028359428f,
+(float16_t)-0.99879545620517240501f,(float16_t)0.04906767432741796636f,
+(float16_t)-0.99894129318685687124f,(float16_t)0.04600318213091470626f,
+(float16_t)-0.99907772775264536147f,(float16_t)0.04293825693494102147f,
+(float16_t)-0.99920475861836388631f,(float16_t)0.03987292758774012985f,
+(float16_t)-0.99932238458834954375f,(float16_t)0.03680722294135883171f,
+(float16_t)-0.99943060455546173237f,(float16_t)0.03374117185137770480f,
+(float16_t)-0.99952941750109314256f,(float16_t)0.03067480317663686534f,
+(float16_t)-0.99961882249517863830f,(float16_t)0.02760814577896565994f,
+(float16_t)-0.99969881869620424997f,(float16_t)0.02454122852291232629f,
+(float16_t)-0.99976940535121527898f,(float16_t)0.02147408027546966747f,
+(float16_t)-0.99983058179582340319f,(float16_t)0.01840672990580510121f,
+(float16_t)-0.99988234745421256111f,(float16_t)0.01533920628498806026f,
+(float16_t)-0.99992470183914450299f,(float16_t)0.01227153828572000692f,
+(float16_t)-0.99995764455196389786f,(float16_t)0.00920375478206002066f,
+(float16_t)-0.99998117528260110909f,(float16_t)0.00613588464915479880f,
+(float16_t)-0.99999529380957619118f,(float16_t)0.00306795676296597701f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99992470183914450299f,(float16_t)0.01227153828571992539f,
+(float16_t)0.99969881869620424997f,(float16_t)0.02454122852291228812f,
+(float16_t)0.99932238458834954375f,(float16_t)0.03680722294135883171f,
+(float16_t)0.99879545620517240501f,(float16_t)0.04906767432741801493f,
+(float16_t)0.99811811290014917919f,(float16_t)0.06132073630220857829f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.99631261218277800129f,(float16_t)0.08579731234443989385f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.99390697000235606051f,(float16_t)0.11022220729388305938f,
+(float16_t)0.99247953459870996706f,(float16_t)0.12241067519921619566f,
+(float16_t)0.99090263542778000971f,(float16_t)0.13458070850712616773f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98730141815785843473f,(float16_t)0.15885814333386144570f,
+(float16_t)0.98527764238894122162f,(float16_t)0.17096188876030121717f,
+(float16_t)0.98310548743121628501f,(float16_t)0.18303988795514095078f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.97831737071962765473f,(float16_t)0.20711137619221856032f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.97293995220556017678f,(float16_t)0.23105810828067110951f,
+(float16_t)0.97003125319454397424f,(float16_t)0.24298017990326387094f,
+(float16_t)0.96697647104485207059f,(float16_t)0.25486565960451457169f,
+(float16_t)0.96377606579543984022f,(float16_t)0.26671275747489836538f,
+(float16_t)0.96043051941556578655f,(float16_t)0.27851968938505305973f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.95330604035419386211f,(float16_t)0.30200594931922808417f,
+(float16_t)0.94952818059303667475f,(float16_t)0.31368174039889151761f,
+(float16_t)0.94560732538052127971f,(float16_t)0.32531029216226292622f,
+(float16_t)0.94154406518302080631f,(float16_t)0.33688985339222005111f,
+(float16_t)0.93733901191257495977f,(float16_t)0.34841868024943456472f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.92850608047321558924f,(float16_t)0.37131719395183754306f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.91911385169005777040f,(float16_t)0.39399204006104809883f,
+(float16_t)0.91420975570353069095f,(float16_t)0.40524131400498986100f,
+(float16_t)0.90916798309052238025f,(float16_t)0.41642956009763715253f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.89867446569395381673f,(float16_t)0.43861623853852765853f,
+(float16_t)0.89322430119551532446f,(float16_t)0.44961132965460653965f,
+(float16_t)0.88763962040285393496f,(float16_t)0.46053871095824000514f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.87607009419540660122f,(float16_t)0.48218377207912271887f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.86397285612158669643f,(float16_t)0.50353838372571757542f,
+(float16_t)0.85772861000027211809f,(float16_t)0.51410274419322166128f,
+(float16_t)0.85135519310526519554f,(float16_t)0.52458968267846894928f,
+(float16_t)0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)0.83822470555483807875f,(float16_t)0.54532498842204646383f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.82458930278502529099f,(float16_t)0.56573181078361312046f,
+(float16_t)0.81758481315158371139f,(float16_t)0.57580819141784533866f,
+(float16_t)0.81045719825259476821f,(float16_t)0.58579785745643886408f,
+(float16_t)0.80320753148064494287f,(float16_t)0.59569930449243335691f,
+(float16_t)0.79583690460888356633f,(float16_t)0.60551104140432554512f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.78073722857209448822f,(float16_t)0.62485948814238634341f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.76516726562245895860f,(float16_t)0.64383154288979138613f,
+(float16_t)0.75720884650648456748f,(float16_t)0.65317284295377675551f,
+(float16_t)0.74913639452345937020f,(float16_t)0.66241577759017178373f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.73265427167241281570f,(float16_t)0.68060099779545302212f,
+(float16_t)0.72424708295146700276f,(float16_t)0.68954054473706682948f,
+(float16_t)0.71573082528381870571f,(float16_t)0.69837624940897280457f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.69837624940897291559f,(float16_t)0.71573082528381859468f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.68060099779545302212f,(float16_t)0.73265427167241281570f,
+(float16_t)0.67155895484701833009f,(float16_t)0.74095112535495910588f,
+(float16_t)0.66241577759017178373f,(float16_t)0.74913639452345925918f,
+(float16_t)0.65317284295377686654f,(float16_t)0.75720884650648456748f,
+(float16_t)0.64383154288979149715f,(float16_t)0.76516726562245895860f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.62485948814238645443f,(float16_t)0.78073722857209448822f,
+(float16_t)0.61523159058062681925f,(float16_t)0.78834642762660622761f,
+(float16_t)0.60551104140432554512f,(float16_t)0.79583690460888345530f,
+(float16_t)0.59569930449243346793f,(float16_t)0.80320753148064483184f,
+(float16_t)0.58579785745643886408f,(float16_t)0.81045719825259476821f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.56573181078361323149f,(float16_t)0.82458930278502529099f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.54532498842204646383f,(float16_t)0.83822470555483796772f,
+(float16_t)0.53499761988709726435f,(float16_t)0.84485356524970700587f,
+(float16_t)0.52458968267846883826f,(float16_t)0.85135519310526519554f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.50353838372571757542f,(float16_t)0.86397285612158669643f,
+(float16_t)0.49289819222978409341f,(float16_t)0.87008699110871134952f,
+(float16_t)0.48218377207912282989f,(float16_t)0.87607009419540660122f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.46053871095824000514f,(float16_t)0.88763962040285393496f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.43861623853852771404f,(float16_t)0.89867446569395381673f,
+(float16_t)0.42755509343028219593f,(float16_t)0.90398929312344333820f,
+(float16_t)0.41642956009763731906f,(float16_t)0.90916798309052226923f,
+(float16_t)0.40524131400498986100f,(float16_t)0.91420975570353069095f,
+(float16_t)0.39399204006104809883f,(float16_t)0.91911385169005777040f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.37131719395183759858f,(float16_t)0.92850608047321558924f,
+(float16_t)0.35989503653498827740f,(float16_t)0.93299279883473884567f,
+(float16_t)0.34841868024943450921f,(float16_t)0.93733901191257495977f,
+(float16_t)0.33688985339222005111f,(float16_t)0.94154406518302080631f,
+(float16_t)0.32531029216226298173f,(float16_t)0.94560732538052127971f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.30200594931922819519f,(float16_t)0.95330604035419375109f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.27851968938505305973f,(float16_t)0.96043051941556578655f,
+(float16_t)0.26671275747489842090f,(float16_t)0.96377606579543984022f,
+(float16_t)0.25486565960451462720f,(float16_t)0.96697647104485207059f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.23105810828067127605f,(float16_t)0.97293995220556006576f,
+(float16_t)0.21910124015686976984f,(float16_t)0.97570213003852857003f,
+(float16_t)0.20711137619221856032f,(float16_t)0.97831737071962765473f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.18303988795514106180f,(float16_t)0.98310548743121628501f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.15885814333386139019f,(float16_t)0.98730141815785843473f,
+(float16_t)0.14673047445536174793f,(float16_t)0.98917650996478101444f,
+(float16_t)0.13458070850712622324f,(float16_t)0.99090263542778000971f,
+(float16_t)0.12241067519921627893f,(float16_t)0.99247953459870996706f,
+(float16_t)0.11022220729388318428f,(float16_t)0.99390697000235606051f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.08579731234443987997f,(float16_t)0.99631261218277800129f,
+(float16_t)0.07356456359966745406f,(float16_t)0.99729045667869020697f,
+(float16_t)0.06132073630220864768f,(float16_t)0.99811811290014917919f,
+(float16_t)0.04906767432741812596f,(float16_t)0.99879545620517240501f,
+(float16_t)0.03680722294135899131f,(float16_t)0.99932238458834954375f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)0.01227153828571994447f,(float16_t)0.99992470183914450299f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.01227153828571982304f,(float16_t)0.99992470183914450299f,
+(float16_t)-0.02454122852291214241f,(float16_t)0.99969881869620424997f,
+(float16_t)-0.03680722294135886641f,(float16_t)0.99932238458834954375f,
+(float16_t)-0.04906767432741800800f,(float16_t)0.99879545620517240501f,
+(float16_t)-0.06132073630220852972f,(float16_t)0.99811811290014917919f,
+(float16_t)-0.07356456359966732916f,(float16_t)0.99729045667869020697f,
+(float16_t)-0.08579731234443975507f,(float16_t)0.99631261218277800129f,
+(float16_t)-0.09801714032956064526f,(float16_t)0.99518472667219692873f,
+(float16_t)-0.11022220729388305938f,(float16_t)0.99390697000235606051f,
+(float16_t)-0.12241067519921615403f,(float16_t)0.99247953459870996706f,
+(float16_t)-0.13458070850712611222f,(float16_t)0.99090263542778000971f,
+(float16_t)-0.14673047445536163691f,(float16_t)0.98917650996478101444f,
+(float16_t)-0.15885814333386127917f,(float16_t)0.98730141815785843473f,
+(float16_t)-0.17096188876030124493f,(float16_t)0.98527764238894122162f,
+(float16_t)-0.18303988795514092303f,(float16_t)0.98310548743121628501f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.20711137619221844930f,(float16_t)0.97831737071962765473f,
+(float16_t)-0.21910124015686965881f,(float16_t)0.97570213003852857003f,
+(float16_t)-0.23105810828067113727f,(float16_t)0.97293995220556017678f,
+(float16_t)-0.24298017990326387094f,(float16_t)0.97003125319454397424f,
+(float16_t)-0.25486565960451451618f,(float16_t)0.96697647104485207059f,
+(float16_t)-0.26671275747489830987f,(float16_t)0.96377606579543984022f,
+(float16_t)-0.27851968938505294870f,(float16_t)0.96043051941556589757f,
+(float16_t)-0.29028467725446216452f,(float16_t)0.95694033573220893540f,
+(float16_t)-0.30200594931922808417f,(float16_t)0.95330604035419386211f,
+(float16_t)-0.31368174039889140658f,(float16_t)0.94952818059303667475f,
+(float16_t)-0.32531029216226287071f,(float16_t)0.94560732538052139073f,
+(float16_t)-0.33688985339221994009f,(float16_t)0.94154406518302080631f,
+(float16_t)-0.34841868024943439819f,(float16_t)0.93733901191257495977f,
+(float16_t)-0.35989503653498816638f,(float16_t)0.93299279883473884567f,
+(float16_t)-0.37131719395183748755f,(float16_t)0.92850608047321558924f,
+(float16_t)-0.38268343236508972627f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.39399204006104798781f,(float16_t)0.91911385169005777040f,
+(float16_t)-0.40524131400498974998f,(float16_t)0.91420975570353069095f,
+(float16_t)-0.41642956009763698599f,(float16_t)0.90916798309052249127f,
+(float16_t)-0.42755509343028186287f,(float16_t)0.90398929312344344922f,
+(float16_t)-0.43861623853852738097f,(float16_t)0.89867446569395392775f,
+(float16_t)-0.44961132965460670619f,(float16_t)0.89322430119551521344f,
+(float16_t)-0.46053871095824006066f,(float16_t)0.88763962040285393496f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.48218377207912271887f,(float16_t)0.87607009419540660122f,
+(float16_t)-0.49289819222978398239f,(float16_t)0.87008699110871146054f,
+(float16_t)-0.50353838372571746440f,(float16_t)0.86397285612158680745f,
+(float16_t)-0.51410274419322155026f,(float16_t)0.85772861000027211809f,
+(float16_t)-0.52458968267846872724f,(float16_t)0.85135519310526519554f,
+(float16_t)-0.53499761988709704230f,(float16_t)0.84485356524970722791f,
+(float16_t)-0.54532498842204624179f,(float16_t)0.83822470555483818977f,
+(float16_t)-0.55557023301960195560f,(float16_t)0.83146961230254534669f,
+(float16_t)-0.56573181078361323149f,(float16_t)0.82458930278502517996f,
+(float16_t)-0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)-0.58579785745643886408f,(float16_t)0.81045719825259476821f,
+(float16_t)-0.59569930449243335691f,(float16_t)0.80320753148064494287f,
+(float16_t)-0.60551104140432543410f,(float16_t)0.79583690460888356633f,
+(float16_t)-0.61523159058062670823f,(float16_t)0.78834642762660633863f,
+(float16_t)-0.62485948814238623239f,(float16_t)0.78073722857209459924f,
+(float16_t)-0.63439328416364537677f,(float16_t)0.77301045336273710440f,
+(float16_t)-0.64383154288979127511f,(float16_t)0.76516726562245906962f,
+(float16_t)-0.65317284295377653347f,(float16_t)0.75720884650648467851f,
+(float16_t)-0.66241577759017189475f,(float16_t)0.74913639452345925918f,
+(float16_t)-0.67155895484701844111f,(float16_t)0.74095112535495899486f,
+(float16_t)-0.68060099779545302212f,(float16_t)0.73265427167241281570f,
+(float16_t)-0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)-0.69837624940897280457f,(float16_t)0.71573082528381870571f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.71573082528381859468f,(float16_t)0.69837624940897291559f,
+(float16_t)-0.72424708295146678072f,(float16_t)0.68954054473706705153f,
+(float16_t)-0.73265427167241270467f,(float16_t)0.68060099779545324417f,
+(float16_t)-0.74095112535495888384f,(float16_t)0.67155895484701855214f,
+(float16_t)-0.74913639452345914815f,(float16_t)0.66241577759017200577f,
+(float16_t)-0.75720884650648467851f,(float16_t)0.65317284295377664449f,
+(float16_t)-0.76516726562245895860f,(float16_t)0.64383154288979138613f,
+(float16_t)-0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)-0.78073722857209448822f,(float16_t)0.62485948814238634341f,
+(float16_t)-0.78834642762660622761f,(float16_t)0.61523159058062693028f,
+(float16_t)-0.79583690460888345530f,(float16_t)0.60551104140432565615f,
+(float16_t)-0.80320753148064483184f,(float16_t)0.59569930449243346793f,
+(float16_t)-0.81045719825259465718f,(float16_t)0.58579785745643897510f,
+(float16_t)-0.81758481315158360037f,(float16_t)0.57580819141784544968f,
+(float16_t)-0.82458930278502506894f,(float16_t)0.56573181078361345353f,
+(float16_t)-0.83146961230254534669f,(float16_t)0.55557023301960217765f,
+(float16_t)-0.83822470555483807875f,(float16_t)0.54532498842204635281f,
+(float16_t)-0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)-0.85135519310526519554f,(float16_t)0.52458968267846894928f,
+(float16_t)-0.85772861000027200706f,(float16_t)0.51410274419322177231f,
+(float16_t)-0.86397285612158669643f,(float16_t)0.50353838372571757542f,
+(float16_t)-0.87008699110871134952f,(float16_t)0.49289819222978414892f,
+(float16_t)-0.87607009419540649020f,(float16_t)0.48218377207912288540f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.88763962040285382393f,(float16_t)0.46053871095824022719f,
+(float16_t)-0.89322430119551521344f,(float16_t)0.44961132965460687272f,
+(float16_t)-0.89867446569395392775f,(float16_t)0.43861623853852754751f,
+(float16_t)-0.90398929312344333820f,(float16_t)0.42755509343028202940f,
+(float16_t)-0.90916798309052238025f,(float16_t)0.41642956009763715253f,
+(float16_t)-0.91420975570353069095f,(float16_t)0.40524131400498991651f,
+(float16_t)-0.91911385169005777040f,(float16_t)0.39399204006104815434f,
+(float16_t)-0.92387953251128673848f,(float16_t)0.38268343236508989280f,
+(float16_t)-0.92850608047321547822f,(float16_t)0.37131719395183770960f,
+(float16_t)-0.93299279883473884567f,(float16_t)0.35989503653498833291f,
+(float16_t)-0.93733901191257484875f,(float16_t)0.34841868024943478677f,
+(float16_t)-0.94154406518302069529f,(float16_t)0.33688985339222032867f,
+(float16_t)-0.94560732538052116869f,(float16_t)0.32531029216226325929f,
+(float16_t)-0.94952818059303667475f,(float16_t)0.31368174039889140658f,
+(float16_t)-0.95330604035419386211f,(float16_t)0.30200594931922802866f,
+(float16_t)-0.95694033573220882438f,(float16_t)0.29028467725446238656f,
+(float16_t)-0.96043051941556578655f,(float16_t)0.27851968938505317075f,
+(float16_t)-0.96377606579543984022f,(float16_t)0.26671275747489847641f,
+(float16_t)-0.96697647104485207059f,(float16_t)0.25486565960451468271f,
+(float16_t)-0.97003125319454397424f,(float16_t)0.24298017990326406523f,
+(float16_t)-0.97293995220556006576f,(float16_t)0.23105810828067133156f,
+(float16_t)-0.97570213003852845901f,(float16_t)0.21910124015687004739f,
+(float16_t)-0.97831737071962754371f,(float16_t)0.20711137619221883788f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.98310548743121628501f,(float16_t)0.18303988795514089527f,
+(float16_t)-0.98527764238894122162f,(float16_t)0.17096188876030121717f,
+(float16_t)-0.98730141815785843473f,(float16_t)0.15885814333386147346f,
+(float16_t)-0.98917650996478101444f,(float16_t)0.14673047445536180344f,
+(float16_t)-0.99090263542778000971f,(float16_t)0.13458070850712627875f,
+(float16_t)-0.99247953459870996706f,(float16_t)0.12241067519921634832f,
+(float16_t)-0.99390697000235606051f,(float16_t)0.11022220729388323979f,
+(float16_t)-0.99518472667219681771f,(float16_t)0.09801714032956082567f,
+(float16_t)-0.99631261218277800129f,(float16_t)0.08579731234444015753f,
+(float16_t)-0.99729045667869020697f,(float16_t)0.07356456359966773162f,
+(float16_t)-0.99811811290014917919f,(float16_t)0.06132073630220848809f,
+(float16_t)-0.99879545620517240501f,(float16_t)0.04906767432741796636f,
+(float16_t)-0.99932238458834954375f,(float16_t)0.03680722294135883171f,
+(float16_t)-0.99969881869620424997f,(float16_t)0.02454122852291232629f,
+(float16_t)-0.99992470183914450299f,(float16_t)0.01227153828572000692f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99879545620517240501f,(float16_t)0.04906767432741801493f,
+(float16_t)0.99518472667219692873f,(float16_t)0.09801714032956060363f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.97003125319454397424f,(float16_t)0.24298017990326387094f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.94154406518302080631f,(float16_t)0.33688985339222005111f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.88192126434835504956f,(float16_t)0.47139673682599764204f,
+(float16_t)0.85772861000027211809f,(float16_t)0.51410274419322166128f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.80320753148064494287f,(float16_t)0.59569930449243335691f,
+(float16_t)0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.67155895484701833009f,(float16_t)0.74095112535495910588f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.59569930449243346793f,(float16_t)0.80320753148064483184f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.47139673682599780857f,(float16_t)0.88192126434835493853f,
+(float16_t)0.42755509343028219593f,(float16_t)0.90398929312344333820f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.33688985339222005111f,(float16_t)0.94154406518302080631f,
+(float16_t)0.29028467725446233105f,(float16_t)0.95694033573220893540f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.14673047445536174793f,(float16_t)0.98917650996478101444f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.04906767432741812596f,(float16_t)0.99879545620517240501f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.04906767432741800800f,(float16_t)0.99879545620517240501f,
+(float16_t)-0.09801714032956064526f,(float16_t)0.99518472667219692873f,
+(float16_t)-0.14673047445536163691f,(float16_t)0.98917650996478101444f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.24298017990326387094f,(float16_t)0.97003125319454397424f,
+(float16_t)-0.29028467725446216452f,(float16_t)0.95694033573220893540f,
+(float16_t)-0.33688985339221994009f,(float16_t)0.94154406518302080631f,
+(float16_t)-0.38268343236508972627f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.42755509343028186287f,(float16_t)0.90398929312344344922f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.51410274419322155026f,(float16_t)0.85772861000027211809f,
+(float16_t)-0.55557023301960195560f,(float16_t)0.83146961230254534669f,
+(float16_t)-0.59569930449243335691f,(float16_t)0.80320753148064494287f,
+(float16_t)-0.63439328416364537677f,(float16_t)0.77301045336273710440f,
+(float16_t)-0.67155895484701844111f,(float16_t)0.74095112535495899486f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.74095112535495888384f,(float16_t)0.67155895484701855214f,
+(float16_t)-0.77301045336273699338f,(float16_t)0.63439328416364548779f,
+(float16_t)-0.80320753148064483184f,(float16_t)0.59569930449243346793f,
+(float16_t)-0.83146961230254534669f,(float16_t)0.55557023301960217765f,
+(float16_t)-0.85772861000027200706f,(float16_t)0.51410274419322177231f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.90398929312344333820f,(float16_t)0.42755509343028202940f,
+(float16_t)-0.92387953251128673848f,(float16_t)0.38268343236508989280f,
+(float16_t)-0.94154406518302069529f,(float16_t)0.33688985339222032867f,
+(float16_t)-0.95694033573220882438f,(float16_t)0.29028467725446238656f,
+(float16_t)-0.97003125319454397424f,(float16_t)0.24298017990326406523f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.98917650996478101444f,(float16_t)0.14673047445536180344f,
+(float16_t)-0.99518472667219681771f,(float16_t)0.09801714032956082567f,
+(float16_t)-0.99879545620517240501f,(float16_t)0.04906767432741796636f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.98078528040323043058f,(float16_t)0.19509032201612824808f,
+(float16_t)0.92387953251128673848f,(float16_t)0.38268343236508978178f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.55557023301960228867f,(float16_t)0.83146961230254523567f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.19509032201612833135f,(float16_t)0.98078528040323043058f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.38268343236508972627f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.55557023301960195560f,(float16_t)0.83146961230254534669f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.83146961230254534669f,(float16_t)0.55557023301960217765f,
+(float16_t)-0.92387953251128673848f,(float16_t)0.38268343236508989280f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.70710678118654757274f,(float16_t)0.70710678118654757274f,
+(float16_t)0.00000000000000006123f,(float16_t)1.00000000000000000000f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,};
+
+float16_t rearranged_twiddle_stride3_4096_f16[2728]={
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99998941108192840321f,(float16_t)0.00460192612044857050f,
+(float16_t)0.99995764455196389786f,(float16_t)0.00920375478205981944f,
+(float16_t)0.99990470108285289808f,(float16_t)0.01380538852806039059f,
+(float16_t)0.99983058179582340319f,(float16_t)0.01840672990580482019f,
+(float16_t)0.99973528826056168306f,(float16_t)0.02300768146883936868f,
+(float16_t)0.99961882249517863830f,(float16_t)0.02760814577896573974f,
+(float16_t)0.99948118696616694567f,(float16_t)0.03220802540830458582f,
+(float16_t)0.99932238458834954375f,(float16_t)0.03680722294135883171f,
+(float16_t)0.99914241872481690532f,(float16_t)0.04140564097707673946f,
+(float16_t)0.99894129318685687124f,(float16_t)0.04600318213091462299f,
+(float16_t)0.99871901223387293811f,(float16_t)0.05059974903689928166f,
+(float16_t)0.99847558057329477421f,(float16_t)0.05519524434968993420f,
+(float16_t)0.99821100336047818846f,(float16_t)0.05978957074663986820f,
+(float16_t)0.99792528619859599548f,(float16_t)0.06438263092985746505f,
+(float16_t)0.99761843513851955478f,(float16_t)0.06897432762826674613f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.99694135776498216117f,(float16_t)0.07815324163279423197f,
+(float16_t)0.99657114579055483539f,(float16_t)0.08274026454937569164f,
+(float16_t)0.99617982859569698117f,(float16_t)0.08732553520619205922f,
+(float16_t)0.99576741446765981713f,(float16_t)0.09190895649713272386f,
+(float16_t)0.99533391214048227980f,(float16_t)0.09649043135525259274f,
+(float16_t)0.99487933079480561638f,(float16_t)0.10106986275482782167f,
+(float16_t)0.99440368005767909576f,(float16_t)0.10564715371341061589f,
+(float16_t)0.99390697000235606051f,(float16_t)0.11022220729388305938f,
+(float16_t)0.99338921114808065305f,(float16_t)0.11479492660651008373f,
+(float16_t)0.99285041445986510489f,(float16_t)0.11936521481099135467f,
+(float16_t)0.99229059134825736699f,(float16_t)0.12393297511851215920f,
+(float16_t)0.99170975366909952520f,(float16_t)0.12849811079379316880f,
+(float16_t)0.99110791372327688986f,(float16_t)0.13306052515713906459f,
+(float16_t)0.99048508425645709341f,(float16_t)0.13762012158648603832f,
+(float16_t)0.98984127845882052821f,(float16_t)0.14217680351944803063f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98849079285269658701f,(float16_t)0.15128103795733022219f,
+(float16_t)0.98778414164457217783f,(float16_t)0.15582839765426523271f,
+(float16_t)0.98705657130575097380f,(float16_t)0.16037245724292828464f,
+(float16_t)0.98630809724459866938f,(float16_t)0.16491312048996989437f,
+(float16_t)0.98553873531217606185f,(float16_t)0.16945029123396795900f,
+(float16_t)0.98474850180190420801f,(float16_t)0.17398387338746382214f,
+(float16_t)0.98393741344921892278f,(float16_t)0.17851377093899750692f,
+(float16_t)0.98310548743121628501f,(float16_t)0.18303988795514095078f,
+(float16_t)0.98225274136628937249f,(float16_t)0.18756212858252960252f,
+(float16_t)0.98137919331375456089f,(float16_t)0.19208039704989243734f,
+(float16_t)0.98048486177346938497f,(float16_t)0.19659459767008022335f,
+(float16_t)0.97956976568544051887f,(float16_t)0.20110463484209190055f,
+(float16_t)0.97863392442942320759f,(float16_t)0.20561041305309923910f,
+(float16_t)0.97767735782450992943f,(float16_t)0.21011183688046961016f,
+(float16_t)0.97670008612871184184f,(float16_t)0.21460881099378675829f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.97468351068851066810f,(float16_t)0.22358902922978998729f,
+(float16_t)0.97364424965081197705f,(float16_t)0.22807208317088573102f,
+(float16_t)0.97258436893473221296f,(float16_t)0.23255030703877524467f,
+(float16_t)0.97150389098625178352f,(float16_t)0.23702360599436719801f,
+(float16_t)0.97040283868755550234f,(float16_t)0.24149188530286933019f,
+(float16_t)0.96928123535654853171f,(float16_t)0.24595505033579459497f,
+(float16_t)0.96813910474636244441f,(float16_t)0.25041300657296522436f,
+(float16_t)0.96697647104485207059f,(float16_t)0.25486565960451457169f,
+(float16_t)0.96579335887408368500f,(float16_t)0.25931291513288623474f,
+(float16_t)0.96458979328981275803f,(float16_t)0.26375467897483134694f,
+(float16_t)0.96336579978095404631f,(float16_t)0.26819085706340317632f,
+(float16_t)0.96212140426904158019f,(float16_t)0.27262135544994897662f,
+(float16_t)0.96085663310767965850f,(float16_t)0.27704608030609989555f,
+(float16_t)0.95957151308198451733f,(float16_t)0.28146493792575794091f,
+(float16_t)0.95826607140801767226f,(float16_t)0.28587783472708061527f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.95559433413077110586f,(float16_t)0.29468537218051432669f,
+(float16_t)0.95422809510910566733f,(float16_t)0.29907982630804047508f,
+(float16_t)0.95284164760119871573f,(float16_t)0.30346794657201131562f,
+(float16_t)0.95143502096900833820f,(float16_t)0.30784964004153486661f,
+(float16_t)0.95000824500184299914f,(float16_t)0.31222481392182488413f,
+(float16_t)0.94856134991573026749f,(float16_t)0.31659337555616584581f,
+(float16_t)0.94709436635277721717f,(float16_t)0.32095523242787521445f,
+(float16_t)0.94560732538052127971f,(float16_t)0.32531029216226292622f,
+(float16_t)0.94410025849127265918f,(float16_t)0.32965846252858749255f,
+(float16_t)0.94257319760144686605f,(float16_t)0.33399965144200938205f,
+(float16_t)0.94102617505088925753f,(float16_t)0.33833376696554112728f,
+(float16_t)0.93945922360218991898f,(float16_t)0.34266071731199437833f,
+(float16_t)0.93787237643998988545f,(float16_t)0.34698041084592368133f,
+(float16_t)0.93626566717027825959f,(float16_t)0.35129275608556709276f,
+(float16_t)0.93463912981968078064f,(float16_t)0.35559766170478385172f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.93132670908118042608f,(float16_t)0.36418478956707989180f,
+(float16_t)0.92964089584318121418f,(float16_t)0.36846682995337232125f,
+(float16_t)0.92793539482261788720f,(float16_t)0.37274106700951575855f,
+(float16_t)0.92621024213831137928f,(float16_t)0.37700741021641825945f,
+(float16_t)0.92446547432526260391f,(float16_t)0.38126576922216237620f,
+(float16_t)0.92270112833387862850f,(float16_t)0.38551605384391884890f,
+(float16_t)0.92091724152918941204f,(float16_t)0.38975817406985641123f,
+(float16_t)0.91911385169005777040f,(float16_t)0.39399204006104809883f,
+(float16_t)0.91729099700837790632f,(float16_t)0.39821756215337356100f,
+(float16_t)0.91544871608826783316f,(float16_t)0.40243465085941843018f,
+(float16_t)0.91358704794525080750f,(float16_t)0.40664321687036902864f,
+(float16_t)0.91170603200542987832f,(float16_t)0.41084317105790391089f,
+(float16_t)0.90980570810465222209f,(float16_t)0.41503442447608163146f,
+(float16_t)0.90788611648766626150f,(float16_t)0.41921688836322390515f,
+(float16_t)0.90594729780726845902f,(float16_t)0.42339047414379604728f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.90201214390249317976f,(float16_t)0.43171065802505725895f,
+(float16_t)0.90001589201616016833f,(float16_t)0.43585707992225547480f,
+(float16_t)0.89800057974073987932f,(float16_t)0.43999427130963325583f,
+(float16_t)0.89596624975618521791f,(float16_t)0.44412214457042920035f,
+(float16_t)0.89391294514520325265f,(float16_t)0.44824061228521988598f,
+(float16_t)0.89184070939234272313f,(float16_t)0.45234958723377088896f,
+(float16_t)0.88974958638307277692f,(float16_t)0.45644898239688391772f,
+(float16_t)0.88763962040285393496f,(float16_t)0.46053871095824000514f,
+(float16_t)0.88551085613619995307f,(float16_t)0.46461868630623781584f,
+(float16_t)0.88336333866573157891f,(float16_t)0.46868882203582790114f,
+(float16_t)0.88119711347122209322f,(float16_t)0.47274903195034279069f,
+(float16_t)0.87901222642863352519f,(float16_t)0.47679923006332208812f,
+(float16_t)0.87680872380914565145f,(float16_t)0.48083933060033395845f,
+(float16_t)0.87458665227817611321f,(float16_t)0.48486924800079106435f,
+(float16_t)0.87234605889439154058f,(float16_t)0.48888889691976317176f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.86780949676330332299f,(float16_t)0.49689704902265446895f,
+(float16_t)0.86551362409056908920f,(float16_t)0.50088538261124071482f,
+(float16_t)0.86319942171212415971f,(float16_t)0.50486310853126759035f,
+(float16_t)0.86086693863776730939f,(float16_t)0.50883014254310698909f,
+(float16_t)0.85851622426444273994f,(float16_t)0.51278640063356295542f,
+(float16_t)0.85614732837519447184f,(float16_t)0.51673179901764987321f,
+(float16_t)0.85376030113811141042f,(float16_t)0.52066625414036715735f,
+(float16_t)0.85135519310526519554f,(float16_t)0.52458968267846894928f,
+(float16_t)0.84893205521163961347f,(float16_t)0.52850200154222848337f,
+(float16_t)0.84649093877405212627f,(float16_t)0.53240312787719790144f,
+(float16_t)0.84403189549006640835f,(float16_t)0.53629297906596318235f,
+(float16_t)0.84155497743689844370f,(float16_t)0.54017147272989285423f,
+(float16_t)0.83906023707031274217f,(float16_t)0.54403852673088382019f,
+(float16_t)0.83654772722351200542f,(float16_t)0.54789405917310018967f,
+(float16_t)0.83401750110601813315f,(float16_t)0.55173798840470733573f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.82890411477186487499f,(float16_t)0.55939071185913613604f,
+(float16_t)0.82632106284566353427f,(float16_t)0.56319934401383409117f,
+(float16_t)0.82372051122739142759f,(float16_t)0.56699604882510867832f,
+(float16_t)0.82110251499110464835f,(float16_t)0.57078074588696725566f,
+(float16_t)0.81846712958029865792f,(float16_t)0.57455335504771576360f,
+(float16_t)0.81581441080673378075f,(float16_t)0.57831379641165558958f,
+(float16_t)0.81314441484925359394f,(float16_t)0.58206199034077543697f,
+(float16_t)0.81045719825259476821f,(float16_t)0.58579785745643886408f,
+(float16_t)0.80775281792619035848f,(float16_t)0.58952131864106394055f,
+(float16_t)0.80503133114296365758f,(float16_t)0.59323229503979979516f,
+(float16_t)0.80229279553811572168f,(float16_t)0.59693070806219639124f,
+(float16_t)0.79953726910790501314f,(float16_t)0.60061647938386897305f,
+(float16_t)0.79676481020841882774f,(float16_t)0.60428953094815596181f,
+(float16_t)0.79397547755433717231f,(float16_t)0.60794978496777363208f,
+(float16_t)0.79116933021769020318f,(float16_t)0.61159716392646190641f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.78550682956405393220f,(float16_t)0.61885298796097631957f,
+(float16_t)0.78265059616657572938f,(float16_t)0.62246127937414996723f,
+(float16_t)0.77977778792301455368f,(float16_t)0.62605638840434352232f,
+(float16_t)0.77688846567323244230f,(float16_t)0.62963823891492698426f,
+(float16_t)0.77398269060682289844f,(float16_t)0.63320675505005719064f,
+(float16_t)0.77106052426181381776f,(float16_t)0.63676186123628419899f,
+(float16_t)0.76812202852336541881f,(float16_t)0.64030348218415167327f,
+(float16_t)0.76516726562245895860f,(float16_t)0.64383154288979138613f,
+(float16_t)0.76219629813457900891f,(float16_t)0.64734596863651205911f,
+(float16_t)0.75920918897838796102f,(float16_t)0.65084668499638087535f,
+(float16_t)0.75620600141439453523f,(float16_t)0.65433361783180044036f,
+(float16_t)0.75318679904361252042f,(float16_t)0.65780669329707863735f,
+(float16_t)0.75015164580621507273f,(float16_t)0.66126583783999226540f,
+(float16_t)0.74710060598018013245f,(float16_t)0.66471097820334479334f,
+(float16_t)0.74403374417992929057f,(float16_t)0.66814204142651845153f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.73785281478846598269f,(float16_t)0.67496164610201192513f,
+(float16_t)0.73473887809596349907f,(float16_t)0.67835004312986146857f,
+(float16_t)0.73160938122389262972f,(float16_t)0.68172407417164970767f,
+(float16_t)0.72846439044822519637f,(float16_t)0.68508366777270035541f,
+(float16_t)0.72530397237306076796f,(float16_t)0.68842875278409043638f,
+(float16_t)0.72212819392921534511f,(float16_t)0.69175925836415774750f,
+(float16_t)0.71893712237280449351f,(float16_t)0.69507511398000088043f,
+(float16_t)0.71573082528381870571f,(float16_t)0.69837624940897280457f,
+(float16_t)0.71250937056469243469f,(float16_t)0.70166259474016845488f,
+(float16_t)0.70927282643886568891f,(float16_t)0.70493408037590488124f,
+(float16_t)0.70602126144933974317f,(float16_t)0.70819063703319540259f,
+(float16_t)0.70275474445722529993f,(float16_t)0.71143219574521643356f,
+(float16_t)0.69947334464028376733f,(float16_t)0.71465868786276909308f,
+(float16_t)0.69617713149146298601f,(float16_t)0.71787004505573170920f,
+(float16_t)0.69286617481742474034f,(float16_t)0.72106619931450810501f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.68620031168003858824f,(float16_t)0.72741262860237576593f,
+(float16_t)0.68284554638524808112f,(float16_t)0.73056276922782759087f,
+(float16_t)0.67947631989936496666f,(float16_t)0.73369743811466026084f,
+(float16_t)0.67609270357531603413f,(float16_t)0.73681656887736979300f,
+(float16_t)0.67269476907077296879f,(float16_t)0.73992009545951609173f,
+(float16_t)0.66928258834663600929f,(float16_t)0.74300795213512171866f,
+(float16_t)0.66585623366550972246f,(float16_t)0.74608007351006366825f,
+(float16_t)0.66241577759017178373f,(float16_t)0.74913639452345925918f,
+(float16_t)0.65896129298203731661f,(float16_t)0.75217685044904269986f,
+(float16_t)0.65549285299961546070f,(float16_t)0.75520137689653654700f,
+(float16_t)0.65201053109695950027f,(float16_t)0.75820990981301528144f,
+(float16_t)0.64851440102211255212f,(float16_t)0.76120238548426177871f,
+(float16_t)0.64500453681554403840f,(float16_t)0.76417874053611667406f,
+(float16_t)0.64148101280858316198f,(float16_t)0.76713891193582040007f,
+(float16_t)0.63794390362184416610f,(float16_t)0.77008283699334789674f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.63082922962842458148f,(float16_t)0.77592169904340757558f,
+(float16_t)0.62725181549514419377f,(float16_t)0.77881651238147586724f,
+(float16_t)0.62366111752569464155f,(float16_t)0.78169483207105938671f,
+(float16_t)0.62005721176328920663f,(float16_t)0.78455659715557524159f,
+(float16_t)0.61644017453085364622f,(float16_t)0.78740174702903131809f,
+(float16_t)0.61281008242940970820f,(float16_t)0.79023022143731003197f,
+(float16_t)0.60916701233645320634f,(float16_t)0.79304196047944364167f,
+(float16_t)0.60551104140432554512f,(float16_t)0.79583690460888345530f,
+(float16_t)0.60184224705858002658f,(float16_t)0.79861499463476082195f,
+(float16_t)0.59816070699634238395f,(float16_t)0.80137617172314012937f,
+(float16_t)0.59446649918466454299f,(float16_t)0.80412037739826569549f,
+(float16_t)0.59075970185887427544f,(float16_t)0.80684755354379922299f,
+(float16_t)0.58704039352091808013f,(float16_t)0.80955764240405125864f,
+(float16_t)0.58330865293769829094f,(float16_t)0.81225058658520388200f,
+(float16_t)0.57956455913940574387f,(float16_t)0.81492632905652662156f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.57203962932475704850f,(float16_t)0.82022598256943468620f,
+(float16_t)0.56825895267013148970f,(float16_t)0.82284978137582631685f,
+(float16_t)0.56446624152051949608f,(float16_t)0.82545615400437744036f,
+(float16_t)0.56066157619733603124f,(float16_t)0.82804504525775579626f,
+(float16_t)0.55684503727516010407f,(float16_t)0.83061640030884620334f,
+(float16_t)0.55301670558002757883f,(float16_t)0.83317016470191318511f,
+(float16_t)0.54917666218771976627f,(float16_t)0.83570628435375260423f,
+(float16_t)0.54532498842204646383f,(float16_t)0.83822470555483796772f,
+(float16_t)0.54146176585312355556f,(float16_t)0.84072537497045796151f,
+(float16_t)0.53758707629564550512f,(float16_t)0.84320823964184543620f,
+(float16_t)0.53370100180715296379f,(float16_t)0.84567324698729906540f,
+(float16_t)0.52980362468629482731f,(float16_t)0.84812034480329712149f,
+(float16_t)0.52589502747108474168f,(float16_t)0.85054948126560336874f,
+(float16_t)0.52197529293715438925f,(float16_t)0.85296060493036363059f,
+(float16_t)0.51804450409599933636f,(float16_t)0.85535366473519602870f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.51015009670676669806f,(float16_t)0.86008539042939025077f,
+(float16_t)0.50618664534515533937f,(float16_t)0.86242395611104050168f,
+(float16_t)0.50221247404571089934f,(float16_t)0.86474425751946237817f,
+(float16_t)0.49822766697278186854f,(float16_t)0.86704624551569264845f,
+(float16_t)0.49423230851595972846f,(float16_t)0.86932987134860673084f,
+(float16_t)0.49022648328829110387f,(float16_t)0.87159508665595109012f,
+(float16_t)0.48621027612448652899f,(float16_t)0.87384184346536675214f,
+(float16_t)0.48218377207912282989f,(float16_t)0.87607009419540660122f,
+(float16_t)0.47814705642484311987f,(float16_t)0.87827979165654146421f,
+(float16_t)0.47410021465055002254f,(float16_t)0.88047088905216075450f,
+(float16_t)0.47004333245959561971f,(float16_t)0.88264333997956279099f,
+(float16_t)0.46597649576796612569f,(float16_t)0.88479709843093778954f,
+(float16_t)0.46189979070246284243f,(float16_t)0.88693211879434208367f,
+(float16_t)0.45781330359887728587f,(float16_t)0.88904835585466457371f,
+(float16_t)0.45371712100016392544f,(float16_t)0.89114576479458318392f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.44549601651398174074f,(float16_t)0.89528392103855758410f,
+(float16_t)0.44137126873171661501f,(float16_t)0.89732458070541831763f,
+(float16_t)0.43723717366104419835f,(float16_t)0.89934623697934146236f,
+(float16_t)0.43309381885315201277f,(float16_t)0.90134884704602202810f,
+(float16_t)0.42894129205532954829f,(float16_t)0.90333236849451181705f,
+(float16_t)0.42477968120910880589f,(float16_t)0.90529675931811881551f,
+(float16_t)0.42060907444840250902f,(float16_t)0.90724197791529592738f,
+(float16_t)0.41642956009763731906f,(float16_t)0.90916798309052226923f,
+(float16_t)0.41224122666988299857f,(float16_t)0.91107473405517624965f,
+(float16_t)0.40804416286497874333f,(float16_t)0.91296219042839810154f,
+(float16_t)0.40383845756765412993f,(float16_t)0.91483031223794608611f,
+(float16_t)0.39962419984564678810f,(float16_t)0.91667905992104270485f,
+(float16_t)0.39540147894781629834f,(float16_t)0.91850839432521225181f,
+(float16_t)0.39117038430225398171f,(float16_t)0.92031827670911048322f,
+(float16_t)0.38693100551438869283f,(float16_t)0.92210866874334507237f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.37842775480876561511f,(float16_t)0.92563083050987271516f,
+(float16_t)0.37416406297145798909f,(float16_t)0.92736252565040111495f,
+(float16_t)0.36989244714893426691f,(float16_t)0.92907458125931574600f,
+(float16_t)0.36561299780477396482f,(float16_t)0.93076696107898371224f,
+(float16_t)0.36132580556845433906f,(float16_t)0.93243962926846235550f,
+(float16_t)0.35703096123343003310f,(float16_t)0.93409255040425887007f,
+(float16_t)0.35272855575521072646f,(float16_t)0.93572568948108036935f,
+(float16_t)0.34841868024943450921f,(float16_t)0.93733901191257495977f,
+(float16_t)0.34410142598993898044f,(float16_t)0.93893248353206448797f,
+(float16_t)0.33977688440682696225f,(float16_t)0.94050607059326829518f,
+(float16_t)0.33544514708453165852f,(float16_t)0.94205973977101731265f,
+(float16_t)0.33110630575987642921f,(float16_t)0.94359345816196038559f,
+(float16_t)0.32676045232013178898f,(float16_t)0.94510719328526060501f,
+(float16_t)0.32240767880107001897f,(float16_t)0.94660091308328353499f,
+(float16_t)0.31804807738501505998f,(float16_t)0.94807458592227622507f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.30930876031226878231f,(float16_t)0.95096166631157508231f,
+(float16_t)0.30492922973540242948f,(float16_t)0.95237501271976587880f,
+(float16_t)0.30054324141727339903f,(float16_t)0.95376818988599032512f,
+(float16_t)0.29615088824362395536f,(float16_t)0.95514116830577067141f,
+(float16_t)0.29175226323498937298f,(float16_t)0.95649391890239499059f,
+(float16_t)0.28734745954472956653f,(float16_t)0.95782641302753290802f,
+(float16_t)0.28293657045705539188f,(float16_t)0.95913862246184189431f,
+(float16_t)0.27851968938505305973f,(float16_t)0.96043051941556578655f,
+(float16_t)0.27409690986870632878f,(float16_t)0.96170207652912254037f,
+(float16_t)0.26966832557291520178f,(float16_t)0.96295326687368387741f,
+(float16_t)0.26523403028551190141f,(float16_t)0.96418406395174571788f,
+(float16_t)0.26079411791527556952f,(float16_t)0.96539444169768939830f,
+(float16_t)0.25634868248994291395f,(float16_t)0.96658437447833311928f,
+(float16_t)0.25189781815421691258f,(float16_t)0.96775383709347551076f,
+(float16_t)0.24744161916777343557f,(float16_t)0.96890280477642887202f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.23851359484431849944f,(float16_t)0.97113915844972509284f,
+(float16_t)0.23404195858354345794f,(float16_t)0.97222649707893626925f,
+(float16_t)0.22956536582051886852f,(float16_t)0.97329324605469824672f,
+(float16_t)0.22508391135979277653f,(float16_t)0.97433938278557585821f,
+(float16_t)0.22059769010887364526f,(float16_t)0.97536488511665686563f,
+(float16_t)0.21610679707621960333f,(float16_t)0.97636973133002114000f,
+(float16_t)0.21161132736922760866f,(float16_t)0.97735390014519996082f,
+(float16_t)0.20711137619221856032f,(float16_t)0.97831737071962765473f,
+(float16_t)0.20260703884442110567f,(float16_t)0.97926012264908202098f,
+(float16_t)0.19809841071795372680f,(float16_t)0.98018213596811731847f,
+(float16_t)0.19358558729580374602f,(float16_t)0.98108339115048659451f,
+(float16_t)0.18906866414980627589f,(float16_t)0.98196386910955524296f,
+(float16_t)0.18454773693861964423f,(float16_t)0.98282355119870523641f,
+(float16_t)0.18002290140569951471f,(float16_t)0.98366241921173025453f,
+(float16_t)0.17549425337727139751f,(float16_t)0.98448045538322093151f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.16642590354046421508f,(float16_t)0.98605396334619543897f,
+(float16_t)0.16188639378011188130f,(float16_t)0.98680940181418541624f,
+(float16_t)0.15734345561623827581f,(float16_t)0.98754394179435922574f,
+(float16_t)0.15279718525844340760f,(float16_t)0.98825756773074946437f,
+(float16_t)0.14824767898689619749f,(float16_t)0.98895026451030298986f,
+(float16_t)0.14369503315029458212f,(float16_t)0.98962201746320077600f,
+(float16_t)0.13913934416382628401f,(float16_t)0.99027281236316910817f,
+(float16_t)0.13458070850712622324f,(float16_t)0.99090263542778000971f,
+(float16_t)0.13001922272223334631f,(float16_t)0.99151147331874389668f,
+(float16_t)0.12545498341154620592f,(float16_t)0.99209931314219179654f,
+(float16_t)0.12088808723577722237f,(float16_t)0.99266614244894801899f,
+(float16_t)0.11631863091190487725f,(float16_t)0.99321194923479450001f,
+(float16_t)0.11174671121112665639f,(float16_t)0.99373672194072459884f,
+(float16_t)0.10717242495680887049f,(float16_t)0.99424044945318790223f,
+(float16_t)0.10259586902243628126f,(float16_t)0.99472312110432570265f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.09343633584574791151f,(float16_t)0.99562525638099430569f,
+(float16_t)0.08885355258252468358f,(float16_t)0.99604470090125196702f,
+(float16_t)0.08426888759332412659f,(float16_t)0.99644305135004263008f,
+(float16_t)0.07968243797143012563f,(float16_t)0.99682029929116566791f,
+(float16_t)0.07509430084792129145f,(float16_t)0.99717643673532618820f,
+(float16_t)0.07050457338961400866f,(float16_t)0.99751145614030345410f,
+(float16_t)0.06591335279700392957f,(float16_t)0.99782535041111164453f,
+(float16_t)0.06132073630220864768f,(float16_t)0.99811811290014917919f,
+(float16_t)0.05672682116690778292f,(float16_t)0.99838973740734016094f,
+(float16_t)0.05213170468028331672f,(float16_t)0.99864021818026527111f,
+(float16_t)0.04753548415695926094f,(float16_t)0.99886954991428356099f,
+(float16_t)0.04293825693494095902f,(float16_t)0.99907772775264536147f,
+(float16_t)0.03834012037355279123f,(float16_t)0.99926474728659442359f,
+(float16_t)0.03374117185137764235f,(float16_t)0.99943060455546173237f,
+(float16_t)0.02914150876419373953f,(float16_t)0.99957529604674921764f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)0.01994042855151459750f,(float16_t)0.99980116988788425569f,
+(float16_t)0.01533920628498821985f,(float16_t)0.99988234745421256111f,
+(float16_t)0.01073765916726457208f,(float16_t)0.99994234967602391162f,
+(float16_t)0.00613588464915451517f,(float16_t)0.99998117528260110909f,
+(float16_t)0.00153398018628476615f,(float16_t)0.99999882345170187925f,
+(float16_t)-0.00306795676296601561f,(float16_t)0.99999529380957619118f,
+(float16_t)-0.00766982873953095477f,(float16_t)0.99997058643097413988f,
+(float16_t)-0.01227153828571982304f,(float16_t)0.99992470183914450299f,
+(float16_t)-0.01687298794728165144f,(float16_t)0.99985764100582386060f,
+(float16_t)-0.02147408027546948359f,(float16_t)0.99976940535121527898f,
+(float16_t)-0.02607471782910391472f,(float16_t)0.99965999674395922270f,
+(float16_t)-0.03067480317663645942f,(float16_t)0.99952941750109314256f,
+(float16_t)-0.03527423889821382219f,(float16_t)0.99937767038800284780f,
+(float16_t)-0.03987292758773972740f,(float16_t)0.99920475861836388631f,
+(float16_t)-0.04447077185493861912f,(float16_t)0.99901068585407337697f,
+(float16_t)-0.04906767432741800800f,(float16_t)0.99879545620517240501f,
+(float16_t)-0.05366353765273055437f,(float16_t)0.99855907422975931365f,
+(float16_t)-0.05825826450043560673f,(float16_t)0.99830154493389289261f,
+(float16_t)-0.06285175756416130910f,(float16_t)0.99802287377148624081f,
+(float16_t)-0.06744391956366398155f,(float16_t)0.99772306664419163624f,
+(float16_t)-0.07203465324688929083f,(float16_t)0.99740212990127530279f,
+(float16_t)-0.07662386139203150592f,(float16_t)0.99706007033948296225f,
+(float16_t)-0.08121144680959226092f,(float16_t)0.99669689520289606044f,
+(float16_t)-0.08579731234443975507f,(float16_t)0.99631261218277800129f,
+(float16_t)-0.09038136087786488582f,(float16_t)0.99590722941741172125f,
+(float16_t)-0.09496349532963895002f,(float16_t)0.99548075549192693856f,
+(float16_t)-0.09954361866006931903f,(float16_t)0.99503319943811863180f,
+(float16_t)-0.10412163387205460030f,(float16_t)0.99456457073425541537f,
+(float16_t)-0.10869744401313856386f,(float16_t)0.99407487930487947736f,
+(float16_t)-0.11327095217756423529f,(float16_t)0.99356413552059530403f,
+(float16_t)-0.11784206150832489401f,(float16_t)0.99303235019785141002f,
+(float16_t)-0.12241067519921615403f,(float16_t)0.99247953459870996706f,
+(float16_t)-0.12697669649688586579f,(float16_t)0.99190570043060932726f,
+(float16_t)-0.13154002870288314386f,(float16_t)0.99131085984611544415f,
+(float16_t)-0.13610057517570606223f,(float16_t)0.99069502544266463406f,
+(float16_t)-0.14065823933284912761f,(float16_t)0.99005821026229712256f,
+(float16_t)-0.14521292465284740825f,(float16_t)0.98940042779138037687f,
+(float16_t)-0.14976453467732150915f,(float16_t)0.98872169196032377858f,
+(float16_t)-0.15431297301302013270f,(float16_t)0.98802201714328352633f,
+(float16_t)-0.15885814333386127917f,(float16_t)0.98730141815785843473f,
+(float16_t)-0.16339994938297311422f,(float16_t)0.98655991026477551920f,
+(float16_t)-0.16793829497473108936f,(float16_t)0.98579750916756747614f,
+(float16_t)-0.17247308399679592283f,(float16_t)0.98501423101223983814f,
+(float16_t)-0.17700422041214874946f,(float16_t)0.98421009238692902521f,
+(float16_t)-0.18153160826112502146f,(float16_t)0.98338511032155118130f,
+(float16_t)-0.18605515166344649414f,(float16_t)0.98253930228744124076f,
+(float16_t)-0.19057475482025265645f,(float16_t)0.98167268619698311305f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.19960175762113094300f,(float16_t)0.97987710369951763756f,
+(float16_t)-0.20410896609281689584f,(float16_t)0.97894817531906219710f,
+(float16_t)-0.20861185197826331850f,(float16_t)0.97799851493455713936f,
+(float16_t)-0.21311031991609125091f,(float16_t)0.97702814265775439484f,
+(float16_t)-0.21760427463848355800f,(float16_t)0.97603707903903913490f,
+(float16_t)-0.22209362097320348162f,(float16_t)0.97502534506699412020f,
+(float16_t)-0.22657826384560997290f,(float16_t)0.97399296216795583359f,
+(float16_t)-0.23105810828067113727f,(float16_t)0.97293995220556017678f,
+(float16_t)-0.23553305940497534787f,(float16_t)0.97186633748027939639f,
+(float16_t)-0.24000302244874138768f,(float16_t)0.97077214072895035013f,
+(float16_t)-0.24446790274782409513f,(float16_t)0.96965738512429244800f,
+(float16_t)-0.24892760574572012078f,(float16_t)0.96852209427441737777f,
+(float16_t)-0.25338203699557015902f,(float16_t)0.96736629222232850545f,
+(float16_t)-0.25783110216215882060f,(float16_t)0.96619000344541261516f,
+(float16_t)-0.26227470702391347812f,(float16_t)0.96499325285492043580f,
+(float16_t)-0.26671275747489830987f,(float16_t)0.96377606579543984022f,
+(float16_t)-0.27114515952680795507f,(float16_t)0.96253846804435916340f,
+(float16_t)-0.27557181931095814376f,(float16_t)0.96128048581132063966f,
+(float16_t)-0.27999264308027327353f,(float16_t)0.96000214573766584625f,
+(float16_t)-0.28440753721127171039f,(float16_t)0.95870347489587159906f,
+(float16_t)-0.28881640820604936870f,(float16_t)0.95738450078897596729f,
+(float16_t)-0.29321916269425857271f,(float16_t)0.95604525134999651659f,
+(float16_t)-0.29761570743508619641f,(float16_t)0.95468575494133833814f,
+(float16_t)-0.30200594931922808417f,(float16_t)0.95330604035419386211f,
+(float16_t)-0.30638979537086097338f,(float16_t)0.95190613680793234597f,
+(float16_t)-0.31076715274961136393f,(float16_t)0.95048607394948181337f,
+(float16_t)-0.31513792875252233383f,(float16_t)0.94904588185270055689f,
+(float16_t)-0.31950203081601563637f,(float16_t)0.94758559101774120226f,
+(float16_t)-0.32385936651785285356f,(float16_t)0.94610523237040344835f,
+(float16_t)-0.32820984357909255280f,(float16_t)0.94460483726148025685f,
+(float16_t)-0.33255336986604405736f,(float16_t)0.94308443746609349478f,
+(float16_t)-0.33688985339221994009f,(float16_t)0.94154406518302080631f,
+(float16_t)-0.34121920232028229991f,(float16_t)0.93998375303401404679f,
+(float16_t)-0.34554132496398903829f,(float16_t)0.93840353406310816897f,
+(float16_t)-0.34985612979013491763f,(float16_t)0.93680344173592156043f,
+(float16_t)-0.35416352542049039931f,(float16_t)0.93518350993894761025f,
+(float16_t)-0.35846342063373642928f,(float16_t)0.93354377297883628373f,
+(float16_t)-0.36275572436739711435f,(float16_t)0.93188426558166814750f,
+(float16_t)-0.36704034571976712487f,(float16_t)0.93020502289221906889f,
+(float16_t)-0.37131719395183748755f,(float16_t)0.92850608047321558924f,
+(float16_t)-0.37558617848921721505f,(float16_t)0.92678747430458174872f,
+(float16_t)-0.37984720892405099413f,(float16_t)0.92504924078267769527f,
+(float16_t)-0.38410019501693493105f,(float16_t)0.92329141671952774661f,
+(float16_t)-0.38834504669882619066f,(float16_t)0.92151403934204201285f,
+(float16_t)-0.39258167407295141427f,(float16_t)0.91971714629122736095f,
+(float16_t)-0.39680998741671030805f,(float16_t)0.91790077562139049672f,
+(float16_t)-0.40102989718357567872f,(float16_t)0.91606496579933172075f,
+(float16_t)-0.40524131400498974998f,(float16_t)0.91420975570353069095f,
+(float16_t)-0.40944414869225753684f,(float16_t)0.91233518462332285903f,
+(float16_t)-0.41363831223843450235f,(float16_t)0.91044129225806724737f,
+(float16_t)-0.41782371582021227141f,(float16_t)0.90852811871630612117f,
+(float16_t)-0.42200027079979968159f,(float16_t)0.90659570451491533483f,
+(float16_t)-0.42616788872679967071f,(float16_t)0.90464409057824612947f,
+(float16_t)-0.43032648134008272267f,(float16_t)0.90267331823725871498f,
+(float16_t)-0.43447596056965581690f,(float16_t)0.90068342922864685907f,
+(float16_t)-0.43861623853852738097f,(float16_t)0.89867446569395392775f,
+(float16_t)-0.44274722756456980077f,(float16_t)0.89664647017868026602f,
+(float16_t)-0.44686884016237399253f,(float16_t)0.89459948563138280697f,
+(float16_t)-0.45098098904510369733f,(float16_t)0.89253355540276468894f,
+(float16_t)-0.45508358712634372489f,(float16_t)0.89044872324475798919f,
+(float16_t)-0.45917654752194403400f,(float16_t)0.88834503330959635470f,
+(float16_t)-0.46325978355186014923f,(float16_t)0.88622253014888063838f,
+(float16_t)-0.46733320874198841510f,(float16_t)0.88408125871263498752f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.47545028174715592284f,(float16_t)0.87974259280004740713f,
+(float16_t)-0.47949375766015311928f,(float16_t)0.87754529020726124156f,
+(float16_t)-0.48352707893291846375f,(float16_t)0.87532940310411100349f,
+(float16_t)-0.48755016014843571837f,(float16_t)0.87309497841829020182f,
+(float16_t)-0.49156291610654972990f,(float16_t)0.87084206347007897531f,
+(float16_t)-0.49556526182577237405f,(float16_t)0.86857070597134100609f,
+(float16_t)-0.49955711254508178287f,(float16_t)0.86628095402451310569f,
+(float16_t)-0.50353838372571746440f,(float16_t)0.86397285612158680745f,
+(float16_t)-0.50750899105297075931f,(float16_t)0.86164646114308141023f,
+(float16_t)-0.51146885043797041259f,(float16_t)0.85930181835700847337f,
+(float16_t)-0.51541787801946303826f,(float16_t)0.85693897741782865118f,
+(float16_t)-0.51935599016558964269f,(float16_t)0.85455798836540053376f,
+(float16_t)-0.52328310347565654137f,(float16_t)0.85215890162391971785f,
+(float16_t)-0.52719913478190105760f,(float16_t)0.84974176800085265970f,
+(float16_t)-0.53110400115125477871f,(float16_t)0.84730663868585853749f,
+(float16_t)-0.53499761988709704230f,(float16_t)0.84485356524970722791f,
+(float16_t)-0.53887990853100831146f,(float16_t)0.84238259964318595863f,
+(float16_t)-0.54275078486451577842f,(float16_t)0.83989379419599952126f,
+(float16_t)-0.54661016691083474939f,(float16_t)0.83738720161566193578f,
+(float16_t)-0.55045797293660470029f,(float16_t)0.83486287498638012128f,
+(float16_t)-0.55429412145362011444f,(float16_t)0.83232086776792968408f,
+(float16_t)-0.55811853122055610221f,(float16_t)0.82976123379452304540f,
+(float16_t)-0.56193112124468946877f,(float16_t)0.82718402727366902027f,
+(float16_t)-0.56573181078361323149f,(float16_t)0.82458930278502517996f,
+(float16_t)-0.56952051934694725155f,(float16_t)0.82197711527924144370f,
+(float16_t)-0.57329716669804198226f,(float16_t)0.81934752007679712005f,
+(float16_t)-0.57706167285567933067f,(float16_t)0.81670057286682795628f,
+(float16_t)-0.58081395809576441547f,(float16_t)0.81403632970594852480f,
+(float16_t)-0.58455394295301521534f,(float16_t)0.81135484701706384048f,
+(float16_t)-0.58828154822264522306f,(float16_t)0.80865618158817509364f,
+(float16_t)-0.59199669496204088137f,(float16_t)0.80594039057117639047f,
+(float16_t)-0.59569930449243335691f,(float16_t)0.80320753148064494287f,
+(float16_t)-0.59938929840056454079f,(float16_t)0.80045766219262282082f,
+(float16_t)-0.60306659854034827539f,(float16_t)0.79769084094339104407f,
+(float16_t)-0.60673112703452458661f,(float16_t)0.79490712632823690154f,
+(float16_t)-0.61038280627630958630f,(float16_t)0.79210657730021227785f,
+(float16_t)-0.61402155893103815831f,(float16_t)0.78928925316888587371f,
+(float16_t)-0.61764730793780375784f,(float16_t)0.78645521359908587833f,
+(float16_t)-0.62125997651108744169f,(float16_t)0.78360451860963831194f,
+(float16_t)-0.62485948814238623239f,(float16_t)0.78073722857209459924f,
+(float16_t)-0.62844576660183260053f,(float16_t)0.77785340420945314754f,
+(float16_t)-0.63201873593980895105f,(float16_t)0.77495310659487393057f,
+(float16_t)-0.63557832048855611440f,(float16_t)0.77203639715038452351f,
+(float16_t)-0.63912444486377573138f,(float16_t)0.76910333764557958780f,
+(float16_t)-0.64265703396622686494f,(float16_t)0.76615399019631280630f,
+(float16_t)-0.64617601298331639459f,(float16_t)0.76318841726338115805f,
+(float16_t)-0.64968130739068330470f,(float16_t)0.76020668165120230952f,
+(float16_t)-0.65317284295377653347f,(float16_t)0.75720884650648467851f,
+(float16_t)-0.65665054572942882505f,(float16_t)0.75419497531688928227f,
+(float16_t)-0.66011434206742036768f,(float16_t)0.75116513190968658975f,
+(float16_t)-0.66356415861203965623f,(float16_t)0.74811938045040371481f,
+(float16_t)-0.66699992230363736034f,(float16_t)0.74505778544146605835f,
+(float16_t)-0.67042156038017308717f,(float16_t)0.74198041172083106787f,
+(float16_t)-0.67382900037875603783f,(float16_t)0.73888732446061522463f,
+(float16_t)-0.67722217013718044587f,(float16_t)0.73577858916571359238f,
+(float16_t)-0.68060099779545302212f,(float16_t)0.73265427167241281570f,
+(float16_t)-0.68396541179731551452f,(float16_t)0.72951443814699701296f,
+(float16_t)-0.68731534089175916336f,(float16_t)0.72635915508434589771f,
+(float16_t)-0.69065071413453438254f,(float16_t)0.72318848930652757101f,
+(float16_t)-0.69397146088965377952f,(float16_t)0.72000250796138176579f,
+(float16_t)-0.69727751083088640449f,(float16_t)0.71680127852109964959f,
+(float16_t)-0.70056879394324822474f,(float16_t)0.71358486878079363525f,
+(float16_t)-0.70384524052448482756f,(float16_t)0.71035334685706241764f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.71035334685706230662f,(float16_t)0.70384524052448504960f,
+(float16_t)-0.71358486878079352422f,(float16_t)0.70056879394324833576f,
+(float16_t)-0.71680127852109953857f,(float16_t)0.69727751083088651551f,
+(float16_t)-0.72000250796138165477f,(float16_t)0.69397146088965389055f,
+(float16_t)-0.72318848930652745999f,(float16_t)0.69065071413453460458f,
+(float16_t)-0.72635915508434578669f,(float16_t)0.68731534089175927438f,
+(float16_t)-0.72951443814699679091f,(float16_t)0.68396541179731562554f,
+(float16_t)-0.73265427167241270467f,(float16_t)0.68060099779545324417f,
+(float16_t)-0.73577858916571337033f,(float16_t)0.67722217013718055689f,
+(float16_t)-0.73888732446061511361f,(float16_t)0.67382900037875614885f,
+(float16_t)-0.74198041172083095685f,(float16_t)0.67042156038017319819f,
+(float16_t)-0.74505778544146594733f,(float16_t)0.66699992230363758239f,
+(float16_t)-0.74811938045040360379f,(float16_t)0.66356415861203976725f,
+(float16_t)-0.75116513190968636771f,(float16_t)0.66011434206742047870f,
+(float16_t)-0.75419497531688917125f,(float16_t)0.65665054572942904709f,
+(float16_t)-0.75720884650648467851f,(float16_t)0.65317284295377664449f,
+(float16_t)-0.76020668165120219850f,(float16_t)0.64968130739068341573f,
+(float16_t)-0.76318841726338115805f,(float16_t)0.64617601298331661663f,
+(float16_t)-0.76615399019631280630f,(float16_t)0.64265703396622708699f,
+(float16_t)-0.76910333764557947678f,(float16_t)0.63912444486377584241f,
+(float16_t)-0.77203639715038441249f,(float16_t)0.63557832048855622542f,
+(float16_t)-0.77495310659487381955f,(float16_t)0.63201873593980906207f,
+(float16_t)-0.77785340420945303652f,(float16_t)0.62844576660183271155f,
+(float16_t)-0.78073722857209448822f,(float16_t)0.62485948814238634341f,
+(float16_t)-0.78360451860963820092f,(float16_t)0.62125997651108755271f,
+(float16_t)-0.78645521359908576731f,(float16_t)0.61764730793780386886f,
+(float16_t)-0.78928925316888576269f,(float16_t)0.61402155893103838036f,
+(float16_t)-0.79210657730021216683f,(float16_t)0.61038280627630969732f,
+(float16_t)-0.79490712632823679051f,(float16_t)0.60673112703452469763f,
+(float16_t)-0.79769084094339093305f,(float16_t)0.60306659854034838641f,
+(float16_t)-0.80045766219262259877f,(float16_t)0.59938929840056465181f,
+(float16_t)-0.80320753148064483184f,(float16_t)0.59569930449243346793f,
+(float16_t)-0.80594039057117627944f,(float16_t)0.59199669496204099239f,
+(float16_t)-0.80865618158817498262f,(float16_t)0.58828154822264533408f,
+(float16_t)-0.81135484701706372945f,(float16_t)0.58455394295301532637f,
+(float16_t)-0.81403632970594841378f,(float16_t)0.58081395809576452649f,
+(float16_t)-0.81670057286682784525f,(float16_t)0.57706167285567944170f,
+(float16_t)-0.81934752007679700903f,(float16_t)0.57329716669804209328f,
+(float16_t)-0.82197711527924133268f,(float16_t)0.56952051934694747359f,
+(float16_t)-0.82458930278502506894f,(float16_t)0.56573181078361345353f,
+(float16_t)-0.82718402727366902027f,(float16_t)0.56193112124468957980f,
+(float16_t)-0.82976123379452293438f,(float16_t)0.55811853122055632426f,
+(float16_t)-0.83232086776792957306f,(float16_t)0.55429412145362022546f,
+(float16_t)-0.83486287498638001026f,(float16_t)0.55045797293660492233f,
+(float16_t)-0.83738720161566182476f,(float16_t)0.54661016691083497143f,
+(float16_t)-0.83989379419599952126f,(float16_t)0.54275078486451588944f,
+(float16_t)-0.84238259964318584760f,(float16_t)0.53887990853100842248f,
+(float16_t)-0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)-0.84730663868585842646f,(float16_t)0.53110400115125488973f,
+(float16_t)-0.84974176800085254868f,(float16_t)0.52719913478190127964f,
+(float16_t)-0.85215890162391960683f,(float16_t)0.52328310347565665239f,
+(float16_t)-0.85455798836540042274f,(float16_t)0.51935599016558975372f,
+(float16_t)-0.85693897741782865118f,(float16_t)0.51541787801946314929f,
+(float16_t)-0.85930181835700836235f,(float16_t)0.51146885043797052361f,
+(float16_t)-0.86164646114308129921f,(float16_t)0.50750899105297098135f,
+(float16_t)-0.86397285612158669643f,(float16_t)0.50353838372571757542f,
+(float16_t)-0.86628095402451299467f,(float16_t)0.49955711254508189390f,
+(float16_t)-0.86857070597134089507f,(float16_t)0.49556526182577254058f,
+(float16_t)-0.87084206347007886428f,(float16_t)0.49156291610654989643f,
+(float16_t)-0.87309497841829009079f,(float16_t)0.48755016014843588490f,
+(float16_t)-0.87532940310411089246f,(float16_t)0.48352707893291863028f,
+(float16_t)-0.87754529020726113053f,(float16_t)0.47949375766015328582f,
+(float16_t)-0.87974259280004729611f,(float16_t)0.47545028174715608937f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.88408125871263487650f,(float16_t)0.46733320874198858164f,
+(float16_t)-0.88622253014888052736f,(float16_t)0.46325978355186031576f,
+(float16_t)-0.88834503330959624368f,(float16_t)0.45917654752194420054f,
+(float16_t)-0.89044872324475787817f,(float16_t)0.45508358712634389143f,
+(float16_t)-0.89253355540276457791f,(float16_t)0.45098098904510386387f,
+(float16_t)-0.89459948563138269595f,(float16_t)0.44686884016237415906f,
+(float16_t)-0.89664647017868026602f,(float16_t)0.44274722756456996731f,
+(float16_t)-0.89867446569395392775f,(float16_t)0.43861623853852754751f,
+(float16_t)-0.90068342922864674804f,(float16_t)0.43447596056965598343f,
+(float16_t)-0.90267331823725871498f,(float16_t)0.43032648134008288920f,
+(float16_t)-0.90464409057824612947f,(float16_t)0.42616788872679983724f,
+(float16_t)-0.90659570451491533483f,(float16_t)0.42200027079979984812f,
+(float16_t)-0.90852811871630612117f,(float16_t)0.41782371582021243794f,
+(float16_t)-0.91044129225806713634f,(float16_t)0.41363831223843466889f,
+(float16_t)-0.91233518462332274801f,(float16_t)0.40944414869225770337f,
+(float16_t)-0.91420975570353069095f,(float16_t)0.40524131400498991651f,
+(float16_t)-0.91606496579933172075f,(float16_t)0.40102989718357562321f,
+(float16_t)-0.91790077562139049672f,(float16_t)0.39680998741671025254f,
+(float16_t)-0.91971714629122736095f,(float16_t)0.39258167407295141427f,
+(float16_t)-0.92151403934204179080f,(float16_t)0.38834504669882657923f,
+(float16_t)-0.92329141671952752457f,(float16_t)0.38410019501693531963f,
+(float16_t)-0.92504924078267747323f,(float16_t)0.37984720892405138271f,
+(float16_t)-0.92678747430458174872f,(float16_t)0.37558617848921738158f,
+(float16_t)-0.92850608047321547822f,(float16_t)0.37131719395183770960f,
+(float16_t)-0.93020502289221906889f,(float16_t)0.36704034571976729140f,
+(float16_t)-0.93188426558166803648f,(float16_t)0.36275572436739728088f,
+(float16_t)-0.93354377297883617270f,(float16_t)0.35846342063373659581f,
+(float16_t)-0.93518350993894761025f,(float16_t)0.35416352542049039931f,
+(float16_t)-0.93680344173592167145f,(float16_t)0.34985612979013486212f,
+(float16_t)-0.93840353406310816897f,(float16_t)0.34554132496398898278f,
+(float16_t)-0.93998375303401382475f,(float16_t)0.34121920232028268849f,
+(float16_t)-0.94154406518302069529f,(float16_t)0.33688985339222032867f,
+(float16_t)-0.94308443746609338376f,(float16_t)0.33255336986604444593f,
+(float16_t)-0.94460483726148014583f,(float16_t)0.32820984357909271933f,
+(float16_t)-0.94610523237040333733f,(float16_t)0.32385936651785302010f,
+(float16_t)-0.94758559101774109124f,(float16_t)0.31950203081601580291f,
+(float16_t)-0.94904588185270055689f,(float16_t)0.31513792875252250036f,
+(float16_t)-0.95048607394948170235f,(float16_t)0.31076715274961153046f,
+(float16_t)-0.95190613680793234597f,(float16_t)0.30638979537086091787f,
+(float16_t)-0.95330604035419386211f,(float16_t)0.30200594931922802866f,
+(float16_t)-0.95468575494133833814f,(float16_t)0.29761570743508614090f,
+(float16_t)-0.95604525134999629454f,(float16_t)0.29321916269425896129f,
+(float16_t)-0.95738450078897585627f,(float16_t)0.28881640820604975728f,
+(float16_t)-0.95870347489587148804f,(float16_t)0.28440753721127209896f,
+(float16_t)-0.96000214573766584625f,(float16_t)0.27999264308027344006f,
+(float16_t)-0.96128048581132063966f,(float16_t)0.27557181931095831029f,
+(float16_t)-0.96253846804435916340f,(float16_t)0.27114515952680812161f,
+(float16_t)-0.96377606579543984022f,(float16_t)0.26671275747489847641f,
+(float16_t)-0.96499325285492032478f,(float16_t)0.26227470702391370017f,
+(float16_t)-0.96619000344541250413f,(float16_t)0.25783110216215898713f,
+(float16_t)-0.96736629222232850545f,(float16_t)0.25338203699557010351f,
+(float16_t)-0.96852209427441737777f,(float16_t)0.24892760574572009302f,
+(float16_t)-0.96965738512429233698f,(float16_t)0.24446790274782448371f,
+(float16_t)-0.97077214072895023911f,(float16_t)0.24000302244874177626f,
+(float16_t)-0.97186633748027928537f,(float16_t)0.23553305940497573645f,
+(float16_t)-0.97293995220556006576f,(float16_t)0.23105810828067133156f,
+(float16_t)-0.97399296216795583359f,(float16_t)0.22657826384561016719f,
+(float16_t)-0.97502534506699412020f,(float16_t)0.22209362097320364815f,
+(float16_t)-0.97603707903903902388f,(float16_t)0.21760427463848372454f,
+(float16_t)-0.97702814265775439484f,(float16_t)0.21311031991609141745f,
+(float16_t)-0.97799851493455713936f,(float16_t)0.20861185197826351279f,
+(float16_t)-0.97894817531906219710f,(float16_t)0.20410896609281684033f,
+(float16_t)-0.97987710369951763756f,(float16_t)0.19960175762113091524f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.98167268619698311305f,(float16_t)0.19057475482025307278f,
+(float16_t)-0.98253930228744124076f,(float16_t)0.18605515166344691047f,
+(float16_t)-0.98338511032155118130f,(float16_t)0.18153160826112521575f,
+(float16_t)-0.98421009238692902521f,(float16_t)0.17700422041214894375f,
+(float16_t)-0.98501423101223983814f,(float16_t)0.17247308399679611712f,
+(float16_t)-0.98579750916756736512f,(float16_t)0.16793829497473128365f,
+(float16_t)-0.98655991026477540817f,(float16_t)0.16339994938297328075f,
+(float16_t)-0.98730141815785843473f,(float16_t)0.15885814333386147346f,
+(float16_t)-0.98802201714328352633f,(float16_t)0.15431297301302007718f,
+(float16_t)-0.98872169196032377858f,(float16_t)0.14976453467732145364f,
+(float16_t)-0.98940042779138037687f,(float16_t)0.14521292465284735274f,
+(float16_t)-0.99005821026229701154f,(float16_t)0.14065823933284954395f,
+(float16_t)-0.99069502544266463406f,(float16_t)0.13610057517570647856f,
+(float16_t)-0.99131085984611544415f,(float16_t)0.13154002870288333815f,
+(float16_t)-0.99190570043060932726f,(float16_t)0.12697669649688606008f,
+(float16_t)-0.99247953459870996706f,(float16_t)0.12241067519921634832f,
+(float16_t)-0.99303235019785141002f,(float16_t)0.11784206150832508830f,
+(float16_t)-0.99356413552059530403f,(float16_t)0.11327095217756441570f,
+(float16_t)-0.99407487930487936634f,(float16_t)0.10869744401313874427f,
+(float16_t)-0.99456457073425541537f,(float16_t)0.10412163387205457254f,
+(float16_t)-0.99503319943811863180f,(float16_t)0.09954361866006927739f,
+(float16_t)-0.99548075549192693856f,(float16_t)0.09496349532963890838f,
+(float16_t)-0.99590722941741172125f,(float16_t)0.09038136087786528827f,
+(float16_t)-0.99631261218277800129f,(float16_t)0.08579731234444015753f,
+(float16_t)-0.99669689520289606044f,(float16_t)0.08121144680959266338f,
+(float16_t)-0.99706007033948296225f,(float16_t)0.07662386139203168633f,
+(float16_t)-0.99740212990127530279f,(float16_t)0.07203465324688947125f,
+(float16_t)-0.99772306664419163624f,(float16_t)0.06744391956366417584f,
+(float16_t)-0.99802287377148624081f,(float16_t)0.06285175756416148951f,
+(float16_t)-0.99830154493389289261f,(float16_t)0.05825826450043579408f,
+(float16_t)-0.99855907422975931365f,(float16_t)0.05366353765273051968f,
+(float16_t)-0.99879545620517240501f,(float16_t)0.04906767432741796636f,
+(float16_t)-0.99901068585407337697f,(float16_t)0.04447077185493858442f,
+(float16_t)-0.99920475861836388631f,(float16_t)0.03987292758774012985f,
+(float16_t)-0.99937767038800284780f,(float16_t)0.03527423889821423159f,
+(float16_t)-0.99952941750109314256f,(float16_t)0.03067480317663686534f,
+(float16_t)-0.99965999674395922270f,(float16_t)0.02607471782910409860f,
+(float16_t)-0.99976940535121527898f,(float16_t)0.02147408027546966747f,
+(float16_t)-0.99985764100582386060f,(float16_t)0.01687298794728183532f,
+(float16_t)-0.99992470183914450299f,(float16_t)0.01227153828572000692f,
+(float16_t)-0.99997058643097413988f,(float16_t)0.00766982873953113778f,
+(float16_t)-0.99999529380957619118f,(float16_t)0.00306795676296597701f,
+(float16_t)-0.99999882345170187925f,(float16_t)-0.00153398018628480431f,
+(float16_t)-0.99998117528260110909f,(float16_t)-0.00613588464915455420f,
+(float16_t)-0.99994234967602391162f,(float16_t)-0.01073765916726416615f,
+(float16_t)-0.99988234745421256111f,(float16_t)-0.01533920628498781566f,
+(float16_t)-0.99980116988788425569f,(float16_t)-0.01994042855151419158f,
+(float16_t)-0.99969881869620424997f,(float16_t)-0.02454122852291207996f,
+(float16_t)-0.99957529604674921764f,(float16_t)-0.02914150876419355565f,
+(float16_t)-0.99943060455546173237f,(float16_t)-0.03374117185137745500f,
+(float16_t)-0.99926474728659442359f,(float16_t)-0.03834012037355261082f,
+(float16_t)-0.99907772775264536147f,(float16_t)-0.04293825693494077861f,
+(float16_t)-0.99886954991428356099f,(float16_t)-0.04753548415695929563f,
+(float16_t)-0.99864021818026527111f,(float16_t)-0.05213170468028335142f,
+(float16_t)-0.99838973740734016094f,(float16_t)-0.05672682116690781762f,
+(float16_t)-0.99811811290014917919f,(float16_t)-0.06132073630220824523f,
+(float16_t)-0.99782535041111164453f,(float16_t)-0.06591335279700352712f,
+(float16_t)-0.99751145614030345410f,(float16_t)-0.07050457338961360620f,
+(float16_t)-0.99717643673532618820f,(float16_t)-0.07509430084792109716f,
+(float16_t)-0.99682029929116577893f,(float16_t)-0.07968243797142994522f,
+(float16_t)-0.99644305135004263008f,(float16_t)-0.08426888759332393231f,
+(float16_t)-0.99604470090125196702f,(float16_t)-0.08885355258252450317f,
+(float16_t)-0.99562525638099430569f,(float16_t)-0.09343633584574773110f,
+(float16_t)-0.99518472667219692873f,(float16_t)-0.09801714032956058975f,
+(float16_t)-0.99472312110432570265f,(float16_t)-0.10259586902243630901f,
+(float16_t)-0.99424044945318790223f,(float16_t)-0.10717242495680891212f,
+(float16_t)-0.99373672194072470987f,(float16_t)-0.11174671121112625394f,
+(float16_t)-0.99321194923479461103f,(float16_t)-0.11631863091190447479f,
+(float16_t)-0.99266614244894801899f,(float16_t)-0.12088808723577681992f,
+(float16_t)-0.99209931314219179654f,(float16_t)-0.12545498341154601163f,
+(float16_t)-0.99151147331874400770f,(float16_t)-0.13001922272223317978f,
+(float16_t)-0.99090263542778000971f,(float16_t)-0.13458070850712605671f,
+(float16_t)-0.99027281236316910817f,(float16_t)-0.13913934416382611747f,
+(float16_t)-0.98962201746320088702f,(float16_t)-0.14369503315029438784f,
+(float16_t)-0.98895026451030298986f,(float16_t)-0.14824767898689603096f,
+(float16_t)-0.98825756773074946437f,(float16_t)-0.15279718525844343535f,
+(float16_t)-0.98754394179435922574f,(float16_t)-0.15734345561623830356f,
+(float16_t)-0.98680940181418552726f,(float16_t)-0.16188639378011149272f,
+(float16_t)-0.98605396334619543897f,(float16_t)-0.16642590354046382650f,
+(float16_t)-0.98527764238894133264f,(float16_t)-0.17096188876030096737f,
+(float16_t)-0.98448045538322093151f,(float16_t)-0.17549425337727120322f,
+(float16_t)-0.98366241921173025453f,(float16_t)-0.18002290140569934818f,
+(float16_t)-0.98282355119870534743f,(float16_t)-0.18454773693861947770f,
+(float16_t)-0.98196386910955524296f,(float16_t)-0.18906866414980610935f,
+(float16_t)-0.98108339115048670553f,(float16_t)-0.19358558729580355173f,
+(float16_t)-0.98018213596811742949f,(float16_t)-0.19809841071795356027f,
+(float16_t)-0.97926012264908202098f,(float16_t)-0.20260703884442113343f,
+(float16_t)-0.97831737071962765473f,(float16_t)-0.20711137619221858808f,
+(float16_t)-0.97735390014519996082f,(float16_t)-0.21161132736922766417f,
+(float16_t)-0.97636973133002125103f,(float16_t)-0.21610679707621921475f,
+(float16_t)-0.97536488511665697665f,(float16_t)-0.22059769010887325669f,
+(float16_t)-0.97433938278557585821f,(float16_t)-0.22508391135979261000f,
+(float16_t)-0.97329324605469824672f,(float16_t)-0.22956536582051870199f,
+(float16_t)-0.97222649707893638027f,(float16_t)-0.23404195858354326365f,
+(float16_t)-0.97113915844972520386f,(float16_t)-0.23851359484431830515f,
+(float16_t)-0.97003125319454397424f,(float16_t)-0.24298017990326381543f,
+(float16_t)-0.96890280477642887202f,(float16_t)-0.24744161916777326904f,
+(float16_t)-0.96775383709347551076f,(float16_t)-0.25189781815421696809f,
+(float16_t)-0.96658437447833311928f,(float16_t)-0.25634868248994291395f,
+(float16_t)-0.96539444169768939830f,(float16_t)-0.26079411791527562503f,
+(float16_t)-0.96418406395174582890f,(float16_t)-0.26523403028551151284f,
+(float16_t)-0.96295326687368398844f,(float16_t)-0.26966832557291481320f,
+(float16_t)-0.96170207652912265139f,(float16_t)-0.27409690986870616225f,
+(float16_t)-0.96043051941556589757f,(float16_t)-0.27851968938505289319f,
+(float16_t)-0.95913862246184200533f,(float16_t)-0.28293657045705516984f,
+(float16_t)-0.95782641302753290802f,(float16_t)-0.28734745954472939999f,
+(float16_t)-0.95649391890239510161f,(float16_t)-0.29175226323498920644f,
+(float16_t)-0.95514116830577078243f,(float16_t)-0.29615088824362378883f,
+(float16_t)-0.95376818988599032512f,(float16_t)-0.30054324141727345454f,
+(float16_t)-0.95237501271976587880f,(float16_t)-0.30492922973540242948f,
+(float16_t)-0.95096166631157508231f,(float16_t)-0.30930876031226878231f,
+(float16_t)-0.94952818059303678577f,(float16_t)-0.31368174039889118454f,
+(float16_t)-0.94807458592227633609f,(float16_t)-0.31804807738501467140f,
+(float16_t)-0.94660091308328364601f,(float16_t)-0.32240767880106963039f,
+(float16_t)-0.94510719328526060501f,(float16_t)-0.32676045232013156694f,
+(float16_t)-0.94359345816196038559f,(float16_t)-0.33110630575987626267f,
+(float16_t)-0.94205973977101742367f,(float16_t)-0.33544514708453149199f,
+(float16_t)-0.94050607059326840620f,(float16_t)-0.33977688440682679571f,
+(float16_t)-0.93893248353206459900f,(float16_t)-0.34410142598993881391f,
+(float16_t)-0.93733901191257495977f,(float16_t)-0.34841868024943456472f,
+(float16_t)-0.93572568948108036935f,(float16_t)-0.35272855575521072646f,
+(float16_t)-0.93409255040425887007f,(float16_t)-0.35703096123343008861f,
+(float16_t)-0.93243962926846246653f,(float16_t)-0.36132580556845395048f,
+(float16_t)-0.93076696107898382326f,(float16_t)-0.36561299780477357624f,
+(float16_t)-0.92907458125931585702f,(float16_t)-0.36989244714893387833f,
+(float16_t)-0.92736252565040111495f,(float16_t)-0.37416406297145782256f,
+(float16_t)-0.92563083050987282618f,(float16_t)-0.37842775480876539307f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,
+(float16_t)-0.92210866874334518339f,(float16_t)-0.38693100551438852630f,
+(float16_t)-0.92031827670911059425f,(float16_t)-0.39117038430225381518f,
+(float16_t)-0.91850839432521225181f,(float16_t)-0.39540147894781629834f,
+(float16_t)-0.91667905992104270485f,(float16_t)-0.39962419984564684361f,
+(float16_t)-0.91483031223794608611f,(float16_t)-0.40383845756765418544f,
+(float16_t)-0.91296219042839832358f,(float16_t)-0.40804416286497835475f,
+(float16_t)-0.91107473405517647169f,(float16_t)-0.41224122666988260999f,
+(float16_t)-0.90916798309052249127f,(float16_t)-0.41642956009763693048f,
+(float16_t)-0.90724197791529592738f,(float16_t)-0.42060907444840234248f,
+(float16_t)-0.90529675931811881551f,(float16_t)-0.42477968120910863936f,
+(float16_t)-0.90333236849451192807f,(float16_t)-0.42894129205532938176f,
+(float16_t)-0.90134884704602202810f,(float16_t)-0.43309381885315184624f,
+(float16_t)-0.89934623697934157338f,(float16_t)-0.43723717366104403181f,
+(float16_t)-0.89732458070541831763f,(float16_t)-0.44137126873171667052f,
+(float16_t)-0.89528392103855747308f,(float16_t)-0.44549601651398174074f,
+(float16_t)-0.89322430119551532446f,(float16_t)-0.44961132965460665067f,
+(float16_t)-0.89114576479458340597f,(float16_t)-0.45371712100016353686f,
+(float16_t)-0.88904835585466468473f,(float16_t)-0.45781330359887695280f,
+(float16_t)-0.88693211879434230571f,(float16_t)-0.46189979070246250936f,
+(float16_t)-0.88479709843093790056f,(float16_t)-0.46597649576796595916f,
+(float16_t)-0.88264333997956290201f,(float16_t)-0.47004333245959545318f,
+(float16_t)-0.88047088905216086552f,(float16_t)-0.47410021465054985601f,
+(float16_t)-0.87827979165654157523f,(float16_t)-0.47814705642484295334f,
+(float16_t)-0.87607009419540660122f,(float16_t)-0.48218377207912266336f,
+(float16_t)-0.87384184346536686316f,(float16_t)-0.48621027612448636246f,
+(float16_t)-0.87159508665595109012f,(float16_t)-0.49022648328829115938f,
+(float16_t)-0.86932987134860673084f,(float16_t)-0.49423230851595978397f,
+(float16_t)-0.86704624551569287050f,(float16_t)-0.49822766697278153547f,
+(float16_t)-0.86474425751946248919f,(float16_t)-0.50221247404571056627f,
+(float16_t)-0.86242395611104072373f,(float16_t)-0.50618664534515500630f,
+(float16_t)-0.86008539042939025077f,(float16_t)-0.51015009670676658704f,
+(float16_t)-0.85772861000027211809f,(float16_t)-0.51410274419322155026f,
+(float16_t)-0.85535366473519613972f,(float16_t)-0.51804450409599922533f,
+(float16_t)-0.85296060493036374162f,(float16_t)-0.52197529293715427823f,
+(float16_t)-0.85054948126560347976f,(float16_t)-0.52589502747108463065f,
+(float16_t)-0.84812034480329723252f,(float16_t)-0.52980362468629460526f,
+(float16_t)-0.84567324698729906540f,(float16_t)-0.53370100180715296379f,
+(float16_t)-0.84320823964184543620f,(float16_t)-0.53758707629564550512f,
+(float16_t)-0.84072537497045818355f,(float16_t)-0.54146176585312322249f,
+(float16_t)-0.83822470555483818977f,(float16_t)-0.54532498842204613076f,
+(float16_t)-0.83570628435375271525f,(float16_t)-0.54917666218771943321f,
+(float16_t)-0.83317016470191329613f,(float16_t)-0.55301670558002735678f,
+(float16_t)-0.83061640030884642538f,(float16_t)-0.55684503727515988203f,
+(float16_t)-0.82804504525775590729f,(float16_t)-0.56066157619733592021f,
+(float16_t)-0.82545615400437755138f,(float16_t)-0.56446624152051938506f,
+(float16_t)-0.82284978137582642788f,(float16_t)-0.56825895267013148970f,
+(float16_t)-0.82022598256943468620f,(float16_t)-0.57203962932475704850f,
+(float16_t)-0.81758481315158371139f,(float16_t)-0.57580819141784533866f,
+(float16_t)-0.81492632905652662156f,(float16_t)-0.57956455913940574387f,
+(float16_t)-0.81225058658520388200f,(float16_t)-0.58330865293769829094f,
+(float16_t)-0.80955764240405148069f,(float16_t)-0.58704039352091774706f,
+(float16_t)-0.80684755354379944503f,(float16_t)-0.59075970185887394237f,
+(float16_t)-0.80412037739826591753f,(float16_t)-0.59446649918466420992f,
+(float16_t)-0.80137617172314035141f,(float16_t)-0.59816070699634216190f,
+(float16_t)-0.79861499463476093297f,(float16_t)-0.60184224705857991555f,
+(float16_t)-0.79583690460888356633f,(float16_t)-0.60551104140432543410f,
+(float16_t)-0.79304196047944375270f,(float16_t)-0.60916701233645309532f,
+(float16_t)-0.79023022143731003197f,(float16_t)-0.61281008242940970820f,
+(float16_t)-0.78740174702903142911f,(float16_t)-0.61644017453085364622f,
+(float16_t)-0.78455659715557524159f,(float16_t)-0.62005721176328920663f,
+(float16_t)-0.78169483207105938671f,(float16_t)-0.62366111752569464155f,
+(float16_t)-0.77881651238147620031f,(float16_t)-0.62725181549514386070f,
+(float16_t)-0.77592169904340779762f,(float16_t)-0.63082922962842424841f,
+(float16_t)-0.77301045336273710440f,(float16_t)-0.63439328416364526575f,
+(float16_t)-0.77008283699334811878f,(float16_t)-0.63794390362184394405f,
+(float16_t)-0.76713891193582051109f,(float16_t)-0.64148101280858305095f,
+(float16_t)-0.76417874053611678509f,(float16_t)-0.64500453681554381635f,
+(float16_t)-0.76120238548426188974f,(float16_t)-0.64851440102211233008f,
+(float16_t)-0.75820990981301539247f,(float16_t)-0.65201053109695950027f,
+(float16_t)-0.75520137689653654700f,(float16_t)-0.65549285299961534967f,
+(float16_t)-0.75217685044904269986f,(float16_t)-0.65896129298203731661f,
+(float16_t)-0.74913639452345925918f,(float16_t)-0.66241577759017178373f,
+(float16_t)-0.74608007351006400132f,(float16_t)-0.66585623366550938940f,
+(float16_t)-0.74300795213512194071f,(float16_t)-0.66928258834663578725f,
+(float16_t)-0.73992009545951631377f,(float16_t)-0.67269476907077274674f,
+(float16_t)-0.73681656887737001504f,(float16_t)-0.67609270357531581208f,
+(float16_t)-0.73369743811466037187f,(float16_t)-0.67947631989936485564f,
+(float16_t)-0.73056276922782770189f,(float16_t)-0.68284554638524797010f,
+(float16_t)-0.72741262860237587695f,(float16_t)-0.68620031168003847721f,
+(float16_t)-0.72424708295146700276f,(float16_t)-0.68954054473706682948f,
+(float16_t)-0.72106619931450810501f,(float16_t)-0.69286617481742462932f,
+(float16_t)-0.71787004505573170920f,(float16_t)-0.69617713149146298601f,
+(float16_t)-0.71465868786276898206f,(float16_t)-0.69947334464028387835f,
+(float16_t)-0.71143219574521665560f,(float16_t)-0.70275474445722507788f,
+(float16_t)-0.70819063703319551362f,(float16_t)-0.70602126144933952112f,
+(float16_t)-0.70493408037590510329f,(float16_t)-0.70927282643886546687f,
+(float16_t)-0.70166259474016867692f,(float16_t)-0.71250937056469221265f,
+(float16_t)-0.69837624940897302661f,(float16_t)-0.71573082528381848366f,
+(float16_t)-0.69507511398000099145f,(float16_t)-0.71893712237280438249f,
+(float16_t)-0.69175925836415785852f,(float16_t)-0.72212819392921523409f,
+(float16_t)-0.68842875278409054740f,(float16_t)-0.72530397237306065694f,
+(float16_t)-0.68508366777270035541f,(float16_t)-0.72846439044822519637f,
+(float16_t)-0.68172407417164981869f,(float16_t)-0.73160938122389251870f,
+(float16_t)-0.67835004312986146857f,(float16_t)-0.73473887809596349907f,
+(float16_t)-0.67496164610201225820f,(float16_t)-0.73785281478846576064f,
+(float16_t)-0.67155895484701866316f,(float16_t)-0.74095112535495888384f,
+(float16_t)-0.66814204142651867357f,(float16_t)-0.74403374417992906853f,
+(float16_t)-0.66471097820334501538f,(float16_t)-0.74710060598017991040f,
+(float16_t)-0.66126583783999237642f,(float16_t)-0.75015164580621496171f,
+(float16_t)-0.65780669329707874837f,(float16_t)-0.75318679904361240940f,
+(float16_t)-0.65433361783180066240f,(float16_t)-0.75620600141439442421f,
+(float16_t)-0.65084668499638098638f,(float16_t)-0.75920918897838796102f,
+(float16_t)-0.64734596863651250320f,(float16_t)-0.76219629813457856482f,
+(float16_t)-0.64383154288979149715f,(float16_t)-0.76516726562245895860f,
+(float16_t)-0.64030348218415200634f,(float16_t)-0.76812202852336519676f,
+(float16_t)-0.63676186123628419899f,(float16_t)-0.77106052426181381776f,
+(float16_t)-0.63320675505005752370f,(float16_t)-0.77398269060682256537f,
+(float16_t)-0.62963823891492687324f,(float16_t)-0.77688846567323255332f,
+(float16_t)-0.62605638840434374437f,(float16_t)-0.77977778792301433164f,
+(float16_t)-0.62246127937414974518f,(float16_t)-0.78265059616657584041f,
+(float16_t)-0.61885298796097643059f,(float16_t)-0.78550682956405382118f,
+(float16_t)-0.61523159058062726334f,(float16_t)-0.78834642762660589455f,
+(float16_t)-0.61159716392646201744f,(float16_t)-0.79116933021769009216f,
+(float16_t)-0.60794978496777407617f,(float16_t)-0.79397547755433683925f,
+(float16_t)-0.60428953094815607283f,(float16_t)-0.79676481020841871672f,
+(float16_t)-0.60061647938386930612f,(float16_t)-0.79953726910790479110f,
+(float16_t)-0.59693070806219639124f,(float16_t)-0.80229279553811572168f,
+(float16_t)-0.59323229503980012822f,(float16_t)-0.80503133114296343553f,
+(float16_t)-0.58952131864106382952f,(float16_t)-0.80775281792619046950f,
+(float16_t)-0.58579785745643908612f,(float16_t)-0.81045719825259465718f,
+(float16_t)-0.58206199034077532595f,(float16_t)-0.81314441484925370496f,
+(float16_t)-0.57831379641165570060f,(float16_t)-0.81581441080673366972f,
+(float16_t)-0.57455335504771631872f,(float16_t)-0.81846712958029832485f,
+(float16_t)-0.57078074588696736669f,(float16_t)-0.82110251499110464835f,
+(float16_t)-0.56699604882510901138f,(float16_t)-0.82372051122739109452f,
+(float16_t)-0.56319934401383409117f,(float16_t)-0.82632106284566342325f,
+(float16_t)-0.55939071185913646911f,(float16_t)-0.82890411477186465294f,
+(float16_t)-0.55557023301960217765f,(float16_t)-0.83146961230254523567f,
+(float16_t)-0.55173798840470766880f,(float16_t)-0.83401750110601791111f,
+(float16_t)-0.54789405917310007865f,(float16_t)-0.83654772722351211645f,
+(float16_t)-0.54403852673088415326f,(float16_t)-0.83906023707031252012f,
+(float16_t)-0.54017147272989274320f,(float16_t)-0.84155497743689855472f,
+(float16_t)-0.53629297906596329337f,(float16_t)-0.84403189549006629733f,
+(float16_t)-0.53240312787719845655f,(float16_t)-0.84649093877405179320f,
+(float16_t)-0.52850200154222859439f,(float16_t)-0.84893205521163961347f,
+(float16_t)-0.52458968267846928235f,(float16_t)-0.85135519310526486247f,
+(float16_t)-0.52066625414036715735f,(float16_t)-0.85376030113811141042f,
+(float16_t)-0.51673179901765020627f,(float16_t)-0.85614732837519424979f,
+(float16_t)-0.51278640063356295542f,(float16_t)-0.85851622426444285097f,
+(float16_t)-0.50883014254310732216f,(float16_t)-0.86086693863776708735f,
+(float16_t)-0.50486310853126736831f,(float16_t)-0.86319942171212427073f,
+(float16_t)-0.50088538261124104789f,(float16_t)-0.86551362409056897818f,
+(float16_t)-0.49689704902265435793f,(float16_t)-0.86780949676330332299f,
+(float16_t)-0.49289819222978420443f,(float16_t)-0.87008699110871134952f,
+(float16_t)-0.48888889691976367136f,(float16_t)-0.87234605889439120752f,
+(float16_t)-0.48486924800079117537f,(float16_t)-0.87458665227817611321f,
+(float16_t)-0.48083933060033440254f,(float16_t)-0.87680872380914542941f,
+(float16_t)-0.47679923006332214364f,(float16_t)-0.87901222642863341417f,
+(float16_t)-0.47274903195034317926f,(float16_t)-0.88119711347122187117f,
+(float16_t)-0.46868882203582790114f,(float16_t)-0.88336333866573157891f,
+(float16_t)-0.46461868630623814891f,(float16_t)-0.88551085613619973103f,
+(float16_t)-0.46053871095823989412f,(float16_t)-0.88763962040285404598f,
+(float16_t)-0.45644898239688419528f,(float16_t)-0.88974958638307266590f,
+(float16_t)-0.45234958723377066692f,(float16_t)-0.89184070939234283415f,
+(float16_t)-0.44824061228522010802f,(float16_t)-0.89391294514520314163f,
+(float16_t)-0.44412214457042975546f,(float16_t)-0.89596624975618488484f,
+(float16_t)-0.43999427130963336685f,(float16_t)-0.89800057974073976830f,
+(float16_t)-0.43585707992225597440f,(float16_t)-0.90001589201615994629f,
+(float16_t)-0.43171065802505731446f,(float16_t)-0.90201214390249317976f,
+(float16_t)-0.42755509343028247349f,(float16_t)-0.90398929312344311615f,
+(float16_t)-0.42339047414379599177f,(float16_t)-0.90594729780726845902f,
+(float16_t)-0.41921688836322429372f,(float16_t)-0.90788611648766603945f,
+(float16_t)-0.41503442447608152044f,(float16_t)-0.90980570810465233311f,
+(float16_t)-0.41084317105790418845f,(float16_t)-0.91170603200542976730f,
+(float16_t)-0.40664321687036886210f,(float16_t)-0.91358704794525091852f,
+(float16_t)-0.40243465085941865222f,(float16_t)-0.91544871608826772214f,
+(float16_t)-0.39821756215337417162f,(float16_t)-0.91729099700837768427f,
+(float16_t)-0.39399204006104820985f,(float16_t)-0.91911385169005765938f,
+(float16_t)-0.38975817406985696634f,(float16_t)-0.92091724152918930102f,
+(float16_t)-0.38551605384391890441f,(float16_t)-0.92270112833387851747f,
+(float16_t)-0.38126576922216276477f,(float16_t)-0.92446547432526249288f,
+(float16_t)-0.37700741021641820394f,(float16_t)-0.92621024213831137928f,
+(float16_t)-0.37274106700951614712f,(float16_t)-0.92793539482261766516f,
+(float16_t)-0.36846682995337221023f,(float16_t)-0.92964089584318132520f,
+(float16_t)-0.36418478956708016936f,(float16_t)-0.93132670908118031505f,
+(float16_t)-0.35989503653498794433f,(float16_t)-0.93299279883473895669f,
+(float16_t)-0.35559766170478407377f,(float16_t)-0.93463912981968066962f,
+(float16_t)-0.35129275608556687072f,(float16_t)-0.93626566717027837061f,
+(float16_t)-0.34698041084592379235f,(float16_t)-0.93787237643998977443f,
+(float16_t)-0.34266071731199487793f,(float16_t)-0.93945922360218969693f,
+(float16_t)-0.33833376696554123830f,(float16_t)-0.94102617505088925753f,
+(float16_t)-0.33399965144200982614f,(float16_t)-0.94257319760144675502f,
+(float16_t)-0.32965846252858749255f,(float16_t)-0.94410025849127265918f,
+(float16_t)-0.32531029216226331480f,(float16_t)-0.94560732538052116869f,
+(float16_t)-0.32095523242787515894f,(float16_t)-0.94709436635277721717f,
+(float16_t)-0.31659337555616617887f,(float16_t)-0.94856134991573015647f,
+(float16_t)-0.31222481392182477311f,(float16_t)-0.95000824500184311017f,
+(float16_t)-0.30784964004153508865f,(float16_t)-0.95143502096900833820f,
+(float16_t)-0.30346794657201103806f,(float16_t)-0.95284164760119871573f,
+(float16_t)-0.29907982630804058610f,(float16_t)-0.95422809510910555630f,
+(float16_t)-0.29468537218051488180f,(float16_t)-0.95559433413077088382f,
+(float16_t)-0.29028467725446244208f,(float16_t)-0.95694033573220882438f,
+(float16_t)-0.28587783472708105936f,(float16_t)-0.95826607140801756124f,
+(float16_t)-0.28146493792575794091f,(float16_t)-0.95957151308198451733f,
+(float16_t)-0.27704608030610028413f,(float16_t)-0.96085663310767954748f,
+(float16_t)-0.27262135544994886560f,(float16_t)-0.96212140426904158019f,
+(float16_t)-0.26819085706340350939f,(float16_t)-0.96336579978095393528f,
+(float16_t)-0.26375467897483123592f,(float16_t)-0.96458979328981275803f,
+(float16_t)-0.25931291513288645678f,(float16_t)-0.96579335887408357397f,
+(float16_t)-0.25486565960451434965f,(float16_t)-0.96697647104485218161f,
+(float16_t)-0.25041300657296539089f,(float16_t)-0.96813910474636233339f,
+(float16_t)-0.24595505033579515008f,(float16_t)-0.96928123535654830967f,
+(float16_t)-0.24149188530286941345f,(float16_t)-0.97040283868755550234f,
+(float16_t)-0.23702360599436766986f,(float16_t)-0.97150389098625167250f,
+(float16_t)-0.23255030703877521692f,(float16_t)-0.97258436893473221296f,
+(float16_t)-0.22807208317088611960f,(float16_t)-0.97364424965081186603f,
+(float16_t)-0.22358902922978990402f,(float16_t)-0.97468351068851066810f,
+(float16_t)-0.21910124015687010290f,(float16_t)-0.97570213003852845901f,
+(float16_t)-0.21460881099378659176f,(float16_t)-0.97670008612871184184f,
+(float16_t)-0.21011183688046985996f,(float16_t)-0.97767735782450992943f,
+(float16_t)-0.20561041305309901706f,(float16_t)-0.97863392442942320759f,
+(float16_t)-0.20110463484209206708f,(float16_t)-0.97956976568544051887f,
+(float16_t)-0.19659459767008077846f,(float16_t)-0.98048486177346927395f,
+(float16_t)-0.19208039704989252061f,(float16_t)-0.98137919331375456089f,
+(float16_t)-0.18756212858253007436f,(float16_t)-0.98225274136628937249f,
+(float16_t)-0.18303988795514095078f,(float16_t)-0.98310548743121628501f,
+(float16_t)-0.17851377093899792325f,(float16_t)-0.98393741344921881176f,
+(float16_t)-0.17398387338746373887f,(float16_t)-0.98474850180190420801f,
+(float16_t)-0.16945029123396829207f,(float16_t)-0.98553873531217606185f,
+(float16_t)-0.16491312048996975559f,(float16_t)-0.98630809724459866938f,
+(float16_t)-0.16037245724292850668f,(float16_t)-0.98705657130575097380f,
+(float16_t)-0.15582839765426498291f,(float16_t)-0.98778414164457217783f,
+(float16_t)-0.15128103795733036097f,(float16_t)-0.98849079285269658701f,
+(float16_t)-0.14673047445536230304f,(float16_t)-0.98917650996478090342f,
+(float16_t)-0.14217680351944814165f,(float16_t)-0.98984127845882052821f,
+(float16_t)-0.13762012158648653792f,(float16_t)-0.99048508425645698239f,
+(float16_t)-0.13306052515713906459f,(float16_t)-0.99110791372327688986f,
+(float16_t)-0.12849811079379358514f,(float16_t)-0.99170975366909952520f,
+(float16_t)-0.12393297511851208981f,(float16_t)-0.99229059134825736699f,
+(float16_t)-0.11936521481099168773f,(float16_t)-0.99285041445986510489f,
+(float16_t)-0.11479492660650993108f,(float16_t)-0.99338921114808065305f,
+(float16_t)-0.11022220729388330918f,(float16_t)-0.99390697000235606051f,
+(float16_t)-0.10564715371341037997f,(float16_t)-0.99440368005767909576f,
+(float16_t)-0.10106986275482798820f,(float16_t)-0.99487933079480561638f,
+(float16_t)-0.09649043135525316173f,(float16_t)-0.99533391214048216877f,
+(float16_t)-0.09190895649713282101f,(float16_t)-0.99576741446765981713f,
+(float16_t)-0.08732553520619255882f,(float16_t)-0.99617982859569687015f,
+(float16_t)-0.08274026454937570552f,(float16_t)-0.99657114579055483539f,
+(float16_t)-0.07815324163279464831f,(float16_t)-0.99694135776498205015f,
+(float16_t)-0.07356456359966735692f,(float16_t)-0.99729045667869020697f,
+(float16_t)-0.06897432762826707919f,(float16_t)-0.99761843513851955478f,
+(float16_t)-0.06438263092985731240f,(float16_t)-0.99792528619859599548f,
+(float16_t)-0.05978957074664013188f,(float16_t)-0.99821100336047818846f,
+(float16_t)-0.05519524434968971216f,(float16_t)-0.99847558057329477421f,
+(float16_t)-0.05059974903689945513f,(float16_t)-0.99871901223387293811f,
+(float16_t)-0.04600318213091520586f,(float16_t)-0.99894129318685687124f,
+(float16_t)-0.04140564097707683661f,(float16_t)-0.99914241872481690532f,
+(float16_t)-0.03680722294135933131f,(float16_t)-0.99932238458834943273f,
+(float16_t)-0.03220802540830459970f,(float16_t)-0.99948118696616694567f,
+(float16_t)-0.02760814577896616301f,(float16_t)-0.99961882249517863830f,
+(float16_t)-0.02300768146883930970f,(float16_t)-0.99973528826056168306f,
+(float16_t)-0.01840672990580516366f,(float16_t)-0.99983058179582340319f,
+(float16_t)-0.01380538852806025008f,(float16_t)-0.99990470108285289808f,
+(float16_t)-0.00920375478206008311f,(float16_t)-0.99995764455196389786f,
+(float16_t)-0.00460192612044835019f,(float16_t)-0.99998941108192840321f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99983058179582340319f,(float16_t)0.01840672990580482019f,
+(float16_t)0.99932238458834954375f,(float16_t)0.03680722294135883171f,
+(float16_t)0.99847558057329477421f,(float16_t)0.05519524434968993420f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.99576741446765981713f,(float16_t)0.09190895649713272386f,
+(float16_t)0.99390697000235606051f,(float16_t)0.11022220729388305938f,
+(float16_t)0.99170975366909952520f,(float16_t)0.12849811079379316880f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.98630809724459866938f,(float16_t)0.16491312048996989437f,
+(float16_t)0.98310548743121628501f,(float16_t)0.18303988795514095078f,
+(float16_t)0.97956976568544051887f,(float16_t)0.20110463484209190055f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.97150389098625178352f,(float16_t)0.23702360599436719801f,
+(float16_t)0.96697647104485207059f,(float16_t)0.25486565960451457169f,
+(float16_t)0.96212140426904158019f,(float16_t)0.27262135544994897662f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.95143502096900833820f,(float16_t)0.30784964004153486661f,
+(float16_t)0.94560732538052127971f,(float16_t)0.32531029216226292622f,
+(float16_t)0.93945922360218991898f,(float16_t)0.34266071731199437833f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.92621024213831137928f,(float16_t)0.37700741021641825945f,
+(float16_t)0.91911385169005777040f,(float16_t)0.39399204006104809883f,
+(float16_t)0.91170603200542987832f,(float16_t)0.41084317105790391089f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.89596624975618521791f,(float16_t)0.44412214457042920035f,
+(float16_t)0.88763962040285393496f,(float16_t)0.46053871095824000514f,
+(float16_t)0.87901222642863352519f,(float16_t)0.47679923006332208812f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.86086693863776730939f,(float16_t)0.50883014254310698909f,
+(float16_t)0.85135519310526519554f,(float16_t)0.52458968267846894928f,
+(float16_t)0.84155497743689844370f,(float16_t)0.54017147272989285423f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.82110251499110464835f,(float16_t)0.57078074588696725566f,
+(float16_t)0.81045719825259476821f,(float16_t)0.58579785745643886408f,
+(float16_t)0.79953726910790501314f,(float16_t)0.60061647938386897305f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.77688846567323244230f,(float16_t)0.62963823891492698426f,
+(float16_t)0.76516726562245895860f,(float16_t)0.64383154288979138613f,
+(float16_t)0.75318679904361252042f,(float16_t)0.65780669329707863735f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.72846439044822519637f,(float16_t)0.68508366777270035541f,
+(float16_t)0.71573082528381870571f,(float16_t)0.69837624940897280457f,
+(float16_t)0.70275474445722529993f,(float16_t)0.71143219574521643356f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.67609270357531603413f,(float16_t)0.73681656887736979300f,
+(float16_t)0.66241577759017178373f,(float16_t)0.74913639452345925918f,
+(float16_t)0.64851440102211255212f,(float16_t)0.76120238548426177871f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.62005721176328920663f,(float16_t)0.78455659715557524159f,
+(float16_t)0.60551104140432554512f,(float16_t)0.79583690460888345530f,
+(float16_t)0.59075970185887427544f,(float16_t)0.80684755354379922299f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.56066157619733603124f,(float16_t)0.82804504525775579626f,
+(float16_t)0.54532498842204646383f,(float16_t)0.83822470555483796772f,
+(float16_t)0.52980362468629482731f,(float16_t)0.84812034480329712149f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.49822766697278186854f,(float16_t)0.86704624551569264845f,
+(float16_t)0.48218377207912282989f,(float16_t)0.87607009419540660122f,
+(float16_t)0.46597649576796612569f,(float16_t)0.88479709843093778954f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.43309381885315201277f,(float16_t)0.90134884704602202810f,
+(float16_t)0.41642956009763731906f,(float16_t)0.90916798309052226923f,
+(float16_t)0.39962419984564678810f,(float16_t)0.91667905992104270485f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.36561299780477396482f,(float16_t)0.93076696107898371224f,
+(float16_t)0.34841868024943450921f,(float16_t)0.93733901191257495977f,
+(float16_t)0.33110630575987642921f,(float16_t)0.94359345816196038559f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.29615088824362395536f,(float16_t)0.95514116830577067141f,
+(float16_t)0.27851968938505305973f,(float16_t)0.96043051941556578655f,
+(float16_t)0.26079411791527556952f,(float16_t)0.96539444169768939830f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.22508391135979277653f,(float16_t)0.97433938278557585821f,
+(float16_t)0.20711137619221856032f,(float16_t)0.97831737071962765473f,
+(float16_t)0.18906866414980627589f,(float16_t)0.98196386910955524296f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.15279718525844340760f,(float16_t)0.98825756773074946437f,
+(float16_t)0.13458070850712622324f,(float16_t)0.99090263542778000971f,
+(float16_t)0.11631863091190487725f,(float16_t)0.99321194923479450001f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.07968243797143012563f,(float16_t)0.99682029929116566791f,
+(float16_t)0.06132073630220864768f,(float16_t)0.99811811290014917919f,
+(float16_t)0.04293825693494095902f,(float16_t)0.99907772775264536147f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)0.00613588464915451517f,(float16_t)0.99998117528260110909f,
+(float16_t)-0.01227153828571982304f,(float16_t)0.99992470183914450299f,
+(float16_t)-0.03067480317663645942f,(float16_t)0.99952941750109314256f,
+(float16_t)-0.04906767432741800800f,(float16_t)0.99879545620517240501f,
+(float16_t)-0.06744391956366398155f,(float16_t)0.99772306664419163624f,
+(float16_t)-0.08579731234443975507f,(float16_t)0.99631261218277800129f,
+(float16_t)-0.10412163387205460030f,(float16_t)0.99456457073425541537f,
+(float16_t)-0.12241067519921615403f,(float16_t)0.99247953459870996706f,
+(float16_t)-0.14065823933284912761f,(float16_t)0.99005821026229712256f,
+(float16_t)-0.15885814333386127917f,(float16_t)0.98730141815785843473f,
+(float16_t)-0.17700422041214874946f,(float16_t)0.98421009238692902521f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.21311031991609125091f,(float16_t)0.97702814265775439484f,
+(float16_t)-0.23105810828067113727f,(float16_t)0.97293995220556017678f,
+(float16_t)-0.24892760574572012078f,(float16_t)0.96852209427441737777f,
+(float16_t)-0.26671275747489830987f,(float16_t)0.96377606579543984022f,
+(float16_t)-0.28440753721127171039f,(float16_t)0.95870347489587159906f,
+(float16_t)-0.30200594931922808417f,(float16_t)0.95330604035419386211f,
+(float16_t)-0.31950203081601563637f,(float16_t)0.94758559101774120226f,
+(float16_t)-0.33688985339221994009f,(float16_t)0.94154406518302080631f,
+(float16_t)-0.35416352542049039931f,(float16_t)0.93518350993894761025f,
+(float16_t)-0.37131719395183748755f,(float16_t)0.92850608047321558924f,
+(float16_t)-0.38834504669882619066f,(float16_t)0.92151403934204201285f,
+(float16_t)-0.40524131400498974998f,(float16_t)0.91420975570353069095f,
+(float16_t)-0.42200027079979968159f,(float16_t)0.90659570451491533483f,
+(float16_t)-0.43861623853852738097f,(float16_t)0.89867446569395392775f,
+(float16_t)-0.45508358712634372489f,(float16_t)0.89044872324475798919f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.48755016014843571837f,(float16_t)0.87309497841829020182f,
+(float16_t)-0.50353838372571746440f,(float16_t)0.86397285612158680745f,
+(float16_t)-0.51935599016558964269f,(float16_t)0.85455798836540053376f,
+(float16_t)-0.53499761988709704230f,(float16_t)0.84485356524970722791f,
+(float16_t)-0.55045797293660470029f,(float16_t)0.83486287498638012128f,
+(float16_t)-0.56573181078361323149f,(float16_t)0.82458930278502517996f,
+(float16_t)-0.58081395809576441547f,(float16_t)0.81403632970594852480f,
+(float16_t)-0.59569930449243335691f,(float16_t)0.80320753148064494287f,
+(float16_t)-0.61038280627630958630f,(float16_t)0.79210657730021227785f,
+(float16_t)-0.62485948814238623239f,(float16_t)0.78073722857209459924f,
+(float16_t)-0.63912444486377573138f,(float16_t)0.76910333764557958780f,
+(float16_t)-0.65317284295377653347f,(float16_t)0.75720884650648467851f,
+(float16_t)-0.66699992230363736034f,(float16_t)0.74505778544146605835f,
+(float16_t)-0.68060099779545302212f,(float16_t)0.73265427167241281570f,
+(float16_t)-0.69397146088965377952f,(float16_t)0.72000250796138176579f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.72000250796138165477f,(float16_t)0.69397146088965389055f,
+(float16_t)-0.73265427167241270467f,(float16_t)0.68060099779545324417f,
+(float16_t)-0.74505778544146594733f,(float16_t)0.66699992230363758239f,
+(float16_t)-0.75720884650648467851f,(float16_t)0.65317284295377664449f,
+(float16_t)-0.76910333764557947678f,(float16_t)0.63912444486377584241f,
+(float16_t)-0.78073722857209448822f,(float16_t)0.62485948814238634341f,
+(float16_t)-0.79210657730021216683f,(float16_t)0.61038280627630969732f,
+(float16_t)-0.80320753148064483184f,(float16_t)0.59569930449243346793f,
+(float16_t)-0.81403632970594841378f,(float16_t)0.58081395809576452649f,
+(float16_t)-0.82458930278502506894f,(float16_t)0.56573181078361345353f,
+(float16_t)-0.83486287498638001026f,(float16_t)0.55045797293660492233f,
+(float16_t)-0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)-0.85455798836540042274f,(float16_t)0.51935599016558975372f,
+(float16_t)-0.86397285612158669643f,(float16_t)0.50353838372571757542f,
+(float16_t)-0.87309497841829009079f,(float16_t)0.48755016014843588490f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.89044872324475787817f,(float16_t)0.45508358712634389143f,
+(float16_t)-0.89867446569395392775f,(float16_t)0.43861623853852754751f,
+(float16_t)-0.90659570451491533483f,(float16_t)0.42200027079979984812f,
+(float16_t)-0.91420975570353069095f,(float16_t)0.40524131400498991651f,
+(float16_t)-0.92151403934204179080f,(float16_t)0.38834504669882657923f,
+(float16_t)-0.92850608047321547822f,(float16_t)0.37131719395183770960f,
+(float16_t)-0.93518350993894761025f,(float16_t)0.35416352542049039931f,
+(float16_t)-0.94154406518302069529f,(float16_t)0.33688985339222032867f,
+(float16_t)-0.94758559101774109124f,(float16_t)0.31950203081601580291f,
+(float16_t)-0.95330604035419386211f,(float16_t)0.30200594931922802866f,
+(float16_t)-0.95870347489587148804f,(float16_t)0.28440753721127209896f,
+(float16_t)-0.96377606579543984022f,(float16_t)0.26671275747489847641f,
+(float16_t)-0.96852209427441737777f,(float16_t)0.24892760574572009302f,
+(float16_t)-0.97293995220556006576f,(float16_t)0.23105810828067133156f,
+(float16_t)-0.97702814265775439484f,(float16_t)0.21311031991609141745f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.98421009238692902521f,(float16_t)0.17700422041214894375f,
+(float16_t)-0.98730141815785843473f,(float16_t)0.15885814333386147346f,
+(float16_t)-0.99005821026229701154f,(float16_t)0.14065823933284954395f,
+(float16_t)-0.99247953459870996706f,(float16_t)0.12241067519921634832f,
+(float16_t)-0.99456457073425541537f,(float16_t)0.10412163387205457254f,
+(float16_t)-0.99631261218277800129f,(float16_t)0.08579731234444015753f,
+(float16_t)-0.99772306664419163624f,(float16_t)0.06744391956366417584f,
+(float16_t)-0.99879545620517240501f,(float16_t)0.04906767432741796636f,
+(float16_t)-0.99952941750109314256f,(float16_t)0.03067480317663686534f,
+(float16_t)-0.99992470183914450299f,(float16_t)0.01227153828572000692f,
+(float16_t)-0.99998117528260110909f,(float16_t)-0.00613588464915455420f,
+(float16_t)-0.99969881869620424997f,(float16_t)-0.02454122852291207996f,
+(float16_t)-0.99907772775264536147f,(float16_t)-0.04293825693494077861f,
+(float16_t)-0.99811811290014917919f,(float16_t)-0.06132073630220824523f,
+(float16_t)-0.99682029929116577893f,(float16_t)-0.07968243797142994522f,
+(float16_t)-0.99518472667219692873f,(float16_t)-0.09801714032956058975f,
+(float16_t)-0.99321194923479461103f,(float16_t)-0.11631863091190447479f,
+(float16_t)-0.99090263542778000971f,(float16_t)-0.13458070850712605671f,
+(float16_t)-0.98825756773074946437f,(float16_t)-0.15279718525844343535f,
+(float16_t)-0.98527764238894133264f,(float16_t)-0.17096188876030096737f,
+(float16_t)-0.98196386910955524296f,(float16_t)-0.18906866414980610935f,
+(float16_t)-0.97831737071962765473f,(float16_t)-0.20711137619221858808f,
+(float16_t)-0.97433938278557585821f,(float16_t)-0.22508391135979261000f,
+(float16_t)-0.97003125319454397424f,(float16_t)-0.24298017990326381543f,
+(float16_t)-0.96539444169768939830f,(float16_t)-0.26079411791527562503f,
+(float16_t)-0.96043051941556589757f,(float16_t)-0.27851968938505289319f,
+(float16_t)-0.95514116830577078243f,(float16_t)-0.29615088824362378883f,
+(float16_t)-0.94952818059303678577f,(float16_t)-0.31368174039889118454f,
+(float16_t)-0.94359345816196038559f,(float16_t)-0.33110630575987626267f,
+(float16_t)-0.93733901191257495977f,(float16_t)-0.34841868024943456472f,
+(float16_t)-0.93076696107898382326f,(float16_t)-0.36561299780477357624f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,
+(float16_t)-0.91667905992104270485f,(float16_t)-0.39962419984564684361f,
+(float16_t)-0.90916798309052249127f,(float16_t)-0.41642956009763693048f,
+(float16_t)-0.90134884704602202810f,(float16_t)-0.43309381885315184624f,
+(float16_t)-0.89322430119551532446f,(float16_t)-0.44961132965460665067f,
+(float16_t)-0.88479709843093790056f,(float16_t)-0.46597649576796595916f,
+(float16_t)-0.87607009419540660122f,(float16_t)-0.48218377207912266336f,
+(float16_t)-0.86704624551569287050f,(float16_t)-0.49822766697278153547f,
+(float16_t)-0.85772861000027211809f,(float16_t)-0.51410274419322155026f,
+(float16_t)-0.84812034480329723252f,(float16_t)-0.52980362468629460526f,
+(float16_t)-0.83822470555483818977f,(float16_t)-0.54532498842204613076f,
+(float16_t)-0.82804504525775590729f,(float16_t)-0.56066157619733592021f,
+(float16_t)-0.81758481315158371139f,(float16_t)-0.57580819141784533866f,
+(float16_t)-0.80684755354379944503f,(float16_t)-0.59075970185887394237f,
+(float16_t)-0.79583690460888356633f,(float16_t)-0.60551104140432543410f,
+(float16_t)-0.78455659715557524159f,(float16_t)-0.62005721176328920663f,
+(float16_t)-0.77301045336273710440f,(float16_t)-0.63439328416364526575f,
+(float16_t)-0.76120238548426188974f,(float16_t)-0.64851440102211233008f,
+(float16_t)-0.74913639452345925918f,(float16_t)-0.66241577759017178373f,
+(float16_t)-0.73681656887737001504f,(float16_t)-0.67609270357531581208f,
+(float16_t)-0.72424708295146700276f,(float16_t)-0.68954054473706682948f,
+(float16_t)-0.71143219574521665560f,(float16_t)-0.70275474445722507788f,
+(float16_t)-0.69837624940897302661f,(float16_t)-0.71573082528381848366f,
+(float16_t)-0.68508366777270035541f,(float16_t)-0.72846439044822519637f,
+(float16_t)-0.67155895484701866316f,(float16_t)-0.74095112535495888384f,
+(float16_t)-0.65780669329707874837f,(float16_t)-0.75318679904361240940f,
+(float16_t)-0.64383154288979149715f,(float16_t)-0.76516726562245895860f,
+(float16_t)-0.62963823891492687324f,(float16_t)-0.77688846567323255332f,
+(float16_t)-0.61523159058062726334f,(float16_t)-0.78834642762660589455f,
+(float16_t)-0.60061647938386930612f,(float16_t)-0.79953726910790479110f,
+(float16_t)-0.58579785745643908612f,(float16_t)-0.81045719825259465718f,
+(float16_t)-0.57078074588696736669f,(float16_t)-0.82110251499110464835f,
+(float16_t)-0.55557023301960217765f,(float16_t)-0.83146961230254523567f,
+(float16_t)-0.54017147272989274320f,(float16_t)-0.84155497743689855472f,
+(float16_t)-0.52458968267846928235f,(float16_t)-0.85135519310526486247f,
+(float16_t)-0.50883014254310732216f,(float16_t)-0.86086693863776708735f,
+(float16_t)-0.49289819222978420443f,(float16_t)-0.87008699110871134952f,
+(float16_t)-0.47679923006332214364f,(float16_t)-0.87901222642863341417f,
+(float16_t)-0.46053871095823989412f,(float16_t)-0.88763962040285404598f,
+(float16_t)-0.44412214457042975546f,(float16_t)-0.89596624975618488484f,
+(float16_t)-0.42755509343028247349f,(float16_t)-0.90398929312344311615f,
+(float16_t)-0.41084317105790418845f,(float16_t)-0.91170603200542976730f,
+(float16_t)-0.39399204006104820985f,(float16_t)-0.91911385169005765938f,
+(float16_t)-0.37700741021641820394f,(float16_t)-0.92621024213831137928f,
+(float16_t)-0.35989503653498794433f,(float16_t)-0.93299279883473895669f,
+(float16_t)-0.34266071731199487793f,(float16_t)-0.93945922360218969693f,
+(float16_t)-0.32531029216226331480f,(float16_t)-0.94560732538052116869f,
+(float16_t)-0.30784964004153508865f,(float16_t)-0.95143502096900833820f,
+(float16_t)-0.29028467725446244208f,(float16_t)-0.95694033573220882438f,
+(float16_t)-0.27262135544994886560f,(float16_t)-0.96212140426904158019f,
+(float16_t)-0.25486565960451434965f,(float16_t)-0.96697647104485218161f,
+(float16_t)-0.23702360599436766986f,(float16_t)-0.97150389098625167250f,
+(float16_t)-0.21910124015687010290f,(float16_t)-0.97570213003852845901f,
+(float16_t)-0.20110463484209206708f,(float16_t)-0.97956976568544051887f,
+(float16_t)-0.18303988795514095078f,(float16_t)-0.98310548743121628501f,
+(float16_t)-0.16491312048996975559f,(float16_t)-0.98630809724459866938f,
+(float16_t)-0.14673047445536230304f,(float16_t)-0.98917650996478090342f,
+(float16_t)-0.12849811079379358514f,(float16_t)-0.99170975366909952520f,
+(float16_t)-0.11022220729388330918f,(float16_t)-0.99390697000235606051f,
+(float16_t)-0.09190895649713282101f,(float16_t)-0.99576741446765981713f,
+(float16_t)-0.07356456359966735692f,(float16_t)-0.99729045667869020697f,
+(float16_t)-0.05519524434968971216f,(float16_t)-0.99847558057329477421f,
+(float16_t)-0.03680722294135933131f,(float16_t)-0.99932238458834943273f,
+(float16_t)-0.01840672990580516366f,(float16_t)-0.99983058179582340319f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.99729045667869020697f,(float16_t)0.07356456359966742631f,
+(float16_t)0.98917650996478101444f,(float16_t)0.14673047445536174793f,
+(float16_t)0.97570213003852857003f,(float16_t)0.21910124015686979759f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.93299279883473895669f,(float16_t)0.35989503653498811087f,
+(float16_t)0.90398929312344333820f,(float16_t)0.42755509343028208491f,
+(float16_t)0.87008699110871146054f,(float16_t)0.49289819222978403790f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.78834642762660622761f,(float16_t)0.61523159058062681925f,
+(float16_t)0.74095112535495921691f,(float16_t)0.67155895484701833009f,
+(float16_t)0.68954054473706694051f,(float16_t)0.72424708295146689174f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.57580819141784533866f,(float16_t)0.81758481315158371139f,
+(float16_t)0.51410274419322166128f,(float16_t)0.85772861000027211809f,
+(float16_t)0.44961132965460659516f,(float16_t)0.89322430119551532446f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.31368174039889157312f,(float16_t)0.94952818059303667475f,
+(float16_t)0.24298017990326398197f,(float16_t)0.97003125319454397424f,
+(float16_t)0.17096188876030135595f,(float16_t)0.98527764238894122162f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)0.02454122852291226384f,(float16_t)0.99969881869620424997f,
+(float16_t)-0.04906767432741800800f,(float16_t)0.99879545620517240501f,
+(float16_t)-0.12241067519921615403f,(float16_t)0.99247953459870996706f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.26671275747489830987f,(float16_t)0.96377606579543984022f,
+(float16_t)-0.33688985339221994009f,(float16_t)0.94154406518302080631f,
+(float16_t)-0.40524131400498974998f,(float16_t)0.91420975570353069095f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.53499761988709704230f,(float16_t)0.84485356524970722791f,
+(float16_t)-0.59569930449243335691f,(float16_t)0.80320753148064494287f,
+(float16_t)-0.65317284295377653347f,(float16_t)0.75720884650648467851f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.75720884650648467851f,(float16_t)0.65317284295377664449f,
+(float16_t)-0.80320753148064483184f,(float16_t)0.59569930449243346793f,
+(float16_t)-0.84485356524970711689f,(float16_t)0.53499761988709715332f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.91420975570353069095f,(float16_t)0.40524131400498991651f,
+(float16_t)-0.94154406518302069529f,(float16_t)0.33688985339222032867f,
+(float16_t)-0.96377606579543984022f,(float16_t)0.26671275747489847641f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.99247953459870996706f,(float16_t)0.12241067519921634832f,
+(float16_t)-0.99879545620517240501f,(float16_t)0.04906767432741796636f,
+(float16_t)-0.99969881869620424997f,(float16_t)-0.02454122852291207996f,
+(float16_t)-0.99518472667219692873f,(float16_t)-0.09801714032956058975f,
+(float16_t)-0.98527764238894133264f,(float16_t)-0.17096188876030096737f,
+(float16_t)-0.97003125319454397424f,(float16_t)-0.24298017990326381543f,
+(float16_t)-0.94952818059303678577f,(float16_t)-0.31368174039889118454f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,
+(float16_t)-0.89322430119551532446f,(float16_t)-0.44961132965460665067f,
+(float16_t)-0.85772861000027211809f,(float16_t)-0.51410274419322155026f,
+(float16_t)-0.81758481315158371139f,(float16_t)-0.57580819141784533866f,
+(float16_t)-0.77301045336273710440f,(float16_t)-0.63439328416364526575f,
+(float16_t)-0.72424708295146700276f,(float16_t)-0.68954054473706682948f,
+(float16_t)-0.67155895484701866316f,(float16_t)-0.74095112535495888384f,
+(float16_t)-0.61523159058062726334f,(float16_t)-0.78834642762660589455f,
+(float16_t)-0.55557023301960217765f,(float16_t)-0.83146961230254523567f,
+(float16_t)-0.49289819222978420443f,(float16_t)-0.87008699110871134952f,
+(float16_t)-0.42755509343028247349f,(float16_t)-0.90398929312344311615f,
+(float16_t)-0.35989503653498794433f,(float16_t)-0.93299279883473895669f,
+(float16_t)-0.29028467725446244208f,(float16_t)-0.95694033573220882438f,
+(float16_t)-0.21910124015687010290f,(float16_t)-0.97570213003852845901f,
+(float16_t)-0.14673047445536230304f,(float16_t)-0.98917650996478090342f,
+(float16_t)-0.07356456359966735692f,(float16_t)-0.99729045667869020697f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.95694033573220882438f,(float16_t)0.29028467725446233105f,
+(float16_t)0.83146961230254523567f,(float16_t)0.55557023301960217765f,
+(float16_t)0.63439328416364548779f,(float16_t)0.77301045336273688235f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)0.09801714032956077016f,(float16_t)0.99518472667219681771f,
+(float16_t)-0.19509032201612819257f,(float16_t)0.98078528040323043058f,
+(float16_t)-0.47139673682599769755f,(float16_t)0.88192126434835504956f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.88192126434835493853f,(float16_t)0.47139673682599780857f,
+(float16_t)-0.98078528040323043058f,(float16_t)0.19509032201612860891f,
+(float16_t)-0.99518472667219692873f,(float16_t)-0.09801714032956058975f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,
+(float16_t)-0.77301045336273710440f,(float16_t)-0.63439328416364526575f,
+(float16_t)-0.55557023301960217765f,(float16_t)-0.83146961230254523567f,
+(float16_t)-0.29028467725446244208f,(float16_t)-0.95694033573220882438f,
+(float16_t)1.00000000000000000000f,(float16_t)0.00000000000000000000f,
+(float16_t)0.38268343236508983729f,(float16_t)0.92387953251128673848f,
+(float16_t)-0.70710678118654746172f,(float16_t)0.70710678118654757274f,
+(float16_t)-0.92387953251128684951f,(float16_t)-0.38268343236508967076f,};
+
+#endif
+
+
+
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+#endif /* if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f16.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..05d7b97cb5610d8c1c352ac4b7967856417ab9c0
--- /dev/null
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f16.c
@@ -0,0 +1,185 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cmplx_conj_f16.c
+ * Description:  Floating-point complex conjugate
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/complex_math_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+/**
+  @ingroup groupCmplxMath
+ */
+
+/**
+  @defgroup cmplx_conj Complex Conjugate
+
+  Conjugates the elements of a complex data vector.
+
+  The <code>pSrc</code> points to the source data and
+  <code>pDst</code> points to the destination data where the result should be written.
+  <code>numSamples</code> specifies the number of complex samples
+  and the data in each array is stored in an interleaved fashion
+  (real, imag, real, imag, ...).
+  Each array has a total of <code>2*numSamples</code> values.
+
+  The following underlying algorithm is used:
+  <pre>
+  for (n = 0; n < numSamples; n++) {
+      pDst[(2*n)  ] =  pSrc[(2*n)  ];    // real part
+      pDst[(2*n)+1] = -pSrc[(2*n)+1];    // imag part
+  }
+  </pre>
+
+  There are separate functions for floating-point, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup cmplx_conj
+  @{
+ */
+
+/**
+  @brief         Floating-point complex conjugate.
+  @param[in]     pSrc        points to the input vector
+  @param[out]    pDst        points to the output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_cmplx_conj_f16(
+    const float16_t * pSrc,
+    float16_t * pDst,
+    uint32_t numSamples)
+{
+    /* Lane multipliers: +1 keeps each real part, -1 negates each imaginary part */
+    static const float16_t cmplx_conj_sign[8] = { 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f };
+    uint32_t blockSize = numSamples * CMPLX_DIM;   /* scalar element count: numSamples * CMPLX_DIM */
+    uint32_t blkCnt;                               /* loop counter */
+    f16x8_t vecSrc;
+    f16x8_t vecSign;
+
+    /*
+     * Load the sign vector with the load intrinsic rather than dereferencing a casted pointer:
+     * the cast discarded const and relied on the array meeting the vector type's alignment.
+     */
+    vecSign = vld1q(cmplx_conj_sign);
+
+    /* Vector loop: each iteration handles 8 f16 values, i.e. 4 complex samples */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vmulq(vecSrc, vecSign));
+        /* Advance both pointers by one vector and decrement the loop counter */
+        pSrc += 8;
+        pDst += 8;
+        blkCnt--;
+    }
+
+    /* Tail: complex samples left over after the vector loop,
+       conjugated one at a time */
+    blkCnt = (blockSize & 0x7) >> 1;
+
+    while (blkCnt > 0U)
+    {
+      /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
+
+      /* Copy the real part unchanged, store the negated imaginary part */
+      *pDst++ =  *pSrc++;
+      *pDst++ = -*pSrc++;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+}
+
+#else
+void arm_cmplx_conj_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t numSamples)
+{
+  uint32_t remaining;   /* Complex samples left to process */
+
+  /*
+   * Data is interleaved (real, imag, real, imag, ...), so every complex
+   * sample occupies two consecutive values in pSrc and pDst.
+   */
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Unrolled pass: four complex samples (eight values) per iteration */
+  for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
+  {
+    /* Real parts are copied as-is, imaginary parts change sign */
+    pDst[0] =  pSrc[0];
+    pDst[1] = -pSrc[1];
+
+    pDst[2] =  pSrc[2];
+    pDst[3] = -pSrc[3];
+
+    pDst[4] =  pSrc[4];
+    pDst[5] = -pSrc[5];
+
+    pDst[6] =  pSrc[6];
+    pDst[7] = -pSrc[7];
+
+    /* Step past the eight values just handled */
+    pSrc += 8;
+    pDst += 8;
+  }
+
+  /* Samples that did not fill a complete group of four */
+  remaining = numSamples & 0x3U;
+
+#else
+
+  /* No unrolling: every sample goes through the loop below */
+  remaining = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* One complex sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    /* C[0] + jC[1] = A[0] + j(-1)A[1] */
+    pDst[0] =  pSrc[0];
+    pDst[1] = -pSrc[1];
+
+    pSrc += 2;
+    pDst += 2;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of cmplx_conj group
+ */
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
index 4e74bccd1a18d2f9fe9c5f0c41080fd989db885a..5920a77806db834840de5f6a3728a3ed67baa6a9 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_conj_f32.c
  * Description:  Floating-point complex conjugate
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
index 31c19c4e1f80dce230f3949fbfbe6c0a097b3999..c076eff66bcecab222511dc6c3efdb168a153820 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_conj_q15.c
  * Description:  Q15 complex conjugate
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -50,7 +50,7 @@
  */
 
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_cmplx_conj_q15(
   const q15_t * pSrc,
         q15_t * pDst,
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
index 9cc029b1281c5b2032b952df5bac3ae1a7233506..ec1b5b703baf7dea34ccee3418f2bdb92cf0d5cc 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_conj_q31.c
  * Description:  Q31 complex conjugate
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -49,7 +49,7 @@
                    The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 void arm_cmplx_conj_q31(
   const q31_t * pSrc,
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f16.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..f75d04e8c416418d89f012b8fd17a4b5e6a179a3
--- /dev/null
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f16.c
@@ -0,0 +1,288 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cmplx_dot_prod_f16.c
+ * Description:  Floating-point complex dot product
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/complex_math_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupCmplxMath
+ */
+
+/**
+  @defgroup cmplx_dot_prod Complex Dot Product
+
+  Computes the dot product of two complex vectors.
+  The vectors are multiplied element-by-element and then summed.
+
+  The <code>pSrcA</code> points to the first complex input vector and
+  <code>pSrcB</code> points to the second complex input vector.
+  <code>numSamples</code> specifies the number of complex samples
+  and the data in each array is stored in an interleaved fashion
+  (real, imag, real, imag, ...).
+  Each array has a total of <code>2*numSamples</code> values.
+
+  The following underlying algorithm is used:
+
+  <pre>
+  realResult = 0;
+  imagResult = 0;
+  for (n = 0; n < numSamples; n++) {
+      realResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
+      imagResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
+  }
+  </pre>
+
+  There are separate functions for floating-point, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup cmplx_dot_prod
+  @{
+ */
+
+/**
+  @brief         Floating-point complex dot product.
+  @param[in]     pSrcA       points to the first input vector
+  @param[in]     pSrcB       points to the second input vector
+  @param[in]     numSamples  number of samples in each vector
+  @param[out]    realResult  real part of the result returned here
+  @param[out]    imagResult  imaginary part of the result returned here
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_cmplx_dot_prod_f16(
+    const float16_t * pSrcA,
+    const float16_t * pSrcB,
+    uint32_t numSamples,
+    float16_t * realResult,
+    float16_t * imagResult)
+{
+    int32_t         blkCnt;                         /* signed: (numSamples >> 3) - 1 can be negative */
+    float16_t       real_sum, imag_sum;             /* filled in by the final reduction */
+    f16x8_t         vecSrcA, vecSrcB;               /* first pipelined operand pair */
+    f16x8_t         vec_acc = vdupq_n_f16(0.0f16);  /* accumulator updated by every vcmlaq* call */
+    f16x8_t         vecSrcC, vecSrcD;               /* second pipelined operand pair */
+
+    blkCnt = (numSamples >> 3);     /* number of complete groups of 8 complex samples (2 vectors per input) */
+    blkCnt -= 1;                    /* the last group is processed by the epilogue after the loop */
+    if (blkCnt > 0) {
+        /* Preload the first operand pair: should give more freedom to generate stall free code */
+        vecSrcA = vld1q( pSrcA);
+        vecSrcB = vld1q( pSrcB);
+        pSrcA += 8;
+        pSrcB += 8;
+
+        while (blkCnt > 0) {    /* each iteration accumulates two vector pairs and loads the next two */
+            vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
+            vecSrcC = vld1q(pSrcA);
+            pSrcA += 8;
+
+            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
+            vecSrcD = vld1q(pSrcB);
+            pSrcB += 8;
+
+            vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
+            vecSrcA = vld1q(pSrcA);
+            pSrcA += 8;
+
+            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
+            vecSrcB = vld1q(pSrcB);
+            pSrcB += 8;
+            /*
+             * Decrement the blkCnt loop counter
+             */
+            blkCnt--;
+        }
+
+        /* Process the last two vector pairs outside the loop, to avoid armclang breaking the SW pipeline */
+        vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
+        vecSrcC = vld1q(pSrcA);     /* pSrcA is deliberately not advanced here */
+
+        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
+        vecSrcD = vld1q(pSrcB);     /* pSrcB is deliberately not advanced here */
+
+        vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
+        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
+
+        /*
+         * Tail: the (numSamples & 7) leftover complex samples, read with predicated loads
+         */
+        blkCnt = CMPLX_DIM * (numSamples & 7);
+        while (blkCnt > 0) {
+            mve_pred16_t    p = vctp16q(blkCnt);    /* predicate built from the remaining f16 count */
+            pSrcA += 8;     /* pointers still address the vector loaded last; step past it first */
+            pSrcB += 8;
+
+            vecSrcA = vldrhq_z_f16(pSrcA, p);
+            vecSrcB = vldrhq_z_f16(pSrcB, p);
+            vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
+            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
+
+            blkCnt -= 8;
+        }
+    } else {
+        /* Small vector (numSamples < 16): a single fully predicated loop, no software pipelining */
+        blkCnt = numSamples * CMPLX_DIM;
+        vec_acc = vdupq_n_f16(0.0f16);      /* already zero from its initializer */
+
+        do {    /* body runs at least once, even when numSamples is 0 */
+            mve_pred16_t    p = vctp16q(blkCnt);
+
+            vecSrcA = vldrhq_z_f16(pSrcA, p);
+            vecSrcB = vldrhq_z_f16(pSrcB, p);
+
+            vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
+            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
+
+            /*
+             * Advance both vector source pointers
+             * Decrement the blkCnt loop counter by one vector of f16 values
+             */
+            pSrcA += 8;
+            pSrcB += 8;
+            blkCnt -= 8;
+        }
+        while (blkCnt > 0);
+    }
+
+    /* Sum the partial parts; helper is defined outside this file -- presumably even lanes -> real, odd -> imag (confirm) */
+    mve_cmplx_sum_intra_r_i_f16(vec_acc, real_sum, imag_sum);
+
+    /*
+     * Store the real and imaginary results in the destination buffers
+     */
+    *realResult = real_sum;
+    *imagResult = imag_sum;
+}
+
+#else
+void arm_cmplx_dot_prod_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        uint32_t numSamples,
+        float16_t * realResult,
+        float16_t * imagResult)
+{
+  uint32_t remaining;                       /* Complex samples left to process */
+  _Float16 accRe = 0.0f, accIm = 0.0f;      /* Running real and imaginary sums */
+  _Float16 aRe, aIm, bRe, bIm;              /* Current sample of A and of B */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Unrolled pass: four complex products are accumulated per iteration */
+  for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
+  {
+    aRe = pSrcA[0];
+    aIm = pSrcA[1];
+    bRe = pSrcB[0];
+    bIm = pSrcB[1];
+
+    accRe += aRe * bRe;
+    accIm += aRe * bIm;
+    accRe -= aIm * bIm;
+    accIm += aIm * bRe;
+
+    aRe = pSrcA[2];
+    aIm = pSrcA[3];
+    bRe = pSrcB[2];
+    bIm = pSrcB[3];
+
+    accRe += aRe * bRe;
+    accIm += aRe * bIm;
+    accRe -= aIm * bIm;
+    accIm += aIm * bRe;
+
+    aRe = pSrcA[4];
+    aIm = pSrcA[5];
+    bRe = pSrcB[4];
+    bIm = pSrcB[5];
+
+    accRe += aRe * bRe;
+    accIm += aRe * bIm;
+    accRe -= aIm * bIm;
+    accIm += aIm * bRe;
+
+    aRe = pSrcA[6];
+    aIm = pSrcA[7];
+    bRe = pSrcB[6];
+    bIm = pSrcB[7];
+
+    accRe += aRe * bRe;
+    accIm += aRe * bIm;
+    accRe -= aIm * bIm;
+    accIm += aIm * bRe;
+
+    /* Step both inputs past the eight values just consumed */
+    pSrcA += 8;
+    pSrcB += 8;
+  }
+
+  /* Samples that did not fill a complete group of four */
+  remaining = numSamples & 0x3U;
+
+#else
+
+  /* No unrolling: every sample goes through the loop below */
+  remaining = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* One complex product per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    aRe = pSrcA[0];
+    aIm = pSrcA[1];
+    bRe = pSrcB[0];
+    bIm = pSrcB[1];
+
+    accRe += aRe * bRe;
+    accIm += aRe * bIm;
+    accRe -= aIm * bIm;
+    accIm += aIm * bRe;
+
+    pSrcA += 2;
+    pSrcB += 2;
+  }
+
+  /* Hand the accumulated real and imaginary sums back to the caller */
+  *realResult = accRe;
+  *imagResult = accIm;
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of cmplx_dot_prod group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
index 4925f02cd61f58609305d1cc65087606505b6e0f..af60d32bc851679f65eea0014359e2c155bc3c06 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_dot_prod_f32.c
  * Description:  Floating-point complex dot product
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -83,56 +83,94 @@ void arm_cmplx_dot_prod_f32(
     float32_t * realResult,
     float32_t * imagResult)
 {
-    uint32_t blockSize = numSamples * CMPLX_DIM;  /* loop counters */
-    uint32_t blkCnt;
-    float32_t real_sum, imag_sum;
-    f32x4_t vecSrcA, vecSrcB;
-    f32x4_t vec_acc = vdupq_n_f32(0.0f);
-    float32_t a0,b0,c0,d0;
-
-    /* Compute 2 complex samples at a time */
-    blkCnt = blockSize >> 2U;
-
-    while (blkCnt > 0U)
-    {
+    int32_t         blkCnt;
+    float32_t       real_sum, imag_sum;
+    f32x4_t         vecSrcA, vecSrcB;
+    f32x4_t         vec_acc = vdupq_n_f32(0.0f);
+    f32x4_t         vecSrcC, vecSrcD;
+
+    blkCnt = numSamples >> 2;
+    blkCnt -= 1;
+    if (blkCnt > 0) {
+        /* should give more freedom to generate stall free code */
         vecSrcA = vld1q(pSrcA);
         vecSrcB = vld1q(pSrcB);
+        pSrcA += 4;
+        pSrcB += 4;
 
+        while (blkCnt > 0) {
+            vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
+            vecSrcC = vld1q(pSrcA);
+            pSrcA += 4;
+
+            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
+            vecSrcD = vld1q(pSrcB);
+            pSrcB += 4;
+
+            vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
+            vecSrcA = vld1q(pSrcA);
+            pSrcA += 4;
+
+            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
+            vecSrcB = vld1q(pSrcB);
+            pSrcB += 4;
+            /*
+             * Decrement the blkCnt loop counter
+             */
+            blkCnt--;
+        }
+
+         /* process the last elements outside the loop to keep armclang from breaking the SW pipeline */
         vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
+        vecSrcC = vld1q(pSrcA);
+
         vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
+        vecSrcD = vld1q(pSrcB);
+
+        vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
+        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
 
         /*
-         * Decrement the blkCnt loop counter
-         * Advance vector source and destination pointers
+         * tail
          */
-        pSrcA += 4;
-        pSrcB += 4;
-        blkCnt--;
+        blkCnt = CMPLX_DIM * (numSamples & 3);
+        while (blkCnt > 0) {
+            mve_pred16_t    p = vctp32q(blkCnt);
+            pSrcA += 4;
+            pSrcB += 4;
+            vecSrcA = vldrwq_z_f32(pSrcA, p);
+            vecSrcB = vldrwq_z_f32(pSrcB, p);
+            vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
+            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
+            blkCnt -= 4;
+        }
+    } else {
+        /* small vector */
+        blkCnt = numSamples * CMPLX_DIM;
+        vec_acc = vdupq_n_f32(0.0f);
+
+        do {
+            mve_pred16_t    p = vctp32q(blkCnt);
+
+            vecSrcA = vldrwq_z_f32(pSrcA, p);
+            vecSrcB = vldrwq_z_f32(pSrcB, p);
+
+            vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
+            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
+
+            /*
+             * Decrement the blkCnt loop counter
+             * Advance vector source and destination pointers
+             */
+            pSrcA += 4;
+            pSrcB += 4;
+            blkCnt -= 4;
+        }
+        while (blkCnt > 0);
     }
 
-
     real_sum = vgetq_lane(vec_acc, 0) + vgetq_lane(vec_acc, 2);
     imag_sum = vgetq_lane(vec_acc, 1) + vgetq_lane(vec_acc, 3);
-   
-    /* Tail */
-    blkCnt = (blockSize & 3) >> 1;
-
-    while (blkCnt > 0U)
-    {
-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
-  
-      real_sum += a0 * c0;
-      imag_sum += a0 * d0;
-      real_sum -= b0 * d0;
-      imag_sum += b0 * c0;
-  
-      /* Decrement loop counter */
-      blkCnt--;
-    }
-
 
     /*
      * Store the real and imaginary results in the destination buffers
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
index 5502a48aea4df45fd5f5c22b4f7cdaab7d71b9da..1910d8ad9702de4a5fbd1e831fc298773759e2e6 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_dot_prod_q15.c
  * Description:  Processing function for the Q15 Complex Dot product
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -54,7 +54,7 @@
                    The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_cmplx_dot_prod_q15(
   const q15_t * pSrcA,
   const q15_t * pSrcB,
@@ -62,76 +62,98 @@ void arm_cmplx_dot_prod_q15(
         q31_t * realResult,
         q31_t * imagResult)
 {
-
-  uint32_t blockSize = numSamples * CMPLX_DIM;  /* loop counters */
-  uint32_t blkCnt;
-  q15_t a0,b0,c0,d0;
-
-  q63_t accReal = 0LL; q63_t accImag = 0LL;
-  q15x8_t vecSrcA, vecSrcB;
-
-
-
-  /* should give more freedom to generate stall free code */
-  vecSrcA = vld1q(pSrcA);
-  vecSrcB = vld1q(pSrcB);
-  pSrcA += 8;
-  pSrcB += 8;
-
-  /* Compute 4 complex samples at a time */
-  blkCnt = blockSize >> 3;
-  while (blkCnt > 0U) 
-  {
-      q15x8_t vecSrcC, vecSrcD;
-
-      accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
-      vecSrcC = vld1q(pSrcA);
-      pSrcA += 8;
-
-      accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
-      vecSrcD = vld1q(pSrcB);
-      pSrcB += 8;
-
-      accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
-      vecSrcA = vld1q(pSrcA);
-      pSrcA += 8;
-
-      accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
-      vecSrcB = vld1q(pSrcB);
-      pSrcB += 8;
-      /*
-       * Decrement the blockSize loop counter
-       */
-      blkCnt--;
-  }
-
-  /* Tail */
-  pSrcA -= 8;
-  pSrcB -= 8; 
-
-  blkCnt = (blockSize & 7) >> 1;
-  
-  while (blkCnt > 0U)
-  {
-    a0 = *pSrcA++;
-    b0 = *pSrcA++;
-    c0 = *pSrcB++;
-    d0 = *pSrcB++;
-
-    accReal += (q31_t)a0 * c0;
-    accImag += (q31_t)a0 * d0;
-    accReal -= (q31_t)b0 * d0;
-    accImag += (q31_t)b0 * c0;
-
-    /* Decrement loop counter */
-    blkCnt--;
-  }
-
-  /* Store real and imaginary result in 8.24 format  */
-  /* Convert real data in 34.30 to 8.24 by 6 right shifts */
-  *realResult = (q31_t) (accReal >> 6);
-  /* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
-  *imagResult = (q31_t) (accImag >> 6);
+    int32_t         blkCnt;
+    q63_t           accReal = 0LL;
+    q63_t           accImag = 0LL;
+    q15x8_t         vecSrcA, vecSrcB;
+    q15x8_t         vecSrcC, vecSrcD;
+
+    blkCnt = (numSamples >> 3);
+    blkCnt -= 1;
+    if (blkCnt > 0) {
+        /* should give more freedom to generate stall free code */
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        pSrcA += 8;
+        pSrcB += 8;
+
+        while (blkCnt > 0) {
+
+            accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
+            vecSrcC = vld1q(pSrcA);
+            pSrcA += 8;
+
+            accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
+            vecSrcD = vld1q(pSrcB);
+            pSrcB += 8;
+
+            accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
+            vecSrcA = vld1q(pSrcA);
+            pSrcA += 8;
+
+            accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
+            vecSrcB = vld1q(pSrcB);
+            pSrcB += 8;
+            /*
+             * Decrement the blkCnt loop counter
+             */
+            blkCnt--;
+        }
+
+        /* process the last elements outside the loop to keep armclang from breaking the SW pipeline */
+        accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
+        vecSrcC = vld1q(pSrcA);
+
+        accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
+        vecSrcD = vld1q(pSrcB);
+
+        accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
+        vecSrcA = vld1q(pSrcA);
+
+        accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
+        vecSrcB = vld1q(pSrcB);
+
+        /*
+         * tail
+         */
+        blkCnt = CMPLX_DIM * (numSamples & 7);
+        do {
+            mve_pred16_t    p = vctp16q(blkCnt);
+
+            pSrcA += 8;
+            pSrcB += 8;
+
+            vecSrcA = vldrhq_z_s16(pSrcA, p);
+            vecSrcB = vldrhq_z_s16(pSrcB, p);
+
+            accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p);
+            accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p);
+
+            blkCnt -= 8;
+        }
+        while ((int32_t) blkCnt > 0);
+    } else {
+        blkCnt = numSamples * CMPLX_DIM;
+        while (blkCnt > 0) {
+            mve_pred16_t    p = vctp16q(blkCnt);
+
+            vecSrcA = vldrhq_z_s16(pSrcA, p);
+            vecSrcB = vldrhq_z_s16(pSrcB, p);
+
+            accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p);
+            accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p);
+
+            /*
+             * Decrement the blkCnt loop counter
+             * Advance vector source and destination pointers
+             */
+            pSrcA += 8;
+            pSrcB += 8;
+            blkCnt -= 8;
+        }
+    }
+    *realResult = asrl(accReal, (14 - 8));
+    *imagResult = asrl(accImag, (14 - 8));
 }
 #else
 void arm_cmplx_dot_prod_q15(
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
index 5e086ecac58ca57e0685310475faf879112b6505..fac85bd0fcfd98e327245089fc9ebb24af7330bb 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_dot_prod_q31.c
  * Description:  Q31 complex dot product
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -55,7 +55,7 @@
                    Input down scaling is not required.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 void arm_cmplx_dot_prod_q31(
   const q31_t * pSrcA,
@@ -64,60 +64,99 @@ void arm_cmplx_dot_prod_q31(
         q63_t * realResult,
         q63_t * imagResult)
 {
-    uint32_t blockSize = numSamples * CMPLX_DIM;  /* loop counters */
-    uint32_t blkCnt;
-    q31x4_t vecSrcA, vecSrcB;
-    q63_t accReal = 0LL; 
-    q63_t accImag = 0LL;
+    int32_t         blkCnt;
+    q63_t           accReal = 0LL;
+    q63_t           accImag = 0LL;
+    q31x4_t         vecSrcA, vecSrcB;
+    q31x4_t         vecSrcC, vecSrcD;
+
+    blkCnt = numSamples >> 2;
+    blkCnt -= 1;
+    if (blkCnt > 0) {
+        /* should give more freedom to generate stall free code */
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        pSrcA += 4;
+        pSrcB += 4;
 
-    q31_t a0,b0,c0,d0;
+        while (blkCnt > 0) {
 
-     /* Compute 2 complex samples at a time */
-    blkCnt = blockSize >> 2U;
+            accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
+            vecSrcC = vld1q(pSrcA);
+            pSrcA += 4;
 
-    while (blkCnt > 0U)
-    {        
+            accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
+            vecSrcD = vld1q(pSrcB);
+            pSrcB += 4;
 
-        vecSrcA = vld1q(pSrcA);
-        vecSrcB = vld1q(pSrcB);
+            accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
+            vecSrcA = vld1q(pSrcA);
+            pSrcA += 4;
 
+            accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
+            vecSrcB = vld1q(pSrcB);
+            pSrcB += 4;
+            /*
+             * Decrement the blkCnt loop counter
+             */
+            blkCnt--;
+        }
+
+        /* process the last elements outside the loop to keep armclang from breaking the SW pipeline */
         accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
+        vecSrcC = vld1q(pSrcA);
+
         accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
+        vecSrcD = vld1q(pSrcB);
+
+        accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
+        vecSrcA = vld1q(pSrcA);
+
+        accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
+        vecSrcB = vld1q(pSrcB);
 
         /*
-         * Decrement the blkCnt loop counter
-         * Advance vector source and destination pointers
+         * tail
          */
-        pSrcA += 4;
-        pSrcB += 4;
-        blkCnt --;
+        blkCnt = CMPLX_DIM * (numSamples & 3);
+        do {
+            mve_pred16_t    p = vctp32q(blkCnt);
+
+            pSrcA += 4;
+            pSrcB += 4;
+
+            vecSrcA = vldrwq_z_s32(pSrcA, p);
+            vecSrcB = vldrwq_z_s32(pSrcB, p);
+
+            accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
+            accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
+
+            blkCnt -= 4;
+        }
+        while ((int32_t) blkCnt > 0);
+    } else {
+        blkCnt = numSamples * CMPLX_DIM;
+        while (blkCnt > 0) {
+            mve_pred16_t    p = vctp32q(blkCnt);
+
+            vecSrcA = vldrwq_z_s32(pSrcA, p);
+            vecSrcB = vldrwq_z_s32(pSrcB, p);
+
+            accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
+            accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
+
+            /*
+             * Decrement the blkCnt loop counter
+             * Advance vector source and destination pointers
+             */
+            pSrcA += 4;
+            pSrcB += 4;
+            blkCnt -= 4;
+        }
     }
+    *realResult = asrl(accReal, (14 - 8));
+    *imagResult = asrl(accImag, (14 - 8));
 
-    accReal = asrl(accReal, (14 - 8));
-    accImag = asrl(accImag, (14 - 8));
-
-    /* Tail */
-    blkCnt = (blockSize & 3) >> 1;
-
-    while (blkCnt > 0U)
-    {
-      a0 = *pSrcA++;
-      b0 = *pSrcA++;
-      c0 = *pSrcB++;
-      d0 = *pSrcB++;
-  
-      accReal += ((q63_t)a0 * c0) >> 14;
-      accImag += ((q63_t)a0 * d0) >> 14;
-      accReal -= ((q63_t)b0 * d0) >> 14;
-      accImag += ((q63_t)b0 * c0) >> 14;
-  
-      /* Decrement loop counter */
-      blkCnt--;
-    }
-  
-    /* Store real and imaginary result in destination buffer. */
-    *realResult = accReal;
-    *imagResult = accImag;
 }
 
 #else
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f16.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..c2f31989e61cb16b62c328fd82eca0955587b72d
--- /dev/null
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f16.c
@@ -0,0 +1,241 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cmplx_mag_f16.c
+ * Description:  Floating-point complex magnitude
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/complex_math_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+/**
+  @ingroup groupCmplxMath
+ */
+
+/**
+  @defgroup cmplx_mag Complex Magnitude
+
+  Computes the magnitude of the elements of a complex data vector.
+
+  The <code>pSrc</code> points to the source data and
+  <code>pDst</code> points to where the result should be written.
+  <code>numSamples</code> specifies the number of complex samples
+  in the input array and the data is stored in an interleaved fashion
+  (real, imag, real, imag, ...).
+  The input array has a total of <code>2*numSamples</code> values;
+  the output array has a total of <code>numSamples</code> values.
+
+  The underlying algorithm is:
+
+  <pre>
+  for (n = 0; n < numSamples; n++) {
+      pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
+  }
+  </pre>
+
+  There are separate functions for floating-point, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup cmplx_mag
+  @{
+ */
+
+/**
+  @brief         Floating-point complex magnitude.
+  @param[in]     pSrc        points to input vector
+  @param[out]    pDst        points to output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+
+/* MVE (Helium) half-precision variant: full groups of 8 complex samples go through
+ * the loop; the last (numSamples & 7) samples are written with a predicated store. */
+void arm_cmplx_mag_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t numSamples)
+{
+    int32_t blockSize = numSamples;  /* number of complex samples to process */
+    uint32_t  blkCnt;           /* loop counter */
+    f16x8x2_t vecSrc;
+    f16x8_t sum;
+
+    /* Compute 8 complex samples at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        q15x8_t newtonStartVec;
+        f16x8_t sumHalf, invSqrt;
+
+        vecSrc = vld2q(pSrc);  /* 16 input values, split into val[0] and val[1] */
+        pSrc += 16;
+        sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
+        sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
+
+        /* inlined fast SQRT using the inverse-SQRT Newton-Raphson method */
+
+        /* compute initial value: magic constant minus the lanes of sum (cast to q15x8_t) shifted right by 1 */
+        newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
+        sumHalf = sum * 0.5f;
+        /*
+         * compute 3 x iterations
+         *
+         * The more iterations, the more accuracy.
+         * If you need to trade a bit of accuracy for more performance,
+         * you can comment out the 3rd use of the macro.
+         */
+        INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
+        INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
+        INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
+        /*
+         * set negative values to 0
+         */
+        invSqrt = vdupq_m(invSqrt, (float16_t)0.0f, vcmpltq(invSqrt, (float16_t)0.0f));
+        /*
+         * sqrt(x) = x * invSqrt(x)
+         */
+        sum = vmulq(sum, invSqrt);
+        vstrhq_f16(pDst, sum);  /* store 8 results */
+        pDst += 8;
+        /*
+         * Decrement the blkCnt loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail: remaining (numSamples & 7) samples
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);  /* predicate used by the final store */
+        q15x8_t newtonStartVec;
+        f16x8_t sumHalf, invSqrt;
+
+        vecSrc = vld2q((float16_t const *)pSrc);  /* NOTE(review): this load takes no predicate - confirm a full-width read is safe here */
+        sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
+        sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
+
+        /*
+         * inlined Fast SQRT using inverse SQRT newton-raphson method
+         */
+
+        /* compute initial value */
+        newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
+        sumHalf = vmulq(sum, (float16_t)0.5);
+        /*
+         * compute 2 x iterations (one fewer than the main loop above)
+         */
+        INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
+        INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
+        /*
+         * set negative values to 0
+         */
+        invSqrt = vdupq_m(invSqrt, (float16_t)0.0, vcmpltq(invSqrt, (float16_t)0.0));
+        /*
+         * sqrt(x) = x * invSqrt(x)
+         */
+        sum = vmulq(sum, invSqrt);
+        vstrhq_p_f16(pDst, sum, p0);  /* store under predicate p0 */
+    }
+}
+
+#else
+/*
+ * Scalar half-precision implementation: for each complex sample the sum of the
+ * squared real and imaginary parts is passed to arm_sqrt_f16() together with
+ * the address of the next output element.
+ */
+void arm_cmplx_mag_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t numSamples)
+{
+  _Float16 re, im;       /* real and imaginary part of the current sample */
+  uint32_t remaining;    /* samples still to be processed by the current loop */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Unrolled section: four magnitudes per pass, numSamples / 4 passes */
+  for (remaining = numSamples / 4U; remaining != 0U; remaining--)
+  {
+    /* pDst[k] = sqrt(re^2 + im^2) for the four samples of this group */
+
+    re = pSrc[0];
+    im = pSrc[1];
+    arm_sqrt_f16((re * re) + (im * im), pDst++);
+
+    re = pSrc[2];
+    im = pSrc[3];
+    arm_sqrt_f16((re * re) + (im * im), pDst++);
+
+    re = pSrc[4];
+    im = pSrc[5];
+    arm_sqrt_f16((re * re) + (im * im), pDst++);
+
+    re = pSrc[6];
+    im = pSrc[7];
+    arm_sqrt_f16((re * re) + (im * im), pDst++);
+
+    /* Step past the eight input values consumed above */
+    pSrc += 8;
+  }
+
+  /* Whatever is left (0..3 samples) goes through the generic loop below */
+  remaining = numSamples & 0x3U;
+
+#else
+
+  /* No unrolling: the generic loop below handles every sample */
+  remaining = numSamples;
+
+#endif /* defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  /*
+   * One magnitude per pass
+   */
+  for (; remaining != 0U; remaining--)
+  {
+    /* pDst[n] = sqrt(re^2 + im^2) */
+    re = pSrc[0];
+    im = pSrc[1];
+    pSrc += 2;
+
+    arm_sqrt_f16((re * re) + (im * im), pDst++);
+  }
+
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of cmplx_mag group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
index ff614eb05ae0b6f7c798be8e6101408412ec9885..aac6fba89accd710e8f44494f8bc8d6f1e2ac633 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_mag_f32.c
  * Description:  Floating-point complex magnitude
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
index 498c0e600b1fd21d86bfdcecf4669d335a5580af..e29a3d504de275a046f1005656585ba6b1f5302e 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_mag_q15.c
  * Description:  Q15 complex magnitude
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -47,7 +47,7 @@
   @par           Scaling and Overflow Behavior
                    The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
index d0950ee4d6da2d970e9015389de3bcd84b73bc1d..57a72f5f167acd24a8f8a77052bbfef453dda6fe 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_mag_q31.c
  * Description:  Q31 complex magnitude
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -49,7 +49,7 @@
                    Input down scaling is not required.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f16.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..5d5a3a24b616ef313fdc342bd5b0d922e0907851
--- /dev/null
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f16.c
@@ -0,0 +1,174 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cmplx_mag_squared_f16.c
+ * Description:  Floating-point complex magnitude squared
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/complex_math_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+  @ingroup groupCmplxMath
+ */
+
+/**
+  @defgroup cmplx_mag_squared Complex Magnitude Squared
+
+  Computes the magnitude squared of the elements of a complex data vector.
+
+  The <code>pSrc</code> points to the source data and
+  <code>pDst</code> points to where the result should be written.
+  <code>numSamples</code> specifies the number of complex samples
+  in the input array and the data is stored in an interleaved fashion
+  (real, imag, real, imag, ...).
+  The input array has a total of <code>2*numSamples</code> values;
+  the output array has a total of <code>numSamples</code> values.
+
+  The underlying algorithm is:
+
+  <pre>
+  for (n = 0; n < numSamples; n++) {
+      pDst[n] = pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2;
+  }
+  </pre>
+
+  There are separate functions for floating-point, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup cmplx_mag_squared
+  @{
+ */
+
+/**
+  @brief         Floating-point complex magnitude squared.
+  @param[in]     pSrc        points to input vector
+  @param[out]    pDst        points to output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/* MVE (Helium) half-precision variant: every pass handles up to 8 complex samples
+ * under a predicate, so there is no separate remainder loop. */
+void arm_cmplx_mag_squared_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t numSamples)
+{
+    int32_t blockSize = numSamples;  /* complex samples still to process */
+    f16x8x2_t vecSrc;
+    f16x8_t sum;
+
+    /* Compute up to 8 complex samples per pass */
+    while (blockSize > 0)
+    {
+        mve_pred16_t p = vctp16q(blockSize);  /* predicate built from the remaining count */
+        vecSrc = vld2q(pSrc);  /* 16 input values, split into val[0] and val[1] */
+        sum = vmulq_m(vuninitializedq_f16(),vecSrc.val[0], vecSrc.val[0],p);
+        sum = vfmaq_m(sum, vecSrc.val[1], vecSrc.val[1],p);
+        vstrhq_p_f16(pDst, sum,p);
+        /* NOTE(review): the vld2q above takes no predicate - confirm the full-width read on the last pass is safe */
+        pSrc += 16;
+        pDst += 8;
+
+        /* Decrement the blockSize loop counter by the 8 samples just handled */
+        blockSize-= 8;
+    }
+
+}
+
+#else
+/*
+ * Scalar half-precision implementation: each output value is the sum of the
+ * squared real and imaginary parts of the corresponding input sample.
+ */
+void arm_cmplx_mag_squared_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t numSamples)
+{
+  _Float16 re, im;       /* real and imaginary part of the current sample */
+  uint32_t remaining;    /* samples still to be processed by the current loop */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Unrolled section: four outputs per pass, numSamples / 4 passes */
+  for (remaining = numSamples / 4U; remaining != 0U; remaining--)
+  {
+    /* pDst[k] = re^2 + im^2 for the four samples of this group */
+
+    re = pSrc[0];
+    im = pSrc[1];
+    pDst[0] = (re * re) + (im * im);
+
+    re = pSrc[2];
+    im = pSrc[3];
+    pDst[1] = (re * re) + (im * im);
+
+    re = pSrc[4];
+    im = pSrc[5];
+    pDst[2] = (re * re) + (im * im);
+
+    re = pSrc[6];
+    im = pSrc[7];
+    pDst[3] = (re * re) + (im * im);
+
+    /* Step past the eight inputs read and the four outputs written */
+    pSrc += 8;
+    pDst += 4;
+  }
+
+  /* Whatever is left (0..3 samples) goes through the generic loop below */
+  remaining = numSamples & 0x3U;
+
+#else
+
+  /* No unrolling: the generic loop below handles every sample */
+  remaining = numSamples;
+
+#endif /* defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  /* One output per pass */
+  for (; remaining != 0U; remaining--)
+  {
+    /* pDst[n] = re^2 + im^2 */
+    re = pSrc[0];
+    im = pSrc[1];
+    pSrc += 2;
+
+    *pDst++ = (re * re) + (im * im);
+  }
+
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of cmplx_mag_squared group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
index 1cea04e97b2de51e0c0e39a3b70d176b32db28fa..861585f16bd3b2156505ae2e7240e70a6701e470 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_mag_squared_f32.c
  * Description:  Floating-point complex magnitude squared
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
index a6107be71f6738832e5649b86151aa80bc9a541a..42fc442ef98d218297bdb1198ee72e4b70ad35ea 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_mag_squared_q15.c
  * Description:  Q15 complex magnitude squared
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -48,7 +48,7 @@
                    The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 void arm_cmplx_mag_squared_q15(
   const q15_t * pSrc,
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
index 7127e6ea4e21c9b7863f8cd72130db1be1f8d1bf..4ccde36e50dbc9fabdebe7c244ea91e7dabae194 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_mag_squared_q31.c
  * Description:  Q31 complex magnitude squared
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -49,7 +49,7 @@
                    Input down scaling is not required.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 void arm_cmplx_mag_squared_q31(
   const q31_t * pSrc,
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..3409450916a7e8644ba95857f59ef2e9a8dc5e93
--- /dev/null
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c
@@ -0,0 +1,271 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cmplx_mult_cmplx_f16.c
+ * Description:  Floating-point complex-by-complex multiplication
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/complex_math_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+  @ingroup groupCmplxMath
+ */
+
+/**
+  @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication
+
+  Multiplies a complex vector by another complex vector and generates a complex result.
+  The data in the complex arrays is stored in an interleaved fashion
+  (real, imag, real, imag, ...).
+  The parameter <code>numSamples</code> represents the number of complex
+  samples processed.  The complex arrays have a total of <code>2*numSamples</code>
+  real values.
+
+  The underlying algorithm is:
+
+  <pre>
+  for (n = 0; n < numSamples; n++) {
+      pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
+      pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
+  }
+  </pre>
+
+  There are separate functions for floating-point, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup CmplxByCmplxMult
+  @{
+ */
+
+/**
+  @brief         Half-precision floating-point complex-by-complex multiplication.
+  @param[in]     pSrcA       points to first input vector
+  @param[in]     pSrcB       points to second input vector
+  @param[out]    pDst        points to output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_cmplx_mult_cmplx_f16(
+  const float16_t * pSrcA,
+  const float16_t * pSrcB,
+        float16_t * pDst,
+        uint32_t numSamples)
+{
+     int32_t         blkCnt;             /* signed: is -1 for numSamples < 8, and the tail loops count down past zero */
+    f16x8_t         vecSrcA, vecSrcB;   /* source vectors currently being multiplied */
+    f16x8_t         vecSrcC, vecSrcD;   /* next source vectors, loaded while A/B are processed */
+    f16x8_t         vec_acc;            /* result vector stored to pDst */
+    /* Pipelined path: each loop iteration handles two vectors (16 values) per source. */
+    blkCnt = (numSamples >> 3);         /* number of complete groups of 8 samples */
+    blkCnt -= 1;                        /* the last group is processed after the loop */
+    if (blkCnt > 0) {                   /* i.e. numSamples >= 16 */
+        /* preload the first vectors; should give more freedom to generate stall-free code */
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        pSrcA += 8;
+        pSrcB += 8;
+
+        while (blkCnt > 0) {
+            vec_acc = vcmulq(vecSrcA, vecSrcB);                  /* first half of the A*B product */
+            vecSrcC = vld1q(pSrcA);
+            pSrcA += 8;
+
+            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);   /* accumulate the second half (see ACLE for rotation semantics) */
+            vecSrcD = vld1q(pSrcB);
+            pSrcB += 8;
+            vst1q(pDst, vec_acc);
+            pDst += 8;
+
+            vec_acc = vcmulq(vecSrcC, vecSrcD);                  /* same two steps for the C*D pair */
+            vecSrcA = vld1q(pSrcA);
+            pSrcA += 8;
+
+            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
+            vecSrcB = vld1q(pSrcB);
+            pSrcB += 8;
+            vst1q(pDst, vec_acc);
+            pDst += 8;
+            /*
+             * Decrement the blkCnt loop counter
+             */
+            blkCnt--;
+        }
+
+        /* process the last two vectors outside the loop to avoid armclang breaking the SW pipeline */
+        vec_acc = vcmulq(vecSrcA, vecSrcB);
+        vecSrcC = vld1q(pSrcA);         /* note: pSrcA is NOT advanced after this load */
+
+        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
+        vecSrcD = vld1q(pSrcB);         /* note: pSrcB is NOT advanced after this load */
+        vst1q(pDst, vec_acc);
+        pDst += 8;
+
+        vec_acc = vcmulq(vecSrcC, vecSrcD);
+        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
+        vst1q(pDst, vec_acc);
+        pDst += 8;
+
+        /*
+         * tail: the remaining (numSamples & 7) samples, using predicated loads/stores
+         */
+        blkCnt = CMPLX_DIM * (numSamples & 7);   /* CMPLX_DIM is defined elsewhere; presumably 2 values per sample */
+        while (blkCnt > 0) {
+            mve_pred16_t    p = vctp16q(blkCnt);   /* predicate derived from the count of remaining values */
+            pSrcA += 8;                 /* step past the vector consumed previously (sources are advanced before, not after, the load) */
+            pSrcB += 8;
+
+            vecSrcA = vldrhq_z_f16(pSrcA, p);
+            vecSrcB = vldrhq_z_f16(pSrcB, p);
+            vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p);
+            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
+
+            vstrhq_p_f16(pDst, vec_acc, p);
+            pDst += 8;
+
+            blkCnt -= 8;
+        }
+    } else {
+        /* small vector: numSamples < 16, every load/store is predicated */
+        blkCnt = numSamples * CMPLX_DIM;
+
+        do {   /* NOTE(review): body runs once even when numSamples == 0; presumably harmless if vctp16q(0) disables all lanes - confirm */
+            mve_pred16_t    p = vctp16q(blkCnt);
+
+            vecSrcA = vldrhq_z_f16(pSrcA, p);
+            vecSrcB = vldrhq_z_f16(pSrcB, p);
+
+            vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p);
+            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
+            vstrhq_p_f16(pDst, vec_acc, p);
+            pDst += 8;
+
+            /*
+             * Advance the vector source pointers
+             * Decrement the blkCnt loop counter (8 values per pass)
+             */
+            pSrcA += 8;
+            pSrcB += 8;
+            blkCnt -= 8;
+        }
+        while (blkCnt > 0);
+    }
+
+}
+
+
+#else
+void arm_cmplx_mult_cmplx_f16(
+  const float16_t * pSrcA,     /* first input: two values (a, b) are read per sample */
+  const float16_t * pSrcB,     /* second input: two values (c, d) are read per sample */
+        float16_t * pDst,      /* output: two values are written per sample */
+        uint32_t numSamples)   /* number of samples (pairs of values) to process */
+{
+    uint32_t blkCnt;                               /* Loop counter */
+    _Float16 a, b, c, d;  /* Temporary variables to store real and imaginary values */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = numSamples >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
+    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
+
+    a = *pSrcA++;
+    b = *pSrcA++;
+    c = *pSrcB++;
+    d = *pSrcB++;
+    /* store result in destination buffer. */
+    *pDst++ = (a * c) - (b * d);
+    *pDst++ = (a * d) + (b * c);
+
+    a = *pSrcA++;
+    b = *pSrcA++;
+    c = *pSrcB++;
+    d = *pSrcB++;
+    *pDst++ = (a * c) - (b * d);
+    *pDst++ = (a * d) + (b * c);
+
+    a = *pSrcA++;
+    b = *pSrcA++;
+    c = *pSrcB++;
+    d = *pSrcB++;
+    *pDst++ = (a * c) - (b * d);
+    *pDst++ = (a * d) + (b * c);
+
+    a = *pSrcA++;
+    b = *pSrcA++;
+    c = *pSrcB++;
+    d = *pSrcB++;
+    *pDst++ = (a * c) - (b * d);
+    *pDst++ = (a * d) + (b * c);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = numSamples % 0x4U;   /* 0..3 samples left over from the unrolled loop */
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = numSamples;
+
+#endif /* defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+  /* Scalar loop: handles every sample, or only the leftovers when unrolling is enabled */
+  while (blkCnt > 0U)
+  {
+    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
+    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
+
+    a = *pSrcA++;
+    b = *pSrcA++;
+    c = *pSrcB++;
+    d = *pSrcB++;
+
+    /* store result in destination buffer. */
+    *pDst++ = (a * c) - (b * d);
+    *pDst++ = (a * d) + (b * c);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of CmplxByCmplxMult group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
index 3352032eb96f2999b12c2b16a361616732ca79fb..5d2c66c4749f98d1e85dca06db51d7ca77379fb3 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_mult_cmplx_f32.c
  * Description:  Floating-point complex-by-complex multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -76,54 +76,104 @@ void arm_cmplx_mult_cmplx_f32(
         float32_t * pDst,
         uint32_t numSamples)
 {
-    uint32_t blkCnt;           /* loop counters */
-    uint32_t blockSize = numSamples;  /* loop counters */
-    float32_t a, b, c, d;  /* Temporary variables to store real and imaginary values */
-
-    f32x4x2_t vecA;
-    f32x4x2_t vecB;
-    f32x4x2_t vecDst;
-
-    /* Compute 4 complex outputs at a time */
-    blkCnt = blockSize >> 2;
-    while (blkCnt > 0U)
-    {
-            vecA = vld2q(pSrcA);  // load & separate real/imag pSrcA (de-interleave 2)
-            vecB = vld2q(pSrcB);  // load & separate real/imag pSrcB
-            pSrcA += 8;
-            pSrcB += 8;
-
-            /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
-            vecDst.val[0] = vmulq(vecA.val[0], vecB.val[0]);
-            vecDst.val[0] = vfmsq(vecDst.val[0],vecA.val[1], vecB.val[1]);
-            /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
-            vecDst.val[1] = vmulq(vecA.val[0], vecB.val[1]);
-            vecDst.val[1] = vfmaq(vecDst.val[1], vecA.val[1], vecB.val[0]);
-
-            vst2q(pDst, vecDst);
-            pDst += 8;
-
+     int32_t         blkCnt;
+    f32x4_t         vecSrcA, vecSrcB;
+    f32x4_t         vecSrcC, vecSrcD;
+    f32x4_t         vec_acc;
+
+    blkCnt = numSamples >> 2;
+    blkCnt -= 1;
+    if (blkCnt > 0) {
+        /* should give more freedom to generate stall free code */
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        pSrcA += 4;
+        pSrcB += 4;
+
+        while (blkCnt > 0) {
+            vec_acc = vcmulq(vecSrcA, vecSrcB);
+            vecSrcC = vld1q(pSrcA);
+            pSrcA += 4;
+
+            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
+            vecSrcD = vld1q(pSrcB);
+            pSrcB += 4;
+            vst1q(pDst, vec_acc);
+            pDst += 4;
+
+            vec_acc = vcmulq(vecSrcC, vecSrcD);
+            vecSrcA = vld1q(pSrcA);
+            pSrcA += 4;
+
+            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
+            vecSrcB = vld1q(pSrcB);
+            pSrcB += 4;
+            vst1q(pDst, vec_acc);
+            pDst += 4;
+            /*
+             * Decrement the blkCnt loop counter
+             */
             blkCnt--;
-    }
-
-    /* Tail */
-    blkCnt = blockSize & 3;
-    while (blkCnt > 0U)
-    {
-      /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
-      /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
-  
-      a = *pSrcA++;
-      b = *pSrcA++;
-      c = *pSrcB++;
-      d = *pSrcB++;
-  
-      /* store result in destination buffer. */
-      *pDst++ = (a * c) - (b * d);
-      *pDst++ = (a * d) + (b * c);
-  
-      /* Decrement loop counter */
-      blkCnt--;
+        }
+
+        /* process the last elements outside the loop to avoid armclang breaking the SW pipeline */
+        vec_acc = vcmulq(vecSrcA, vecSrcB);
+        vecSrcC = vld1q(pSrcA);
+
+        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
+        vecSrcD = vld1q(pSrcB);
+        vst1q(pDst, vec_acc);
+        pDst += 4;
+
+        vec_acc = vcmulq(vecSrcC, vecSrcD);
+        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
+        vst1q(pDst, vec_acc);
+        pDst += 4;
+
+        /*
+         * tail
+         */
+        blkCnt = CMPLX_DIM * (numSamples & 3);
+        while (blkCnt > 0) {
+            mve_pred16_t    p = vctp32q(blkCnt);
+            pSrcA += 4;
+            pSrcB += 4;
+
+            vecSrcA = vldrwq_z_f32(pSrcA, p);
+            vecSrcB = vldrwq_z_f32(pSrcB, p);
+            vec_acc = vcmulq_m(vuninitializedq_f32(),vecSrcA, vecSrcB, p);
+            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
+
+            vstrwq_p_f32(pDst, vec_acc, p);
+            pDst += 4;
+
+            blkCnt -= 4;
+        }
+    } else {
+        /* small vector */
+        blkCnt = numSamples * CMPLX_DIM;
+        vec_acc = vdupq_n_f32(0.0f);
+
+        do {
+            mve_pred16_t    p = vctp32q(blkCnt);
+
+            vecSrcA = vldrwq_z_f32(pSrcA, p);
+            vecSrcB = vldrwq_z_f32(pSrcB, p);
+
+            vec_acc = vcmulq_m(vuninitializedq_f32(),vecSrcA, vecSrcB, p);
+            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
+            vstrwq_p_f32(pDst, vec_acc, p);
+            pDst += 4;
+
+            /*
+             * Decrement the blkCnt loop counter
+             * Advance vector source and destination pointers
+             */
+            pSrcA += 4;
+            pSrcB += 4;
+            blkCnt -= 4;
+        }
+        while (blkCnt > 0);
     }
 
 }
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
index 82b31f9d0fd39f81b39d48c9ad484f8ceb31b717..53fbe58dddad2661a4772fa50dd8d586128abad3 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_mult_cmplx_q15.c
  * Description:  Q15 complex-by-complex multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -49,7 +49,7 @@
                    The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 void arm_cmplx_mult_cmplx_q15(
   const q15_t * pSrcA,
@@ -57,54 +57,116 @@ void arm_cmplx_mult_cmplx_q15(
         q15_t * pDst,
         uint32_t numSamples)
 {
-  uint32_t blkCnt;           /* loop counters */
-  uint32_t blockSize = numSamples * CMPLX_DIM;  /* loop counters */
-  q15_t a, b, c, d;  
-
-  q15x8_t vecA;
-  q15x8_t vecB;
-  q15x8_t vecDst;
-
-  blkCnt = blockSize >> 3;
-  while (blkCnt > 0U)
-  {
-      vecA = vld1q(pSrcA);
-      vecB = vld1q(pSrcB);
-      /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
-      vecDst = vqdmlsdhq_s16(vuninitializedq_s16(), vecA, vecB);
-      /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
-      vecDst = vqdmladhxq_s16(vecDst, vecA, vecB);
-
-      vecDst = vshrq(vecDst, 2);
-
-      vst1q(pDst, vecDst);
-
-      blkCnt --;
-      pSrcA += 8;
-      pSrcB += 8;
-      pDst += 8;
-  };
-
-  /*
-   * tail
-   */
-  blkCnt = (blockSize & 7) >> 1;
-  while (blkCnt > 0U)
-  {
-    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
-    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
-
-    a = *pSrcA++;
-    b = *pSrcA++;
-    c = *pSrcB++;
-    d = *pSrcB++;
-
-    /* store result in 3.13 format in destination buffer. */
-    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
-    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
-
-    /* Decrement loop counter */
-    blkCnt--;
+   int32_t         blkCnt;
+    q15x8_t         vecSrcA, vecSrcB;
+    q15x8_t         vecSrcC, vecSrcD;
+    q15x8_t         vecDst;
+
+    blkCnt = (numSamples >> 3);
+    blkCnt -= 1;
+    if (blkCnt > 0) 
+    {
+        /* should give more freedom to generate stall free code */
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+        pSrcA += 8;
+        pSrcB += 8;
+
+        while (blkCnt > 0) 
+        {
+
+            /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
+            vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB);
+            vecSrcC = vld1q(pSrcA);
+            pSrcA += 8;
+
+            /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
+            vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
+            vecSrcD = vld1q(pSrcB);
+            pSrcB += 8;
+
+            vstrhq_s16(pDst, vshrq(vecDst, 2));
+            pDst += 8;
+
+            vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD);
+            vecSrcA = vld1q(pSrcA);
+            pSrcA += 8;
+
+            vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
+            vecSrcB = vld1q(pSrcB);
+            pSrcB += 8;
+
+            vstrhq_s16(pDst, vshrq(vecDst, 2));
+            pDst += 8;
+
+            /*
+             * Decrement the blkCnt loop counter
+             */
+            blkCnt--;
+        }
+
+        /* process the last elements outside the loop to avoid armclang breaking the SW pipeline */
+        vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB);
+        vecSrcC = vld1q(pSrcA);
+
+        vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
+        vecSrcD = vld1q(pSrcB);
+
+        vstrhq_s16(pDst, vshrq(vecDst, 2));
+        pDst += 8;
+
+        vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD);
+        vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
+
+        vstrhq_s16(pDst, vshrq(vecDst, 2));
+        pDst += 8;
+
+        /*
+         * tail
+         */
+        blkCnt = CMPLX_DIM * (numSamples & 7);
+        do 
+        {
+            mve_pred16_t    p = vctp16q(blkCnt);
+
+            pSrcA += 8;
+            pSrcB += 8;
+
+            vecSrcA = vldrhq_z_s16(pSrcA, p);
+            vecSrcB = vldrhq_z_s16(pSrcB, p);
+
+            vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p);
+            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
+
+            vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p);
+            vstrhq_p_s16(pDst, vecDst, p);
+            pDst += 8;
+
+            blkCnt -= 8;
+        }
+        while ((int32_t) blkCnt > 0);
+    } 
+    else 
+    {
+        blkCnt = numSamples * CMPLX_DIM;
+        while (blkCnt > 0) {
+            mve_pred16_t    p = vctp16q(blkCnt);
+
+            vecSrcA = vldrhq_z_s16(pSrcA, p);
+            vecSrcB = vldrhq_z_s16(pSrcB, p);
+
+            vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p);
+            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
+
+            vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p);
+            vstrhq_p_s16(pDst, vecDst, p);
+
+            pDst += 8;
+            pSrcA += 8;
+            pSrcB += 8;
+
+            blkCnt -= 8;
+    }
   }
 }
 #else
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
index a14decf521f88d4e21fea0cd60de451e6f9db855..7f945619247ff638384a8d2c43198a149d719e97 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_mult_cmplx_q31.c
  * Description:  Q31 complex-by-complex multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -50,59 +50,118 @@
                    Input down scaling is not required.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_cmplx_mult_cmplx_q31(
   const q31_t * pSrcA,
   const q31_t * pSrcB,
         q31_t * pDst,
         uint32_t numSamples)
 {
-
-    uint32_t blkCnt;           /* loop counters */
-    uint32_t blockSize = numSamples * CMPLX_DIM;  /* loop counters */
-    q31x4_t vecA;
-    q31x4_t vecB;
-    q31x4_t vecDst;
-    q31_t a, b, c, d;                              /* Temporary variables */
-
-    /* Compute 2 complex outputs at a time */
-    blkCnt = blockSize >> 2;
-    while (blkCnt > 0U)
-    {
-
-        vecA = vld1q(pSrcA);
-        vecB = vld1q(pSrcB);
-        /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
-        vecDst = vqdmlsdhq(vuninitializedq_s32(),vecA, vecB);
-        /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
-        vecDst = vqdmladhxq(vecDst, vecA, vecB);
-
-        vecDst = vshrq(vecDst, 2);
-        vst1q(pDst, vecDst);
-
-        blkCnt --;
+    int32_t         blkCnt;
+    q31x4_t         vecSrcA, vecSrcB;
+    q31x4_t         vecSrcC, vecSrcD;
+    q31x4_t         vecDst;
+
+    blkCnt = numSamples >> 2;
+    blkCnt -= 1;
+    if (blkCnt > 0) {
+        /* should give more freedom to generate stall free code */
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
         pSrcA += 4;
         pSrcB += 4;
+
+        while (blkCnt > 0) {
+
+            /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
+            vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB);
+            vecSrcC = vld1q(pSrcA);
+            pSrcA += 4;
+
+            /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
+            vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
+            vecSrcD = vld1q(pSrcB);
+            pSrcB += 4;
+
+            vst1q(pDst, vshrq(vecDst, 2));
+            pDst += 4;
+
+            vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD);
+            vecSrcA = vld1q(pSrcA);
+            pSrcA += 4;
+
+            vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
+            vecSrcB = vld1q(pSrcB);
+            pSrcB += 4;
+
+            vst1q(pDst, vshrq(vecDst, 2));
+            pDst += 4;
+
+            /*
+             * Decrement the blkCnt loop counter
+             */
+            blkCnt--;
+        }
+
+        /* process the last elements outside the loop to avoid armclang breaking the SW pipeline */
+        vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB);
+        vecSrcC = vld1q(pSrcA);
+
+        vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
+        vecSrcD = vld1q(pSrcB);
+
+        vst1q(pDst, vshrq(vecDst, 2));
+        pDst += 4;
+
+        vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD);
+        vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
+
+        vst1q(pDst, vshrq(vecDst, 2));
         pDst += 4;
-    };
-
-    blkCnt = (blockSize & 3) >> 1;
-    while (blkCnt > 0U)
-    {
-      /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
-      /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
-  
-      a = *pSrcA++;
-      b = *pSrcA++;
-      c = *pSrcB++;
-      d = *pSrcB++;
-  
-      /* store result in 3.29 format in destination buffer. */
-      *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
-      *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
-  
-      /* Decrement loop counter */
-      blkCnt--;
+
+        /*
+         * tail
+         */
+        blkCnt = CMPLX_DIM * (numSamples & 3);
+        do {
+            mve_pred16_t    p = vctp32q(blkCnt);
+
+            pSrcA += 4;
+            pSrcB += 4;
+
+            vecSrcA = vldrwq_z_s32(pSrcA, p);
+            vecSrcB = vldrwq_z_s32(pSrcB, p);
+
+            vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p);
+            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
+
+            vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p);
+            vstrwq_p_s32(pDst, vecDst, p);
+            pDst += 4;
+
+            blkCnt -= 4;
+        }
+        while ((int32_t) blkCnt > 0);
+    } else {
+        blkCnt = numSamples * CMPLX_DIM;
+        while (blkCnt > 0) {
+            mve_pred16_t    p = vctp32q(blkCnt);
+
+            vecSrcA = vldrwq_z_s32(pSrcA, p);
+            vecSrcB = vldrwq_z_s32(pSrcB, p);
+
+            vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p);
+            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
+
+            vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p);
+            vstrwq_p_s32(pDst, vecDst, p);
+
+            pDst += 4;
+            pSrcA += 4;
+            pSrcB += 4;
+
+            blkCnt -= 4;
+        }
     }
 }
 #else
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f16.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..248858bc79232e9763375d53202e63a83377e5bf
--- /dev/null
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f16.c
@@ -0,0 +1,194 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cmplx_mult_real_f16.c
+ * Description:  Floating-point complex by real multiplication
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/complex_math_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+  @ingroup groupCmplxMath
+ */
+
+/**
+  @defgroup CmplxByRealMult Complex-by-Real Multiplication
+
+  Multiplies a complex vector by a real vector and generates a complex result.
+  The data in the complex arrays is stored in an interleaved fashion
+  (real, imag, real, imag, ...).
+  The parameter <code>numSamples</code> represents the number of complex
+  samples processed.  The complex arrays have a total of <code>2*numSamples</code>
+  real values while the real array has a total of <code>numSamples</code>
+  real values.
+
+  The underlying algorithm is:
+
+  <pre>
+  for (n = 0; n < numSamples; n++) {
+      pCmplxDst[(2*n)+0] = pSrcCmplx[(2*n)+0] * pSrcReal[n];
+      pCmplxDst[(2*n)+1] = pSrcCmplx[(2*n)+1] * pSrcReal[n];
+  }
+  </pre>
+
+  There are separate functions for floating-point, Q15, and Q31 data types.
+ */
+
+/**
+  @addtogroup CmplxByRealMult
+  @{
+ */
+
+/**
+  @brief         Floating-point complex-by-real multiplication.
+  @param[in]     pSrcCmplx   points to complex input vector
+  @param[in]     pSrcReal    points to real input vector
+  @param[out]    pCmplxDst   points to complex output vector
+  @param[in]     numSamples  number of samples in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_cmplx_mult_real_f16(
+  const float16_t * pSrcCmplx,
+  const float16_t * pSrcReal,
+        float16_t * pCmplxDst,
+        uint32_t numSamples)
+{
+    /* Element offsets into pSrcReal: every real sample feeds two lanes (re, im) */
+    static const uint16_t stride_cmplx_x_real_16[8] = {
+        0, 0, 1, 1, 2, 2, 3, 3
+        };
+    uint32_t blockSizeC = numSamples * CMPLX_DIM;   /* f16 elements in the complex vectors */
+    uint32_t blkCnt;                                /* loop counter */
+    f16x8_t rVec;
+    f16x8_t cmplxVec;
+    f16x8_t dstVec;
+    uint16x8_t strideVec;
+
+    /* stride vector for pairs of real generation */
+    strideVec = vld1q(stride_cmplx_x_real_16);
+
+    /* Compute 4 complex outputs at a time */
+    blkCnt = blockSizeC >> 3;
+    while (blkCnt > 0U)
+    {
+        cmplxVec = vld1q(pSrcCmplx);
+        rVec = vldrhq_gather_shifted_offset_f16(pSrcReal, strideVec);
+        dstVec = vmulq(cmplxVec, rVec);
+        vst1q(pCmplxDst, dstVec);
+
+        pSrcReal += 4;
+        pSrcCmplx += 8;
+        pCmplxDst += 8;
+        blkCnt--;
+    }
+
+    /* Tail: predicate the loads as well as the store so no lane past the end is read */
+    blkCnt = blockSizeC & 7;
+    if (blkCnt > 0U) {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        cmplxVec = vldrhq_z_f16(pSrcCmplx, p0);
+        rVec = vldrhq_gather_shifted_offset_z_f16(pSrcReal, strideVec, p0);
+        dstVec = vmulq(cmplxVec, rVec);
+        vstrhq_p_f16(pCmplxDst, dstVec, p0);
+    }
+}
+
+#else
+void arm_cmplx_mult_real_f16(
+  const float16_t * pSrcCmplx,
+  const float16_t * pSrcReal,
+        float16_t * pCmplxDst,
+        uint32_t numSamples)
+{
+        uint32_t remaining;                            /* Complex samples still to process */
+        float16_t scale;                               /* Real factor for the current sample */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Unrolled section: four complex samples per pass */
+  for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
+  {
+    /* C[2 * i    ] = A[2 * i    ] * B[i]. */
+    /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
+
+    /* sample 0 */
+    scale = pSrcReal[0];
+    pCmplxDst[0] = pSrcCmplx[0] * scale;
+    pCmplxDst[1] = pSrcCmplx[1] * scale;
+
+    /* sample 1 */
+    scale = pSrcReal[1];
+    pCmplxDst[2] = pSrcCmplx[2] * scale;
+    pCmplxDst[3] = pSrcCmplx[3] * scale;
+
+    /* sample 2 */
+    scale = pSrcReal[2];
+    pCmplxDst[4] = pSrcCmplx[4] * scale;
+    pCmplxDst[5] = pSrcCmplx[5] * scale;
+
+    /* sample 3 */
+    scale = pSrcReal[3];
+    pCmplxDst[6] = pSrcCmplx[6] * scale;
+    pCmplxDst[7] = pSrcCmplx[7] * scale;
+
+    /* Step all three pointers past the four samples just handled */
+    pSrcReal  += 4;
+    pSrcCmplx += 8;
+    pCmplxDst += 8;
+  }
+
+  /* Leftover samples (numSamples modulo 4) go through the scalar loop below */
+  remaining = numSamples & 0x3U;
+
+#else
+
+  /* No unrolling: the scalar loop below handles every sample */
+  remaining = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  for (; remaining > 0U; remaining--)
+  {
+    /* C[2 * i    ] = A[2 * i    ] * B[i]. */
+    /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
+    scale = *pSrcReal++;
+
+    /* The same real factor scales both the real and the imaginary component */
+    *pCmplxDst++ = *pSrcCmplx++ * scale;
+    *pCmplxDst++ = *pSrcCmplx++ * scale;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of CmplxByRealMult group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
index 25d60a5cfa5ec2476c8c11374db9e7de657da841..bc0f59b2c1b1194a4ae017d834217ccf0cf852f1 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_mult_real_f32.c
  * Description:  Floating-point complex by real multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -77,7 +77,7 @@ void arm_cmplx_mult_real_f32(
         float32_t * pCmplxDst,
         uint32_t numSamples)
 {
-    const static uint32_t stride_cmplx_x_real_32[4] = { 0, 0, 1, 1 };
+    static const uint32_t stride_cmplx_x_real_32[4] = { 0, 0, 1, 1 };
 
     uint32_t blockSizeC = numSamples * CMPLX_DIM;   /* loop counters */
     uint32_t blkCnt;
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
index 8175938e31b753a427d264b6337b5290c1632f5f..a43383b7db93d291b3850246a4830b3d1b6a8892 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_mult_real_q15.c
  * Description:  Q15 complex by real multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -49,7 +49,7 @@
                    The function uses saturating arithmetic.
                    Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 void arm_cmplx_mult_real_q15(
   const q15_t * pSrcCmplx,
@@ -57,7 +57,7 @@ void arm_cmplx_mult_real_q15(
         q15_t * pCmplxDst,
         uint32_t numSamples)
 {
-  const static uint16_t stride_cmplx_x_real_16[8] = {
+  static const uint16_t stride_cmplx_x_real_16[8] = {
       0, 0, 1, 1, 2, 2, 3, 3
       };
   q15x8_t rVec;
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
index bc9ba69fe05dc513fcd86d573fe1dec0fe4e3bf5..de13757d8c0c30ae64cc557415794b718f326deb 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_mult_real_q31.c
  * Description:  Q31 complex by real multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/complex_math_functions.h"
 
 /**
   @ingroup groupCmplxMath
@@ -50,7 +50,7 @@
                    Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_cmplx_mult_real_q31(
   const q31_t * pSrcCmplx,
   const q31_t * pSrcReal,
@@ -58,7 +58,7 @@ void arm_cmplx_mult_real_q31(
         uint32_t numSamples)
 {
 
-    const static uint32_t stride_cmplx_x_real_32[4] = {
+    static const uint32_t stride_cmplx_x_real_32[4] = {
         0, 0, 1, 1
     };
     q31x4_t rVec;
diff --git a/CMSIS/DSP/Source/ControllerFunctions/arm_pid_init_f32.c b/CMSIS/DSP/Source/ControllerFunctions/arm_pid_init_f32.c
index ecb703a036582a42a533d8944ae0045c520a3727..384994bdfe56665404f9855015c77eeb9a0f29a2 100644
--- a/CMSIS/DSP/Source/ControllerFunctions/arm_pid_init_f32.c
+++ b/CMSIS/DSP/Source/ControllerFunctions/arm_pid_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_pid_init_f32.c
  * Description:  Floating-point PID Control initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/controller_functions.h"
 
 /**
   @addtogroup PID
diff --git a/CMSIS/DSP/Source/ControllerFunctions/arm_pid_init_q15.c b/CMSIS/DSP/Source/ControllerFunctions/arm_pid_init_q15.c
index c88a3d909ce98e02c4bc519fdc5d134e48238230..e1f510b315c6002edfe4d16190fe6cf3edaee867 100644
--- a/CMSIS/DSP/Source/ControllerFunctions/arm_pid_init_q15.c
+++ b/CMSIS/DSP/Source/ControllerFunctions/arm_pid_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_pid_init_q15.c
  * Description:  Q15 PID Control initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/controller_functions.h"
 
 /**
   @addtogroup PID
diff --git a/CMSIS/DSP/Source/ControllerFunctions/arm_pid_init_q31.c b/CMSIS/DSP/Source/ControllerFunctions/arm_pid_init_q31.c
index 1625a5f2f658f0b747205300c40c1fe946bb5a0f..53df5ee5a92ec2c868f8013de207eb13efe70b02 100644
--- a/CMSIS/DSP/Source/ControllerFunctions/arm_pid_init_q31.c
+++ b/CMSIS/DSP/Source/ControllerFunctions/arm_pid_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_pid_init_q31.c
  * Description:  Q31 PID Control initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/controller_functions.h"
 
 /**
   @addtogroup PID
diff --git a/CMSIS/DSP/Source/ControllerFunctions/arm_pid_reset_f32.c b/CMSIS/DSP/Source/ControllerFunctions/arm_pid_reset_f32.c
index d839e55ea5d99d4bb49ec287d8f82b70412c060f..cb267adb8d5a6d67d223a7aa1a2d7e549d88d0f1 100644
--- a/CMSIS/DSP/Source/ControllerFunctions/arm_pid_reset_f32.c
+++ b/CMSIS/DSP/Source/ControllerFunctions/arm_pid_reset_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_pid_reset_f32.c
  * Description:  Floating-point PID Control reset function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/controller_functions.h"
 
 /**
   @addtogroup PID
diff --git a/CMSIS/DSP/Source/ControllerFunctions/arm_pid_reset_q15.c b/CMSIS/DSP/Source/ControllerFunctions/arm_pid_reset_q15.c
index 256fd8cae72c2ee15ab9a2853fcfc40593c73b67..aa4076ec983c298a30193e5f69f9e53153a9e837 100644
--- a/CMSIS/DSP/Source/ControllerFunctions/arm_pid_reset_q15.c
+++ b/CMSIS/DSP/Source/ControllerFunctions/arm_pid_reset_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_pid_reset_q15.c
  * Description:  Q15 PID Control reset function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/controller_functions.h"
 
 /**
   @addtogroup PID
diff --git a/CMSIS/DSP/Source/ControllerFunctions/arm_pid_reset_q31.c b/CMSIS/DSP/Source/ControllerFunctions/arm_pid_reset_q31.c
index 2aa391c04bfb6fb268ea327265806805c6af54f4..3e9c39fe7793b80e75c5c2640579bdd084ef0078 100644
--- a/CMSIS/DSP/Source/ControllerFunctions/arm_pid_reset_q31.c
+++ b/CMSIS/DSP/Source/ControllerFunctions/arm_pid_reset_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_pid_reset_q31.c
  * Description:  Q31 PID Control reset function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/controller_functions.h"
 
 /**
   @addtogroup PID
diff --git a/CMSIS/DSP/Source/ControllerFunctions/arm_sin_cos_f32.c b/CMSIS/DSP/Source/ControllerFunctions/arm_sin_cos_f32.c
index 12a1c83b182dcc1a1ceff65873b4073704935f87..e6d9d90f34fa926b3237fee353c11cb1f56ff902 100644
--- a/CMSIS/DSP/Source/ControllerFunctions/arm_sin_cos_f32.c
+++ b/CMSIS/DSP/Source/ControllerFunctions/arm_sin_cos_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_sin_cos_f32.c
  * Description:  Sine and Cosine calculation for floating-point values
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/controller_functions.h"
 #include "arm_common_tables.h"
 
 /**
diff --git a/CMSIS/DSP/Source/ControllerFunctions/arm_sin_cos_q31.c b/CMSIS/DSP/Source/ControllerFunctions/arm_sin_cos_q31.c
index 84ee3d2ca80412030fcce6bd107be486afbc0f32..990c891dc2e99207ebc6e2bad110c41f73052a9e 100644
--- a/CMSIS/DSP/Source/ControllerFunctions/arm_sin_cos_q31.c
+++ b/CMSIS/DSP/Source/ControllerFunctions/arm_sin_cos_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_sin_cos_q31.c
  * Description:  Cosine & Sine calculation for Q31 values
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/controller_functions.h"
 #include "arm_common_tables.h"
 
 /**
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_boolean_distance.c b/CMSIS/DSP/Source/DistanceFunctions/arm_boolean_distance.c
index dfd5d9836fee57a0b6445c4de7daf49f69453e47..57873b1e07f620dee9c42a56162d5046497940f6 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_boolean_distance.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_boolean_distance.c
@@ -4,11 +4,13 @@
  * Title:        arm_svm_linear_init_f32.c
  * Description:  SVM Linear Instance Initialization
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,7 +27,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_boolean_distance_template.h b/CMSIS/DSP/Source/DistanceFunctions/arm_boolean_distance_template.h
index 6888e9e02a8f21a227cd8a0d6f40e125c28e01b0..eb2e2af9f80549c6300fa42a7fcda4c69ce4defa 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_boolean_distance_template.h
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_boolean_distance_template.h
@@ -4,11 +4,13 @@
  * Title:        arm_boolean_distance.c
  * Description:  Templates for boolean distances
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -44,6 +46,12 @@
  */
 
 
+
+/* Token-pasting helper: joins its two arguments into a single identifier. */
+#define _FUNC(A,B) A##B 
+/* FUNC(EXT) forms the identifier arm_boolean_distance with EXT appended. */
+#define FUNC(EXT) _FUNC(arm_boolean_distance, EXT)
+
 /**
  * @brief        Elements of boolean distances
  *
@@ -52,18 +60,11 @@
  * @param[in]    pA              First vector of packed booleans
  * @param[in]    pB              Second vector of packed booleans
  * @param[in]    numberOfBools   Number of booleans
- * @param[out]   cTT             cTT value
- * @param[out]   cTF             cTF value
- * @param[out]   cFT             cFT value
  * @return None
  *
  */
 
-#define _FUNC(A,B) A##B 
-
-#define FUNC(EXT) _FUNC(arm_boolean_distance, EXT)
-
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_common_tables.h"
 
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_braycurtis_distance_f16.c b/CMSIS/DSP/Source/DistanceFunctions/arm_braycurtis_distance_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..0ed32fc133ff2b151958b82a33e44201ec77a7c6
--- /dev/null
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_braycurtis_distance_f16.c
@@ -0,0 +1,158 @@
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_braycurtis_distance_f16.c
+ * Description:  Bray-Curtis distance between two vectors
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/distance_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @ingroup groupDistance
+ */
+
+/**
+ * @defgroup FloatDist Float Distances
+ *
+ * Distances between two vectors of float values.
+ */
+
+/**
+  @ingroup FloatDist
+ */
+
+/**
+  @defgroup braycurtis Bray-Curtis distance
+
+  Bray-Curtis distance between two vectors
+ */
+
+/**
+  @addtogroup braycurtis
+  @{
+ */
+
+
+/**
+ * @brief        Bray-Curtis distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
+{
+    f16x8_t         vecA, vecB, absTerm;
+    f16x8_t         diffAccV = vdupq_n_f16(0.0f);   /* lane-wise sums of |a - b| */
+    f16x8_t         sumAccV = vdupq_n_f16(0.0f);    /* lane-wise sums of |a + b| */
+    _Float16        diffTotal, sumTotal;
+    uint32_t        count;                          /* vector count, then tail length */
+
+    /* Main loop: consume eight half-precision values from each input per pass */
+    for (count = blockSize >> 3; count > 0; count--)
+    {
+        vecA = vld1q(pA);
+        vecB = vld1q(pB);
+        pA += 8;
+        pB += 8;
+
+        /* numerator contribution: |a - b| */
+        absTerm = vabdq(vecA, vecB);
+        diffAccV = vaddq(diffAccV, absTerm);
+
+        /* denominator contribution: |a + b| */
+        absTerm = vabsq_f16(vaddq_f16(vecA, vecB));
+        sumAccV = vaddq(sumAccV, absTerm);
+    }
+
+    /* Tail: a predicate restricts loads and accumulation to the remaining lanes */
+    count = blockSize & 7;
+    if (count != 0U)
+    {
+        mve_pred16_t    tailPred = vctp16q(count);
+
+        vecA = vldrhq_z_f16(pA, tailPred);
+        vecB = vldrhq_z_f16(pB, tailPred);
+
+        absTerm = vabdq(vecA, vecB);
+        diffAccV = vaddq_m(diffAccV, diffAccV, absTerm, tailPred);
+
+        absTerm = vabsq_f16(vaddq_f16(vecA, vecB));
+        sumAccV = vaddq_m(sumAccV, sumAccV, absTerm, tailPred);
+    }
+
+    /* Reduce each accumulator vector to a scalar through vecAddAcrossF16Mve */
+    diffTotal = vecAddAcrossF16Mve(diffAccV);
+    sumTotal = vecAddAcrossF16Mve(sumAccV);
+
+    /*
+       sumTotal is assumed to be non-zero. Being a sum of absolute values, it is zero
+       only when every term is zero, which is very unlikely for long vectors.
+     */
+    return (diffTotal / sumTotal);
+}
+#else
+
+float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
+{
+   _Float16 diffTotal = 0.0f16;   /* running sum of |a - b| */
+   _Float16 sumTotal = 0.0f16;    /* running sum of |a + b| */
+   uint32_t idx;
+
+   for (idx = 0U; idx < blockSize; idx++)
+   {
+      const _Float16 valA = pA[idx];
+      const _Float16 valB = pB[idx];
+
+      diffTotal += (_Float16)fabsf(valA - valB);
+      sumTotal += (_Float16)fabsf(valA + valB);
+   }
+   /*
+      sumTotal is assumed to be non-zero. Being a sum of absolute values, it is zero
+      only when every term is zero, which is very unlikely for long vectors.
+   */
+   return(diffTotal / sumTotal);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/**
+ * @} end of braycurtis group
+ */
+
+
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_braycurtis_distance_f32.c b/CMSIS/DSP/Source/DistanceFunctions/arm_braycurtis_distance_f32.c
index 38c9d18ed7a83da4f7789acf9702b78c134443b8..ae13c35883fc5237576f67cb660906161aa307d1 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_braycurtis_distance_f32.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_braycurtis_distance_f32.c
@@ -4,11 +4,13 @@
  * Title:        arm_braycurtis_distance_f32.c
  * Description:  Bray-Curtis distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,25 +27,14 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 
 /**
- * @ingroup groupDistance
- * @{
- */
-
-/**
- * @defgroup FloatDist Float Distances
- *
- * Distances between two vectors of float values.
- */
-
-/**
-  @addtogroup FloatDist
+  @addtogroup braycurtis
   @{
  */
 
@@ -191,9 +182,6 @@ float32_t arm_braycurtis_distance_f32(const float32_t *pA,const float32_t *pB, u
 
 
 /**
- * @} end of FloatDist group
+ * @} end of braycurtis group
  */
 
-/**
- * @} end of groupDistance group
- */
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_canberra_distance_f16.c b/CMSIS/DSP/Source/DistanceFunctions/arm_canberra_distance_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..cbb7e60c40cd7a18976042be90c96af56e67aeda
--- /dev/null
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_canberra_distance_f16.c
@@ -0,0 +1,171 @@
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_canberra_distance_f16.c
+ * Description:  Canberra distance between two vectors
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/distance_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup FloatDist
+ */
+
+/**
+  @defgroup Canberra Canberra distance
+
+  Canberra distance
+ */
+
+
+/**
+  @addtogroup Canberra
+  @{
+ */
+
+
+/**
+ * @brief        Canberra distance between two vectors
+ *
+ * This function may divide by zero when samples pA[i] and pB[i] are both zero.
+ * The result of the computation will still be correct, so the division by zero
+ * may be ignored.
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
+{
+    _Float16       accum = 0.0f16;
+    uint32_t         blkCnt;
+    f16x8_t         a, b, c, accumV;
+
+    accumV = vdupq_n_f16(0.0f);
+    /* Main loop: eight half-precision values from each input per iteration */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0) {
+        a = vld1q(pA);
+        b = vld1q(pB);
+        /* c = |a - b| per lane (numerator) */
+        c = vabdq(a, b);
+        /* a = |a| + |b| per lane (denominator); the loaded values are overwritten */
+        a = vabsq(a);
+        b = vabsq(b);
+        a = vaddq(a, b);
+
+        /*
+         * May divide by zero when a and b have both the same lane at zero.
+         */
+        a = vrecip_hiprec_f16(a);
+
+        /*
+         * Force result of a division by 0 to 0 (behavior of the sklearn canberra function).
+         * NOTE(review): the mask tests the reciprocal, not the denominator - confirm.
+         */
+        a = vdupq_m_n_f16(a, 0.0f, vcmpeqq(a, 0.0f));
+        c = vmulq(c, a);
+        accumV = vaddq(accumV, c);
+
+        pA += 8;
+        pB += 8;
+        blkCnt--;
+    }
+    /* Tail: up to seven remaining elements, handled with predicate p0 */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U) {
+        mve_pred16_t    p0 = vctp16q(blkCnt);
+
+        a = vldrhq_z_f16(pA, p0);
+        b = vldrhq_z_f16(pB, p0);
+        /* c = |a - b| per lane (numerator) */
+        c = vabdq(a, b);
+        /* a = |a| + |b| per lane (denominator); the loaded values are overwritten */
+        a = vabsq(a);
+        b = vabsq(b);
+        a = vaddq(a, b);
+
+        /*
+         * May divide by zero when a and b have both the same lane at zero.
+         */
+        a = vrecip_hiprec_f16(a);
+
+        /*
+         * Force result of a division by 0 to 0 (behavior of the sklearn canberra function).
+         * NOTE(review): the mask tests the reciprocal, not the denominator - confirm.
+         */
+        a = vdupq_m_n_f16(a, 0.0f, vcmpeqq(a, 0.0f));
+        c = vmulq(c, a);
+        accumV = vaddq_m(accumV, accumV, c, p0);
+    }
+
+    accum = vecAddAcrossF16Mve(accumV);
+
+    return (accum);
+}
+
+
+#else
+float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
+{
+   _Float16 total = 0.0f;   /* sum of the per-element ratios */
+   uint32_t idx;
+
+   for (idx = 0U; idx < blockSize; idx++)
+   {
+      const _Float16 valA = pA[idx];
+      const _Float16 valB = pB[idx];
+      const _Float16 num = fabsf(valA - valB);
+      const _Float16 den = fabsf(valA) + fabsf(valB);
+      /* A pair of zeros contributes nothing; skipping it avoids adding 0 / 0 */
+      if (!((valA == 0.0f16) && (valB == 0.0f16)))
+      {
+         total += (num / den);
+      }
+   }
+   return(total);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/**
+ * @} end of Canberra group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_canberra_distance_f32.c b/CMSIS/DSP/Source/DistanceFunctions/arm_canberra_distance_f32.c
index f71b74d3b14a93e650a8b1c736a9a9c447cbd387..c786c045115e8ce02ea44e4fd7806aacbe5f1e3c 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_canberra_distance_f32.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_canberra_distance_f32.c
@@ -4,11 +4,13 @@
  * Title:        arm_canberra_distance_f32.c
  * Description:  Canberra distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,13 +27,13 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 /**
-  @addtogroup FloatDist
+  @addtogroup Canberra
   @{
  */
 
@@ -216,5 +218,5 @@ float32_t arm_canberra_distance_f32(const float32_t *pA,const float32_t *pB, uin
 
 
 /**
- * @} end of FloatDist group
+ * @} end of Canberra group
  */
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_chebyshev_distance_f16.c b/CMSIS/DSP/Source/DistanceFunctions/arm_chebyshev_distance_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..63ff3323c7c5dbf1c351c902e13176b32bd5032e
--- /dev/null
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_chebyshev_distance_f16.c
@@ -0,0 +1,146 @@
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_chebyshev_distance_f16.c
+ * Description:  Chebyshev distance between two vectors
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/distance_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup FloatDist
+ */
+
+/**
+  @defgroup Chebyshev Chebyshev distance
+
+  Chebyshev distance
+ */
+
+/**
+  @addtogroup Chebyshev
+  @{
+ */
+
+
+/**
+ * @brief        Chebyshev distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
+{
+    uint32_t        blkCnt;     /* loop counter: full 8-element groups, then the leftover count */
+    f16x8_t         vecA, vecB;
+    f16x8_t         vecDiff = vdupq_n_f16(0.0);  /* running per-lane maximum, starts at 0 */
+    float16_t       maxValue = 0.0f16;           /* seed passed to the final reduction */
+
+    /* Main loop: consume the inputs 8 half-precision elements at a time. */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U) {
+        vecA = vld1q(pA);
+        pA += 8;
+        vecB = vld1q(pB);
+        pB += 8;
+        /*
+         * update per-lane max of |A - B| (vmaxnmaq compares absolute values).
+         */
+        vecDiff = vmaxnmaq(vsubq(vecA, vecB), vecDiff);
+        /*
+         * Decrement the block loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail: the remaining (blockSize & 7) elements,
+     * handled with a lane predicate instead of a scalar loop
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U) {
+        mve_pred16_t    p0 = vctp16q(blkCnt);
+
+        vecA = vldrhq_z_f16(pA, p0);
+        vecB = vldrhq_z_f16(pB, p0);
+
+        /*
+         * Update the per-lane max under predicate p0.
+         * No index is tracked, only the maximum value.
+         */
+        vecDiff = vmaxnmaq_m(vecDiff, vsubq(vecA, vecB), p0);
+    }
+    /*
+     * Get max value across the vector
+     */
+    return vmaxnmavq(maxValue, vecDiff);
+}
+
+#else
+float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
+{
+   _Float16 diff, maxVal, tmpA, tmpB;
+   /* Empty input: return 0 (as the MVE variant does) instead of reading pA[0]/pB[0]. */
+   if (blockSize == 0U)
+   {
+      return(0.0f16);
+   }
+   /* Seed the maximum with the first pair, then scan the remaining blockSize - 1 pairs. */
+   tmpA = *pA++;
+   tmpB = *pB++;
+   maxVal = fabsf(tmpA - tmpB);
+   for(blockSize--; blockSize > 0U; blockSize--)
+   {
+      tmpA = *pA++;
+      tmpB = *pB++;
+      diff = fabsf(tmpA - tmpB);
+      if (diff > maxVal)
+      {
+        maxVal = diff;
+      }
+   }
+   return(maxVal);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/**
+ * @} end of Chebyshev group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_chebyshev_distance_f32.c b/CMSIS/DSP/Source/DistanceFunctions/arm_chebyshev_distance_f32.c
index f6a47365d448f1c02728c5bcd3266d8fab1f5578..41da72d76660eb2f4e4bbac20ee20de77855fb97 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_chebyshev_distance_f32.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_chebyshev_distance_f32.c
@@ -4,11 +4,13 @@
  * Title:        arm_chebyshev_distance_f32.c
  * Description:  Chebyshev distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,13 +27,13 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 /**
-  @addtogroup FloatDist
+  @addtogroup Chebyshev
   @{
  */
 
@@ -207,5 +209,5 @@ float32_t arm_chebyshev_distance_f32(const float32_t *pA,const float32_t *pB, ui
 
 
 /**
- * @} end of FloatDist group
+ * @} end of Chebyshev group
  */
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_cityblock_distance_f16.c b/CMSIS/DSP/Source/DistanceFunctions/arm_cityblock_distance_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..a9513ceae4d410300da059c72700d72fdebdee18
--- /dev/null
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_cityblock_distance_f16.c
@@ -0,0 +1,128 @@
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cityblock_distance_f16.c
+ * Description:  Cityblock (Manhattan) distance between two vectors
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/distance_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup FloatDist
+ */
+
+/**
+  @defgroup Manhattan Cityblock (Manhattan) distance
+
+  Cityblock (Manhattan) distance
+ */
+
+/**
+  @addtogroup Manhattan
+  @{
+ */
+
+
+/**
+ * @brief        Cityblock (Manhattan) distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+float16_t arm_cityblock_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
+{
+    uint32_t        blkCnt;                  /* loop counter */
+    f16x8_t         a, b, accumV, tempV;     /* inputs, per-lane running sum, per-lane |a - b| */
+
+    accumV = vdupq_n_f16(0.0f);
+    /* Main loop: 8 half-precision elements per iteration. */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U) {
+        a = vld1q(pA);
+        b = vld1q(pB);
+
+        tempV = vabdq(a, b);               /* per-lane absolute difference */
+        accumV = vaddq(accumV, tempV);
+
+        pA += 8;
+        pB += 8;
+        blkCnt--;
+    }
+
+    /*
+     * tail
+     * (will be merged thru tail predication)
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U) {
+        mve_pred16_t    p0 = vctp16q(blkCnt);
+
+        a = vldrhq_z_f16(pA, p0);
+        b = vldrhq_z_f16(pB, p0);
+
+        tempV = vabdq(a, b);
+        accumV = vaddq_m(accumV, accumV, tempV, p0);   /* lanes disabled by p0 keep their accumV value */
+    }
+    /* vecAddAcrossF16Mve is defined outside this file; presumably it sums the lanes of accumV. */
+    return vecAddAcrossF16Mve(accumV);
+}
+
+#else
+float16_t arm_cityblock_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
+{
+   _Float16 total = 0.0f16;
+   _Float16 valA, valB;
+   uint32_t i;
+
+   /* Accumulate |pA[i] - pB[i]| over the whole vector. */
+   for(i = 0; i < blockSize; i++)
+   {
+      valA = pA[i];
+      valB = pB[i];
+      total += (_Float16)fabsf(valA - valB);
+   }
+
+   return(total);
+}
+#endif
+
+/**
+ * @} end of Manhattan group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_cityblock_distance_f32.c b/CMSIS/DSP/Source/DistanceFunctions/arm_cityblock_distance_f32.c
index 8dce697cddb9d758c88b0de0ee4acf10d730271f..08d406a236ea574fb905ca7ff2deab971e28e1ae 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_cityblock_distance_f32.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_cityblock_distance_f32.c
@@ -4,11 +4,13 @@
  * Title:        arm_cityblock_distance_f32.c
  * Description:  Cityblock (Manhattan) distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,12 +27,12 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
 /**
-  @addtogroup FloatDist
+  @addtogroup Manhattan
   @{
  */
 
@@ -151,5 +153,5 @@ float32_t arm_cityblock_distance_f32(const float32_t *pA,const float32_t *pB, ui
 #endif
 
 /**
- * @} end of FloatDist group
+ * @} end of Manhattan group
  */
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_correlation_distance_f16.c b/CMSIS/DSP/Source/DistanceFunctions/arm_correlation_distance_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..70abbde0e137c20994c21e29ee2979e57d8bf9eb
--- /dev/null
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_correlation_distance_f16.c
@@ -0,0 +1,99 @@
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_correlation_distance_f16.c
+ * Description:  Correlation distance between two vectors
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/distance_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+
+/**
+  @ingroup FloatDist
+ */
+
+/**
+  @defgroup Correlation Correlation distance
+
+  Correlation distance
+ */
+
+/**
+  @addtogroup Correlation
+  @{
+ */
+
+
+/**
+ * @brief        Correlation distance between two vectors
+ *
+ * The input vectors are modified in place !
+ *
+ * @param[in,out] pA        First vector (its mean is subtracted in place)
+ * @param[in,out] pB        Second vector (its mean is subtracted in place)
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float16_t arm_correlation_distance_f16(float16_t *pA,float16_t *pB, uint32_t blockSize)
+{
+    float16_t ma,mb,pwra,pwrb,dot,tmp;
+
+    arm_mean_f16(pA, blockSize, &ma);
+    arm_mean_f16(pB, blockSize, &mb);
+    /* Subtract each mean in place: the same pointer is passed as input and output. */
+    arm_offset_f16(pA, -ma, pA, blockSize);
+    arm_offset_f16(pB, -mb, pB, blockSize);
+    /* Power and dot product are taken on the already-offset data. */
+    arm_power_f16(pA, blockSize, &pwra);
+    arm_power_f16(pB, blockSize, &pwrb);
+
+    arm_dot_prod_f16(pA,pB,blockSize,&dot);
+    /* Scale all three quantities by the vector length before combining them. */
+    dot = dot / blockSize;
+    pwra = pwra / blockSize;
+    pwrb = pwrb / blockSize;
+
+    arm_sqrt_f16(pwra * pwrb,&tmp);
+    /* NOTE(review): there is no guard against tmp == 0 before the division below. */
+    return(1.0f - dot / tmp);
+
+   
+}
+
+
+
+/**
+ * @} end of Correlation group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_correlation_distance_f32.c b/CMSIS/DSP/Source/DistanceFunctions/arm_correlation_distance_f32.c
index cee07fde33c813ac3493ee150bcf53f22b5bdd6b..2578016efcdc7cebba76aa0f35c92a03c900c2e8 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_correlation_distance_f32.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_correlation_distance_f32.c
@@ -4,11 +4,13 @@
  * Title:        arm_correlation_distance_f32.c
  * Description:  Correlation distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,14 +27,14 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 
 /**
-  @addtogroup FloatDist
+  @addtogroup Correlation
   @{
  */
 
@@ -78,5 +80,5 @@ float32_t arm_correlation_distance_f32(float32_t *pA,float32_t *pB, uint32_t blo
 
 
 /**
- * @} end of FloatDist group
+ * @} end of Correlation group
  */
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_cosine_distance_f16.c b/CMSIS/DSP/Source/DistanceFunctions/arm_cosine_distance_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..c85fe911dfba9f0d95b008298817c0f8a6510cdc
--- /dev/null
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_cosine_distance_f16.c
@@ -0,0 +1,86 @@
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cosine_distance_f16.c
+ * Description:  Cosine distance between two vectors
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/distance_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup FloatDist
+ */
+
+/**
+  @defgroup CosineDist Cosine distance
+
+  Cosine distance
+ */
+
+
+/**
+  @addtogroup CosineDist
+  @{
+ */
+
+
+
+/**
+ * @brief        Cosine distance between two vectors
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+float16_t arm_cosine_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
+{
+    float16_t energyA, energyB, dotAB, normProd;
+
+    /* Power (arm_power_f16) of each vector, then their dot product. */
+    arm_power_f16(pA, blockSize, &energyA);
+    arm_power_f16(pB, blockSize, &energyB);
+    arm_dot_prod_f16(pA, pB, blockSize, &dotAB);
+
+    /* distance = 1 - dot / sqrt(powerA * powerB) */
+    arm_sqrt_f16(energyA * energyB, &normProd);
+    return(1.0f - dotAB / normProd);
+}
+
+
+
+/**
+ * @} end of CosineDist group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_cosine_distance_f32.c b/CMSIS/DSP/Source/DistanceFunctions/arm_cosine_distance_f32.c
index b35b5ee10718034c718588cda198bbd5dec89ad4..041e5e45efb74e9410ec7bb4ea4b02e2ace89027 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_cosine_distance_f32.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_cosine_distance_f32.c
@@ -4,11 +4,13 @@
  * Title:        arm_cosine_distance_f32.c
  * Description:  Cosine distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,13 +27,13 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 /**
-  @addtogroup FloatDist
+  @addtogroup CosineDist
   @{
  */
 
@@ -64,5 +66,5 @@ float32_t arm_cosine_distance_f32(const float32_t *pA,const float32_t *pB, uint3
 
 
 /**
- * @} end of FloatDist group
+ * @} end of CosineDist group
  */
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_dice_distance.c b/CMSIS/DSP/Source/DistanceFunctions/arm_dice_distance.c
index 81533a64f70ac9b56f1c96b21f4a5e93c9983854..cae963a6a38ca5e26fd9c67009c4acff74f4e698 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_dice_distance.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_dice_distance.c
@@ -4,11 +4,13 @@
  * Title:        arm_dice_distance.c
  * Description:  Dice distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,7 +27,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_euclidean_distance_f16.c b/CMSIS/DSP/Source/DistanceFunctions/arm_euclidean_distance_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..64d5d4a6f08753228aa9ef348f4ed8b92ebde256
--- /dev/null
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_euclidean_distance_f16.c
@@ -0,0 +1,131 @@
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_euclidean_distance_f16.c
+ * Description:  Euclidean distance between two vectors
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/distance_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+
+/**
+  @ingroup FloatDist
+ */
+
+/**
+  @defgroup Euclidean Euclidean distance
+
+  Euclidean distance
+ */
+
+
+/**
+  @addtogroup Euclidean
+  @{
+ */
+
+
+/**
+ * @brief        Euclidean distance between two vectors
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+float16_t arm_euclidean_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
+{
+    uint32_t        blkCnt;                 /* loop counter */
+    float16_t       tmp;                    /* receives the square root returned to the caller */
+    f16x8_t         a, b, accumV, tempV;    /* inputs, per-lane sum of squares, per-lane difference */
+
+    accumV = vdupq_n_f16(0.0f);
+    /* Main loop: 8 half-precision elements per iteration. */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U) {
+        a = vld1q(pA);
+        b = vld1q(pB);
+
+        tempV = vsubq(a, b);
+        accumV = vfmaq(accumV, tempV, tempV);   /* accumV += tempV * tempV */
+
+        pA += 8;
+        pB += 8;
+        blkCnt--;
+    }
+
+    /*
+     * tail
+     * (will be merged thru tail predication)
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U) {
+        mve_pred16_t    p0 = vctp16q(blkCnt);
+
+        a = vldrhq_z_f16(pA, p0);
+        b = vldrhq_z_f16(pB, p0);
+
+        tempV = vsubq(a, b);
+        accumV = vfmaq_m(accumV, tempV, tempV, p0);   /* same update, only in lanes enabled by p0 */
+    }
+    /* vecAddAcrossF16Mve is defined outside this file; presumably it sums the lanes of accumV. */
+    arm_sqrt_f16(vecAddAcrossF16Mve(accumV), &tmp);
+    return (tmp);
+}
+
+#else
+float16_t arm_euclidean_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
+{
+   _Float16 sumSq = 0.0f, delta;
+   float16_t dist;
+   uint32_t i;
+   /* Accumulate SQ() of every element-wise difference. */
+   for(i = 0; i < blockSize; i++)
+   {
+      delta = (_Float16)pA[i] - (_Float16)pB[i];
+      sumSq += SQ(delta);
+   }
+   arm_sqrt_f16(sumSq, &dist);   /* the distance is the square root of the accumulated sum */
+   return(dist);
+}
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/**
+ * @} end of Euclidean group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_euclidean_distance_f32.c b/CMSIS/DSP/Source/DistanceFunctions/arm_euclidean_distance_f32.c
index 34df6d7d9bb07bd1b794e5df47142f4e0bf2a40b..e226617e9b1f1fd23c9fbe7a5eb6aeef4abacbc3 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_euclidean_distance_f32.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_euclidean_distance_f32.c
@@ -4,11 +4,13 @@
  * Title:        arm_euclidean_distance_f32.c
  * Description:  Euclidean distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,14 +27,14 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 
 /**
-  @addtogroup FloatDist
+  @addtogroup Euclidean
   @{
  */
 
@@ -146,5 +148,5 @@ float32_t arm_euclidean_distance_f32(const float32_t *pA,const float32_t *pB, ui
 
 
 /**
- * @} end of FloatDist group
+ * @} end of Euclidean group
  */
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_hamming_distance.c b/CMSIS/DSP/Source/DistanceFunctions/arm_hamming_distance.c
index e89542057fc500af1a8f60f03c0c987780c8a7d6..f95c46eb2fb1fac2c6f5ee909e659c89597b9123 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_hamming_distance.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_hamming_distance.c
@@ -4,11 +4,13 @@
  * Title:        arm_hamming_distance.c
  * Description:  Hamming distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,7 +27,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_jaccard_distance.c b/CMSIS/DSP/Source/DistanceFunctions/arm_jaccard_distance.c
index c708859dfeca7fe297d3d458ed707f8e683b8ae9..26617a5d3f634da92aea7a405cc3d47ed3424703 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_jaccard_distance.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_jaccard_distance.c
@@ -4,11 +4,13 @@
  * Title:        arm_jaccard_distance.c
  * Description:  Jaccard distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,7 +27,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_jensenshannon_distance_f16.c b/CMSIS/DSP/Source/DistanceFunctions/arm_jensenshannon_distance_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..b89e7bd3cc8d288975cdf577bc2272d4aac4cad6
--- /dev/null
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_jensenshannon_distance_f16.c
@@ -0,0 +1,177 @@
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_jensenshannon_distance_f16.c
+ * Description:  Jensen-Shannon distance between two vectors
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/distance_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup FloatDist
+ */
+
+/**
+  @defgroup JensenShannon Jensen-Shannon distance
+
+  Jensen-Shannon distance
+ */
+
+
+/**
+  @addtogroup JensenShannon
+  @{
+ */
+
+#if !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE)
+/// @private
+__STATIC_INLINE float16_t rel_entr(float16_t x, float16_t y)
+{
+    return (x * logf(x / y));   /* x * ln(x / y); x == 0 and y == 0 are not special-cased here */
+}
+#endif
+
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+float16_t arm_jensenshannon_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
+{
+    uint32_t        blkCnt;                    /* loop counter */
+    float16_t       tmp;                       /* receives the square root returned to the caller */
+    f16x8_t         a, b, t, tmpV, accumV;     /* inputs, their half-sum, scratch, per-lane running sum */
+
+    accumV = vdupq_n_f16(0.0f);
+    /* Main loop: 8 half-precision elements per iteration. */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U) {
+        a = vld1q(pA);
+        b = vld1q(pB);
+
+        t = vaddq(a, b);
+        t = vmulq(t, 0.5f);                     /* t = (a + b) / 2 */
+        /* vrecip_medprec_f16 / vlogq_f16 are external helpers; presumably reciprocal and natural log. */
+        tmpV = vmulq(a, vrecip_medprec_f16(t));
+        tmpV = vlogq_f16(tmpV);
+        accumV = vfmaq(accumV, a, tmpV);        /* accumV += a * tmpV */
+
+        tmpV = vmulq_f16(b, vrecip_medprec_f16(t));
+        tmpV = vlogq_f16(tmpV);
+        accumV = vfmaq(accumV, b, tmpV);        /* accumV += b * tmpV */
+
+        pA += 8;
+        pB += 8;
+        blkCnt--;
+    }
+
+    /*
+     * tail
+     * (will be merged thru tail predication)
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U) {
+        mve_pred16_t    p0 = vctp16q(blkCnt);
+
+        a = vldrhq_z_f16(pA, p0);
+        b = vldrhq_z_f16(pB, p0);
+
+        t = vaddq(a, b);
+        t = vmulq(t, 0.5f);
+        /* Same two terms as the main loop; only lanes enabled by p0 are accumulated. */
+        tmpV = vmulq_f16(a, vrecip_medprec_f16(t));
+        tmpV = vlogq_f16(tmpV);
+        accumV = vfmaq_m_f16(accumV, a, tmpV, p0);
+
+        tmpV = vmulq_f16(b, vrecip_medprec_f16(t));
+        tmpV = vlogq_f16(tmpV);
+        accumV = vfmaq_m_f16(accumV, b, tmpV, p0);
+
+    }
+    /* Halve the reduced sum, then take its square root. */
+    arm_sqrt_f16(vecAddAcrossF16Mve(accumV) / 2.0f, &tmp);
+    return (tmp);
+}
+
+#else
+
+
+/**
+ * @brief        Jensen-Shannon distance between two vectors
+ *
+ * This function is assuming that elements of second vector are > 0
+ * and 0 only when the corresponding element of first vector is 0.
+ * Otherwise the result of the computation does not make sense
+ * and for speed reasons, the cases returning NaN or Infinity are not
+ * managed.
+ *
+ * When the function is computing x log (x / y) with x == 0 and y == 0,
+ * it will compute the right result (0) but a division by zero will occur
+ * and should be ignored in client code.
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    blockSize  vector length
+ * @return distance
+ *
+ */
+
+
+float16_t arm_jensenshannon_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
+{
+    _Float16 klA = 0.0f16;   /* running sum of rel_entr(pA[k], mid) */
+    _Float16 klB = 0.0f16;   /* running sum of rel_entr(pB[k], mid) */
+    _Float16 mid, total;
+    float16_t dist;
+
+    while(blockSize > 0U)
+    {
+      /* mid-point of the two current elements */
+      mid = ((_Float16)*pA + (_Float16)*pB) / 2.0f16;
+      klA += (_Float16)rel_entr(*pA++, mid);
+      klB += (_Float16)rel_entr(*pB++, mid);
+      blockSize--;
+    }
+
+    /* distance = sqrt((klA + klB) / 2) */
+    total = klA + klB;
+    arm_sqrt_f16(total/2.0f, &dist);
+    return(dist);
+}
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of JensenShannon group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_jensenshannon_distance_f32.c b/CMSIS/DSP/Source/DistanceFunctions/arm_jensenshannon_distance_f32.c
index 6673e4728095bc74611b446adacc42c486de4c5c..5f1a44ccb7fed927e39e0d0122085245310743d5 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_jensenshannon_distance_f32.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_jensenshannon_distance_f32.c
@@ -4,11 +4,13 @@
  * Title:        arm_jensenshannon_distance_f32.c
  * Description:  Jensen-Shannon distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,17 +27,18 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 /**
-  @addtogroup FloatDist
+  @addtogroup JensenShannon
   @{
  */
 
 #if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE)
+/// @private
 __STATIC_INLINE float32_t rel_entr(float32_t x, float32_t y)
 {
     return (x * logf(x / y));
@@ -240,5 +243,5 @@ float32_t arm_jensenshannon_distance_f32(const float32_t *pA,const float32_t *pB
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
- * @} end of FloatDist group
+ * @} end of JensenShannon group
  */
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_kulsinski_distance.c b/CMSIS/DSP/Source/DistanceFunctions/arm_kulsinski_distance.c
index 9e5c8191fc7ada6d979f9c93ac7d4daee8eb7069..239dc0e5310fda6daf896343e556fa3065ba77dd 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_kulsinski_distance.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_kulsinski_distance.c
@@ -4,11 +4,13 @@
  * Title:        arm_kulsinski_distance.c
  * Description:  Kulsinski distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,7 +27,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_minkowski_distance_f16.c b/CMSIS/DSP/Source/DistanceFunctions/arm_minkowski_distance_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..9fa1390832bfc411e762e69810eb7410b843a608
--- /dev/null
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_minkowski_distance_f16.c
@@ -0,0 +1,137 @@
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_minkowski_distance_f16.c
+ * Description:  Minkowski distance between two vectors
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/distance_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup FloatDist
+ */
+
+/**
+  @defgroup Minkowski Minkowski distance
+
+  Minkowski distance
+ */
+
+/**
+  @addtogroup Minkowski
+  @{
+ */
+
+
+/**
+ * @brief        Minkowski distance between two vectors
+ *
+ * @param[in]    pA         First vector
+ * @param[in]    pB         Second vector
+ * @param[in]    order      Distance order
+ * @param[in]    blockSize  Number of samples
+ * @return distance
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+float16_t arm_minkowski_distance_f16(const float16_t *pA,const float16_t *pB, int32_t order, uint32_t blockSize)
+{
+    /* The exponent is the same for every lane: broadcast it once. */
+    const f16x8_t   orderV = vdupq_n_f16(order);
+    f16x8_t         accV = vdupq_n_f16(0.0f);
+    f16x8_t         vecA, vecB, diffV;
+    uint32_t        fullVecs, tailLen;
+
+    /* Main loop: 8 half-precision samples per iteration. */
+    for (fullVecs = blockSize >> 3; fullVecs > 0U; fullVecs--)
+    {
+        vecA = vld1q(pA);
+        vecB = vld1q(pB);
+        pA += 8;
+        pB += 8;
+        /* accV += |a - b| ^ order */
+        diffV = vabdq(vecA, vecB);
+        diffV = vpowq_f16(diffV, orderV);
+        accV = vaddq(accV, diffV);
+    }
+
+    /*
+     * Tail: the last (blockSize & 7) samples use a lane predicate with
+     * zeroing loads (_z) and a merging add (_m) into the accumulator.
+     */
+    tailLen = blockSize & 7;
+    if (tailLen > 0U)
+    {
+        const mve_pred16_t tailPred = vctp16q(tailLen);
+
+        vecA = vldrhq_z_f16(pA, tailPred);
+        vecB = vldrhq_z_f16(pB, tailPred);
+        diffV = vpowq_f16(vabdq(vecA, vecB), orderV);
+        accV = vaddq_m(accV, accV, diffV, tailPred);
+    }
+
+    /* Reduce the lanes, then raise the sum to the power 1/order. */
+    return (powf(vecAddAcrossF16Mve(accV), (1.0f / (float16_t) order)));
+}
+
+
+#else
+
+
+float16_t arm_minkowski_distance_f16(const float16_t *pA,const float16_t *pB, int32_t order, uint32_t blockSize)
+{
+    /* Accumulator for the sum of |pA[k] - pB[k]| ^ order, kept as _Float16. */
+    _Float16 acc = 0.0f;
+    uint32_t remaining = blockSize;
+
+    while (remaining > 0U)
+    {
+       acc += (_Float16)powf(fabsf(*pA++ - *pB++), order);
+       remaining--;
+    }
+
+    /* The distance is the accumulated sum raised to the power 1/order. */
+    return(powf(acc,(1.0f/order)));
+}
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/**
+ * @} end of Minkowski group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_minkowski_distance_f32.c b/CMSIS/DSP/Source/DistanceFunctions/arm_minkowski_distance_f32.c
index 3d550fb4362e50f3fb370bf85169a37b0256c314..b881fc1f2bd894c084a3ecd65104eefaf58a9795 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_minkowski_distance_f32.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_minkowski_distance_f32.c
@@ -4,11 +4,13 @@
  * Title:        arm_minkowski_distance_f32.c
  * Description:  Minkowski distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,16 +27,35 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 /**
-  @addtogroup FloatDist
+  @addtogroup Minkowski
   @{
  */
 
+/* 6.14 bug */
+#if defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100) && (__ARMCC_VERSION < 6150001)
+ 
+__attribute__((weak)) float __powisf2(float a, int b)
+{
+    /* Square-and-multiply over the bits of b, least significant first. */
+    float acc = 1;
+    int   n = b;
+    do
+    {
+        if (n & 1)
+            acc *= a;
+        n /= 2;
+        if (n != 0)
+            a *= a;
+    } while (n != 0);
+    return (b < 0) ? 1/acc : acc;   /* negative exponent: reciprocal */
+}
+#endif 
 
 /**
  * @brief        Minkowski distance between two vectors
@@ -55,10 +76,9 @@
 float32_t arm_minkowski_distance_f32(const float32_t *pA,const float32_t *pB, int32_t order, uint32_t blockSize)
 {
     uint32_t        blkCnt;
-    f32x4_t         a, b, tmpV, accumV, sumV;
+    f32x4_t         a, b, tmpV, sumV;
 
     sumV = vdupq_n_f32(0.0f);
-    accumV = vdupq_n_f32(0.0f);
 
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U) {
@@ -164,5 +184,5 @@ float32_t arm_minkowski_distance_f32(const float32_t *pA,const float32_t *pB, in
 
 
 /**
- * @} end of FloatDist group
+ * @} end of Minkowski group
  */
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_rogerstanimoto_distance.c b/CMSIS/DSP/Source/DistanceFunctions/arm_rogerstanimoto_distance.c
index 6d4a4c0f202dd8d2b1dac15fa9c424da52741e99..b918fadbaf34d5a5e66f19ed1c8a8720e5c78f68 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_rogerstanimoto_distance.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_rogerstanimoto_distance.c
@@ -4,11 +4,13 @@
  * Title:        arm_rogerstanimoto_distance.c
  * Description:  Roger Stanimoto distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,7 +27,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
@@ -47,7 +49,7 @@ extern void arm_boolean_distance_TT_FF_TF_FT(const uint32_t *pA
  */
 
 /**
- * @brief        Roger Stanimoto distance between two vectors
+ * @brief        Rogers Tanimoto distance between two vectors
  *
  * @param[in]    pA              First vector of packed booleans
  * @param[in]    pB              Second vector of packed booleans
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_russellrao_distance.c b/CMSIS/DSP/Source/DistanceFunctions/arm_russellrao_distance.c
index 98913a573cb6867f2b28be301f417514f6f3c12d..6587412899166ddec5fb02f1506b3005b7ece603 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_russellrao_distance.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_russellrao_distance.c
@@ -4,11 +4,13 @@
  * Title:        arm_russellrao_distance.c
  * Description:  Russell-Rao distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,7 +27,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_sokalmichener_distance.c b/CMSIS/DSP/Source/DistanceFunctions/arm_sokalmichener_distance.c
index 28f06962f9c9708d3667994c378e25f59b7288e9..8015c97af7a315d1c5f2add82fd334d20ac25ed1 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_sokalmichener_distance.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_sokalmichener_distance.c
@@ -4,11 +4,13 @@
  * Title:        arm_sokalmichener_distance.c
  * Description:  Sokal-Michener distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,7 +27,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_sokalsneath_distance.c b/CMSIS/DSP/Source/DistanceFunctions/arm_sokalsneath_distance.c
index d70f95824d62a9a958c6248ceea3c109dddc331e..93d8e50e00c6954ae31eae62faa57e081f79f104 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_sokalsneath_distance.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_sokalsneath_distance.c
@@ -4,11 +4,13 @@
  * Title:        arm_sokalsneath_distance.c
  * Description:  Sokal-Sneath distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,7 +27,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
diff --git a/CMSIS/DSP/Source/DistanceFunctions/arm_yule_distance.c b/CMSIS/DSP/Source/DistanceFunctions/arm_yule_distance.c
index 000ccedf6a7828ae94ef060d4d7e2a2f2ca8a5a5..54afebfaf14939ab78bc796614c4a2a8d4c6b6b1 100644
--- a/CMSIS/DSP/Source/DistanceFunctions/arm_yule_distance.c
+++ b/CMSIS/DSP/Source/DistanceFunctions/arm_yule_distance.c
@@ -4,11 +4,13 @@
  * Title:        arm_yule_distance.c
  * Description:  Yule distance between two vectors
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -25,7 +27,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/distance_functions.h"
 #include <limits.h>
 #include <math.h>
 
diff --git a/CMSIS/DSP/Source/FastMathFunctions/arm_cos_f32.c b/CMSIS/DSP/Source/FastMathFunctions/arm_cos_f32.c
index e9a7d7b1257585d014ce2fa72fab0e9bfced1ffa..de037c8749d90762cb922590687bef684c031e79 100644
--- a/CMSIS/DSP/Source/FastMathFunctions/arm_cos_f32.c
+++ b/CMSIS/DSP/Source/FastMathFunctions/arm_cos_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_cos_f32.c
  * Description:  Fast cosine calculation for floating-point values
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/fast_math_functions.h"
 #include "arm_common_tables.h"
 
 /**
@@ -44,7 +44,7 @@
   [0 +0.9999] mapping to [0 2*pi).  The fixed-point range is chosen so that a
   value of 2*pi wraps around to 0.
 
-  The implementation is based on table lookup using 256 values together with linear interpolation.
+  The implementation is based on table lookup using 512 values together with linear interpolation.
   The steps used are:
    -# Calculation of the nearest integer table index
    -# Compute the fractional portion (fract) of the table index.
diff --git a/CMSIS/DSP/Source/FastMathFunctions/arm_cos_q15.c b/CMSIS/DSP/Source/FastMathFunctions/arm_cos_q15.c
index 3bb829ce3be5af9479f0f05215e0dc69d63d171a..6b552f11e0cc139951a4b58f7d10cee760c4c5ce 100644
--- a/CMSIS/DSP/Source/FastMathFunctions/arm_cos_q15.c
+++ b/CMSIS/DSP/Source/FastMathFunctions/arm_cos_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_cos_q15.c
  * Description:  Fast cosine calculation for Q15 values
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/fast_math_functions.h"
 #include "arm_common_tables.h"
 
 /**
diff --git a/CMSIS/DSP/Source/FastMathFunctions/arm_cos_q31.c b/CMSIS/DSP/Source/FastMathFunctions/arm_cos_q31.c
index 8b7ff78d67d666b2da71b84238efd27835f7c0d1..8386c8f2a71efa36740596050783b425b1455037 100644
--- a/CMSIS/DSP/Source/FastMathFunctions/arm_cos_q31.c
+++ b/CMSIS/DSP/Source/FastMathFunctions/arm_cos_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_cos_q31.c
  * Description:  Fast cosine calculation for Q31 values
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/fast_math_functions.h"
 #include "arm_common_tables.h"
 
 /**
diff --git a/CMSIS/DSP/Source/FastMathFunctions/arm_divide_q15.c b/CMSIS/DSP/Source/FastMathFunctions/arm_divide_q15.c
new file mode 100644
index 0000000000000000000000000000000000000000..57dd806caf70303b181fab115fbed41ce7f7bfbb
--- /dev/null
+++ b/CMSIS/DSP/Source/FastMathFunctions/arm_divide_q15.c
@@ -0,0 +1,110 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_divide_q15.c
+ * Description:  Fixed point division for Q15 values
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/fast_math_functions.h"
+#include "arm_common_tables.h"
+
+#include <stdlib.h>
+
+/**
+  @ingroup groupFastMath
+ */
+
+/**
+  @defgroup divide Fixed point division
+
+ */
+
+/**
+  @addtogroup divide
+  @{
+ */
+
+/**
+  @brief         Fixed point division
+  @param[in]     numerator    Numerator
+  @param[in]     denominator  Denominator
+  @param[out]    quotient     Quotient value normalized between -1.0 and 1.0
+  @param[out]    shift        Shift left value to get the unnormalized quotient
+  @return        error status
+
+  When dividing by 0, the error ARM_MATH_NANINF is returned and the quotient is forced
+  to the saturated negative or positive value.
+ */
+
+arm_status arm_divide_q15(q15_t numerator,
+  q15_t denominator,
+  q15_t *quotient,
+  int16_t *shift)
+{
+  int16_t sign=0;
+  q31_t absNum, absDen, temp;
+  int16_t shiftForNormalizing;
+
+  *shift = 0;
+
+  /* Non-zero exactly when the operands have opposite signs */
+  sign = (numerator>>15) ^ (denominator>>15);
+
+  if (denominator == 0)
+  {
+     /* Division by zero: saturate towards the sign and report the error */
+     *quotient = sign ? (q15_t)0x8000 : (q15_t)0x7FFF;
+     return(ARM_MATH_NANINF);
+  }
+
+  /*
+   * A q15_t cannot hold |-32768| = 32768, so the magnitudes are taken
+   * in q31_t; the shifted numerator is then at most 2^30 and fits.
+   */
+  absNum = (numerator < 0) ? -(q31_t)numerator : (q31_t)numerator;
+  absDen = (denominator < 0) ? -(q31_t)denominator : (q31_t)denominator;
+
+  temp = (absNum << 15) / absDen;
+
+  /* Keep at most 15 significant bits; the bits dropped go to *shift */
+  shiftForNormalizing= 17 - __CLZ(temp);
+  if (shiftForNormalizing > 0)
+  {
+     *shift = shiftForNormalizing;
+     temp = temp >> shiftForNormalizing;
+  }
+
+  if (sign)
+  {
+    temp = -temp;
+  }
+
+  *quotient=temp;
+
+  return(ARM_MATH_SUCCESS);
+}
+
+/**
+  @} end of divide group
+ */
diff --git a/CMSIS/DSP/Source/FastMathFunctions/arm_sin_f32.c b/CMSIS/DSP/Source/FastMathFunctions/arm_sin_f32.c
index 97c69029c22f4029fd1a9242d158bb14010161f5..2fcd6bb1b748ed79e3f44741bcd75bca0a1674de 100644
--- a/CMSIS/DSP/Source/FastMathFunctions/arm_sin_f32.c
+++ b/CMSIS/DSP/Source/FastMathFunctions/arm_sin_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_sin_f32.c
  * Description:  Fast sine calculation for floating-point values
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/fast_math_functions.h"
 #include "arm_common_tables.h"
 
 /**
@@ -44,7 +44,7 @@
   [0 +0.9999] mapping to [0 2*pi).  The fixed-point range is chosen so that a
   value of 2*pi wraps around to 0.
 
-  The implementation is based on table lookup using 256 values together with linear interpolation.
+  The implementation is based on table lookup using 512 values together with linear interpolation.
   The steps used are:
    -# Calculation of the nearest integer table index
    -# Compute the fractional portion (fract) of the table index.
diff --git a/CMSIS/DSP/Source/FastMathFunctions/arm_sin_q15.c b/CMSIS/DSP/Source/FastMathFunctions/arm_sin_q15.c
index 6924b5d3ab0078c2c489ed7eac0d6f5abafb8e75..81f0d627fc7ab802cb147b84e664c083b342c904 100644
--- a/CMSIS/DSP/Source/FastMathFunctions/arm_sin_q15.c
+++ b/CMSIS/DSP/Source/FastMathFunctions/arm_sin_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_sin_q15.c
  * Description:  Fast sine calculation for Q15 values
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/fast_math_functions.h"
 #include "arm_common_tables.h"
 
 /**
diff --git a/CMSIS/DSP/Source/FastMathFunctions/arm_sin_q31.c b/CMSIS/DSP/Source/FastMathFunctions/arm_sin_q31.c
index b1f154474250dd854c31778b2f12f087fecd3f1f..06262fd5df740052bc2499341d91b968ec83be59 100644
--- a/CMSIS/DSP/Source/FastMathFunctions/arm_sin_q31.c
+++ b/CMSIS/DSP/Source/FastMathFunctions/arm_sin_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_sin_q31.c
  * Description:  Fast sine calculation for Q31 values
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/fast_math_functions.h"
 #include "arm_common_tables.h"
 
 /**
diff --git a/CMSIS/DSP/Source/FastMathFunctions/arm_sqrt_q15.c b/CMSIS/DSP/Source/FastMathFunctions/arm_sqrt_q15.c
index fab0a32b17ccf0eeb2faedbfdbc4e15fd8fcb4e1..652fe2df82a4d825aace776745bdff19c7f2eda5 100644
--- a/CMSIS/DSP/Source/FastMathFunctions/arm_sqrt_q15.c
+++ b/CMSIS/DSP/Source/FastMathFunctions/arm_sqrt_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_sqrt_q15.c
  * Description:  Q15 square root function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/fast_math_functions.h"
 #include "arm_common_tables.h"
 
 /**
diff --git a/CMSIS/DSP/Source/FastMathFunctions/arm_sqrt_q31.c b/CMSIS/DSP/Source/FastMathFunctions/arm_sqrt_q31.c
index 9889b1312310792b784bc07f14e2d9803abd026f..9ce561a9b82b2ddcb1ae9c7c65a048a3d4ed806b 100644
--- a/CMSIS/DSP/Source/FastMathFunctions/arm_sqrt_q31.c
+++ b/CMSIS/DSP/Source/FastMathFunctions/arm_sqrt_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_sqrt_q31.c
  * Description:  Q31 square root function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/fast_math_functions.h"
 #include "arm_common_tables.h"
 
 /**
diff --git a/CMSIS/DSP/Source/FastMathFunctions/arm_vexp_f16.c b/CMSIS/DSP/Source/FastMathFunctions/arm_vexp_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..82a6357cd5a1529f8bd3bfd65cd2120c279dad86
--- /dev/null
+++ b/CMSIS/DSP/Source/FastMathFunctions/arm_vexp_f16.c
@@ -0,0 +1,82 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_vexp_f16.c
+ * Description:  Fast vectorized exp
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/fast_math_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include "arm_common_tables.h"
+
+#include "arm_vec_math_f16.h"
+
+
+void arm_vexp_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+   /* Number of samples left for the scalar loop at the end */
+   uint32_t remaining;
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+   uint32_t vecCount;
+   f16x8_t in;
+   f16x8_t out;
+
+   /* Vector path: C = exp(A), 8 half-precision samples at a time */
+   for (vecCount = blockSize >> 3; vecCount > 0U; vecCount--)
+   {
+      in = vld1q(pSrc);
+      out = vexpq_f16(in);
+      vst1q(pDst, out);
+
+      /* Advance to the next group of 8 samples */
+      pSrc += 8;
+      pDst += 8;
+   }
+
+   /* The last (blockSize & 7) samples go through the scalar loop */
+   remaining = blockSize & 7;
+#else
+   /* No vector path: every sample goes through the scalar loop */
+   remaining = blockSize;
+#endif
+
+   for (; remaining > 0U; remaining--)
+   {
+      /*
+       * C = exp(A): evaluate with expf() and store the result,
+       * converted back to float16_t, in the destination buffer.
+       */
+      *pDst++ = expf(*pSrc++);
+   }
+}
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/FastMathFunctions/arm_vexp_f32.c b/CMSIS/DSP/Source/FastMathFunctions/arm_vexp_f32.c
index 197890e3a21bf78c73e865b73681206b4516b633..43b2a86d340b4c13ad26f8ca5bb6232538b80690 100644
--- a/CMSIS/DSP/Source/FastMathFunctions/arm_vexp_f32.c
+++ b/CMSIS/DSP/Source/FastMathFunctions/arm_vexp_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_vlog_f32.c
  * Description:  Fast vectorized log
  *
- * $Date:        15. Octoboer 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/fast_math_functions.h"
 #include "arm_common_tables.h"
 
 #if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM) || defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
diff --git a/CMSIS/DSP/Source/FastMathFunctions/arm_vinverse_f16.c b/CMSIS/DSP/Source/FastMathFunctions/arm_vinverse_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..bf7840036f07da9ae39dc202a529711249c24dcf
--- /dev/null
+++ b/CMSIS/DSP/Source/FastMathFunctions/arm_vinverse_f16.c
@@ -0,0 +1,79 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_vinverse_f16.c
+ * Description:  Fast vectorized inverse
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/fast_math_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include "arm_common_tables.h"
+
+#include "arm_vec_math_f16.h"
+
+void arm_vinverse_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+   /* Loop counter, first for whole vectors then for single samples */
+   uint32_t blkCnt; 
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+   f16x8_t src;
+   f16x8_t dst;
+
+   /* Vector path: 8 reciprocals per iteration */
+   blkCnt = blockSize >> 3;
+   while (blkCnt > 0U)
+   {
+      src = vld1q(pSrc);
+      dst = vrecip_hiprec_f16(src);
+      vst1q(pDst, dst);
+      pSrc += 8;
+      pDst += 8;
+      /* Decrement loop counter */
+      blkCnt--;
+   }
+
+   /* The last (blockSize & 7) samples are handled one at a time below */
+   blkCnt = blockSize & 7;
+#else
+   blkCnt = blockSize;
+#endif
+
+   while (blkCnt > 0U)
+   {
+      /* C = 1 / A, with a float constant so that no double-precision
+         division is generated for a half-precision input. */
+      *pDst++ = 1.0f / *pSrc++;
+      /* Decrement loop counter */
+      blkCnt--;
+   }
+}
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/FastMathFunctions/arm_vlog_f16.c b/CMSIS/DSP/Source/FastMathFunctions/arm_vlog_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..18800727476278a67996c8c45ef34a7bdcdb91c2
--- /dev/null
+++ b/CMSIS/DSP/Source/FastMathFunctions/arm_vlog_f16.c
@@ -0,0 +1,80 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_vlog_f16.c
+ * Description:  Fast vectorized log
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/fast_math_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include "arm_common_tables.h"
+#include "arm_vec_math_f16.h"
+
+/* Natural logarithm of each sample: pDst[i] = log(pSrc[i]) for blockSize half-precision values. */
+void arm_vlog_f16(const float16_t * pSrc,
+                  float16_t * pDst,
+                  uint32_t blockSize)
+{
+   uint32_t remaining;   /* Samples still to be produced by the current loop */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+   f16x8_t vecIn;
+   f16x8_t vecOut;
+
+   /* Vector stage: one pass per complete group of 8 samples */
+   for (remaining = blockSize >> 3; remaining > 0U; remaining--)
+   {
+      vecIn = vld1q(pSrc);
+      vecOut = vlogq_f16(vecIn);
+      vst1q(pDst, vecOut);
+
+      /* Step both pointers past the 8 lanes just processed */
+      pSrc += 8;
+      pDst += 8;
+   }
+
+   /* Whatever does not fill a whole vector is left to the scalar stage */
+   remaining = blockSize & 7;
+
+#else
+
+   /* No vector stage: the scalar loop covers every sample */
+   remaining = blockSize;
+
+#endif
+
+   for (; remaining > 0U; remaining--)
+   {
+      /* C = log(A), evaluated with logf and stored to the destination */
+      *pDst = logf(*pSrc);
+      pSrc++;
+      pDst++;
+   }
+}
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/FastMathFunctions/arm_vlog_f32.c b/CMSIS/DSP/Source/FastMathFunctions/arm_vlog_f32.c
index 1ad6094b0dfeb3e3468f1fdf118145fe5c1df658..af8a692b330b375a44d52615905e65de7a9a1fbb 100644
--- a/CMSIS/DSP/Source/FastMathFunctions/arm_vlog_f32.c
+++ b/CMSIS/DSP/Source/FastMathFunctions/arm_vlog_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_vlog_f32.c
  * Description:  Fast vectorized log
  *
- * $Date:        15. Octoboer 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/fast_math_functions.h"
 #include "arm_common_tables.h"
 
 #if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM) || defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c
index ac2313fbb864d847e171b893bd343759e2803296..565b494b78345ef131b1a750be17f0226b30e9d2 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df1_32x64_init_q31.c
  * Description:  High precision Q31 Biquad cascade filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c
index 2b70c2ac91212a616a4f00c1691b491663a57017..2c8c911b3b9637a52c4a561c058ec3d92133b5dd 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df1_32x64_q31.c
  * Description:  High precision Q31 Biquad cascade filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -172,9 +172,115 @@
                    - \ref arm_biquad_cascade_df1_fast_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
+/* Scalar 32x64 Q31 biquad cascade: runs every stage of S over blockSize samples, pSrc -> pDst (later stages in place in pDst). */
+static void arm_biquad_cas_df1_32x64_q31_scalar(const arm_biquad_cas_df1_32x64_ins_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  const q31_t *pIn = pSrc;                             /* input pointer initialization */
+        q31_t *pOut = pDst;                            /* output pointer initialization */
+        q63_t *pState = S->pState;                     /* state pointer initialization */
+  const q31_t *pCoeffs = S->pCoeffs;                   /* coeff pointer initialization */
+        q63_t acc;                                     /* accumulator */
+        q31_t Xn1, Xn2;                                /* Input Filter state variables */
+        q63_t Yn1, Yn2;                                /* Output Filter state variables */
+        q31_t b0, b1, b2, a1, a2;                      /* Filter coefficients */
+        q31_t Xn;                                      /* temporary input */
+        int32_t shift = (int32_t) S->postShift + 1;    /* Left shift applied to acc to form the 64-bit state Yn1 */
+        uint32_t sample, stage = S->numStages;         /* loop counters */
+        q31_t acc_l, acc_h;                            /* lower / upper 32-bit words of acc */
+        uint32_t uShift = ((uint32_t) S->postShift + 1U);  /* Left shift for the upper word (same value as shift) */
+        uint32_t lShift = 32U - uShift;                /* Right shift for the lower word */
+
+  do
+  {
+    /* Reading the coefficients */
+    b0 = *pCoeffs++;
+    b1 = *pCoeffs++;
+    b2 = *pCoeffs++;
+    a1 = *pCoeffs++;
+    a2 = *pCoeffs++;
+
+    /* Reading the state values */
+    Xn1 = (q31_t) (pState[0]);
+    Xn2 = (q31_t) (pState[1]);
+    Yn1 = pState[2];
+    Yn2 = pState[3];
+
+    /* Initialize the sample counter with the number of samples */
+    sample = blockSize;
+
+    while (sample > 0U)
+    {
+      /* Read the input */
+      Xn = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+
+      /* acc =  b0 * x[n] */
+      acc = (q63_t) Xn * b0;
+      /* acc +=  b1 * x[n-1] */
+      acc += (q63_t) Xn1 * b1;
+      /* acc +=  b[2] * x[n-2] */
+      acc += (q63_t) Xn2 * b2;
+      /* acc +=  a1 * y[n-1] */
+      acc += mult32x64(Yn1, a1);
+      /* acc +=  a2 * y[n-2] */
+      acc += mult32x64(Yn2, a2);
+
+      /* Every time after the output is computed state should be updated. */
+      /* The states should be updated as: */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
+      Xn2 = Xn1;
+      Xn1 = Xn;
+      Yn2 = Yn1;
+
+      /* The result is converted to 1.63, Yn1 variable is reused  */
+      Yn1 = acc << shift;
+
+      /* Calc lower part of acc */
+      acc_l = acc & 0xffffffff;
+
+      /* Calc upper part of acc */
+      acc_h = (acc >> 32) & 0xffffffff;
+
+      /* Merge: top uShift bits of the lower word are moved below the left-shifted upper word */
+      acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+      /* Store the output in the destination buffer in 1.31 format. */
+      *pOut++ = acc_h;
+      /* Yn1 already holds acc << shift (assigned above) and feeds the next sample. */
+
+      /* The word merge above stands in for the single-expression form */
+      /* (q31_t) (acc >> (32 - shift)) that this code previously carried as a comment. */
+
+      /* decrement loop counter */
+      sample--;
+    }
+
+    /* The first stage output is given as input to the second stage. */
+    pIn = pDst;
+
+    /* Reset to destination buffer working pointer */
+    pOut = pDst;
+
+    /*  Store the updated state variables back into the pState array */
+    *pState++ = (q63_t) Xn1;
+    *pState++ = (q63_t) Xn2;
+    *pState++ = Yn1;
+    *pState++ = Yn2;
+
+  } while (--stage);  /* do/while: one stage always runs; numStages == 0 would wrap the unsigned counter */
+
+}
+
 void arm_biquad_cas_df1_32x64_q31(
   const arm_biquad_cas_df1_32x64_ins_q31 * S,
   const q31_t * pSrc,
@@ -193,218 +299,225 @@ void arm_biquad_cas_df1_32x64_q31(
     q31x4_t vecCoef, vecIn;
     q63_t     acc;
 
-    do
+    if (blockSize <= 3)
+    {
+      arm_biquad_cas_df1_32x64_q31_scalar(S,pSrc,pDst,blockSize);
+    }
+    else
     {
-        uint32_t  i;
-        /*
-         * Reading the coefficients
-         */
-        b0 = *pCoeffs++;
-        b1 = *pCoeffs++;
-        b2 = *pCoeffs++;
-        a1 = *pCoeffs++;
-        a2 = *pCoeffs++;
-
-        vecCoef[0] = 0;
-        vecCoef[1] = b2;
-        vecCoef[2] = b1;
-        vecCoef[3] = b0;
-        /*
-         * Reading the state values
-         */
-        Xn1 = pState[0];
-        Xn2 = pState[1];
-        Yn1 = pState[2];
-        Yn2 = pState[3];
-
-        /*
-         * append history with initial samples
-         */
-        q31_t     hist[6];
-        hist[0] = 0;
-        hist[1] = Xn2;
-        hist[2] = Xn1;
-        hist[3] = pIn[0];
-        hist[4] = pIn[1];
-        hist[5] = pIn[2];
-
-        const q31_t  *pIn1 = hist;
-        q31x4_t vecIn0 = *(q31x4_t *) & pIn[0];
-        q31x4_t vecIn1 = *(q31x4_t *) & pIn[1];
-        q31x4_t vecIn2 = *(q31x4_t *) & pIn[2];
-
-        i = 3;
-        do
-        {
-            acc = mult32x64(Yn1, a1);
-            acc += mult32x64(Yn2, a2);
-            Yn2 = Yn1;
-            Yn1 = acc;
-            vecIn = vld1q(pIn1);
-            pIn1 += 1;
-            Yn1 = vmlaldavaq(Yn1, vecIn, vecCoef);
-            Yn1 = asrl(Yn1, -shift);
-            /*
-             * Store the output in the destination buffer in 1.31 format.
-             */
-            *pOut++ = (q31_t) (Yn1 >> 32);
-        }
-        while (--i);
-
-        sample = blockSize - 3;
-        pIn1 = pIn + 3;
-
-        i = sample / 4;
-        while (i > 0U)
-        {
-
-            acc = mult32x64(Yn1, a1);
-            acc += mult32x64(Yn2, a2);
-            Yn2 = Yn1;
-            Yn1 = acc;
-            Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
-            vecIn = vld1q(pIn1);
-            pIn1 += 1;
-            Yn1 = asrl(Yn1, -shift);
-            /*
-             * Store the output in the destination buffer in 1.31 format.
-             */
-            *pOut++ = (q31_t) (Yn1 >> 32);
-
-            acc = mult32x64(Yn1, a1);
-            acc += mult32x64(Yn2, a2);
-            Yn2 = Yn1;
-            Yn1 = acc;
-            Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
-            vecIn0 = vld1q(pIn1);
-            pIn1 += 1;
-            Yn1 = asrl(Yn1, -shift);
-            *pOut++ = (q31_t) (Yn1 >> 32);
-
-            acc = mult32x64(Yn1, a1);
-            acc += mult32x64(Yn2, a2);
-            Yn2 = Yn1;
-            Yn1 = acc;
-            Yn1 = vmlaldavaq(Yn1, vecIn2, vecCoef);
-            vecIn1 = vld1q(pIn1);
-            pIn1 += 1;
-            Yn1 = asrl(Yn1, -shift);
-            *pOut++ = (q31_t) (Yn1 >> 32);
-
-            acc = mult32x64(Yn1, a1);
-            acc += mult32x64(Yn2, a2);
-            Yn2 = Yn1;
-            Yn1 = acc;
-            Yn1 = vmlaldavaq(Yn1, vecIn, vecCoef);
-            vecIn2 = vld1q(pIn1);
-            pIn1 += 1;
-            Yn1 = asrl(Yn1, -shift);
-            *pOut++ = (q31_t) (Yn1 >> 32);
-            /*
-             * Decrement the loop counter
-             */
-            i--;
-        }
-        /*
-         * save input state
-         */
-        Xn2 = vecIn[2];
-        Xn1 = vecIn[3];
-
-        int       loopRemainder = blockSize - 3 - 4 * ((blockSize - 3) / 4);
-        if (loopRemainder == 1)
-        {
-            /*
-             * remainder
-             */
-            acc = mult32x64(Yn1, a1);
-            acc += mult32x64(Yn2, a2);
-            Yn2 = Yn1;
-            Yn1 = acc;
-            Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
-            Yn1 = asrl(Yn1, -shift);
-            *pOut++ = (q31_t) (Yn1 >> 32);
-            /*
-             * save input state
-             */
-            Xn2 = vecIn0[2];
-            Xn1 = vecIn0[3];
-
-        }
-        else if (loopRemainder == 2)
-        {
-            acc = mult32x64(Yn1, a1);
-            acc += mult32x64(Yn2, a2);
-            Yn2 = Yn1;
-            Yn1 = acc;
-            Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
-            Yn1 = asrl(Yn1, -shift);
-            *pOut++ = (q31_t) (Yn1 >> 32);
-
-            acc = mult32x64(Yn1, a1);
-            acc += mult32x64(Yn2, a2);
-            Yn2 = Yn1;
-            Yn1 = acc;
-            Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
-            Yn1 = asrl(Yn1, -shift);
-            *pOut++ = (q31_t) (Yn1 >> 32);
-            /*
-             * save input state
-             */
-            Xn2 = vecIn1[2];
-            Xn1 = vecIn1[3];
-
-        }
-        else if (loopRemainder == 3)
-        {
-            acc = mult32x64(Yn1, a1);
-            acc += mult32x64(Yn2, a2);
-            Yn2 = Yn1;
-            Yn1 = acc;
-            Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
-            Yn1 = asrl(Yn1, -shift);
-            *pOut++ = (q31_t) (Yn1 >> 32);
-
-            acc = mult32x64(Yn1, a1);
-            acc += mult32x64(Yn2, a2);
-            Yn2 = Yn1;
-            Yn1 = acc;
-            Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
-            Yn1 = asrl(Yn1, -shift);
-            *pOut++ = (q31_t) (Yn1 >> 32);
-
-            acc = mult32x64(Yn1, a1);
-            acc += mult32x64(Yn2, a2);
-            Yn2 = Yn1;
-            Yn1 = acc;
-            Yn1 = vmlaldavaq(Yn1, vecIn2, vecCoef);
-            Yn1 = asrl(Yn1, -shift);
-            *pOut++ = (q31_t) (Yn1 >> 32);
-            /*
-             * save input state
-             */
-            Xn2 = vecIn2[2];
-            Xn1 = vecIn2[3];
-
-        }
-
-        /*
-         * The first stage output is given as input to the second stage.
-         */
-        pIn = pDst;
-        /*
-         * Reset to destination buffer working pointer
-         */
-        pOut = pDst;
-        /*
-         * Store the updated state variables back into the pState array
-         */
-        *pState++ = (q63_t) Xn1;
-        *pState++ = (q63_t) Xn2;
-        *pState++ = Yn1;
-        *pState++ = Yn2;
+      do
+      {
+          uint32_t  i;
+          /*
+           * Reading the coefficients
+           */
+          b0 = *pCoeffs++;
+          b1 = *pCoeffs++;
+          b2 = *pCoeffs++;
+          a1 = *pCoeffs++;
+          a2 = *pCoeffs++;
+  
+          vecCoef[0] = 0;
+          vecCoef[1] = b2;
+          vecCoef[2] = b1;
+          vecCoef[3] = b0;
+          /*
+           * Reading the state values
+           */
+          Xn1 = pState[0];
+          Xn2 = pState[1];
+          Yn1 = pState[2];
+          Yn2 = pState[3];
+  
+          /*
+           * append history with initial samples
+           */
+          q31_t     hist[6];
+          hist[0] = 0;
+          hist[1] = Xn2;
+          hist[2] = Xn1;
+          hist[3] = pIn[0];
+          hist[4] = pIn[1];
+          hist[5] = pIn[2];
+  
+          const q31_t  *pIn1 = hist;
+          q31x4_t vecIn0 = *(q31x4_t *) & pIn[0];
+          q31x4_t vecIn1 = *(q31x4_t *) & pIn[1];
+          q31x4_t vecIn2 = *(q31x4_t *) & pIn[2];
+  
+          i = 3;
+          do
+          {
+              acc = mult32x64(Yn1, a1);
+              acc += mult32x64(Yn2, a2);
+              Yn2 = Yn1;
+              Yn1 = acc;
+              vecIn = vld1q(pIn1);
+              pIn1 += 1;
+              Yn1 = vmlaldavaq(Yn1, vecIn, vecCoef);
+              Yn1 = asrl(Yn1, -shift);
+              /*
+               * Store the output in the destination buffer in 1.31 format.
+               */
+              *pOut++ = (q31_t) (Yn1 >> 32);
+          }
+          while (--i);
+  
+          sample = blockSize - 3;
+          pIn1 = pIn + 3;
+  
+          i = sample / 4;
+          while (i > 0U)
+          {
+  
+              acc = mult32x64(Yn1, a1);
+              acc += mult32x64(Yn2, a2);
+              Yn2 = Yn1;
+              Yn1 = acc;
+              Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
+              vecIn = vld1q(pIn1);
+              pIn1 += 1;
+              Yn1 = asrl(Yn1, -shift);
+              /*
+               * Store the output in the destination buffer in 1.31 format.
+               */
+              *pOut++ = (q31_t) (Yn1 >> 32);
+  
+              acc = mult32x64(Yn1, a1);
+              acc += mult32x64(Yn2, a2);
+              Yn2 = Yn1;
+              Yn1 = acc;
+              Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
+              vecIn0 = vld1q(pIn1);
+              pIn1 += 1;
+              Yn1 = asrl(Yn1, -shift);
+              *pOut++ = (q31_t) (Yn1 >> 32);
+  
+              acc = mult32x64(Yn1, a1);
+              acc += mult32x64(Yn2, a2);
+              Yn2 = Yn1;
+              Yn1 = acc;
+              Yn1 = vmlaldavaq(Yn1, vecIn2, vecCoef);
+              vecIn1 = vld1q(pIn1);
+              pIn1 += 1;
+              Yn1 = asrl(Yn1, -shift);
+              *pOut++ = (q31_t) (Yn1 >> 32);
+  
+              acc = mult32x64(Yn1, a1);
+              acc += mult32x64(Yn2, a2);
+              Yn2 = Yn1;
+              Yn1 = acc;
+              Yn1 = vmlaldavaq(Yn1, vecIn, vecCoef);
+              vecIn2 = vld1q(pIn1);
+              pIn1 += 1;
+              Yn1 = asrl(Yn1, -shift);
+              *pOut++ = (q31_t) (Yn1 >> 32);
+              /*
+               * Decrement the loop counter
+               */
+              i--;
+          }
+          /*
+           * save input state
+           */
+          Xn2 = vecIn[2];
+          Xn1 = vecIn[3];
+  
+          int       loopRemainder = blockSize - 3 - 4 * ((blockSize - 3) / 4);
+          if (loopRemainder == 1)
+          {
+              /*
+               * remainder
+               */
+              acc = mult32x64(Yn1, a1);
+              acc += mult32x64(Yn2, a2);
+              Yn2 = Yn1;
+              Yn1 = acc;
+              Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
+              Yn1 = asrl(Yn1, -shift);
+              *pOut++ = (q31_t) (Yn1 >> 32);
+              /*
+               * save input state
+               */
+              Xn2 = vecIn0[2];
+              Xn1 = vecIn0[3];
+  
+          }
+          else if (loopRemainder == 2)
+          {
+              acc = mult32x64(Yn1, a1);
+              acc += mult32x64(Yn2, a2);
+              Yn2 = Yn1;
+              Yn1 = acc;
+              Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
+              Yn1 = asrl(Yn1, -shift);
+              *pOut++ = (q31_t) (Yn1 >> 32);
+  
+              acc = mult32x64(Yn1, a1);
+              acc += mult32x64(Yn2, a2);
+              Yn2 = Yn1;
+              Yn1 = acc;
+              Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
+              Yn1 = asrl(Yn1, -shift);
+              *pOut++ = (q31_t) (Yn1 >> 32);
+              /*
+               * save input state
+               */
+              Xn2 = vecIn1[2];
+              Xn1 = vecIn1[3];
+  
+          }
+          else if (loopRemainder == 3)
+          {
+              acc = mult32x64(Yn1, a1);
+              acc += mult32x64(Yn2, a2);
+              Yn2 = Yn1;
+              Yn1 = acc;
+              Yn1 = vmlaldavaq(Yn1, vecIn0, vecCoef);
+              Yn1 = asrl(Yn1, -shift);
+              *pOut++ = (q31_t) (Yn1 >> 32);
+  
+              acc = mult32x64(Yn1, a1);
+              acc += mult32x64(Yn2, a2);
+              Yn2 = Yn1;
+              Yn1 = acc;
+              Yn1 = vmlaldavaq(Yn1, vecIn1, vecCoef);
+              Yn1 = asrl(Yn1, -shift);
+              *pOut++ = (q31_t) (Yn1 >> 32);
+  
+              acc = mult32x64(Yn1, a1);
+              acc += mult32x64(Yn2, a2);
+              Yn2 = Yn1;
+              Yn1 = acc;
+              Yn1 = vmlaldavaq(Yn1, vecIn2, vecCoef);
+              Yn1 = asrl(Yn1, -shift);
+              *pOut++ = (q31_t) (Yn1 >> 32);
+              /*
+               * save input state
+               */
+              Xn2 = vecIn2[2];
+              Xn1 = vecIn2[3];
+  
+          }
+  
+          /*
+           * The first stage output is given as input to the second stage.
+           */
+          pIn = pDst;
+          /*
+           * Reset to destination buffer working pointer
+           */
+          pOut = pDst;
+          /*
+           * Store the updated state variables back into the pState array
+           */
+          *pState++ = (q63_t) Xn1;
+          *pState++ = (q63_t) Xn2;
+          *pState++ = Yn1;
+          *pState++ = Yn2;
+      }
+      while (--stage);
     }
-    while (--stage);
 }
 #else
 void arm_biquad_cas_df1_32x64_q31(
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f16.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..72db81c443e34ba0fb3ecb653ca5f5b5201c179c
--- /dev/null
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f16.c
@@ -0,0 +1,491 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_biquad_cascade_df1_f16.c
+ * Description:  Processing function for the floating-point Biquad cascade DirectFormI(DF1) filter
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+/**
+  @ingroup groupFilters
+ */
+
+
+/**
+  @addtogroup BiquadCascadeDF1
+  @{
+ */
+
+/**
+  @brief         Processing function for the half-precision floating-point Biquad cascade filter.
+  @param[in]     S         points to an instance of the half-precision floating-point Biquad cascade structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize  number of samples to process
+  @return        none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_biquad_cascade_df1_f16(
+  const arm_biquad_casd_df1_inst_f16 * S,
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    float16_t *pIn = (float16_t *)pSrc;      /*  source pointer            */
+    float16_t *pOut = pDst;     /*  destination pointer       */
+    float16_t *pState = S->pState;  /*  pState pointer            */
+    const float16_t *pCoeffs = S->pCoeffs;    /*  coefficient pointer       */
+    float16_t Xn1, Xn2, Yn1, Yn2;   /*  Filter pState variables   */
+    float16_t X0, X1, X2, X3;   /*  temporary input           */
+    float16_t X4, X5, X6, X7;   /*  temporary input           */
+    _Float16 lastX, lastY;             /*  X,Y history for tail handling */
+    f16x8_t coeffs;
+    f16x8_t accVec;           /* accumulator vector */
+    uint32_t  sample, stage = S->numStages; /*  loop counters             */
+
+    do
+    {
+        /*
+         * Reading the pState values
+         */
+        Xn1 = pState[0];
+        Xn2 = pState[1];
+        Yn1 = pState[2];
+        Yn2 = pState[3];
+
+        sample = blockSize >> 3U;
+
+        /*
+         * First part of the processing with loop unrolling.  Compute 8 outputs at a time.
+         */
+        while (sample > 0U)
+        {
+            X0 = *pIn++;
+            X1 = *pIn++;
+            X2 = *pIn++;
+            X3 = *pIn++;
+            X4 = *pIn++;
+            X5 = *pIn++;
+            X6 = *pIn++;
+            X7 = *pIn++;
+            /* 12 coefficient vectors per stage: one per input X7..X0, then Xn1, Xn2, Yn1, Yn2 */
+            coeffs = vld1q(pCoeffs);
+            accVec = vmulq(coeffs, X7);
+
+            coeffs = vld1q(&pCoeffs[8]);
+            accVec = vfmaq(accVec, coeffs, X6);
+
+            coeffs = vld1q(&pCoeffs[16]);
+            accVec = vfmaq(accVec, coeffs, X5);
+
+            coeffs = vld1q(&pCoeffs[24]);
+            accVec = vfmaq(accVec, coeffs, X4);
+
+            coeffs = vld1q(&pCoeffs[32]);
+            accVec = vfmaq(accVec, coeffs, X3);
+
+            coeffs = vld1q(&pCoeffs[40]);
+            accVec = vfmaq(accVec, coeffs, X2);
+
+            coeffs = vld1q(&pCoeffs[48]);
+            accVec = vfmaq(accVec, coeffs, X1);
+
+            coeffs = vld1q(&pCoeffs[56]);
+            accVec = vfmaq(accVec, coeffs, X0);
+
+            coeffs = vld1q(&pCoeffs[64]);
+            accVec = vfmaq(accVec, coeffs, Xn1);
+
+            coeffs = vld1q(&pCoeffs[72]);
+            accVec = vfmaq(accVec, coeffs, Xn2);
+
+            coeffs = vld1q(&pCoeffs[80]);
+            accVec = vfmaq(accVec, coeffs, Yn1);
+
+            coeffs = vld1q(&pCoeffs[88]);
+            accVec = vfmaq(accVec, coeffs, Yn2);
+            /*
+             * Store the 8 accumulated outputs in the destination buffer.
+             */
+            vst1q(pOut, accVec);
+            pOut += 8;
+
+            /*
+             * update recurrence: the two newest inputs and outputs become the history
+             */
+            Xn1 = X7;
+            Xn2 = X6;
+            Yn1 = vgetq_lane(accVec, 7);
+            Yn2 = vgetq_lane(accVec, 6);
+            /*
+             * decrement the loop counter
+             */
+            sample--;
+        }
+
+        /*
+         * If the blockSize is not a multiple of 8,
+         * compute any remaining output samples here.
+         */
+        sample = blockSize & 0x7U;
+        if (sample)
+        {
+            /* Most recent X, Y: they become Xn2, Yn2 in the single-sample case */
+            lastX = Xn1;   /* Xn1 equals X7 after the loop above and stays defined when that loop never ran */
+            lastY = Yn1;
+            /* NOTE(review): 8 inputs are always read here, even past the `sample` valid ones - confirm buffers tolerate it */
+            X0 = *pIn++;
+            X1 = *pIn++;
+            X2 = *pIn++;
+            X3 = *pIn++;
+            X4 = *pIn++;
+            X5 = *pIn++;
+            X6 = *pIn++;
+            X7 = *pIn++;
+
+            coeffs = vld1q(pCoeffs);
+            accVec = vmulq(coeffs, X7);
+
+            coeffs = vld1q(&pCoeffs[8]);
+            accVec = vfmaq(accVec, coeffs, X6);
+
+            coeffs = vld1q(&pCoeffs[16]);
+            accVec = vfmaq(accVec, coeffs, X5);
+
+            coeffs = vld1q(&pCoeffs[24]);
+            accVec = vfmaq(accVec, coeffs, X4);
+
+            coeffs = vld1q(&pCoeffs[32]);
+            accVec = vfmaq(accVec, coeffs, X3);
+
+            coeffs = vld1q(&pCoeffs[40]);
+            accVec = vfmaq(accVec, coeffs, X2);
+
+            coeffs = vld1q(&pCoeffs[48]);
+            accVec = vfmaq(accVec, coeffs, X1);
+
+            coeffs = vld1q(&pCoeffs[56]);
+            accVec = vfmaq(accVec, coeffs, X0);
+
+            coeffs = vld1q(&pCoeffs[64]);
+            accVec = vfmaq(accVec, coeffs, Xn1);
+
+            coeffs = vld1q(&pCoeffs[72]);
+            accVec = vfmaq(accVec, coeffs, Xn2);
+
+            coeffs = vld1q(&pCoeffs[80]);
+            accVec = vfmaq(accVec, coeffs, Yn1);
+
+            coeffs = vld1q(&pCoeffs[88]);
+            accVec = vfmaq(accVec, coeffs, Yn2);
+
+            switch(sample)
+            {
+               case 1:
+                 *pOut++ = vgetq_lane(accVec, 0);
+                 Xn1 = X0;
+                 Xn2 = lastX;
+                 Yn1 = vgetq_lane(accVec, 0);
+                 Yn2 = lastY;
+               break;
+               case 2:
+                 *pOut++ = vgetq_lane(accVec, 0);
+                 *pOut++ = vgetq_lane(accVec, 1);
+                 Xn1 = X1;
+                 Xn2 = X0;
+                 Yn1 = vgetq_lane(accVec, 1);
+                 Yn2 = vgetq_lane(accVec, 0);
+               break;
+               case 3:
+                *pOut++ = vgetq_lane(accVec, 0);
+                *pOut++ = vgetq_lane(accVec, 1);
+                *pOut++ = vgetq_lane(accVec, 2);
+                Xn1 = X2;
+                Xn2 = X1;
+                Yn1 = vgetq_lane(accVec, 2);
+                Yn2 = vgetq_lane(accVec, 1);
+               break;
+
+               case 4:
+                *pOut++ = vgetq_lane(accVec, 0);
+                *pOut++ = vgetq_lane(accVec, 1);
+                *pOut++ = vgetq_lane(accVec, 2);
+                *pOut++ = vgetq_lane(accVec, 3);
+                Xn1 = X3;
+                Xn2 = X2;
+                Yn1 = vgetq_lane(accVec, 3);
+                Yn2 = vgetq_lane(accVec, 2);
+               break;
+
+               case 5:
+                *pOut++ = vgetq_lane(accVec, 0);
+                *pOut++ = vgetq_lane(accVec, 1);
+                *pOut++ = vgetq_lane(accVec, 2);
+                *pOut++ = vgetq_lane(accVec, 3);
+                *pOut++ = vgetq_lane(accVec, 4);
+                Xn1 = X4;
+                Xn2 = X3;
+                Yn1 = vgetq_lane(accVec, 4);
+                Yn2 = vgetq_lane(accVec, 3);
+               break;
+
+               case 6:
+                *pOut++ = vgetq_lane(accVec, 0);
+                *pOut++ = vgetq_lane(accVec, 1);
+                *pOut++ = vgetq_lane(accVec, 2);
+                *pOut++ = vgetq_lane(accVec, 3);
+                *pOut++ = vgetq_lane(accVec, 4);
+                *pOut++ = vgetq_lane(accVec, 5);
+                Xn1 = X5;
+                Xn2 = X4;
+                Yn1 = vgetq_lane(accVec, 5);
+                Yn2 = vgetq_lane(accVec, 4);
+               break;
+
+               case 7:
+                *pOut++ = vgetq_lane(accVec, 0);
+                *pOut++ = vgetq_lane(accVec, 1);
+                *pOut++ = vgetq_lane(accVec, 2);
+                *pOut++ = vgetq_lane(accVec, 3);
+                *pOut++ = vgetq_lane(accVec, 4);
+                *pOut++ = vgetq_lane(accVec, 5);
+                *pOut++ = vgetq_lane(accVec, 6);
+                Xn1 = X6;
+                Xn2 = X5;
+                Yn1 = vgetq_lane(accVec, 6);
+                Yn2 = vgetq_lane(accVec, 5);
+               break;
+            }
+        }
+        /*
+         * Store the updated state variables back into the pState array
+         */
+        *pState++ = Xn1;
+        *pState++ = Xn2;
+        *pState++ = Yn1;
+        *pState++ = Yn2;
+
+        pCoeffs += sizeof(arm_biquad_mod_coef_f16) / sizeof(float16_t);
+        /*
+         * The first stage goes from the input buffer to the output buffer.
+         * Subsequent numStages  occur in-place in the output buffer
+         */
+        pIn = pDst;
+        /*
+         * Reset the output pointer
+         */
+        pOut = pDst;
+        /*
+         * decrement the loop counter
+         */
+        stage--;
+    }
+    while (stage > 0U);
+}
+
+#else
+void arm_biquad_cascade_df1_f16(
+  const arm_biquad_casd_df1_inst_f16 * S,
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+  const float16_t *pIn = pSrc;                         /* Read cursor: pSrc for the first stage, pDst afterwards */
+        float16_t *pOut = pDst;                        /* Write cursor into the destination buffer */
+        float16_t *pState = S->pState;                 /* Walks the state array, 4 values per stage */
+  const float16_t *pCoeffs = S->pCoeffs;               /* Walks the coefficient array, 5 values per stage */
+        _Float16 acc;                                 /* Output sample of the non-unrolled loop */
+        _Float16 b0, b1, b2, a1, a2;                  /* Coefficients of the current stage */
+        _Float16 Xn1, Xn2, Yn1, Yn2;                  /* State of the current stage: x[n-1], x[n-2], y[n-1], y[n-2] */
+        _Float16 Xn;                                  /* Current input sample */
+        uint32_t sample, stage = S->numStages;         /* Loop counters */
+
+  do /* NOTE(review): the body runs before the test, so numStages == 0 would wrap the stage counter */
+  {
+    /* Load the five coefficients of this stage; pCoeffs ends up at the next stage */
+    b0 = *pCoeffs++;
+    b1 = *pCoeffs++;
+    b2 = *pCoeffs++;
+    a1 = *pCoeffs++;
+    a2 = *pCoeffs++;
+
+    /* Load the state of this stage; pState is advanced later, when the state is written back */
+    Xn1 = pState[0];
+    Xn2 = pState[1];
+    Yn1 = pState[2];
+    Yn2 = pState[3];
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+    /* Apply loop unrolling and compute 4 output values per iteration. */
+    /* Every output is computed with the same difference equation:
+     *
+     * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+     *
+     * Results are written alternately to Yn2 and Yn1, and inputs are read
+     * into Xn, Xn2, Xn1, Xn in turn, so the state rotates without copies.
+     */
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    sample = blockSize >> 2U;
+
+    while (sample > 0U)
+    {
+      /* Read the first input */
+      Xn = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      Yn2 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
+
+      /* Store output in destination buffer. */
+      *pOut++ = Yn2;
+
+      /* The state is advanced by rotating variable roles instead of copying. */
+      /* For the second output the history is held in: */
+      /* x[n-1] : Xn  */
+      /* x[n-2] : Xn1 */
+      /* y[n-1] : Yn2 */
+      /* y[n-2] : Yn1 */
+
+      /* Read the second input */
+      Xn2 = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      Yn1 = (b0 * Xn2) + (b1 * Xn) + (b2 * Xn1) + (a1 * Yn2) + (a2 * Yn1);
+
+      /* Store output in destination buffer. */
+      *pOut++ = Yn1;
+
+      /* Roles rotate again; no copies are made. */
+      /* For the third output the history is held in: */
+      /* x[n-1] : Xn2 */
+      /* x[n-2] : Xn  */
+      /* y[n-1] : Yn1 */
+      /* y[n-2] : Yn2 */
+
+      /* Read the third input */
+      Xn1 = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      Yn2 = (b0 * Xn1) + (b1 * Xn2) + (b2 * Xn) + (a1 * Yn1) + (a2 * Yn2);
+
+      /* Store output in destination buffer. */
+      *pOut++ = Yn2;
+
+      /* Roles rotate again; no copies are made. */
+      /* For the fourth output the history is held in: */
+      /* x[n-1] : Xn1 */
+      /* x[n-2] : Xn2 */
+      /* y[n-1] : Yn2 */
+      /* y[n-2] : Yn1 */
+
+      /* Read the fourth input */
+      Xn = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      Yn1 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn2) + (a2 * Yn1);
+
+      /* Store output in destination buffer. */
+      *pOut++ = Yn1;
+
+      /* Restore the canonical naming for the next iteration. */
+      /* Yn1 and Yn2 already hold y[n-1] and y[n-2], so only */
+      /* the input history needs explicit copies: */
+      /* Xn2 <- Xn1 (third input of this iteration)  */
+      /* Xn1 <- Xn  (fourth input of this iteration) */
+      /* Xn1 must be copied before it is overwritten. */
+      Xn2 = Xn1;
+      Xn1 = Xn;
+
+      /* decrement loop counter */
+      sample--;
+    }
+
+    /* Loop unrolling: Compute remaining outputs (blockSize modulo 4) */
+    sample = blockSize & 0x3U;
+
+#else
+
+    /* Initialize the sample counter with the number of samples */
+    sample = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+    while (sample > 0U)
+    {
+      /* Read the input */
+      Xn = *pIn++;
+
+      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
+
+      /* Store output in destination buffer. */
+      *pOut++ = acc;
+
+      /* After each output the history is shifted by one sample. */
+      /* The states are updated as: */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
+      Xn2 = Xn1;
+      Xn1 = Xn;
+      Yn2 = Yn1;
+      Yn1 = acc;
+
+      /* decrement loop counter */
+      sample--;
+    }
+
+    /* Store the updated state variables back into the pState array; pState then points at the next stage */
+    *pState++ = Xn1;
+    *pState++ = Xn2;
+    *pState++ = Yn1;
+    *pState++ = Yn2;
+
+    /* The first stage goes from the input buffer to the output buffer. */
+    /* Subsequent stages run in-place in the output buffer */
+    pIn = pDst;
+
+    /* Reset output pointer */
+    pOut = pDst;
+
+    /* decrement loop counter */
+    stage--;
+
+  } while (stage > 0U);
+
+}
+
+/**
+  @} end of BiquadCascadeDF1 group
+ */
+#endif /* #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+#endif /*#if defined(ARM_FLOAT16_SUPPORTED)*/
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c
index 0abc16596e0a1d781696b759ab1e58d5c199e2d9..77d862bf65ff52a1426fb565ee1d3af7740db2fa 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df1_f32.c
  * Description:  Processing function for the floating-point Biquad cascade DirectFormI(DF1) filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c
index 1a568d70d2b1676bb67046141096a6aacafbc561..65539406f36e5359956d2efcfce076fe44eadc8a 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df1_fast_q15.c
  * Description:  Fast processing function for the Q15 Biquad cascade filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c
index 586296b32b026d385069004b78bea0f9e0c44817..cefa592a2d46c88052feb54cd2bea7a4d199e9d7 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df1_fast_q31.c
  * Description:  Processing function for the Q31 Fast Biquad cascade DirectFormI(DF1) filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f16.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..3773031419ea2e8ead60f22bb45d875da78e2ba6
--- /dev/null
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f16.c
@@ -0,0 +1,160 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_biquad_cascade_df1_init_f16.c
+ * Description:  Floating-point Biquad cascade DirectFormI(DF1) filter initialization function
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+/**
+  @ingroup groupFilters
+ */
+
+/**
+  @addtogroup BiquadCascadeDF1
+  @{
+ */
+
+/**
+  @brief         Initialization function for the floating-point Biquad cascade filter.
+  @param[in,out] S           points to an instance of the floating-point Biquad cascade structure.
+  @param[in]     numStages   number of 2nd order stages in the filter.
+  @param[in]     pCoeffs     points to the filter coefficients.
+  @param[in]     pState      points to the state buffer.
+  @return        none
+
+  @par           Coefficient and State Ordering
+                   The coefficients are stored in the array <code>pCoeffs</code> in the following order:
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+
+  @par
+                   where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
+                   <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
+                   and so on. The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
+  @par
+                   The <code>pState</code> is a pointer to state array.
+                   Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.
+                   The state variables are arranged in the <code>pState</code> array as:
+  <pre>
+      {x[n-1], x[n-2], y[n-1], y[n-2]}
+  </pre>
+                   The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+                   The state array has a total length of <code>4*numStages</code> values.
+                   The state variables are updated after each block of data is processed; the coefficients are untouched.
+ 
+  @par             For MVE code, an additional buffer of modified coefficients is required.
+                   Its size is numStages and each element of this buffer has type arm_biquad_mod_coef_f16.
+                   So, its total size is 96*numStages float16_t elements.
+
+                   The initialization function which must be used is arm_biquad_cascade_df1_mve_init_f16.
+ */
+
+
+void arm_biquad_cascade_df1_init_f16(
+        arm_biquad_casd_df1_inst_f16 * S,
+        uint8_t numStages,
+  const float16_t * pCoeffs,
+        float16_t * pState)
+{
+  /* Record the number of second-order stages in the instance */
+  S->numStages = numStages;
+
+  /* Keep a reference to the caller's coefficient array; the values are not copied */
+  S->pCoeffs = pCoeffs;
+
+  /* Zero the state buffer: 4 float16_t values per stage */
+  memset(pState, 0, (4U * (uint32_t) numStages) * sizeof(float16_t));
+
+  /* Keep a reference to the (now cleared) state buffer */
+  S->pState = pState;
+}
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+static void generateCoefsFastBiquadF16(float16_t b0, float16_t b1, float16_t b2, float16_t a1, float16_t a2,
+                                arm_biquad_mod_coef_f16 * newCoef) /* out: 12x8 table for one stage */
+{
+    float32_t coeffs[8][12] = { /* working table kept in float32_t; narrowed to float16_t only on store */
+        {0, 0, 0, 0, 0, 0, 0, b0, b1, b2, a1, a2}, /* NOTE(review): columns 7..11 look like x[n], x[n-1], x[n-2], y[n-1], y[n-2] - confirm against the MVE kernel */
+        {0, 0, 0, 0, 0, 0, b0, b1, b2, 0, a2, 0}, /* each following row shifts b0..b2 one column to the left */
+        {0, 0, 0, 0, 0, b0, b1, b2, 0, 0, 0, 0},
+        {0, 0, 0, 0, b0, b1, b2, 0, 0, 0, 0, 0},
+        {0, 0, 0, b0, b1, b2, 0, 0, 0, 0, 0, 0},
+        {0, 0, b0, b1, b2, 0, 0, 0, 0, 0, 0, 0},
+        {0, b0, b1, b2, 0, 0, 0, 0, 0, 0, 0, 0},
+        {b0, b1, b2, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+    };
+
+    for (int i = 0; i < 12; i++) /* one pass per column of the working table */
+    {
+        coeffs[1][i] += (a1 * coeffs[0][i]); /* row 1 adds only a1 * row 0; its a2 entry is already in the initializer */
+        coeffs[2][i] += (a1 * coeffs[1][i]) + (a2 * coeffs[0][i]); /* rows 2..7: row k += a1 * row k-1 + a2 * row k-2 */
+        coeffs[3][i] += (a1 * coeffs[2][i]) + (a2 * coeffs[1][i]);
+        coeffs[4][i] += (a1 * coeffs[3][i]) + (a2 * coeffs[2][i]);
+        coeffs[5][i] += (a1 * coeffs[4][i]) + (a2 * coeffs[3][i]);
+        coeffs[6][i] += (a1 * coeffs[5][i]) + (a2 * coeffs[4][i]);
+        coeffs[7][i] += (a1 * coeffs[6][i]) + (a2 * coeffs[5][i]);
+
+        /*
+         * transpose: column i of the working table becomes row i of the output, narrowed to float16_t
+         */
+        newCoef->coeffs[i][0] = (float16_t) coeffs[0][i];
+        newCoef->coeffs[i][1] = (float16_t) coeffs[1][i];
+        newCoef->coeffs[i][2] = (float16_t) coeffs[2][i];
+        newCoef->coeffs[i][3] = (float16_t) coeffs[3][i];
+        newCoef->coeffs[i][4] = (float16_t) coeffs[4][i];
+        newCoef->coeffs[i][5] = (float16_t) coeffs[5][i];
+        newCoef->coeffs[i][6] = (float16_t) coeffs[6][i];
+        newCoef->coeffs[i][7] = (float16_t) coeffs[7][i];
+
+    }
+}
+
+void arm_biquad_cascade_df1_mve_init_f16(arm_biquad_casd_df1_inst_f16 * S,
+                                         uint8_t numStages,
+                                         const float16_t * pCoeffs, /* in: 5 coefficients read per stage */
+                                         arm_biquad_mod_coef_f16 * pCoeffsMod, /* out: one element written per stage */
+                                         float16_t * pState)
+{
+    arm_biquad_cascade_df1_init_f16(S, numStages, (float16_t *)pCoeffsMod, pState); /* the instance stores pCoeffsMod, not pCoeffs; pState is zeroed */
+
+    /* Generate the SIMD friendly modified coefficients, one table per stage */
+    for (int i = 0; i < numStages; i++)
+    {
+        generateCoefsFastBiquadF16(pCoeffs[0], pCoeffs[1], pCoeffs[2], pCoeffs[3], pCoeffs[4], pCoeffsMod);
+        pCoeffs += 5; /* next stage's b0, b1, b2, a1, a2 */
+        pCoeffsMod++; /* next stage's output table */
+    }
+}
+
+#endif
+
+/**
+  @} end of BiquadCascadeDF1 group
+ */
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c
index 684fa36422d700452b6f48afb52e8ba472e7cc17..1614594b600d2d4b6db2cbd2db675617c0fb114c 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df1_init_f32.c
  * Description:  Floating-point Biquad cascade DirectFormI(DF1) filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c
index c2e542c43df8840fdfc1350a2bd32a2c701bc51f..0fa38255ee542785c7d50605a8c0db79e81354bb 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df1_init_q15.c
  * Description:  Q15 Biquad cascade DirectFormI(DF1) filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c
index 8637889b597b04dd18c276a3be4e8882339ae7aa..2b3cb6288216255cdc616f6fa30ac0dd51c7a1a7 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df1_init_q31.c
  * Description:  Q31 Biquad cascade DirectFormI(DF1) filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
index 4149aacea10cb5bb614cc9e0dc768bce1f28b5e5..c77a05c49ce0f75f884f13c6f8fe0a61c1572ed8 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df1_q15.c
  * Description:  Processing function for the Q15 Biquad cascade DirectFormI(DF1) filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -56,7 +56,7 @@
                    Refer to \ref arm_biquad_cascade_df1_fast_q15() for a faster but less precise implementation of this filter.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 void arm_biquad_cascade_df1_q15(
   const arm_biquad_casd_df1_inst_q15 * S,
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
index 04413a3da2d2e32e56ee5abe8b4a947c358ccc49..d938025ef24ccd91dcacc3270961ef03aeb3d180 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df1_q31.c
  * Description:  Processing function for the Q31 Biquad cascade filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -55,7 +55,7 @@
   @remark
                    Refer to \ref arm_biquad_cascade_df1_fast_q31() for a faster but less precise implementation of this filter.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 void arm_biquad_cascade_df1_q31(
   const arm_biquad_casd_df1_inst_q31 * S,
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f16.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..11564e925fc2177e4c41af713229fcfe880da893
--- /dev/null
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f16.c
@@ -0,0 +1,495 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_biquad_cascade_df2T_f16.c
+ * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+/**
+  @ingroup groupFilters
+*/
+
+/**
+  @addtogroup BiquadCascadeDF2T
+  @{
+ */
+
+/**
+  @brief         Processing function for the floating-point transposed direct form II Biquad cascade filter.
+  @param[in]     S         points to an instance of the filter data structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize number of samples to process
+  @return        none
+ */
+
+#if (defined(ARM_MATH_MVE_FLOAT16) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_biquad_cascade_df2T_f16(
+  const arm_biquad_cascade_df2T_instance_f16 * S,
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    float16_t *pIn = (float16_t *)pSrc;                  /*  source pointer (only read from) */
+    float16_t Xn0, Xn1;                     /*  the two inputs of one unrolled iteration */
+    float16_t acc0, acc1;                   /*  the two outputs of one unrolled iteration */
+    float16_t *pOut = pDst;                 /*  destination pointer       */
+    float16_t *pState = S->pState;          /*  State pointer, 2 values per stage */
+    uint32_t  sample, stage = S->numStages; /*  loop counters             */
+    float16_t const *pCurCoeffs =          /*  coefficient pointer, 5 values per stage */
+                (float16_t const *) S->pCoeffs;
+    f16x8_t b0Coeffs, a0Coeffs;           /*  Coefficients vector       */
+    f16x8_t b1Coeffs, a1Coeffs;           /*  Same coefficients moved up by one lane */
+    f16x8_t state;                        /*  State vector              */
+
+    do
+    {
+        /*
+         * temporary carry variable for feeding the 128-bit vector shifter
+         */
+        uint32_t  tmp = 0;
+        /*
+         * Reading the coefficients
+         * b0Coeffs = {b0, b1, b2, x, x, x, x, x}
+         * a0Coeffs = { x, a1, a2, x, x, x, x, x}
+         */
+        b0Coeffs = vld1q(pCurCoeffs);   pCurCoeffs += 2; /* NOTE(review): presumably a full 8-lane load, so the x lanes come from memory after b0..b2 */
+        a0Coeffs = vld1q(pCurCoeffs);   pCurCoeffs += 3; /* NOTE(review): for the last stage this presumably reads past the coefficient array - confirm padding */
+        /*
+         * Reading the state values
+         * state = {d1, d2, 0, 0, x, x, x, x}
+         */
+        state = *(f16x8_t *) pState; /* NOTE(review): loads a whole vector although only d1, d2 belong to this stage - confirm buffer size and alignment */
+        state = vsetq_lane((float16_t)0.0, state, 2);
+        state = vsetq_lane((float16_t)0.0, state, 3);
+
+        /* b1Coeffs = {0, b0, b1, b2, x, x, x, x} */
+        /* a1Coeffs = { x, x, a1, a2, x, x, x, x} */
+        b1Coeffs = (f16x8_t)vshlcq_s16((int16x8_t)b0Coeffs, &tmp, 16);
+        a1Coeffs = (f16x8_t)vshlcq_s16((int16x8_t)a0Coeffs, &tmp, 16);
+
+        sample = blockSize / 2;
+
+        /* unrolled 2 x */
+        while (sample > 0U)
+        {
+            /*
+             * Read 2 inputs
+             */
+            Xn0 = *pIn++;
+            Xn1 = *pIn++;
+
+            /*
+             * 1st half:
+             * / acc0 \   / b0 \         / d1 \   / 0  \
+             * |  d1  |   | b1 |         | d2 |   | a1 |
+             * |  d2  |   | b2 |         | 0  |   | a2 |
+             * |  x   | = | x  | * Xn0 + | x  | + | x  | x acc0
+             *   ...       ...            ...      ...
+             * \  x   /   \ x  /         \ x  /   \ x  /
+             */
+
+            state = vfmaq(state, b0Coeffs, Xn0);
+            acc0 = vgetq_lane(state, 0);
+            state = vfmaq(state, a0Coeffs, acc0);
+            state = vsetq_lane((float16_t)0.0, state, 3);
+
+            /*
+             * 2nd half:
+             * same as 1st half, but all vector elements shifted down.
+             * /  x   \   / x  \         / x  \   / x  \
+             * | acc1 |   | b0 |         | d1 |   | 0  |
+             * |  d1  |   | b1 |         | d2 |   | a1 |
+             * |  d2  |   | b2 |         | 0  |   | a2 |
+             * |  x   | = | x  | * Xn1 + | x  | + | x  | x acc1
+             *   ...       ...            ...      ...
+             * \  x   /   \ x  /         \ x  /   \ x  /
+             */
+
+            state = vfmaq(state, b1Coeffs, Xn1);
+            acc1 = vgetq_lane(state, 1);
+            state = vfmaq(state, a1Coeffs, acc1);
+
+            /* move d1, d2 from lanes 2, 3 back to lanes 0, 1, then clear lane 2 */
+            /* expect dual move or long move */
+            state = vsetq_lane(vgetq_lane(state, 2), state, 0);
+            state = vsetq_lane(vgetq_lane(state, 3), state, 1);
+            state = vsetq_lane((float16_t)0.0, state, 2);
+            /*
+             * Store the results in the destination buffer.
+             */
+            *pOut++ = acc0;
+            *pOut++ = acc1;
+            /*
+             * decrement the loop counter
+             */
+            sample--;
+        }
+
+        /* compiler does not come back when enabled */
+        /*
+         * tail handling: one remaining sample when blockSize is odd
+         */
+        if (blockSize & 1)
+        {
+            Xn0 = *pIn++;
+            state = vfmaq_n_f16(state, b0Coeffs, Xn0);
+            acc0 = vgetq_lane(state, 0);
+
+            state = vfmaq_n_f16(state, a0Coeffs, acc0);
+            *pOut++ = acc0;
+            *pState++ = vgetq_lane(state, 1); /* after a single update the new d1, d2 sit in lanes 1, 2 */
+            *pState++ = vgetq_lane(state, 2);
+        }
+        else
+        {
+            *pState++ = vgetq_lane(state, 0); /* after a full pair d1, d2 were moved back to lanes 0, 1 */
+            *pState++ = vgetq_lane(state, 1);
+        }
+        /*
+         * The current stage input is given as the output to the next stage
+         */
+        pIn = pDst;
+        /*
+         * Reset the output working pointer
+         */
+        pOut = pDst;
+        /*
+         * decrement the loop counter
+         */
+        stage--;
+    }
+    while (stage > 0U);
+}
+#else
+LOW_OPTIMIZATION_ENTER
+void arm_biquad_cascade_df2T_f16(
+  const arm_biquad_cascade_df2T_instance_f16 * S,
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+  const float16_t *coeffs = S->pCoeffs;                /* Coefficients of the stage being run */
+        float16_t *state = S->pState;                  /* State of the stage being run */
+  const float16_t *in = pSrc;                          /* Where the stage reads its samples */
+        float16_t *out = pDst;                         /* Where the stage writes its samples */
+        _Float16 b0, b1, b2, a1, a2;                   /* Coefficients of the current stage */
+        _Float16 d1, d2;                               /* State variables of the current stage */
+        _Float16 xn;                                   /* Current input sample x[n] */
+        _Float16 yn;                                   /* Current output sample y[n] */
+        uint32_t count, stagesLeft = S->numStages;     /* Loop counters */
+
+  do
+  {
+    /* Fetch this stage's coefficients; coeffs then points at the next stage */
+    b0 = *coeffs++;
+    b1 = *coeffs++;
+    b2 = *coeffs++;
+    a1 = *coeffs++;
+    a2 = *coeffs++;
+
+    /*
+     * Resume from the state values stored for this stage
+     */
+    d1 = state[0];
+    d2 = state[1];
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Unrolled part: every pass of this loop filters 16 samples */
+    for (count = blockSize >> 4U; count > 0U; count--) {
+
+      /*
+       * Each numbered step below filters one sample with the
+       * transposed direct form II recurrence:
+       *   y[n] = b0 * x[n] + d1
+       *   d1   = b1 * x[n] + a1 * y[n] + d2
+       *   d2   = b2 * x[n] + a2 * y[n]
+       */
+
+/*  1 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/*  2 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/*  3 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/*  4 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/*  5 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/*  6 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/*  7 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/*  8 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/*  9 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/* 10 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/* 11 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/* 12 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/* 13 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/* 14 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/* 15 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+/* 16 */
+      xn = *in++;
+
+      yn = b0 * xn + d1;
+
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+
+    }
+
+    /* Tail: the 0..15 samples the unrolled loop did not cover */
+    count = blockSize & 0xFU;
+
+#else
+
+    /* No unrolling: every sample goes through the loop below */
+    count = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    for (; count > 0U; count--) {
+      xn = *in++;
+
+      /* y[n] = b0 * x[n] + d1 */
+      yn = b0 * xn + d1;
+
+      /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+      d1 = b1 * xn + d2;
+      d1 += a1 * yn;
+
+      /* d2 = b2 * x[n] + a2 * y[n] */
+      d2 = b2 * xn;
+      d2 += a2 * yn;
+
+      *out++ = yn;
+    }
+
+    /* Save the state for the next call; the pointer moves to the next stage */
+    *state++ = d1;
+    *state++ = d2;
+
+    /*
+     * Later stages run in place on pDst: each one reads the samples
+     * the previous stage wrote and overwrites them with its own
+     * output
+     */
+    in = pDst;
+    out = pDst;
+
+    /* One stage done */
+    stagesLeft--;
+
+  } while (stagesLeft > 0U);
+
+}
+LOW_OPTIMIZATION_EXIT
+#endif /* #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of BiquadCascadeDF2T group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c
index f8d424c2c4d21390bfd99e6a7bdbb4eb01bddab8..f03a3c6c2982b62791a6682b8a5e435b0495db20 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df2T_f32.c
  * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -45,7 +45,7 @@
   @param[in]     blockSize number of samples to process
   @return        none
  */
-#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if (defined(ARM_MATH_MVEF) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 
 void arm_biquad_cascade_df2T_f32(
@@ -89,8 +89,8 @@ void arm_biquad_cascade_df2T_f32(
 
         /* b1Coeffs = {b0, b1, b2, x} */
         /* b1Coeffs = { x, x, a1, a2} */
-        b1Coeffs = vshlcq_s32(b0Coeffs, &tmp, 32);
-        a1Coeffs = vshlcq_s32(a0Coeffs, &tmp, 32);
+        b1Coeffs = (f32x4_t)vshlcq_s32((int32x4_t)b0Coeffs, &tmp, 32);
+        a1Coeffs = (f32x4_t)vshlcq_s32((int32x4_t)a0Coeffs, &tmp, 32);
 
         sample = blockSize / 2;
 
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
index 35d401d5ece496ecc424a39d924a8b8bb164ae0f..c24637346eb10a010ea475b0d21037a552506ac6 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df2T_f64.c
  * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f16.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..3661d7160b69a0911be7a865a463f3a39f6b0a33
--- /dev/null
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f16.c
@@ -0,0 +1,114 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_biquad_cascade_df2T_init_f16.c
+ * Description:  Initialization function for floating-point transposed direct form II Biquad cascade filter
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+/**
+  @ingroup groupFilters
+ */
+
+/**
+  @addtogroup BiquadCascadeDF2T
+  @{
+ */
+
+/**
+  @brief         Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+  @param[in,out] S           points to an instance of the filter data structure.
+  @param[in]     numStages   number of 2nd order stages in the filter.
+  @param[in]     pCoeffs     points to the filter coefficients.
+  @param[in]     pState      points to the state buffer.
+  @return        none
+
+  @par           Coefficient and State Ordering
+                   The coefficients are stored in the array <code>pCoeffs</code> in the following order
+                   in the non-Neon version:
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+                   
+  @par
+                   where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
+                   <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
+                   and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
+
+                   For the Neon version, this array is bigger. If numStages = 4x + y, then the array has size
+                   32*x + 5*y,
+                   and it must be initialized using the function
+                   arm_biquad_cascade_df2T_compute_coefs_f16, which takes the
+                   standard coefficient array as a parameter.
+
+                   However, an array of size 8*numStages is a good approximation.
+
+                   Then, the initialization can be done with:
+  <pre>
+                   arm_biquad_cascade_df2T_init_f16(&SNeon, nbCascade, neonCoefs, stateNeon);
+                   arm_biquad_cascade_df2T_compute_coefs_f16(&SNeon,nbCascade,coefs);
+  </pre>
+
+  @par             In this example, neonCoefs is a bigger array of size 8 * numStages.
+                   coefs is the standard array:
+
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+
+
+  @par
+                   The <code>pState</code> is a pointer to state array.
+                   Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code>.
+                   The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
+                   The state array has a total length of <code>2*numStages</code> values.
+                   The state variables are updated after each block of data is processed; the coefficients are untouched.
+ */
+
+void arm_biquad_cascade_df2T_init_f16(
+        arm_biquad_cascade_df2T_instance_f16 * S,
+        uint8_t numStages,
+  const float16_t * pCoeffs,
+        float16_t * pState)
+{
+  /* Each stage keeps two state values, so the buffer holds 2 * numStages */
+  const size_t stateBytes = (2U * (uint32_t) numStages) * sizeof(*pState);
+
+  /* Record the cascade length and where its coefficients live */
+  S->numStages = numStages;
+  S->pCoeffs = pCoeffs;
+
+  /* Start from a cleared filter history ... */
+  memset(pState, 0, stateBytes);
+  /* ... and only then attach the state buffer to the instance */
+  S->pState = pState;
+}
+
+/**
+  @} end of BiquadCascadeDF2T group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c
index 4a50515ad548bb4f1f080cf30044242dab319f0b..53b68079afe1e4851b94dba83e4bc8387de43749 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df2T_init_f32.c
  * Description:  Initialization function for floating-point transposed direct form II Biquad cascade filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c
index 2ab272f3067924c677833e57591e1207f1874cd8..5ab1c9a2e75e20da30c372678ce3811ddb3efcb7 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df2T_init_f64.c
  * Description:  Initialization function for floating-point transposed direct form II Biquad cascade filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..6cd934d0b0c91e000a3c2259d22a7e07a9a9d4e0
--- /dev/null
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c
@@ -0,0 +1,434 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_biquad_cascade_stereo_df2T_f16.c
+ * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter. 2 channels
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+/**
+  @ingroup groupFilters
+*/
+
+/**
+  @addtogroup BiquadCascadeDF2T
+  @{
+ */
+
+/**
+  @brief         Processing function for the floating-point transposed direct form II Biquad cascade filter.
+  @param[in]     S         points to an instance of the filter data structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize number of samples to process
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_biquad_cascade_stereo_df2T_f16 built. Helium version has build issues with gcc."
+#endif 
+
+#if (defined(ARM_MATH_MVE_FLOAT16) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
+void arm_biquad_cascade_stereo_df2T_f16(
+  const arm_biquad_cascade_stereo_df2T_instance_f16 * S,
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    const float16_t *pIn = pSrc;      /*  source pointer            */
+    float16_t *pOut = pDst;     /*  destination pointer       */
+    float16_t *pState = S->pState;  /*  State pointer             */
+    const float16_t *pCoeffs = S->pCoeffs;    /*  coefficient pointer       */
+    float16_t b0, b1, b2, a1, a2;   /*  Filter coefficients       */
+    uint32_t  sample, stage = S->numStages; /*  loop counters             */
+    static const uint16_t idx2[] = {2, 3, 8, 9, 2, 3, 8, 9};
+    f16x8_t aCoeffs, bCoeffs;
+    float16_t scratch[16];
+    uint16x8_t loadIdxVec;
+    uint16x8_t reshufledIdxVec;
+    uint16_t  startIdx = 0;
+    f16x8_t stateVec0, stateVec1;
+    f16x8_t inVec;
+
+    /*
+     * Nothing to filter. Returning here also keeps the state write-back
+     * below from reading stateVec1 before the sample loop has set it.
+     */
+    if (blockSize == 0U)
+    {
+        return;
+    }
+
+    /* {0, 1, 0, 1, 0, 1, 0, 1} generator */
+    loadIdxVec = viwdupq_u16(startIdx, 2, 1);
+    reshufledIdxVec = *(uint16x8_t *)&idx2;
+
+    /*
+     * scratch top clearing
+     * layout : [d1a d1b d2a d2b d1a d1b d2a d2b 0 0]
+     */
+    scratch[8] = (float16_t)0.0;
+    scratch[9] = (float16_t)0.0;
+
+    do
+    {
+        /* Reading the coefficients */
+        b0 = *pCoeffs++;
+        b1 = *pCoeffs++;
+        b2 = *pCoeffs++;
+        a1 = *pCoeffs++;
+        a2 = *pCoeffs++;
+
+        /* aCoeffs = {a1 a1 a2 a2 a1 a1 a2 a2} */
+        aCoeffs = vdupq_n_f16(a1);
+        aCoeffs = vsetq_lane(a2, aCoeffs, 2);
+        aCoeffs = vsetq_lane(a2, aCoeffs, 3);
+        aCoeffs = vsetq_lane(a2, aCoeffs, 6);
+        aCoeffs = vsetq_lane(a2, aCoeffs, 7);
+
+        /* bCoeffs = {b1 b1 b2 b2 b1 b1 b2 b2} */
+        bCoeffs = vdupq_n_f16(b1);
+        bCoeffs = vsetq_lane(b2, bCoeffs, 2);
+        bCoeffs = vsetq_lane(b2, bCoeffs, 3);
+        bCoeffs = vsetq_lane(b2, bCoeffs, 6);
+        bCoeffs = vsetq_lane(b2, bCoeffs, 7);
+
+        /*
+         * Copy only the four state values this stage owns into scratch; a
+         * full f16x8_t load would read past the buffer on the last stage.
+         */
+        scratch[0] = pState[0];
+        scratch[1] = pState[1];
+        scratch[2] = pState[2];
+        scratch[3] = pState[3];
+
+        sample = blockSize;
+
+        while (sample > 0U)
+        {
+            /*
+             * step 1
+             *
+             * 0   | acc1a = xn1a * b0 + d1a
+             * 1   | acc1b = xn1b * b0 + d1b
+             * 2   | acc1a = xn1a * b0 + d1a
+             * 3   | acc1b = xn1b * b0 + d1b
+             * 4   |   <repeat>
+             * 5   |   ...
+             */
+
+            /* load {d1a, d1b, d1a, d1b, d1a, d1b, d1a, d1b} */
+            stateVec0 = vldrhq_gather_shifted_offset((float16_t const *) scratch, loadIdxVec);
+            /* load {in0 in1 in0 in1 in0 in1 in0 in1} */
+            inVec = vldrhq_gather_shifted_offset_f16(pIn, loadIdxVec);
+
+            stateVec0 = vfmaq(stateVec0, inVec, b0);
+            *pOut++ = vgetq_lane(stateVec0, 0);
+            *pOut++ = vgetq_lane(stateVec0, 1);
+
+            /*
+             * step 2
+             *
+             * 0  | d1a = b1 * xn1a  +  a1 * acc1a  +  d2a
+             * 1  | d1b = b1 * xn1b  +  a1 * acc1b  +  d2b
+             * 2  | d2a = b2 * xn1a  +  a2 * acc1a  +  0
+             * 3  | d2b = b2 * xn1b  +  a2 * acc1b  +  0
+             * 4  |   <repeat>
+             * 5  |   ...
+             */
+
+            /* load {d2a, d2b, 0, 0, d2a, d2b, 0, 0} */
+            stateVec1 = vldrhq_gather_shifted_offset((float16_t const *) scratch, reshufledIdxVec);
+            stateVec1 = vfmaq(stateVec1, stateVec0, aCoeffs);
+            stateVec1 = vfmaq(stateVec1, inVec, bCoeffs);
+            *(f16x8_t *) scratch = stateVec1;
+
+            pIn = pIn + 2;
+            sample--;
+        }
+
+        /* Store the updated state {d1a, d1b, d2a, d2b} back into the state array */
+        *pState++ = vgetq_lane(stateVec1, 0);
+        *pState++ = vgetq_lane(stateVec1, 1);
+        *pState++ = vgetq_lane(stateVec1, 2);
+        *pState++ = vgetq_lane(stateVec1, 3);
+
+        /*
+         * The current stage output is given as the input to the next stage
+         */
+        pIn = pDst;
+        /*
+         * Reset the output working pointer
+         */
+        pOut = pDst;
+        /*
+         * decrement the loop counter
+         */
+        stage--;
+    }
+    while (stage > 0U);
+}
+#else
+LOW_OPTIMIZATION_ENTER
+void arm_biquad_cascade_stereo_df2T_f16(
+  const arm_biquad_cascade_stereo_df2T_instance_f16 * S,
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+  const float16_t *coeffs = S->pCoeffs;                /* Coefficients of the stage being run */
+        float16_t *state = S->pState;                  /* State of the stage being run */
+  const float16_t *in = pSrc;                          /* Where the stage reads its frames */
+        float16_t *out = pDst;                         /* Where the stage writes its frames */
+        _Float16 b0, b1, b2, a1, a2;                   /* Coefficients, shared by both channels */
+        _Float16 d1a, d2a, d1b, d2b;                   /* State variables of channels a and b */
+        _Float16 xa, xb;                               /* Current input sample of each channel */
+        _Float16 ya, yb;                               /* Current output sample of each channel */
+        uint32_t count, stagesLeft = S->numStages;     /* Loop counters */
+
+    do
+    {
+        /* Fetch this stage's coefficients; coeffs then points at the next stage */
+        b0 = *coeffs++;
+        b1 = *coeffs++;
+        b2 = *coeffs++;
+        a1 = *coeffs++;
+        a2 = *coeffs++;
+
+        /*
+         * Resume from the state values stored for this stage
+         */
+        d1a = state[0];
+        d2a = state[1];
+        d1b = state[2];
+        d2b = state[3];
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+        /* Unrolled part: every pass of this loop filters 8 stereo frames */
+        for (count = blockSize >> 3U; count > 0U; count--) {
+
+          /*
+           * Each numbered step below filters one sample of each channel:
+           *   y[n] = b0 * x[n] + d1
+           *   d1   = b1 * x[n] + a1 * y[n] + d2
+           *   d2   = b2 * x[n] + a2 * y[n]
+           */
+
+/*  1 */
+          xa = *in++; /* Channel a */
+          xb = *in++; /* Channel b */
+
+          ya = b0 * xa + d1a;
+          yb = b0 * xb + d1b;
+
+          *out++ = ya;
+          *out++ = yb;
+
+          d1a = b1 * xa + a1 * ya + d2a;
+          d1b = b1 * xb + a1 * yb + d2b;
+
+          d2a = b2 * xa + a2 * ya;
+          d2b = b2 * xb + a2 * yb;
+
+/*  2 */
+          xa = *in++; /* Channel a */
+          xb = *in++; /* Channel b */
+
+          ya = b0 * xa + d1a;
+          yb = b0 * xb + d1b;
+
+          *out++ = ya;
+          *out++ = yb;
+
+          d1a = b1 * xa + a1 * ya + d2a;
+          d1b = b1 * xb + a1 * yb + d2b;
+
+          d2a = b2 * xa + a2 * ya;
+          d2b = b2 * xb + a2 * yb;
+
+/*  3 */
+          xa = *in++; /* Channel a */
+          xb = *in++; /* Channel b */
+
+          ya = b0 * xa + d1a;
+          yb = b0 * xb + d1b;
+
+          *out++ = ya;
+          *out++ = yb;
+
+          d1a = b1 * xa + a1 * ya + d2a;
+          d1b = b1 * xb + a1 * yb + d2b;
+
+          d2a = b2 * xa + a2 * ya;
+          d2b = b2 * xb + a2 * yb;
+
+/*  4 */
+          xa = *in++; /* Channel a */
+          xb = *in++; /* Channel b */
+
+          ya = b0 * xa + d1a;
+          yb = b0 * xb + d1b;
+
+          *out++ = ya;
+          *out++ = yb;
+
+          d1a = b1 * xa + a1 * ya + d2a;
+          d1b = b1 * xb + a1 * yb + d2b;
+
+          d2a = b2 * xa + a2 * ya;
+          d2b = b2 * xb + a2 * yb;
+
+/*  5 */
+          xa = *in++; /* Channel a */
+          xb = *in++; /* Channel b */
+
+          ya = b0 * xa + d1a;
+          yb = b0 * xb + d1b;
+
+          *out++ = ya;
+          *out++ = yb;
+
+          d1a = b1 * xa + a1 * ya + d2a;
+          d1b = b1 * xb + a1 * yb + d2b;
+
+          d2a = b2 * xa + a2 * ya;
+          d2b = b2 * xb + a2 * yb;
+
+/*  6 */
+          xa = *in++; /* Channel a */
+          xb = *in++; /* Channel b */
+
+          ya = b0 * xa + d1a;
+          yb = b0 * xb + d1b;
+
+          *out++ = ya;
+          *out++ = yb;
+
+          d1a = b1 * xa + a1 * ya + d2a;
+          d1b = b1 * xb + a1 * yb + d2b;
+
+          d2a = b2 * xa + a2 * ya;
+          d2b = b2 * xb + a2 * yb;
+
+/*  7 */
+          xa = *in++; /* Channel a */
+          xb = *in++; /* Channel b */
+
+          ya = b0 * xa + d1a;
+          yb = b0 * xb + d1b;
+
+          *out++ = ya;
+          *out++ = yb;
+
+          d1a = b1 * xa + a1 * ya + d2a;
+          d1b = b1 * xb + a1 * yb + d2b;
+
+          d2a = b2 * xa + a2 * ya;
+          d2b = b2 * xb + a2 * yb;
+
+/*  8 */
+          xa = *in++; /* Channel a */
+          xb = *in++; /* Channel b */
+
+          ya = b0 * xa + d1a;
+          yb = b0 * xb + d1b;
+
+          *out++ = ya;
+          *out++ = yb;
+
+          d1a = b1 * xa + a1 * ya + d2a;
+          d1b = b1 * xb + a1 * yb + d2b;
+
+          d2a = b2 * xa + a2 * ya;
+          d2b = b2 * xb + a2 * yb;
+
+        }
+
+        /* Tail: the 0..7 frames the unrolled loop did not cover */
+        count = blockSize & 0x7U;
+
+#else
+
+        /* No unrolling: every frame goes through the loop below */
+        count = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+        for (; count > 0U; count--) {
+          /* Fetch one interleaved frame: channel a first, then channel b */
+          xa = *in++;
+          xb = *in++;
+
+          /* y[n] = b0 * x[n] + d1 */
+          ya = b0 * xa + d1a;
+          yb = b0 * xb + d1b;
+
+          /* Write the frame out */
+          *out++ = ya;
+          *out++ = yb;
+
+          /*
+           * State update, done once the output is known. d1 is
+           * refreshed first because it consumes the old d2:
+           *   d1 = b1 * x[n] + a1 * y[n] + d2
+           *   d2 = b2 * x[n] + a2 * y[n]
+           */
+          d1a = b1 * xa + a1 * ya + d2a;
+          d1b = b1 * xb + a1 * yb + d2b;
+
+          d2a = b2 * xa + a2 * ya;
+          d2b = b2 * xb + a2 * yb;
+        }
+
+        /* Save the state in the order it is read back: d1a, d2a, d1b, d2b */
+        *state++ = d1a;
+        *state++ = d2a;
+
+        *state++ = d1b;
+        *state++ = d2b;
+
+        /*
+         * Later stages run in place on pDst: each one reads the frames
+         * the previous stage wrote and overwrites them with its own
+         * output
+         */
+        in = pDst;
+        out = pDst;
+
+        /* One stage done */
+        stagesLeft--;
+
+    } while (stagesLeft > 0U);
+
+}
+LOW_OPTIMIZATION_EXIT
+#endif /* #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of BiquadCascadeDF2T group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c
index 426efef0eb20f635bf416b3768deafadc4ec8f7d..ee0ac093b74f6a63d6aa809c19ccd0140afc1e1b 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_stereo_df2T_f32.c
  * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter. 2 channels
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -45,7 +45,7 @@
   @param[in]     blockSize number of samples to process
   @return        none
  */
-#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if (defined(ARM_MATH_MVEF) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 
 void arm_biquad_cascade_stereo_df2T_f32(
@@ -125,11 +125,11 @@ void arm_biquad_cascade_stereo_df2T_f32(
             /*
              * load {d1a, d1b, d1a, d1b}
              */
-            stateVec0 = vldrwq_gather_shifted_offset((uint32_t const *) scratch, loadIdxVec);
+            stateVec0 = (f32x4_t)vldrwq_gather_shifted_offset((uint32_t const *) scratch, loadIdxVec);
             /*
              * load {in0 in1 in0 in1}
              */
-            inVec = vldrwq_gather_shifted_offset((uint32_t const *) pIn, loadIdxVec);
+            inVec = (f32x4_t)vldrwq_gather_shifted_offset((uint32_t const *) pIn, loadIdxVec);
 
             stateVec0 = vfmaq(stateVec0, inVec, b0);
             *pOut++ = vgetq_lane(stateVec0, 0);
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f16.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..633fbfef282a371109946e846234a65a5bc41576
--- /dev/null
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f16.c
@@ -0,0 +1,90 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_biquad_cascade_stereo_df2T_init_f16.c
+ * Description:  Initialization function for the half-precision floating-point stereo transposed direct form II Biquad cascade filter
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+  @ingroup groupFilters
+ */
+
+/**
+  @addtogroup BiquadCascadeDF2T
+  @{
+ */
+
+/**
+  @brief         Initialization function for the half-precision floating-point stereo transposed direct form II Biquad cascade filter.
+  @param[in,out] S           points to an instance of the filter data structure; its numStages, pCoeffs and pState fields are set here.
+  @param[in]     numStages   number of 2nd order stages in the filter.
+  @param[in]     pCoeffs     points to the filter coefficients; the pointer is stored, the coefficients are not copied.
+  @param[in]     pState      points to the state buffer; its first <code>4*numStages</code> values are zeroed by this function.
+  @return        none
+
+  @par           Coefficient and State Ordering
+                   The coefficients are stored in the array <code>pCoeffs</code> in the following order:
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+  @par
+                   where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
+                   <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
+                   and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
+  @par
+                   The <code>pState</code> is a pointer to the state array.
+                   Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code> for each channel.
+                   The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
+                   The state array must hold at least <code>4*numStages</code> values (2 state variables for each of the two stereo channels per stage); exactly that many are cleared here.
+                   The state variables are updated after each block of data is processed; the coefficients are untouched.
+ */
+
+void arm_biquad_cascade_stereo_df2T_init_f16(
+        arm_biquad_cascade_stereo_df2T_instance_f16 * S,
+        uint8_t numStages,
+  const float16_t * pCoeffs,
+        float16_t * pState)
+{
+  /* Record the number of 2nd order stages in the instance */
+  S->numStages = numStages;
+
+  /* Store the coefficient pointer (the caller's array is referenced, not copied) */
+  S->pCoeffs = pCoeffs;
+
+  /* Zero the state buffer: 4 * numStages values of float16_t are cleared */
+  memset(pState, 0, (4U * (uint32_t) numStages) * sizeof(float16_t));
+
+  /* Store the state buffer pointer in the instance */
+  S->pState = pState;
+}
+
+/**
+  @} end of BiquadCascadeDF2T group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c
index d398f18172af9657593efcd201b070e83354058e..6932b9a48b30e61e748d8003e47bb41c9098df9f 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_stereo_df2T_init_f32.c
  * Description:  Initialization function for floating-point transposed direct form II Biquad cascade filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_f32.c
index f217f1edefdd2f271063cb79490faeab89332dd2..7f2dd81d12062db508cf94b9c6b09a7e54d31a0b 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_f32.c
  * Description:  Convolution of floating-point sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c
index ed2aea940d295f9a156c8d430ddc2d2d9319fbaa..06629f1acffd6901a226ba7fcff5a893d544cc2e 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_fast_opt_q15.c
  * Description:  Fast Q15 Convolution
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -220,7 +220,7 @@ void arm_conv_fast_opt_q15(
       y1 = read_q15x2_ia ((q15_t **) &pIn2);
       y2 = read_q15x2_ia ((q15_t **) &pIn2);
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc0 = __SMLAD(x1, y1, acc0);
       acc2 = __SMLAD(x2, y1, acc2);
 
@@ -231,13 +231,13 @@ void arm_conv_fast_opt_q15(
       x3 = __PKHBT(x1, x2, 0);
 #endif
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc1 = __SMLADX(x3, y1, acc1);
 
       /* Read next two samples from scratch1 buffer */
       x1 = read_q15x2_ia (&pScr1);
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc0 = __SMLAD(x2, y2, acc0);
       acc2 = __SMLAD(x1, y2, acc2);
 
@@ -273,7 +273,7 @@ void arm_conv_fast_opt_q15(
 
     while (tapCnt > 0U)
     {
-      /* accumlate the results */
+      /* accumulate the results */
       acc0 += (*pScr1++ * *pIn2);
       acc1 += (*pScr1++ * *pIn2);
       acc2 += (*pScr1++ * *pIn2);
@@ -340,7 +340,7 @@ void arm_conv_fast_opt_q15(
     while (tapCnt > 0U)
     {
 
-      /* accumlate the results */
+      /* accumulate the results */
       acc0 += (*pScr1++ * *pIn2++);
 
       /* Decrement loop counter */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c
index 3102a05cdbb24e3b454d44b48ce6fd0573a57cd0..5337a82eac919b2e63196def7637d92527fef40a 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_fast_q15.c
  * Description:  Fast Q15 Convolution
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c
index e87eddc73f838f2cc44b8ae9f4874c672e803942..133c322c2e88830748d817447f52504804337e1c 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_fast_q31.c
  * Description:  Fast Q31 Convolution
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c
index 6ad34cdc666c46d3f3625f66e6f8d4e237e1aaa8..1f3efe00a1983326d047ce628e93bb16230e2f5b 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_opt_q15.c
  * Description:  Convolution of Q15 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -216,7 +216,7 @@ void arm_conv_opt_q15(
       y1 = read_q15x2_ia ((q15_t **) &pIn2);
       y2 = read_q15x2_ia ((q15_t **) &pIn2);
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc0 = __SMLALD(x1, y1, acc0);
       acc2 = __SMLALD(x2, y1, acc2);
 
@@ -227,13 +227,13 @@ void arm_conv_opt_q15(
       x3 = __PKHBT(x1, x2, 0);
 #endif
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc1 = __SMLALDX(x3, y1, acc1);
 
       /* Read next two samples from scratch1 buffer */
       x1 = read_q15x2_ia (&pScr1);
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc0 = __SMLALD(x2, y2, acc0);
       acc2 = __SMLALD(x1, y2, acc2);
 
@@ -269,7 +269,7 @@ void arm_conv_opt_q15(
 
     while (tapCnt > 0U)
     {
-      /* accumlate the results */
+      /* accumulate the results */
       acc0 += (*pScr1++ * *pIn2);
       acc1 += (*pScr1++ * *pIn2);
       acc2 += (*pScr1++ * *pIn2);
@@ -336,7 +336,7 @@ void arm_conv_opt_q15(
     while (tapCnt > 0U)
     {
 
-      /* accumlate the results */
+      /* accumulate the results */
       acc0 += (*pScr1++ * *pIn2++);
 
       /* Decrement loop counter */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c
index fb9e2eccfea27e5cfec0cc4833d668eae9563326..1f12f0d2b6a546c7007e92fdda94811e8dbb662e 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_opt_q7.c
  * Description:  Convolution of Q7 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -223,7 +223,7 @@ void arm_conv_opt_q7(
       /* Read four samples from smaller buffer */
       y1 = read_q15x2_ia (&pScr2);
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc0 = __SMLAD(x1, y1, acc0);
       acc2 = __SMLAD(x2, y1, acc2);
 
@@ -234,7 +234,7 @@ void arm_conv_opt_q7(
       x3 = __PKHBT(x1, x2, 0);
 #endif
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc1 = __SMLADX(x3, y1, acc1);
 
       /* Read next two samples from scratch1 buffer */
@@ -280,7 +280,7 @@ void arm_conv_opt_q7(
 
     while (tapCnt > 0U)
     {
-      /* accumlate the results */
+      /* accumulate the results */
       acc0 += (*pScr1++ * *pScr2);
       acc1 += (*pScr1++ * *pScr2);
       acc2 += (*pScr1++ * *pScr2);
@@ -335,7 +335,7 @@ void arm_conv_opt_q7(
     /* apply same above for remaining samples of smaller length sequence */
     while (tapCnt > 0U)
     {
-      /* accumlate the results */
+      /* accumulate the results */
       acc0 += (*pScr1++ * *pScr2++);
 
       /* Decrement loop counter */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c
index 34c77f24de34065f5009ce2812ae148773ef3783..a8a9bd1a3340db36d06e146439dd7c9e19d4d87d 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_partial_f32.c
  * Description:  Partial convolution of floating-point sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -95,7 +95,7 @@ arm_status arm_conv_partial_f32(
   const float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */
         float32_t sum;                                 /* Accumulator */
         uint32_t j, k, count, blkCnt, check;
-        uint32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
+        int32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
         arm_status status;                             /* Status of Partial convolution */
 
 #if defined (ARM_MATH_LOOPUNROLL)
@@ -142,7 +142,7 @@ arm_status arm_conv_partial_f32(
     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
-    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : numPoints) : 0;
+    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t)numPoints) : 0;
     blockSize2 = ((int32_t) check - blockSize3) - (blockSize1 + (int32_t) firstIndex);
     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
 
@@ -187,7 +187,7 @@ arm_status arm_conv_partial_f32(
      * ----------------------*/
 
     /* The first stage starts here */
-    while (blockSize1 > 0U)
+    while (blockSize1 > 0)
     {
       /* Accumulator is made zero for every iteration */
       sum = 0.0f;
@@ -539,7 +539,14 @@ arm_status arm_conv_partial_f32(
     count = srcBLen - 1U;
 
     /* Working pointer of inputA */
-    pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+    if (firstIndex > srcALen)
+    {
+       pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
+    }
+    else
+    {
+       pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+    }
     px = pSrc1;
 
     /* Working pointer of inputB */
@@ -550,7 +557,7 @@ arm_status arm_conv_partial_f32(
      * Stage3 process
      * ------------------*/
 
-    while (blockSize3 > 0U)
+    while (blockSize3 > 0)
     {
       /* Accumulator is made zero for every iteration */
       sum = 0.0f;
@@ -627,7 +634,6 @@ arm_status arm_conv_partial_f32(
         float32_t sum;                                 /* Accumulator */
         uint32_t i, j;                                 /* Loop counters */
         arm_status status;                             /* Status of Partial convolution */
-
   /* Check for range of output samples to be calculated */
   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
   {
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c
index 7166b577cfdc27872df37be77cd190adce56db7c..a6b9ecf3bea66b8aba4a2b10edcff37e9df43a6a 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_partial_fast_opt_q15.c
  * Description:  Fast Q15 Partial convolution
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -230,7 +230,7 @@ arm_status arm_conv_partial_fast_opt_q15(
         y1 = read_q15x2_ia ((q15_t **) &pIn2);
         y2 = read_q15x2_ia ((q15_t **) &pIn2);
 
-        /* multiply and accumlate */
+        /* multiply and accumulate */
         acc0 = __SMLAD(x1, y1, acc0);
         acc2 = __SMLAD(x2, y1, acc2);
 
@@ -241,13 +241,13 @@ arm_status arm_conv_partial_fast_opt_q15(
         x3 = __PKHBT(x1, x2, 0);
 #endif
 
-        /* multiply and accumlate */
+        /* multiply and accumulate */
         acc1 = __SMLADX(x3, y1, acc1);
 
         /* Read next two samples from scratch1 buffer */
         x1 = read_q15x2_ia (&pScr1);
 
-        /* multiply and accumlate */
+        /* multiply and accumulate */
         acc0 = __SMLAD(x2, y2, acc0);
         acc2 = __SMLAD(x1, y2, acc2);
 
@@ -269,7 +269,7 @@ arm_status arm_conv_partial_fast_opt_q15(
         x3 = __PKHBT(x1, x2, 0);
 #endif
 
-        /* multiply and accumlate */
+        /* multiply and accumulate */
         acc3 = __SMLADX(x3, y2, acc3);
 
         /* Decrement loop counter */
@@ -284,7 +284,7 @@ arm_status arm_conv_partial_fast_opt_q15(
 
       while (tapCnt > 0U)
       {
-        /* accumlate the results */
+        /* accumulate the results */
         acc0 += (*pScr1++ * *pIn2);
         acc1 += (*pScr1++ * *pIn2);
         acc2 += (*pScr1++ * *pIn2);
@@ -342,7 +342,7 @@ arm_status arm_conv_partial_fast_opt_q15(
         /* Read two samples from smaller buffer */
         y1 = read_q15x2_ia ((q15_t **) &pIn2);
 
-        /* multiply and accumlate */
+        /* multiply and accumulate */
         acc0 = __SMLAD(x1, y1, acc0);
 
         /* Decrement loop counter */
@@ -354,7 +354,7 @@ arm_status arm_conv_partial_fast_opt_q15(
       /* apply same above for remaining samples of smaller length sequence */
       while (tapCnt > 0U)
       {
-        /* accumlate the results */
+        /* accumulate the results */
         acc0 += (*pScr1++ * *pIn2++);
 
         /* Decrement loop counter */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c
index 535fbc793b837d605a1e5ecb9a70f8c913bada27..3e928180b9e0912207f10a3378749a729e14335f 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_partial_fast_q15.c
  * Description:  Fast Q15 Partial convolution
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -574,7 +574,14 @@ arm_status arm_conv_partial_fast_q15(
     count = srcBLen - 1U;
 
     /* Working pointer of inputA */
-    pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+    if (firstIndex > srcALen)
+    {
+       pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
+    }
+    else
+    {
+       pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+    }
     px = pSrc1;
 
     /* Working pointer of inputB */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c
index fe2e2bbbe907c505a3670445fa24c7003be1116f..65269bc95ea4379d4d64299b66ef5bff1e41f91a 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_partial_fast_q31.c
  * Description:  Fast Q31 Partial convolution
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -71,7 +71,7 @@ arm_status arm_conv_partial_fast_q31(
   const q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
         q31_t sum;                                     /* Accumulators */
         uint32_t j, k, count, check, blkCnt;
-        uint32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
+        int32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
         arm_status status;                             /* Status of Partial convolution */
 
 #if defined (ARM_MATH_LOOPUNROLL)
@@ -118,7 +118,7 @@ arm_status arm_conv_partial_fast_q31(
     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
-    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :  numPoints) : 0;
+    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :  (int32_t)numPoints) : 0;
     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
 
@@ -163,7 +163,7 @@ arm_status arm_conv_partial_fast_q31(
      * ----------------------*/
 
     /* The first stage starts here */
-    while (blockSize1 > 0U)
+    while (blockSize1 > 0)
     {
       /* Accumulator is made zero for every iteration */
       sum = 0;
@@ -526,7 +526,14 @@ arm_status arm_conv_partial_fast_q31(
     count = srcBLen - 1U;
 
     /* Working pointer of inputA */
-    pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+    if (firstIndex > srcALen)
+    {
+       pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
+    }
+    else
+    {
+       pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+    }
     px = pSrc1;
 
     /* Working pointer of inputB */
@@ -537,7 +544,7 @@ arm_status arm_conv_partial_fast_q31(
      * Stage3 process
      * ------------------*/
 
-    while (blockSize3 > 0U)
+    while (blockSize3 > 0)
     {
       /* Accumulator is made zero for every iteration */
       sum = 0;
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c
index 21999d2d97ca2c1cb2b7b438a824ccce920733b1..7a9d6b1892e389103fcc0fd74af6c77116bd9c85 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_partial_opt_q15.c
  * Description:  Partial convolution of Q15 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -231,7 +231,7 @@ arm_status arm_conv_partial_opt_q15(
         y1 = read_q15x2_ia ((q15_t **) &pIn2);
         y2 = read_q15x2_ia ((q15_t **) &pIn2);
 
-        /* multiply and accumlate */
+        /* multiply and accumulate */
         acc0 = __SMLALD(x1, y1, acc0);
         acc2 = __SMLALD(x2, y1, acc2);
 
@@ -242,13 +242,13 @@ arm_status arm_conv_partial_opt_q15(
         x3 = __PKHBT(x1, x2, 0);
 #endif
 
-        /* multiply and accumlate */
+        /* multiply and accumulate */
         acc1 = __SMLALDX(x3, y1, acc1);
 
         /* Read next two samples from scratch1 buffer */
         x1 = read_q15x2_ia (&pScr1);
 
-        /* multiply and accumlate */
+        /* multiply and accumulate */
         acc0 = __SMLALD(x2, y2, acc0);
         acc2 = __SMLALD(x1, y2, acc2);
 
@@ -284,7 +284,7 @@ arm_status arm_conv_partial_opt_q15(
 
       while (tapCnt > 0U)
       {
-        /* accumlate the results */
+        /* accumulate the results */
         acc0 += (*pScr1++ * *pIn2);
         acc1 += (*pScr1++ * *pIn2);
         acc2 += (*pScr1++ * *pIn2);
@@ -353,7 +353,7 @@ arm_status arm_conv_partial_opt_q15(
       /* apply same above for remaining samples of smaller length sequence */
       while (tapCnt > 0U)
       {
-        /* accumlate the results */
+        /* accumulate the results */
         acc0 += (*pScr1++ * *pIn2++);
 
         /* Decrement loop counter */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c
index 811f386eeb78ffa044c324333e046c38f6c6545c..a5c16c50a96ca7b3434375117034d8bba5eafff5 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_partial_opt_q7.c
  * Description:  Partial convolution of Q7 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -241,7 +241,7 @@ arm_status arm_conv_partial_opt_q7(
         /* Read four samples from smaller buffer */
         y1 = read_q15x2_ia (&pScr2);
 
-        /* multiply and accumlate */
+        /* multiply and accumulate */
         acc0 = __SMLAD(x1, y1, acc0);
         acc2 = __SMLAD(x2, y1, acc2);
 
@@ -252,7 +252,7 @@ arm_status arm_conv_partial_opt_q7(
         x3 = __PKHBT(x1, x2, 0);
 #endif
 
-        /* multiply and accumlate */
+        /* multiply and accumulate */
         acc1 = __SMLADX(x3, y1, acc1);
 
         /* Read next two samples from scratch1 buffer */
@@ -298,7 +298,7 @@ arm_status arm_conv_partial_opt_q7(
 
       while (tapCnt > 0U)
       {
-        /* accumlate the results */
+        /* accumulate the results */
         acc0 += (*pScr1++ * *pScr2);
         acc1 += (*pScr1++ * *pScr2);
         acc2 += (*pScr1++ * *pScr2);
@@ -360,7 +360,7 @@ arm_status arm_conv_partial_opt_q7(
       while (tapCnt > 0U)
       {
 
-        /* accumlate the results */
+        /* accumulate the results */
         acc0 += (*pScr1++ * *pScr2++);
 
         /* Decrement loop counter */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c
index eb24641794b7be1d0fc69f728fae8b04ad1b8dc4..cfab5168fe85e43fbe2f4c6b05d3799b51f0853b 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_partial_q15.c
  * Description:  Partial convolution of Q15 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -76,7 +76,7 @@ arm_status arm_conv_partial_q15(
   const q15_t *py;                                     /* Intermediate inputB pointer */
   const q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
         q31_t x0, x1, x2, x3, c0;                      /* Temporary input variables to hold state and coefficient values */
-        uint32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
+        int32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
         uint32_t j, k, count, blkCnt, check;
         arm_status status;                             /* Status of Partial convolution */
 
@@ -119,7 +119,7 @@ arm_status arm_conv_partial_q15(
     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
-    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :  numPoints) : 0;
+    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :  (int32_t)numPoints) : 0;
     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
 
@@ -168,7 +168,7 @@ arm_status arm_conv_partial_q15(
     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
 
     /* The first part of the stage starts here */
-    while ((count < 4U) && (blockSize1 > 0U))
+    while ((count < 4U) && (blockSize1 > 0))
     {
       /* Accumulator is made zero for every iteration */
       sum = 0;
@@ -206,7 +206,7 @@ arm_status arm_conv_partial_q15(
      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
     py = py - 1;
 
-    while (blockSize1 > 0U)
+    while (blockSize1 > 0)
     {
       /* Accumulator is made zero for every iteration */
       sum = 0;
@@ -580,7 +580,14 @@ arm_status arm_conv_partial_q15(
     count = srcBLen - 1U;
 
     /* Working pointer of inputA */
-    pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+    if (firstIndex > srcALen)
+    {
+       pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
+    }
+    else
+    {
+       pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+    }
     px = pSrc1;
 
     /* Working pointer of inputB */
@@ -599,7 +606,7 @@ arm_status arm_conv_partial_q15(
     /* The first part of the stage starts here */
     j = count >> 2U;
 
-    while ((j > 0U) && (blockSize3 > 0U))
+    while ((j > 0U) && (blockSize3 > 0))
     {
       /* Accumulator is made zero for every iteration */
       sum = 0;
@@ -660,7 +667,7 @@ arm_status arm_conv_partial_q15(
      * so pointer py is updated to read only one sample at a time */
     py = py + 1U;
 
-    while (blockSize3 > 0U)
+    while (blockSize3 > 0)
     {
       /* Accumulator is made zero for every iteration */
       sum = 0;
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c
index cfb1ad017a4ccfb028140b9bc8763470eb37e3c9..bcd52983c421a7a64af138171bbed961826d93e3 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_partial_q31.c
  * Description:  Partial convolution of Q31 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -74,7 +74,7 @@ arm_status arm_conv_partial_q31(
   const q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
         q63_t sum;                                     /* Accumulator */
         uint32_t j, k, count, blkCnt, check;
-        uint32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
+        int32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
         arm_status status;                             /* Status of Partial convolution */
 
 #if defined (ARM_MATH_LOOPUNROLL)
@@ -121,7 +121,7 @@ arm_status arm_conv_partial_q31(
     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
-    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :  numPoints) : 0;
+    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :  (int32_t)numPoints) : 0;
     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
 
@@ -166,7 +166,7 @@ arm_status arm_conv_partial_q31(
      * ----------------------*/
 
     /* The first stage starts here */
-    while (blockSize1 > 0U)
+    while (blockSize1 > 0)
     {
       /* Accumulator is made zero for every iteration */
       sum = 0;
@@ -499,7 +499,14 @@ arm_status arm_conv_partial_q31(
     count = srcBLen - 1U;
 
     /* Working pointer of inputA */
-    pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+    if (firstIndex > srcALen)
+    {
+       pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
+    }
+    else
+    {
+       pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+    }
     px = pSrc1;
 
     /* Working pointer of inputB */
@@ -510,7 +517,7 @@ arm_status arm_conv_partial_q31(
      * Stage3 process
      * ------------------*/
 
-    while (blockSize3 > 0U)
+    while (blockSize3 > 0)
     {
       /* Accumulator is made zero for every iteration */
       sum = 0;
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c
index bcfc51eb2a4b9276bdb70f93b775b458251794dc..116a8e696dab779c3247afaccd4abff5af1c0fc4 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_partial_q7.c
  * Description:  Partial convolution of Q7 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -74,7 +74,7 @@ arm_status arm_conv_partial_q7(
   const q7_t *pSrc1, *pSrc2;                           /* Intermediate pointers */
         q31_t sum;                                     /* Accumulator */
         uint32_t j, k, count, blkCnt, check;           /* Loop counters */
-        uint32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
+        int32_t blockSize1, blockSize2, blockSize3;    /* Loop counters */
         arm_status status;                             /* Status of Partial convolution */
 
 #if defined (ARM_MATH_LOOPUNROLL)
@@ -123,7 +123,7 @@ arm_status arm_conv_partial_q7(
     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
-    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : numPoints) : 0;
+    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t)numPoints) : 0;
     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
 
@@ -168,7 +168,7 @@ arm_status arm_conv_partial_q7(
      * ----------------------*/
 
     /* The first stage starts here */
-    while (blockSize1 > 0U)
+    while (blockSize1 > 0)
     {
       /* Accumulator is made zero for every iteration */
       sum = 0;
@@ -602,7 +602,14 @@ arm_status arm_conv_partial_q7(
     count = srcBLen - 1U;
 
     /* Working pointer of inputA */
-    pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+    if (firstIndex > srcALen)
+    {
+       pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
+    }
+    else
+    {
+       pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+    }
     px = pSrc1;
 
     /* Working pointer of inputB */
@@ -613,7 +620,7 @@ arm_status arm_conv_partial_q7(
      * Stage3 process
      * ------------------*/
 
-    while (blockSize3 > 0U)
+    while (blockSize3 > 0)
     {
       /* Accumulator is made zero for every iteration */
       sum = 0;
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q15.c
index e96c81ead3c897cc95de0eeb9207d7240326c190..0a28b95907b3c9083806cfdf858fb9120cdde8a8 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_q15.c
  * Description:  Convolution of Q15 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -58,7 +58,7 @@
   @remark
                    Refer to \ref arm_conv_opt_q15() for a faster implementation of this function using scratch buffers.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 #include "arm_vec_filtering.h"
 
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q31.c
index 88c6cc4b7b6cef14d8295ec28e541826249e15ec..ee37824fa8027663c56c04d761620190300d9fb5 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_q31.c
  * Description:  Convolution of Q31 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -59,7 +59,7 @@
   @remark
                    Refer to \ref arm_conv_fast_q31() for a faster but less precise implementation of this function.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 #include "arm_vec_filtering.h"
 
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q7.c b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q7.c
index 97d540a11b1fb98d6a224160ac1e987af0a800e7..e29a16dd1715b5838b744e0fdb26cc89712d62b8 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q7.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_conv_q7.c
  * Description:  Convolution of Q7 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -55,7 +55,7 @@
   @remark
                    Refer to \ref arm_conv_opt_q7() for a faster implementation of this function.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 
 #include "arm_vec_filtering.h"
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f16.c b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..f52439bb37f2d35412e8379977b22259dfbf8f1e
--- /dev/null
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f16.c
@@ -0,0 +1,1159 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_correlate_f16.c
+ * Description:  Correlation of floating-point sequences
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+/**
+  @ingroup groupFilters
+ */
+
+/**
+  @defgroup Corr Correlation
+
+  Correlation is a mathematical operation that is similar to convolution.
+  As with convolution, correlation uses two signals to produce a third signal.
+  The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
+  Correlation is commonly used to measure the similarity between two signals.
+  It has applications in pattern recognition, cryptanalysis, and searching.
+  The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
+  Fast versions of the Q15 and Q31 functions are also provided.
+
+  @par           Algorithm
+                   Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
+                   The convolution of the two signals is denoted by
+  <pre>
+      c[n] = a[n] * b[n]
+  </pre>
+                   In correlation, one of the signals is flipped in time
+  <pre>
+       c[n] = a[n] * b[-n]
+  </pre>
+  @par
+                   and this is mathematically defined as
+                   \image html CorrelateEquation.gif
+  @par
+                   The <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
+                   The result <code>c[n]</code> is of length <code>2 * max(srcALen, srcBLen) - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2)</code>.
+                   The output result is written to <code>pDst</code> and the calling function must allocate <code>2 * max(srcALen, srcBLen) - 1</code> words for the result.
+
+  @note
+                   The <code>pDst</code> should be initialized to all zeros before being used.
+
+  @par           Fixed-Point Behavior
+                   Correlation requires summing up a large number of intermediate products.
+                   As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
+                   Refer to the function specific documentation below for further details of the particular algorithm used.
+
+  @par           Fast Versions
+                   Fast versions are supported for Q31 and Q15.  They take fewer cycles than the standard Q31 and Q15 correlation functions, but they require
+                   the input signals to be scaled down to avoid intermediate overflows.
+
+  @par           Opt Versions
+                   Opt versions are supported for Q15 and Q7.  They use an internal scratch buffer to achieve better optimisation.
+                   These versions are optimised for cycle count but consume more memory (scratch memory) than the standard Q15 and Q7 versions of correlate.
+ */
+
+/**
+  @addtogroup Corr
+  @{
+ */
+
+/**
+  @brief         Correlation of half-precision floating-point sequences.
+  @param[in]     pSrcA      points to the first input sequence
+  @param[in]     srcALen    length of the first input sequence
+  @param[in]     pSrcB      points to the second input sequence
+  @param[in]     srcBLen    length of the second input sequence
+  @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_filtering.h"
+
+#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_F16(acc0, acc1, pX, pY, count)                            \
+{   /* acc0 = sum(x[i]*y[i], i < count);  acc1 = sum(x[i]*y[i-1], i <= count) */                    \
+    float16_t const *pSrcX, *pSrcY;                                                                 \
+    f16x8_t   acc0Vec, acc1Vec, xVec, yVec;                                                       \
+    uint32_t    k;                                                                                  \
+    /* Per-lane partial sums; each vector is reduced to a scalar at the end of the macro. */        \
+    acc0Vec = vdupq_n_f16(0.0f);                                                                    \
+    acc1Vec = vdupq_n_f16(0.0f);                                                                    \
+    pSrcX = (float16_t const *) (pX);                                                               \
+    pSrcY = (float16_t const *) (pY);                                                               \
+    k = (count) >> 3;                                                                               \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        xVec = vld1q(pSrcX); pSrcX += 8;                                                \
+        yVec = vldrhq_f16(&pSrcY[-1]);                                                              \
+        acc1Vec = vfmaq_f16(acc1Vec, xVec, yVec);                                                   \
+        yVec = vld1q(pSrcY); pSrcY += 8;                                                \
+        acc0Vec = vfmaq_f16(acc0Vec, xVec, yVec);                                                   \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    k = (count) % 0x8U;                                                                             \
+    /* Tail: always executed (no k > 0 guard) because acc1 needs k+1 lanes even when k == 0. */     \
+    /* acc1 requires 1 additional sample  */                                                        \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
+    mve_pred16_t p0 = vctp16q(k+1);                                                      \
+    xVec = vld1q(pSrcX); pSrcX += 8;                                                    \
+    yVec = vldrhq_f16(&pSrcY[-1]);                                                                  \
+    acc1Vec = vfmaq_m_f16(acc1Vec, xVec, yVec,p0);                                                  \
+    /* acc0 requires the exact number of samples */                                                 \
+    /* disable extra lanes in final MAC computation  */                                             \
+    p0 = vctp16q(k);                                                                     \
+    yVec = vld1q(pSrcY); pSrcY += 8;                                                    \
+    acc0Vec = vfmaq_m_f16(acc0Vec, xVec, yVec,p0);                                                  \
+    /* NOTE(review): tail loads are unpredicated (full 8 lanes); only the MACs are masked. */       \
+    acc0 = vecAddAcrossF16Mve(acc0Vec);                                                             \
+    acc1 = vecAddAcrossF16Mve(acc1Vec);                                                             \
+}
+
+#define MVE_INTR_CORR_SINGLE_F16(acc, pX, pY, count)                                                \
+{   /* acc = sum(x[i]*y[i], i < count) */                                                           \
+    float16_t const *pSrcX, *pSrcY;                                                                 \
+    f16x8_t   accVec, xVec, yVec;                                                                 \
+    uint32_t    k;                                                                                  \
+    /* k is 32-bit like in the sibling macros so that count >> 3 is never truncated. */             \
+    accVec = vdupq_n_f16(0.0f);                                                                     \
+    pSrcX = (float16_t const *) (pX);                                                               \
+    pSrcY = (float16_t const *) (pY);                                                               \
+    k = (count) >> 3;                                                                               \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        yVec = vld1q(pSrcY); pSrcY += 8;                                                \
+        xVec = vld1q(pSrcX); pSrcX += 8;                                                \
+        accVec = vfmaq(accVec, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* Tail: remaining (count % 8) samples, handled with a lane predicate */                        \
+    k = (count) % 0x8U;                                                                             \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp16q(k);                                                    \
+        yVec = vld1q(pSrcY); pSrcY += 8;                                                \
+        xVec = vld1q(pSrcX); pSrcX += 8;                                                \
+        accVec = vfmaq_m(accVec, xVec, yVec, p0);                                                   \
+    }                                                                                               \
+    /* NOTE(review): tail loads are unpredicated (full 8 lanes); only the MAC is masked. */         \
+  acc = vecAddAcrossF16Mve(accVec);                                                                 \
+}
+
+#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_F16(acc0, acc1, acc2, acc3, pX, pY, count)              \
+{   /* accJ = sum(x[i+J]*y[i], i < count), for J = 0..3 */                                          \
+    float16_t const *pSrcX, *pSrcY;                                                                 \
+    f16x8_t   acc0Vec, acc1Vec, acc2Vec, acc3Vec, xVec, yVec;                                     \
+    uint32_t    k;                                                                                  \
+    /* One vector accumulator per x offset (0..3); each is reduced to a scalar at the end. */       \
+    acc0Vec = vdupq_n_f16(0.0f);                                                                    \
+    acc1Vec = vdupq_n_f16(0.0f);                                                                    \
+    acc2Vec = vdupq_n_f16(0.0f);                                                                    \
+    acc3Vec = vdupq_n_f16(0.0f);                                                                    \
+    pSrcX = (float16_t const *) (pX);                                                               \
+    pSrcY = (float16_t const *) (pY);                                                               \
+    k = (count) >> 3;                                                                               \
+                                                                                                    \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        yVec = vld1q(pSrcY); pSrcY += 8;                                                \
+        xVec = vldrhq_f16(&pSrcX[1]);                                                               \
+        acc1Vec = vfmaq_f16(acc1Vec, xVec, yVec);                                                   \
+        xVec = vldrhq_f16(&pSrcX[2]);                                                               \
+        acc2Vec = vfmaq_f16(acc2Vec, xVec, yVec);                                                   \
+        xVec = vldrhq_f16(&pSrcX[3]);                                                               \
+        acc3Vec = vfmaq_f16(acc3Vec, xVec, yVec);                                                   \
+        xVec = vld1q(pSrcX); pSrcX += 8;                                                \
+        acc0Vec = vfmaq_f16(acc0Vec, xVec, yVec);                                                   \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* Tail: remaining (count % 8) samples, handled with a lane predicate */                        \
+    k = (count) % 0x8U;                                                                             \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp16q(k);                                                    \
+        yVec = vld1q(pSrcY); pSrcY += 8;                                                \
+        xVec = vldrhq_f16(&pSrcX[1]);                                                               \
+        acc1Vec = vfmaq_m_f16(acc1Vec, xVec, yVec, p0);                                             \
+        xVec = vldrhq_f16(&pSrcX[2]);                                                               \
+        acc2Vec = vfmaq_m_f16(acc2Vec, xVec, yVec, p0);                                             \
+        xVec = vldrhq_f16(&pSrcX[3]);                                                               \
+        acc3Vec = vfmaq_m_f16(acc3Vec, xVec, yVec, p0);                                             \
+        xVec = vld1q(pSrcX); pSrcX += 8;                                                \
+        acc0Vec = vfmaq_m_f16(acc0Vec, xVec, yVec, p0);                                             \
+    }                                                                                               \
+    /* NOTE(review): tail loads are unpredicated (full 8 lanes); only the MACs are masked. */       \
+    acc0 = vecAddAcrossF16Mve(acc0Vec);                                                             \
+    acc1 = vecAddAcrossF16Mve(acc1Vec);                                                             \
+    acc2 = vecAddAcrossF16Mve(acc2Vec);                                                             \
+    acc3 = vecAddAcrossF16Mve(acc3Vec);                                                             \
+}
+
+#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_F16(acc0, acc1, pX, pY, count)                          \
+{                                                                                                   \
+    float16_t const *pSrcX, *pSrcY;                                                                 \
+    f16x8_t   acc0Vec, acc1Vec, xVec, yVec;                                                         \
+    uint32_t    k;                                                                                  \
+    /* acc0 = sum(pX[n] * pY[n]), acc1 = sum(pX[n + 1] * pY[n]) for n in [0, count) */              \
+    acc0Vec = vdupq_n_f16(0.0f);                                                                    \
+    acc1Vec = vdupq_n_f16(0.0f);                                                                    \
+    pSrcX = (float16_t const *) (pX);                                                               \
+    pSrcY = (float16_t const *) (pY);                                                               \
+    k = (count) >> 3;                                                                               \
+    /* main loop: 8 samples per iteration, the same y vector feeds both accumulators */             \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        yVec = vld1q(pSrcY); pSrcY += 8;                                                            \
+        xVec = vldrhq_f16(&pSrcX[1]);                                                               \
+        acc1Vec = vfmaq_f16(acc1Vec, xVec, yVec);                                                   \
+        xVec = vld1q(pSrcX); pSrcX += 8;                                                            \
+        acc0Vec = vfmaq_f16(acc0Vec, xVec, yVec);                                                   \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* tail: the remaining (count % 8) samples, FMA predicated on the first k lanes */              \
+    k = (count) % 0x8U;                                                                             \
+    if (k > 0U)                                                                                     \
+    {                                                                                               \
+        mve_pred16_t p0 = vctp16q(k);                                                               \
+        yVec = vld1q(pSrcY); pSrcY += 8;                                                            \
+        xVec = vldrhq_f16(&pSrcX[1]);                                                               \
+        acc1Vec = vfmaq_m_f16(acc1Vec, xVec, yVec, p0);                                             \
+        xVec = vld1q(pSrcX); pSrcX += 8;                                                            \
+        acc0Vec = vfmaq_m_f16(acc0Vec, xVec, yVec, p0);                                             \
+    }                                                                                               \
+    /* NOTE(review): tail loads are not predicated (full vector read) - confirm over-read is OK */  \
+    acc0 = vecAddAcrossF16Mve(acc0Vec);                                                             \
+    acc1 = vecAddAcrossF16Mve(acc1Vec);                                                             \
+}
+
+#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_F16(acc0, acc1, pX, pY, count)                            \
+{                                                                                                   \
+    float16_t const *pSrcX, *pSrcY;                                                                 \
+    f16x8_t   acc0Vec, acc1Vec, xVec, yVec;                                                         \
+    uint32_t    k;                                                                                  \
+    /* acc0 = sum of count products pX[n] * pY[n]; acc1 = sum of (count - 1) products pX[n+1] * pY[n] */ \
+    acc0Vec = vdupq_n_f16(0.0f);                                                                    \
+    acc1Vec = vdupq_n_f16(0.0f);                                                                    \
+    pSrcX = (float16_t const *) (pX);                                                               \
+    pSrcY = (float16_t const *) (pY);                                                               \
+    k = ((count) - 1) >> 3;                                                                         \
+    /* main loop: full vectors of 8 samples covering the first (count - 1) / 8 groups */            \
+    while (k > 0U)                                                                                  \
+    {                                                                                               \
+        yVec = vld1q(pSrcY); pSrcY += 8;                                                            \
+        xVec = vldrhq_f16(&pSrcX[1]);                                                               \
+        acc1Vec = vfmaq_f16(acc1Vec, xVec, yVec);                                                   \
+        xVec = vld1q(pSrcX); pSrcX += 8;                                                            \
+        acc0Vec = vfmaq_f16(acc0Vec, xVec, yVec);                                                   \
+        /*  Decrement the loop counter   */                                                         \
+        k--;                                                                                        \
+    }                                                                                               \
+    /* use predication to finalize MAC sum */                                                       \
+    /* acc1 requires exact number of samples (count-1)  */                                          \
+    /* disable extra lanes in final MAC computation  */                                             \
+    k = ((count) - 1) % 0x8U;                                                                       \
+    mve_pred16_t p0 = vctp16q(k);                                                                   \
+    yVec = vld1q(pSrcY); pSrcY += 8;                                                                \
+    xVec = vldrhq_f16(&pSrcX[1]);                                                                   \
+    acc1Vec = vfmaq_m_f16(acc1Vec, xVec, yVec, p0);                                                 \
+    /* acc0 requires 1 additional sample  (count) */                                                \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
+    p0 = vctp16q(k+1);                                                                              \
+    xVec = vld1q(pSrcX); pSrcX += 8;                                                                \
+    acc0Vec = vfmaq_m_f16(acc0Vec, xVec, yVec, p0);                                                 \
+    /* with count == 1 no lane is enabled for acc1, so acc1 is the reduction of a zero vector */    \
+    acc0 = vecAddAcrossF16Mve(acc0Vec);                                                             \
+    acc1 = vecAddAcrossF16Mve(acc1Vec);                                                             \
+}
+
+
+
+void arm_correlate_f16(
+  const float16_t * pSrcA,   /* first input sequence (only read)            */
+        uint32_t srcALen,    /* number of samples in pSrcA                  */
+  const float16_t * pSrcB,   /* second input sequence (only read)           */
+        uint32_t srcBLen,    /* number of samples in pSrcB                  */
+        float16_t * pDst)    /* receives srcALen + srcBLen - 1 results      */
+{
+    float16_t *pIn1 = (float16_t *)pSrcA;                    /* inputA pointer               */
+    float16_t *pIn2 = (float16_t *)pSrcB + (srcBLen - 1U);   /* inputB pointer               */
+    float16_t *pX;
+    float16_t *pY;
+    float16_t *pA;
+    float16_t *pB;
+    int32_t   i = 0, j = 0;     /* loop counters */
+    int32_t   inv = 2;          /* Reverse order flag */
+    uint32_t  tot = 0U;         /* Length */
+    int32_t   block1, block2, block3;
+    int32_t   incr;
+    /* MVE correlation: the shorter input slides over the longer one, results go through pDst */
+    tot = ((srcALen + srcBLen) - 2U);
+    if (srcALen > srcBLen)
+    {
+        /*
+         * Calculating the number of zeros to be padded to the output
+         */
+        j = srcALen - srcBLen;
+        /*
+         * Initialize the pointer after zero padding
+         */
+        pDst += j;
+    }
+    else if (srcALen < srcBLen)
+    {
+        /*
+         * Initialization to inputB pointer
+         */
+        pIn1 = (float16_t *)pSrcB;
+        /*
+         * Initialization to the end of inputA pointer
+         */
+        pIn2 = (float16_t *)pSrcA + (srcALen - 1U);
+        /*
+         * Initialisation of the pointer after zero padding
+         */
+        pDst = pDst + tot;
+        /*
+         * Swapping the lengths
+         */
+
+        j = srcALen;
+        srcALen = srcBLen;
+        srcBLen = j;
+        /*
+         * Setting the reverse flag
+         */
+        inv = -2;
+
+    }
+    /* block1 / block3: outputs with growing / shrinking MAC count, block2: full-length outputs */
+    block1 = srcBLen - 1;
+    block2 = srcALen - srcBLen + 1;
+    block3 = srcBLen - 1;
+
+    pA = pIn1;
+    pB = pIn2;
+    incr = inv / 2;             /* +1 writes pDst forwards, -1 writes it backwards */
+    /* Stage 1: MAC count grows from 1 to srcBLen - 1, two outputs per iteration */
+    for (i = 0; i <= block1 - 2; i += 2)
+    {
+        uint32_t  count = i + 1;
+        _Float16 acc0;
+        _Float16 acc1;
+        /*
+         * compute 2 accumulators per loop
+         * size is incrementing for second accumulator
+         * Y pointer is decrementing for second accumulator
+         */
+        pX = pA;
+        pY = pB;
+        MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_F16(acc0, acc1, pX, pY, count);
+
+        *pDst = acc0;
+        pDst += incr;
+        *pDst = acc1;
+        pDst += incr;
+        pB -= 2;
+    }
+    for (; i < block1; i++)
+    {
+        uint32_t  count = i + 1;
+        _Float16 acc;
+
+        pX = pA;
+        pY = pB;
+        MVE_INTR_CORR_SINGLE_F16(acc, pX, pY, count);
+
+        *pDst = acc;
+        pDst += incr;
+        pB--;
+    }
+    /* Stage 2: every output uses srcBLen MACs; process 4, then 2, then 1 output(s) */
+    for (i = 0; i <= block2 - 4; i += 4)
+    {
+        _Float16 acc0;
+        _Float16 acc1;
+        _Float16 acc2;
+        _Float16 acc3;
+
+        pX = pA;
+        pY = pB;
+        /*
+         * compute 4 accumulators per loop
+         * size is fixed for all accumulators
+         * X pointer is incrementing for successive accumulators
+         */
+        MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_F16(acc0, acc1, acc2, acc3, pX, pY, srcBLen);
+
+        *pDst = acc0;
+        pDst += incr;
+        *pDst = acc1;
+        pDst += incr;
+        *pDst = acc2;
+        pDst += incr;
+        *pDst = acc3;
+        pDst += incr;
+        pA += 4;
+    }
+
+    for (; i <= block2 - 2; i += 2)
+    {
+        _Float16 acc0;
+        _Float16 acc1;
+
+        pX = pA;
+        pY = pB;
+        /*
+         * compute 2 accumulators per loop
+         * size is fixed for all accumulators
+         * X pointer is incrementing for second accumulator
+         */
+        MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_F16(acc0, acc1, pX, pY, srcBLen);
+
+        *pDst = acc0;
+        pDst += incr;
+        *pDst = acc1;
+        pDst += incr;
+        pA += 2;
+    }
+
+    if (block2 & 1)
+    {
+        _Float16 acc;
+
+        pX = pA;
+        pY = pB;
+        MVE_INTR_CORR_SINGLE_F16(acc, pX, pY, srcBLen);
+
+        *pDst = acc;
+        pDst += incr;
+        pA++;
+    }
+    /* Stage 3: MAC count shrinks from srcBLen - 1 to 1; pairs only while two outputs remain */
+    for (i = block3 - 1; i > 0; i -= 2)
+    {
+        /* i > 0 keeps count >= 2, so the second accumulator always has at least one MAC */
+        uint32_t  count = (i + 1);
+        _Float16 acc0;
+        _Float16 acc1;
+
+        pX = pA;
+        pY = pB;
+        /*
+         * compute 2 accumulators per loop
+         * size is decrementing for second accumulator
+         * X pointer is incrementing for second accumulator
+         */
+        MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_F16(acc0, acc1, pX, pY, count);
+
+        *pDst = acc0;
+        pDst += incr;
+        *pDst = acc1;
+        pDst += incr;
+        pA += 2;
+
+    }
+    for (; i >= 0; i--)
+    {
+        uint32_t  count = (i + 1);
+        _Float16 acc;
+        /* reached with i == 0 when block3 is odd: emits the final single-MAC output */
+        pX = pA;
+        pY = pB;
+        MVE_INTR_CORR_SINGLE_F16(acc, pX, pY, count);
+
+        *pDst = acc;
+        pDst += incr;
+        pA++;
+    }
+}
+
+#else
+void arm_correlate_f16(
+  const float16_t * pSrcA,
+        uint32_t srcALen,
+  const float16_t * pSrcB,
+        uint32_t srcBLen,
+        float16_t * pDst)
+{
+  /* Scalar correlation of pSrcA and pSrcB; srcALen + srcBLen - 1 samples are written to pDst. */
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_AUTOVECTORIZE)
+  
+  const float16_t *pIn1;                               /* InputA pointer */
+  const float16_t *pIn2;                               /* InputB pointer */
+        float16_t *pOut = pDst;                        /* Output pointer */
+  const float16_t *px;                                 /* Intermediate inputA pointer */
+  const float16_t *py;                                 /* Intermediate inputB pointer */
+  const float16_t *pSrc1;
+        _Float16 sum;
+        uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
+        uint32_t j, k, count, blkCnt;                  /* Loop counters */
+        uint32_t outBlockSize;                         /* Loop counter */
+        int32_t inc = 1;                               /* Destination address modifier */
+
+#if defined (ARM_MATH_LOOPUNROLL) 
+    _Float16 acc0, acc1, acc2, acc3,c0;                    /* Accumulators */
+    _Float16 x0, x1, x2, x3;                        /* temporary variables for holding input and coefficient values */
+#endif
+
+  /* The algorithm implementation is based on the lengths of the inputs. */
+  /* srcB is always made to slide across srcA. */
+  /* So srcBLen is always considered as shorter or equal to srcALen */
+  /* But CORR(x, y) is reverse of CORR(y, x) */
+  /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+  /* and the destination pointer modifier, inc is set to -1 */
+  /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+  /* But to improve the performance,
+   * we assume zeroes in the output instead of zero padding either of the inputs */
+  /* If srcALen > srcBLen,
+   * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
+  /* If srcALen < srcBLen,
+   * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
+  if (srcALen >= srcBLen)
+  {
+    /* Initialization of inputA pointer */
+    pIn1 = pSrcA;
+
+    /* Initialization of inputB pointer */
+    pIn2 = pSrcB;
+
+    /* Number of output samples is calculated */
+    outBlockSize = (2U * srcALen) - 1U;
+
+    /* When srcALen > srcBLen, zero padding has to be done to srcB
+     * to make their lengths equal.
+     * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+     * number of output samples are made zero */
+    j = outBlockSize - (srcALen + (srcBLen - 1U));
+
+    /* Updating the pointer position to non zero value */
+    pOut += j;
+  }
+  else
+  {
+    /* Initialization of inputA pointer */
+    pIn1 = pSrcB;
+
+    /* Initialization of inputB pointer */
+    pIn2 = pSrcA;
+
+    /* srcBLen is always considered as shorter or equal to srcALen */
+    j = srcBLen;
+    srcBLen = srcALen;
+    srcALen = j;
+
+    /* CORR(x, y) = Reverse order(CORR(y, x)) */
+    /* Hence set the destination pointer to point to the last output sample */
+    pOut = pDst + ((srcALen + srcBLen) - 2U);
+
+    /* Destination address modifier is set to -1 */
+    inc = -1;
+  }
+
+  /* The function is internally
+   * divided into three stages according to the number of multiplications that has to be
+   * taken place between inputA samples and inputB samples. In the first stage of the
+   * algorithm, the multiplications increase by one for every iteration.
+   * In the second stage of the algorithm, srcBLen number of multiplications are done.
+   * In the third stage of the algorithm, the multiplications decrease by one
+   * for every iteration. */
+
+  /* The algorithm is implemented in three stages.
+     The loop counters of each stage is initiated here. */
+  blockSize1 = srcBLen - 1U;
+  blockSize2 = srcALen - (srcBLen - 1U);
+  blockSize3 = blockSize1;
+
+  /* --------------------------
+   * Initializations of stage1
+   * -------------------------*/
+
+  /* sum = x[0] * y[srcBlen - 1]
+   * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
+   * ....
+   * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+   */
+
+  /* In this stage the MAC operations are increased by 1 for every iteration.
+     The count variable holds the number of MAC operations performed */
+  count = 1U;
+
+  /* Working pointer of inputA */
+  px = pIn1;
+
+  /* Working pointer of inputB */
+  pSrc1 = pIn2 + (srcBLen - 1U);
+  py = pSrc1;
+
+  /* ------------------------
+   * Stage1 process
+   * ----------------------*/
+
+  /* The first stage starts here */
+  while (blockSize1 > 0U)
+  {
+    /* Accumulator is made zero for every iteration */
+    sum = 0.0f16;
+
+#if defined (ARM_MATH_LOOPUNROLL) 
+
+    /* Loop unrolling: Compute 4 MACs at a time */
+    k = count >> 2U;
+
+
+    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
+     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+    while (k > 0U)
+    {
+      /* x[0] * y[srcBLen - 4] */
+      sum += *px++ * *py++;
+
+      /* x[1] * y[srcBLen - 3] */
+      sum += *px++ * *py++;
+
+      /* x[2] * y[srcBLen - 2] */
+      sum += *px++ * *py++;
+
+      /* x[3] * y[srcBLen - 1] */
+      sum += *px++ * *py++;
+
+      /* Decrement loop counter */
+      k--;
+    }
+
+    /* Loop unrolling: Compute remaining MACs */
+    k = count % 0x4U;
+
+#else
+
+    /* Initialize k with number of samples */
+    k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL)  */
+
+    while (k > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      /* x[0] * y[srcBLen - 1] */
+      sum += *px++ * *py++;
+
+      /* Decrement loop counter */
+      k--;
+    }
+
+    /* Store the result in the accumulator in the destination buffer. */
+    *pOut = sum;
+    /* Destination pointer is updated according to the address modifier, inc */
+    pOut += inc;
+
+    /* Update the inputA and inputB pointers for next MAC calculation */
+    py = pSrc1 - count;
+    px = pIn1;
+
+    /* Increment MAC count */
+    count++;
+
+    /* Decrement loop counter */
+    blockSize1--;
+  }
+
+  /* --------------------------
+   * Initializations of stage2
+   * ------------------------*/
+
+  /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+   * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen]   * y[srcBLen-1]
+   * ....
+   * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+   */
+
+  /* Working pointer of inputA */
+  px = pIn1;
+
+  /* Working pointer of inputB */
+  py = pIn2;
+
+  /* count is index by which the pointer pIn1 to be incremented */
+  count = 0U;
+
+  /* -------------------
+   * Stage2 process
+   * ------------------*/
+
+  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+   * So, to loop unroll over blockSize2,
+   * srcBLen should be greater than or equal to 4 */
+  if (srcBLen >= 4U)
+  {
+#if defined (ARM_MATH_LOOPUNROLL) 
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    blkCnt = blockSize2 >> 2U;
+
+    while (blkCnt > 0U)
+    {
+      /* Set all accumulators to zero */
+      acc0 = 0.0f16;
+      acc1 = 0.0f16;
+      acc2 = 0.0f16;
+      acc3 = 0.0f16;
+
+
+      /* read x[0], x[1], x[2] samples */
+      x0 = *px++;
+      x1 = *px++;
+      x2 = *px++;
+
+      /* Apply loop unrolling and compute 4 MACs simultaneously. */
+      k = srcBLen >> 2U;
+
+      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
+       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+      do
+      {
+        /* Read y[0] sample */
+        c0 = *(py++);
+        /* Read x[3] sample */
+        x3 = *(px++);
+
+        /* Perform the multiply-accumulate */
+        /* acc0 +=  x[0] * y[0] */
+        acc0 += x0 * c0;
+        /* acc1 +=  x[1] * y[0] */
+        acc1 += x1 * c0;
+        /* acc2 +=  x[2] * y[0] */
+        acc2 += x2 * c0;
+        /* acc3 +=  x[3] * y[0] */
+        acc3 += x3 * c0;
+
+        /* Read y[1] sample */
+        c0 = *(py++);
+        /* Read x[4] sample */
+        x0 = *(px++);
+
+        /* Perform the multiply-accumulate */
+        /* acc0 +=  x[1] * y[1] */
+        acc0 += x1 * c0;
+        /* acc1 +=  x[2] * y[1] */
+        acc1 += x2 * c0;
+        /* acc2 +=  x[3] * y[1] */
+        acc2 += x3 * c0;
+        /* acc3 +=  x[4] * y[1] */
+        acc3 += x0 * c0;
+
+        /* Read y[2] sample */
+        c0 = *(py++);
+        /* Read x[5] sample */
+        x1 = *(px++);
+
+        /* Perform the multiply-accumulate */
+        /* acc0 +=  x[2] * y[2] */
+        acc0 += x2 * c0;
+        /* acc1 +=  x[3] * y[2] */
+        acc1 += x3 * c0;
+        /* acc2 +=  x[4] * y[2] */
+        acc2 += x0 * c0;
+        /* acc3 +=  x[5] * y[2] */
+        acc3 += x1 * c0;
+
+        /* Read y[3] sample */
+        c0 = *(py++);
+        /* Read x[6] sample */
+        x2 = *(px++);
+
+        /* Perform the multiply-accumulate */
+        /* acc0 +=  x[3] * y[3] */
+        acc0 += x3 * c0;
+        /* acc1 +=  x[4] * y[3] */
+        acc1 += x0 * c0;
+        /* acc2 +=  x[5] * y[3] */
+        acc2 += x1 * c0;
+        /* acc3 +=  x[6] * y[3] */
+        acc3 += x2 * c0;
+
+      } while (--k);
+
+      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+       ** No loop unrolling is used. */
+      k = srcBLen % 0x4U;
+
+      while (k > 0U)
+      {
+        /* Read y[4] sample */
+        c0 = *(py++);
+        /* Read x[7] sample */
+        x3 = *(px++);
+
+        /* Perform the multiply-accumulate */
+        /* acc0 +=  x[4] * y[4] */
+        acc0 += x0 * c0;
+        /* acc1 +=  x[5] * y[4] */
+        acc1 += x1 * c0;
+        /* acc2 +=  x[6] * y[4] */
+        acc2 += x2 * c0;
+        /* acc3 +=  x[7] * y[4] */
+        acc3 += x3 * c0;
+
+        /* Reuse the present samples for the next MAC */
+        x0 = x1;
+        x1 = x2;
+        x2 = x3;
+
+        /* Decrement the loop counter */
+        k--;
+      }
+
+      /* Store the result in the accumulator in the destination buffer. */
+      *pOut = acc0;
+      /* Destination pointer is updated according to the address modifier, inc */
+      pOut += inc;
+
+      *pOut = acc1;
+      pOut += inc;
+
+      *pOut = acc2;
+      pOut += inc;
+
+      *pOut = acc3;
+      pOut += inc;
+
+      /* Increment the pointer pIn1 index, count by 4 */
+      count += 4U;
+
+      /* Update the inputA and inputB pointers for next MAC calculation */
+      px = pIn1 + count;
+      py = pIn2;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining outputs */
+    blkCnt = blockSize2 % 0x4U;
+
+#else
+
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL)  */
+
+    while (blkCnt > 0U)
+    {
+      /* Accumulator is made zero for every iteration */
+      sum = 0.0f16;
+
+#if defined (ARM_MATH_LOOPUNROLL) 
+
+    /* Loop unrolling: Compute 4 MACs at a time */
+      k = srcBLen >> 2U;
+
+
+      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
+       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+      while (k > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum += *px++ * *py++;
+        sum += *px++ * *py++;
+        sum += *px++ * *py++;
+        sum += *px++ * *py++;
+
+        /* Decrement loop counter */
+        k--;
+      }
+      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+       ** No loop unrolling is used. */
+      k = srcBLen % 0x4U;
+#else
+
+      /* Initialize k with number of samples */
+      k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+      while (k > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum += *px++ * *py++;
+
+        /* Decrement the loop counter */
+        k--;
+      }
+
+      /* Store the result in the accumulator in the destination buffer. */
+      *pOut = sum;
+
+      /* Destination pointer is updated according to the address modifier, inc */
+      pOut += inc;
+
+      /* Increment the pointer pIn1 index, count by 1 */
+      count++;
+
+      /* Update the inputA and inputB pointers for next MAC calculation */
+      px = pIn1 + count;
+      py = pIn2;
+
+      /* Decrement the loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    /* If srcBLen is less than 4,
+     * the blockSize2 loop is not unrolled by 4 */
+    blkCnt = blockSize2;
+
+    while (blkCnt > 0U)
+    {
+      /* Accumulator is made zero for every iteration */
+      sum = 0.0f16;
+
+      /* Loop over srcBLen */
+      k = srcBLen;
+
+      while (k > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum += *px++ * *py++;
+
+        /* Decrement the loop counter */
+        k--;
+      }
+
+      /* Store the result in the accumulator in the destination buffer. */
+      *pOut = sum;
+      /* Destination pointer is updated according to the address modifier, inc */
+      pOut += inc;
+
+      /* Increment the pointer pIn1 index, count by 1 */
+      count++;
+
+      /* Update the inputA and inputB pointers for next MAC calculation */
+      px = pIn1 + count;
+      py = pIn2;
+
+      /* Decrement the loop counter */
+      blkCnt--;
+    }
+  }
+
+
+  /* --------------------------
+   * Initializations of stage3
+   * -------------------------*/
+
+  /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
+   * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]
+   * ....
+   * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+   * sum +=  x[srcALen-1] * y[0]
+   */
+
+  /* In this stage the MAC operations are decreased by 1 for every iteration.
+     The count variable holds the number of MAC operations performed */
+  count = srcBLen - 1U;
+
+  /* Working pointer of inputA */
+  pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
+  px = pSrc1;
+
+  /* Working pointer of inputB */
+  py = pIn2;
+
+  /* -------------------
+   * Stage3 process
+   * ------------------*/
+
+  while (blockSize3 > 0U)
+  {
+    /* Accumulator is made zero for every iteration */
+    sum = 0.0f16;
+
+#if defined (ARM_MATH_LOOPUNROLL) 
+
+    /* Loop unrolling: Compute 4 MACs at a time */
+    k = count >> 2U;
+
+
+    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
+     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+    while (k > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      /* sum += x[srcALen - srcBLen + 1] * y[0] */
+      sum += *px++ * *py++;
+
+      /* sum += x[srcALen - srcBLen + 2] * y[1] */
+      sum += *px++ * *py++;
+
+      /* sum += x[srcALen - srcBLen + 3] * y[2] */
+      sum += *px++ * *py++;
+
+      /* sum += x[srcALen - srcBLen + 4] * y[3] */
+      sum += *px++ * *py++;
+
+      /* Decrement loop counter */
+      k--;
+    }
+
+    /* Loop unrolling: Compute remaining MACs */
+    k = count % 0x4U;
+
+#else
+
+    /* Initialize k with number of samples */
+    k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL)  */
+
+    while (k > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      sum += *px++ * *py++;
+
+      /* Decrement loop counter */
+      k--;
+    }
+
+    /* Store the result in the accumulator in the destination buffer. */
+    *pOut = sum;
+    /* Destination pointer is updated according to the address modifier, inc */
+    pOut += inc;
+
+    /* Update the inputA and inputB pointers for next MAC calculation */
+    px = ++pSrc1;
+    py = pIn2;
+
+    /* Decrement MAC count */
+    count--;
+
+    /* Decrement the loop counter */
+    blockSize3--;
+  }
+
+#else
+/* generic version, used when ARM_MATH_DSP is not defined or ARM_MATH_AUTOVECTORIZE is defined */
+
+  const float16_t *pIn1 = pSrcA;                       /* inputA pointer */
+  const float16_t *pIn2 = pSrcB + (srcBLen - 1U);      /* inputB pointer */
+        _Float16 sum;                                 /* Accumulator */
+        uint32_t i = 0U, j;                            /* Loop counters */
+        uint32_t inv = 0U;                             /* Reverse order flag */
+        uint32_t tot = 0U;                             /* Length */
+
+  /* The algorithm implementation is based on the lengths of the inputs. */
+  /* srcB is always made to slide across srcA. */
+  /* So srcBLen is always considered as shorter or equal to srcALen */
+  /* But CORR(x, y) is reverse of CORR(y, x) */
+  /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+  /* and a variable, inv is set to 1 */
+  /* If lengths are not equal then zero pad has to be done to  make the two
+   * inputs of same length. But to improve the performance, we assume zeroes
+   * in the output instead of zero padding either of the inputs */
+  /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes have to be included at the
+   * start of the output buffer */
+  /* If srcALen < srcBLen, (srcBLen - srcALen) zeroes have to be included at the
+   * end of the output buffer */
+  /* Once the zero padding is done the remaining of the output is calculated
+   * using convolution but with the shorter signal time shifted. */
+
+  /* Calculate the length of the remaining sequence */
+  tot = ((srcALen + srcBLen) - 2U);
+
+  if (srcALen > srcBLen)
+  {
+    /* Calculating the number of zeros to be padded to the output */
+    j = srcALen - srcBLen;
+
+    /* Initialise the pointer after zero padding */
+    pDst += j;
+  }
+
+  else if (srcALen < srcBLen)
+  {
+    /* Initialization to inputB pointer */
+    pIn1 = pSrcB;
+
+    /* Initialization to the end of inputA pointer */
+    pIn2 = pSrcA + (srcALen - 1U);
+
+    /* Initialisation of the pointer after zero padding */
+    pDst = pDst + tot;
+
+    /* Swapping the lengths */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+
+  }
+
+  /* Loop to calculate the tot + 1 output samples */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry out MAC operations */
+    sum = 0.0f16;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0U; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if ((((i - j) < srcBLen) && (j < srcALen)))
+      {
+        /* sum += pIn1[j] * pIn2[-(i - j)]; pIn2 is indexed backwards from its last sample */
+        sum += pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)];
+      }
+    }
+
+    /* Store the output in the destination buffer */
+    if (inv == 1)
+      *pDst-- = sum;
+    else
+      *pDst++ = sum;
+  }
+
+#endif /* #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of Corr group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f32.c
index 4dd5bb93bb3122ddfbc1a2ad9305aabfc23a0874..41a69a6721ffa5abf33f2a4fb2e70a55f58604f5 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_correlate_f32.c
  * Description:  Correlation of floating-point sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -1074,7 +1074,7 @@ void arm_correlate_f32(
       if ((((i - j) < srcBLen) && (j < srcALen)))
       {
         /* z[i] += x[i-j] * y[j] */
-        sum += pIn1[j] * pIn2[-((int32_t) i - j)];
+        sum += pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)];
       }
     }
 
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c
index 13661cbf2fa88a15750ed3f601ac2fae768dae60..0b37b187416020bcb2d41b53ae450470cb9db8e3 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_correlate_fast_opt_q15.c
  * Description:  Fast Q15 Correlation
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -197,7 +197,7 @@ void arm_correlate_fast_opt_q15(
       y1 = read_q15x2_ia ((q15_t **) &pIn2);
       y2 = read_q15x2_ia ((q15_t **) &pIn2);
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc0 = __SMLAD(x1, y1, acc0);
       acc2 = __SMLAD(x2, y1, acc2);
 
@@ -208,13 +208,13 @@ void arm_correlate_fast_opt_q15(
       x3 = __PKHBT(x1, x2, 0);
 #endif
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc1 = __SMLADX(x3, y1, acc1);
 
       /* Read next two samples from scratch buffer */
       x1 = read_q15x2_ia (&pScr1);
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc0 = __SMLAD(x2, y2, acc0);
       acc2 = __SMLAD(x1, y2, acc2);
 
@@ -250,7 +250,7 @@ void arm_correlate_fast_opt_q15(
 
     while (tapCnt > 0U)
     {
-      /* accumlate the results */
+      /* accumulate the results */
       acc0 += (*pScr1++ * *pIn2);
       acc1 += (*pScr1++ * *pIn2);
       acc2 += (*pScr1++ * *pIn2);
@@ -318,7 +318,7 @@ void arm_correlate_fast_opt_q15(
     while (tapCnt > 0U)
     {
 
-      /* accumlate the results */
+      /* accumulate the results */
       acc0 += (*pScr1++ * *pIn2++);
 
       /* Decrement loop counter */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c
index 6898618f46d7d97cdf163bbec480e050178510ba..7d8857adb9bdd651c71889fbc55a3c267e8d2f2d 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_correlate_fast_q15.c
  * Description:  Fast Q15 Correlation
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c
index a5840b73364db5f36eefc3106ceb9e9e24901c1e..e3d2bbf8c3425df426931876e7d76c4c9ae029c6 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_correlate_fast_q31.c
  * Description:  Fast Q31 Correlation
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c
index d46d9a0e5562e5e6aa4f586eb96cb9ee9152e4ad..00bb08897a45a1412a806ac7c42c082e978bc523 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_correlate_opt_q15.c
  * Description:  Correlation of Q15 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -192,7 +192,7 @@ void arm_correlate_opt_q15(
       y1 = read_q15x2_ia ((q15_t **) &pIn2);
       y2 = read_q15x2_ia ((q15_t **) &pIn2);
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc0 = __SMLALD(x1, y1, acc0);
       acc2 = __SMLALD(x2, y1, acc2);
 
@@ -203,13 +203,13 @@ void arm_correlate_opt_q15(
       x3 = __PKHBT(x1, x2, 0);
 #endif
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc1 = __SMLALDX(x3, y1, acc1);
 
       /* Read next two samples from scratch1 buffer */
       x1 = read_q15x2_ia (&pScr1);
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc0 = __SMLALD(x2, y2, acc0);
       acc2 = __SMLALD(x1, y2, acc2);
 
@@ -245,7 +245,7 @@ void arm_correlate_opt_q15(
 
     while (tapCnt > 0U)
     {
-      /* accumlate the results */
+      /* accumulate the results */
       acc0 += (*pScr1++ * *pIn2);
       acc1 += (*pScr1++ * *pIn2);
       acc2 += (*pScr1++ * *pIn2);
@@ -314,7 +314,7 @@ void arm_correlate_opt_q15(
     /* apply same above for remaining samples of smaller length sequence */
     while (tapCnt > 0U)
     {
-      /* accumlate the results */
+      /* accumulate the results */
       acc0 += (*pScr1++ * *pIn2++);
 
       /* Decrement loop counter */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c
index 035bfba38b4dd4dfa9348de0e02569821052f6de..145ba2ae8fe290d26583cb363ee57b8ff8e057fd 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_correlate_opt_q7.c
  * Description:  Correlation of Q7 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -248,7 +248,7 @@ void arm_correlate_opt_q7(
       /* Read four samples from smaller buffer */
       y1 = read_q15x2_ia (&pScr2);
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc0 = __SMLAD(x1, y1, acc0);
       acc2 = __SMLAD(x2, y1, acc2);
 
@@ -259,7 +259,7 @@ void arm_correlate_opt_q7(
       x3 = __PKHBT(x1, x2, 0);
 #endif
 
-      /* multiply and accumlate */
+      /* multiply and accumulate */
       acc1 = __SMLADX(x3, y1, acc1);
 
       /* Read next two samples from scratch1 buffer */
@@ -305,7 +305,7 @@ void arm_correlate_opt_q7(
 
     while (tapCnt > 0U)
     {
-      /* accumlate the results */
+      /* accumulate the results */
       acc0 += (*pScr1++ * *pScr2);
       acc1 += (*pScr1++ * *pScr2);
       acc2 += (*pScr1++ * *pScr2);
@@ -362,7 +362,7 @@ void arm_correlate_opt_q7(
     /* apply same above for remaining samples of smaller length sequence */
     while (tapCnt > 0U)
     {
-      /* accumlate the results */
+      /* accumulate the results */
       acc0 += (*pScr1++ * *pScr2++);
 
       /* Decrement loop counter */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q15.c
index 0ec4ab7c5e0d9c0574e5649d3e24ff06bfea652c..2dd16d72cf710d17f655dca6bb4dd51319f98730 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_correlate_q15.c
  * Description:  Correlation of Q15 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -58,7 +58,7 @@
   @remark
                    Refer to \ref arm_correlate_opt_q15() for a faster implementation of this function using scratch buffers.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 #include "arm_vec_filtering.h"
 
@@ -882,7 +882,7 @@ void arm_correlate_q15(
       if (((i - j) < srcBLen) && (j < srcALen))
       {
         /* z[i] += x[i-j] * y[j] */
-        sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
+        sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)]);
       }
     }
 
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c
index 1c3e33b5497432dd77c7a07fd47813a1e436b467..a5f15fec7bf7cdc9fe53dd4e10d63e946caf2601 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_correlate_q31.c
  * Description:  Correlation of Q31 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -59,7 +59,7 @@
   @remark
                    Refer to \ref arm_correlate_fast_q31() for a faster but less precise implementation of this function.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 #include "arm_vec_filtering.h"
 void arm_correlate_q31(
@@ -858,7 +858,7 @@ void arm_correlate_q31(
       if (((i - j) < srcBLen) && (j < srcALen))
       {
         /* z[i] += x[i-j] * y[j] */
-        sum += ((q63_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
+        sum += ((q63_t) pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)]);
       }
     }
 
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q7.c b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q7.c
index 113fdbccaea690355c46f58528f3ebc5071b9f6f..0b924e825e240288a1e78b75f2009a02689e593e 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q7.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_correlate_q7.c
  * Description:  Correlation of Q7 sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -56,7 +56,7 @@
  @remark
                    Refer to \ref arm_correlate_opt_q7() for a faster implementation of this function.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 
 #include "arm_vec_filtering.h"
@@ -981,7 +981,7 @@ void arm_correlate_q7(
       if (((i - j) < srcBLen) && (j < srcALen))
       {
         /* z[i] += x[i-j] * y[j] */
-        sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
+        sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)]);
       }
     }
 
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c
index 86c78755e3803a43d4026319e9e677d4921d06f4..443efa028066c4e9f32459db65f523908662af7c 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_decimate_f32.c
  * Description:  FIR decimation for floating-point sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c
index 948b15c2d6027aee5dc1775160840c73dfe439cb..15bbb78a5774e89041857e8da322f2ef2334efef 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_decimate_fast_q15.c
  * Description:  Fast Q15 FIR Decimator
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c
index 2c3a28acad61cb0480ef7663d46be5b5a8235ec4..993f7ac73eb577364cf2a2d8aa45f94f846c8649 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_decimate_fast_q31.c
  * Description:  Fast Q31 FIR Decimator
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c
index 9382f099d6af3c80c182874229709d0b1586f0f0..de31b9f41518e02d72b6e2cadc67d131db3684fe 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_decimate_init_f32.c
  * Description:  Floating-point FIR Decimator initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c
index f583a037d5fa160eec069bef7f68f7b38e2d8ed9..b43df2a06c3b3002563629c62a1a1de112ce471a 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_decimate_init_q15.c
  * Description:  Initialization function for the Q15 FIR Decimator
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c
index 5ee69c6e16c7bd0a689162d9088ebec14a85274f..7a9490a3a6b0cb5988d29244653d53447a185027 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_decimate_init_q31.c
  * Description:  Initialization function for Q31 FIR Decimation filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q15.c
index c48a233cb8c0087a1bf8c9bb99ffa9c5af17cd74..21f18e5c6bb2d52bdbcb4777f318892d6dd54aec 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_decimate_q15.c
  * Description:  Q15 FIR Decimator
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -57,7 +57,7 @@
                    Refer to \ref arm_fir_decimate_fast_q15() for a faster but less precise implementation of this function.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
@@ -534,9 +534,8 @@ void arm_fir_decimate_q15(
 
   /* Points to the start of the state buffer */
   pStateCur = S->pState;
-
   i = (numTaps - 1U) >> 2U;
-
+ 
   /* copy data */
   while (i > 0U)
   {
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c
index b88748e4b11423d41a4f70c4d63fe0ea95bd10e9..77bfc1686f59a4f1b9106ae3b422a0f2baf8a1e6 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_decimate_q31.c
  * Description:  Q31 FIR Decimator
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -56,7 +56,7 @@
                    Refer to \ref arm_fir_decimate_fast_q31() for a faster but less precise implementation of this function.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f16.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..ad198dcf0f57a372b6bbaea3ab97e3e110d2e5dc
--- /dev/null
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f16.c
@@ -0,0 +1,940 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_fir_f16.c
+ * Description:  Floating-point FIR filter processing function
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+/**
+  @ingroup groupFilters
+ */
+
+
+/**
+  @addtogroup FIR
+  @{
+ */
+
+/**
+  @brief         Processing function for the half-precision floating-point (f16) FIR filter.
+  @param[in]     S          points to an instance of the floating-point FIR filter structure
+  @param[in]     pSrc       points to the block of input data
+  @param[out]    pDst       points to the block of output data
+  @param[in]     blockSize  number of samples to process
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define FIR_F32_MAX_COEF_BLK        8  /* NOTE(review): "F32" in the name although this is the f16 file - confirm it is intended */
+/* FIR_F16_CORE: clears vecAcc0, then for each tap i < NB_TAPS adds c[i] * (vector loaded from &pSamples[i]); uses the caller's vecAcc0 and vecIn0. */
+#define FIR_F16_CORE(pSamples, c, NB_TAPS)                                 \
+        vecAcc0 = vdupq_n_f16(0.0f16);                                     \
+        for (int i = 0; i < NB_TAPS; i++) {                                \
+            vecIn0 = vld1q(&pSamples[i]);                                  \
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c[i]);                        \
+        }
+
+/*
+ * MVE FIR kernel for short filters; arm_fir_f16() dispatches here when numTaps <= 4.
+ * S, pSrc, pDst and blockSize have the same meaning as for arm_fir_f16().
+ * NB_TAPS coefficients are always read and applied, whatever S->numTaps is:
+ * NOTE(review): assumes pCoeffs holds at least NB_TAPS entries, zero padded - confirm.
+ */
+#define NB_TAPS 4
+__STATIC_INLINE void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S,
+    const float16_t * __restrict pSrc,
+    float16_t * __restrict pDst, uint32_t blockSize)
+{
+    float16_t      *pState = S->pState;     /* State pointer */
+    const float16_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */
+    float16_t      *pStateCur;              /* Points to the current sample of the state */
+    const float16_t *pSamples;              /* Temporary pointer to the sample buffer */
+    float16_t      *pOutput;                /* Temporary pointer to the output buffer */
+    const float16_t *pTempSrc;              /* Temporary pointer to the source data */
+    float16_t      *pTempDest;              /* Temporary pointer to the destination buffer */
+    uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients in the filter */
+    int32_t         blkCnt;                 /* Loop counter (vector blocks, then remainder) */
+    float16x8_t         vecIn0;             /* Input vector, written by FIR_F16_CORE */
+    float16x8_t         vecAcc0;            /* Accumulator vector, written by FIR_F16_CORE */
+    float16_t       c[NB_TAPS];             /* Local copy of the first NB_TAPS coefficients */
+
+    /*
+     * pState points to state array which contains previous frame (numTaps - 1) samples
+     * pStateCur points to the location where the new input data should be written
+     */
+    pStateCur = &(pState[(numTaps - 1u)]);
+    /*
+     * Copy new data into state so that we obtain a continuous sample buffer
+     * containing both the tail end of the old data and the new data.
+     */
+    pSamples = pState;
+    pTempSrc = pSrc;
+    pOutput = pDst;
+
+    for (int i = 0; i < NB_TAPS; i++)
+        c[i] = pCoeffs[i];
+
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0) {
+        /* Append 8 new input samples to the state (history) buffer */
+        vst1q(pStateCur, vld1q(pTempSrc));
+        pStateCur += 8;
+        pTempSrc += 8;
+
+        FIR_F16_CORE(pSamples, c, NB_TAPS);
+
+        vst1q(pOutput, vecAcc0);
+
+        pOutput += 8;
+        pSamples += 8;
+
+        blkCnt--;
+    }
+
+    blkCnt = blockSize & 7;
+    if (blkCnt)
+    {
+        mve_pred16_t    p0 = vctp16q(blkCnt);
+        /* Zeroing predicated load: lanes >= blkCnt become 0 instead of being fetched past the end of pSrc */
+        vst1q(pStateCur, vldrhq_z_f16(pTempSrc, p0));
+        pStateCur += 8;
+        pTempSrc += 8;
+
+        FIR_F16_CORE(pSamples, c, NB_TAPS);
+
+        vstrhq_p_f16(pOutput, vecAcc0, p0);
+    }
+
+    /* Copy the samples back into the history buffer start (numTaps entries are moved, starting at pState[blockSize]) */
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
+
+    blkCnt = numTaps >> 3;
+    while (blkCnt > 0) {
+        vst1q(pTempDest, vld1q(pTempSrc));
+        pTempSrc += 8;
+        pTempDest += 8;
+        blkCnt--;
+    }
+    blkCnt = numTaps & 7;
+    if (blkCnt > 0) {
+        mve_pred16_t    p0 = vctp16q(blkCnt);
+        vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
+    }
+}
+#undef NB_TAPS
+
+/*
+ * MVE FIR kernel for short filters; arm_fir_f16() dispatches here when 4 < numTaps <= 8.
+ * S, pSrc, pDst and blockSize have the same meaning as for arm_fir_f16().
+ * NB_TAPS coefficients are always read and applied, whatever S->numTaps is:
+ * NOTE(review): assumes pCoeffs holds at least NB_TAPS entries, zero padded - confirm.
+ */
+#define NB_TAPS 8
+__STATIC_INLINE void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S,
+    const float16_t * __restrict pSrc,
+    float16_t * __restrict pDst, uint32_t blockSize)
+{
+    float16_t      *pState = S->pState;     /* State pointer */
+    const float16_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */
+    float16_t      *pStateCur;              /* Points to the current sample of the state */
+    const float16_t *pSamples;              /* Temporary pointer to the sample buffer */
+    float16_t      *pOutput;                /* Temporary pointer to the output buffer */
+    const float16_t *pTempSrc;              /* Temporary pointer to the source data */
+    float16_t      *pTempDest;              /* Temporary pointer to the destination buffer */
+    uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients in the filter */
+    int32_t         blkCnt;                 /* Loop counter (vector blocks, then remainder) */
+    float16x8_t         vecIn0;             /* Input vector, written by FIR_F16_CORE */
+    float16x8_t         vecAcc0;            /* Accumulator vector, written by FIR_F16_CORE */
+    float16_t       c[NB_TAPS];             /* Local copy of the first NB_TAPS coefficients */
+
+    /*
+     * pState points to state array which contains previous frame (numTaps - 1) samples
+     * pStateCur points to the location where the new input data should be written
+     */
+    pStateCur = &(pState[(numTaps - 1u)]);
+    /*
+     * Copy new data into state so that we obtain a continuous sample buffer
+     * containing both the tail end of the old data and the new data.
+     */
+    pSamples = pState;
+    pTempSrc = pSrc;
+    pOutput = pDst;
+
+    for (int i = 0; i < NB_TAPS; i++)
+        c[i] = pCoeffs[i];
+
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0) {
+        /* Append 8 new input samples to the state (history) buffer */
+        vst1q(pStateCur, vld1q(pTempSrc));
+        pStateCur += 8;
+        pTempSrc += 8;
+
+        FIR_F16_CORE(pSamples, c, NB_TAPS);
+
+        vst1q(pOutput, vecAcc0);
+
+        pOutput += 8;
+        pSamples += 8;
+
+        blkCnt--;
+    }
+
+    blkCnt = blockSize & 7;
+    if (blkCnt)
+    {
+        mve_pred16_t    p0 = vctp16q(blkCnt);
+        /* Zeroing predicated load: lanes >= blkCnt become 0 instead of being fetched past the end of pSrc */
+        vst1q(pStateCur, vldrhq_z_f16(pTempSrc, p0));
+        pStateCur += 8;
+        pTempSrc += 8;
+
+        FIR_F16_CORE(pSamples, c, NB_TAPS);
+
+        vstrhq_p_f16(pOutput, vecAcc0, p0);
+    }
+
+    /* Copy the samples back into the history buffer start (numTaps entries are moved, starting at pState[blockSize]) */
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
+
+    blkCnt = numTaps >> 3;
+    while (blkCnt > 0) {
+        vst1q(pTempDest, vld1q(pTempSrc));
+        pTempSrc += 8;
+        pTempDest += 8;
+        blkCnt--;
+    }
+    blkCnt = numTaps & 7;
+    if (blkCnt > 0) {
+        mve_pred16_t    p0 = vctp16q(blkCnt);
+        vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
+    }
+}
+#undef NB_TAPS
+
+/* Helium (MVE) f16 FIR for more than 8 taps; filters with 1..8 taps are dispatched to the
+ * specialised kernels. S->pState starts with ROUND_UP(blockSize, 8) scratch values holding
+ * one partial sum per output sample; the FIR history (pState) follows that scratch area. */
+void arm_fir_f16(const arm_fir_instance_f16 * S,
+  const float16_t * pSrc,
+  float16_t * pDst,
+  uint32_t blockSize)
+{
+    float16_t *pState = S->pState + ROUND_UP(blockSize, 8);    /* FIR history, placed after the scratch area */
+    const float16_t *pCoeffs = S->pCoeffs;      /* Coefficient pointer */
+    const float16_t *pSamples;  /* Temporary pointer to the sample buffer */
+    float16_t      *pOutput;    /* Temporary pointer to the output buffer */
+    const float16_t *pTempSrc;  /* Temporary pointer to the source data */
+    float16_t      *pTempDest;  /* Temporary pointer to the destination buffer */
+    uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
+    float16_t       c0, c1, c2, c3;
+    float16_t       c4, c5, c6, c7;
+    /*
+     * [1 to 8 taps] specialized routines
+     */
+    if (numTaps <= 4) {
+        arm_fir_f16_1_4_mve(S, pSrc, pDst, blockSize);
+        return;
+    } else if (numTaps <= 8) {
+        arm_fir_f16_5_8_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+
+    pTempSrc = pSrc;
+    pTempDest = &(pState[(numTaps - 1u)]);      /* new samples go right after the numTaps - 1 history samples */
+    int             cnt = blockSize;
+    do {
+        mve_pred16_t    p0 = vctp16q(cnt);
+        vstrhq_p_f16(pTempDest, vldrhq_z_f16(pTempSrc, p0), p0);    /* predicated load: never reads past pSrc[blockSize - 1] */
+        pTempDest += 8;
+        pTempSrc += 8;
+        cnt -= 8;
+    } while (cnt > 0);
+
+    float16_t      *partial_accu_ptr = S->pState;   /* scratch area receiving the partial sums */
+
+    pSamples = pState;
+    c0 = *pCoeffs++;
+    c1 = *pCoeffs++;
+    c2 = *pCoeffs++;
+    c3 = *pCoeffs++;
+    c4 = *pCoeffs++;
+    c5 = *pCoeffs++;
+    c6 = *pCoeffs++;
+    c7 = *pCoeffs++;
+    /* First group of 8 taps: initialise the partial sums, 8 outputs per iteration */
+    cnt = blockSize >> 3;
+    while (cnt > 0) {
+        float16x8_t     vecAcc0;
+        float16x8_t     vecIn0;
+
+        vecIn0 = vld1q(pSamples);
+        vecAcc0 = vmulq(vecIn0, c0);
+        vecIn0 = vld1q(&pSamples[1]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+        vecIn0 = vld1q(&pSamples[2]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        vecIn0 = vld1q(&pSamples[3]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+        vecIn0 = vld1q(&pSamples[4]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+        vecIn0 = vld1q(&pSamples[5]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+        vecIn0 = vld1q(&pSamples[6]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+        vecIn0 = vld1q(&pSamples[7]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+        pSamples += 8;
+        vst1q(partial_accu_ptr, vecAcc0);
+        cnt--;
+        partial_accu_ptr += 8;
+    }
+
+    cnt = blockSize & 7;
+    if (cnt > 0) {
+        float16x8_t     vecAcc0;
+        float16x8_t     vecIn0;
+
+        mve_pred16_t p0 = vctp16q(cnt);
+
+        /* Same computation for the remaining outputs; only the first cnt lanes are stored */
+        vecIn0 = vld1q(pSamples);
+        vecAcc0 = vmulq(vecIn0, c0);
+        vecIn0 = vld1q(&pSamples[1]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+        vecIn0 = vld1q(&pSamples[2]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        vecIn0 = vld1q(&pSamples[3]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+        vecIn0 = vld1q(&pSamples[4]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+        vecIn0 = vld1q(&pSamples[5]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+        vecIn0 = vld1q(&pSamples[6]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+        vecIn0 = vld1q(&pSamples[7]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+        vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
+    }
+
+    int             localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
+    int             sample_offset = FIR_F32_MAX_COEF_BLK;
+    while (localTaps > FIR_F32_MAX_COEF_BLK) {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        c4 = *pCoeffs++;
+        c5 = *pCoeffs++;
+        c6 = *pCoeffs++;
+        c7 = *pCoeffs++;
+
+        partial_accu_ptr = S->pState;
+        pSamples = pState + sample_offset;
+        cnt = blockSize >> 3;
+        while (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;
+
+            /* Next group of 8 taps, applied to the samples shifted by sample_offset */
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            pSamples += 8;
+            vecAcc0 += vld1q_f16(partial_accu_ptr);     /* add the sums of the previous tap groups */
+            vst1q(partial_accu_ptr, vecAcc0);
+            cnt--;
+            partial_accu_ptr += 8;
+        }
+
+        cnt = blockSize & 7;
+        if (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;
+
+            mve_pred16_t p0 = vctp16q(cnt);
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            vecAcc0 += vld1q_f16(partial_accu_ptr);
+            vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
+        }
+
+        localTaps -= FIR_F32_MAX_COEF_BLK;
+        sample_offset += FIR_F32_MAX_COEF_BLK;
+    }
+
+    pSamples = pState + sample_offset;
+
+    if (localTaps > 4) {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        c4 = *pCoeffs++;
+        c5 = *pCoeffs++;
+        c6 = *pCoeffs++;
+        c7 = *pCoeffs++;
+        pOutput = pDst;
+
+        partial_accu_ptr = S->pState;
+        cnt = blockSize >> 3;
+        while (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            pSamples += 8;
+            float16x8_t     pap = vld1q_f16(partial_accu_ptr);
+            vst1q(pOutput, vecAcc0 + pap);      /* last tap group: final result goes to pDst */
+            cnt--;
+            partial_accu_ptr += 8;
+            pOutput += 8;
+        }
+
+        cnt = blockSize & 7;
+        if (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;
+
+            mve_pred16_t p0 = vctp16q(cnt);
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            float16x8_t     pap = vld1q_f16(partial_accu_ptr);
+            vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
+            pOutput += cnt;
+        }
+
+    } else {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        pOutput = pDst;
+
+        partial_accu_ptr = S->pState;
+        cnt = blockSize >> 3;
+        while (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            pSamples += 8;
+            float16x8_t     pap = vld1q_f16(partial_accu_ptr);
+            vst1q(pOutput, vecAcc0 + pap);
+            cnt--;
+            partial_accu_ptr += 8;
+            pOutput += 8;
+        }
+
+        cnt = blockSize & 7;
+        if (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;
+
+            mve_pred16_t p0 = vctp16q(cnt);
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            float16x8_t     pap = vld1q_f16(partial_accu_ptr);
+            vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
+            pOutput += cnt;
+        }
+    }
+
+    /*
+     * Copy the last (numTaps - 1) samples back to the start of the history buffer
+     */
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
+
+    /*
+     * Only numTaps - 1 samples are history. Load and store are both predicated so the
+     * final iteration never touches memory past the end of the state buffer.
+     */
+    cnt = numTaps - 1;
+    do {
+        mve_pred16_t    p0 = vctp16q(cnt);
+        vstrhq_p_f16(pTempDest, vldrhq_z_f16(pTempSrc, p0), p0);
+        pTempSrc += 8;
+        pTempDest += 8;
+        cnt -= 8;
+    } while (cnt > 0);
+}
+
+#else
+
+void arm_fir_f16(
+  const arm_fir_instance_f16 * S,
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+        float16_t *pState = S->pState;                 /* State pointer */
+  const float16_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+        float16_t *pStateCurnt;                        /* Points to the current sample of the state */
+        float16_t *px;                                 /* Temporary pointer for state buffer */
+  const float16_t *pb;                                 /* Temporary pointer for coefficient buffer */
+        _Float16 acc0;                                /* Accumulator */
+        uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+        uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
+  /* Scalar f16 FIR: appends pSrc to the state buffer, writes blockSize outputs to pDst, then moves the last numTaps - 1 samples to the buffer start */
+#if defined (ARM_MATH_LOOPUNROLL)
+        _Float16 acc1, acc2, acc3, acc4, acc5, acc6, acc7;     /* Accumulators */
+        _Float16 x0, x1, x2, x3, x4, x5, x6, x7;               /* Temporary variables to hold state values */
+        _Float16 c0;                                           /* Temporary variable to hold coefficient value */
+#endif
+
+  /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 8 output values simultaneously.
+   * The variables acc0 ... acc7 hold output values that are being computed (only acc0 ... acc3 are written out here):
+   *
+   *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+   *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+   *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+   *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
+   */
+
+  blkCnt = blockSize >> 3U;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy the first 4 of the 8 new input samples into the state buffer. */
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+
+    /* Set all accumulators to zero */
+    acc0 = 0.0f;
+    acc1 = 0.0f;
+    acc2 = 0.0f;
+    acc3 = 0.0f;
+    acc4 = 0.0f;
+    acc5 = 0.0f;
+    acc6 = 0.0f;
+    acc7 = 0.0f;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize coefficient pointer */
+    pb = pCoeffs;
+
+    /* Copy the other 4 new input samples. This is separated from the first 4
+     * to avoid a call to __aeabi_memmove which would be slower
+     */
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+
+    /* Read the first 7 samples from the state buffer:  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
+    x0 = *px++;
+    x1 = *px++;
+    x2 = *px++;
+    x3 = *px++;
+    x4 = *px++;
+    x5 = *px++;
+    x6 = *px++;
+
+    /* Loop unrolling: process 8 taps at a time. */
+    tapCnt = numTaps >> 3U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read the b[numTaps-1] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-3] sample */
+      x7 = *(px++);
+
+      /* acc0 +=  b[numTaps-1] * x[n-numTaps] */
+      acc0 += x0 * c0;
+
+      /* acc1 +=  b[numTaps-1] * x[n-numTaps-1] */
+      acc1 += x1 * c0;
+
+      /* acc2 +=  b[numTaps-1] * x[n-numTaps-2] */
+      acc2 += x2 * c0;
+
+      /* acc3 +=  b[numTaps-1] * x[n-numTaps-3] */
+      acc3 += x3 * c0;
+
+      /* acc4 +=  b[numTaps-1] * x[n-numTaps-4] */
+      acc4 += x4 * c0;
+
+      /* acc5 +=  b[numTaps-1] * x[n-numTaps-5] */
+      acc5 += x5 * c0;
+
+      /* acc6 +=  b[numTaps-1] * x[n-numTaps-6] */
+      acc6 += x6 * c0;
+
+      /* acc7 +=  b[numTaps-1] * x[n-numTaps-7] */
+      acc7 += x7 * c0;
+
+      /* Read the b[numTaps-2] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-4] sample */
+      x0 = *(px++);
+
+      /* Perform the multiply-accumulate */
+      acc0 += x1 * c0;
+      acc1 += x2 * c0;
+      acc2 += x3 * c0;
+      acc3 += x4 * c0;
+      acc4 += x5 * c0;
+      acc5 += x6 * c0;
+      acc6 += x7 * c0;
+      acc7 += x0 * c0;
+
+      /* Read the b[numTaps-3] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-5] sample */
+      x1 = *(px++);
+
+      /* Perform the multiply-accumulates */
+      acc0 += x2 * c0;
+      acc1 += x3 * c0;
+      acc2 += x4 * c0;
+      acc3 += x5 * c0;
+      acc4 += x6 * c0;
+      acc5 += x7 * c0;
+      acc6 += x0 * c0;
+      acc7 += x1 * c0;
+
+      /* Read the b[numTaps-4] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-6] sample */
+      x2 = *(px++);
+
+      /* Perform the multiply-accumulates */
+      acc0 += x3 * c0;
+      acc1 += x4 * c0;
+      acc2 += x5 * c0;
+      acc3 += x6 * c0;
+      acc4 += x7 * c0;
+      acc5 += x0 * c0;
+      acc6 += x1 * c0;
+      acc7 += x2 * c0;
+
+      /* Read the 5th coefficient of this group of 8 */
+      c0 = *(pb++);
+
+      /* Read the next state sample */
+      x3 = *(px++);
+      /* Perform the multiply-accumulates */
+      acc0 += x4 * c0;
+      acc1 += x5 * c0;
+      acc2 += x6 * c0;
+      acc3 += x7 * c0;
+      acc4 += x0 * c0;
+      acc5 += x1 * c0;
+      acc6 += x2 * c0;
+      acc7 += x3 * c0;
+
+      /* Read the 6th coefficient of this group of 8 */
+      c0 = *(pb++);
+
+      /* Read the next state sample */
+      x4 = *(px++);
+
+      /* Perform the multiply-accumulates */
+      acc0 += x5 * c0;
+      acc1 += x6 * c0;
+      acc2 += x7 * c0;
+      acc3 += x0 * c0;
+      acc4 += x1 * c0;
+      acc5 += x2 * c0;
+      acc6 += x3 * c0;
+      acc7 += x4 * c0;
+
+      /* Read the 7th coefficient of this group of 8 */
+      c0 = *(pb++);
+
+      /* Read the next state sample */
+      x5 = *(px++);
+
+      /* Perform the multiply-accumulates */
+      acc0 += x6 * c0;
+      acc1 += x7 * c0;
+      acc2 += x0 * c0;
+      acc3 += x1 * c0;
+      acc4 += x2 * c0;
+      acc5 += x3 * c0;
+      acc6 += x4 * c0;
+      acc7 += x5 * c0;
+
+      /* Read the 8th coefficient of this group of 8 */
+      c0 = *(pb++);
+
+      /* Read the next state sample */
+      x6 = *(px++);
+
+      /* Perform the multiply-accumulates */
+      acc0 += x7 * c0;
+      acc1 += x0 * c0;
+      acc2 += x1 * c0;
+      acc3 += x2 * c0;
+      acc4 += x3 * c0;
+      acc5 += x4 * c0;
+      acc6 += x5 * c0;
+      acc7 += x6 * c0;
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Loop unrolling: process the remaining (numTaps % 8) taps one at a time */
+    tapCnt = numTaps % 0x8U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read coefficients */
+      c0 = *(pb++);
+
+      /* Fetch 1 state variable */
+      x7 = *(px++);
+
+      /* Perform the multiply-accumulates */
+      acc0 += x0 * c0;
+      acc1 += x1 * c0;
+      acc2 += x2 * c0;
+      acc3 += x3 * c0;
+      acc4 += x4 * c0;
+      acc5 += x5 * c0;
+      acc6 += x6 * c0;
+      acc7 += x7 * c0;
+
+      /* Reuse the present sample states for next sample */
+      x0 = x1;
+      x1 = x2;
+      x2 = x3;
+      x3 = x4;
+      x4 = x5;
+      x5 = x6;
+      x6 = x7;
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Advance the state pointer by 8 to process the next group of 8 samples */
+    pState = pState + 8;
+
+    /* The results in the 8 accumulators, store in the destination buffer. */
+    *pDst++ = acc0;
+    *pDst++ = acc1;
+    *pDst++ = acc2;
+    *pDst++ = acc3;
+    *pDst++ = acc4;
+    *pDst++ = acc5;
+    *pDst++ = acc6;
+    *pDst++ = acc7;
+
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining output samples */
+  blkCnt = blockSize % 0x8U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* Copy one sample at a time into state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Set the accumulator to zero */
+    acc0 = 0.0f;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize Coefficient pointer */
+    pb = pCoeffs;
+
+    i = numTaps;
+
+    /* Perform the multiply-accumulates */
+    while (i > 0U)
+    {
+      /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+      acc0 += *px++ * *pb++;
+
+      i--;
+    }
+
+    /* Store result in destination buffer. */
+    *pDst++ = acc0;
+
+    /* Advance state pointer by 1 for the next sample */
+    pState = pState + 1U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.
+     Now copy the last numTaps - 1 samples to the start of the state buffer.
+     This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Copy 4 samples at a time */
+  tapCnt = (numTaps - 1U) >> 2U;
+
+  /* Copy data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+  /* Calculate remaining number of copies */
+  tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+  /* Initialize tapCnt with the number of samples to copy (numTaps - 1) */
+  tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* Copy remaining data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+}
+
+#endif /* #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+* @} end of FIR group
+*/
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f32.c
index 4ce8dfaa14794020226e09c03b6165ade5cb1335..f8c1d8061cb411f480befc4d5a6a2776c760b3d1 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_f32.c
  * Description:  Floating-point FIR filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -61,7 +61,7 @@
                    Samples in the state buffer are stored in the following order.
   @par
   <pre>
-      {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
+      {x[n-numTaps+1], x[n-numTaps+2], x[n-numTaps+3], ..., x[n-1], x[n](==pSrc[0]), x[n+1](==pSrc[1]), ..., x[n+blockSize-1](==pSrc[blockSize-1])}
   </pre>
   @par
                    Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code>.
@@ -97,17 +97,41 @@
                    where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;
                    <code>pCoeffs</code> is the address of the coefficient buffer.
   @par          Initialization of Helium version
-                 For Helium version the array of coefficients must be a multiple of 16 even if less
-                 then 16 coefficients are used. The additional coefficients must be set to 0.
-                 It does not mean that all the coefficients will be used in the filter (numTaps
-                 is still set to its right value in the init function.) It just means that
+                 For the Helium version, the array of coefficients must be padded with zeros so that
+                 its length is a multiple of the block length x given below.
+
+                 The array length L must be a multiple of x. L = x * a :
+                 - x is 4  for f32
+                 - x is 4  for q31
+                 - x is 4  for f16 (so managed like the f32 version and not like the q15 one)
+                 - x is 8  for q15
+                 - x is 16 for q7
+
+                 The additional coefficients
+                 (x * a - numTaps) must be set to 0.
+                 numTaps is still set to its actual value in the init function. The padding is needed because
                  the implementation may require to read more coefficients due to the vectorization and
                  to avoid having to manage too many different cases in the code.
 
+                
+  @par          Helium state buffer
+                 The state buffer must contain some additional temporary data
+                 used during the computation but which is not the state of the FIR.
+                 The first A samples are temporary data.
+                 The remaining samples are the state of the FIR filter.
+  @par                 
+                 So the state buffer has size <code> numTaps + A + blockSize - 1 </code> :
+                 - A is blockSize for f32
+                 - A is 8*ceil(blockSize/8) for f16
+                 - A is 8*ceil(blockSize/4) for q31
+                 - A is 0 for other datatypes (q15 and q7)
+
+
   @par           Fixed-Point Behavior
                    Care must be taken when using the fixed-point versions of the FIR filter functions.
                    In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
                    Refer to the function specific documentation below for usage guidelines.
+
  */
 
 /**
@@ -126,578 +150,534 @@
 
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 
-static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, const float32_t * pSrc, float32_t * pDst, uint32_t blockSize)
+#define FIR_F32_MAX_COEF_BLK        8
+/* vecAcc0 = sum over i < NB_TAPS of c[i] * pSamples[i .. i+3]; uses the caller's vecAcc0 and vecIn0 */
+#define FIR_F32_CORE(pSamples, c, NB_TAPS)                                 \
+    do {                                                                   \
+        vecAcc0 = vdupq_n_f32(0.0f);                                       \
+        for (int i = 0; i < (NB_TAPS); i++) {                              \
+            vecIn0 = vld1q(&(pSamples)[i]);                                \
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, (c)[i]);                      \
+        }                                                                  \
+    } while (0)
+#define NB_TAPS 4
+/* Helium f32 FIR kernel for filters with 1 to 4 taps; always reads NB_TAPS (zero padded) coefficients. */
+__STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
+  const float32_t * __restrict pSrc,
+  float32_t * __restrict pDst, uint32_t blockSize)
 {
-    float32_t *pState = S->pState;      /* State pointer */
-    const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
-    float32_t *pStateCur;               /* Points to the current sample of the state */
-    const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
-    float32_t *pOutput;                 /* Temporary pointer to the output buffer */
-    const float32_t *pTempSrc;          /* Temporary pointer to the source data */
-    float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
-    uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
-    uint32_t  blkCnt;
-    f32x4_t vecIn0;
-    f32x4_t vecAcc0;
-    float32_t c0, c1, c2, c3;
+    float32_t *pRefStatePtr = S->pState + blockSize;    /* FIR history starts after the first blockSize values */
+    float32_t      *pState = pRefStatePtr; /* State pointer */
+    const float32_t *pCoeffs = S->pCoeffs;      /* Coefficient pointer */
+    float32_t      *pStateCur;  /* Points to the current sample of the state */
+    const float32_t *pSamples;  /* Temporary pointer to the sample buffer */
+    float32_t      *pOutput;    /* Temporary pointer to the output buffer */
+    const float32_t *pTempSrc;  /* Temporary pointer to the source data */
+    float32_t      *pTempDest;  /* Temporary pointer to the destination buffer */
+    uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
+    int32_t         blkCnt;     /* Signed: the copy-back loop counts down past zero */
+    float32x4_t         vecIn0;
+    float32x4_t         vecAcc0;
+    float32_t       c[NB_TAPS];
+    const float32_t *pCoeffsCur = pCoeffs;
 
     /*
      * pState points to state array which contains previous frame (numTaps - 1) samples
      * pStateCur points to the location where the new input data should be written
      */
     pStateCur = &(pState[(numTaps - 1u)]);
-    pSamples  = pState;
-    pTempSrc  = pSrc;
-    pOutput   = pDst;
-
-    if (((numTaps - 1) / 4) == 0)
-    {
-        const float32_t *pCoeffsCur = pCoeffs;
-
-        c0 = *pCoeffsCur++;
-        c1 = *pCoeffsCur++;
-        c2 = *pCoeffsCur++;
-        c3 = *pCoeffsCur++;
-
-        blkCnt = blockSize >> 2;
-        while (blkCnt > 0U)
-        {
-            /*
-             * Save 4 input samples in the history buffer
-             */
-            vst1q(pStateCur, vld1q(pTempSrc));
-            pStateCur += 4;
-            pTempSrc += 4;
-
-            vecIn0 = vld1q(pSamples);
-            vecAcc0 = vmulq(vecIn0, c0);
-
-            vecIn0 = vld1q(&pSamples[1]);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
-            vecIn0 = vld1q(&pSamples[2]);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+    pTempSrc = pSrc;
 
-            vecIn0 = vld1q(&pSamples[3]);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+    pSamples = pState;
+    pOutput = pDst;
 
-            vst1q(pOutput, vecAcc0);
+    for (int i = 0; i < NB_TAPS; i++)
+        c[i] = *pCoeffsCur++;
 
-            pOutput += 4;
-            pSamples += 4;
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0) {
+        /*
+         * Save 4 input samples in the history buffer
+         */
+        vst1q(pStateCur, vld1q(pTempSrc));
+        pStateCur += 4;
+        pTempSrc += 4;
 
-            blkCnt--;
-        }
+        FIR_F32_CORE(pSamples, c, NB_TAPS);
 
-        blkCnt = blockSize & 3;
-        if (blkCnt > 0U)
-        {
-            mve_pred16_t p0 = vctp32q(blkCnt);
+        vst1q(pOutput, vecAcc0);
 
-            vstrwq_p_f32(pStateCur, vld1q(pTempSrc),p0);
-            pStateCur += blkCnt;
-            pTempSrc += blkCnt;
+        pOutput += 4;
+        pSamples += 4;
 
-            vecIn0 = vld1q(pSamples);
-            vecAcc0 = vmulq(vecIn0, c0);
+        blkCnt--;
+    }
 
-            vecIn0 = vld1q(&pSamples[1]);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+    blkCnt = blockSize & 3;
+    if (blkCnt)
+    {
+        mve_pred16_t    p0 = vctp32q(blkCnt);
 
-            vecIn0 = vld1q(&pSamples[2]);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        vstrwq_p_f32(pStateCur, vldrwq_z_f32(pTempSrc, p0), p0);    /* predicated: stay inside pSrc and the state buffer */
+        pStateCur += 4;
+        pTempSrc += 4;
 
-            vecIn0 = vld1q(&pSamples[3]);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+        FIR_F32_CORE(pSamples, c, NB_TAPS);
 
-            vstrwq_p_f32(pOutput, vecAcc0, p0);
-        }
+        vstrwq_p_f32(pOutput, vecAcc0, p0);
     }
 
     /*
      * Copy the samples back into the history buffer start
      */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
 
-    blkCnt = numTaps >> 2;
-    while (blkCnt > 0U)
-    {
-        vst1q(pTempDest, vld1q(pTempSrc));
+    blkCnt = numTaps - 1;       /* only numTaps - 1 samples are kept as history */
+    do {
+        mve_pred16_t    p = vctp32q(blkCnt);
+        vstrwq_p_f32(pTempDest, vldrwq_z_f32(pTempSrc, p), p);
         pTempSrc += 4;
         pTempDest += 4;
-        blkCnt--;
-    }
-
-    blkCnt = numTaps & 3;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp32q(blkCnt);
-        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
+        blkCnt -= 4;
     }
+    while (blkCnt > 0);
 }
+#undef NB_TAPS
 
-
-static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, const float32_t * pSrc, float32_t * pDst, uint32_t blockSize)
+__STATIC_INLINE void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, 
+  const float32_t * __restrict pSrc, 
+  float32_t * __restrict pDst, uint32_t blockSize)
 {
-    float32_t *pState = S->pState;      /* State pointer */
+    float32_t *pRefStatePtr = S->pState + blockSize;
+    float32_t *pState = pRefStatePtr;      /* State pointer */
     const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
-    float32_t *pStateCur;               /* Points to the current sample of the state */
     const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
-    float32_t *pOutput;                 /* Temporary pointer to the output buffer */
     const float32_t *pTempSrc;          /* Temporary pointer to the source data */
     float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
     uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
-    uint32_t  blkCnt;
-    f32x4_t vecIn0;
-    f32x4_t vecAcc0;
+    int32_t  blkCnt;
     float32_t c0, c1, c2, c3;
     float32_t c4, c5, c6, c7;
-    const float32_t *pCoeffsCur = pCoeffs;
 
-    /*
-     * pState points to state array which contains previous frame (numTaps - 1) samples
-     * pStateCur points to the location where the new input data should be written
-     */
-    pStateCur = &(pState[(numTaps - 1u)]);
+
     pTempSrc = pSrc;
+    pTempDest = &(pState[(numTaps - 1u)]);
+    int cnt = blockSize;
+    do {
+        mve_pred16_t p0 = vctp32q(cnt);
+        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
+        pTempDest += 4;
+        pTempSrc += 4;
+        cnt -= 4;
+    } while(cnt > 0);
 
-    pSamples = pState;
-    pOutput = pDst;
 
-    c0 = *pCoeffsCur++;
-    c1 = *pCoeffsCur++;
-    c2 = *pCoeffsCur++;
-    c3 = *pCoeffsCur++;
-    c4 = *pCoeffsCur++;
-    c5 = *pCoeffsCur++;
-    c6 = *pCoeffsCur++;
-    c7 = *pCoeffsCur++;
 
-    blkCnt = blockSize >> 2;
-    while (blkCnt > 0U)
+    pSamples = pState;
+    c0 = *pCoeffs++;
+    c1 = *pCoeffs++;
+    c2 = *pCoeffs++;
+    c3 = *pCoeffs++;
+    c4 = *pCoeffs++;
+    c5 = *pCoeffs++;
+    c6 = *pCoeffs++;
+    c7 = *pCoeffs++;
+
+    cnt = blockSize >> 2;
+    while(cnt > 0) 
     {
-        /*
-         * Save 4 input samples in the history buffer
-         */
-        vst1q(pStateCur, vld1q(pTempSrc));
-        pStateCur += 4;
-        pTempSrc += 4;
+        float32x4_t vecAcc0;
+        float32x4_t vecIn0;
 
         vecIn0 = vld1q(pSamples);
         vecAcc0 = vmulq(vecIn0, c0);
-
         vecIn0 = vld1q(&pSamples[1]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
         vecIn0 = vld1q(&pSamples[2]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
         vecIn0 = vld1q(&pSamples[3]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
         vecIn0 = vld1q(&pSamples[4]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
         vecIn0 = vld1q(&pSamples[5]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
         vecIn0 = vld1q(&pSamples[6]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
         vecIn0 = vld1q(&pSamples[7]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
-
-        vst1q(pOutput, vecAcc0);
-
-        pOutput += 4;
         pSamples += 4;
-
-        blkCnt--;
+        vst1q(pDst, vecAcc0);
+        cnt--;
+        pDst += 4;
     }
 
-    blkCnt = blockSize & 3;
-    if (blkCnt > 0U)
+    cnt = blockSize & 3;
+    if (cnt > 0) 
     {
-        mve_pred16_t p0 = vctp32q(blkCnt);
+        float32x4_t vecAcc0;
+        float32x4_t vecIn0;
 
-        vstrwq_p_f32(pStateCur, vld1q(pTempSrc),p0);
-        pStateCur += blkCnt;
-        pTempSrc += blkCnt;
+        mve_pred16_t p0 = vctp32q(cnt);
 
         vecIn0 = vld1q(pSamples);
         vecAcc0 = vmulq(vecIn0, c0);
-
         vecIn0 = vld1q(&pSamples[1]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
         vecIn0 = vld1q(&pSamples[2]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
         vecIn0 = vld1q(&pSamples[3]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
         vecIn0 = vld1q(&pSamples[4]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
         vecIn0 = vld1q(&pSamples[5]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
         vecIn0 = vld1q(&pSamples[6]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
         vecIn0 = vld1q(&pSamples[7]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
-
-        vstrwq_p_f32(pOutput, vecAcc0, p0);
+        vstrwq_p_f32(pDst, vecAcc0,p0);
     }
 
+
     /*
      * Copy the samples back into the history buffer start
      */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
-
-    blkCnt = numTaps >> 2;
-    while (blkCnt > 0U)
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
+    blkCnt = numTaps;
+    while (blkCnt > 0)
     {
-        vst1q(pTempDest, vld1q(pTempSrc));
-        pTempSrc += 4;
-        pTempDest += 4;
+        *pTempDest++ = *pTempSrc++;
         blkCnt--;
     }
-
-    blkCnt = numTaps & 3;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp32q(blkCnt);
-        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
-    }
 }
 
 
+
 void arm_fir_f32(
 const arm_fir_instance_f32 * S,
 const float32_t * pSrc,
 float32_t * pDst,
 uint32_t blockSize)
 {
-    float32_t *pState = S->pState;      /* State pointer */
+    /*
+       S->pState points to the partial accumulators (scratch area, blockSize samples).
+       S->pState + blockSize points to the actual FIR state.
+    */
+    float32_t *pRefStatePtr = S->pState + blockSize;
+    float32_t *pState = pRefStatePtr ;      /* State pointer */
     const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
-    float32_t *pStateCur;               /* Points to the current sample of the state */
     const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
     float32_t *pOutput;                 /* Temporary pointer to the output buffer */
     const float32_t *pTempSrc;          /* Temporary pointer to the source data */
     float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
     uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
     uint32_t  blkCnt;
-    int32_t numCnt;
-    f32x4_t vecIn0;
-    f32x4_t vecAcc0;
     float32_t c0, c1, c2, c3;
     float32_t c4, c5, c6, c7;
 
     /*
      * [1 to 8 taps] specialized routines
      */
-    if (blockSize >= 8)
+    if (numTaps <= 4)
     {
-       if (numTaps <= 4)
-       {
-           arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
-           return;
-       }
+        arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
+        return;
     }
-    if (blockSize >= 8)
+    else if (numTaps <= 8)
     {
-       if (numTaps <= 8)
-       {
-           arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
-           return;
-       }
+        arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+
+    pTempSrc = pSrc;
+    pTempDest = &(pState[(numTaps - 1u)]);
+    int cnt = blockSize;
+    do {
+        mve_pred16_t p0 = vctp32q(cnt);
+        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
+        pTempDest += 4;
+        pTempSrc += 4;
+        cnt -= 4;
+    } while(cnt > 0);
+
+    float32_t *partial_accu_ptr = S->pState;
+
+    pSamples = pState;
+    c0 = *pCoeffs++;
+    c1 = *pCoeffs++;
+    c2 = *pCoeffs++;
+    c3 = *pCoeffs++;
+    c4 = *pCoeffs++;
+    c5 = *pCoeffs++;
+    c6 = *pCoeffs++;
+    c7 = *pCoeffs++;
+
+    cnt = blockSize >> 2;
+    while(cnt > 0) {
+        float32x4_t vecAcc0;
+        float32x4_t vecIn0;
+
+        vecIn0 = vld1q(pSamples);
+        vecAcc0 = vmulq(vecIn0, c0);
+        vecIn0 = vld1q(&pSamples[1]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+        vecIn0 = vld1q(&pSamples[2]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        vecIn0 = vld1q(&pSamples[3]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+        vecIn0 = vld1q(&pSamples[4]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+        vecIn0 = vld1q(&pSamples[5]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+        vecIn0 = vld1q(&pSamples[6]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+        vecIn0 = vld1q(&pSamples[7]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+        pSamples += 4;
+        vst1q(partial_accu_ptr, vecAcc0);
+        cnt--;
+        partial_accu_ptr += 4;
     }
 
-    if (blockSize >= 8)
+    cnt = blockSize & 3;
+    if (cnt > 0) 
     {
-        /*
-         * pState points to state array which contains previous frame (numTaps - 1) samples
-         * pStateCur points to the location where the new input data should be written
-         */
-        pStateCur = &(pState[(numTaps - 1u)]);
-        pTempSrc = pSrc;
-        pSamples = pState;
-        pOutput = pDst;
+        float32x4_t vecAcc0;
+        float32x4_t vecIn0;
 
-        blkCnt = blockSize >> 2;
-        while (blkCnt > 0U)
-        {
-            int32_t       i;
-            const float32_t *pCoeffsCur = pCoeffs;
-
-            c0 = *pCoeffsCur++;
-            c1 = *pCoeffsCur++;
-            c2 = *pCoeffsCur++;
-            c3 = *pCoeffsCur++;
-            c4 = *pCoeffsCur++;
-            c5 = *pCoeffsCur++;
-            c6 = *pCoeffsCur++;
-            c7 = *pCoeffsCur++;
-
-            vst1q(pStateCur, vld1q(pTempSrc));
-            pStateCur += 4;
-            pTempSrc += 4;
+        mve_pred16_t p0 = vctp32q(cnt);
+
+        vecIn0 = vld1q(pSamples);
+        vecAcc0 = vmulq(vecIn0, c0);
+        vecIn0 = vld1q(&pSamples[1]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+        vecIn0 = vld1q(&pSamples[2]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        vecIn0 = vld1q(&pSamples[3]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+        vecIn0 = vld1q(&pSamples[4]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+        vecIn0 = vld1q(&pSamples[5]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+        vecIn0 = vld1q(&pSamples[6]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+        vecIn0 = vld1q(&pSamples[7]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+        vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
+    }
+
+    int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
+    int sample_offset = FIR_F32_MAX_COEF_BLK;
+    while (localTaps > FIR_F32_MAX_COEF_BLK) {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        c4 = *pCoeffs++;
+        c5 = *pCoeffs++;
+        c6 = *pCoeffs++;
+        c7 = *pCoeffs++;
+
+        partial_accu_ptr = S->pState;
+        pSamples = pState + sample_offset;
+        int cnt = blockSize >> 2;
+        while(cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
 
             vecIn0 = vld1q(pSamples);
             vecAcc0 = vmulq(vecIn0, c0);
-
             vecIn0 = vld1q(&pSamples[1]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
             vecIn0 = vld1q(&pSamples[2]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
             vecIn0 = vld1q(&pSamples[3]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
             vecIn0 = vld1q(&pSamples[4]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
             vecIn0 = vld1q(&pSamples[5]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
             vecIn0 = vld1q(&pSamples[6]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
             vecIn0 = vld1q(&pSamples[7]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            pSamples += 4;
+            vecAcc0 += vld1q_f32(partial_accu_ptr);
+            vst1q(partial_accu_ptr, vecAcc0);
+            cnt--;
+            partial_accu_ptr += 4;
+        }
 
-            pSamples += 8;
-
-            numCnt = ((int32_t)numTaps - 8) / 8;
-
-            for (i = 0; i < numCnt; i++)
-            {
-                c0 = *pCoeffsCur++;
-                c1 = *pCoeffsCur++;
-                c2 = *pCoeffsCur++;
-                c3 = *pCoeffsCur++;
-                c4 = *pCoeffsCur++;
-                c5 = *pCoeffsCur++;
-                c6 = *pCoeffsCur++;
-                c7 = *pCoeffsCur++;
-
-                vecIn0 = vld1q(pSamples);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
-
-                vecIn0 = vld1q(&pSamples[1]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
-                vecIn0 = vld1q(&pSamples[2]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
-                vecIn0 = vld1q(&pSamples[3]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
-                vecIn0 = vld1q(&pSamples[4]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
-                vecIn0 = vld1q(&pSamples[5]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
-                vecIn0 = vld1q(&pSamples[6]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+        cnt = blockSize & 3;
+        if (cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
 
-                vecIn0 = vld1q(&pSamples[7]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            mve_pred16_t p0 = vctp32q(cnt);
 
-                pSamples += 8;
-            }
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            vecAcc0 += vld1q_f32(partial_accu_ptr);
+            vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
+        }
 
-            numCnt = ((int32_t)numTaps - 8) & 7;
+        localTaps -= FIR_F32_MAX_COEF_BLK;
+        sample_offset += FIR_F32_MAX_COEF_BLK;
+    }
 
-            while (numCnt > 0)
-            {
-                c0 = *pCoeffsCur++;
-                vecIn0 = vld1q(pSamples);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
-                pSamples ++;
+    pSamples = pState + sample_offset;
+
+    if (localTaps > 4) {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        c4 = *pCoeffs++;
+        c5 = *pCoeffs++;
+        c6 = *pCoeffs++;
+        c7 = *pCoeffs++;
+        pOutput = pDst;
 
-                numCnt --;
-            }
+        partial_accu_ptr = S->pState;
+        cnt = blockSize  >> 2;
+        while(cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
 
-            vst1q(pOutput, vecAcc0);
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            pSamples += 4;
+            float32x4_t pap = vld1q_f32(partial_accu_ptr);
+            vst1q(pOutput, vecAcc0+pap);
+            cnt--;
+            partial_accu_ptr += 4;
             pOutput += 4;
-            pSamples = pSamples - numTaps + 4;
-
-            blkCnt--;
         }
 
-        blkCnt = blockSize & 3;
-        if (blkCnt > 0U)
-        {
-            mve_pred16_t p0 = vctp32q(blkCnt);
-            int32_t       i;
-            const float32_t *pCoeffsCur = pCoeffs;
-
-            vst1q(pStateCur, vld1q(pTempSrc));
-            pStateCur += 4;
-            pTempSrc += 4;
-
-            c0 = *pCoeffsCur++;
-            c1 = *pCoeffsCur++;
-            c2 = *pCoeffsCur++;
-            c3 = *pCoeffsCur++;
-            c4 = *pCoeffsCur++;
-            c5 = *pCoeffsCur++;
-            c6 = *pCoeffsCur++;
-            c7 = *pCoeffsCur++;
+        cnt = blockSize  & 3;
+        if (cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
+
+            mve_pred16_t p0 = vctp32q(cnt);
 
             vecIn0 = vld1q(pSamples);
             vecAcc0 = vmulq(vecIn0, c0);
-
             vecIn0 = vld1q(&pSamples[1]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
             vecIn0 = vld1q(&pSamples[2]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
             vecIn0 = vld1q(&pSamples[3]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
             vecIn0 = vld1q(&pSamples[4]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
             vecIn0 = vld1q(&pSamples[5]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
             vecIn0 = vld1q(&pSamples[6]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
             vecIn0 = vld1q(&pSamples[7]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
-
-            pSamples += 8;
-
-            numCnt = ((int32_t)numTaps - 8) / 8;
-
-            for (i = 0; i < numCnt; i++)
-            {
-                c0 = *pCoeffsCur++;
-                c1 = *pCoeffsCur++;
-                c2 = *pCoeffsCur++;
-                c3 = *pCoeffsCur++;
-                c4 = *pCoeffsCur++;
-                c5 = *pCoeffsCur++;
-                c6 = *pCoeffsCur++;
-                c7 = *pCoeffsCur++;
-
-                vecIn0 = vld1q(pSamples);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
-
-                vecIn0 = vld1q(&pSamples[1]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
-                vecIn0 = vld1q(&pSamples[2]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
-                vecIn0 = vld1q(&pSamples[3]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
-                vecIn0 = vld1q(&pSamples[4]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
-                vecIn0 = vld1q(&pSamples[5]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
-                vecIn0 = vld1q(&pSamples[6]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
-                vecIn0 = vld1q(&pSamples[7]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
-
-                pSamples += 8;
-            }
-
-            numCnt = ((int32_t)numTaps - 8) & 7;
-
-            while (numCnt > 0)
-            {
-                c0 = *pCoeffsCur++;
-                vecIn0 = vld1q(pSamples);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
-                pSamples ++;
-
-                numCnt --;
-            }
-
-            vstrwq_p_f32(pOutput, vecAcc0, p0);
+            float32x4_t pap = vld1q_f32(partial_accu_ptr);
+            vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
+            pOutput += cnt;
         }
     }
-    else
-    {
-        float32_t *pStateCurnt;                        /* Points to the current sample of the state */
-        float32_t *px;                                 /* Temporary pointer for state buffer */
-  const float32_t *pb;                                 /* Temporary pointer for coefficient buffer */
-        float32_t acc0;                                /* Accumulator */
-        uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
-        uint32_t i, blkCnt;                    /* Loop counters */
-        pStateCurnt = &(S->pState[(numTaps - 1U)]);
-
-        blkCnt = blockSize;
-        while (blkCnt > 0U)
-        {
-          /* Copy one sample at a time into state buffer */
-          *pStateCurnt++ = *pSrc++;
-
-          /* Set the accumulator to zero */
-          acc0 = 0.0f;
-
-          /* Initialize state pointer */
-          px = pState;
-
-          /* Initialize Coefficient pointer */
-          pb = pCoeffs;
-
-          i = numTaps;
-
-          /* Perform the multiply-accumulates */
-          while (i > 0U)
-          {
-            /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
-            acc0 += *px++ * *pb++;
+    else {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        pOutput = pDst;
 
-            i--;
-          }
+        partial_accu_ptr = S->pState;
+        cnt = blockSize >> 2;
+        while(cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
 
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            pSamples += 4;
+            float32x4_t pap = vld1q_f32(partial_accu_ptr);
+            vst1q(pOutput, vecAcc0+pap);
+            cnt--;
+            partial_accu_ptr += 4;
+            pOutput += 4;
+        }
 
-          /* Store result in destination buffer. */
-          *pDst++ = acc0;
+        cnt = blockSize & 3;
+        if (cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
 
-          /* Advance state pointer by 1 for the next sample */
-          pState = pState + 1U;
+            mve_pred16_t p0 = vctp32q(cnt);
 
-          /* Decrement loop counter */
-          blkCnt--;
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            float32x4_t pap = vld1q_f32(partial_accu_ptr);
+            vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
+            pOutput += cnt;
         }
     }
 
     /*
      * Copy the samples back into the history buffer start
      */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
+    pTempSrc = &pRefStatePtr[blockSize];
+    pTempDest = pRefStatePtr;
 
     blkCnt = numTaps >> 2;
-    while (blkCnt > 0U)
+    while (blkCnt > 0)
     {
         vst1q(pTempDest, vld1q(pTempSrc));
         pTempSrc += 4;
         pTempDest += 4;
         blkCnt--;
     }
-
     blkCnt = numTaps & 3;
-    if (blkCnt > 0U)
+    if (blkCnt > 0)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c
index 5f8df95422df9398b8929ba1f66f90cc887790cb..ccd3ed8efdc8a340979195e763453917f9b7acd0 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_fast_q15.c
  * Description:  Q15 Fast FIR filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c
index 513cb7280e9379c8dba82d27715982b36c8906a5..78b6bd8cd9f8600bc37d32d972b31aff0541ecf4 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_fast_q31.c
  * Description:  Processing function for the Q31 Fast FIR filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f16.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..34913432d0bd2a392709a57c9ebaa737ef646d7e
--- /dev/null
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f16.c
@@ -0,0 +1,105 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_fir_init_f16.c
+ * Description:  Floating-point FIR filter initialization function
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+  @ingroup groupFilters
+ */
+
+/**
+  @addtogroup FIR
+  @{
+ */
+
+/**
+  @brief         Initialization function for the floating-point FIR filter.
+  @param[in,out] S          points to an instance of the floating-point FIR filter structure
+  @param[in] 	 numTaps    number of filter coefficients in the filter
+  @param[in]     pCoeffs    points to the filter coefficients buffer
+  @param[in]     pState     points to the state buffer
+  @param[in]     blockSize  number of samples processed per call
+  @return        none
+
+  @par           Details
+                   <code>pCoeffs</code> points to the array of filter coefficients stored in time reversed order:
+  <pre>
+      {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
+  </pre>
+  @par
+                   <code>pState</code> points to the array of state variables.
+                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f16()</code>.
+  @par          Initialization of Helium version
+                 For the Helium version, the length of the coefficient array must be a multiple of 4 (4a),
+                 even if fewer than 4a coefficients are defined in the FIR. The additional coefficients
+                 (4a - numTaps) must be set to 0.
+                 numTaps is still set to its actual value in the init function. The padding is needed because
+                 the implementation may read more coefficients than numTaps due to the vectorization, which
+                 avoids having to manage too many different cases in the code.
+
+
+  @par          Helium state buffer
+                 The state buffer must contain some additional temporary data
+                 used during the computation but which is not the state of the FIR.
+                 The first 8*ceil(blockSize/8) samples are temporary data.
+                 The remaining samples are the state of the FIR filter.
+                 So the state buffer has size <code> numTaps + 8*ceil(blockSize/8) + blockSize - 1 </code>
+
+ */
+
+void arm_fir_init_f16(
+        arm_fir_instance_f16 * S,
+        uint16_t numTaps,
+  const float16_t * pCoeffs,
+        float16_t * pState,
+        uint32_t blockSize)
+{
+  /* Assign filter taps */
+  S->numTaps = numTaps;
+
+  /* Assign coefficient pointer */
+  S->pCoeffs = pCoeffs;
+
+  /* Clear state buffer: (blockSize + numTaps - 1) samples, plus ROUND_UP(blockSize, 8) temporary samples for Helium */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+  memset(pState, 0, (numTaps + (blockSize - 1U) + ROUND_UP(blockSize, 8)) * sizeof(float16_t));
+#else
+  memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float16_t));
+#endif 
+
+  /* Assign state pointer */
+  S->pState = pState;
+}
+
+/**
+  @} end of FIR group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f32.c
index 2405c3f6a74aec24a511f61d252572a059c39dbd..e0a58755bbe014ea25aa08522f399dafb21e46b5 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_init_f32.c
  * Description:  Floating-point FIR filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -52,16 +52,23 @@
       {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
   </pre>
   @par
-                   <code>pState</code> points to the array of state variables.
-                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f32()</code>.
+                   <code>pState</code> points to the array of state variables and some working memory for the Helium version.
+                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f32()</code>.
   @par          Initialization of Helium version
-                 For Helium version the array of coefficients must be a multiple of 16 even if less
-                 then 16 coefficients are used. The additional coefficients must be set to 0.
-                 It does not mean that all the coefficients will be used in the filter (numTaps
-                 is still set to its right value in the init function.) It just means that
+                 For the Helium version, the length of the coefficient array must be a multiple of 4 (4a),
+                 even if fewer than 4a coefficients are defined in the FIR. The additional coefficients
+                 (4a - numTaps) must be set to 0.
+                 numTaps is still set to its actual value in the init function. It means that
                  the implementation may require to read more coefficients due to the vectorization and
                  to avoid having to manage too many different cases in the code.
 
+  @par          Helium state buffer
+                 The state buffer must contain some additional temporary data
+                 used during the computation but which is not the state of the FIR.
+                 The first blockSize samples are temporary data.
+                 The remaining samples are the state of the FIR filter.
+                 So the state buffer has size <code> numTaps + 2 * blockSize - 1 </code>
+
  */
 
 void arm_fir_init_f32(
@@ -78,8 +85,11 @@ void arm_fir_init_f32(
   S->pCoeffs = pCoeffs;
 
   /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+  memset(pState, 0, (numTaps + (blockSize - 1U) + blockSize) * sizeof(float32_t));
+#else
   memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float32_t));
-
+#endif
   /* Assign state pointer */
   S->pState = pState;
 }
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q15.c
index a5b2d06ad0daa58f9424c20dc4ac42d796ea783e..3682fb09f53c285727add67eea634305f67e29bb 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_init_q15.c
  * Description:  Q15 FIR filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -73,6 +73,14 @@
   </pre>
                    <code>pState</code> points to the array of state variables.
                    <code>pState</code> is of length <code>numTaps+blockSize</code>, when running on Cortex-M4 and Cortex-M3  and is of length <code>numTaps+blockSize-1</code>, when running on Cortex-M0 where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_q15()</code>.
+ 
+  @par          Initialization of Helium version
+                   For the Helium version, the length of the coefficient array must be a multiple of 8 (8a)
+                   even if fewer than 8a coefficients are defined in the FIR. The additional coefficients
+                   (8a - numTaps) must be set to 0.
+                   numTaps is still set to its actual value in the init function. It only means that
+                   the implementation may need to read more coefficients because of the vectorization and
+                   to avoid having to manage too many different cases in the code.
  */
 
 arm_status arm_fir_init_q15(
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q31.c
index 7d8376f80089ce2ce3642167f5941caf68aee94d..96d1fdde36f34cc35422ad24f472a2c76a1398f9 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_init_q31.c
  * Description:  Q31 FIR filter initialization function.
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -52,7 +52,23 @@
       {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
   </pre>
                    <code>pState</code> points to the array of state variables.
-                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_q31()</code>.
+                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_q31()</code>.
+
+   @par          Initialization of Helium version
+                   For the Helium version, the length of the coefficient array must be a multiple of 4 (4a)
+                   even if fewer than 4a coefficients are defined in the FIR. The additional coefficients
+                   (4a - numTaps) must be set to 0.
+                   numTaps is still set to its actual value in the init function. It only means that
+                   the implementation may need to read more coefficients because of the vectorization and
+                   to avoid having to manage too many different cases in the code.
+  
+    @par          Helium state buffer
+                   The state buffer must also hold some temporary data that is
+                   used during the computation but is not part of the state of the FIR.
+                   The first 2*4*ceil(blockSize/4) samples are temporary data.
+                   The remaining samples are the state of the FIR filter.
+                   So the state buffer has size <code> numTaps + 8*ceil(blockSize/4) + blockSize - 1 </code>
+  
  */
 
 void arm_fir_init_q31(
@@ -69,7 +85,11 @@ void arm_fir_init_q31(
   S->pCoeffs = pCoeffs;
 
   /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
+  #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+  memset(pState, 0, (numTaps + (blockSize - 1U) + 2*ROUND_UP(blockSize, 4)) * sizeof(q31_t));
+  #else
   memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q31_t));
+  #endif
 
   /* Assign state pointer */
   S->pState = pState;
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q7.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q7.c
index f96d25090793b7901f1c1d8e3b930a1487f93117..1ad05a5575cb163ea32c215a357730222b7b8ed8 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q7.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_init_q7.c
  * Description:  Q7 FIR filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -54,6 +54,15 @@
   @par
                    <code>pState</code> points to the array of state variables.
                    <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_q7()</code>.
+  
+  @par          Initialization of Helium version
+                   For the Helium version, the length of the coefficient array must be a multiple of 16 (16a)
+                   even if fewer than 16a coefficients are defined in the FIR. The additional coefficients
+                   (16a - numTaps) must be set to 0.
+                   numTaps is still set to its actual value in the init function. It only means that
+                   the implementation may need to read more coefficients because of the vectorization and
+                   to avoid having to manage too many different cases in the code.
+
  */
 
 void arm_fir_init_q7(
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c
index f8582ef7012b8dc5ae1e36aa0bb3b3ed0e60ad66..b6a6ecb7cd62b91c0c175af6d6513c9f4ebc93a5 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_interpolate_f32.c
  * Description:  Floating-point FIR interpolation sequences
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @defgroup FIR_Interpolate Finite Impulse Response (FIR) Interpolator
@@ -147,7 +147,7 @@ static void arm_fir_interpolate2_f32_mve(
     uint32_t  blkCnt;           /* Loop counters */
     uint16_t  phaseLen = S->phaseLength;    /* Length of each polyphase filter component */
     uint32_t  strides[4] = { 0, 1 * 2, 2 * 2, 3 * 2 };
-    uint32x4_t vec_strides0 = *(uint32x4_t *) strides;
+    uint32x4_t vec_strides0 = vld1q_u32(strides);
     uint32x4_t vec_strides1 = vec_strides0 + 1;
     f32x4_t acc0, acc1;
 
@@ -271,8 +271,8 @@ void arm_fir_interpolate_f32(
     uint16_t  phaseLen = S->phaseLength;    /* Length of each polyphase filter component */
     uint32_t  strides[4] = { 0, 1 * S->L, 2 * S->L, 3 * S->L };
     uint32_t  stridesM[4] = { 4, 3, 2, 1 };
-    uint32x4_t vec_stridesM = *(uint32x4_t *) stridesM;
-    uint32x4_t vec_strides = *(uint32x4_t *) strides;
+    uint32x4_t vec_stridesM =  vld1q_u32(stridesM);
+    uint32x4_t vec_strides =  vld1q_u32(strides);
     f32x4_t acc;
 
 
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c
index 287d3471ba8abd282e29a084baf34b3d19a2ce83..67bdff9c0a1e6db26fd379f1ef782452d2d20a10 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_interpolate_init_f32.c
  * Description:  Floating-point FIR interpolator initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c
index 7f43bbfa0d0a3cbb0c6053193c468ef74a408cd7..7436aa434bb4c77dee6c7cb9c956b14a2cb7003d 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_interpolate_init_q15.c
  * Description:  Q15 FIR interpolator initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c
index 973e715942625f533bd9dcd1c833c7b9a477f2bb..15f9e0d3aca76e6d90b28057862aaf81230b5472 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_interpolate_init_q31.c
  * Description:  Q31 FIR interpolator initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c
index a88979376450791d1280c8ffe56c7320ce265df2..523e1557fe8225b830d323351971d606040dc150 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_interpolate_q15.c
  * Description:  Q15 FIR interpolation
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -54,7 +54,7 @@
                    Lastly, the accumulator is saturated to yield a result in 1.15 format.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 void arm_fir_interpolate_q15(
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c
index 826b156d154e80d78dbfcfac047ab82540a7a1be..ea217603ecd2d8bc37ce98ab7d1139e3cf5cd928 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_interpolate_q31.c
  * Description:  Q31 FIR interpolation
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -54,7 +54,7 @@
                    After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 void arm_fir_interpolate_q31(
@@ -71,7 +71,7 @@ void arm_fir_interpolate_q31(
     uint32_t  i, blkCnt;        /* Loop counters */
     uint16_t  phaseLen = S->phaseLength;    /* Length of each polyphase filter component */
     uint32_t  strides[4] = { 0, 1 * S->L, 2 * S->L, 3 * S->L };
-    uint32x4_t vec_strides0 = *(uint32x4_t *) strides;
+    uint32x4_t vec_strides0 =  vld1q_u32(strides);
     uint32x4_t vec_strides1 = vec_strides0 + 1;
     uint32x4_t vec_strides2 = vec_strides0 + 2;
     uint32x4_t vec_strides3 = vec_strides0 + 3;
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c
index a3d95c1ae53841c4f7dca907c2c19bfce29e6a7d..77ea75c15be7033b0b4c523ad285087e9c3366ca 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_lattice_f32.c
  * Description:  Processing function for floating-point FIR Lattice filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c
index 7929629a4699bed193796d7251e5b4e71d0eed36..a3a0c224cd192486e3d0bb87c8789cd68b074458 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_lattice_init_f32.c
  * Description:  Floating-point FIR Lattice filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c
index 5c80dff604e49790d4404c28770159af4b231af9..3996d484668fbbc5599df465f6707ac7f736b3b6 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_lattice_init_q15.c
  * Description:  Q15 FIR Lattice filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c
index 476296d60babc92b085f58c992c0daa71cdf11a3..4a91b590434979f87c872cc4e816efb42522df58 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_lattice_init_q31.c
  * Description:  Q31 FIR lattice filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c
index 42e7c0d48ae607ad9831f43cda97ef4ea2d71efe..14b7076d6c544ab116d046f6e10a8ed879364f99 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_lattice_q15.c
  * Description:  Q15 FIR lattice filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c
index c8d28d7c7d79582c576b97626cf5feed422f3441..3b8355185acd3681ad418e5306ed6b931681cb8e 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_lattice_q31.c
  * Description:  Q31 FIR lattice filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q15.c
index 6a184a694e2e49bab185d9177953b8293a9b7552..f00959b8e0ae8fa4557a5af8a99dd56213da9248 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_q15.c
  * Description:  Q15 FIR filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -56,11 +56,151 @@
   @remark
                    Refer to \ref arm_fir_fast_q15() for a faster but less precise implementation of this function.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #define MVE_ASRL_SAT16(acc, shift)          ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
 
-static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pSrc, q15_t * pDst, uint32_t blockSize)
+
+#define FIR_Q15_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs)        \
+        for (int j = 0; j < nbAcc; j++) { /* one output sample per pass */ \
+            const q15_t    *pSmp = &pSample[j];                            \
+            q63_t           acc[4];                                        \
+            /* vecIn0 must be declared by the code expanding this macro */ \
+            acc[j] = 0;                                                    \
+            for (int i = 0; i < nbVecTaps; i++) {                          \
+                vecIn0 = vld1q(pSmp + 8 * i); /* next 8 samples */         \
+                acc[j] = vmlaldavaq(acc[j], vecIn0, vecCoeffs[i]);         \
+            }                                                              \
+            *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc[j], 15);               \
+        }
+
+#define FIR_Q15_MAIN_CORE()                                                                  \
+{ /* Expects S, pSrc, pDst, blockSize and NBVECTAPS in the expanding scope. */               \
+    q15_t          *pState = S->pState;     /* State pointer */                              \
+    const q15_t    *pCoeffs = S->pCoeffs;   /* Coefficient pointer */                        \
+    q15_t          *pStateCur;              /* Points to the current sample of the state */  \
+    const q15_t    *pSamples;               /* Temporary pointer to the sample buffer */     \
+    q15_t          *pOutput;                /* Temporary pointer to the output buffer */     \
+    const q15_t    *pTempSrc;               /* Temporary pointer to the source data */       \
+    q15_t          *pTempDest;              /* Temporary pointer to the destination buffer */\
+    uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients in the filter */\
+    int32_t         blkCnt;                                                                  \
+    q15x8_t         vecIn0;                                                                  \
+                                                                                             \
+    /*                                                                                       \
+     * Load NBVECTAPS vectors of 8 coefficients (8 * NBVECTAPS values read from pCoeffs)     \
+     */                                                                                      \
+    q15x8_t         vecCoeffs[NBVECTAPS];                                                    \
+                                                                                             \
+    for (int i = 0; i < NBVECTAPS; i++)                                                      \
+        vecCoeffs[i] = vldrhq_s16(pCoeffs + 8 * i);                                          \
+                                                                                             \
+    /*                                                                                       \
+     * pState points to state array which contains previous frame (numTaps - 1) samples      \
+     * pStateCur points to the location where the new input data should be written           \
+     */                                                                                      \
+    pStateCur = &(pState[(numTaps - 1u)]);                                                   \
+    pTempSrc = pSrc;                                                                         \
+    pSamples = pState;                                                                       \
+    pOutput = pDst;                                                                          \
+                                                                                             \
+    blkCnt = blockSize >> 2;                                                                 \
+    while (blkCnt > 0) {                                                                     \
+        /*                                                                                   \
+         * Save 4 input samples in the history buffer                                        \
+         */                                                                                  \
+        vstrhq_s32(pStateCur, vldrhq_s32(pTempSrc));                                         \
+        pStateCur += 4;                                                                      \
+        pTempSrc += 4;                                                                       \
+                                                                                             \
+        FIR_Q15_CORE(pOutput, 4, NBVECTAPS, pSamples, vecCoeffs);                            \
+        pSamples += 4;                                                                       \
+                                                                                             \
+        blkCnt--;                                                                            \
+    }                                                                                        \
+                                                                                             \
+    /* tail: the remaining (blockSize & 3) samples, copied one by one */                     \
+    int32_t        residual = blockSize & 3;                                                \
+                                                                                             \
+    for (int i = 0; i < residual; i++)                                                       \
+        *pStateCur++ = *pTempSrc++;                                                          \
+                                                                                             \
+    FIR_Q15_CORE(pOutput, residual, NBVECTAPS, pSamples, vecCoeffs);                         \
+                                                                                             \
+    /*                                                                                       \
+     * Move the (numTaps - 1) samples starting at pState[blockSize] to the buffer start      \
+     */                                                                                      \
+    pTempSrc = &pState[blockSize];                                                           \
+    pTempDest = pState;                                                                      \
+                                                                                             \
+    /* current compiler limitation */                                                        \
+    blkCnt = (numTaps - 1) >> 3;                                                             \
+    while (blkCnt > 0)                                                                       \
+    {                                                                                        \
+        vstrhq_s16(pTempDest, vldrhq_s16(pTempSrc));                                         \
+        pTempSrc += 8;                                                                       \
+        pTempDest += 8;                                                                      \
+        blkCnt--;                                                                            \
+    }                                                                                        \
+    blkCnt = (numTaps - 1) & 7;                                                              \
+    if (blkCnt > 0)                                                                          \
+    {                                                                                        \
+        mve_pred16_t p = vctp16q(blkCnt);                                                    \
+        vstrhq_p_s16(pTempDest, vldrhq_z_s16(pTempSrc, p), p);                               \
+    }                                                                                        \
+}
+    
+static void arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S, 
+  const q15_t * __restrict pSrc,
+  q15_t * __restrict pDst, uint32_t blockSize)
+{   /* Q15 FIR variant called by arm_fir_q15() when (numTaps + 7) >> 3 == 4. */
+    #define NBTAPS 32               /* taps this variant is built for */
+    #define NBVECTAPS (NBTAPS / 8)  /* number of 8-lane coefficient vectors */
+    FIR_Q15_MAIN_CORE();            /* shared body; loads 8 * NBVECTAPS coefficients */
+    #undef NBVECTAPS
+    #undef NBTAPS
+}
+
+static void arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S, 
+  const q15_t * __restrict pSrc,
+  q15_t * __restrict pDst, uint32_t blockSize)
+{   /* Q15 FIR variant called by arm_fir_q15() when (numTaps + 7) >> 3 == 3. */
+    #define NBTAPS 24               /* taps this variant is built for */
+    #define NBVECTAPS (NBTAPS / 8)  /* number of 8-lane coefficient vectors */
+    FIR_Q15_MAIN_CORE();            /* shared body; loads 8 * NBVECTAPS coefficients */
+    #undef NBVECTAPS
+    #undef NBTAPS
+}
+
+
+static void arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S, 
+  const q15_t * __restrict pSrc,
+  q15_t * __restrict pDst, uint32_t blockSize)
+{   /* Q15 FIR variant called by arm_fir_q15() when (numTaps + 7) >> 3 == 2. */
+    #define NBTAPS 16               /* taps this variant is built for */
+    #define NBVECTAPS (NBTAPS / 8)  /* number of 8-lane coefficient vectors */
+    FIR_Q15_MAIN_CORE();            /* shared body; loads 8 * NBVECTAPS coefficients */
+    #undef NBVECTAPS
+    #undef NBTAPS
+}
+
+static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, 
+  const q15_t * __restrict pSrc, 
+  q15_t * __restrict pDst, uint32_t blockSize)
+{   /* Q15 FIR variant called by arm_fir_q15() when (numTaps + 7) >> 3 == 1. */
+    #define NBTAPS 8                /* taps this variant is built for */
+    #define NBVECTAPS (NBTAPS / 8)  /* number of 8-lane coefficient vectors */
+    FIR_Q15_MAIN_CORE();            /* shared body; loads 8 * NBVECTAPS coefficients */
+    #undef NBVECTAPS
+    #undef NBTAPS
+}
+
+
+void arm_fir_q15(
+  const arm_fir_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
 {
     q15_t    *pState = S->pState;   /* State pointer */
     const q15_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
@@ -72,46 +212,81 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS
     uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
     uint32_t  blkCnt;
     q15x8_t vecIn0;
-    /*
-     * load 8 coefs
-     */
-    q15x8_t vecCoeffs = *(q15x8_t *) pCoeffs;
+    uint32_t  tapsBlkCnt = (numTaps + 7) / 8;
+    q63_t     acc0, acc1, acc2, acc3;
+
+
+int32_t nbTaps = (numTaps + 7) >> 3;
 
+switch(nbTaps) {
+
+    case 1:
+        arm_fir_q15_1_8_mve(S, pSrc, pDst, blockSize);
+        return;
+    case 2:
+        arm_fir_q15_9_16_mve(S, pSrc, pDst, blockSize);
+        return;
+    case 3:
+        arm_fir_q15_17_24_mve(S, pSrc, pDst, blockSize);
+        return;
+    case 4:
+        arm_fir_q15_25_32_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
     /*
      * pState points to state array which contains previous frame (numTaps - 1) samples
      * pStateCur points to the location where the new input data should be written
      */
-    pStateCur = &(pState[(numTaps - 1u)]);
-    pTempSrc = pSrc;
-    pSamples = pState;
-    pOutput = pDst;
-
-    q63_t     acc0, acc1, acc2, acc3;
-
-    blkCnt = blockSize >> 2;
+    pStateCur   = &(pState[(numTaps - 1u)]);
+    pTempSrc    = pSrc;
+    pSamples    = pState;
+    pOutput     = pDst;
+    blkCnt      = blockSize >> 2;
 
     while (blkCnt > 0U)
     {
+        const q15_t    *pCoeffsTmp = pCoeffs;
         const q15_t    *pSamplesTmp = pSamples;
 
+        acc0 = 0LL;
+        acc1 = 0LL;
+        acc2 = 0LL;
+        acc3 = 0LL;
+
         /*
-         * Save 4 input samples in the history buffer
+         * Save 8 input samples in the history buffer
          */
         vst1q(pStateCur, vld1q(pTempSrc));
         pStateCur += 8;
         pTempSrc += 8;
 
-        vecIn0 = vld1q(pSamplesTmp);
-        acc0 = vmlaldavq(vecIn0, vecCoeffs);
+        int       i = tapsBlkCnt;
+        while (i > 0)
+        {
+            /*
+             * load 8 coefs
+             */
+            q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
+
+            vecIn0 = vld1q(pSamplesTmp);
+            acc0 =  vmlaldavaq(acc0, vecIn0, vecCoeffs);
+
+            vecIn0 = vld1q(&pSamplesTmp[1]);
+            acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
 
-        vecIn0 = vld1q(&pSamplesTmp[1]);
-        acc1 = vmlaldavq(vecIn0, vecCoeffs);
+            vecIn0 = vld1q(&pSamplesTmp[2]);
+            acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
 
-        vecIn0 = vld1q(&pSamplesTmp[2]);
-        acc2 = vmlaldavq(vecIn0, vecCoeffs);
+            vecIn0 = vld1q(&pSamplesTmp[3]);
+            acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs);
 
-        vecIn0 = vld1q(&pSamplesTmp[3]);
-        acc3 = vmlaldavq(vecIn0, vecCoeffs);
+            pSamplesTmp += 8;
+            pCoeffsTmp += 8;
+            /*
+             * Decrement the taps block loop counter
+             */
+            i--;
+        }
 
         *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
         *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
@@ -130,6 +305,7 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS
     {
     case 3:
         {
+            const q15_t    *pCoeffsTmp = pCoeffs;
             const q15_t    *pSamplesTmp = pSamples;
 
             acc0 = 0LL;
@@ -137,20 +313,40 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS
             acc2 = 0LL;
 
             /*
-             * Save 4 input samples in the history buffer
+             * Save 8 input samples in the history buffer
              */
             *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc;
             pStateCur += 8;
             pTempSrc += 8;
 
-            vecIn0 = vld1q(pSamplesTmp);
-            acc0 = vmlaldavq(vecIn0, vecCoeffs);
+            int       i = tapsBlkCnt;
+            while (i > 0)
+            {
+                /*
+                 * load 8 coefs
+                 */
+                q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
 
-            vecIn0 = vld1q(&pSamplesTmp[1]);
-            acc1 = vmlaldavq(vecIn0, vecCoeffs);
+                vecIn0 = vld1q(pSamplesTmp);
+                acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
 
-            vecIn0 = vld1q(&pSamplesTmp[2]);
-            acc2 = vmlaldavq(vecIn0, vecCoeffs);
+                vecIn0 = vld1q(&pSamplesTmp[1]);
+                acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
+                /* output k reads the window that starts at sample k, as in the 4-output loop */
+                vecIn0 = vld1q(&pSamplesTmp[2]);
+                acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
+
+                pSamplesTmp += 8;
+                pCoeffsTmp += 8;
+                /*
+                 * Decrement the taps block loop counter
+                 */
+                i--;
+            }
+
+            /*
+             * No extra shift here: MVE_ASRL_SAT16(acc, 15) below already scales each sum once.
+             */
 
             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
@@ -160,23 +356,39 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS
 
     case 2:
         {
+            const q15_t    *pCoeffsTmp = pCoeffs;
             const q15_t    *pSamplesTmp = pSamples;
 
             acc0 = 0LL;
             acc1 = 0LL;
-
             /*
-             * Save 4 input samples in the history buffer
+             * Save 8 input samples in the history buffer
              */
             vst1q(pStateCur, vld1q(pTempSrc));
             pStateCur += 8;
             pTempSrc += 8;
 
-            vecIn0 = vld1q(pSamplesTmp);
-            acc0 = vmlaldavq(vecIn0, vecCoeffs);
+            int       i = tapsBlkCnt;
+            while (i > 0)
+            {
+                /*
+                 * load 8 coefs
+                 */
+                q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
 
-            vecIn0 = vld1q(&pSamplesTmp[1]);
-            acc1 = vmlaldavq(vecIn0, vecCoeffs);
+                vecIn0 = vld1q(pSamplesTmp);
+                acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
+                /* second output reads the window that starts one sample later */
+                vecIn0 = vld1q(&pSamplesTmp[1]);
+                acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
+
+                pSamplesTmp += 8;
+                pCoeffsTmp += 8;
+                /*
+                 * Decrement the taps block loop counter
+                 */
+                i--;
+            }
 
             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
@@ -185,126 +397,29 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS
 
     case 1:
         {
+            const q15_t    *pCoeffsTmp = pCoeffs;
             const q15_t    *pSamplesTmp = pSamples;
 
             acc0 = 0LL;
 
-            /*
-             * Save 4 input samples in the history buffer
-             */
-            vst1q(pStateCur, vld1q(pTempSrc));
-            pStateCur += 8;
-            pTempSrc += 8;
-
-            vecIn0 = vld1q(pSamplesTmp);
-            acc0 = vmlaldavq(vecIn0, vecCoeffs);
-
-            pSamplesTmp += 4;
-
-            *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
-        }
-        break;
-    }
-
-    /*
-     * Copy the samples back into the history buffer start
-     */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
-
-    blkCnt = numTaps >> 3;
-    while (blkCnt > 0U)
-    {
-        vst1q(pTempDest, vld1q(pTempSrc));
-        pTempSrc += 8;
-        pTempDest += 8;
-        blkCnt--;
-    }
-    blkCnt = numTaps & 7;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp16q(blkCnt);
-        vstrhq_p_s16(pTempDest, vld1q(pTempSrc), p0);
-    }
-}
-
-void arm_fir_q15(
-  const arm_fir_instance_q15 * S,
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize)
-{
-    q15_t    *pState = S->pState;   /* State pointer */
-    const q15_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
-    q15_t    *pStateCur;        /* Points to the current sample of the state */
-    const q15_t    *pSamples;         /* Temporary pointer to the sample buffer */
-    q15_t    *pOutput;          /* Temporary pointer to the output buffer */
-    const q15_t    *pTempSrc;         /* Temporary pointer to the source data */
-    q15_t    *pTempDest;        /* Temporary pointer to the destination buffer */
-    uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
-    uint32_t  blkCnt;
-    q15x8_t vecIn0;
-    uint32_t  tapsBlkCnt = (numTaps + 7) / 8;
-    q63_t     acc0, acc1, acc2, acc3;
-
-    if (blockSize >= 12)
-    {
-       if(numTaps <= 8) {
-           /* [1 to 8 taps] specialized routine */
-           arm_fir_q15_1_8_mve(S,pSrc, pDst, blockSize);
-           return;
-       }
-    }
-
-    if (blockSize >= 12)
-    {
-        /*
-         * pState points to state array which contains previous frame (numTaps - 1) samples
-         * pStateCur points to the location where the new input data should be written
-         */
-        pStateCur   = &(pState[(numTaps - 1u)]);
-        pTempSrc    = pSrc;
-        pSamples    = pState;
-        pOutput     = pDst;
-        blkCnt      = blockSize >> 2;
-    
-        while (blkCnt > 0U)
-        {
-            const q15_t    *pCoeffsTmp = pCoeffs;
-            const q15_t    *pSamplesTmp = pSamples;
-    
-            acc0 = 0LL;
-            acc1 = 0LL;
-            acc2 = 0LL;
-            acc3 = 0LL;
-    
             /*
              * Save 8 input samples in the history buffer
              */
             vst1q(pStateCur, vld1q(pTempSrc));
             pStateCur += 8;
             pTempSrc += 8;
-    
-            uint32_t       i = tapsBlkCnt;
-            while (i > 0U)
+
+            int       i = tapsBlkCnt;
+            while (i > 0)
             {
                 /*
                  * load 8 coefs
                  */
                 q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
-    
+
                 vecIn0 = vld1q(pSamplesTmp);
-                acc0 =  vmlaldavaq(acc0, vecIn0, vecCoeffs);
-    
-                vecIn0 = vld1q(&pSamplesTmp[1]);
-                acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
-    
-                vecIn0 = vld1q(&pSamplesTmp[2]);
-                acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
-    
-                vecIn0 = vld1q(&pSamplesTmp[3]);
-                acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs);
-    
+                acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
+
                 pSamplesTmp += 8;
                 pCoeffsTmp += 8;
                 /*
@@ -312,197 +427,17 @@ void arm_fir_q15(
                  */
                 i--;
             }
-    
+
             *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
-            *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
-            *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
-            *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc3, 15);
-    
-            pSamples += 4;
-            /*
-             * Decrement the sample block loop counter
-             */
-            blkCnt--;
         }
-    
-        uint32_t  residual = blockSize & 3;
-        switch (residual)
-        {
-        case 3:
-            {
-                const q15_t    *pCoeffsTmp = pCoeffs;
-                const q15_t    *pSamplesTmp = pSamples;
-    
-                acc0 = 0LL;
-                acc1 = 0LL;
-                acc2 = 0LL;
-    
-                /*
-                 * Save 8 input samples in the history buffer
-                 */
-                *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc;
-                pStateCur += 8;
-                pTempSrc += 8;
-    
-                uint32_t       i = tapsBlkCnt;
-                while (i > 0U)
-                {
-                    /*
-                     * load 8 coefs
-                     */
-                    q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
-    
-                    vecIn0 = vld1q(pSamplesTmp);
-                    acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
-    
-                    vecIn0 = vld1q(&pSamplesTmp[1]);
-                    acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
-    
-                    vecIn0 = vld1q(&pSamplesTmp[2]);
-                    acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
-    
-                    pSamplesTmp += 8;
-                    pCoeffsTmp += 8;
-                    /*
-                     * Decrement the taps block loop counter
-                     */
-                    i--;
-                }
-    
-              
-                *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
-                *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
-                *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
-            }
-            break;
-    
-        case 2:
-            {
-                const q15_t    *pCoeffsTmp = pCoeffs;
-                const q15_t    *pSamplesTmp = pSamples;
-    
-                acc0 = 0LL;
-                acc1 = 0LL;
-                /*
-                 * Save 8 input samples in the history buffer
-                 */
-                vst1q(pStateCur, vld1q(pTempSrc));
-                pStateCur += 8;
-                pTempSrc += 8;
-    
-                uint32_t       i = tapsBlkCnt;
-                while (i > 0U)
-                {
-                    /*
-                     * load 8 coefs
-                     */
-                    q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
-    
-                    vecIn0 = vld1q(pSamplesTmp);
-                    acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
-    
-                    vecIn0 = vld1q(&pSamplesTmp[1]);
-                    acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
-    
-                    pSamplesTmp += 8;
-                    pCoeffsTmp += 8;
-                    /*
-                     * Decrement the taps block loop counter
-                     */
-                    i--;
-                }
-    
-                *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
-                *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
-            }
-            break;
-    
-        case 1:
-            {
-                const q15_t    *pCoeffsTmp = pCoeffs;
-                const q15_t    *pSamplesTmp = pSamples;
-    
-                acc0 = 0LL;
-    
-                /*
-                 * Save 8 input samples in the history buffer
-                 */
-                vst1q(pStateCur, vld1q(pTempSrc));
-                pStateCur += 8;
-                pTempSrc += 8;
-    
-                uint32_t       i = tapsBlkCnt;
-                while (i > 0U)
-                {
-                    /*
-                     * load 8 coefs
-                     */
-                    q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
-    
-                    vecIn0 = vld1q(pSamplesTmp);
-                    acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
-    
-                    pSamplesTmp += 8;
-                    pCoeffsTmp += 8;
-                    /*
-                     * Decrement the taps block loop counter
-                     */
-                    i--;
-                }
-    
-                *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
-            }
-            break;
-        }
-    }
-    else
-    {
-        q15_t *pStateCurnt;                            /* Points to the current sample of the state */
-            q15_t *px;                                     /* Temporary pointer for state buffer */
-      const q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
-            q63_t acc0;                                    /* Accumulator */
-            uint32_t  blkCnt,tapCnt;                    /* Loop counters */
-      pStateCurnt = &(S->pState[(numTaps - 1U)]);
-      blkCnt = blockSize;
-      while (blkCnt > 0U)
-      {
-        /* Copy two samples into state buffer */
-        *pStateCurnt++ = *pSrc++;
-    
-        /* Set the accumulator to zero */
-        acc0 = 0;
-    
-        /* Use SIMD to hold states and coefficients */
-        px = pState;
-        pb = pCoeffs;
-    
-        tapCnt = numTaps >> 1U;
-    
-        while (tapCnt > 0U)
-        {
-          acc0 += (q15_t) *px++ * *pb++;
-          acc0 += (q15_t) *px++ * *pb++;
-    
-          tapCnt--;
-        }
-        
-    
-        /* The result is in 2.30 format. Convert to 1.15 with saturation.
-           Then store the output in the destination buffer. */
-        *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
-    
-        /* Advance state pointer by 1 for the next sample */
-        pState = pState + 1U;
-    
-        /* Decrement loop counter */
-        blkCnt--;
-      }
+        break;
     }
+
     /*
      * Copy the samples back into the history buffer start
      */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
 
     blkCnt = numTaps >> 3;
     while (blkCnt > 0U)
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q31.c
index e236877abaff642e6cf1723eba4788cdaa92de87..40fe527094adfde40f39f0965f794871b4b22039 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_q31.c
  * Description:  Q31 FIR filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,8 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
+
 
 /**
   @ingroup groupFilters
@@ -55,15 +56,162 @@
  @remark
                    Refer to \ref arm_fir_fast_q31() for a faster but less precise implementation of this filter.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
-                                        
 
 
-static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pSrc, q31_t * pDst, uint32_t blockSize)
+#define FIR_Q31_CORE(nbAcc, nbVecTaps, pSample, vecCoeffs)                 \
+        for (int j = 0; j < (nbAcc); j++) { /* one output sample per j */  \
+            const q31_t    *pSmp = &(pSample)[j]; /* window for output j; now taken from the argument instead of a captured pSamples */ \
+            q31x4_t         vecIn0;                                        \
+            q63_t           acc[4];    /* only slot j is used in a pass */ \
+            /* pOutput is not a parameter: it must exist where the macro is expanded */ \
+            acc[j] = 0;                                                    \
+            for (int i = 0; i < (nbVecTaps); i++) { /* 4 q31 taps a step */ \
+                vecIn0 = vld1q(pSmp + 4 * i);                              \
+                acc[j] = vrmlaldavhaq(acc[j], vecIn0, (vecCoeffs)[i]);     \
+            }                                                              \
+            *pOutput++ = (q31_t)asrl(acc[j], 23); /* shift right by 23, narrow to q31, store */ \
+        }
+
+
+#define FIR_Q31_CORE_STR_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs)     \
+        for (int j = 0; j < nbAcc; j++) { /* one accumulator per j */      \
+            const q31_t    *pSmp = &pSamples[j]; /* NOTE(review): reads pSamples from the expansion site; the pSample argument is never referenced */ \
+            q31x4_t         vecIn0;                                        \
+            /* acc[] and arm_fir_partial_accu_ptr are not declared here; they must exist where the macro is expanded */ \
+            acc[j] = 0;                                                    \
+            for (int i = 0; i < nbVecTaps; i++) { /* 4 q31 taps a step */  \
+                vecIn0 = vld1q(pSmp + 4 * i);                  \
+                acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]);       \
+            }                                                              \
+            *arm_fir_partial_accu_ptr++ = acc[j]; /* store the unshifted 64-bit sum instead of writing an output sample */ \
+        }
+
+
+#define FIR_Q31_CORE_LD_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs)      \
+        for (int j = 0; j < nbAcc; j++) { /* one output sample per j */    \
+            const q31_t    *pSmp = &pSamples[j]; /* NOTE(review): reads pSamples from the expansion site; the pSample argument is never referenced */ \
+            q31x4_t         vecIn0;                                        \
+            /* acc[], arm_fir_partial_accu_ptr and pOutput must exist where the macro is expanded */ \
+            acc[j] = *arm_fir_partial_accu_ptr++; /* start from a previously stored sum rather than zero */ \
+                                                                           \
+            for (int i = 0; i < nbVecTaps; i++) { /* 4 q31 taps a step */  \
+                vecIn0 = vld1q(pSmp + 4 * i);                  \
+                acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]);       \
+            }                                                              \
+            *pOutput++ = (q31_t)asrl(acc[j], 23); /* shift right by 23, narrow to q31, store */ \
+        }
+
+                      
+#define FIR_Q31_MAIN_CORE()                                                              \
+{   /* Expands to the full body of a fixed-size Q31 FIR routine; S, pSrc, pDst, blockSize and NBVECTAPS must exist where it is expanded. */ \
+    q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);                          \
+    q31_t      *pState = pRefStatePtr; /* State pointer */                               \
+    const q31_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */                        \
+    q31_t       *pStateCur;             /* Points to the current sample of the state */  \
+    const q31_t *pSamples;              /* Temporary pointer to the sample buffer */     \
+    q31_t       *pOutput;               /* Temporary pointer to the output buffer */     \
+    const q31_t *pTempSrc;              /* Temporary pointer to the source data */       \
+    q31_t       *pTempDest;             /* Temporary pointer to the destination buffer */\
+    uint32_t     numTaps = S->numTaps;  /* Number of filter coefficients in the filter */\
+    int32_t      blkCnt;                                                                 \
+    /* History for this routine starts 2*ROUND_UP(blockSize, 4) elements into S->pState. NOTE(review): the use of the skipped elements is not visible here. */ \
+    /*                                                                                   \
+     * load coefs: NBVECTAPS vectors of 4 q31 coefficients, read once up front           \
+     */                                                                                  \
+    q31x4_t         vecCoeffs[NBVECTAPS];                                                \
+                                                                                         \
+    for (int i = 0; i < NBVECTAPS; i++)                                                  \
+        vecCoeffs[i] = vld1q(pCoeffs + 4 * i);                                           \
+                                                                                         \
+    /*                                                                                   \
+     * pState points to state array which contains previous frame (numTaps - 1) samples  \
+     * pStateCur points to the location where the new input data should be written       \
+     */                                                                                  \
+    pStateCur = &(pState[(numTaps - 1u)]);                                               \
+    pTempSrc = pSrc;                                                                     \
+    pSamples = pState;                                                                   \
+    pOutput = pDst;                                                                      \
+                                                                                         \
+    blkCnt = blockSize >> 2;                                                             \
+    while (blkCnt > 0) {                                                                 \
+        /*                                                                               \
+         * Save 4 input samples in the history buffer                                    \
+         */                                                                              \
+        vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc));                                     \
+        pStateCur += 4;                                                                  \
+        pTempSrc += 4;                                                                   \
+                                                                                         \
+        FIR_Q31_CORE(4, NBVECTAPS, pSamples, vecCoeffs);                                 \
+                                                                                         \
+        pSamples += 4;                                                                   \
+        /*                                                                               \
+         * Decrement the sample block loop counter                                       \
+         */                                                                              \
+        blkCnt--;                                                                        \
+    }                                                                                    \
+                                                                                         \
+    /* tail: the (blockSize & 3) samples left over by the 4-per-pass loop */             \
+    int32_t        residual = blockSize & 3;                                             \
+    switch (residual) {                                                                  \
+      case 3:                                                                            \
+          {                                                                              \
+              for (int i = 0; i < residual; i++)                                         \
+                  *pStateCur++ = *pTempSrc++;                                            \
+                                                                                         \
+              FIR_Q31_CORE(3, NBVECTAPS, pSamples, vecCoeffs);                           \
+          }                                                                              \
+          break;                                                                         \
+                                                                                         \
+      case 2:                                                                            \
+          {                                                                              \
+              for (int i = 0; i < residual; i++)                                         \
+                  *pStateCur++ = *pTempSrc++;                                            \
+                                                                                         \
+               FIR_Q31_CORE(2, NBVECTAPS, pSamples, vecCoeffs);                          \
+          }                                                                              \
+          break;                                                                         \
+                                                                                         \
+      case 1:                                                                            \
+          {                                                                              \
+              for (int i = 0; i < residual; i++)                                         \
+                  *pStateCur++ = *pTempSrc++;                                            \
+                                                                                         \
+              FIR_Q31_CORE(1, NBVECTAPS, pSamples, vecCoeffs);                           \
+          }                                                                              \
+          break;                                                                         \
+    }                                                                                    \
+                                                                                         \
+    /*                                                                                   \
+     * Copy the samples back into the history buffer start                               \
+     */                                                                                  \
+    pTempSrc = &pState[blockSize];                                                       \
+    pTempDest = pState;                                                                  \
+    /* (numTaps - 1) samples are moved: full 4-element vectors first, then a predicated copy of the rest */ \
+    blkCnt =(numTaps - 1) >> 2;                                                          \
+    while (blkCnt > 0)                                                                   \
+    {                                                                                    \
+        vstrwq_s32(pTempDest, vldrwq_s32(pTempSrc));                                     \
+        pTempSrc += 4;                                                                   \
+        pTempDest += 4;                                                                  \
+        blkCnt--;                                                                        \
+    }                                                                                    \
+    blkCnt = (numTaps - 1) & 3;                                                          \
+    if (blkCnt > 0)                                                                      \
+    {                                                                                    \
+        mve_pred16_t p0 = vctp32q(blkCnt);                                               \
+        vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p0), p0);                         \
+    }                                                                                    \
+}
+
+static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst, uint32_t blockSize)
 {
-    q31_t    *pState = S->pState;   /* State pointer */
+    q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
+    q31_t      *pState = pRefStatePtr; /* State pointer */
     const q31_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
     q31_t    *pStateCur;        /* Points to the current sample of the state */
     const q31_t    *pSamples;         /* Temporary pointer to the sample buffer */
@@ -74,6 +222,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
     uint32_t  blkCnt;
     q31x4_t vecIn0;
 
+
     /*
      * pState points to state array which contains previous frame (numTaps - 1) samples
      * pStateCur points to the location where the new input data should be written
@@ -83,7 +232,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
     pSamples = pState;
     pOutput = pDst;
 
-    q63_t     acc0, acc1, acc2, acc3;
+    q63_t     acc0=0, acc1=0, acc2=0, acc3=0;
     /*
      * load 4 coefs
      */
@@ -131,7 +280,6 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
     }
 
     uint32_t  residual = blockSize & 3;
-    
     switch (residual)
     {
     case 3:
@@ -139,7 +287,6 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
             /*
              * Save 4 input samples in the history buffer
              */
-
             *(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
             pStateCur += 4;
             pTempSrc += 4;
@@ -205,14 +352,13 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
         break;
     }
 
-
     /*
      * Copy the samples back into the history buffer start
      */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
 
-    blkCnt = numTaps >> 2;
+    blkCnt = (numTaps-1) >> 2;
     while (blkCnt > 0U)
     {
         vst1q(pTempDest, vld1q(pTempSrc));
@@ -220,7 +366,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
         pTempDest += 4;
         blkCnt--;
     }
-    blkCnt = numTaps & 3;
+    blkCnt = (numTaps-1) & 3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
@@ -228,9 +374,286 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
     }
 }
 
-static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pSrc, q31_t * pDst, uint32_t blockSize)
+
+
+static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst, uint32_t blockSize)
 {
-    q31_t    *pState = S->pState;   /* State pointer */
+    #define NBTAPS 8
+    #define NBVECTAPS (NBTAPS / 4)
+    FIR_Q31_MAIN_CORE();
+    #undef NBVECTAPS
+    #undef NBTAPS
+}
+
+
+static void arm_fir_q31_9_12_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst, uint32_t blockSize)
+{
+    #define NBTAPS 12
+    #define NBVECTAPS (NBTAPS / 4)
+    FIR_Q31_MAIN_CORE();
+    #undef NBVECTAPS
+    #undef NBTAPS
+}
+
+
+static void arm_fir_q31_13_16_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst, uint32_t blockSize)
+{
+    #define NBTAPS 16
+    #define NBVECTAPS (NBTAPS / 4)
+    FIR_Q31_MAIN_CORE();
+    #undef NBVECTAPS
+    #undef NBTAPS
+}
+
+
+static void arm_fir_q31_17_20_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst, uint32_t blockSize)
+{
+    #define NBTAPS 20
+    #define NBVECTAPS (NBTAPS / 4)
+    FIR_Q31_MAIN_CORE();
+    #undef NBVECTAPS
+    #undef NBTAPS
+}
+
+
+static void arm_fir_q31_21_24_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst, uint32_t blockSize)
+{
+    #define NBTAPS 24
+    #define NBVECTAPS (NBTAPS / 4)
+    FIR_Q31_MAIN_CORE();
+    #undef NBVECTAPS
+    #undef NBTAPS
+}
+
+
+static void arm_fir_q31_25_28_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst, uint32_t blockSize)
+{
+    #define NBTAPS 28
+    #define NBVECTAPS (NBTAPS / 4)
+    FIR_Q31_MAIN_CORE();
+    #undef NBVECTAPS
+    #undef NBTAPS
+}
+
+static void arm_fir_q31_29_32_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst,
+                               uint32_t blockSize)
+{
+    q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
+    q31_t      *pState = pRefStatePtr; /* State pointer */
+    const q31_t    *pCoeffs = S->pCoeffs;       /* Coefficient pointer */
+    q31_t          *pStateCur;  /* Points to the current sample of the state */
+    const q31_t    *pSamples;   /* Temporary pointer to the sample buffer */
+    q31_t          *pOutput;    /* Temporary pointer to the output buffer */
+    const q31_t    *pTempSrc;   /* Temporary pointer to the source data */
+    q31_t          *pTempDest;  /* Temporary pointer to the destination buffer */
+    uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
+    int32_t         blkCnt;
+    q63_t           acc0, acc1, acc2, acc3;
+
+#define MAX_VECT_BATCH 7
+
+    /*
+     * pre-load the first 28 coefs (7 vectors of 4)
+     */
+    q31x4_t         vecCoeffs0 = vld1q(pCoeffs + 4 * 0);
+    q31x4_t         vecCoeffs1 = vld1q(pCoeffs + 4 * 1);
+    q31x4_t         vecCoeffs2 = vld1q(pCoeffs + 4 * 2);
+    q31x4_t         vecCoeffs3 = vld1q(pCoeffs + 4 * 3);
+    q31x4_t         vecCoeffs4 = vld1q(pCoeffs + 4 * 4);
+    q31x4_t         vecCoeffs5 = vld1q(pCoeffs + 4 * 5);
+    q31x4_t         vecCoeffs6 = vld1q(pCoeffs + 4 * 6);
+
+    /*
+     * pState points to state array which contains previous frame (numTaps - 1) samples
+     * pStateCur points to the location where the new input data should be written
+     */
+    pStateCur = &(pState[(numTaps - 1u)]);
+    pTempSrc = pSrc;
+    pSamples = pState;
+
+    q63_t          *arm_fir_partial_accu_ptr = (q63_t*)S->pState;
+
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0) {
+        /*
+         * Save 4 input samples in the history buffer
+         */
+        vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc));
+        pStateCur += 4;
+        pTempSrc += 4;
+
+        const q31_t    *pSmp;
+        q31x4_t         vecIn0;
+
+        pSmp = &pSamples[0];
+
+        vecIn0 = vld1q(pSmp);
+        acc0 = vrmlaldavhq(vecIn0, vecCoeffs0);
+        vecIn0 = vld1q(pSmp + 4 * 1);
+        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs1);
+        vecIn0 = vld1q(pSmp + 4 * 2);
+        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs2);
+        vecIn0 = vld1q(pSmp + 4 * 3);
+        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs3);
+        vecIn0 = vld1q(pSmp + 4 * 4);
+        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs4);
+        vecIn0 = vld1q(pSmp + 4 * 5);
+        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5);
+        vecIn0 = vld1q(pSmp + 4 * 6);
+        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs6);
+
+        *arm_fir_partial_accu_ptr++ = acc0;
+
+        pSmp = &pSamples[1];
+
+        vecIn0 = vld1q(pSmp);
+        acc1 = vrmlaldavhq(vecIn0, vecCoeffs0);
+        vecIn0 = vld1q(pSmp + 4 * 1);
+        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs1);
+        vecIn0 = vld1q(pSmp + 4 * 2);
+        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs2);
+        vecIn0 = vld1q(pSmp + 4 * 3);
+        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs3);
+        vecIn0 = vld1q(pSmp + 4 * 4);
+        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs4);
+        vecIn0 = vld1q(pSmp + 4 * 5);
+        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5);
+        vecIn0 = vld1q(pSmp + 4 * 6);
+        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs6);
+
+        *arm_fir_partial_accu_ptr++ = acc1;
+
+        pSmp = &pSamples[2];
+
+        vecIn0 = vld1q(pSmp);
+        acc2 = vrmlaldavhq(vecIn0, vecCoeffs0);
+        vecIn0 = vld1q(pSmp + 4 * 1);
+        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs1);
+        vecIn0 = vld1q(pSmp + 4 * 2);
+        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs2);
+        vecIn0 = vld1q(pSmp + 4 * 3);
+        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs3);
+        vecIn0 = vld1q(pSmp + 4 * 4);
+        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs4);
+        vecIn0 = vld1q(pSmp + 4 * 5);
+        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5);
+        vecIn0 = vld1q(pSmp + 4 * 6);
+        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs6);
+        *arm_fir_partial_accu_ptr++ = acc2;
+
+        pSmp = &pSamples[3];
+
+        vecIn0 = vld1q(pSmp);
+        acc3 = vrmlaldavhq(vecIn0, vecCoeffs0);
+        vecIn0 = vld1q(pSmp + 4 * 1);
+        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs1);
+        vecIn0 = vld1q(pSmp + 4 * 2);
+        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs2);
+        vecIn0 = vld1q(pSmp + 4 * 3);
+        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs3);
+        vecIn0 = vld1q(pSmp + 4 * 4);
+        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs4);
+        vecIn0 = vld1q(pSmp + 4 * 5);
+        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs5);
+        vecIn0 = vld1q(pSmp + 4 * 6);
+        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs6);
+
+        *arm_fir_partial_accu_ptr++ = acc3;
+
+        pSamples += 4;
+        /*
+         * Decrement the sample block loop counter
+         */
+        blkCnt--;
+    }
+
+
+    /* remainder: add the contribution of the last 4 coefs to the partial sums */
+
+    /* load last 4 coefs */
+    vecCoeffs0 = vld1q(pCoeffs + 4 * MAX_VECT_BATCH);
+    arm_fir_partial_accu_ptr = (q63_t*)S->pState;
+    pOutput = pDst;
+    pSamples = pState + (MAX_VECT_BATCH * 4);
+
+
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0) {
+        q31x4_t         vecIn0;
+
+        /* reload intermediate MAC */
+        acc0 = *arm_fir_partial_accu_ptr++;
+        acc1 = *arm_fir_partial_accu_ptr++;
+        acc2 = *arm_fir_partial_accu_ptr++;
+        acc3 = *arm_fir_partial_accu_ptr++;
+
+
+        vecIn0 = vld1q(&pSamples[0]);
+        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs0);
+
+        vecIn0 = vld1q(&pSamples[1]);
+        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs0);
+
+        vecIn0 = vld1q(&pSamples[2]);
+        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs0);
+
+        vecIn0 = vld1q(&pSamples[3]);
+        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs0);
+
+        *pOutput++ = asrl(acc0, 23);
+        *pOutput++ = asrl(acc1, 23);
+        *pOutput++ = asrl(acc2, 23);
+        *pOutput++ = asrl(acc3, 23);
+
+        pSamples += 4;
+        /*
+         * Decrement the sample block loop counter
+         */
+        blkCnt--;
+    }
+
+    /*
+     * Copy the samples back into the history buffer start
+     */
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
+
+    blkCnt = numTaps - 1;
+    do {
+        mve_pred16_t    p = vctp32q(blkCnt);
+
+        vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p), p);
+        pTempSrc += 4;
+        pTempDest += 4;
+        blkCnt -= 4;
+    }
+    while (blkCnt > 0);
+}
+
+
+
+void arm_fir_q31(
+  const arm_fir_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+    q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
+    q31_t      *pState = pRefStatePtr; /* State pointer */
     const q31_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
     q31_t    *pStateCur;        /* Points to the current sample of the state */
     const q31_t    *pSamples;         /* Temporary pointer to the sample buffer */
@@ -240,60 +663,110 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
     uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
     uint32_t  blkCnt;
     q31x4_t vecIn0;
+    uint32_t  tapsBlkCnt = (numTaps + 3) / 4;
     q63_t     acc0, acc1, acc2, acc3;
-    q31x4_t vecCoeffs1_4, vecCoeffs5_8;
+    q31x4_t vecCoeffs;
+
 
     /*
-     * pState points to state array which contains previous frame (numTaps - 1) samples
-     * pStateCur points to the location where the new input data should be written
+     * [1 to 32 taps] specialized routines
      */
-    pStateCur = &(pState[(numTaps - 1u)]);
-    pTempSrc = pSrc;
-    pSamples = pState;
-    pOutput = pDst;
-
+    if (numTaps <= 4)
+    {
+        arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 8)
+    {
+        arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 12)
+    {
+        arm_fir_q31_9_12_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 16)
+    {
+        arm_fir_q31_13_16_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 20)
+    {
+        arm_fir_q31_17_20_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 24)
+    {
+        arm_fir_q31_21_24_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 28)
+    {
+        arm_fir_q31_25_28_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if ((numTaps <= 32)  && (blockSize >= 32))
+    {
+        arm_fir_q31_29_32_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
 
     /*
-     * load 8 coefs
+     * pState points to state array which contains previous frame (numTaps - 1) samples
+     * pStateCur points to the location where the new input data should be written
      */
-    vecCoeffs1_4 = *(q31x4_t *) pCoeffs;
-    vecCoeffs5_8 = *(q31x4_t *) (pCoeffs + 4);
-
-    blkCnt = blockSize >> 2;
-    while (blkCnt > 0U)
+    pStateCur   = &(pState[(numTaps - 1u)]);
+    pSamples    = pState;
+    pTempSrc    = pSrc;
+    pOutput     = pDst;
+    blkCnt      = blockSize >> 2;
+    while (blkCnt > 0)
     {
+        const q31_t    *pCoeffsTmp = pCoeffs;
         const q31_t    *pSamplesTmp = pSamples;
 
+        acc0 = 0LL;
+        acc1 = 0LL;
+        acc2 = 0LL;
+        acc3 = 0LL;
+
         /*
          * Save 4 input samples in the history buffer
          */
         vst1q(pStateCur, vld1q(pTempSrc));
+        pStateCur += 4;
+        pTempSrc += 4;
 
-        vecIn0 = vld1q(pSamplesTmp);
-        acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
-        vecIn0 = vld1q(&pSamplesTmp[1]);
-        acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
-        vecIn0 = vld1q(&pSamplesTmp[2]);
-        acc2 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
-        vecIn0 = vld1q(&pSamplesTmp[3]);
-        acc3 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+        int       i = tapsBlkCnt;
+        while (i > 0)
+        {
+            /*
+             * load 4 coefs
+             */
+            vecCoeffs = *(q31x4_t *) pCoeffsTmp;
 
-        vecIn0 = vld1q(&pSamplesTmp[4]);
-        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
+            vecIn0 = vld1q(pSamplesTmp);
+            acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
 
-        vecIn0 = vld1q(&pSamplesTmp[5]);
-        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
+            vecIn0 = vld1q(&pSamplesTmp[1]);
+            acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
 
-        vecIn0 = vld1q(&pSamplesTmp[6]);
-        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5_8);
+            vecIn0 = vld1q(&pSamplesTmp[2]);
+            acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
 
-        vecIn0 = vld1q(&pSamplesTmp[7]);
-        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs5_8);
+            vecIn0 = vld1q(&pSamplesTmp[3]);
+            acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs);
 
+            pSamplesTmp += 4;
+            pCoeffsTmp += 4;
+            /*
+             * Decrement the taps block loop counter
+             */
+            i--;
+        }
 
+        /* .54-> .31 conversion and store accumulators */
         acc0 = asrl(acc0, 23);
         acc1 = asrl(acc1, 23);
         acc2 = asrl(acc2, 23);
@@ -305,8 +778,6 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
         *pOutput++ = (q31_t) acc3;
 
         pSamples += 4;
-        pStateCur += 4;
-        pTempSrc += 4;
 
         /*
          * Decrement the sample block loop counter
@@ -314,11 +785,18 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
         blkCnt--;
     }
 
-    uint32_t  residual = blockSize & 3;
+    int32_t  residual = blockSize & 3;
     switch (residual)
     {
     case 3:
         {
+            const q31_t    *pCoeffsTmp = pCoeffs;
+            const q31_t    *pSamplesTmp = pSamples;
+
+            acc0 = 0LL;
+            acc1 = 0LL;
+            acc2 = 0LL;
+
             /*
              * Save 4 input samples in the history buffer
              */
@@ -326,23 +804,24 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
             pStateCur += 4;
             pTempSrc += 4;
 
-            vecIn0 = vld1q(pSamples);
-            acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
-            vecIn0 = vld1q(&pSamples[1]);
-            acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+            int       i = tapsBlkCnt;
+            while (i > 0)
+            {
+                vecCoeffs = *(q31x4_t *) pCoeffsTmp;
 
-            vecIn0 = vld1q(&pSamples[2]);
-            acc2 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+                vecIn0 = vld1q(pSamplesTmp);
+                acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
 
-            vecIn0 = vld1q(&pSamples[4]);
-            acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
+                vecIn0 = vld1q(&pSamplesTmp[1]);
+                acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
 
-            vecIn0 = vld1q(&pSamples[5]);
-            acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
+                vecIn0 = vld1q(&pSamplesTmp[2]);
+                acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
 
-            vecIn0 = vld1q(&pSamples[6]);
-            acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5_8);
+                pSamplesTmp += 4;
+                pCoeffsTmp += 4;
+                i--;
+            }
 
             acc0 = asrl(acc0, 23);
             acc1 = asrl(acc1, 23);
@@ -356,6 +835,12 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
 
     case 2:
         {
+            const q31_t    *pCoeffsTmp = pCoeffs;
+            const q31_t    *pSamplesTmp = pSamples;
+
+            acc0 = 0LL;
+            acc1 = 0LL;
+
             /*
              * Save 4 input samples in the history buffer
              */
@@ -363,17 +848,21 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
             pStateCur += 4;
             pTempSrc += 4;
 
-            vecIn0 = vld1q(pSamples);
-            acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+            int       i = tapsBlkCnt;
+            while (i > 0)
+            {
+                vecCoeffs = *(q31x4_t *) pCoeffsTmp;
 
-            vecIn0 = vld1q(&pSamples[1]);
-            acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+                vecIn0 = vld1q(pSamplesTmp);
+                acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
 
-            vecIn0 = vld1q(&pSamples[4]);
-            acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
+                vecIn0 = vld1q(&pSamplesTmp[1]);
+                acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
 
-            vecIn0 = vld1q(&pSamples[5]);
-            acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
+                pSamplesTmp += 4;
+                pCoeffsTmp += 4;
+                i--;
+            }
 
             acc0 = asrl(acc0, 23);
             acc1 = asrl(acc1, 23);
@@ -384,431 +873,55 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
         break;
 
     case 1:
-        {
-            /*
-             * Save 4 input samples in the history buffer
-             */
-            vst1q(pStateCur, vld1q(pTempSrc));
-            pStateCur += 4;
-            pTempSrc += 4;
-
-            vecIn0 = vld1q(pSamples);
-            acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
-            vecIn0 = vld1q(&pSamples[4]);
-            acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
-
-            acc0 = asrl(acc0, 23);
-
-            *pOutput++ = (q31_t) acc0;
-        }
-        break;
-    }
-
-    /*
-     * Copy the samples back into the history buffer start
-     */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
-
-    blkCnt = numTaps >> 2;
-    while (blkCnt > 0U)
-    {
-        vst1q(pTempDest, vld1q(pTempSrc));
-        pTempSrc += 4;
-        pTempDest += 4;
-        blkCnt--;
-    }
-    blkCnt = numTaps & 3;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp32q(blkCnt);
-        vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
-    }
-}
-
-void arm_fir_q31(
-  const arm_fir_instance_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize)
-{
-    q31_t    *pState = S->pState;   /* State pointer */
-    const q31_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
-    q31_t    *pStateCur;        /* Points to the current sample of the state */
-    const q31_t    *pSamples;         /* Temporary pointer to the sample buffer */
-    q31_t    *pOutput;          /* Temporary pointer to the output buffer */
-    const q31_t    *pTempSrc;         /* Temporary pointer to the source data */
-    q31_t    *pTempDest;        /* Temporary pointer to the destination buffer */
-    uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
-    uint32_t  blkCnt;
-    q31x4_t vecIn0;
-    uint32_t  tapsBlkCnt = (numTaps + 3) / 4;
-    q63_t     acc0, acc1, acc2, acc3;
-    q31x4_t vecCoeffs;
-
-    /*
-     * [1 to 8 taps] specialized routines
-     */
-    
-    if (blockSize >= 8)
-    {
-        if (numTaps <= 4)
-        {
-            arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize);
-            return;
-        }
-        else if (numTaps <= 8)
-        {
-            arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize);
-            return;
-        }
-    }
-
-
-    /*
-     * pState points to state array which contains previous frame (numTaps - 1) samples
-     * pStateCur points to the location where the new input data should be written
-     */
-    if (blockSize >= 8)
-    {
-        pStateCur   = &(pState[(numTaps - 1u)]);
-        pSamples    = pState;
-        pTempSrc    = pSrc;
-        pOutput     = pDst;
-        blkCnt      = blockSize >> 2;
-        while (blkCnt > 0U)
         {
             const q31_t    *pCoeffsTmp = pCoeffs;
             const q31_t    *pSamplesTmp = pSamples;
-    
+
             acc0 = 0LL;
-            acc1 = 0LL;
-            acc2 = 0LL;
-            acc3 = 0LL;
-    
+
             /*
              * Save 4 input samples in the history buffer
              */
             vst1q(pStateCur, vld1q(pTempSrc));
             pStateCur += 4;
             pTempSrc += 4;
-    
-            tapsBlkCnt = (numTaps ) / 4;
-            uint32_t       i = tapsBlkCnt ;
-            while (i > 0U)
+
+            int       i = tapsBlkCnt;
+            while (i > 0)
             {
-                /*
-                 * load 4 coefs
-                 */
                 vecCoeffs = *(q31x4_t *) pCoeffsTmp;
-    
+
                 vecIn0 = vld1q(pSamplesTmp);
                 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
-    
-                vecIn0 = vld1q(&pSamplesTmp[1]);
-                acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
-    
-                vecIn0 = vld1q(&pSamplesTmp[2]);
-                acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
-    
-                vecIn0 = vld1q(&pSamplesTmp[3]);
-                acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs);
-    
+
                 pSamplesTmp += 4;
                 pCoeffsTmp += 4;
-                /*
-                 * Decrement the taps block loop counter
-                 */
                 i--;
             }
 
-            tapsBlkCnt = (numTaps ) & 3;
-            i = tapsBlkCnt ;
-            while (i > 0U)
-            {
-                /*
-                 * load 4 coefs
-                 */
-
-                /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
-                acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
-                acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
-                acc2 += ((q63_t) pSamplesTmp[2] * *pCoeffsTmp) >> 8;
-                acc3 += ((q63_t) pSamplesTmp[3] * *pCoeffsTmp) >> 8;
-
-    
-                pSamplesTmp += 1;
-                pCoeffsTmp += 1;
-                /*
-                 * Decrement the taps block loop counter
-                 */
-                i--;
-            }
-    
-            /* .54-> .31 conversion and store accumulators */
             acc0 = asrl(acc0, 23);
-            acc1 = asrl(acc1, 23);
-            acc2 = asrl(acc2, 23);
-            acc3 = asrl(acc3, 23);
-    
-            *pOutput++ = (q31_t) acc0;
-            *pOutput++ = (q31_t) acc1;
-            *pOutput++ = (q31_t) acc2;
-            *pOutput++ = (q31_t) acc3;
-    
-            pSamples += 4;
 
-            
-            /*
-             * Decrement the sample block loop counter
-             */
-            blkCnt--;
-        }
-    
-        uint32_t  residual = blockSize & 3;
-        switch (residual)
-        {
-        case 3:
-            {
-                const q31_t    *pCoeffsTmp = pCoeffs;
-                const q31_t    *pSamplesTmp = pSamples;
-    
-                acc0 = 0LL;
-                acc1 = 0LL;
-                acc2 = 0LL;
-    
-                /*
-                 * Save 4 input samples in the history buffer
-                 */
-              
-                *(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
-                pStateCur += 4;
-                pTempSrc += 4;
-    
-                tapsBlkCnt = numTaps  / 4;
-                uint32_t       i = tapsBlkCnt;
-                while (i > 0U)
-                {
-                    vecCoeffs = *(q31x4_t *) pCoeffsTmp;
-
-                    vecIn0 = vld1q(pSamplesTmp);
-                    acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
-    
-                    vecIn0 = vld1q(&pSamplesTmp[1]);
-                    acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
-    
-                    vecIn0 = vld1q(&pSamplesTmp[2]);
-                    acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
-    
-                    pSamplesTmp += 4;
-                    pCoeffsTmp += 4;
-                    i--;
-                }
-
-                tapsBlkCnt = (numTaps ) & 3;
-                
-                i = tapsBlkCnt ;
-                while (i > 0U)
-                {
-                   
-                    /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
-                    acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
-                    acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
-                    acc2 += ((q63_t) pSamplesTmp[2] * *pCoeffsTmp) >> 8;
-    
-                    pSamplesTmp += 1;
-                    pCoeffsTmp += 1;
-                    /*
-                     * Decrement the taps block loop counter
-                     */
-                    i--;
-                }
-    
-    
-                acc0 = asrl(acc0, 23);
-                acc1 = asrl(acc1, 23);
-                acc2 = asrl(acc2, 23);
-    
-                *pOutput++ = (q31_t) acc0;
-                *pOutput++ = (q31_t) acc1;
-                *pOutput++ = (q31_t) acc2;
-            }
-            break;
-    
-        case 2:
-            {
-                const q31_t    *pCoeffsTmp = pCoeffs;
-                const q31_t    *pSamplesTmp = pSamples;
-    
-                acc0 = 0LL;
-                acc1 = 0LL;
-    
-                /*
-                 * Save 4 input samples in the history buffer
-                 */
-                vst1q(pStateCur, vld1q(pTempSrc));
-                pStateCur += 4;
-                pTempSrc += 4;
-    
-                tapsBlkCnt = (numTaps ) / 4;
-                uint32_t       i = tapsBlkCnt;
-                while (i > 0U)
-                {
-                    vecCoeffs = *(q31x4_t *) pCoeffsTmp;
-    
-                    vecIn0 = vld1q(pSamplesTmp);
-                    acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
-    
-                    vecIn0 = vld1q(&pSamplesTmp[1]);
-                    acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
-    
-                    pSamplesTmp += 4;
-                    pCoeffsTmp += 4;
-                    i--;
-                }
-
-                tapsBlkCnt = (numTaps ) & 3;
-                i = tapsBlkCnt ;
-                while (i > 0U)
-                {
-                   
-
-                    /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
-                    acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
-                    acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
-    
-                    pSamplesTmp += 1;
-                    pCoeffsTmp += 1;
-                    /*
-                     * Decrement the taps block loop counter
-                     */
-                    i--;
-                }
-    
-                acc0 = asrl(acc0, 23);
-                acc1 = asrl(acc1, 23);
-    
-                *pOutput++ = (q31_t) acc0;
-                *pOutput++ = (q31_t) acc1;
-            }
-            break;
-    
-        case 1:
-            {
-                const q31_t    *pCoeffsTmp = pCoeffs;
-                const q31_t    *pSamplesTmp = pSamples;
-    
-                acc0 = 0LL;
-    
-                /*
-                 * Save 4 input samples in the history buffer
-                 */
-                vst1q(pStateCur, vld1q(pTempSrc));
-                pStateCur += 4;
-                pTempSrc += 4;
-    
-                tapsBlkCnt = (numTaps ) / 4;
-                uint32_t       i = tapsBlkCnt;
-                while (i > 0U)
-                {
-                    vecCoeffs = *(q31x4_t *) pCoeffsTmp;
-    
-                    vecIn0 = vld1q(pSamplesTmp);
-                    acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
-    
-                    pSamplesTmp += 4;
-                    pCoeffsTmp += 4;
-                    i--;
-                }
-
-                tapsBlkCnt = (numTaps ) & 3;
-                i = tapsBlkCnt ;
-                while (i > 0U)
-                {
-                   
-
-                    /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
-                    acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
-    
-                    pSamplesTmp += 1;
-                    pCoeffsTmp += 1;
-                    /*
-                     * Decrement the taps block loop counter
-                     */
-                    i--;
-                }
-    
-                acc0 = asrl(acc0, 23);
-    
-                *pOutput++ = (q31_t) acc0;
-            }
-            break;
-        }
-    }
-    else
-    {
-         
-                q31_t *pStateCurnt;                            /* Points to the current sample of the state */
-                q31_t *px;                                     /* Temporary pointer for state buffer */
-          const q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
-                q63_t acc0;                                    /* Accumulator */
-                uint32_t i, blkCnt;                    /* Loop counters */
-          pStateCurnt = &(S->pState[(numTaps - 1U)]);
-          blkCnt = blockSize;
-        
-          while (blkCnt > 0U)
-          {
-            /* Copy one sample at a time into state buffer */
-            *pStateCurnt++ = *pSrc++;
-        
-            /* Set the accumulator to zero */
-            acc0 = 0;
-        
-            /* Initialize state pointer */
-            px = pState;
-        
-            /* Initialize Coefficient pointer */
-            pb = pCoeffs;
-        
-            i = numTaps;
-        
-            /* Perform the multiply-accumulates */
-            do
-            {
-              /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
-              acc0 += (q63_t) *px++ * *pb++;
-        
-              i--;
-            } while (i > 0U);
-        
-            /* Result is in 2.62 format. Convert to 1.31 and store in destination buffer. */
-            *pDst++ = (q31_t) (acc0 >> 31U);
-        
-            /* Advance state pointer by 1 for the next sample */
-            pState = pState + 1U;
-        
-            /* Decrement loop counter */
-            blkCnt--;
+            *pOutput++ = (q31_t) acc0;
         }
+        break;
     }
 
     /*
      * Copy the samples back into the history buffer start
      */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
 
-    blkCnt = numTaps >> 2;
-    while (blkCnt > 0U)
+    blkCnt = (numTaps - 1U) >> 2;
+    while (blkCnt > 0)
     {
         vst1q(pTempDest, vld1q(pTempSrc));
         pTempSrc += 4;
         pTempDest += 4;
         blkCnt--;
     }
-    blkCnt = numTaps & 3;
-    if (blkCnt > 0U)
+    blkCnt = (numTaps - 1U) & 3;
+    if (blkCnt > 0)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q7.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q7.c
index 219ea0f2f2cafa40e6b98e2d470c644b50bf64fb..c05fa32146685a1d9b62b06b013d5514a8706b80 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q7.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_q7.c
  * Description:  Q7 FIR filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
@@ -54,9 +54,144 @@
                    Finally, the result is truncated to 1.7 format.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define FIR_Q7_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs)         \
+        for (int outIdx = 0; outIdx < (nbAcc); outIdx++) {                 \
+            /* one q7 output per pass; vecIn0 is the caller's scratch */   \
+            const q7_t     *pWindow = (pSample) + outIdx;                  \
+            q31_t           sum = 0;                                       \
+                                                                           \
+            for (int blk = 0; blk < (nbVecTaps); blk++) {                  \
+                vecIn0 = vld1q(pWindow + 16 * blk);                        \
+                sum = vmladavaq(sum, vecIn0, (vecCoeffs)[blk]);            \
+            }                                                              \
+            *pOutput++ = (q7_t) __SSAT((sum >> 7U), 8);                    \
+        }
+
+#define FIR_Q7_MAIN_CORE()                                                                  \
+{   /* Shared kernel body; reads S, pSrc, pDst, blockSize and NBVECTAPS from the caller */  \
+     q7_t          *pState = S->pState;     /* State pointer */                             \
+    const q7_t    *pCoeffs = S->pCoeffs;   /* Coefficient pointer */                        \
+    q7_t          *pStateCur;              /* Points to the current sample of the state */  \
+    const q7_t    *pSamples;               /* Temporary pointer to the sample buffer */     \
+    q7_t          *pOutput;                /* Temporary pointer to the output buffer */     \
+    const q7_t    *pTempSrc;               /* Temporary pointer to the source data */       \
+    q7_t          *pTempDest;              /* Temporary pointer to the destination buffer */\
+    uint32_t       numTaps = S->numTaps;   /* Number of filter coefficients in the filter */\
+    int32_t        blkCnt;                                                                  \
+    q7x16_t        vecIn0;                                                                  \
+                                                                                            \
+    /*                                                                                      \
+     * load coefs: 16 * NBVECTAPS values are read, whatever numTaps is                      \
+     */                                                                                     \
+    q7x16_t         vecCoeffs[NBVECTAPS];                                                   \
+                                                                                            \
+    for (int i = 0; i < NBVECTAPS; i++)                                                     \
+        vecCoeffs[i] = vldrbq_s8(pCoeffs + 16 * i);                               \
+                                                                                            \
+    /*                                                                                      \
+     * pState points to state array which contains previous frame (numTaps - 1) samples     \
+     * pStateCur points to the location where the new input data should be written          \
+     */                                                                                     \
+    pStateCur = &(pState[(numTaps - 1u)]);                                                  \
+    pTempSrc = pSrc;                                                                        \
+    pSamples = pState;                                                                      \
+    pOutput = pDst;                                                                         \
+                                                                                            \
+    blkCnt = blockSize >> 2;                                                                \
+    while (blkCnt > 0) {                                                                   \
+        /*                                                                                  \
+         * Save 4 input samples in the history buffer                                       \
+         */                                                                                 \
+        vstrbq_s32(pStateCur, vldrbq_s32(pTempSrc));                                        \
+        pStateCur += 4;                                                                     \
+        pTempSrc += 4;                                                                      \
+                                                                                            \
+        FIR_Q7_CORE(pOutput, 4, NBVECTAPS, pSamples, vecCoeffs);                            \
+        pSamples += 4;                                                                      \
+                                                                                            \
+        blkCnt--;                                                                           \
+    }                                                                                       \
+                                                                                            \
+    /* tail: the remaining (blockSize & 3) samples, copied and filtered one at a time */    \
+    int32_t        residual = blockSize & 3;                                               \
+                                                                                            \
+    for (int i = 0; i < residual; i++)                                                      \
+        *pStateCur++ = *pTempSrc++;                                                         \
+                                                                                            \
+    FIR_Q7_CORE(pOutput, residual, NBVECTAPS, pSamples, vecCoeffs);                         \
+                                                                                            \
+                                                                                            \
+    /*                                                                                      \
+     * Copy the samples back into the history buffer start                                  \
+     */                                                                                     \
+    pTempSrc = &pState[blockSize];                                                          \
+    pTempDest = pState;                                                                     \
+    blkCnt = numTaps - 1;   /* number of history samples to keep */                         \
+    do {                                                                                    \
+        mve_pred16_t    p = vctp8q(blkCnt);  /* first min(blkCnt, 16) lanes active */       \
+                                                                                            \
+        vstrbq_p_s8(pTempDest, vldrbq_z_s8(pTempSrc, p), p);                                \
+        pTempSrc += 16;                                                           \
+        pTempDest += 16;                                                          \
+        blkCnt -= 16;                                                             \
+    }                                                                                       \
+    while (blkCnt > 0);                                                                     \
+}
+
+
+static void arm_fir_q7_49_64_mve(const arm_fir_instance_q7 * S,
+  const q7_t * __restrict pSrc,
+  q7_t * __restrict pDst, uint32_t blockSize)
+{   /* Helium kernel selected by arm_fir_q7 for 49..64 taps */
+    #define NBTAPS 64                 /* largest tap count this kernel covers */
+    #define NBVECTAPS (NBTAPS / 16)   /* number of 16-coefficient vectors */
+    FIR_Q7_MAIN_CORE();               /* shared body, specialised through NBVECTAPS */
+    #undef NBVECTAPS
+    #undef NBTAPS
+}
+
+
+static void arm_fir_q7_33_48_mve(const arm_fir_instance_q7 * S,
+  const q7_t * __restrict pSrc,
+  q7_t * __restrict pDst, uint32_t blockSize)
+{   /* Helium kernel selected by arm_fir_q7 for 33..48 taps; file-local like its siblings */
+    #define NBTAPS 48                 /* largest tap count this kernel covers */
+    #define NBVECTAPS (NBTAPS / 16)   /* number of 16-coefficient vectors */
+    FIR_Q7_MAIN_CORE();               /* shared body, specialised through NBVECTAPS */
+    #undef NBVECTAPS
+    #undef NBTAPS
+}
+
+static void arm_fir_q7_17_32_mve(const arm_fir_instance_q7 * S,
+  const q7_t * __restrict pSrc,
+  q7_t * __restrict pDst, uint32_t blockSize)
+{   /* Helium kernel selected by arm_fir_q7 for 17..32 taps */
+    #define NBTAPS 32                 /* largest tap count this kernel covers */
+    #define NBVECTAPS (NBTAPS / 16)   /* number of 16-coefficient vectors */
+    FIR_Q7_MAIN_CORE();               /* shared body, specialised through NBVECTAPS */
+    #undef NBVECTAPS
+    #undef NBTAPS
+}
+
 
-void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t * pDst, uint32_t blockSize)
+void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S,
+  const q7_t * __restrict pSrc,
+  q7_t * __restrict pDst, uint32_t blockSize)
+{   /* Helium kernel selected by arm_fir_q7 for 1..16 taps */
+    #define NBTAPS 16                 /* largest tap count this kernel covers */
+    #define NBVECTAPS (NBTAPS / 16)   /* number of 16-coefficient vectors */
+    FIR_Q7_MAIN_CORE();               /* shared body, specialised through NBVECTAPS */
+    #undef NBVECTAPS
+    #undef NBTAPS
+}
+
+void arm_fir_q7(
+  const arm_fir_instance_q7 * S,
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
 {
     q7_t     *pState = S->pState;   /* State pointer */
     const q7_t     *pCoeffs = S->pCoeffs; /* Coefficient pointer */
@@ -68,9 +203,43 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
     uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
     uint32_t  blkCnt;
     q7x16_t  vecIn0;
+    uint32_t  tapsBlkCnt = (numTaps + 15) / 16;
     q31_t     acc0, acc1, acc2, acc3;
     q7x16_t  vecCoeffs;
 
+    if (numTaps <= 16)
+    {
+        /*
+         * [1 to 16 taps] specialized routine
+         */
+        arm_fir_q7_1_16_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 32)
+    {
+        /*
+         * [17 to 32 taps] specialized routine
+         */
+        arm_fir_q7_17_32_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 48)
+    {
+        /*
+         * [33 to 48 taps] specialized routine
+         */
+        arm_fir_q7_33_48_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 64)
+    {
+        /*
+         * [49 to 64 taps] specialized routine
+         */
+        arm_fir_q7_49_64_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+
     /*
      * pState points to state array which contains previous frame (numTaps - 1) samples
      * pStateCur points to the location where the new input data should be written
@@ -82,12 +251,17 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
     blkCnt      = blockSize >> 2;
 
     /*
-     * load 16 coefs
+     * outer samples loop
      */
-    vecCoeffs = *(q7x16_t *) pCoeffs;
-
     while (blkCnt > 0U)
     {
+        const q7_t     *pCoeffsTmp = pCoeffs;
+        const q7_t     *pSamplesTmp = pSamples;
+
+        acc0 = 0;
+        acc1 = 0;
+        acc2 = 0;
+        acc3 = 0;
         /*
          * Save 16 input samples in the history buffer
          */
@@ -95,18 +269,36 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
         pStateCur += 16;
         pTempSrc += 16;
 
-        vecIn0 = vld1q(pSamples);
-        acc0 = vmladavq(vecIn0, vecCoeffs);
+        /*
+         * inner coefficients loop
+         */
+        int       i = tapsBlkCnt;
+        while (i > 0)
+        {
+            /*
+             * load 16 coefs
+             */
+            vecCoeffs = *(q7x16_t *) pCoeffsTmp;
+
+            vecIn0 = vld1q(pSamplesTmp);
+            acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
 
-        vecIn0 = vld1q(&pSamples[1]);;
-        acc1 = vmladavq(vecIn0, vecCoeffs);
+            vecIn0 = vld1q(&pSamplesTmp[1]);
+            acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
 
-        vecIn0 = vld1q(&pSamples[2]);;
-        acc2 = vmladavq(vecIn0, vecCoeffs);
+            vecIn0 = vld1q(&pSamplesTmp[2]);
+            acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);
 
-        vecIn0 = vld1q(&pSamples[3]);
-        acc3 = vmladavq(vecIn0, vecCoeffs);
+            vecIn0 = vld1q(&pSamplesTmp[3]);
+            acc3 = vmladavaq(acc3, vecIn0, vecCoeffs);
 
+            pSamplesTmp += 16;
+            pCoeffsTmp += 16;
+            /*
+             * Decrement the taps block loop counter
+             */
+            i--;
+        }
         /*
          * Store the 1.7 format filter output in destination buffer
          */
@@ -127,18 +319,37 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
     {
     case 3:
         {
+            const q7_t     *pCoeffsTmp = pCoeffs;
+            const q7_t     *pSamplesTmp = pSamples;
+
+            acc0 = 0;
+            acc1 = 0;
+            acc2 = 0;
+            /*
+             * Save 16 input samples in the history buffer
+             */
             vst1q(pStateCur, vld1q(pTempSrc));
             pStateCur += 16;
             pTempSrc += 16;
 
-            vecIn0 = vld1q(pSamples);
-            acc0 = vmladavq(vecIn0, vecCoeffs);
+            int       i = tapsBlkCnt;
+            while (i > 0)
+            {
+                vecCoeffs = *(q7x16_t *) pCoeffsTmp;
+                /* acc<k> is output sample k: its window starts at pSamplesTmp + k */
+                vecIn0 = vld1q(pSamplesTmp);
+                acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
 
-            vecIn0 = vld1q(&pSamples[1]);
-            acc1 = vmladavq(vecIn0, vecCoeffs);
+                vecIn0 = vld1q(&pSamplesTmp[1]);
+                acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
 
-            vecIn0 = vld1q(&pSamples[2]);
-            acc2 = vmladavq(vecIn0, vecCoeffs);
+                vecIn0 = vld1q(&pSamplesTmp[2]);
+                acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);
+
+                pSamplesTmp += 16;
+                pCoeffsTmp += 16;
+                i--;
+            }
 
             *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
             *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
@@ -148,15 +359,33 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
 
     case 2:
         {
+            const q7_t     *pCoeffsTmp = pCoeffs;
+            const q7_t     *pSamplesTmp = pSamples;
+
+            acc0 = 0;
+            acc1 = 0;
+            /*
+             * Save 16 input samples in the history buffer
+             */
             vst1q(pStateCur, vld1q(pTempSrc));
             pStateCur += 16;
             pTempSrc += 16;
 
-            vecIn0 = vld1q(pSamples);
-            acc0 = vmladavq(vecIn0, vecCoeffs);
+            int       i = tapsBlkCnt;
+            while (i > 0)
+            {
+                vecCoeffs = *(q7x16_t *) pCoeffsTmp;
+                /* acc<k> is output sample k: its window starts at pSamplesTmp + k */
+                vecIn0 = vld1q(pSamplesTmp);
+                acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
 
-            vecIn0 = vld1q(&pSamples[1]);
-            acc1 = vmladavq(vecIn0, vecCoeffs);
+                vecIn0 = vld1q(&pSamplesTmp[1]);
+                acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
+
+                pSamplesTmp += 16;
+                pCoeffsTmp += 16;
+                i--;
+            }
 
             *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
             *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
@@ -165,13 +394,29 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
 
     case 1:
         {
+            const q7_t     *pCoeffsTmp = pCoeffs;
+            const q7_t     *pSamplesTmp = pSamples;
+
+            acc0 = 0;
+            /*
+             * Save 16 input samples in the history buffer
+             */
             vst1q(pStateCur, vld1q(pTempSrc));
             pStateCur += 16;
             pTempSrc += 16;
 
-            vecIn0 = vld1q(pSamples);
-            acc0 = vmladavq(vecIn0, vecCoeffs);
+            int       i = tapsBlkCnt;
+            while (i > 0)
+            {
+                vecCoeffs = *(q7x16_t *) pCoeffsTmp;
 
+                vecIn0 = vld1q(pSamplesTmp);
+                acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
+
+                pSamplesTmp += 16;
+                pCoeffsTmp += 16;
+                i--;
+            }
             *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
         }
         break;
@@ -198,288 +443,6 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
         vstrbq_p_s8(pTempDest, vld1q(pTempSrc), p0);
     }
 }
-
-void arm_fir_q7(
-  const arm_fir_instance_q7 * S,
-  const q7_t * pSrc,
-        q7_t * pDst,
-        uint32_t blockSize)
-{
-    q7_t     *pState = S->pState;   /* State pointer */
-    const q7_t     *pCoeffs = S->pCoeffs; /* Coefficient pointer */
-    q7_t     *pStateCur;        /* Points to the current sample of the state */
-    const q7_t     *pSamples;         /* Temporary pointer to the sample buffer */
-    q7_t     *pOutput;          /* Temporary pointer to the output buffer */
-    const q7_t     *pTempSrc;         /* Temporary pointer to the source data */
-    q7_t     *pTempDest;        /* Temporary pointer to the destination buffer */
-    uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
-    uint32_t  blkCnt;
-    q7x16_t  vecIn0;
-    uint32_t  tapsBlkCnt = (numTaps + 15) / 16;
-    q31_t     acc0, acc1, acc2, acc3;
-    q7x16_t  vecCoeffs;
-
-    if (blockSize >= 20)
-    {
-        if (numTaps <= 16)
-        {
-            /*
-             * [1 to 16 taps] specialized routine
-             */
-            arm_fir_q7_1_16_mve(S, pSrc, pDst, blockSize);
-            return;
-        }
-    }
-
-    if (blockSize >= 20)
-    {
-      /*
-       * pState points to state array which contains previous frame (numTaps - 1) samples
-       * pStateCur points to the location where the new input data should be written
-       */
-      pStateCur   = &(pState[(numTaps - 1u)]);
-      pSamples    = pState;
-      pTempSrc    = pSrc;
-      pOutput     = pDst;
-      blkCnt      = blockSize >> 2;
-  
-      /*
-       * outer samples loop
-       */
-      while (blkCnt > 0U)
-      {
-          const q7_t     *pCoeffsTmp = pCoeffs;
-          const q7_t     *pSamplesTmp = pSamples;
-  
-          acc0 = 0;
-          acc1 = 0;
-          acc2 = 0;
-          acc3 = 0;
-          /*
-           * Save 16 input samples in the history buffer
-           */
-          vst1q(pStateCur, vld1q(pTempSrc));
-          pStateCur += 16;
-          pTempSrc += 16;
-  
-          /*
-           * inner coefficients loop
-           */
-          uint32_t       i = tapsBlkCnt;
-          while (i > 0U)
-          {
-              /*
-               * load 16 coefs
-               */
-              vecCoeffs = *(q7x16_t *) pCoeffsTmp;
-  
-              vecIn0 = vld1q(pSamplesTmp);
-              acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
-  
-              vecIn0 = vld1q(&pSamplesTmp[1]);
-              acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
-  
-              vecIn0 = vld1q(&pSamplesTmp[2]);
-              acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);
-  
-              vecIn0 = vld1q(&pSamplesTmp[3]);
-              acc3 = vmladavaq(acc3, vecIn0, vecCoeffs);
-  
-              pSamplesTmp += 16;
-              pCoeffsTmp += 16;
-              /*
-               * Decrement the taps block loop counter
-               */
-              i--;
-          }
-          /*
-           * Store the 1.7 format filter output in destination buffer
-           */
-          *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
-          *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
-          *pOutput++ = (q7_t) __SSAT((acc2 >> 7U), 8);
-          *pOutput++ = (q7_t) __SSAT((acc3 >> 7U), 8);
-  
-          pSamples += 4;
-          /*
-           * Decrement the sample block loop counter
-           */
-          blkCnt--;
-      }
-  
-      uint32_t  residual = blockSize & 3;
-      switch (residual)
-      {
-      case 3:
-          {
-              const q7_t     *pCoeffsTmp = pCoeffs;
-              const q7_t     *pSamplesTmp = pSamples;
-  
-              acc0 = 0;
-              acc1 = 0;
-              acc2 = 0;
-              /*
-               * Save 16 input samples in the history buffer
-               */
-              vst1q(pStateCur, vld1q(pTempSrc));
-              pStateCur += 16;
-              pTempSrc += 16;
-  
-              uint32_t       i = tapsBlkCnt;
-              while (i > 0U)
-              {
-                  vecCoeffs = *(q7x16_t *) pCoeffsTmp;
-  
-                  vecIn0 = vld1q(pSamplesTmp);
-                  acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
-  
-                  vecIn0 = vld1q(&pSamplesTmp[1]);
-                  acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
-  
-                  vecIn0 = vld1q(&pSamplesTmp[2]);
-                  acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);
-  
-                  pSamplesTmp += 16;
-                  pCoeffsTmp += 16;
-                  i--;
-              }
-  
-              *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
-              *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
-              *pOutput++ = (q7_t) __SSAT((acc2 >> 7U), 8);
-          }
-          break;
-  
-      case 2:
-          {
-              const q7_t     *pCoeffsTmp = pCoeffs;
-              const q7_t     *pSamplesTmp = pSamples;
-  
-              acc0 = 0;
-              acc1 = 0;
-              /*
-               * Save 16 input samples in the history buffer
-               */
-              vst1q(pStateCur, vld1q(pTempSrc));
-              pStateCur += 16;
-              pTempSrc += 16;
-  
-              uint32_t       i = tapsBlkCnt;
-              while (i > 0U)
-              {
-                  vecCoeffs = *(q7x16_t *) pCoeffsTmp;
-  
-                  vecIn0 = vld1q(pSamplesTmp);
-                  acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
-  
-                  vecIn0 = vld1q(&pSamplesTmp[1]);
-                  acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
-  
-                  pSamplesTmp += 16;
-                  pCoeffsTmp += 16;
-                  i--;
-              }
-  
-              *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
-              *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
-          }
-          break;
-  
-      case 1:
-          {
-              const q7_t     *pCoeffsTmp = pCoeffs;
-              const q7_t     *pSamplesTmp = pSamples;
-  
-              acc0 = 0;
-              /*
-               * Save 16 input samples in the history buffer
-               */
-              vst1q(pStateCur, vld1q(pTempSrc));
-              pStateCur += 16;
-              pTempSrc += 16;
-  
-              uint32_t       i = tapsBlkCnt;
-              while (i > 0U)
-              {
-                  vecCoeffs = *(q7x16_t *) pCoeffsTmp;
-  
-                  vecIn0 = vld1q(pSamplesTmp);
-                  acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
-  
-                  pSamplesTmp += 16;
-                  pCoeffsTmp += 16;
-                  i--;
-              }
-              *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
-          }
-          break;
-      }
-    }
-    else
-    {
-        q7_t *pStateCurnt;                            /* Points to the current sample of the state */
-            q7_t *px;                                     /* Temporary pointer for state buffer */
-      const q7_t *pb;                                     /* Temporary pointer for coefficient buffer */
-            q31_t acc0;                                    /* Accumulator */
-            uint32_t  i,blkCnt;                    /* Loop counters */
-      pStateCurnt = &(S->pState[(numTaps - 1U)]);
-      blkCnt = blockSize;
-
-         while (blkCnt > 0U)
-           {
-             /* Copy one sample at a time into state buffer */
-             *pStateCurnt++ = *pSrc++;
-         
-             /* Set the accumulator to zero */
-             acc0 = 0;
-         
-             /* Initialize state pointer */
-             px = pState;
-         
-             /* Initialize Coefficient pointer */
-             pb = pCoeffs;
-         
-             i = numTaps;
-         
-             /* Perform the multiply-accumulates */
-             while (i > 0U)
-             {
-               acc0 += (q15_t) * (px++) * (*(pb++));
-               i--;
-             } 
-         
-             /* The result is in 2.14 format. Convert to 1.7
-                Then store the output in the destination buffer. */
-             *pDst++ = __SSAT((acc0 >> 7U), 8);
-         
-             /* Advance state pointer by 1 for the next sample */
-             pState = pState + 1U;
-         
-             /* Decrement loop counter */
-             blkCnt--;
-           }
-    }
-    /*
-     * Copy the samples back into the history buffer start
-     */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
-
-    blkCnt = numTaps >> 4;
-    while (blkCnt > 0U)
-    {
-        vst1q(pTempDest, vld1q(pTempSrc));
-        pTempSrc += 16;
-        pTempDest += 16;
-        blkCnt--;
-    }
-    blkCnt = numTaps & 0xF;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp8q(blkCnt);
-        vstrbq_p_s8(pTempDest, vld1q(pTempSrc), p0);
-    }
-}
 #else
 void arm_fir_q7(
   const arm_fir_instance_q7 * S,
@@ -687,7 +650,7 @@ void arm_fir_q7(
     {
       acc0 += (q15_t) * (px++) * (*(pb++));
       i--;
-    } 
+    }
 
     /* The result is in 2.14 format. Convert to 1.7
        Then store the output in the destination buffer. */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c
index f44f037ff4e7db520e5beac747883d739b036044..e6f4d78c848e7a6bb41ff9771bf8ba5c15ca1ae6 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_sparse_f32.c
  * Description:  Floating-point sparse FIR filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c
index 7745e716cd3b9eeb8b6b1c08819ce99e7214abe9..dce0d6f7d80b713b0ececdcb667d084866e1c53b 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_sparse_init_f32.c
  * Description:  Floating-point sparse FIR filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c
index d07d61197577349ab07ff4255806fd92514c53b8..6a48743ed28682a5de66868efc86260ee66a8ff5 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_sparse_init_q15.c
  * Description:  Q15 sparse FIR filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c
index 7c32cea1173df447d81c68fac14e0a5cfeb012a6..a333f1cc852706b00b09aab61ec54af78f96ef49 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_sparse_init_q31.c
  * Description:  Q31 sparse FIR filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c
index 98153f321e018e0b256e871829f1e8f25b06c20f..3e2b9c9a2a7d9837028a22ee436661967331e31e 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_sparse_init_q7.c
  * Description:  Q7 sparse FIR filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c
index 9cea93e2daeb8bf2d369bfe69e098d90ca5b03ea..606692255c8d09ad6dd5fc87c170b0e846a009fb 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_sparse_q15.c
  * Description:  Q15 sparse FIR filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c
index 86d3e1db0008964929473d799271d9563c3e9c66..33c66df47de545b9bea32f4b3f80a654e6a53666 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_sparse_q31.c
  * Description:  Q31 sparse FIR filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c
index 7a2b57f1881b8df54df6537a28954ab9e29e2c5d..e47d889c282e3e8e8cb0315499dad12b0aa731a4 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_fir_sparse_q7.c
  * Description:  Q7 sparse FIR filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c
index c48efe30ef534465c82691a78f610c0a8e87e3ab..bf0b178c434b81340d51825b26bbc49c94ad08c9 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_iir_lattice_f32.c
  * Description:  Floating-point IIR Lattice filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c
index bd9f9338a6d4edf1a1a99be264be7e68d0480c8d..389e1ea8fd7d9ea6db9f46defa530ba9a185ec86 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_iir_lattice_init_f32.c
  * Description:  Floating-point IIR lattice filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c
index 01abf48e75eb8ad0bf2babf256e301d76c1665d0..873c342ae5d9bfd463738f959a2f59d07d76136e 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_iir_lattice_init_q15.c
  * Description:  Q15 IIR lattice filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c
index b472f6ce64ed35f96733ca33cc0247287ab217a8..59b26d3c713f91825df938a16c39d0ef1852d12f 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_iir_lattice_init_q31.c
  * Description:  Initialization function for the Q31 IIR lattice filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c
index 9dbea811995ab92f069b97c4fdf31d6272d10adb..d3cf7f668fcc3ba665839078b3e33b85a93cbc82 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_iir_lattice_q15.c
  * Description:  Q15 IIR Lattice filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c
index c4b9a76268dd68b8cd97efe18d902036ef60bbab..63344955316f80d1d8812dfb2a5343ba39cfe5cb 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_iir_lattice_q31.c
  * Description:  Q31 IIR Lattice filter processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_f16.c b/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..0f0f57f580ea1cf88ab553c5049af6ea765fae3a
--- /dev/null
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_f16.c
@@ -0,0 +1,276 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_levinson_durbin_f16.c
+ * Description:  f16 version of Levinson Durbin algorithm
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+/**
+  @ingroup groupFilters
+ */
+
+/**
+  @defgroup LD Levinson Durbin Algorithm
+
+ */
+
+/**
+  @addtogroup LD
+  @{
+ */
+
+/**
+  @brief         Levinson Durbin
+  @param[in]     phi      autocovariance vector starting with lag 0 (length is nbCoefs + 1)
+  @param[out]    a        autoregressive coefficients
+  @param[out]    err      prediction error (variance)
+  @param[in]     nbCoefs  number of autoregressive coefficients
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_levinson_durbin_f16 built. Helium version has build issues with gcc."
+#endif 
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(__CMSIS_GCC_H)
+
+#include "arm_helium_utils.h"
+
+#define LANE4567_MASK 0xFF00
+
+void arm_levinson_durbin_f16(const float16_t *phi,
+  float16_t *a,
+  float16_t *err,
+  int nbCoefs)
+{
+   _Float16 e;
+   static const uint16_t revOffsetArray[8] = {7,6,5,4,3,2,1,0};
+   /* Order-1 solution and its prediction error seed the recursion. */
+   a[0] = (_Float16)phi[1] / (_Float16)phi[0];
+
+   e = (_Float16)phi[0] - (_Float16)phi[1] * (_Float16)a[0];
+   for(int p=1; p < nbCoefs; p++)
+   {
+      _Float16 suma = 0.0f16;
+      _Float16 sumb = 0.0f16;
+      f16x8_t vecA,vecRevPhi,vecPhi,vecSumA, vecSumB;
+      _Float16 k;
+      uint32_t blkCnt;
+      const float16_t *pPhi,*pRevPhi,*pA;
+      uint16x8_t revOffset;
+
+      int nb,j,i;
+
+      revOffset = vld1q(revOffsetArray);
+      vecSumA = vdupq_n_f16(0.0f16);
+      vecSumB = vdupq_n_f16(0.0f16);
+      /* Reversed gather: lane n of each load is phi[p-i-n]. */
+      pRevPhi = &phi[p-7];
+      pPhi = &phi[1];
+      pA = a;
+
+      i = 0;
+      blkCnt = p >> 3;
+      while(blkCnt > 0)
+      {
+         vecA = vld1q(pA);
+         pA += 8;
+
+         vecPhi = vld1q(pPhi);
+         pPhi += 8;
+
+         vecRevPhi = vldrhq_gather_shifted_offset_f16(pRevPhi,revOffset);
+         pRevPhi -= 8;
+         /* Per lane n: sumA += a[i+n]*phi[p-i-n], sumB += a[i+n]*phi[i+n+1]. */
+         vecSumA = vfmaq(vecSumA,vecA,vecRevPhi);
+         vecSumB = vfmaq(vecSumB,vecA,vecPhi);
+
+         i += 8;
+         blkCnt--;
+
+      }
+
+      suma = vecAddAcrossF16Mve(vecSumA);
+      sumb = vecAddAcrossF16Mve(vecSumB);
+      /* Scalar tail for the remaining p & 7 terms of both sums. */
+      blkCnt = p & 7;
+      while(blkCnt > 0)
+      {
+         suma += (_Float16)a[i] * (_Float16)phi[p - i];
+         sumb += (_Float16)a[i] * (_Float16)phi[i + 1];
+
+         i++;
+         blkCnt--;
+      }
+      /* Reflection coefficient for order p. */
+      k = ((_Float16)phi[p+1] - suma)/((_Float16)phi[0] - sumb);
+
+      f16x8_t vecRevA,tmp;
+      static const uint16_t orgOffsetArray[8]={0,1,2,3,-1,-2,-3,-4};
+      static const uint16_t offsetIncArray[8]={4,4,4,4,-4,-4,-4,-4};
+
+      uint16x8_t offset,offsetInc,vecTmp;
+      /* Lanes 0-3 address a[j..j+3]; adding p to lanes 4-7 makes them
+         address a[p-1-j] down to a[p-4-j]. */
+      offset = vld1q(orgOffsetArray);
+      vecTmp = vdupq_n_u16(p);
+
+      offset = vaddq_m_u16(offset,offset,vecTmp,LANE4567_MASK);
+      offsetInc = vld1q(offsetIncArray);
+
+      nb = p >> 3;
+      j=0;
+      for(int blk = 0; blk < nb ; blk++)
+      {
+
+          /* Scalar equivalent, shown for two of the four lane pairs:
+            x0=a[j] - k * a[p-1-j];
+            x1=a[j+1] - k * a[p-2-j];
+            x2=a[p-1-j] - k * a[j];
+            x3=a[p-2-j] - k * a[j+1];
+
+            a[j] = x0;
+            a[j+1] = x1;
+            a[p-1-j] = x2;
+            a[p-2-j] = x3;
+          */
+
+          uint64_t tmpa,tmpb;
+          vecA = vldrhq_gather_shifted_offset_f16(a,offset);
+
+          /* Swap the 64-bit halves of vecA so each lane faces its mirror.
+             The first insert starts from vecA, never from an unset vector. */
+          tmpa = vgetq_lane_u64((uint64x2_t)vecA,0);
+          tmpb = vgetq_lane_u64((uint64x2_t)vecA,1);
+          vecRevA = (f16x8_t) vsetq_lane_u64(tmpb,(uint64x2_t)vecA,0);
+          vecRevA = (f16x8_t) vsetq_lane_u64(tmpa,(uint64x2_t)vecRevA,1);
+
+          tmp = vsubq(vecA,vmulq_n_f16(vecRevA,k));
+          vstrhq_scatter_shifted_offset_f16(a, offset, tmp);
+
+          offset = vaddq(offset,offsetInc);
+
+          j+=4;
+
+      }
+
+      blkCnt = p & 7;
+      /* The p & 7 elements left in the middle are updated in scalar code. */
+      if (blkCnt)
+      {
+         nb = blkCnt >> 1;
+         for(int blk =0;blk < nb ; blk++)
+         {
+             _Float16 x,y;
+
+             x=(_Float16)a[j] - (_Float16)k * (_Float16)a[p-1-j];
+             y=(_Float16)a[p-1-j] - (_Float16)k * (_Float16)a[j];
+
+             a[j] = x;
+             a[p-1-j] = y;
+
+             j++;
+         }
+         /* Odd remainder: the centre element pairs with itself. */
+         nb = blkCnt & 1;
+         if (nb)
+         {
+               a[j]=(_Float16)a[j]- (_Float16)k * (_Float16)a[p-1-j];
+         }
+      }
+
+
+      a[p] = k;
+      e = e * (1.0f16 - k*k);
+
+
+   }
+   *err = e;
+}
+
+#else
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+void arm_levinson_durbin_f16(const float16_t *phi,
+  float16_t *a,
+  float16_t *err,
+  int nbCoefs)
+{
+   /* Order-1 predictor and its error seed the recursion. */
+   _Float16 predErr;
+
+   a[0] = (_Float16)phi[1] / (_Float16)phi[0];
+   predErr = (_Float16)phi[0] - (_Float16)phi[1] * (_Float16)a[0];
+
+   for(int order = 1; order < nbCoefs; order++)
+   {
+      _Float16 accRev = 0.0f16;
+      _Float16 accFwd = 0.0f16;
+      _Float16 refl;
+      int lo, hi;
+
+      /* accRev sums a[n]*phi[order-n]; accFwd sums a[n]*phi[n+1]. */
+      for(int n = 0; n < order; n++)
+      {
+         accRev += (_Float16)a[n] * (_Float16)phi[order - n];
+         accFwd += (_Float16)a[n] * (_Float16)phi[n + 1];
+      }
+
+      /* Reflection coefficient for this order. */
+      refl = ((_Float16)phi[order+1]-accRev)/((_Float16)phi[0] - accFwd);
+
+      /* Update a[0..order-1] in place, walking inwards from both ends. */
+      lo = 0;
+      hi = order - 1;
+      while(lo < hi)
+      {
+          _Float16 front = (_Float16)a[lo] - (_Float16)refl * (_Float16)a[hi];
+          _Float16 back = (_Float16)a[hi] - (_Float16)refl * (_Float16)a[lo];
+
+          a[lo] = front;
+          a[hi] = back;
+          lo++;
+          hi--;
+      }
+
+      /* Odd count: the centre element is its own mirror. */
+      if (lo == hi)
+      {
+          a[lo] = (_Float16)a[lo] - (_Float16)refl * (_Float16)a[hi];
+      }
+
+      a[order] = refl;
+      predErr = predErr * (1.0f16 - refl*refl);
+   }
+
+   *err = predErr;
+}
+#endif /* defined(ARM_FLOAT16_SUPPORTED) */
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H) */
+/**
+  @} end of LD group
+ */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..0a29e0358e491951dfe26699d7a9808fa65f36bb
--- /dev/null
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_f32.c
@@ -0,0 +1,278 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_levinson_durbin_f32.c
+ * Description:  f32 version of Levinson Durbin algorithm
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+  @ingroup groupFilters
+ */
+
+/**
+  @defgroup LD Levinson Durbin Algorithm
+
+ */
+
+/**
+  @addtogroup LD
+  @{
+ */
+
+/**
+  @brief         Levinson Durbin
+  @param[in]     phi      autocovariance vector starting with lag 0 (length is nbCoefs + 1)
+  @param[out]    a        autoregressive coefficients
+  @param[out]    err      prediction error (variance)
+  @param[in]     nbCoefs  number of autoregressive coefficients
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_levinson_durbin_f32 built. Helium version has build issues with gcc."
+#endif /* warn when the f32 Helium build (ARM_MATH_MVEF) falls back to scalar under gcc */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(__CMSIS_GCC_H)
+
+#include "arm_helium_utils.h"
+
+#define LANE23_MASK 0xFF00
+
+void arm_levinson_durbin_f32(const float32_t *phi,
+  float32_t *a,
+  float32_t *err,
+  int nbCoefs)
+{
+   float32_t e;
+   static const uint32_t revOffsetArray[4] = {3,2,1,0};
+   /* Order-1 solution and its prediction error seed the recursion. */
+   a[0] = phi[1] / phi[0];
+
+   e = phi[0] - phi[1] * a[0];
+   for(int p=1; p < nbCoefs; p++)
+   {
+      float32_t suma = 0.0f;
+      float32_t sumb = 0.0f;
+      f32x4_t vecA,vecRevPhi,vecPhi,vecSumA, vecSumB;
+      float32_t k;
+      uint32_t blkCnt;
+      const float32_t *pPhi,*pRevPhi,*pA;
+      uint32x4_t revOffset;
+
+      int nb,j,i;
+
+      revOffset = vld1q(revOffsetArray);
+      vecSumA = vdupq_n_f32(0.0f);
+      vecSumB = vdupq_n_f32(0.0f);
+      /* Reversed gather: lane n of each load is phi[p-i-n]. */
+      pRevPhi = &phi[p-3];
+      pPhi = &phi[1];
+      pA = a;
+
+      i = 0;
+      blkCnt = p >> 2;
+      while(blkCnt > 0)
+      {
+         vecA = vld1q(pA);
+         pA += 4;
+
+         vecPhi = vld1q(pPhi);
+         pPhi += 4;
+
+         vecRevPhi = vldrwq_gather_shifted_offset_f32(pRevPhi,revOffset);
+         pRevPhi -= 4;
+         /* Per lane n: sumA += a[i+n]*phi[p-i-n], sumB += a[i+n]*phi[i+n+1]. */
+         vecSumA = vfmaq(vecSumA,vecA,vecRevPhi);
+         vecSumB = vfmaq(vecSumB,vecA,vecPhi);
+
+         i += 4;
+         blkCnt--;
+
+      }
+
+      suma = vecAddAcrossF32Mve(vecSumA);
+      sumb = vecAddAcrossF32Mve(vecSumB);
+      /* Scalar tail for the remaining p & 3 terms of both sums. */
+      blkCnt = p & 3;
+      while(blkCnt > 0)
+      {
+         suma += a[i] * phi[p - i];
+         sumb += a[i] * phi[i + 1];
+
+         i++;
+         blkCnt--;
+      }
+      /* Reflection coefficient for order p. */
+      k = (phi[p+1] - suma)/(phi[0] - sumb);
+
+      f32x4_t vecRevA,tmp;
+      static const uint32_t orgOffsetArray[4]={0,1,-1,-2};
+      static const uint32_t offsetIncArray[4]={2,2,-2,-2};
+
+      uint32x4_t offset,offsetInc,vecTmp;
+      /* Lanes 0-1 address a[j], a[j+1]; adding p to lanes 2-3 makes them
+         address a[p-1-j] and a[p-2-j]. */
+      offset = vld1q(orgOffsetArray);
+      vecTmp = vdupq_n_u32(p);
+
+      offset = vaddq_m_u32(offset,offset,vecTmp,LANE23_MASK);
+      offsetInc = vld1q(offsetIncArray);
+
+      nb = p >> 2;
+      j=0;
+      for(int blk = 0; blk < nb ; blk++)
+      {
+
+          /* Scalar equivalent of one vector step:
+            x0=a[j] - k * a[p-1-j];
+            x1=a[j+1] - k * a[p-2-j];
+            x2=a[p-1-j] - k * a[j];
+            x3=a[p-2-j] - k * a[j+1];
+
+            a[j] = x0;
+            a[j+1] = x1;
+            a[p-1-j] = x2;
+            a[p-2-j] = x3;
+          */
+
+          uint64_t tmpa,tmpb;
+          vecA = vldrwq_gather_shifted_offset_f32(a,offset);
+
+          /* Swap the 64-bit halves of vecA so each lane faces its mirror.
+             The first insert starts from vecA, never from an unset vector. */
+          tmpa = vgetq_lane_u64((uint64x2_t)vecA,0);
+          tmpb = vgetq_lane_u64((uint64x2_t)vecA,1);
+          vecRevA = (f32x4_t) vsetq_lane_u64(tmpb,(uint64x2_t)vecA,0);
+          vecRevA = (f32x4_t) vsetq_lane_u64(tmpa,(uint64x2_t)vecRevA,1);
+
+          tmp = vsubq(vecA,vmulq_n_f32(vecRevA,k));
+          vstrwq_scatter_shifted_offset_f32(a, offset, tmp);
+
+          offset = vaddq(offset,offsetInc);
+
+          j+=2;
+
+      }
+      /* The p & 3 elements left in the middle are updated in scalar code. */
+      switch(p & 3)
+      {
+         case 3:
+         {
+            float32_t x,y;
+            x = a[j] - k * a[p-1-j];
+            y = a[p-1-j] - k * a[j];
+
+            a[j] = x;
+            a[p-1-j] = y;
+            /* Centre element: here p-1-(j+1) equals j+1. */
+            a[j+1] = a[j+1] - k * a[p-1-(j+1)];
+         }
+         break;
+
+         case 2:
+         {
+            float32_t x,y;
+            x = a[j] - k * a[p-1-j];
+            y = a[p-1-j] - k * a[j];
+
+            a[j] = x;
+            a[p-1-j] = y;
+         }
+         break;
+
+         case 1:
+            a[j] = a[j]- k * a[p-1-j];
+         break;
+      }
+
+      a[p] = k;
+      e = e * (1.0f - k*k);
+
+
+   }
+   *err = e;
+}
+
+#else
+void arm_levinson_durbin_f32(const float32_t *phi,
+  float32_t *a,
+  float32_t *err,
+  int nbCoefs)
+{
+   /* Order-1 predictor and its error seed the recursion. */
+   float32_t predErr;
+
+   a[0] = phi[1] / phi[0];
+   predErr = phi[0] - phi[1] * a[0];
+
+   for(int order = 1; order < nbCoefs; order++)
+   {
+      float32_t accRev = 0.0f;
+      float32_t accFwd = 0.0f;
+      float32_t refl;
+      int lo, hi;
+
+      /* accRev sums a[n]*phi[order-n]; accFwd sums a[n]*phi[n+1]. */
+      for(int n = 0; n < order; n++)
+      {
+         accRev += a[n] * phi[order - n];
+         accFwd += a[n] * phi[n + 1];
+      }
+
+      /* Reflection coefficient for this order. */
+      refl = (phi[order+1]-accRev)/(phi[0] - accFwd);
+
+      /* Update a[0..order-1] in place, walking inwards from both ends. */
+      lo = 0;
+      hi = order - 1;
+      while(lo < hi)
+      {
+          float32_t front = a[lo] - refl * a[hi];
+          float32_t back = a[hi] - refl * a[lo];
+
+          a[lo] = front;
+          a[hi] = back;
+          lo++;
+          hi--;
+      }
+
+      /* Odd count: the centre element is its own mirror. */
+      if (lo == hi)
+      {
+          a[lo] = a[lo] - refl * a[hi];
+      }
+
+      a[order] = refl;
+      predErr = predErr * (1.0f - refl*refl);
+   }
+
+   *err = predErr;
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of LD group
+ */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_q31.c
new file mode 100644
index 0000000000000000000000000000000000000000..a226f023103af65aa38fa73d628f62fe478b31ff
--- /dev/null
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_q31.c
@@ -0,0 +1,378 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_levinson_durbin_q31.c
+ * Description:  q31 version of Levinson Durbin algorithm
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+#define ONE_Q31 0x7FFFFFFFL /* stands for 1 in the e * (1 - k*k) update */
+#define TWO_Q30 0x7FFFFFFFL /* stands for 2 in Q30 inside divide() */
+
+#define HALF_Q31 0x00008000L /* NOTE(review): unused in this file; 0.5 in Q31 would be 0x40000000 - confirm intent */
+#define ONE_Q15 0x7FFF /* unused in this file */
+#define HALF_Q15 0x3FFF /* numerator passed to arm_divide_q15 in divide() */
+#define LOWPART_MASK 0x07FFF /* unused in this file */
+
+__STATIC_FORCEINLINE q31_t mul32x16(q31_t a, q15_t b)
+{
+  /* 64-bit product of a and b, shifted right by 15 and narrowed to q31_t. */
+  const q63_t wide = (q63_t)a * (q63_t)b;
+
+  return (q31_t)(wide >> 15);
+}
+
+__STATIC_FORCEINLINE q31_t mul32x32(q31_t a, q31_t b)
+{
+  /* 64-bit product shifted right by 31; the result is narrowed to q31_t
+     without saturation (no __SSAT is applied). */
+  const q63_t wide = (q63_t)a * b;
+
+  return (q31_t)(wide >> 31);
+}
+
+__STATIC_FORCEINLINE q31_t divide(q31_t n, q31_t d)
+{
+  arm_status status;
+  int16_t shift;
+  q15_t inverse;
+  q31_t r;
+  // Fixed-point approximation of n / d. We are computing:
+  // n / d = n / (h + l) where h and l are the high and low parts of d.
+  // 1 / (h + l) ~= 1 / h (1 - l / h)   (first-order approximation)
+  // Our division algorithm has a shift. So it is returning a scaled value sh.
+  // So we need a << shift to convert 1/ sh to 1/h.
+  // In below code, we are organizing the computation differently. Instead of computing:
+  // 1 / h (1 - l / h) 
+  // we are computing
+  // 1 / h (2 - (l + h) / h) 
+  // 1 / h (2 - d / h)
+  // Also, we are not computing 1/h in Q15 but in Q14.
+  // 2 is expressed in Q30.
+  // So at the end of all computation we need a << 2
+
+  // Result is in Q14 because of use of HALF_Q15 instead of ONE_Q15.
+  status=arm_divide_q15(HALF_Q15,d>>16,&inverse,&shift);
+  (void)status;
+  // NOTE(review): status is discarded, so a failing arm_divide_q15 goes unreported - confirm callers.
+  // d is used instead of l
+  // So we will need to subtract from 2 instead of 1.
+  r = mul32x16(d,inverse);
+  r = TWO_Q30 - (r << shift);
+  r = mul32x16(r, inverse);
+  r = mul32x32(r,n) ;
+  r = r << (shift + 2);
+  
+  return(r);
+  
+}
+
+/**
+  @ingroup groupFilters
+ */
+
+/**
+  @defgroup LD Levinson Durbin Algorithm
+
+ */
+
+/**
+  @addtogroup LD
+  @{
+ */
+
+/**
+  @brief         Levinson Durbin
+  @param[in]     phi      autocovariance vector starting with lag 0 (length is nbCoefs + 1)
+  @param[out]    a        autoregressive coefficients
+  @param[out]    err      prediction error (variance)
+  @param[in]     nbCoefs  number of autoregressive coefficients
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_levinson_durbin_q31 built. Helium version has build issues with gcc."
+#endif /* warn when the q31 Helium build (ARM_MATH_MVEI) falls back to scalar under gcc */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(__CMSIS_GCC_H)
+
+#define LANE23_MASK 0xFF00
+
+#include "arm_helium_utils.h"
+void arm_levinson_durbin_q31(const q31_t *phi,
+  q31_t *a,
+  q31_t *err,
+  int nbCoefs)
+{
+    q31_t e;
+
+    static const uint32_t revOffsetArray[4] = {3,2,1,0};
+
+   //a[0] = phi[1] / phi[0];
+   a[0] = divide(phi[1], phi[0]);
+
+
+   //e = phi[0] - phi[1] * a[0];
+   e = phi[0] - mul32x32(phi[1],a[0]);
+
+   for(int p=1; p < nbCoefs; p++)
+   {
+      q63_t suma=0;
+      q63_t sumb=0;
+      q31x4_t vecA,vecRevPhi,vecPhi;
+      q31_t k;
+      uint32_t blkCnt;
+      const q31_t *pPhi,*pRevPhi,*pA;
+      uint32x4_t revOffset;
+
+
+      int nb,j,i;
+
+      revOffset = vld1q(revOffsetArray);
+      /* Reversed gather: lane n of each load is phi[p-i-n]. */
+      pRevPhi = &phi[p-3];
+      pPhi = &phi[1];
+      pA = a;
+
+      i = 0;
+      blkCnt = p >> 2;
+      while(blkCnt > 0)
+      {
+         vecA = vld1q(pA);
+         pA += 4;
+
+         vecPhi = vld1q(pPhi);
+         pPhi += 4;
+
+         vecRevPhi = vldrwq_gather_shifted_offset_s32(pRevPhi,revOffset);
+         pRevPhi -= 4;
+         /* Both dot products accumulate directly into the 64-bit sums. */
+         suma = vmlaldavaq(suma,vecA,vecRevPhi);
+         sumb = vmlaldavaq(sumb,vecA,vecPhi);
+
+         i += 4;
+         blkCnt--;
+      }
+
+      /* Scalar tail for the remaining p & 3 terms of both sums. */
+      blkCnt = p & 3;
+      while(blkCnt > 0)
+      {
+         suma += ((q63_t)a[i] * phi[p - i]);
+         sumb += ((q63_t)a[i] * phi[i + 1]);
+
+         i++;
+         blkCnt--;
+      }
+      /* Scale both sums back down by 31 bits. */
+      suma = asrl(suma, 31);
+      sumb = asrl(sumb, 31);
+
+
+
+      //k = (phi[p+1]-suma)/(phi[0] - sumb);
+      k = divide(phi[p+1]-(q31_t)suma,phi[0] - (q31_t)sumb);
+
+      q31x4_t vecRevA,tmp;
+      static const uint32_t orgOffsetArray[4]={0,1,-1,-2};
+      static const uint32_t offsetIncArray[4]={2,2,-2,-2};
+
+      uint32x4_t offset,offsetInc,vecTmp;
+      /* Lanes 0-1 address a[j], a[j+1]; adding p to lanes 2-3 makes them
+         address a[p-1-j] and a[p-2-j]. */
+      offset = vld1q(orgOffsetArray);
+      vecTmp = vdupq_n_u32(p);
+
+      offset = vaddq_m_u32(offset,offset,vecTmp,LANE23_MASK);
+      offsetInc = vld1q(offsetIncArray);
+
+
+      nb = p >> 2;
+      j=0;
+      for(int blk =0;blk < nb ; blk++)
+      {
+        /* Scalar equivalent of one vector step:
+          q31_t x0,x1,x2,x3;
+
+          //x = a[j] - k * a[p-1-j];
+          x0 = a[j] - mul32x32(k,a[p-1-j]);
+          x1 = a[j+1] - mul32x32(k,a[p-2-j]);
+
+          //y = a[p-1-j] - k * a[j];
+          x2 = a[p-1-j] - mul32x32(k , a[j]);
+          x3 = a[p-2-j] - mul32x32(k , a[j+1]);
+
+          a[j] = x0;
+          a[j+1] = x1;
+          a[p-1-j] = x2;
+          a[p-2-j] = x3;
+        */
+
+          uint64_t tmpa,tmpb;
+          vecA = vldrwq_gather_shifted_offset_s32(a,offset);
+
+          /* Swap the 64-bit halves of vecA; the first insert starts from vecA. */
+          tmpa = vgetq_lane_u64((uint64x2_t)vecA,0);
+          tmpb = vgetq_lane_u64((uint64x2_t)vecA,1);
+          vecRevA = (q31x4_t) vsetq_lane_u64(tmpb,(uint64x2_t)vecA,0);
+          vecRevA = (q31x4_t) vsetq_lane_u64(tmpa,(uint64x2_t)vecRevA,1);
+
+          /* NOTE(review): vqdmulhq saturates, the scalar tail's mul32x32 does not - confirm parity. */
+          tmp = vsubq(vecA,vqdmulhq_n_s32(vecRevA,k));
+          vstrwq_scatter_shifted_offset_s32(a, offset, tmp);
+
+          offset = vaddq(offset,offsetInc);
+
+          j+=2;
+      }
+      /* The p & 3 elements left in the middle are updated in scalar code. */
+      switch(p & 3)
+      {
+         case 3:
+         {
+          q31_t x,y;
+
+          //x = a[j] - k * a[p-1-j];
+          x = a[j] - mul32x32(k,a[p-1-j]);
+
+          //y = a[p-1-j] - k * a[j];
+          y = a[p-1-j] - mul32x32(k , a[j]);
+
+          a[j] = x;
+          a[p-1-j] = y;
+
+          //a[j+1] = a[j+1] - k * a[p-2-j];  (centre element, p-2-j == j+1)
+          a[j+1] = a[j+1] - mul32x32(k,a[p-2-j]);
+         }
+         break;
+
+         case 2:
+         {
+          q31_t x,y;
+
+          //x = a[j] - k * a[p-1-j];
+          x = a[j] - mul32x32(k,a[p-1-j]);
+
+          //y = a[p-1-j] - k * a[j];
+          y = a[p-1-j] - mul32x32(k , a[j]);
+
+          a[j] = x;
+          a[p-1-j] = y;
+         }
+         break;
+
+         case 1:
+            //a[j] = a[j]- k * a[p-1-j];
+            a[j] = a[j] - mul32x32(k,a[p-1-j]);
+         break;
+      }
+
+      a[p] = k;
+
+      // e = e * (1 - k*k);
+      e = mul32x32(e,ONE_Q31 - mul32x32(k,k));
+
+
+   }
+   *err = e;
+}
+
+#else
+
+void arm_levinson_durbin_q31(const q31_t *phi,
+  q31_t *a,
+  q31_t *err,
+  int nbCoefs)
+{
+   /* Fixed-point recursion: divide() and mul32x32() take the place of the
+      division and product of the floating-point formulation. */
+   q31_t predErr;
+
+   /* a[0] = phi[1] / phi[0] */
+   a[0] = divide(phi[1], phi[0]);
+
+   /* predErr = phi[0] - phi[1] * a[0] */
+   predErr = phi[0] - mul32x32(phi[1],a[0]);
+
+   for(int order = 1; order < nbCoefs; order++)
+   {
+      /* a[0..order-1] hold the coefficients solved so far. */
+      q63_t accRev = 0;
+      q63_t accFwd = 0;
+      q31_t refl;
+      int lo, hi;
+
+      /* 64-bit accumulation: accRev sums a[n]*phi[order-n],
+         accFwd sums a[n]*phi[n+1]. */
+      for(int n = 0; n < order; n++)
+      {
+         accRev += ((q63_t)a[n] * phi[order - n]);
+         accFwd += ((q63_t)a[n] * phi[n + 1]);
+      }
+
+      /* Scale both sums back down by 31 bits. */
+      accRev >>= 31;
+      accFwd >>= 31;
+
+      /* refl = (phi[order+1] - accRev) / (phi[0] - accFwd) */
+      refl = divide(phi[order+1]-(q31_t)accRev,phi[0] - (q31_t)accFwd);
+
+      /* Update a[0..order-1] in place, walking inwards from both ends:
+         each end is corrected by refl times its mirror element. */
+      lo = 0;
+      hi = order - 1;
+      while(lo < hi)
+      {
+          /* Both new values are computed before either is stored. */
+          const q31_t front = a[lo] - mul32x32(refl,a[hi]);
+          const q31_t back = a[hi] - mul32x32(refl , a[lo]);
+
+          a[lo] = front;
+          a[hi] = back;
+
+          lo++;
+          hi--;
+      }
+
+      /* Odd count: the centre element is its own mirror. */
+      if (lo == hi)
+      {
+          a[lo] = a[lo] - mul32x32(refl,a[hi]);
+      }
+
+      a[order] = refl;
+
+      /* predErr = predErr * (1 - refl*refl) */
+      predErr = mul32x32(predErr,ONE_Q31 - mul32x32(refl,refl));
+   }
+
+   /* Hand the final prediction error back to the caller. */
+   *err = predErr;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of LD group
+ */
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_f32.c
index 4a863594f6180c472c1a024887b9574f35794cdb..99cbeb4242e1e8be4525c4068ae6e4d0cfd34463 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_lms_f32.c
  * Description:  Processing function for the floating-point LMS filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_f32.c
index f418f46109f833ff726769f0593464d51f2dcb44..cf472afde01d34dfdf8e1d2abe26835171411cef 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_lms_init_f32.c
  * Description:  Floating-point LMS filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @addtogroup LMS
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q15.c
index fe0a5c51618dadaed3aae2e39f58102d30e8879e..31c2bff2447305b30b2bed26cdffa2341b9dad95 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_lms_init_q15.c
  * Description:  Q15 LMS filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q31.c
index 3410b9f214a0bc95d64376f8b783378d5e973368..e657a9382411df84638a66ade863145683ae48e6 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_lms_init_q31.c
  * Description:  Q31 LMS filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c
index 693d45f809883298dcf0dd5317e167ff34a32462..85f8211c98c8e93bc388a00dacddc7091512b070 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_lms_norm_f32.c
  * Description:  Processing function for the floating-point NLMS filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c
index 543dc725881fb3e26b5501b05421e14b6b743df3..b0b7ea505b86cdc6175431425d9b2a69120843f9 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_lms_norm_init_f32.c
  * Description:  Floating-point NLMS filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q15.c
index d581ac18ec6bb9dd5ceea00186a50346afce904d..5bbfc42052175dd26e144dfb9049d007913cbfdd 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_lms_norm_init_q15.c
  * Description:  Q15 NLMS filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 #include "arm_common_tables.h"
 
 /**
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q31.c
index 30e78ec4254c941f15e852a103059dcb8f545f7c..cc429b0097a69fbdf915913899cf3809f6bd71c4 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_lms_norm_init_q31.c
  * Description:  Q31 NLMS filter initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 #include "arm_common_tables.h"
 
 /**
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c
index c15ad5eb471e0179870a612db82ebda1c817efba..b07efdda58a1487669cd0ba804ebb1e266307b68 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_lms_norm_q15.c
  * Description:  Processing function for Q15 normalized LMS filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c
index e26219ed379c88837124f57d9f7149b1e022c558..6375d8ff5ed126165cd4332821923ef7644d0d92 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_lms_norm_q31.c
  * Description:  Processing function for the Q31 NLMS filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q15.c b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q15.c
index 0fc98783afe949a4ca004723d271ff55ecafde63..76cab0320acbf8f23930be7835b4e2efa23e7b6d 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q15.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_lms_q15.c
  * Description:  Processing function for Q15 LMS filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q31.c b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q31.c
index b0c0e275925d1c169070de160e5689bcfe6b6349..1ed61f276b117bd5b951d169384a739753d19884 100644
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q31.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_lms_q31.c
  * Description:  Processing function for the Q31 LMS filter
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/filtering_functions.h"
 
 /**
   @ingroup groupFilters
diff --git a/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_f16.c b/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..c1946acaf140d2479393ae45cd856102d499bf28
--- /dev/null
+++ b/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_f16.c
@@ -0,0 +1,167 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_bilinear_interp_f16.c
+ * Description:  Half-precision floating-point (f16) bilinear interpolation
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupInterpolation
+ */
+
+/**
+   * @defgroup BilinearInterpolate Bilinear Interpolation
+   *
+   * Bilinear interpolation is an extension of linear interpolation applied to a two dimensional grid.
+   * The underlying function <code>f(x, y)</code> is sampled on a regular grid and the interpolation process
+   * determines values between the grid points.
+   * Bilinear interpolation is equivalent to two step linear interpolation, first in the x-dimension and then in the y-dimension.
+   * Bilinear interpolation is often used in image processing to rescale images.
+   * The CMSIS DSP library provides bilinear interpolation functions for Q7, Q15, Q31, and floating-point data types.
+   *
+   * <b>Algorithm</b>
+   * \par
+   * The instance structure used by the bilinear interpolation functions describes a two dimensional data table.
+   * For floating-point, the instance structure is defined as:
+   * <pre>
+   *   typedef struct
+   *   {
+   *     uint16_t numRows;
+   *     uint16_t numCols;
+   *     float16_t *pData;
+   * } arm_bilinear_interp_instance_f16;
+   * </pre>
+   *
+   * \par
+   * where <code>numRows</code> specifies the number of rows in the table;
+   * <code>numCols</code> specifies the number of columns in the table;
+   * and <code>pData</code> points to an array of size <code>numRows*numCols</code> values.
+   * The data table <code>pData</code> is organized in row order and the supplied data values fall on integer indexes.
+   * That is, table element (x,y) is located at <code>pData[x + y*numCols]</code> where x and y are integers.
+   *
+   * \par
+   * Let <code>(x, y)</code> specify the desired interpolation point.  Then define:
+   * <pre>
+   *     XF = floor(x)
+   *     YF = floor(y)
+   * </pre>
+   * \par
+   * The interpolated output point is computed as:
+   * <pre>
+   *  f(x, y) = f(XF, YF) * (1-(x-XF)) * (1-(y-YF))
+   *           + f(XF+1, YF) * (x-XF)*(1-(y-YF))
+   *           + f(XF, YF+1) * (1-(x-XF))*(y-YF)
+   *           + f(XF+1, YF+1) * (x-XF)*(y-YF)
+   * </pre>
+   * Note that the coordinates (x, y) contain integer and fractional components.
+   * The integer components specify which portion of the table to use while the
+   * fractional components control the interpolation process.
+   *
+   * \par
+   * If (x, y) is outside the table boundary, bilinear interpolation returns zero.
+   */
+
+
+  /**
+   * @addtogroup BilinearInterpolate
+   * @{
+   */
+
+
+  /**
+  * @brief  Half-precision floating-point (f16) bilinear interpolation.
+  * @param[in]     S  points to an instance of the interpolation structure (only read here).
+  * @param[in]     X  interpolation coordinate along a row (column position, checked against numCols).
+  * @param[in]     Y  interpolation coordinate across rows (row position, checked against numRows).
+  * @return interpolated value, or 0 when an index is outside [0, numCols-2] / [0, numRows-2].
+  */
+  float16_t arm_bilinear_interp_f16(
+  const arm_bilinear_interp_instance_f16 * S,
+  float16_t X,
+  float16_t Y)
+  {
+    float16_t out;
+    float16_t f00, f01, f10, f11;
+    float16_t *pData = S->pData;
+    int32_t xIndex, yIndex, index;
+    float16_t xdiff, ydiff;
+    float16_t b1, b2, b3, b4;
+    /* Integer parts of the coordinates; a float-to-integer cast truncates toward zero. */
+    xIndex = (int32_t) X;
+    yIndex = (int32_t) Y;
+
+    /* Samples at xIndex+1 and yIndex+1 are read below, so the largest usable index is size - 2 */
+    /* Returns zero output when either index is outside that range */
+    if (xIndex < 0 || xIndex > (S->numCols - 2) || yIndex < 0 || yIndex > (S->numRows - 2))
+    {
+      return (0);
+    }
+
+    /* Offset of sample (xIndex, yIndex): rows are stored one after another, numCols values each */
+    index = (xIndex ) + (yIndex ) * S->numCols;
+
+
+    /* Read the two adjacent samples on row yIndex */
+    f00 = pData[index];
+    f01 = pData[index + 1];
+
+    /* Offset of sample (xIndex, yIndex+1), i.e. the same column on the next row */
+    index = (xIndex ) + (yIndex+1) * S->numCols;
+
+
+    /* Read the two adjacent samples on row yIndex+1 */
+    f10 = pData[index];
+    f11 = pData[index + 1];
+
+    /* Coefficients of out = b1 + b2*xdiff + b3*ydiff + b4*xdiff*ydiff */
+    b1 = f00;
+    b2 = f01 - f00;
+    b3 = f10 - f00;
+    b4 = f00 - f01 - f10 + f11;
+
+    /* Fractional part of X */
+    xdiff = X - xIndex;
+
+    /* Fractional part of Y */
+    ydiff = Y - yIndex;
+
+    /* Bilinear interpolated output */
+    out = b1 + b2 * xdiff + b3 * ydiff + b4 * xdiff * ydiff;
+
+    /* Return the interpolated value */
+    return (out);
+  }
+
+  /**
+   * @} end of BilinearInterpolate group
+   */
+
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_f32.c b/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..fc001d72d67eabeba945eb8e4d200f8cf238cadd
--- /dev/null
+++ b/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_f32.c
@@ -0,0 +1,161 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_bilinear_interp_f32.c
+ * Description:  Floating-point bilinear interpolation
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+  @ingroup groupInterpolation
+ */
+
+/**
+   * @defgroup BilinearInterpolate Bilinear Interpolation
+   *
+   * Bilinear interpolation is an extension of linear interpolation applied to a two dimensional grid.
+   * The underlying function <code>f(x, y)</code> is sampled on a regular grid and the interpolation process
+   * determines values between the grid points.
+   * Bilinear interpolation is equivalent to two step linear interpolation, first in the x-dimension and then in the y-dimension.
+   * Bilinear interpolation is often used in image processing to rescale images.
+   * The CMSIS DSP library provides bilinear interpolation functions for Q7, Q15, Q31, and floating-point data types.
+   *
+   * <b>Algorithm</b>
+   * \par
+   * The instance structure used by the bilinear interpolation functions describes a two dimensional data table.
+   * For floating-point, the instance structure is defined as:
+   * <pre>
+   *   typedef struct
+   *   {
+   *     uint16_t numRows;
+   *     uint16_t numCols;
+   *     float32_t *pData;
+   * } arm_bilinear_interp_instance_f32;
+   * </pre>
+   *
+   * \par
+   * where <code>numRows</code> specifies the number of rows in the table;
+   * <code>numCols</code> specifies the number of columns in the table;
+   * and <code>pData</code> points to an array of size <code>numRows*numCols</code> values.
+   * The data table <code>pData</code> is organized in row order and the supplied data values fall on integer indexes.
+   * That is, table element (x,y) is located at <code>pData[x + y*numCols]</code> where x and y are integers.
+   *
+   * \par
+   * Let <code>(x, y)</code> specify the desired interpolation point.  Then define:
+   * <pre>
+   *     XF = floor(x)
+   *     YF = floor(y)
+   * </pre>
+   * \par
+   * The interpolated output point is computed as:
+   * <pre>
+   *  f(x, y) = f(XF, YF) * (1-(x-XF)) * (1-(y-YF))
+   *           + f(XF+1, YF) * (x-XF)*(1-(y-YF))
+   *           + f(XF, YF+1) * (1-(x-XF))*(y-YF)
+   *           + f(XF+1, YF+1) * (x-XF)*(y-YF)
+   * </pre>
+   * Note that the coordinates (x, y) contain integer and fractional components.
+   * The integer components specify which portion of the table to use while the
+   * fractional components control the interpolation process.
+   *
+   * \par
+   * If (x, y) is outside the table boundary, bilinear interpolation returns zero.
+   */
+
+
+  /**
+   * @addtogroup BilinearInterpolate
+   * @{
+   */
+
+
+  /**
+  * @brief  Single-precision floating-point (f32) bilinear interpolation.
+  * @param[in]     S  points to an instance of the interpolation structure (only read here).
+  * @param[in]     X  interpolation coordinate along a row (column position, checked against numCols).
+  * @param[in]     Y  interpolation coordinate across rows (row position, checked against numRows).
+  * @return interpolated value, or 0 when an index is outside [0, numCols-2] / [0, numRows-2].
+  */
+  float32_t arm_bilinear_interp_f32(
+  const arm_bilinear_interp_instance_f32 * S,
+  float32_t X,
+  float32_t Y)
+  {
+    float32_t out;
+    float32_t f00, f01, f10, f11;
+    float32_t *pData = S->pData;
+    int32_t xIndex, yIndex, index;
+    float32_t xdiff, ydiff;
+    float32_t b1, b2, b3, b4;
+    /* Integer parts of the coordinates; a float-to-integer cast truncates toward zero. */
+    xIndex = (int32_t) X;
+    yIndex = (int32_t) Y;
+
+    /* Samples at xIndex+1 and yIndex+1 are read below, so the largest usable index is size - 2 */
+    /* Returns zero output when either index is outside that range */
+    if (xIndex < 0 || xIndex > (S->numCols - 2) || yIndex < 0 || yIndex > (S->numRows - 2))
+    {
+      return (0);
+    }
+
+    /* Offset of sample (xIndex, yIndex): rows are stored one after another, numCols values each */
+    index = (xIndex ) + (yIndex ) * S->numCols;
+
+
+    /* Read the two adjacent samples on row yIndex */
+    f00 = pData[index];
+    f01 = pData[index + 1];
+
+    /* Offset of sample (xIndex, yIndex+1), i.e. the same column on the next row */
+    index = (xIndex ) + (yIndex+1) * S->numCols;
+
+
+    /* Read the two adjacent samples on row yIndex+1 */
+    f10 = pData[index];
+    f11 = pData[index + 1];
+
+    /* Coefficients of out = b1 + b2*xdiff + b3*ydiff + b4*xdiff*ydiff */
+    b1 = f00;
+    b2 = f01 - f00;
+    b3 = f10 - f00;
+    b4 = f00 - f01 - f10 + f11;
+
+    /* Fractional part of X */
+    xdiff = X - xIndex;
+
+    /* Fractional part of Y */
+    ydiff = Y - yIndex;
+
+    /* Bilinear interpolated output */
+    out = b1 + b2 * xdiff + b3 * ydiff + b4 * xdiff * ydiff;
+
+    /* Return the interpolated value */
+    return (out);
+  }
+
+  /**
+   * @} end of BilinearInterpolate group
+   */
+
diff --git a/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q15.c b/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q15.c
new file mode 100644
index 0000000000000000000000000000000000000000..67267366554f80d73c3ac2fa9b939a4e64b5500f
--- /dev/null
+++ b/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q15.c
@@ -0,0 +1,121 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_bilinear_interp_q15.c
+ * Description:  Q15 bilinear interpolation
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+  @ingroup groupInterpolation
+ */
+
+/**
+   * @addtogroup BilinearInterpolate
+   * @{
+   */
+
+  /**
+  * @brief  Q15 bilinear interpolation.
+  * @param[in]     S  points to an instance of the interpolation structure (only read here).
+  * @param[in]     X  interpolation coordinate in 12.20 format (its index is checked against numCols).
+  * @param[in]     Y  interpolation coordinate in 12.20 format (its index is checked against numRows).
+  * @return interpolated value, or 0 when an index is outside [0, numCols-2] / [0, numRows-2].
+  */
+  q15_t arm_bilinear_interp_q15(
+  arm_bilinear_interp_instance_q15 * S,
+  q31_t X,
+  q31_t Y)
+  {
+    q63_t acc = 0;                               /* 64-bit accumulator for the four weighted terms */
+    q31_t out;                                   /* Temporary product */
+    q15_t x1, x2, y1, y2;                        /* The four neighbouring table values */
+    q31_t xfract, yfract;                        /* X, Y fractional parts */
+    int32_t rI, cI;                              /* rI: index from X (column), cI: index from Y (row) */
+    q15_t *pYData = S->pData;                    /* pointer to the table values */
+    uint32_t nCols = S->numCols;                 /* number of columns (stride between rows) */
+
+    /* Input is in 12.20 format */
+    /* Upper 12 bits hold the table index */
+    /* Index taken from X (used as the column offset below) */
+    rI = ((X & (q31_t)0xFFF00000) >> 20);
+
+    /* Input is in 12.20 format */
+    /* Upper 12 bits hold the table index */
+    /* Index taken from Y (multiplied by nCols below, i.e. the row) */
+    cI = ((Y & (q31_t)0xFFF00000) >> 20);
+
+    /* Entries at rI+1 and cI+1 are read below, so the largest usable index is size - 2 */
+    /* Returns zero output when either index is outside that range */
+    if (rI < 0 || rI > (S->numCols - 2) || cI < 0 || cI > (S->numRows - 2))
+    {
+      return (0);
+    }
+
+    /* Lower 20 bits hold the fractional part */
+    /* xfract keeps those 20 fractional bits of X */
+    xfract = (X & 0x000FFFFF);
+
+    /* Read the two adjacent values on row cI */
+    x1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI)    ];
+    x2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) + 1];
+
+    /* Lower 20 bits hold the fractional part */
+    /* yfract keeps those 20 fractional bits of Y */
+    yfract = (Y & 0x000FFFFF);
+
+    /* Read the two adjacent values on row cI + 1 */
+    y1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1)    ];
+    y2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) + 1];
+
+    /* Calculation of x1 * (1-xfract) * (1-yfract); acc ends up with 51 fractional bits */
+    /* (1 - fract) is formed as 0x0FFFFF - fract, i.e. (2^20 - 1) - fract */
+    /* x1 is 1.15 (q15) and the weight has 20 fractional bits, so the product has 35 fractional bits */
+    /* shifting right by 4 leaves 31 fractional bits in out */
+    out = (q31_t) (((q63_t) x1 * (0x0FFFFF - xfract)) >> 4U);
+    acc = ((q63_t) out * (0x0FFFFF - yfract));
+
+    /* x2 * (xfract) * (1-yfract), same scaling, added to acc */
+    out = (q31_t) (((q63_t) x2 * (0x0FFFFF - yfract)) >> 4U);
+    acc += ((q63_t) out * (xfract));
+
+    /* y1 * (1 - xfract) * (yfract), same scaling, added to acc */
+    out = (q31_t) (((q63_t) y1 * (0x0FFFFF - xfract)) >> 4U);
+    acc += ((q63_t) out * (yfract));
+
+    /* y2 * (xfract) * (yfract), same scaling, added to acc */
+    out = (q31_t) (((q63_t) y2 * (xfract)) >> 4U);
+    acc += ((q63_t) out * (yfract));
+
+    /* acc has 51 fractional bits; shifting right by 36 leaves 15 */
+    /* Narrow the result to 1.15 (q15) format */
+    return ((q15_t)(acc >> 36));
+  }
+
+
+  /**
+   * @} end of BilinearInterpolate group
+   */
+
diff --git a/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q31.c b/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q31.c
new file mode 100644
index 0000000000000000000000000000000000000000..cc8560ec84a490b91dd307d5ceab2ff07fa4a5ae
--- /dev/null
+++ b/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q31.c
@@ -0,0 +1,119 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_bilinear_interp_q31.c
+ * Description:  Q31 bilinear interpolation
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+  @ingroup groupInterpolation
+ */
+
+
+/**
+   * @addtogroup BilinearInterpolate
+   * @{
+   */
+
+ /**
+  * @brief  Q31 bilinear interpolation.
+  * @param[in,out] S  points to an instance of the interpolation structure.
+  * @param[in]     X  interpolation coordinate in 12.20 format (integer part checked against numCols).
+  * @param[in]     Y  interpolation coordinate in 12.20 format (integer part checked against numRows).
+  * @return interpolated value in 1.31 format, or 0 when (X, Y) falls outside the table.
+  */
+  q31_t arm_bilinear_interp_q31(
+  arm_bilinear_interp_instance_q31 * S,
+  q31_t X,
+  q31_t Y)
+  {
+    q31_t out;                                   /* Temporary output */
+    q31_t acc = 0;                               /* output */
+    q31_t xfract, yfract;                        /* X, Y fractional parts */
+    q31_t x1, x2, y1, y2;                        /* Nearest output values */
+    int32_t rI, cI;                              /* Integer parts of X (rI) and Y (cI) */
+    q31_t *pYData = S->pData;                    /* pointer to output table values */
+    uint32_t nCols = S->numCols;                 /* number of columns, used as the row stride */
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* Index value calculation */
+    rI = ((X & (q31_t)0xFFF00000) >> 20);
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* Index value calculation */
+    cI = ((Y & (q31_t)0xFFF00000) >> 20);
+
+    /* Care taken for table outside boundary */
+    /* Returns zero output when values are outside table boundary */
+    if (rI < 0 || rI > (S->numCols - 2) || cI < 0 || cI > (S->numRows - 2))
+    {
+      return (0);
+    }
+
+    /* 20 bits for the fractional part */
+    /* shift left xfract by 11 to keep 1.31 format */
+    xfract = (X & 0x000FFFFF) << 11U;
+
+    /* Read the two neighbouring values stored at offset nCols * cI */
+    x1 = pYData[(rI) + (int32_t)nCols * (cI)    ];
+    x2 = pYData[(rI) + (int32_t)nCols * (cI) + 1];
+
+    /* 20 bits for the fractional part */
+    /* shift left yfract by 11 to keep 1.31 format */
+    yfract = (Y & 0x000FFFFF) << 11U;
+
+    /* Read the two neighbouring values stored at offset nCols * (cI + 1) */
+    y1 = pYData[(rI) + (int32_t)nCols * (cI + 1)    ];
+    y2 = pYData[(rI) + (int32_t)nCols * (cI + 1) + 1];
+
+    /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 3.29(q29) format */
+    out = ((q31_t) (((q63_t) x1  * (0x7FFFFFFF - xfract)) >> 32));
+    acc = ((q31_t) (((q63_t) out * (0x7FFFFFFF - yfract)) >> 32));
+
+    /* x2 * (xfract) * (1-yfract)  in 3.29(q29) and adding to acc */
+    out = ((q31_t) ((q63_t) x2 * (0x7FFFFFFF - yfract) >> 32));
+    acc += ((q31_t) ((q63_t) out * (xfract) >> 32));
+
+    /* y1 * (1 - xfract) * (yfract)  in 3.29(q29) and adding to acc */
+    out = ((q31_t) ((q63_t) y1 * (0x7FFFFFFF - xfract) >> 32));
+    acc += ((q31_t) ((q63_t) out * (yfract) >> 32));
+
+    /* y2 * (xfract) * (yfract)  in 3.29(q29) and adding to acc */
+    out = ((q31_t) ((q63_t) y2 * (xfract) >> 32));
+    acc += ((q31_t) ((q63_t) out * (yfract) >> 32));
+
+    /* Convert acc to 1.31(q31) format; shift as unsigned because left-shifting a negative value is undefined */
+    return ((q31_t)((uint32_t)acc << 2));
+  }
+
+
+
+  /**
+   * @} end of BilinearInterpolate group
+   */
+
diff --git a/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q7.c b/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q7.c
new file mode 100644
index 0000000000000000000000000000000000000000..204342e333d18ae2b3ec05c8a18bcf9ebc952bbb
--- /dev/null
+++ b/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q7.c
@@ -0,0 +1,117 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_bilinear_interp_q7.c
+ * Description:  Q7 bilinear interpolation
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+  @ingroup groupInterpolation
+ */
+
+
+/**
+   * @addtogroup BilinearInterpolate
+   * @{
+   */
+
+/**
+  * @brief  Bilinear interpolation over a Q7 table.
+  *
+  * The integer parts of X and Y select a 2x2 cell of the table and the
+  * fractional parts weight its four samples. The integer part of X is
+  * range-checked against numCols and that of Y against numRows.
+  *
+  * @param[in,out] S  points to an instance of the interpolation structure.
+  * @param[in]     X  interpolation coordinate in 12.20 format.
+  * @param[in]     Y  interpolation coordinate in 12.20 format.
+  * @return interpolated value, or 0 when the coordinates fall outside the table.
+  */
+  q7_t arm_bilinear_interp_q7(
+  arm_bilinear_interp_instance_q7 * S,
+  q31_t X,
+  q31_t Y)
+  {
+    q7_t *table = S->pData;                      /* table samples */
+    uint32_t stride = S->numCols;                /* samples per table row */
+    int32_t ix, iy;                              /* integer parts of X and Y */
+    q31_t fx, fy;                                /* 20-bit fractional parts of X and Y */
+    q31_t gx, gy;                                /* complements 0xFFFFF - fx, 0xFFFFF - fy */
+    uint32_t upper, lower;                       /* offsets of the first sample on each row */
+    q31_t term;                                  /* sample scaled by its first weight */
+    q63_t sum;                                   /* weighted sum of the four samples */
+
+    /* Coordinates are in 12.20 format: */
+    /* the upper 12 bits select the table cell */
+    ix = ((X & (q31_t)0xFFF00000) >> 20);
+    iy = ((Y & (q31_t)0xFFF00000) >> 20);
+
+    /* A complete 2x2 neighbourhood must lie inside the table, */
+    /* otherwise the function returns zero */
+    if (!(ix >= 0 && ix <= (S->numCols - 2) && iy >= 0 && iy <= (S->numRows - 2)))
+    {
+      return (0);
+    }
+
+    /* The lower 20 bits of X are the horizontal weight */
+    fx = (X & (q31_t)0x000FFFFF);
+    gx = (0xFFFFF - fx);
+
+    /* The lower 20 bits of Y are the vertical weight */
+    fy = (Y & (q31_t)0x000FFFFF);
+    gy = (0xFFFFF - fy);
+
+    /* Offset of the first sample on row iy */
+    upper = ((uint32_t)ix) + stride * ((uint32_t)iy);
+
+    /* Offset of the first sample on row iy + 1 */
+    lower = ((uint32_t)ix) + stride * ((uint32_t)iy + 1);
+
+    /* table[upper] weighted by (1 - fx) * (1 - fy) */
+    term = (table[upper] * gx);
+    sum = ((q63_t) term * gy);
+
+    /* table[upper + 1] weighted by fx * (1 - fy) */
+    term = (table[upper + 1] * gy);
+    sum += ((q63_t) term * fx);
+
+    /* table[lower] weighted by (1 - fx) * fy */
+    term = (table[lower] * gx);
+    sum += ((q63_t) term * fy);
+
+    /* table[lower + 1] weighted by fx * fy */
+    term = (table[lower + 1] * fy);
+    sum += ((q63_t) term * fx);
+
+    /* Each sample carries two 20-bit weights; */
+    /* shifting right by 40 removes them and leaves a q7 value */
+    return ((q7_t)(sum >> 40));
+  }
+
+  /**
+   * @} end of BilinearInterpolate group
+   */
+
diff --git a/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_f16.c b/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..f1c1bee633345d3e21d72fc7e3e837858e4d2de2
--- /dev/null
+++ b/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_f16.c
@@ -0,0 +1,131 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_linear_interp_f16.c
+ * Description:  Floating-point linear interpolation
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupInterpolation
+ */
+
+/**
+   * @defgroup LinearInterpolate Linear Interpolation
+   *
+   * Linear interpolation is a method of curve fitting using linear polynomials.
+   * Linear interpolation works by effectively drawing a straight line between two neighboring samples and returning the appropriate point along that line
+   *
+   * \par
+   * \image html LinearInterp.gif "Linear interpolation"
+   *
+   * \par
+   * A  Linear Interpolate function calculates an output value(y), for the input(x)
+   * using linear interpolation of the input values x0, x1( nearest input values) and the output values y0 and y1(nearest output values)
+   *
+   * \par Algorithm:
+   * <pre>
+   *       y = y0 + (x - x0) * ((y1 - y0)/(x1-x0))
+   *       where x0, x1 are nearest values of input x
+   *             y0, y1 are nearest values to output y
+   * </pre>
+   *
+   * \par
+   * This set of functions implements Linear interpolation process
+   * for Q7, Q15, Q31, and floating-point data types.  The functions operate on a single
+   * sample of data and each call to the function returns a single processed value.
+   * <code>S</code> points to an instance of the Linear Interpolate function data structure.
+   * <code>x</code> is the input sample value. The functions returns the output value.
+   *
+   * \par
+   * if x is outside of the table boundary, Linear interpolation returns first value of the table
+   * if x is below input range and returns last value of table if x is above range.
+   */
+
+/**
+   * @addtogroup LinearInterpolate
+   * @{
+   */
+
+  /**
+   * @brief  Process function for the floating-point Linear Interpolation Function.
+   * @param[in,out] S  is an instance of the floating-point Linear Interpolation structure
+   * @param[in]     x  input sample to process
+   * @return y processed output sample: the first table value when x lies before S->x1,
+   *         the last table value when x lies at or beyond the last table point.
+   */
+  float16_t arm_linear_interp_f16(
+  arm_linear_interp_instance_f16 * S,
+  float16_t x)
+  {
+    float16_t y;
+    float16_t x0, x1;                            /* Nearest input values */
+    float16_t y0, y1;                            /* Nearest output values */
+    float16_t xSpacing = S->xSpacing;            /* spacing between input values */
+    int32_t i;                                   /* Index variable */
+    float16_t *pYData = S->pYData;               /* pointer to output table */
+
+    /* Calculation of index (the cast truncates toward zero, so a quotient in (-1, 0) also yields 0) */
+    i = (int32_t) ((x - S->x1) / xSpacing);
+
+    if ((i < 0) || ((xSpacing > (float16_t)0.0f) ? (x < S->x1) : (x > S->x1)))
+    {
+      /* x lies before the first table point: clamp to the first output value of the table */
+      y = pYData[0];
+    }
+    else if ((uint32_t)i >= (S->nValues - 1))
+    {
+      /* x lies at or beyond the last table point: clamp to the last output value of the table */
+      y = pYData[S->nValues - 1];
+    }
+    else
+    {
+      /* Calculation of nearest input values */
+      x0 = S->x1 +  i      * xSpacing;
+      x1 = S->x1 + (i + 1) * xSpacing;
+
+      /* Read of nearest output values */
+      y0 = pYData[i];
+      y1 = pYData[i + 1];
+
+      /* Calculation of output */
+      y = y0 + (x - x0) * ((y1 - y0) / (x1 - x0));
+
+    }
+
+    /* returns output value */
+    return (y);
+  }
+
+  /**
+   * @} end of LinearInterpolate group
+   */
+
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_f32.c b/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..fc60bec45405f53395a455d2199be6891cec7ae3
--- /dev/null
+++ b/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_f32.c
@@ -0,0 +1,125 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_linear_interp_f32.c
+ * Description:  Floating-point linear interpolation
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+  @ingroup groupInterpolation
+ */
+
+/**
+   * @defgroup LinearInterpolate Linear Interpolation
+   *
+   * Linear interpolation is a method of curve fitting using linear polynomials.
+   * Linear interpolation works by effectively drawing a straight line between two neighboring samples and returning the appropriate point along that line
+   *
+   * \par
+   * \image html LinearInterp.gif "Linear interpolation"
+   *
+   * \par
+   * A  Linear Interpolate function calculates an output value(y), for the input(x)
+   * using linear interpolation of the input values x0, x1( nearest input values) and the output values y0 and y1(nearest output values)
+   *
+   * \par Algorithm:
+   * <pre>
+   *       y = y0 + (x - x0) * ((y1 - y0)/(x1-x0))
+   *       where x0, x1 are nearest values of input x
+   *             y0, y1 are nearest values to output y
+   * </pre>
+   *
+   * \par
+   * This set of functions implements Linear interpolation process
+   * for Q7, Q15, Q31, and floating-point data types.  The functions operate on a single
+   * sample of data and each call to the function returns a single processed value.
+   * <code>S</code> points to an instance of the Linear Interpolate function data structure.
+   * <code>x</code> is the input sample value. The functions returns the output value.
+   *
+   * \par
+   * if x is outside of the table boundary, Linear interpolation returns first value of the table
+   * if x is below input range and returns last value of table if x is above range.
+   */
+
+/**
+   * @addtogroup LinearInterpolate
+   * @{
+   */
+
+  /**
+   * @brief  Process function for the floating-point Linear Interpolation Function.
+   * @param[in,out] S  is an instance of the floating-point Linear Interpolation structure
+   * @param[in]     x  input sample to process
+   * @return y processed output sample: the first table value when x lies before S->x1,
+   *         the last table value when x lies at or beyond the last table point.
+   */
+  float32_t arm_linear_interp_f32(
+  arm_linear_interp_instance_f32 * S,
+  float32_t x)
+  {
+    float32_t y;
+    float32_t x0, x1;                            /* Nearest input values */
+    float32_t y0, y1;                            /* Nearest output values */
+    float32_t xSpacing = S->xSpacing;            /* spacing between input values */
+    int32_t i;                                   /* Index variable */
+    float32_t *pYData = S->pYData;               /* pointer to output table */
+
+    /* Calculation of index (the cast truncates toward zero, so a quotient in (-1, 0) also yields 0) */
+    i = (int32_t) ((x - S->x1) / xSpacing);
+
+    if ((i < 0) || ((xSpacing > 0.0f) ? (x < S->x1) : (x > S->x1)))
+    {
+      /* x lies before the first table point: clamp to the first output value of the table */
+      y = pYData[0];
+    }
+    else if ((uint32_t)i >= (S->nValues - 1))
+    {
+      /* x lies at or beyond the last table point: clamp to the last output value of the table */
+      y = pYData[S->nValues - 1];
+    }
+    else
+    {
+      /* Calculation of nearest input values */
+      x0 = S->x1 +  i      * xSpacing;
+      x1 = S->x1 + (i + 1) * xSpacing;
+
+      /* Read of nearest output values */
+      y0 = pYData[i];
+      y1 = pYData[i + 1];
+
+      /* Calculation of output */
+      y = y0 + (x - x0) * ((y1 - y0) / (x1 - x0));
+
+    }
+
+    /* returns output value */
+    return (y);
+  }
+
+  /**
+   * @} end of LinearInterpolate group
+   */
+
diff --git a/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q15.c b/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q15.c
new file mode 100644
index 0000000000000000000000000000000000000000..70019b666b7d638f344f0dba090953d9f8ef04d1
--- /dev/null
+++ b/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q15.c
@@ -0,0 +1,101 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_linear_interp_q15.c
+ * Description:  Q15 linear interpolation
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+  @ingroup groupInterpolation
+ */
+
+/**
+   * @addtogroup LinearInterpolate
+   * @{
+   */
+
+  /**
+   *
+   * @brief  Linear interpolation of a Q15 table.
+   * @param[in] pYData   pointer to the Q15 table of output values
+   * @param[in] x        input sample to process, in 12.20 format
+   * @param[in] nValues  number of table values
+   * @return interpolated output sample.
+   *
+   * \par
+   * Input sample <code>x</code> is in 12.20 format: 12 bits of table index followed by 20 bits of fraction.
+   * This function can support maximum of table size 2^12.
+   *
+   * \par
+   * An index of <code>nValues - 1</code> or more returns the last table value and
+   * a negative index returns the first table value.
+   *
+   */
+  q15_t arm_linear_interp_q15(
+  q15_t * pYData,
+  q31_t x,
+  uint32_t nValues)
+  {
+    int32_t slot;                                /* table index taken from x */
+    q31_t frac;                                  /* fraction taken from x */
+    q63_t lowPart, highPart;                     /* weighted neighbouring table values */
+
+    /* The upper 12 bits of the 12.20 input select the table entry */
+    slot = ((x & (int32_t)0xFFF00000) >> 20);
+
+    /* At or past the last entry there is no following value */
+    /* to blend with, so the last table value is returned */
+    if (slot >= (int32_t)(nValues - 1))
+    {
+      return (pYData[nValues - 1]);
+    }
+
+    /* Negative index: the input lies before the table */
+    if (slot < 0)
+    {
+      return (pYData[0]);
+    }
+
+    /* The lower 20 bits of the 12.20 input are the blend factor */
+    frac = (x & 0x000FFFFF);
+
+    /* pYData[slot] scaled by (0xFFFFF - frac), */
+    /* result in 13.35 format */
+    lowPart = ((q63_t) pYData[slot] * (0xFFFFF - frac));
+
+    /* pYData[slot + 1] scaled by frac, result in 13.35 format */
+    highPart = ((q63_t) pYData[slot + 1] * (frac));
+
+    /* Drop the 20 fraction bits */
+    /* to convert the sum to 1.15 format */
+    return (q15_t) ((lowPart + highPart) >> 20);
+  }
+
+
+  /**
+   * @} end of LinearInterpolate group
+   */
+
diff --git a/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q31.c b/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q31.c
new file mode 100644
index 0000000000000000000000000000000000000000..55348edab19c0ced17a68686dc6ed4978af85012
--- /dev/null
+++ b/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q31.c
@@ -0,0 +1,103 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_linear_interp_q31.c
+ * Description:  Q31 linear interpolation
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+  @ingroup groupInterpolation
+ */
+
+
+/**
+   * @addtogroup LinearInterpolate
+   * @{
+   */
+
+  /**
+   *
+   * @brief  Process function for the Q31 Linear Interpolation Function.
+   * @param[in] pYData   pointer to Q31 Linear Interpolation table
+   * @param[in] x        input sample to process
+   * @param[in] nValues  number of table values
+   * @return y processed output sample in 1.31 format.
+   *
+   * \par
+   * Input sample <code>x</code> is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
+   * This function can support maximum of table size 2^12.
+   *
+   */
+  q31_t arm_linear_interp_q31(
+  q31_t * pYData,
+  q31_t x,
+  uint32_t nValues)
+  {
+    q31_t y;                                     /* output */
+    q31_t y0, y1;                                /* Nearest output values */
+    q31_t fract;                                 /* fractional part */
+    int32_t index;                               /* Index to read nearest output values */
+
+    /* Input is in 12.20 format */
+    /* 12 bits for the table index */
+    /* Index value calculation */
+    index = ((x & (q31_t)0xFFF00000) >> 20);
+
+    if (index >= (int32_t)(nValues - 1))
+    {
+      return (pYData[nValues - 1]);
+    }
+    else if (index < 0)
+    {
+      return (pYData[0]);
+    }
+    else
+    {
+      /* 20 bits for the fractional part */
+      /* shift left by 11 to keep fract in 1.31 format */
+      fract = (x & 0x000FFFFF) << 11;
+
+      /* Read two nearest output values from the index in 1.31(q31) format */
+      y0 = pYData[index];
+      y1 = pYData[index + 1];
+
+      /* Calculation of y0 * (1-fract) and y is in 2.30 format */
+      y = ((q31_t) ((q63_t) y0 * (0x7FFFFFFF - fract) >> 32));
+
+      /* Calculation of y0 * (1-fract) + y1 *fract and y is in 2.30 format */
+      y += ((q31_t) (((q63_t) y1 * fract) >> 32));
+
+      /* Convert y to 1.31 format; shift as unsigned because left-shifting a negative value is undefined */
+      return ((q31_t)((uint32_t)y << 1U));
+    }
+  }
+
+
+
+  /**
+   * @} end of LinearInterpolate group
+   */
+
diff --git a/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q7.c b/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q7.c
new file mode 100644
index 0000000000000000000000000000000000000000..db34b01eca7d59fbc4a552791e640e9dfdb06fc7
--- /dev/null
+++ b/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q7.c
@@ -0,0 +1,99 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_linear_interp_q7.c
+ * Description:  Q7 linear interpolation
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+  @ingroup groupInterpolation
+ */
+
+
+/**
+   * @addtogroup LinearInterpolate
+   * @{
+   */
+
+  /**
+   *
+   * @brief  Linear interpolation of a Q7 table.
+   * @param[in] pYData   pointer to the Q7 table of output values
+   * @param[in] x        input sample to process, in 12.20 format
+   * @param[in] nValues  number of table values
+   * @return interpolated output sample.
+   *
+   * \par
+   * Input sample <code>x</code> is in 12.20 format: 12 bits of table index followed by 20 bits of fraction.
+   * This function can support maximum of table size 2^12.
+   *
+   * \par
+   * A negative <code>x</code> returns the first table value and an index of
+   * <code>nValues - 1</code> or more returns the last table value.
+   */
+  q7_t arm_linear_interp_q7(
+  q7_t * pYData,
+  q31_t x,
+  uint32_t nValues)
+  {
+    uint32_t slot;                               /* table index taken from x */
+    q31_t frac;                                  /* fraction taken from x */
+    q31_t lowPart, highPart;                     /* weighted neighbouring table values */
+
+    /* A negative input lies before the table, */
+    /* so the first table value is returned */
+    if (x < 0)
+    {
+      return (pYData[0]);
+    }
+
+    /* The upper 12 bits of the 12.20 input select the table entry */
+    slot = (x >> 20) & 0xfff;
+
+    /* At or past the last entry there is no following value */
+    /* to blend with, so the last table value is returned */
+    if (slot >= (nValues - 1))
+    {
+      return (pYData[nValues - 1]);
+    }
+
+    /* The lower 20 bits of the 12.20 input are the blend factor */
+    frac = (x & 0x000FFFFF);
+
+    /* pYData[slot] scaled by (0xFFFFF - frac), result in 13.27(q27) format */
+    lowPart = (pYData[slot] * (0xFFFFF - frac));
+
+    /* pYData[slot + 1] scaled by frac, result in 13.27(q27) format */
+    highPart = (pYData[slot + 1] * frac);
+
+    /* Drop the 20 fraction bits */
+    /* to convert the sum to 1.7(q7) format */
+    return (q7_t) ((lowPart + highPart) >> 20);
+  }
+  /**
+   * @} end of LinearInterpolate group
+   */
+
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_f32.c b/CMSIS/DSP/Source/InterpolationFunctions/arm_spline_interp_f32.c
similarity index 96%
rename from CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_f32.c
rename to CMSIS/DSP/Source/InterpolationFunctions/arm_spline_interp_f32.c
index 0f829e800600b2b48fa93166f49bd0894c568121..3e3d0918115b08006ad0cd503379dc6bfab938bc 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_f32.c
+++ b/CMSIS/DSP/Source/InterpolationFunctions/arm_spline_interp_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_spline_interp_f32.c
  * Description:  Floating-point cubic spline interpolation
  *
- * $Date:        13 November 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,10 +26,10 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/interpolation_functions.h"
 
 /**
-  @ingroup groupSupport
+  @ingroup groupInterpolation
  */
 
 /**
@@ -138,7 +138,7 @@
 /**
  * @brief Processing function for the floating-point cubic spline interpolation.
  * @param[in]  S          points to an instance of the floating-point spline structure.
- * @param[in]  xq         points to the x values ot the interpolated data points.
+ * @param[in]  xq         points to the x values of the interpolated data points.
  * @param[out] pDst       points to the block of output data.
  * @param[in]  blockSize  number of samples of output data.
  */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_init_f32.c b/CMSIS/DSP/Source/InterpolationFunctions/arm_spline_interp_init_f32.c
similarity index 95%
rename from CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_init_f32.c
rename to CMSIS/DSP/Source/InterpolationFunctions/arm_spline_interp_init_f32.c
index 492bb20ffff53eee0acdbfc04549a39ecbd299cb..ae3f2da00cbf98a279bad0c08f951a24ff6c32ce 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_init_f32.c
+++ b/CMSIS/DSP/Source/InterpolationFunctions/arm_spline_interp_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_spline_interp_init_f32.c
  * Description:  Floating-point cubic spline initialization function
  *
- * $Date:        13 November 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,10 +26,10 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/interpolation_functions.h"
 
 /**
-  @ingroup groupSupport
+  @ingroup groupInterpolation
  */
 
 /**
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..c666179b9eed5b105328dd54b4731c7eb9756d45
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f16.c
@@ -0,0 +1,217 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_add_f16.c
+ * Description:  Floating-point matrix addition
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixAdd
+  @{
+ */
+
+
+/**
+  @brief         Floating-point matrix addition.
+  @param[in]     pSrcA      points to first input matrix structure
+  @param[in]     pSrcB      points to second input matrix structure
+  @param[out]    pDst       points to output matrix structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+arm_status arm_mat_add_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+  arm_matrix_instance_f16 * pDst)
+{
+    arm_status status;          /* status of matrix addition */
+    uint32_t  numSamples;       /* total number of elements in the matrix  */
+    float16_t *pDataA, *pDataB, *pDataDst;
+    f16x8_t vecA, vecB, vecDst;
+    float16_t const *pSrcAVec;
+    float16_t const *pSrcBVec;
+    uint32_t  blkCnt;           /* loop counters */
+
+    pDataA = pSrcA->pData;
+    pDataB = pSrcB->pData;
+    pDataDst = pDst->pData;
+    pSrcAVec = (float16_t const *) pDataA;
+    pSrcBVec = (float16_t const *) pDataB;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numRows != pSrcB->numRows) ||
+     (pSrcA->numCols != pSrcB->numCols) ||
+     (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif
+ {
+    /*
+     * Total number of samples in the input matrix
+     */
+    numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+    blkCnt = numSamples >> 3;   /* number of full 8-lane vectors */
+    while (blkCnt > 0U)
+    {
+        /* C(m,n) = A(m,n) + B(m,n) */
+        /* Add and then store the results in the destination buffer. */
+        vecA = vld1q(pSrcAVec);
+        pSrcAVec += 8;
+        vecB = vld1q(pSrcBVec);
+        pSrcBVec += 8;
+        vecDst = vaddq(vecA, vecB);
+        vst1q(pDataDst, vecDst);
+        pDataDst += 8;
+        /*
+         * Decrement the loop counter (one full vector processed)
+         */
+        blkCnt--;
+    }
+    /*
+     * Tail: 1..7 leftover elements, processed under a lane predicate
+     */
+    blkCnt = numSamples & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);  /* first blkCnt lanes active */
+        vecA = vld1q_z(pSrcAVec, p0);       /* predicated: no read past the end of A */
+        vecB = vld1q_z(pSrcBVec, p0);       /* predicated: no read past the end of B */
+        vecDst = vaddq_x(vecA, vecB, p0);   /* _x form: does not read vecDst, which may be unset here */
+        vstrhq_p(pDataDst, vecDst, p0);     /* only the active lanes are written */
+    }
+    /* set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+  return (status);
+}
+#else
+
+arm_status arm_mat_add_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+        arm_matrix_instance_f16 * pDst)
+{
+  float16_t *pInA = pSrcA->pData;                /* read cursor over the elements of A */
+  float16_t *pInB = pSrcB->pData;                /* read cursor over the elements of B */
+  float16_t *pOut = pDst->pData;                 /* write cursor over the elements of the result */
+
+  uint32_t numSamples;                           /* total number of elements in the matrix (rows * cols) */
+  uint32_t blkCnt;                               /* remaining iterations of the current loop */
+  arm_status status;                             /* status of matrix addition */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Reject the call unless A, B and the destination all have identical dimensions */
+  if ((pSrcA->numRows != pSrcB->numRows) ||
+      (pSrcA->numCols != pSrcB->numCols) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcA->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH; no element is read or written */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Total number of samples in input matrix; the data is walked as one flat array */
+    numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    blkCnt = numSamples >> 2U;
+
+    while (blkCnt > 0U)
+    {
+      /* C(m,n) = A(m,n) + B(m,n) */
+
+      /* Add and store result in destination buffer (four consecutive elements). */
+      *pOut++ = *pInA++ + *pInB++;
+
+      *pOut++ = *pInA++ + *pInB++;
+
+      *pOut++ = *pInA++ + *pInB++;
+
+      *pOut++ = *pInA++ + *pInB++;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining outputs (0..3 elements left over) */
+    blkCnt = numSamples % 0x4U;
+
+#else
+
+    /* Initialize blkCnt with number of samples: the loop below handles every element */
+    blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (blkCnt > 0U)
+    {
+      /* C(m,n) = A(m,n) + B(m,n) */
+
+      /* Add and store result in destination buffer. */
+      *pOut++ = *pInA++ + *pInB++;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of MatrixAdd group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f32.c
index 008f2996771b7453c4026eb4f78693b28587b2fa..879c8f8f50d711ed37af62a3645e0ac40895a28a 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f32.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_add_f32.c
  * Description:  Floating-point matrix addition
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_q15.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_q15.c
index 7aab4b3ede6ec709645d66f869ed6d6cf9ef625a..e27b72fa52b71230d989401fa22490bf2fc76125 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_q15.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_add_q15.c
  * Description:  Q15 matrix addition
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -50,7 +50,7 @@
                    The function uses saturating arithmetic.
                    Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 arm_status arm_mat_add_q15(
   const arm_matrix_instance_q15 * pSrcA,
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_q31.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_q31.c
index 1ff37033942fdadf2cc53ac7263d748cf99ef554..57f67e71583499463086d3ed29eb28b9311fd28b 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_q31.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_add_q31.c
  * Description:  Q31 matrix addition
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -50,7 +50,7 @@
                    The function uses saturating arithmetic.
                    Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 arm_status arm_mat_add_q31(
   const arm_matrix_instance_q31 * pSrcA,
   const arm_matrix_instance_q31 * pSrcB,
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..3e9062c0efd843b642d4ed86c58add8e7e90352e
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f16.c
@@ -0,0 +1,253 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_cholesky_f16.c
+ * Description:  Floating-point Cholesky decomposition
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @addtogroup MatrixChol
+  @{
+ */
+
+/**
+   * @brief Floating-point Cholesky decomposition of positive-definite matrix.
+   * @param[in]  pSrc   points to the instance of the input floating-point matrix structure.
+   * @param[out] pDst   points to the instance of the output floating-point matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+                   - \ref ARM_MATH_DECOMPOSITION_FAILURE      : Input matrix cannot be decomposed
+   * @par
+   * If the matrix is ill-conditioned or only semi-definite, it is better to use the LDL^t decomposition.
+   * The decomposition of A returns a lower triangular matrix U such that A = U U^t.
+   */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+arm_status arm_mat_cholesky_f16(
+  const arm_matrix_instance_f16 * pSrc,
+        arm_matrix_instance_f16 * pDst)
+{
+
+  arm_status status;                             /* status of the Cholesky decomposition */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Both matrices must be square and of the same order */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pDst->numRows != pDst->numCols) ||
+      (pSrc->numRows != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    int i,j,k;                    /* i: column being computed, j: row, k: index along the dot product */
+    int n = pSrc->numRows;        /* matrix order; also the row stride of pA and pG */
+    _Float16 invSqrtVj;           /* 1 / sqrt(pivot) of the current column */
+    float16_t *pA,*pG;            /* input data and output factor data */
+    int kCnt;                     /* elements still to be covered by the predicated k loop */
+
+    mve_pred16_t p0;
+
+    f16x8_t acc, acc0, acc1, acc2, acc3;
+    f16x8_t vecGi;
+    f16x8_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
+
+
+    pA = pSrc->pData;
+    pG = pDst->pData;
+    
+    for(i=0 ;i < n ; i++)
+    {
+       for(j=i ; j+3 < n ; j+=4)  /* four rows of column i per iteration */
+       {
+          acc0 = vdupq_n_f16(0.0f16);
+          acc0[0]=pA[(j + 0) * n + i];   /* lane 0 seeded with A(j,i); other lanes stay 0 */
+
+          acc1 = vdupq_n_f16(0.0f16);
+          acc1[0]=pA[(j + 1) * n + i];
+
+          acc2 = vdupq_n_f16(0.0f16);
+          acc2[0]=pA[(j + 2) * n + i];
+
+          acc3 = vdupq_n_f16(0.0f16);
+          acc3[0]=pA[(j + 3) * n + i];
+
+          kCnt = i;
+          for(k=0; k < i ; k+=8)
+          {
+             p0 = vctp16q(kCnt);   /* tail predicate: only lanes with k + lane < i are active */
+
+             vecGi=vldrhq_z_f16(&pG[i * n + k],p0);
+             
+             vecGj0=vldrhq_z_f16(&pG[(j + 0) * n + k],p0);
+             vecGj1=vldrhq_z_f16(&pG[(j + 1) * n + k],p0);
+             vecGj2=vldrhq_z_f16(&pG[(j + 2) * n + k],p0);
+             vecGj3=vldrhq_z_f16(&pG[(j + 3) * n + k],p0);
+
+             acc0 = vfmsq_m(acc0, vecGi, vecGj0, p0);   /* acc -= G(i,k) * G(j,k) on active lanes */
+             acc1 = vfmsq_m(acc1, vecGi, vecGj1, p0);
+             acc2 = vfmsq_m(acc2, vecGi, vecGj2, p0);
+             acc3 = vfmsq_m(acc3, vecGi, vecGj3, p0);
+
+             kCnt -= 8;
+          }
+          pG[(j + 0) * n + i] = vecAddAcrossF16Mve(acc0);   /* helper from arm_helium_utils.h; presumably sums the lanes */
+          pG[(j + 1) * n + i] = vecAddAcrossF16Mve(acc1);
+          pG[(j + 2) * n + i] = vecAddAcrossF16Mve(acc2);
+          pG[(j + 3) * n + i] = vecAddAcrossF16Mve(acc3);
+       }
+
+       for(; j < n ; j++)   /* remaining rows of column i, one at a time */
+       {
+
+          kCnt = i;
+          acc = vdupq_n_f16(0.0f16);
+          acc[0] = pA[j * n + i];
+
+          for(k=0; k < i ; k+=8)
+          {
+             p0 = vctp16q(kCnt);
+
+             vecGi=vldrhq_z_f16(&pG[i * n + k],p0);
+             vecGj=vldrhq_z_f16(&pG[j * n + k],p0);
+
+             acc = vfmsq_m(acc, vecGi, vecGj,p0);
+
+             kCnt -= 8;
+          }
+          pG[j * n + i] = vecAddAcrossF16Mve(acc);
+       }
+
+       if (pG[i * n + i] <= 0.0f16)   /* non-positive pivot: stop; pDst is left partially written */
+       {
+         return(ARM_MATH_DECOMPOSITION_FAILURE);
+       }
+
+       invSqrtVj = (_Float16)1.0f/sqrtf(pG[i * n + i]);
+       for(j=i; j < n ; j++)   /* scale column i, diagonal included */
+       {
+         pG[j * n + i] = (_Float16)pG[j * n + i] * invSqrtVj ;
+       }
+    }
+
+    status = ARM_MATH_SUCCESS;   /* only entries with row >= column of pDst were written */
+
+  }
+
+  
+  /* Return to application */
+  return (status);
+}
+
+#else
+arm_status arm_mat_cholesky_f16(
+  const arm_matrix_instance_f16 * pSrc,
+        arm_matrix_instance_f16 * pDst)
+{
+
+  arm_status status;                             /* status of the Cholesky decomposition */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Both matrices must be square and of the same order */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pDst->numRows != pDst->numCols) ||
+      (pSrc->numRows != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    int i,j,k;                 /* i: column being computed, j: row, k: index along the dot product */
+    int n = pSrc->numRows;     /* matrix order; also the row stride of pA and pG */
+    float16_t invSqrtVj;       /* 1 / sqrt(pivot) of the current column */
+    float16_t *pA,*pG;         /* input data and output factor data */
+
+    pA = pSrc->pData;
+    pG = pDst->pData;
+    
+
+    for(i=0 ; i < n ; i++)
+    {
+       for(j=i ; j < n ; j++)   /* rows on or below the diagonal of column i */
+       {
+          pG[j * n + i] = pA[j * n + i];
+
+          for(k=0; k < i ; k++)   /* subtract the contribution of the columns already computed */
+          {
+             pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k];
+          }
+       }
+
+       if (pG[i * n + i] <= 0.0f)   /* non-positive pivot: stop; pDst is left partially written */
+       {
+         return(ARM_MATH_DECOMPOSITION_FAILURE);
+       }
+
+       invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
+       for(j=i ; j < n ; j++)   /* scale column i, diagonal included */
+       {
+         pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
+       }
+    }
+
+    status = ARM_MATH_SUCCESS;   /* only entries with row >= column of pDst were written */
+
+  }
+
+  
+  /* Return to application */
+  return (status);
+}
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of MatrixChol group
+ */
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..69842110f328d9d0bf27a126beedf07de2bf2036
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f32.c
@@ -0,0 +1,436 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_cholesky_f32.c
+ * Description:  Floating-point Cholesky decomposition
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @defgroup MatrixChol Cholesky and LDLT decompositions
+
+  Computes the Cholesky or LDL^t decomposition of a matrix.
+
+
+  If the input matrix does not have a decomposition, then the 
+  algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
+ */
+
+/**
+  @addtogroup MatrixChol
+  @{
+ */
+
+/**
+   * @brief Floating-point Cholesky decomposition of positive-definite matrix.
+   * @param[in]  pSrc   points to the instance of the input floating-point matrix structure.
+   * @param[out] pDst   points to the instance of the output floating-point matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+                   - \ref ARM_MATH_DECOMPOSITION_FAILURE      : Input matrix cannot be decomposed
+   * @par
+   * If the matrix is ill-conditioned or only semi-definite, it is better to use the LDL^t decomposition.
+   * The decomposition of A returns a lower triangular matrix U such that A = U U^t.
+   */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+arm_status arm_mat_cholesky_f32(
+  const arm_matrix_instance_f32 * pSrc,
+        arm_matrix_instance_f32 * pDst)
+{
+
+  arm_status status;                             /* status of the Cholesky decomposition */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Both matrices must be square and of the same order */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pDst->numRows != pDst->numCols) ||
+      (pSrc->numRows != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    int i,j,k;                  /* i: column being computed, j: row, k: index along the dot product */
+    int n = pSrc->numRows;      /* matrix order; also the row stride of pA and pG */
+    float32_t invSqrtVj;        /* 1 / sqrt(pivot) of the current column */
+    float32_t *pA,*pG;          /* input data and output factor data */
+    int kCnt;                   /* elements still to be covered by the predicated k loop */
+
+    mve_pred16_t p0;
+
+    f32x4_t acc, acc0, acc1, acc2, acc3;
+    f32x4_t vecGi;
+    f32x4_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
+
+
+    pA = pSrc->pData;
+    pG = pDst->pData;
+    
+    for(i=0 ;i < n ; i++)
+    {
+       for(j=i ; j+3 < n ; j+=4)   /* four rows of column i per iteration */
+       {
+          pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
+          pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
+          pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
+          pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
+
+          kCnt = i;
+          acc0 = vdupq_n_f32(0.0f);
+          acc1 = vdupq_n_f32(0.0f);
+          acc2 = vdupq_n_f32(0.0f);
+          acc3 = vdupq_n_f32(0.0f);
+
+          for(k=0; k < i ; k+=4)
+          {
+             p0 = vctp32q(kCnt);   /* tail predicate: only lanes with k + lane < i are active */
+
+             vecGi=vldrwq_z_f32(&pG[i * n + k],p0);
+             
+             vecGj0=vldrwq_z_f32(&pG[(j + 0) * n + k],p0);
+             vecGj1=vldrwq_z_f32(&pG[(j + 1) * n + k],p0);
+             vecGj2=vldrwq_z_f32(&pG[(j + 2) * n + k],p0);
+             vecGj3=vldrwq_z_f32(&pG[(j + 3) * n + k],p0);
+
+             acc0 = vfmaq_m(acc0, vecGi, vecGj0, p0);   /* acc += G(i,k) * G(j,k) on active lanes */
+             acc1 = vfmaq_m(acc1, vecGi, vecGj1, p0);
+             acc2 = vfmaq_m(acc2, vecGi, vecGj2, p0);
+             acc3 = vfmaq_m(acc3, vecGi, vecGj3, p0);
+
+             kCnt -= 4;
+          }
+          pG[(j + 0) * n + i] -= vecAddAcrossF32Mve(acc0);   /* helper from arm_helium_utils.h; presumably sums the lanes */
+          pG[(j + 1) * n + i] -= vecAddAcrossF32Mve(acc1);
+          pG[(j + 2) * n + i] -= vecAddAcrossF32Mve(acc2);
+          pG[(j + 3) * n + i] -= vecAddAcrossF32Mve(acc3);
+       }
+
+       for(; j < n ; j++)   /* remaining rows of column i, one at a time */
+       {
+          pG[j * n + i] = pA[j * n + i];
+
+          kCnt = i;
+          acc = vdupq_n_f32(0.0f);
+
+          for(k=0; k < i ; k+=4)
+          {
+             p0 = vctp32q(kCnt);
+
+             vecGi=vldrwq_z_f32(&pG[i * n + k],p0);
+             vecGj=vldrwq_z_f32(&pG[j * n + k],p0);
+
+             acc = vfmaq_m(acc, vecGi, vecGj,p0);
+
+             kCnt -= 4;
+          }
+          pG[j * n + i] -= vecAddAcrossF32Mve(acc);
+       }
+
+       if (pG[i * n + i] <= 0.0f)   /* non-positive pivot: stop; pDst is left partially written */
+       {
+         return(ARM_MATH_DECOMPOSITION_FAILURE);
+       }
+
+       invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
+       for(j=i; j < n ; j++)   /* scale column i, diagonal included */
+       {
+         pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
+       }
+    }
+
+    status = ARM_MATH_SUCCESS;   /* only entries with row >= column of pDst were written */
+
+  }
+
+  
+  /* Return to application */
+  return (status);
+}
+
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+arm_status arm_mat_cholesky_f32(
+  const arm_matrix_instance_f32 * pSrc,
+        arm_matrix_instance_f32 * pDst)
+{
+
+  arm_status status;                             /* status of the Cholesky decomposition */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Both matrices must be square and of the same order */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pDst->numRows != pDst->numCols) ||
+      (pSrc->numRows != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    int i,j,k;                  /* i: column being computed, j: row, k: index along the dot product */
+    int n = pSrc->numRows;      /* matrix order; also the row stride of pA and pG */
+    float32_t invSqrtVj;        /* 1 / sqrt(pivot) of the current column */
+    float32_t *pA,*pG;          /* input data and output factor data */
+    int kCnt;                   /* iteration counter of the vector loop, then of the scalar tail */
+
+
+    f32x4_t acc, acc0, acc1, acc2, acc3;
+    f32x4_t vecGi;
+    f32x4_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
+    f32x2_t tmp = vdup_n_f32(0);    
+    float32_t sum=0.0f;                                  /* always reassigned before it is read below */
+    float32_t sum0=0.0f,sum1=0.0f,sum2=0.0f,sum3=0.0f;   /* likewise reassigned before use */
+
+
+    pA = pSrc->pData;
+    pG = pDst->pData;
+    
+    for(i=0 ;i < n ; i++)
+    {
+       for(j=i ; j+3 < n ; j+=4)   /* four rows of column i per iteration */
+       {
+          pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
+          pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
+          pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
+          pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
+
+          acc0 = vdupq_n_f32(0.0f);
+          acc1 = vdupq_n_f32(0.0f);
+          acc2 = vdupq_n_f32(0.0f);
+          acc3 = vdupq_n_f32(0.0f);
+
+          kCnt = i >> 2;   /* number of full 4-lane groups among the i previous columns */
+          k=0;
+          while(kCnt > 0)
+          {
+
+             vecGi=vld1q_f32(&pG[i * n + k]);
+             
+             vecGj0=vld1q_f32(&pG[(j + 0) * n + k]);
+             vecGj1=vld1q_f32(&pG[(j + 1) * n + k]);
+             vecGj2=vld1q_f32(&pG[(j + 2) * n + k]);
+             vecGj3=vld1q_f32(&pG[(j + 3) * n + k]);
+
+             acc0 = vfmaq_f32(acc0, vecGi, vecGj0);   /* acc += G(i,k..k+3) * G(j,k..k+3) */
+             acc1 = vfmaq_f32(acc1, vecGi, vecGj1);
+             acc2 = vfmaq_f32(acc2, vecGi, vecGj2);
+             acc3 = vfmaq_f32(acc3, vecGi, vecGj3);
+
+             kCnt--;
+             k+=4;
+          }
+
+#if __aarch64__
+          sum0 = vpadds_f32(vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)));   /* horizontal sum of the 4 lanes */
+          sum1 = vpadds_f32(vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1)));
+          sum2 = vpadds_f32(vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2)));
+          sum3 = vpadds_f32(vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3)));
+
+#else
+          tmp = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));   /* same horizontal sum without vpadds_f32 */
+          sum0 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
+
+          tmp = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
+          sum1 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
+
+          tmp = vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2));
+          sum2 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
+
+          tmp = vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3));
+          sum3 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
+#endif
+
+          kCnt = i & 3;   /* scalar tail: the 0..3 columns not covered by the vector loop; k continues from there */
+          while(kCnt > 0)
+          {
+
+             sum0 = sum0 + pG[i * n + k] * pG[(j + 0) * n + k];
+             sum1 = sum1 + pG[i * n + k] * pG[(j + 1) * n + k];
+             sum2 = sum2 + pG[i * n + k] * pG[(j + 2) * n + k];
+             sum3 = sum3 + pG[i * n + k] * pG[(j + 3) * n + k];
+             kCnt--;
+             k++;
+          }
+
+          pG[(j + 0) * n + i] -= sum0;
+          pG[(j + 1) * n + i] -= sum1;
+          pG[(j + 2) * n + i] -= sum2;
+          pG[(j + 3) * n + i] -= sum3;
+       }
+
+       for(; j < n ; j++)   /* remaining rows of column i, one at a time */
+       {
+          pG[j * n + i] = pA[j * n + i];
+
+          acc = vdupq_n_f32(0.0f);
+
+          kCnt = i >> 2;
+          k=0;
+          while(kCnt > 0)
+          {
+
+             vecGi=vld1q_f32(&pG[i * n + k]);
+             vecGj=vld1q_f32(&pG[j * n + k]);
+
+             acc = vfmaq_f32(acc, vecGi, vecGj);
+
+             kCnt--;
+             k+=4;
+          }
+
+#if __aarch64__
+          sum = vpadds_f32(vpadd_f32(vget_low_f32(acc), vget_high_f32(acc)));
+#else
+          tmp = vpadd_f32(vget_low_f32(acc), vget_high_f32(acc));
+          sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
+#endif
+
+          kCnt = i & 3;   /* scalar tail for the leftover 0..3 columns */
+          while(kCnt > 0)
+          {
+             sum = sum + pG[i * n + k] * pG[(j + 0) * n + k];
+
+            
+             kCnt--;
+             k++;
+          }
+
+          pG[j * n + i] -= sum;
+       }
+
+       if (pG[i * n + i] <= 0.0f)   /* non-positive pivot: stop; pDst is left partially written */
+       {
+         return(ARM_MATH_DECOMPOSITION_FAILURE);
+       }
+
+       invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
+       for(j=i; j < n ; j++)   /* scale column i, diagonal included */
+       {
+         pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
+       }
+    }
+
+    status = ARM_MATH_SUCCESS;   /* only entries with row >= column of pDst were written */
+
+  }
+
+  
+  /* Return to application */
+  return (status);
+}
+
+#else
+arm_status arm_mat_cholesky_f32(
+  const arm_matrix_instance_f32 * pSrc,
+        arm_matrix_instance_f32 * pDst)
+{
+
+  arm_status status;                             /* status of the Cholesky decomposition */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Both matrices must be square and of the same order */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pDst->numRows != pDst->numCols) ||
+      (pSrc->numRows != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    int i,j,k;                 /* i: column being computed, j: row, k: index along the dot product */
+    int n = pSrc->numRows;     /* matrix order; also the row stride of pA and pG */
+    float32_t invSqrtVj;       /* 1 / sqrt(pivot) of the current column */
+    float32_t *pA,*pG;         /* input data and output factor data */
+
+    pA = pSrc->pData;
+    pG = pDst->pData;
+    
+
+    for(i=0 ; i < n ; i++)
+    {
+       for(j=i ; j < n ; j++)   /* rows on or below the diagonal of column i */
+       {
+          pG[j * n + i] = pA[j * n + i];
+
+          for(k=0; k < i ; k++)   /* subtract the contribution of the columns already computed */
+          {
+             pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k];
+          }
+       }
+
+       if (pG[i * n + i] <= 0.0f)   /* non-positive pivot: stop; pDst is left partially written */
+       {
+         return(ARM_MATH_DECOMPOSITION_FAILURE);
+       }
+
+       invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
+       for(j=i ; j < n ; j++)   /* scale column i, diagonal included */
+       {
+         pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
+       }
+    }
+
+    status = ARM_MATH_SUCCESS;   /* only entries with row >= column of pDst were written */
+
+  }
+
+  
+  /* Return to application */
+  return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of MatrixChol group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f64.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f64.c
new file mode 100644
index 0000000000000000000000000000000000000000..1cd1bf71d5c3759b69672287ae787c7ce704752c
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f64.c
@@ -0,0 +1,122 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_cholesky_f64.c
+ * Description:  Floating-point Cholesky decomposition
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @addtogroup MatrixChol
+  @{
+ */
+
+/**
+   * @brief Double-precision floating-point Cholesky decomposition of positive-definite matrix.
+   * @param[in]  pSrc   points to the instance of the input floating-point matrix structure.
+   * @param[out] pDst   points to the instance of the output floating-point matrix structure.
+   *                    Only its lower triangle (diagonal included) is written.
+   * @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed (only with ARM_MATH_MATRIX_CHECK)
+                   - \ref ARM_MATH_DECOMPOSITION_FAILURE      : Input matrix cannot be decomposed
+   * @par
+   * If the matrix is ill conditioned or only semi-definite, then it is better to use the LDL^t decomposition.
+   * The decomposition of A returns a lower triangular matrix U such that A = U U^t
+   */
+
+
+arm_status arm_mat_cholesky_f64(
+  const arm_matrix_instance_f64 * pSrc,
+        arm_matrix_instance_f64 * pDst)
+{
+
+  arm_status status;                             /* status of the Cholesky decomposition */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition: both matrices must be square and of equal size */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pDst->numRows != pDst->numCols) ||
+      (pSrc->numRows != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    int i,j,k;                  /* i: current column, j: current row, k: summation index */
+    int n = pSrc->numRows;      /* matrix dimension (row stride of both buffers) */
+    float64_t invSqrtVj;        /* reciprocal square root of the current diagonal entry */
+    float64_t *pA,*pG;          /* pA: input data, pG: output data */
+
+    pA = pSrc->pData;
+    pG = pDst->pData;
+
+    /* One pass per column i; only entries with row index j >= i are written */
+    for(i=0 ; i < n ; i++)
+    {
+       for(j=i ; j < n ; j++)
+       {
+          pG[j * n + i] = pA[j * n + i];
+          /* Subtract the dot product of the first i entries of output rows i and j */
+          for(k=0; k < i ; k++)
+          {
+             pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k];
+          }
+       }
+       /* Non-positive diagonal entry: stop here; pDst is left partially written */
+       if (pG[i * n + i] <= 0.0)
+       {
+         return(ARM_MATH_DECOMPOSITION_FAILURE);
+       }
+       /* Scale rows i..n-1 of column i by 1/sqrt(diagonal entry) */
+       invSqrtVj = 1.0/sqrt(pG[i * n + i]);
+       for(j=i ; j < n ; j++)
+       {
+         pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
+       }
+    }
+
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+
+  /* Return to application */
+  return (status);
+}
+
+/**
+  @} end of MatrixChol group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..977f381a0c5b143666b3fe81b7ec80562f373fbf
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
@@ -0,0 +1,932 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_cmplx_mult_f16.c
+ * Description:  Floating-point complex matrix multiplication
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup CmplxMatrixMult
+  @{
+ */
+
+/**
+  @brief         Floating-point Complex matrix multiplication.
+  @param[in]     pSrcA      points to first input complex matrix structure
+  @param[in]     pSrcB      points to second input complex matrix structure
+  @param[out]    pDst       points to output complex matrix structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_mat_cmplx_mult_f16 built. Helium version has build issues with gcc."
+#endif 
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(__CMSIS_GCC_H)
+
+#include "arm_helium_utils.h"
+
+#define DONTCARE            0 /* inactive lane content */
+
+/* Helium helper: complex f16 product of two 2x2 matrices (no size check is done here). */
+__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_2x2_mve(
+    const arm_matrix_instance_f16 * pSrcA,
+    const arm_matrix_instance_f16 * pSrcB,
+    arm_matrix_instance_f16 * pDst)
+{
+    const uint16_t   MATRIX_DIM = 2;
+    float16_t const *pInB = pSrcB->pData;  /* input data matrix pointer B */
+    float16_t       *pInA = pSrcA->pData;  /* input data matrix pointer A */
+    float16_t       *pOut = pDst->pData;   /* output data matrix pointer */
+    uint16x8_t     vecColBOffs0,vecColAOffs0,vecColAOffs1;
+    float16_t       *pInA0 = pInA;
+    f16x8_t        acc0, acc1;
+    f16x8_t        vecB, vecA0, vecA1;
+    f16x8_t        vecTmp;
+    uint16_t         tmp;
+    static const uint16_t offsetB0[8] = { 0, 1,
+        MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
+        2, 3,
+        MATRIX_DIM * CMPLX_DIM + 2 , MATRIX_DIM * CMPLX_DIM + 3,
+    };
+    /* offsetB0 lists element offsets into B for the gather load below; */
+    /* presumably column 0 followed by column 1 if CMPLX_DIM is 2 (defined elsewhere) - TODO confirm */
+    vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
+
+    tmp = 0;
+    vecColAOffs0 = viwdupq_u16(tmp, 4, 1);
+    /* NOTE(review): tmp is not read again after the next assignment */
+    tmp = (CMPLX_DIM * MATRIX_DIM);
+    vecColAOffs1 = vecColAOffs0 + (uint16_t)(CMPLX_DIM * MATRIX_DIM);
+
+
+    pInB = (float16_t const *)pSrcB->pData;
+    /* Second gather uses offsets shifted by one row length (CMPLX_DIM * MATRIX_DIM) */
+    vecA0 = vldrhq_gather_shifted_offset_f16(pInA0, vecColAOffs0);
+    vecA1 = vldrhq_gather_shifted_offset_f16(pInA0, vecColAOffs1);
+
+
+    vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
+    /* vcmulq then vcmlaq_rot90: presumably the two halves of a full complex multiply - TODO confirm */
+    acc0 = vcmulq(vecA0, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
+
+    acc1 = vcmulq(vecA1, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
+
+
+    /*
+     * Compute
+     *  re0+re1 | im0+im1 | re0+re1 | im0+im1
+     *  re2+re3 | im2+im3 | re2+re3 | im2+im3
+     */
+
+    vecTmp = (f16x8_t) vrev64q_s32((int32x4_t) acc0);
+    vecTmp = vaddq(vecTmp, acc0);
+
+    /* Each store writes one 32-bit lane, i.e. two adjacent float16 values, through a float32_t view */
+    *(float32_t *)(&pOut[0 * CMPLX_DIM * MATRIX_DIM]) = ((f32x4_t)vecTmp)[0];
+    *(float32_t *)(&pOut[0 * CMPLX_DIM * MATRIX_DIM + CMPLX_DIM]) = ((f32x4_t)vecTmp)[2];
+
+    vecTmp = (f16x8_t) vrev64q_s32((int32x4_t) acc1);
+    vecTmp = vaddq(vecTmp, acc1);
+
+    *(float32_t *)(&pOut[1 * CMPLX_DIM * MATRIX_DIM]) = ((f32x4_t)vecTmp)[0];
+    *(float32_t *)(&pOut[1 * CMPLX_DIM * MATRIX_DIM + CMPLX_DIM]) = ((f32x4_t)vecTmp)[2];
+
+    /*
+     * Return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+
+/* Helium helper: complex f16 product of two 3x3 matrices (no size check is done here). */
+__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_3x3_mve(
+    const arm_matrix_instance_f16 * pSrcA,
+    const arm_matrix_instance_f16 * pSrcB,
+    arm_matrix_instance_f16 * pDst)
+{
+    const uint16_t   MATRIX_DIM = 3;
+    float16_t const *pInB = pSrcB->pData;  /* input data matrix pointer B */
+    float16_t       *pInA = pSrcA->pData;  /* input data matrix pointer A */
+    float16_t       *pOut = pDst->pData;   /* output data matrix pointer */
+    uint16x8_t     vecColBOffs0;
+    float16_t       *pInA0 = pInA;
+    float16_t       *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM;
+    float16_t       *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM;
+    f16x8_t        acc0, acc1, acc2;
+    f16x8_t        vecB, vecA0, vecA1, vecA2;
+    static const uint16_t offsetB0[8] = { 0, 1,
+        MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
+        2 * MATRIX_DIM * CMPLX_DIM, 2 * MATRIX_DIM * CMPLX_DIM + 1,
+        DONTCARE, DONTCARE
+    };
+
+
+    /* predicate covering one row (MATRIX_DIM * CMPLX_DIM halfwords); remaining lanes are disabled */
+    mve_pred16_t p0 = vctp16q(MATRIX_DIM * CMPLX_DIM);
+
+    vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
+
+    pInB = (float16_t const *)pSrcB->pData;
+    /* Rows of A are shorter than a vector: load them predicated (zeroing) so nothing past the row is read */
+    vecA0 = vldrhq_z_f16(pInA0, p0);
+    vecA1 = vldrhq_z_f16(pInA1, p0);
+    vecA2 = vldrhq_z_f16(pInA2, p0);
+
+    vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
+
+    acc0 = vcmulq(vecA0, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
+
+    acc1 = vcmulq(vecA1, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
+
+    acc2 = vcmulq(vecA2, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
+
+    mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
+    pOut += CMPLX_DIM;
+    /*
+     * move to next B column
+     */
+    pInB = pInB + CMPLX_DIM;
+
+    vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
+
+    acc0 = vcmulq(vecA0, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
+
+    acc1 = vcmulq(vecA1, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
+
+    acc2 = vcmulq(vecA2, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
+
+    mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
+    pOut += CMPLX_DIM;
+    /*
+     * move to next B column
+     */
+    pInB = pInB + CMPLX_DIM;
+
+    vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
+
+    acc0 = vcmulq(vecA0, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
+
+    acc1 = vcmulq(vecA1, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
+
+    acc2 = vcmulq(vecA2, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
+
+    mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
+    /*
+     * Return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+
+
+/* Helium helper: complex f16 product of two 4x4 matrices (no size check is done here). */
+__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_4x4_mve(
+    const arm_matrix_instance_f16 * pSrcA,
+    const arm_matrix_instance_f16 * pSrcB,
+    arm_matrix_instance_f16 * pDst)
+{
+    const uint16_t   MATRIX_DIM = 4;
+    float16_t const *pInB = pSrcB->pData;  /* input data matrix pointer B */
+    float16_t       *pInA = pSrcA->pData;  /* input data matrix pointer A */
+    float16_t       *pOut = pDst->pData;   /* output data matrix pointer */
+    uint16x8_t     vecColBOffs0;
+    float16_t       *pInA0 = pInA;
+    float16_t       *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM;
+    float16_t       *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM;
+    float16_t       *pInA3 = pInA2 + CMPLX_DIM * MATRIX_DIM;
+    f16x8_t        acc0, acc1, acc2, acc3;
+    f16x8_t        vecB, vecA;
+    static const uint16_t offsetB0[8] = { 0, 1,
+        MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
+        2 * MATRIX_DIM * CMPLX_DIM, 2 * MATRIX_DIM * CMPLX_DIM + 1,
+        3 * MATRIX_DIM * CMPLX_DIM, 3 * MATRIX_DIM * CMPLX_DIM + 1
+    };
+    /* offsetB0: offsets of two consecutive values taken from each of the four rows of B */
+    vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
+
+    pInB = (float16_t const *)pSrcB->pData;
+    /* Gather the first column of B */
+    vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
+
+    vecA = vldrhq_f16(pInA0);
+    acc0 = vcmulq(vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrhq_f16(pInA1);
+    acc1 = vcmulq(vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrhq_f16(pInA2);
+    acc2 = vcmulq(vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    vecA = vldrhq_f16(pInA3);
+    acc3 = vcmulq(vecA, vecB);
+    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+    /* One output per row of A; presumably the helper reduces each accumulator to one complex value - TODO confirm */
+    mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
+    pOut += CMPLX_DIM;
+    /*
+     * move to next B column
+     */
+    pInB = pInB + CMPLX_DIM;
+
+    vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
+
+    vecA = vldrhq_f16(pInA0);
+    acc0 = vcmulq(vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrhq_f16(pInA1);
+    acc1 = vcmulq(vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrhq_f16(pInA2);
+    acc2 = vcmulq(vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    vecA = vldrhq_f16(pInA3);
+    acc3 = vcmulq(vecA, vecB);
+    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+    /* Second output column */
+    mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
+    pOut += CMPLX_DIM;
+    /*
+     * move to next B column
+     */
+    pInB = pInB + CMPLX_DIM;
+
+    vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
+
+    vecA = vldrhq_f16(pInA0);
+    acc0 = vcmulq(vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrhq_f16(pInA1);
+    acc1 = vcmulq(vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrhq_f16(pInA2);
+    acc2 = vcmulq(vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    vecA = vldrhq_f16(pInA3);
+    acc3 = vcmulq(vecA, vecB);
+    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+    /* Third output column */
+    mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
+    pOut += CMPLX_DIM;
+    /*
+     * move to next B column
+     */
+    pInB = pInB + CMPLX_DIM;
+
+    vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
+
+    vecA = vldrhq_f16(pInA0);
+    acc0 = vcmulq(vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrhq_f16(pInA1);
+    acc1 = vcmulq(vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrhq_f16(pInA2);
+    acc2 = vcmulq(vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    vecA = vldrhq_f16(pInA3);
+    acc3 = vcmulq(vecA, vecB);
+    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+    /* Fourth (last) output column */
+    mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
+    mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
+    /*
+     * Return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+
+
+arm_status arm_mat_cmplx_mult_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+  arm_matrix_instance_f16 * pDst)
+{
+    float16_t const *pInB = (float16_t const *) pSrcB->pData;   /* input data matrix pointer B */
+    float16_t const *pInA = (float16_t const *) pSrcA->pData;   /* input data matrix pointer A */
+    float16_t *pOut = pDst->pData;  /* output data matrix pointer */
+    float16_t *px;              /* Temporary output data matrix pointer */
+    uint16_t  numRowsA = pSrcA->numRows;    /* number of rows of input matrix A    */
+    uint16_t  numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
+    uint16_t  numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
+    uint16_t  col, i = 0U, row = numRowsA;  /* loop counters */
+    arm_status status;          /* status of matrix multiplication */
+    uint16x8_t vecOffs, vecColBOffs;        /* running / initial gather offsets into B (16-bit) */
+    uint32_t  blkCnt,rowCnt;           /* loop counters */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcB->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+
+    /*
+     * small squared matrix specialized routines
+     */
+    if (numRowsA == numColsB && numColsB == numColsA)
+    {
+        if (numRowsA == 1)
+        {
+            pOut[0] = pInA[0] * pInB[0] - pInA[1] * pInB[1];
+            pOut[1] = pInA[0] * pInB[1] + pInA[1] * pInB[0];
+            return (ARM_MATH_SUCCESS);
+        }
+        else if  (numRowsA == 2)
+            return arm_mat_cmplx_mult_f16_2x2_mve(pSrcA, pSrcB, pDst);
+        else if (numRowsA == 3)
+            return arm_mat_cmplx_mult_f16_3x3_mve(pSrcA, pSrcB, pDst);
+        else if (numRowsA == 4)
+            return arm_mat_cmplx_mult_f16_4x4_mve(pSrcA, pSrcB, pDst);
+    }
+    /* Gather offsets: two consecutive values from each of four consecutive rows of B */
+    vecColBOffs[0] = 0;
+    vecColBOffs[1] = 1;
+    vecColBOffs[2] = numColsB * CMPLX_DIM;
+    vecColBOffs[3] = (numColsB * CMPLX_DIM) + 1;
+    vecColBOffs[4] = 2*numColsB * CMPLX_DIM;
+    vecColBOffs[5] = 2*(numColsB * CMPLX_DIM) + 1;
+    vecColBOffs[6] = 3*numColsB * CMPLX_DIM;
+    vecColBOffs[7] = 3*(numColsB * CMPLX_DIM) + 1;
+
+    /*
+     * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
+     */
+
+    /*
+     * row loop: four rows of A per iteration
+     */
+    rowCnt = row >> 2;
+    while (rowCnt > 0u)
+    {
+        /*
+         * Output pointer is set to starting address of the row being processed
+         */
+        px = pOut + i * CMPLX_DIM;
+        i = i + 4 * numColsB;
+        /*
+         * For every row wise process, the column loop counter is to be initiated
+         */
+        col = numColsB;
+        /*
+         * For every row wise process, the pInB pointer is set
+         * to the starting address of the pSrcB data
+         */
+        pInB = (float16_t const *) pSrcB->pData;
+        /*
+         * column loop
+         */
+        while (col > 0u)
+        {
+            /*
+             * generate one output element for each of the 4 rows
+             */
+            /*
+             * Matrix A columns number of MAC operations are to be performed
+             */
+
+            float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
+            float16_t const *pInA0 = pInA;
+            float16_t const *pInA1 = pInA0 + numColsA * CMPLX_DIM;
+            float16_t const *pInA2 = pInA1 + numColsA * CMPLX_DIM;
+            float16_t const *pInA3 = pInA2 + numColsA * CMPLX_DIM;
+            f16x8_t acc0, acc1, acc2, acc3;
+
+            acc0 = vdupq_n_f16(0.0f16);
+            acc1 = vdupq_n_f16(0.0f16);
+            acc2 = vdupq_n_f16(0.0f16);
+            acc3 = vdupq_n_f16(0.0f16);
+
+            pSrcA0Vec = (float16_t const *) pInA0;
+            pSrcA1Vec = (float16_t const *) pInA1;
+            pSrcA2Vec = (float16_t const *) pInA2;
+            pSrcA3Vec = (float16_t const *) pInA3;
+
+            vecOffs = vecColBOffs;
+
+            /*
+             * process 4 x 1 block output, 8 halfwords of each row per iteration
+             */
+            blkCnt = (numColsA * CMPLX_DIM) >> 3;
+            while (blkCnt > 0U)
+            {
+                f16x8_t vecB, vecA;
+
+                vecB = vldrhq_gather_shifted_offset_f16(pInB, vecOffs);
+                /*
+                 * move Matrix B read offsets, 4 rows down
+                 */
+                vecOffs = vaddq_n_u16(vecOffs , (uint16_t) (numColsB * 4 * CMPLX_DIM));
+
+                vecA = vld1q(pSrcA0Vec);  pSrcA0Vec += 8;
+                acc0 = vcmlaq(acc0, vecA, vecB);
+                acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+                vecA = vld1q(pSrcA1Vec);  pSrcA1Vec += 8;
+                acc1 = vcmlaq(acc1, vecA, vecB);
+                acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+                vecA = vld1q(pSrcA2Vec);  pSrcA2Vec += 8;
+                acc2 = vcmlaq(acc2, vecA, vecB);
+                acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+                vecA = vld1q(pSrcA3Vec);  pSrcA3Vec += 8;
+                acc3 = vcmlaq(acc3, vecA, vecB);
+                acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+                blkCnt--;
+            }
+            /*
+             * Unsupported addressing mode compiler crash
+             */
+            /*
+             * tail
+             * (will be merged thru tail predication)
+             */
+            blkCnt = (numColsA * CMPLX_DIM) & 7;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp16q(blkCnt);
+                f16x8_t vecB, vecA;
+
+                vecB = vldrhq_gather_shifted_offset_z_f16(pInB, vecOffs, p0);
+                /*
+                 * move Matrix B read offsets, 4 rows down
+                 */
+                vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
+                /* A is loaded with the same zeroing predicate so no lane beyond the row is read */
+                vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+                acc0 = vcmlaq(acc0, vecA, vecB);
+                acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+                vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+                acc1 = vcmlaq(acc1, vecA, vecB);
+                acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+                vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+                acc2 = vcmlaq(acc2, vecA, vecB);
+                acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+                vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+                acc3 = vcmlaq(acc3, vecA, vecB);
+                acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+            }
+
+
+            mve_cmplx_sum_intra_vec_f16(acc0, &px[0 * CMPLX_DIM * numColsB + 0]);
+            mve_cmplx_sum_intra_vec_f16(acc1, &px[1 * CMPLX_DIM * numColsB + 0]);
+            mve_cmplx_sum_intra_vec_f16(acc2, &px[2 * CMPLX_DIM * numColsB + 0]);
+            mve_cmplx_sum_intra_vec_f16(acc3, &px[3 * CMPLX_DIM * numColsB + 0]);
+
+            px += CMPLX_DIM;
+            /*
+             * Decrement the column loop counter
+             */
+            col--;
+            /*
+             * Update the pointer pInB to point to the  starting address of the next column
+             */
+            pInB = (float16_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
+        }
+
+        /*
+         * Update the pointer pInA to point to the  starting address of the next row
+         */
+        pInA += (numColsA * 4) * CMPLX_DIM;
+        /*
+         * Decrement the row loop counter
+         */
+        rowCnt --;
+
+    }
+    /* Remaining rows (row count modulo 4), one row of A per iteration */
+    rowCnt = row & 3;
+    while (rowCnt > 0u)
+    {
+        /*
+         * Output pointer is set to starting address of the row being processed
+         */
+        px = pOut + i * CMPLX_DIM;
+        i = i + numColsB;
+        /*
+         * For every row wise process, the column loop counter is to be initiated
+         */
+        col = numColsB;
+        /*
+         * For every row wise process, the pInB pointer is set
+         * to the starting address of the pSrcB data
+         */
+        pInB = (float16_t const *) pSrcB->pData;
+        /*
+         * column loop
+         */
+        while (col > 0u)
+        {
+            /*
+             * generate one output element
+             */
+            /*
+             * Matrix A columns number of MAC operations are to be performed
+             */
+
+            float16_t const *pSrcA0Vec;
+            float16_t const *pInA0 = pInA;
+            f16x8_t acc0;
+
+            acc0 = vdupq_n_f16(0.0f16);
+
+            pSrcA0Vec = (float16_t const *) pInA0;
+
+            vecOffs = vecColBOffs;
+
+            /*
+             * process 1 x 1 block output, 8 halfwords of the row per iteration
+             */
+            blkCnt = (numColsA * CMPLX_DIM) >> 3;
+            while (blkCnt > 0U)
+            {
+                f16x8_t vecB, vecA;
+
+                vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
+                /*
+                 * move Matrix B read offsets, 4 rows down
+                 */
+                vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (4*numColsB * CMPLX_DIM));
+
+                vecA = vld1q(pSrcA0Vec);
+                pSrcA0Vec += 8;
+                acc0 = vcmlaq(acc0, vecA, vecB);
+                acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+
+                blkCnt--;
+            }
+
+
+            /*
+             * tail
+             */
+            blkCnt = (numColsA * CMPLX_DIM) & 7;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp16q(blkCnt);
+                f16x8_t vecB, vecA;
+
+                vecB = vldrhq_gather_shifted_offset_z(pInB, vecOffs, p0);
+                /* A is loaded with the same zeroing predicate so no lane beyond the row is read */
+                vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+                acc0 = vcmlaq(acc0, vecA, vecB);
+                acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+            }
+
+            mve_cmplx_sum_intra_vec_f16(acc0, &px[0]);
+
+
+            px += CMPLX_DIM;
+            /*
+             * Decrement the column loop counter
+             */
+            col--;
+            /*
+             * Update the pointer pInB to point to the  starting address of the next column
+             */
+            pInB = (float16_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
+        }
+
+        /*
+         * Update the pointer pInA to point to the  starting address of the next row
+         */
+        pInA += numColsA  * CMPLX_DIM;
+        rowCnt--;
+    }
+
+    /*
+     * set status as ARM_MATH_SUCCESS
+     */
+    status = ARM_MATH_SUCCESS;
+ }
+    /*
+     * Return to application
+     */
+    return (status);
+}
+#else
+
+arm_status arm_mat_cmplx_mult_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+        arm_matrix_instance_f16 * pDst)
+{
+  float16_t *pIn1 = pSrcA->pData;                /* Input data matrix pointer A */
+  float16_t *pIn2 = pSrcB->pData;                /* Input data matrix pointer B */
+  float16_t *pInA = pSrcA->pData;                /* Start of the row of A being processed */
+  float16_t *pOut = pDst->pData;                 /* Output data matrix pointer */
+  float16_t *px;                                 /* Temporary output data matrix pointer */
+  uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
+  uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
+  uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
+  _Float16 sumReal, sumImag;                    /* Accumulators for the real and imaginary parts */
+  _Float16 a1, b1, c1, d1;                      /* a/b: real/imag of A element, c/d: real/imag of B element */
+  uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
+  arm_status status;                             /* status of matrix multiplication */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  _Float16 a0, b0, c0, d0;
+#endif
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcB->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+    /* row loop (do/while: the body runs at least once, even if numRowsA is 0) */
+    do
+    {
+      /* Output pointer is set to starting address of the row being processed */
+      px = pOut + 2 * i;
+
+      /* For every row wise process, the column loop counter is to be initiated */
+      col = numColsB;
+
+      /* For every row wise process, the pIn2 pointer is set
+       ** to the starting address of the pSrcB data */
+      pIn2 = pSrcB->pData;
+
+      j = 0U;
+
+      /* column loop (do/while: the body runs at least once, even if numColsB is 0) */
+      do
+      {
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sumReal = 0.0f16;
+        sumImag = 0.0f16;
+
+        /* Initiate pointer pIn1 to point to starting address of the row of A being processed */
+        pIn1 = pInA;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+        /* Apply loop unrolling and compute 4 MACs simultaneously. */
+        colCnt = numColsA >> 2U;
+
+        /* matrix multiplication */
+        while (colCnt > 0U)
+        {
+
+          /* Reading real part of complex matrix A */
+          a0 = *pIn1;
+
+          /* Reading real part of complex matrix B */
+          c0 = *pIn2;
+
+          /* Reading imaginary part of complex matrix A */
+          b0 = *(pIn1 + 1U);
+
+          /* Reading imaginary part of complex matrix B */
+          d0 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulates */
+          sumReal += a0 * c0;
+          sumImag += b0 * c0;
+
+          /* update pointers: next column of A, next row of B */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulates */
+          sumReal -= b0 * d0;
+          sumImag += a0 * d0;
+
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+          /* read real and imag values from pSrcA and pSrcB buffer */
+          a1 = *(pIn1     );
+          c1 = *(pIn2     );
+          b1 = *(pIn1 + 1U);
+          d1 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulates */
+          sumReal += a1 * c1;
+          sumImag += b1 * c1;
+
+          /* update pointers */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulates */
+          sumReal -= b1 * d1;
+          sumImag += a1 * d1;
+
+          a0 = *(pIn1     );
+          c0 = *(pIn2     );
+          b0 = *(pIn1 + 1U);
+          d0 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulates */
+          sumReal += a0 * c0;
+          sumImag += b0 * c0;
+
+          /* update pointers */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulates */
+          sumReal -= b0 * d0;
+          sumImag += a0 * d0;
+
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+          a1 = *(pIn1     );
+          c1 = *(pIn2     );
+          b1 = *(pIn1 + 1U);
+          d1 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulates */
+          sumReal += a1 * c1;
+          sumImag += b1 * c1;
+
+          /* update pointers */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulates */
+          sumReal -= b1 * d1;
+          sumImag += a1 * d1;
+
+          /* Decrement loop count */
+          colCnt--;
+        }
+
+        /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+         ** No loop unrolling is used. */
+        colCnt = numColsA % 0x4U;
+
+#else
+
+        /* Initialize colCnt with the number of columns of A */
+        colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+          a1 = *(pIn1     );
+          c1 = *(pIn2     );
+          b1 = *(pIn1 + 1U);
+          d1 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulates */
+          sumReal += a1 * c1;
+          sumImag += b1 * c1;
+
+          /* update pointers */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulates */
+          sumReal -= b1 * d1;
+          sumImag += a1 * d1;
+
+          /* Decrement loop counter */
+          colCnt--;
+        }
+
+        /* Store result in destination buffer */
+        *px++ = sumReal;
+        *px++ = sumImag;
+
+        /* Update pointer pIn2 to point to starting address of next column */
+        j++;
+        pIn2 = pSrcB->pData + 2U * j;
+
+        /* Decrement column loop counter */
+        col--;
+
+      } while (col > 0U);
+
+      /* Update pointer pInA to point to starting address of next row */
+      i = i + numColsB;
+      pInA = pInA + 2 * numColsA;
+
+      /* Decrement row loop counter */
+      row--;
+
+    } while (row > 0U);
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H) */
+
+/**
+  @} end of CmplxMatrixMult group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
index 1b79eabeb97cee94b99616640f190aa073b63797..5add938ba77cff05388b284417a16a81261842d4 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_cmplx_mult_f32.c
  * Description:  Floating-point matrix multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -532,7 +532,7 @@ arm_status arm_mat_cmplx_mult_f32(
     uint16_t  numRowsA = pSrcA->numRows;    /* number of rows of input matrix A    */
     uint16_t  numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
     uint16_t  numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
-    uint16_t  col, i = 0U, row = numRowsA, colCnt;  /* loop counters */
+    uint16_t  col, i = 0U, row = numRowsA;  /* loop counters */
     arm_status status;          /* status of matrix multiplication */
     uint32x4_t vecOffs, vecColBOffs;
     uint32_t  blkCnt, rowCnt;           /* loop counters */
@@ -611,7 +611,6 @@ arm_status arm_mat_cmplx_mult_f32(
             /*
              * Matrix A columns number of MAC operations are to be performed
              */
-            colCnt = numColsA;
 
             float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
             float32_t const *pInA0 = pInA;
@@ -752,7 +751,6 @@ arm_status arm_mat_cmplx_mult_f32(
             /*
              * Matrix A columns number of MAC operations are to be performed
              */
-            colCnt = numColsA;
 
             float32_t const *pSrcA0Vec;
             float32_t const *pInA0 = pInA;
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c
index 827151e00f1e0f9746cc7c9ef25075ea33d1cce6..30e2f2f6185ed8574de87bb8d1cdf8032586474e 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_cmplx_mat_mult_q15.c
  * Description:  Q15 complex matrix multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -57,7 +57,7 @@
                    This approach provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
                    truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #define MVE_ASRL_SAT16(acc, shift)          ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
 
@@ -171,16 +171,16 @@ arm_status arm_mat_cmplx_mult_q15(
                 pSrcAVec += 8;
                 vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
 
-                acc0 = vmlsldavaq(acc0, vecA, vecB);
-                acc1 = vmlaldavaxq(acc1, vecA, vecB);
+                acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
+                acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
                 vecB2 = vldrhq_gather_shifted_offset(pInB2, vecOffs);
                 /*
                  * move Matrix B read offsets, 4 rows down
                  */
-                vecOffs = vaddq(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
+                vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
 
-                acc2 = vmlsldavaq(acc2, vecA, vecB2);
-                acc3 = vmlaldavaxq(acc3, vecA, vecB2);
+                acc2 = vmlsldavaq_s16(acc2, vecA, vecB2);
+                acc3 = vmlaldavaxq_s16(acc3, vecA, vecB2);
 
                 blkCnt--;
             }
@@ -196,17 +196,17 @@ arm_status arm_mat_cmplx_mult_q15(
 
                 vecA = vldrhq_z_s16(pSrcAVec, p0);
 
-                acc0 = vmlsldavaq(acc0, vecA, vecB);
-                acc1 = vmlaldavaxq(acc1, vecA, vecB);
+                acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
+                acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
                 vecB2 = vldrhq_gather_shifted_offset(pInB2, vecOffs);
 
                 /*
                  * move Matrix B read offsets, 4 rows down
                  */
-                vecOffs = vaddq(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
+                vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
 
-                acc2 = vmlsldavaq(acc2, vecA, vecB2);
-                acc3 = vmlaldavaxq(acc3, vecA, vecB2);
+                acc2 = vmlsldavaq_s16(acc2, vecA, vecB2);
+                acc3 = vmlaldavaxq_s16(acc3, vecA, vecB2);
 
             }
             /*
@@ -264,12 +264,12 @@ arm_status arm_mat_cmplx_mult_q15(
                 pSrcAVec += 8;
                 vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
 
-                acc0 = vmlsldavaq(acc0, vecA, vecB);
-                acc1 = vmlaldavaxq(acc1, vecA, vecB);
+                acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
+                acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
                 /*
                  * move Matrix B read offsets, 4 rows down
                  */
-                vecOffs = vaddq(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
+                vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
 
                 blkCnt--;
             }
@@ -284,8 +284,8 @@ arm_status arm_mat_cmplx_mult_q15(
                 vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
                 vecA = vldrhq_z_s16(pSrcAVec, p0);
 
-                acc0 = vmlsldavaq(acc0, vecA, vecB);
-                acc1 = vmlaldavaxq(acc1, vecA, vecB);
+                acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
+                acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
                
             }
             /*
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c
index 510ea9f28d3b173a44161dfebd4665428f33cfe2..ee784a6bd8842df3e316158dbf227e03e143567e 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_cmplx_mult_q31.c
  * Description:  Floating-point matrix multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -56,7 +56,7 @@
                    to avoid overflows, as a total of numColsA additions are performed internally.
                    The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
@@ -526,7 +526,7 @@ arm_status arm_mat_cmplx_mult_q31(
     uint16_t  numRowsA = pSrcA->numRows;    /* number of rows of input matrix A    */
     uint16_t  numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
     uint16_t  numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
-    uint16_t  col, i = 0U, row = numRowsA, colCnt;  /* loop counters */
+    uint16_t  col, i = 0U, row = numRowsA;  /* loop counters */
     arm_status status;          /* status of matrix multiplication */
     uint32x4_t vecOffs, vecColBOffs;
     uint32_t  blkCnt, rowCnt;           /* loop counters */
@@ -611,7 +611,6 @@ arm_status arm_mat_cmplx_mult_q31(
             /*
              * Matrix A columns number of MAC operations are to be performed
              */
-            colCnt = numColsA;
 
             q31_t const *pSrcA0Vec, *pSrcA1Vec;
             q31_t const *pInA0 = pInA;
@@ -742,7 +741,6 @@ arm_status arm_mat_cmplx_mult_q31(
             /*
              * Matrix A columns number of MAC operations are to be performed
              */
-            colCnt = numColsA;
 
             q31_t const *pSrcA0Vec;
             q31_t const *pInA0 = pInA;
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..5eda8c3cbe295408c9ac2d46df6be1c867e6ca65
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_f16.c
@@ -0,0 +1,131 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_cmplx_trans_f16.c
+ * Description:  Floating-point complex matrix transpose
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixComplexTrans
+  @{
+ */
+
+/**
+  @brief         Half-precision floating-point complex matrix transpose.
+  @param[in]     pSrc      points to input matrix
+  @param[out]    pDst      points to output matrix
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+arm_status arm_mat_cmplx_trans_f16(const arm_matrix_instance_f16 * pSrc, arm_matrix_instance_f16 * pDst)
+{   /* MVE build: forwards both matrices to arm_mat_cmplx_trans_16bit (helper not visible here; presumably from arm_helium_utils.h - confirm) */
+    return arm_mat_cmplx_trans_16bit(pSrc->numRows, pSrc->numCols, (uint16_t *) pSrc->pData,   /* source dimensions; data passed as raw 16-bit words */
+                                   pDst->numRows, pDst->numCols, (uint16_t *) pDst->pData);     /* destination dimensions; data passed the same way */
+}
+
+#else
+arm_status arm_mat_cmplx_trans_f16(
+  const arm_matrix_instance_f16 * pSrc,
+  arm_matrix_instance_f16 * pDst)
+{
+  float16_t *pIn = pSrc->pData;                  /* read cursor; walks the source buffer linearly */
+  float16_t *pOut = pDst->pData;                 /* base of the destination buffer */
+  float16_t *px;                                 /* write cursor into the destination buffer */
+  uint16_t nRows = pSrc->numRows;                /* number of source rows */
+  uint16_t nColumns = pSrc->numCols;             /* number of source columns */
+  uint16_t col, i = 0U, row = nRows;             /* col/row count down; i is the index of the source row being copied */
+  arm_status status;                             /* status of matrix transpose  */
+
+  /* Scalar path: copies the interleaved (real, imag) pair of source element (r, c) to destination element (c, r). */
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Destination must be numCols x numRows of the source; otherwise nothing is copied */
+  if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /*      #ifdef ARM_MATH_MATRIX_CHECK    */
+
+  {
+    /* Matrix transpose by exchanging the rows with columns */
+    /* row loop: pre-tested so a source with zero rows copies nothing (a do/while would wrap the uint16_t counter) */
+    while (row > 0U)
+    {
+      /* Source row i becomes destination column i: start at the i-th complex slot of the first destination row */
+      px = pOut + CMPLX_DIM * i;
+
+      /* Initialize column loop counter */
+      col = nColumns;
+
+      while (col > 0U)
+      {
+        /* Read and store the input element in the destination */
+        px[0] = *pIn++; // real
+        px[1] = *pIn++; // imag
+
+        /* Advance one destination row; a destination row holds nRows complex values */
+        px += CMPLX_DIM * nRows;
+
+        /* Decrement the column loop counter */
+        col--;
+      }
+      i++;
+
+      /* Decrement the row loop counter */
+      row--;
+
+    }                            /* row loop end  */
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixComplexTrans group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..222f349e44ba050aad907d129b615bd65a3fbdd1
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_f32.c
@@ -0,0 +1,133 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_cmplx_trans_f32.c
+ * Description:  Floating-point complex matrix transpose
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @defgroup MatrixComplexTrans Complex Matrix Transpose
+
+  Transposes a complex matrix.
+
+  Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
+  \image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
+ */
+
+/**
+  @addtogroup MatrixComplexTrans
+  @{
+ */
+
+/**
+  @brief         Floating-point complex matrix transpose.
+  @param[in]     pSrc      points to input matrix
+  @param[out]    pDst      points to output matrix
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+arm_status arm_mat_cmplx_trans_f32(const arm_matrix_instance_f32 * pSrc, arm_matrix_instance_f32 * pDst)
+{   /* MVE build: forwards both matrices to arm_mat_cmplx_trans_32bit (helper not visible here; presumably from arm_helium_utils.h - confirm) */
+    return arm_mat_cmplx_trans_32bit(pSrc->numRows, pSrc->numCols, (uint32_t *) pSrc->pData,   /* source dimensions; data passed as raw 32-bit words */
+                                   pDst->numRows, pDst->numCols, (uint32_t *) pDst->pData);     /* destination dimensions; data passed the same way */
+}
+
+#else
+arm_status arm_mat_cmplx_trans_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  arm_matrix_instance_f32 * pDst)
+{
+  float32_t *pIn = pSrc->pData;                  /* read cursor; walks the source buffer linearly */
+  float32_t *pOut = pDst->pData;                 /* base of the destination buffer */
+  float32_t *px;                                 /* write cursor into the destination buffer */
+  uint16_t nRows = pSrc->numRows;                /* number of source rows */
+  uint16_t nColumns = pSrc->numCols;             /* number of source columns */
+  uint16_t col, i = 0U, row = nRows;             /* col/row count down; i is the index of the source row being copied */
+  arm_status status;                             /* status of matrix transpose  */
+
+  /* Scalar path: copies the interleaved (real, imag) pair of source element (r, c) to destination element (c, r). */
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Destination must be numCols x numRows of the source; otherwise nothing is copied */
+  if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /*      #ifdef ARM_MATH_MATRIX_CHECK    */
+
+  {
+    /* Matrix transpose by exchanging the rows with columns */
+    /* row loop: pre-tested so a source with zero rows copies nothing (a do/while would wrap the uint16_t counter) */
+    while (row > 0U)
+    {
+      /* Source row i becomes destination column i: start at the i-th complex slot of the first destination row */
+      px = pOut + CMPLX_DIM * i;
+
+      /* Initialize column loop counter */
+      col = nColumns;
+
+      while (col > 0U)
+      {
+        /* Read and store the input element in the destination */
+        px[0] = *pIn++; // real
+        px[1] = *pIn++; // imag
+
+        /* Advance one destination row; a destination row holds nRows complex values */
+        px += CMPLX_DIM * nRows;
+
+        /* Decrement the column loop counter */
+        col--;
+      }
+      i++;
+
+      /* Decrement the row loop counter */
+      row--;
+
+    }                            /* row loop end  */
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixComplexTrans group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_q15.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_q15.c
new file mode 100644
index 0000000000000000000000000000000000000000..8a3724ee6ebf49fb7d29a80f1e6e425ccf9dceba
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_q15.c
@@ -0,0 +1,124 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_cmplx_trans_q15.c
+ * Description:  Q15 complex matrix transpose
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @addtogroup MatrixComplexTrans
+  @{
+ */
+
+/**
+  @brief         Q15 complex matrix transpose.
+  @param[in]     pSrc      points to input matrix
+  @param[out]    pDst      points to output matrix
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+arm_status arm_mat_cmplx_trans_q15(const arm_matrix_instance_q15 * pSrc, arm_matrix_instance_q15 * pDst)
+{   /* MVE build: forwards both matrices to arm_mat_cmplx_trans_16bit (helper not visible here; presumably from arm_helium_utils.h - confirm) */
+    return arm_mat_cmplx_trans_16bit(pSrc->numRows, pSrc->numCols, (uint16_t *) pSrc->pData,   /* source dimensions; data passed as raw 16-bit words */
+                                   pDst->numRows, pDst->numCols, (uint16_t *) pDst->pData);     /* destination dimensions; data passed the same way */
+}
+
+
+#else
+arm_status arm_mat_cmplx_trans_q15(
+  const arm_matrix_instance_q15 * pSrc,
+  arm_matrix_instance_q15 * pDst)
+{
+  q15_t *pSrcA = pSrc->pData;                    /* read cursor; walks the source buffer linearly */
+  q15_t *pOut = pDst->pData;                     /* write cursor into the destination buffer */
+  uint16_t nRows = pSrc->numRows;                /* number of source rows */
+  uint16_t nColumns = pSrc->numCols;             /* number of source columns */
+  uint16_t col, row = nRows, i = 0U;             /* col/row count down; i is the index of the source row being copied */
+  arm_status status;                             /* status of matrix transpose */
+
+  /* Scalar path: copies the interleaved (real, imag) pair of source element (r, c) to destination element (c, r). */
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Destination must be numCols x numRows of the source; otherwise nothing is copied */
+  if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
+
+  {
+    /* Matrix transpose by exchanging the rows with columns */
+    /* row loop: pre-tested so a source with zero rows copies nothing (a do/while would wrap the uint16_t counter) */
+    while (row > 0U)
+    {
+      /* Source row i becomes destination column i: start at the i-th complex slot of the first destination row */
+      pOut = pDst->pData + CMPLX_DIM * i;
+
+      /* Initialize column loop counter */
+      col = nColumns;
+
+      while (col > 0U)
+      {
+        /* Read and store the input element in the destination */
+        pOut[0] = *pSrcA++; //real
+        pOut[1] = *pSrcA++; //imag
+
+        /* Advance one destination row; a destination row holds nRows complex values */
+        pOut += CMPLX_DIM *nRows;
+
+        /* Decrement the column loop counter */
+        col--;
+      }
+
+      i++;
+
+      /* Decrement the row loop counter */
+      row--;
+
+    }
+
+    /* set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixComplexTrans group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_q31.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_q31.c
new file mode 100644
index 0000000000000000000000000000000000000000..a8612a30ebfc013768d9c2b215f28f670b9cf526
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_q31.c
@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_cmplx_trans_q31.c
+ * Description:  Q31 complex matrix transpose
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+
+/**
+  @addtogroup MatrixComplexTrans
+  @{
+ */
+
+/**
+  @brief         Q31 complex matrix transpose.
+  @param[in]     pSrc      points to input matrix
+  @param[out]    pDst      points to output matrix
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+
+arm_status arm_mat_cmplx_trans_q31(const arm_matrix_instance_q31 * pSrc, arm_matrix_instance_q31 * pDst)
+{   /* MVE build: forwards both matrices to arm_mat_cmplx_trans_32bit (helper not visible here; presumably from arm_helium_utils.h - confirm) */
+    return arm_mat_cmplx_trans_32bit(pSrc->numRows, pSrc->numCols, (uint32_t *) pSrc->pData,   /* source dimensions; data passed as raw 32-bit words */
+                                   pDst->numRows, pDst->numCols, (uint32_t *) pDst->pData);     /* destination dimensions; data passed the same way */
+}
+
+
+#else
+arm_status arm_mat_cmplx_trans_q31(
+  const arm_matrix_instance_q31 * pSrc,
+  arm_matrix_instance_q31 * pDst)
+{
+  q31_t *pIn = pSrc->pData;                      /* read cursor; walks the source buffer linearly */
+  q31_t *pOut = pDst->pData;                     /* base of the destination buffer */
+  q31_t *px;                                     /* write cursor into the destination buffer */
+  uint16_t nRows = pSrc->numRows;                /* number of source rows */
+  uint16_t nColumns = pSrc->numCols;             /* number of source columns */
+  uint16_t col, i = 0U, row = nRows;             /* col/row count down; i is the index of the source row being copied */
+  arm_status status;                             /* status of matrix transpose */
+
+  /* Scalar path: copies the interleaved (real, imag) pair of source element (r, c) to destination element (c, r). */
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Destination must be numCols x numRows of the source; otherwise nothing is copied */
+  if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
+
+  {
+    /* Matrix transpose by exchanging the rows with columns */
+    /* row loop: pre-tested so a source with zero rows copies nothing (a do/while would wrap the uint16_t counter) */
+    while (row > 0U)
+    {
+      /* Source row i becomes destination column i: start at the i-th complex slot of the first destination row */
+      px = pOut + CMPLX_DIM * i;
+
+      /* Initialize column loop counter */
+      col = nColumns;
+
+      while (col > 0U)
+      {
+        /* Read and store the input element in the destination */
+        px[0] = *pIn++; // real
+        px[1] = *pIn++; // imag
+
+        /* Advance one destination row; a destination row holds nRows complex values */
+        px += CMPLX_DIM * nRows;
+
+        /* Decrement the column loop counter */
+        col--;
+      }
+
+      i++;
+
+      /* Decrement the row loop counter */
+      row--;
+
+    }
+    /* row loop end */
+
+    /* set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixComplexTrans group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..6a48102d1100606714cade703529d05ac6aea551
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_f16.c
@@ -0,0 +1,74 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_init_f16.c
+ * Description:  Floating-point matrix initialization
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixInit
+  @{
+ */
+
+/**
+  @brief         Floating-point matrix initialization.
+  @param[in,out] S         points to an instance of the floating-point matrix structure
+  @param[in]     nRows     number of rows in the matrix
+  @param[in]     nColumns  number of columns in the matrix
+  @param[in]     pData     points to the matrix data array
+  @return        none
+ */
+
+void arm_mat_init_f16(
+  arm_matrix_instance_f16 * S,
+  uint16_t nRows,
+  uint16_t nColumns,
+  float16_t * pData)
+{
+  /* Attach the caller's buffer: only the pointer is stored, no data is copied */
+  S->pData = pData;
+
+  /* Record the column count */
+  S->numCols = nColumns;
+
+  /* Record the row count */
+  S->numRows = nRows;
+}
+
+/**
+  @} end of MatrixInit group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_f32.c
index ce02a258385b48ead7778b372a2e70f66106ad6b..d2a52013a47c30078687e1c6ee6ca90c7b3b74b9 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_f32.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_init_f32.c
  * Description:  Floating-point matrix initialization
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_q15.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_q15.c
index 02755034f1b72dbc1a19bb86f9648de717a58fd9..33942b50bb1645b3e92ad8e230c98425fdc46297 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_q15.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_init_q15.c
  * Description:  Q15 matrix initialization
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_q31.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_q31.c
index d5c5722202043a918404c44b55ffe47d546beffb..ab735d0624781cd0ca0e55ccd7aff341ab064388 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_q31.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_init_q31.c
  * Description:  Q31 matrix initialization
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..f40dcb35853f15be425afe8f5f1033b4b702d75f
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f16.c
@@ -0,0 +1,891 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_inverse_f16.c
+ * Description:  Half-precision floating-point matrix inverse
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixInv
+  @{
+ */
+
+/**
+  @brief         Half-precision floating-point matrix inverse.
+  @param[in]     pSrc      points to input matrix structure. The source matrix is modified by the function.
+  @param[out]    pDst      points to output matrix structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+                   - \ref ARM_MATH_SINGULAR      : Input matrix is found to be singular (non-invertible)
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+arm_status arm_mat_inverse_f16(
+  const arm_matrix_instance_f16 * pSrc,
+  arm_matrix_instance_f16 * pDst)
+{
+    float16_t *pIn = pSrc->pData;   /* input data matrix pointer */
+    float16_t *pOut = pDst->pData;  /* output data matrix pointer */
+    float16_t *pInT1, *pInT2;   /* Temporary input data matrix pointer */
+    float16_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
+    float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;    /* Temporary input and output data matrix pointer */
+
+    uint32_t  numRows = pSrc->numRows;  /* Number of rows in the matrix  */
+    uint32_t  numCols = pSrc->numCols;  /* Number of Cols in the matrix  */
+    float16_t *pTmpA, *pTmpB;
+
+    _Float16 in = 0.0f16;        /* Temporary input values  */
+    uint32_t  i, rowCnt, flag = 0U, j, loopCnt, l;   /* loop counters */
+    arm_status status;          /* status of matrix inverse */
+    uint32_t  blkCnt;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+   /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
+     || (pSrc->numRows != pDst->numRows))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
+  {
+
+    /*--------------------------------------------------------------------------------------------------------------
+     * Matrix Inverse can be solved using elementary row operations.
+     *
+     *  Gauss-Jordan Method:
+     *
+     *     1. First combine the identity matrix and the input matrix separated by a bar to form an
+     *        augmented matrix as follows:
+     *                      _  _          _     _      _   _         _         _
+     *                     |  |  a11  a12  | | | 1   0  |   |       |  X11 X12  |
+     *                     |  |            | | |        |   |   =   |           |
+     *                     |_ |_ a21  a22 _| | |_0   1 _|  _|       |_ X21 X22 _|
+     *
+     *      2. In our implementation, pDst Matrix is used as identity matrix.
+     *
+     *      3. Begin with the first row. Let i = 1.
+     *
+     *      4. Check to see if the pivot for row i is zero.
+     *         The pivot is the element of the main diagonal that is on the current row.
+     *         For instance, if working with row i, then the pivot element is aii.
+     *         If the pivot is zero, exchange that row with a row below it that does not
+     *         contain a zero in column i. If this is not possible, then an inverse
+     *         to that matrix does not exist.
+     *
+     *      5. Divide every element of row i by the pivot.
+     *
+     *      6. For every row above and below row i, replace that row with the sum of that row and
+     *         a multiple of row i so that every other element in column i becomes zero.
+     *
+     *      7. Move to the next row and column and repeat steps 4 through 6 until you have zeros
+     *         for every element below and above the main diagonal.
+     *
+     *      8. Now an identity matrix is formed to the left of the bar(input matrix, src).
+     *         Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
+     *----------------------------------------------------------------------------------------------------------------*/
+
+        /*
+         * Working pointer for destination matrix
+         */
+        pOutT1 = pOut;
+        /*
+         * Loop over the number of rows
+         */
+        rowCnt = numRows;
+        /*
+         * Making the destination matrix as identity matrix
+         */
+        while (rowCnt > 0U)
+        {
+            /*
+             * Writing all zeroes in lower triangle of the destination matrix
+             */
+            j = numRows - rowCnt;
+            while (j > 0U)
+            {
+                *pOutT1++ = 0.0f16;
+                j--;
+            }
+            /*
+             * Writing all ones in the diagonal of the destination matrix
+             */
+            *pOutT1++ = 1.0f16;
+            /*
+             * Writing all zeroes in upper triangle of the destination matrix
+             */
+            j = rowCnt - 1U;
+            while (j > 0U)
+            {
+                *pOutT1++ = 0.0f16;
+                j--;
+            }
+            /*
+             * Decrement the loop counter
+             */
+            rowCnt--;
+        }
+
+        /*
+         * Loop over the number of columns of the input matrix.
+         * All the elements in each column are processed by the row operations
+         */
+        loopCnt = numCols;
+        /*
+         * Index modifier to navigate through the columns
+         */
+        l = 0U;
+        while (loopCnt > 0U)
+        {
+            /*
+             * Check if the pivot element is zero..
+             * If it is zero then interchange the row with non zero row below.
+             * If there is no non zero element to replace in the rows below,
+             * then the matrix is Singular.
+             */
+
+            /*
+             * Working pointer for the input matrix that points
+             * * to the pivot element of the particular row
+             */
+            pInT1 = pIn + (l * numCols);
+            /*
+             * Working pointer for the destination matrix that points
+             * * to the pivot element of the particular row
+             */
+            pOutT1 = pOut + (l * numCols);
+            /*
+             * Temporary variable to hold the pivot value
+             */
+            in = *pInT1;
+            /* Reset per column so an earlier row exchange cannot hide a zero pivot here */
+            flag = 0U;
+            /*
+             * Check if the pivot element is zero
+             */
+            if (*pInT1 == 0.0f16)
+            {
+                /*
+                 * Loop over the number rows present below
+                 */
+                for (i = 1U; i < numRows-l; i++)
+                {
+                    /*
+                     * Update the input and destination pointers
+                     */
+                    pInT2 = pInT1 + (numCols * i);
+                    pOutT2 = pOutT1 + (numCols * i);
+                    /*
+                     * Check if there is a non zero pivot element to
+                     * * replace in the rows below
+                     */
+                    if (*pInT2 != 0.0f16)
+                    {
+                        f16x8_t vecA, vecB;
+                        /*
+                         * Loop over number of columns
+                         * * to the right of the pivot element
+                         */
+                        pTmpA = pInT1;
+                        pTmpB = pInT2;
+                        blkCnt = (numCols - l) >> 3;
+                        while (blkCnt > 0U)
+                        {
+
+                            vecA = vldrhq_f16(pTmpA);
+                            vecB = vldrhq_f16(pTmpB);
+                            vstrhq_f16(pTmpB, vecA);
+                            vstrhq_f16(pTmpA, vecB);
+
+                            pTmpA += 8;
+                            pTmpB += 8;
+                            /*
+                             * Decrement the blockSize loop counter
+                             */
+                            blkCnt--;
+                        }
+                        /*
+                         * tail
+                         * (predicated loads/stores: never touch memory past the row)
+                         */
+                        blkCnt = (numCols - l) & 7;
+                        if (blkCnt > 0U)
+                        {
+                            mve_pred16_t p0 = vctp16q(blkCnt);
+
+                            vecA = vldrhq_z_f16(pTmpA, p0);
+                            vecB = vldrhq_z_f16(pTmpB, p0);
+                            vstrhq_p_f16(pTmpB, vecA, p0);
+                            vstrhq_p_f16(pTmpA, vecB, p0);
+                        }
+
+                        pInT1 += numCols - l;
+                        pInT2 += numCols - l;
+                        pTmpA = pOutT1;
+                        pTmpB = pOutT2;
+                        blkCnt = numCols >> 3;
+                        while (blkCnt > 0U)
+                        {
+
+                            vecA = vldrhq_f16(pTmpA);
+                            vecB = vldrhq_f16(pTmpB);
+                            vstrhq_f16(pTmpB, vecA);
+                            vstrhq_f16(pTmpA, vecB);
+                            pTmpA += 8;
+                            pTmpB += 8;
+                            /*
+                             * Decrement the blockSize loop counter
+                             */
+                            blkCnt--;
+                        }
+                        /*
+                         * tail
+                         */
+                        blkCnt = numCols & 7;
+                        if (blkCnt > 0U)
+                        {
+                            mve_pred16_t p0 = vctp16q(blkCnt);
+
+                            vecA = vldrhq_z_f16(pTmpA, p0);
+                            vecB = vldrhq_z_f16(pTmpB, p0);
+                            vstrhq_p_f16(pTmpB, vecA, p0);
+                            vstrhq_p_f16(pTmpA, vecB, p0);
+                        }
+
+                        pOutT1 += numCols;
+                        pOutT2 += numCols;
+                        /*
+                         * Flag to indicate whether exchange is done or not
+                         */
+                        flag = 1U;
+
+                        /*
+                         * Break after exchange is done
+                         */
+                        break;
+                    }
+
+                }
+            }
+
+            /*
+             * Update the status if the matrix is singular
+             */
+            if ((flag != 1U) && (in == 0.0f16))
+            {
+                return ARM_MATH_SINGULAR;
+            }
+
+            /*
+             * Points to the pivot row of input and destination matrices
+             */
+            pPivotRowIn = pIn + (l * numCols);
+            pPivotRowDst = pOut + (l * numCols);
+
+            /*
+             * Temporary pointers to the pivot row pointers
+             */
+            pInT1 = pPivotRowIn;
+            pOutT1 = pPivotRowDst;
+
+            /*
+             * Pivot element of the row
+             */
+            in = *(pIn + (l * numCols));
+
+            pTmpA = pInT1;
+
+            f16x8_t invIn = vdupq_n_f16(1.0f16 / in);
+
+            blkCnt = (numCols - l) >> 3;
+            f16x8_t vecA;
+            while (blkCnt > 0U)
+            {
+                vstrhq_f16(pTmpA, vldrhq_f16(pTmpA) * invIn);
+                pTmpA += 8;
+                /*
+                 * Decrement the blockSize loop counter
+                 */
+                blkCnt--;
+            }
+            /*
+             * tail
+             */
+            blkCnt = (numCols - l) & 7;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp16q(blkCnt);
+
+
+                vecA = vldrhq_z_f16(pTmpA, p0);
+                vecA = vecA * invIn;
+                vstrhq_p_f16(pTmpA, vecA, p0);
+            }
+
+            pInT1 += numCols - l;
+            /*
+             * Loop over number of columns
+             * * of the destination matrix pivot row
+             */
+
+            pTmpA = pOutT1;
+            blkCnt = numCols >> 3;
+            while (blkCnt > 0U)
+            {
+                vstrhq_f16(pTmpA, vldrhq_f16(pTmpA) * invIn);
+                pTmpA += 8;
+                /*
+                 * Decrement the blockSize loop counter
+                 */
+                blkCnt--;
+            }
+            /*
+             * tail
+             * (predicated loads/stores: never touch memory past the row)
+             */
+            blkCnt = numCols & 7;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp16q(blkCnt);
+
+                vecA = vldrhq_z_f16(pTmpA, p0);
+                vecA = vecA * invIn;
+                vstrhq_p_f16(pTmpA, vecA, p0);
+            }
+
+            pOutT1 += numCols;
+
+            /*
+             * Replace the rows with the sum of that row and a multiple of row i
+             * * so that each new element in column i above and below row i is zero.
+             */
+
+            /*
+             * Temporary pointers for input and destination matrices
+             */
+            pInT1 = pIn;
+            pOutT1 = pOut;
+
+            for (i = 0U; i < numRows; i++)
+            {
+                /*
+                 * Check for the pivot element
+                 */
+                if (i == l)
+                {
+                    /*
+                     * The pivot row itself is left untouched:
+                     * just step both working pointers over it
+                     */
+                    pInT1 += numCols - l;
+                    pOutT1 += numCols;
+                }
+                else
+                {
+                    /*
+                     * Element of the reference row
+                     */
+
+                    /*
+                     * Working pointers for input and destination pivot rows
+                     */
+                    pPRT_in = pPivotRowIn;
+                    pPRT_pDst = pPivotRowDst;
+                    /*
+                     * Loop over the number of columns to the right of the pivot element,
+                     * to replace the elements in the input matrix
+                     */
+
+                    in = *pInT1;
+                    f16x8_t tmpV = vdupq_n_f16(in);
+
+                    blkCnt = (numCols - l) >> 3;
+                    while (blkCnt > 0U)
+                    {
+                        f16x8_t vec1, vec2;
+                        /*
+                         * Replace the element by the sum of that row
+                         * and a multiple of the reference row
+                         */
+                        vec1 = vldrhq_f16(pInT1);
+                        vec2 = vldrhq_f16(pPRT_in);
+                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
+                        vstrhq_f16(pInT1, vec1);
+                        pPRT_in += 8;
+                        pInT1 += 8;
+                        /*
+                         * Decrement the blockSize loop counter
+                         */
+                        blkCnt--;
+                    }
+                    /*
+                     * tail
+                     * (predicated loads/stores: never touch memory past the row)
+                     */
+                    blkCnt = (numCols - l) & 7;
+                    if (blkCnt > 0U)
+                    {
+                        f16x8_t vec1, vec2;
+                        mve_pred16_t p0 = vctp16q(blkCnt);
+
+                        vec1 = vldrhq_z_f16(pInT1, p0);
+                        vec2 = vldrhq_z_f16(pPRT_in, p0);
+                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
+                        vstrhq_p_f16(pInT1, vec1, p0);
+                        pInT1 += blkCnt;
+                    }
+
+                    blkCnt = numCols >> 3;
+                    while (blkCnt > 0U)
+                    {
+                        f16x8_t vec1, vec2;
+
+                        /*
+                         * Replace the element by the sum of that row
+                         * and a multiple of the reference row
+                         */
+                        vec1 = vldrhq_f16(pOutT1);
+                        vec2 = vldrhq_f16(pPRT_pDst);
+                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
+                        vstrhq_f16(pOutT1, vec1);
+                        pPRT_pDst += 8;
+                        pOutT1 += 8;
+                        /*
+                         * Decrement the blockSize loop counter
+                         */
+                        blkCnt--;
+                    }
+                    /*
+                     * tail
+                     * (predicated loads/stores: never touch memory past the row)
+                     */
+                    blkCnt = numCols & 7;
+                    if (blkCnt > 0U)
+                    {
+                        f16x8_t vec1, vec2;
+                        mve_pred16_t p0 = vctp16q(blkCnt);
+
+                        vec1 = vldrhq_z_f16(pOutT1, p0);
+                        vec2 = vldrhq_z_f16(pPRT_pDst, p0);
+                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
+                        vstrhq_p_f16(pOutT1, vec1, p0);
+
+                        /* Step past the tail elements just written */
+                        pOutT1 += blkCnt;
+                    }
+                }
+                /*
+                 * Increment the temporary input pointer
+                 */
+                pInT1 = pInT1 + l;
+            }
+            /*
+             * Increment the input pointer
+             */
+            pIn++;
+            /*
+             * Decrement the loop counter
+             */
+            loopCnt--;
+            /*
+             * Increment the index modifier
+             */
+            l++;
+        }
+
+        /*
+         * Set status as ARM_MATH_SUCCESS
+         */
+        status = ARM_MATH_SUCCESS;
+
+        if ((flag != 1U) && (in == 0.0f16))
+        {
+            pIn = pSrc->pData;
+            for (i = 0; i < numRows * numCols; i++)
+            {
+                if (pIn[i] != 0.0f16)
+                    break;
+            }
+
+            if (i == numRows * numCols)
+                status = ARM_MATH_SINGULAR;
+        }
+  }
+  /* Return to application */
+  return (status);
+}
+
+#else
+
+arm_status arm_mat_inverse_f16(
+  const arm_matrix_instance_f16 * pSrc,
+        arm_matrix_instance_f16 * pDst)
+{
+  float16_t *pIn = pSrc->pData;                  /* input data matrix pointer */
+  float16_t *pOut = pDst->pData;                 /* output data matrix pointer */
+  float16_t *pInT1, *pInT2;                      /* Temporary input data matrix pointer */
+  float16_t *pOutT1, *pOutT2;                    /* Temporary output data matrix pointer */
+  float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;  /* Temporary input and output data matrix pointer */
+  uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
+  uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */
+
+  _Float16 Xchg, in = 0.0f16, in1;                /* Temporary input values  */
+  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l;      /* loop counters */
+  arm_status status;                             /* status of matrix inverse */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pDst->numRows != pDst->numCols) ||
+      (pSrc->numRows != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+
+    /*--------------------------------------------------------------------------------------------------------------
+     * Matrix Inverse can be solved using elementary row operations.
+     *
+     *  Gauss-Jordan Method:
+     *
+     *      1. First combine the identity matrix and the input matrix separated by a bar to form an
+     *        augmented matrix as follows:
+     *                      _                  _         _         _
+     *                     |  a11  a12 | 1   0  |       |  X11 X12  |
+     *                     |           |        |   =   |           |
+     *                     |_ a21  a22 | 0   1 _|       |_ X21 X22 _|
+     *
+     *      2. In our implementation, pDst Matrix is used as identity matrix.
+     *
+     *      3. Begin with the first row. Let i = 1.
+     *
+     *      4. Check to see if the pivot for row i is zero.
+     *         The pivot is the element of the main diagonal that is on the current row.
+     *         For instance, if working with row i, then the pivot element is aii.
+     *         If the pivot is zero, exchange that row with a row below it that does not
+     *         contain a zero in column i. If this is not possible, then an inverse
+     *         to that matrix does not exist.
+     *
+     *      5. Divide every element of row i by the pivot.
+     *
+     *      6. For every row above and below row i, replace that row with the sum of that row and
+     *         a multiple of row i so that every other element in column i becomes zero.
+     *
+     *      7. Move to the next row and column and repeat steps 4 through 6 until you have zeros
+     *         for every element below and above the main diagonal.
+     *
+     *      8. Now an identity matrix is formed to the left of the bar(input matrix, pSrc).
+     *         Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
+     *----------------------------------------------------------------------------------------------------------------*/
+
+    /* Working pointer for destination matrix */
+    pOutT1 = pOut;
+
+    /* Loop over the number of rows */
+    rowCnt = numRows;
+
+    /* Making the destination matrix as identity matrix */
+    while (rowCnt > 0U)
+    {
+      /* Writing all zeroes in lower triangle of the destination matrix */
+      j = numRows - rowCnt;
+      while (j > 0U)
+      {
+        *pOutT1++ = 0.0f16;
+        j--;
+      }
+
+      /* Writing all ones in the diagonal of the destination matrix */
+      *pOutT1++ = 1.0f16;
+
+      /* Writing all zeroes in upper triangle of the destination matrix */
+      j = rowCnt - 1U;
+      while (j > 0U)
+      {
+        *pOutT1++ = 0.0f16;
+        j--;
+      }
+
+      /* Decrement loop counter */
+      rowCnt--;
+    }
+
+    /* Loop over the number of columns of the input matrix.
+       All the elements in each column are processed by the row operations */
+    loopCnt = numCols;
+
+    /* Index modifier to navigate through the columns */
+    l = 0U;
+
+    while (loopCnt > 0U)
+    {
+      /* Check if the pivot element is zero..
+       * If it is zero then interchange the row with non zero row below.
+       * If there is no non zero element to replace in the rows below,
+       * then the matrix is Singular. */
+
+      /* Working pointer for the input matrix that points
+       * to the pivot element of the particular row  */
+      pInT1 = pIn + (l * numCols);
+
+      /* Working pointer for the destination matrix that points
+       * to the pivot element of the particular row  */
+      pOutT1 = pOut + (l * numCols);
+
+      /* Temporary variable to hold the pivot value */
+      in = *pInT1;
+      /* Reset per column so an earlier row exchange cannot hide a zero pivot here */
+      flag = 0U;
+      /* Check if the pivot element is zero */
+      if (*pInT1 == 0.0f16)
+      {
+        /* Loop over the number rows present below */
+
+        for (i = 1U; i < numRows-l; i++)
+        {
+          /* Update the input and destination pointers */
+          pInT2 = pInT1 + (numCols * i);
+          pOutT2 = pOutT1 + (numCols * i);
+
+          /* Check if there is a non zero pivot element to
+           * replace in the rows below */
+          if (*pInT2 != 0.0f16)
+          {
+            /* Loop over number of columns
+             * to the right of the pivot element */
+            j = numCols - l;
+
+            while (j > 0U)
+            {
+              /* Exchange the row elements of the input matrix */
+              Xchg = *pInT2;
+              *pInT2++ = *pInT1;
+              *pInT1++ = Xchg;
+
+              /* Decrement the loop counter */
+              j--;
+            }
+
+            /* Loop over number of columns of the destination matrix */
+            j = numCols;
+
+            while (j > 0U)
+            {
+              /* Exchange the row elements of the destination matrix */
+              Xchg = *pOutT2;
+              *pOutT2++ = *pOutT1;
+              *pOutT1++ = Xchg;
+
+              /* Decrement loop counter */
+              j--;
+            }
+
+            /* Flag to indicate whether exchange is done or not */
+            flag = 1U;
+
+            /* Break after exchange is done */
+            break;
+          }
+
+        }
+      }
+
+      /* Update the status if the matrix is singular */
+      if ((flag != 1U) && (in == 0.0f16))
+      {
+        return ARM_MATH_SINGULAR;
+      }
+
+      /* Points to the pivot row of input and destination matrices */
+      pPivotRowIn = pIn + (l * numCols);
+      pPivotRowDst = pOut + (l * numCols);
+
+      /* Temporary pointers to the pivot row pointers */
+      pInT1 = pPivotRowIn;
+      pInT2 = pPivotRowDst;
+
+      /* Pivot element of the row */
+      in = *pPivotRowIn;
+
+      /* Loop over number of columns
+       * to the right of the pivot element */
+      j = (numCols - l);
+
+      while (j > 0U)
+      {
+        /* Divide each element of the row of the input matrix
+         * by the pivot element */
+        in1 = *pInT1;
+        *pInT1++ = in1 / in;
+
+        /* Decrement the loop counter */
+        j--;
+      }
+
+      /* Loop over number of columns of the destination matrix */
+      j = numCols;
+
+      while (j > 0U)
+      {
+        /* Divide each element of the row of the destination matrix
+         * by the pivot element */
+        in1 = *pInT2;
+        *pInT2++ = in1 / in;
+
+        /* Decrement the loop counter */
+        j--;
+      }
+
+      /* Replace the rows with the sum of that row and a multiple of row i
+       * so that each new element in column i above and below row i is zero.*/
+
+      /* Temporary pointers for input and destination matrices */
+      pInT1 = pIn;
+      pInT2 = pOut;
+
+      /* index used to check for pivot element */
+      i = 0U;
+
+      /* Loop over number of rows */
+      /*  to be replaced by the sum of that row and a multiple of row i */
+      k = numRows;
+
+      while (k > 0U)
+      {
+        /* Check for the pivot element */
+        if (i == l)
+        {
+          /* The pivot row itself is left untouched:
+             just step both working pointers over it */
+          pInT1 += numCols - l;
+
+          pInT2 += numCols;
+        }
+        else
+        {
+          /* Element of the reference row */
+          in = *pInT1;
+
+          /* Working pointers for input and destination pivot rows */
+          pPRT_in = pPivotRowIn;
+          pPRT_pDst = pPivotRowDst;
+
+          /* Loop over the number of columns to the right of the pivot element,
+             to replace the elements in the input matrix */
+          j = (numCols - l);
+
+          while (j > 0U)
+          {
+            /* Replace the element by the sum of that row
+               and a multiple of the reference row  */
+            in1 = *pInT1;
+            *pInT1++ = in1 - (in * *pPRT_in++);
+
+            /* Decrement the loop counter */
+            j--;
+          }
+
+          /* Loop over the number of columns to
+             replace the elements in the destination matrix */
+          j = numCols;
+
+          while (j > 0U)
+          {
+            /* Replace the element by the sum of that row
+               and a multiple of the reference row  */
+            in1 = *pInT2;
+            *pInT2++ = in1 - (in * *pPRT_pDst++);
+
+            /* Decrement loop counter */
+            j--;
+          }
+
+        }
+
+        /* Increment temporary input pointer */
+        pInT1 = pInT1 + l;
+
+        /* Decrement loop counter */
+        k--;
+
+        /* Increment pivot index */
+        i++;
+      }
+
+      /* Increment the input pointer */
+      pIn++;
+
+      /* Decrement the loop counter */
+      loopCnt--;
+
+      /* Increment the index modifier */
+      l++;
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+
+    if ((flag != 1U) && (in == 0.0f16))
+    {
+      pIn = pSrc->pData;
+      for (i = 0; i < numRows * numCols; i++)
+      {
+        if (pIn[i] != 0.0f16)
+            break;
+      }
+
+      if (i == numRows * numCols)
+        status = ARM_MATH_SINGULAR;
+    }
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of MatrixInv group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c
index ac51945271f6d6fd54bc55b40d3ffdfc28d75d5f..b0ef7608efc5a832c7650e98f57382226914ce65 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_inverse_f32.c
  * Description:  Floating-point matrix inverse
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,8 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
+
 
 /**
   @ingroup groupMatrix
@@ -84,7 +85,7 @@ arm_status arm_mat_inverse_f32(
     float32_t *pTmpA, *pTmpB;
 
     float32_t in = 0.0f;        /* Temporary input values  */
-    uint32_t  i, rowCnt, flag = 0U, j, loopCnt, k, l;   /* loop counters */
+    uint32_t  i, rowCnt, flag = 0U, j, loopCnt, l;   /* loop counters */
     arm_status status;          /* status of matrix inverse */
     uint32_t  blkCnt;
 
@@ -208,10 +209,7 @@ arm_status arm_mat_inverse_f32(
              * Temporary variable to hold the pivot value
              */
             in = *pInT1;
-            /*
-             * Destination pointer modifier
-             */
-            k = 1U;
+           
 
             /*
              * Check if the pivot element is zero
@@ -221,13 +219,13 @@ arm_status arm_mat_inverse_f32(
                 /*
                  * Loop over the number rows present below
                  */
-                for (i = (l + 1U); i < numRows; i++)
+                for (i = 1U; i < numRows-l; i++)
                 {
                     /*
                      * Update the input and destination pointers
                      */
                     pInT2 = pInT1 + (numCols * i);
-                    pOutT2 = pOutT1 + (numCols * k);
+                    pOutT2 = pOutT1 + (numCols * i);
                     /*
                      * Check if there is a non zero pivot element to
                      * * replace in the rows below
@@ -317,10 +315,7 @@ arm_status arm_mat_inverse_f32(
                          */
                         break;
                     }
-                    /*
-                     * Update the destination pointer modifier
-                     */
-                    k++;
+                    
                 }
             }
 
@@ -699,19 +694,15 @@ arm_status arm_mat_inverse_f32(
       /* Temporary variable to hold the pivot value */
       in = *pInT1;
 
-    
-      /* Destination pointer modifier */
-      k = 1U;
-
       /* Check if the pivot element is zero */
       if (*pInT1 == 0.0f)
       {
         /* Loop over the number rows present below */
-        for (i = (l + 1U); i < numRows; i++)
+        for (i = 1U; i < numRows - l; i++)
         {
           /* Update the input and destination pointers */
           pInT2 = pInT1 + (numCols * i);
-          pOutT2 = pOutT1 + (numCols * k);
+          pOutT2 = pOutT1 + (numCols * i);
 
           /* Check if there is a non zero pivot element to
            * replace in the rows below */
@@ -753,8 +744,7 @@ arm_status arm_mat_inverse_f32(
             break;
           }
 
-          /* Update the destination pointer modifier */
-          k++;
+         
         }
       }
 
@@ -997,7 +987,7 @@ arm_status arm_mat_inverse_f32(
 #if defined (ARM_MATH_DSP)
 
   float32_t Xchg, in = 0.0f, in1;                /* Temporary input values  */
-  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l;      /* loop counters */
+  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l;      /* loop counters */
   arm_status status;                             /* status of matrix inverse */
 
 #ifdef ARM_MATH_MATRIX_CHECK
@@ -1108,20 +1098,18 @@ arm_status arm_mat_inverse_f32(
       /* Temporary variable to hold the pivot value */
       in = *pInT1;
 
-      
-      /* Destination pointer modifier */
-      k = 1U;
+    
 
       /* Check if the pivot element is zero */
       if (*pInT1 == 0.0f)
       {
         /* Loop over the number rows present below */
 
-        for (i = (l + 1U); i < numRows; i++)
+        for (i = 1U; i < numRows - l; i++)
         {
           /* Update the input and destination pointers */
           pInT2 = pInT1 + (numCols * i);
-          pOutT2 = pOutT1 + (numCols * k);
+          pOutT2 = pOutT1 + (numCols * i);
 
           /* Check if there is a non zero pivot element to
            * replace in the rows below */
@@ -1163,8 +1151,6 @@ arm_status arm_mat_inverse_f32(
             break;
           }
 
-          /* Update the destination pointer modifier */
-          k++;
 
           /* Decrement loop counter */
         }
@@ -1306,7 +1292,7 @@ arm_status arm_mat_inverse_f32(
 #else
 
   float32_t Xchg, in = 0.0f;                     /* Temporary input values  */
-  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l;      /* loop counters */
+  uint32_t i, rowCnt, flag = 0U, j, loopCnt, l;      /* loop counters */
   arm_status status;                             /* status of matrix inverse */
 
 #ifdef ARM_MATH_MATRIX_CHECK
@@ -1417,18 +1403,15 @@ arm_status arm_mat_inverse_f32(
       /* Temporary variable to hold the pivot value */
       in = *pInT1;
 
-      /* Destination pointer modifier */
-      k = 1U;
-
       /* Check if the pivot element is zero */
       if (*pInT1 == 0.0f)
       {
         /* Loop over the number rows present below */
-        for (i = (l + 1U); i < numRows; i++)
+        for (i = 1U; i < numRows-l; i++)
         {
           /* Update the input and destination pointers */
           pInT2 = pInT1 + (numCols * i);
-          pOutT2 = pOutT1 + (numCols * k);
+          pOutT2 = pOutT1 + (numCols * i);
 
           /* Check if there is a non zero pivot element to
            * replace in the rows below */
@@ -1457,12 +1440,10 @@ arm_status arm_mat_inverse_f32(
             /* Break after exchange is done */
             break;
           }
-
-          /* Update the destination pointer modifier */
-          k++;
         }
       }
 
+
       /* Update the status if the matrix is singular */
       if ((flag != 1U) && (in == 0.0f))
       {
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c
index b9b930994224b2e2d905b509261893aa1e868951..bf99a7920d3de913f6c9bc7d3ba031b0d197e0e2 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_inverse_f64.c
  * Description:  Floating-point matrix inverse
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -63,7 +63,7 @@ arm_status arm_mat_inverse_f64(
 #if defined (ARM_MATH_DSP)
 
   float64_t Xchg, in = 0.0, in1;                /* Temporary input values  */
-  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l;      /* loop counters */
+  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l;      /* loop counters */
   arm_status status;                             /* status of matrix inverse */
 
 #ifdef ARM_MATH_MATRIX_CHECK
@@ -174,18 +174,18 @@ arm_status arm_mat_inverse_f64(
       /* Temporary variable to hold the pivot value */
       in = *pInT1;
 
-      /* Destination pointer modifier */
-      k = 1U;
+    
 
       /* Check if the pivot element is zero */
       if (*pInT1 == 0.0)
       {
         /* Loop over the number rows present below */
-        for (i = (l + 1U); i < numRows; i++)
+
+        for (i = 1U; i < numRows - l; i++)
         {
           /* Update the input and destination pointers */
           pInT2 = pInT1 + (numCols * i);
-          pOutT2 = pOutT1 + (numCols * k);
+          pOutT2 = pOutT1 + (numCols * i);
 
           /* Check if there is a non zero pivot element to
            * replace in the rows below */
@@ -227,11 +227,8 @@ arm_status arm_mat_inverse_f64(
             break;
           }
 
-          /* Update the destination pointer modifier */
-          k++;
 
           /* Decrement loop counter */
-          i--;
         }
       }
 
@@ -371,7 +368,7 @@ arm_status arm_mat_inverse_f64(
 #else
 
   float64_t Xchg, in = 0.0;                     /* Temporary input values  */
-  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l;      /* loop counters */
+  uint32_t i, rowCnt, flag = 0U, j, loopCnt, l;      /* loop counters */
   arm_status status;                             /* status of matrix inverse */
 
 #ifdef ARM_MATH_MATRIX_CHECK
@@ -482,18 +479,15 @@ arm_status arm_mat_inverse_f64(
       /* Temporary variable to hold the pivot value */
       in = *pInT1;
 
-      /* Destination pointer modifier */
-      k = 1U;
-
       /* Check if the pivot element is zero */
       if (*pInT1 == 0.0)
       {
         /* Loop over the number rows present below */
-        for (i = (l + 1U); i < numRows; i++)
+        for (i = 1U; i < numRows-l; i++)
         {
           /* Update the input and destination pointers */
           pInT2 = pInT1 + (numCols * i);
-          pOutT2 = pOutT1 + (numCols * k);
+          pOutT2 = pOutT1 + (numCols * i);
 
           /* Check if there is a non zero pivot element to
            * replace in the rows below */
@@ -522,12 +516,10 @@ arm_status arm_mat_inverse_f64(
             /* Break after exchange is done */
             break;
           }
-
-          /* Update the destination pointer modifier */
-          k++;
         }
       }
 
+
       /* Update the status if the matrix is singular */
       if ((flag != 1U) && (in == 0.0))
       {
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..bcca8308b6076a1ea4a367c18f7d96758442c103
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f32.c
@@ -0,0 +1,500 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_ldlt_f32.c
+ * Description:  Floating-point LDL decomposition
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+
+
+
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+
+/// @private Swap rows i and j of the row-major n x n matrix A (`n` comes from the enclosing scope).
+#define SWAP_ROWS_F32(A,i,j)                 \
+  {                                      \
+    int cnt = n;                         \
+    /* cnt: elements still to swap */    \
+    for(int w=0;w < n; w+=4)             \
+    {                                    \
+       f32x4_t tmpa,tmpb;                \
+       mve_pred16_t p0 = vctp32q(cnt);   \
+       /* p0 enables the first cnt lanes */ \
+       tmpa=vldrwq_z_f32(&(A)[(i)*n + w],p0);\
+       tmpb=vldrwq_z_f32(&(A)[(j)*n + w],p0);\
+                                         \
+       vstrwq_p(&(A)[(i)*n + w], tmpb, p0);  \
+       vstrwq_p(&(A)[(j)*n + w], tmpa, p0);  \
+                                         \
+       cnt -= 4;                         \
+    }                                    \
+  }
+
+/// @private Swap columns i and j of the row-major n x n matrix A (`n` comes from the enclosing scope).
+#define SWAP_COLS_F32(A,i,j)     \
+  for(int w=0;w < n; w++)    \
+  {                          \
+     float32_t tmp;          \
+     tmp = (A)[w*n + (i)];       \
+     (A)[w*n + (i)] = (A)[w*n + (j)];\
+     (A)[w*n + (j)] = tmp;       \
+  }
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @addtogroup MatrixChol
+  @{
+ */
+
+/**
+   * @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
+   * @param[in]  pSrc   points to the instance of the input floating-point matrix structure (square, order n).
+   * @param[out] pl   points to the instance of the output floating-point triangular matrix structure (same order as the input).
+   * @param[out] pd   points to the instance of the output floating-point diagonal matrix structure (only diagonal entries are written).
+   * @param[out] pp   points to an array of n uint16_t pivot indices: pp[k] is the index swapped with k at step k.
+   * @note  The dimension check is only compiled in when ARM_MATH_MATRIX_CHECK is defined.
+   * @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+                   - \ref ARM_MATH_DECOMPOSITION_FAILURE      : Input matrix cannot be decomposed (never produced by the code below)
+   * @par
+   *  Computes the LDL^t decomposition of a matrix A such that P A P^t = L D L^t.
+   */
+arm_status arm_mat_ldlt_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  arm_matrix_instance_f32 * pl,
+  arm_matrix_instance_f32 * pd,
+  uint16_t * pp)
+{
+
+  arm_status status;                             /* status of the decomposition */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition: all three matrices must be square and of the same order */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pl->numRows != pl->numCols) ||
+      (pd->numRows != pd->numCols) ||
+      (pl->numRows != pd->numRows) || (pl->numRows != pSrc->numRows))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+
+    const int n=pSrc->numRows;
+    int fullRank = 1, diag,k;
+    float32_t *pA;
+    /* Factorize in place inside pl, starting from a copy of the input */
+    memcpy(pl->pData,pSrc->pData,n*n*sizeof(float32_t));
+    pA = pl->pData;
+    /* Fill pp with 0..n-1, eight indices per iteration */
+    int cnt = n;
+    uint16x8_t vecP;
+
+    for(int k=0;k < n; k+=8)
+    {
+      mve_pred16_t p0;
+      p0 = vctp16q(cnt);
+
+      vecP = vidupq_u16((uint16_t)k, 1);
+
+      vstrhq_p(&pp[k], vecP, p0);
+
+      cnt -= 8;
+    }
+
+    /* Elimination: one pivot per iteration of k */
+    for(k=0;k < n; k++)
+    {
+        /* Find pivot */
+        float32_t m=F32_MIN,a;
+        int j=k; 
+
+        /* j = index of the largest remaining diagonal entry (assumes F32_MIN is the lowest finite float - TODO confirm) */
+        for(int r=k;r<n;r++)
+        {
+           if (pA[r*n+r] > m)
+           {
+             m = pA[r*n+r];
+             j = r;
+           }
+        }
+        /* Symmetric permutation: swap both rows and columns k and j */
+        if(j != k)
+        {
+          SWAP_ROWS_F32(pA,k,j);
+          SWAP_COLS_F32(pA,k,j);
+        }
+
+        /* Record the pivot index chosen at this step */
+        pp[k] = j;
+
+        a = pA[k*n+k];
+        /* Pivot too small: stop the elimination and flag the matrix as not full rank */
+        if (fabs(a) < 1.0e-8)
+        {
+
+            fullRank = 0;
+            break;
+        }
+
+        float32_t invA;
+
+        invA = 1.0f / a;
+        /* vecOffs = {0, n, 2n, 3n}: element offsets used to gather four consecutive rows of one column */
+        int32x4_t vecOffs;
+        int w;
+        vecOffs = vidupq_u32((uint32_t)0, 1);
+        vecOffs = vmulq_n_s32(vecOffs,n);
+        /* Update four complete rows at a time; the bound keeps row w+3 inside the matrix, leftover rows go to the next loop */
+        for(w=k+1; w+3<n; w+=4)
+        {
+          int cnt = n - k - 1;
+
+          f32x4_t vecX;
+
+          f32x4_t vecA;
+          f32x4_t vecW0,vecW1, vecW2, vecW3;
+
+          mve_pred16_t p0;
+          /* Column-k entries of rows w..w+3, broadcast to every lane */
+          vecW0 = vdupq_n_f32(pA[(w + 0)*n+k]);
+          vecW1 = vdupq_n_f32(pA[(w + 1)*n+k]);
+          vecW2 = vdupq_n_f32(pA[(w + 2)*n+k]);
+          vecW3 = vdupq_n_f32(pA[(w + 3)*n+k]);
+
+          for(int x=k+1;x<n;x += 4)
+          {
+             p0 = vctp32q(cnt);
+
+             //pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * (pA[x*n+k] * invA);
+
+             /* vecX = column-k entries of rows x..x+3, scaled by invA */
+             vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], vecOffs, p0);
+             vecX = vmulq_m_n_f32(vuninitializedq_f32(),vecX,invA,p0);
+
+
+             vecA = vldrwq_z_f32(&pA[(w + 0)*n+x],p0);
+             vecA = vfmsq_m(vecA, vecW0, vecX, p0);
+             vstrwq_p(&pA[(w + 0)*n+x], vecA, p0);  
+
+             vecA = vldrwq_z_f32(&pA[(w + 1)*n+x],p0);
+             vecA = vfmsq_m(vecA, vecW1, vecX, p0);
+             vstrwq_p(&pA[(w + 1)*n+x], vecA, p0);  
+
+             vecA = vldrwq_z_f32(&pA[(w + 2)*n+x],p0);
+             vecA = vfmsq_m(vecA, vecW2, vecX, p0);
+             vstrwq_p(&pA[(w + 2)*n+x], vecA, p0);  
+
+             vecA = vldrwq_z_f32(&pA[(w + 3)*n+x],p0);
+             vecA = vfmsq_m(vecA, vecW3, vecX, p0);
+             vstrwq_p(&pA[(w + 3)*n+x], vecA, p0);  
+
+             cnt -= 4;
+          }
+        }
+        /* Remaining 0..3 rows, one row at a time */
+        for(; w<n; w++)
+        {
+          int cnt = n - k - 1;
+
+          f32x4_t vecA,vecX,vecW;
+
+
+          mve_pred16_t p0;
+
+          vecW = vdupq_n_f32(pA[w*n+k]);
+
+          for(int x=k+1;x<n;x += 4)
+          {
+             p0 = vctp32q(cnt);
+
+             //pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * (pA[x*n+k] * invA);
+
+             vecA = vldrwq_z_f32(&pA[w*n+x],p0);
+
+             vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], vecOffs, p0);
+             vecX = vmulq_m_n_f32(vuninitializedq_f32(),vecX,invA,p0);
+
+             vecA = vfmsq_m(vecA, vecW, vecX, p0);
+
+             vstrwq_p(&pA[w*n+x], vecA, p0);  
+
+             cnt -= 4;
+          }
+        }
+        /* Scale column k below the pivot by 1/pivot */
+        for(int w=k+1;w<n;w++)
+        {
+               pA[w*n+k] = pA[w*n+k] * invA;
+        }
+
+
+
+    }
+
+    /* NOTE(review): on an early exit at step k, diag becomes k-1, so diagonal entry k-1 is not moved */
+    /* into pd and keeps its pivot value in pl - confirm this is the intended behaviour.              */
+    diag=k;
+    if (!fullRank)
+    {
+      diag--;
+      for(int row=0; row < n;row++)
+      {
+        mve_pred16_t p0; 
+        int cnt= n-k;
+        f32x4_t zero=vdupq_n_f32(0.0f);
+
+        for(int col=k; col < n;col += 4)
+        {
+           p0 = vctp32q(cnt);
+
+           vstrwq_p(&pl->pData[row*n+col], zero, p0);  
+
+           cnt -= 4;
+        }
+      }
+    }
+    /* Clear the strict upper triangle of pl */
+    for(int row=0; row < n;row++)
+    {
+       mve_pred16_t p0; 
+       int cnt= n-row-1;
+       f32x4_t zero=vdupq_n_f32(0.0f);
+
+       for(int col=row+1; col < n;col+=4)
+       {
+         p0 = vctp32q(cnt);
+
+         vstrwq_p(&pl->pData[row*n+col], zero, p0);  
+
+         cnt -= 4;
+       }
+    }
+    /* Move the first diag diagonal entries into pd and put ones on pl's diagonal */
+    for(int d=0; d < diag;d++)
+    {
+      pd->pData[d*n+d] = pl->pData[d*n+d];
+      pl->pData[d*n+d] = 1.0;
+    }
+
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+
+  /* Return to application */
+  return (status);
+}
+#else
+
+/// @private Swap rows i and j of the row-major n x n matrix A (`n` comes from the enclosing scope).
+#define SWAP_ROWS_F32(A,i,j)     \
+  for(int w=0;w < n; w++)    \
+  {                          \
+     float32_t tmp;          \
+     tmp = (A)[(i)*n + w];       \
+     (A)[(i)*n + w] = (A)[(j)*n + w];\
+     (A)[(j)*n + w] = tmp;       \
+  }
+
+/// @private Swap columns i and j of the row-major n x n matrix A (`n` comes from the enclosing scope).
+#define SWAP_COLS_F32(A,i,j)     \
+  for(int w=0;w < n; w++)    \
+  {                          \
+     float32_t tmp;          \
+     tmp = (A)[w*n + (i)];       \
+     (A)[w*n + (i)] = (A)[w*n + (j)];\
+     (A)[w*n + (j)] = tmp;       \
+  }
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @addtogroup MatrixChol
+  @{
+ */
+  
+/**
+   * @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
+   * @param[in]  pSrc   points to the instance of the input floating-point matrix structure (square, order n).
+   * @param[out] pl   points to the instance of the output floating-point triangular matrix structure (same order as the input).
+   * @param[out] pd   points to the instance of the output floating-point diagonal matrix structure (only diagonal entries are written).
+   * @param[out] pp   points to an array of n uint16_t pivot indices: pp[k] is the index swapped with k at step k.
+   * @note  The dimension check is only compiled in when ARM_MATH_MATRIX_CHECK is defined.
+   * @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+                   - \ref ARM_MATH_DECOMPOSITION_FAILURE      : Input matrix cannot be decomposed (never produced by the code below)
+   * @par
+   *  Computes the LDL^t decomposition of a matrix A such that P A P^t = L D L^t.
+   */
+arm_status arm_mat_ldlt_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  arm_matrix_instance_f32 * pl,
+  arm_matrix_instance_f32 * pd,
+  uint16_t * pp)
+{
+
+  arm_status status;                             /* status of the decomposition */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition: all three matrices must be square and of the same order */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pl->numRows != pl->numCols) ||
+      (pd->numRows != pd->numCols) ||
+      (pl->numRows != pd->numRows) || (pl->numRows != pSrc->numRows))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+
+    const int n=pSrc->numRows;
+    int fullRank = 1, diag,k;
+    float32_t *pA;
+    /* Factorize in place inside pl, starting from a copy of the input */
+    memcpy(pl->pData,pSrc->pData,n*n*sizeof(float32_t));
+    pA = pl->pData;
+    /* Fill pp with 0..n-1 */
+    for(int k=0;k < n; k++)
+    {
+      pp[k] = k;
+    }
+
+    /* Elimination: one pivot per iteration of k */
+    for(k=0;k < n; k++)
+    {
+        /* Find pivot */
+        float32_t m=F32_MIN,a;
+        int j=k; 
+
+        /* j = index of the largest remaining diagonal entry (assumes F32_MIN is the lowest finite float - TODO confirm) */
+        for(int r=k;r<n;r++)
+        {
+           if (pA[r*n+r] > m)
+           {
+             m = pA[r*n+r];
+             j = r;
+           }
+        }
+        /* Symmetric permutation: swap both rows and columns k and j */
+        if(j != k)
+        {
+          SWAP_ROWS_F32(pA,k,j);
+          SWAP_COLS_F32(pA,k,j);
+        }
+
+        /* Record the pivot index chosen at this step */
+        pp[k] = j;
+
+        a = pA[k*n+k];
+        /* Pivot too small: stop the elimination and flag the matrix as not full rank */
+        if (fabs(a) < 1.0e-8)
+        {
+
+            fullRank = 0;
+            break;
+        }
+        /* Subtract the pivot's contribution from the trailing block (rows and columns k+1..n-1) */
+        for(int w=k+1;w<n;w++)
+        {
+          for(int x=k+1;x<n;x++)
+          {
+             pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * pA[x*n+k] / a;
+          }
+        }
+        /* Scale column k below the pivot by 1/pivot */
+        for(int w=k+1;w<n;w++)
+        {
+               pA[w*n+k] = pA[w*n+k] / a;
+        }
+
+
+
+    }
+
+    /* NOTE(review): on an early exit at step k, diag becomes k-1, so diagonal entry k-1 is not moved */
+    /* into pd and keeps its pivot value in pl - confirm this is the intended behaviour.              */
+    diag=k;
+    if (!fullRank)   /* early exit at step k: clear columns k..n-1 of every row */
+    {
+      diag--;
+      for(int row=0; row < n;row++)
+      {
+        for(int col=k; col < n;col++)
+        {
+           pl->pData[row*n+col]=0.0;
+        }
+      }
+    }
+    /* Clear the strict upper triangle of pl */
+    for(int row=0; row < n;row++)
+    {
+       for(int col=row+1; col < n;col++)
+       {
+         pl->pData[row*n+col] = 0.0;
+       }
+    }
+    /* Move the first diag diagonal entries into pd and put ones on pl's diagonal */
+    for(int d=0; d < diag;d++)
+    {
+      pd->pData[d*n+d] = pl->pData[d*n+d];
+      pl->pData[d*n+d] = 1.0;
+    }
+
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of MatrixChol group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f64.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f64.c
new file mode 100644
index 0000000000000000000000000000000000000000..e1c3f8dab82d51387d82dbd3bc75fe8919b0db94
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f64.c
@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_ldlt_f64.c
+ * Description:  Floating-point LDL decomposition
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+#include <math.h>
+
+
+
+/// @private Swap rows i and j of the row-major n x n matrix A (`n` comes from the enclosing scope).
+#define SWAP_ROWS_F64(A,i,j)     \
+  for(int w=0;w < n; w++)    \
+  {                          \
+     float64_t tmp;          \
+     tmp = (A)[(i)*n + w];       \
+     (A)[(i)*n + w] = (A)[(j)*n + w];\
+     (A)[(j)*n + w] = tmp;       \
+  }
+/// @private Swap columns i and j of the row-major n x n matrix A (`n` comes from the enclosing scope).
+#define SWAP_COLS_F64(A,i,j)     \
+  for(int w=0;w < n; w++)    \
+  {                          \
+     float64_t tmp;          \
+     tmp = (A)[w*n + (i)];       \
+     (A)[w*n + (i)] = (A)[w*n + (j)];\
+     (A)[w*n + (j)] = tmp;       \
+  }
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @addtogroup MatrixChol
+  @{
+ */
+
+/**
+   * @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
+   * @param[in]  pSrc   points to the instance of the input floating-point matrix structure (square, order n).
+   * @param[out] pl   points to the instance of the output floating-point triangular matrix structure (same order as the input).
+   * @param[out] pd   points to the instance of the output floating-point diagonal matrix structure (only diagonal entries are written).
+   * @param[out] pp   points to an array of n uint16_t pivot indices: pp[k] is the index swapped with k at step k.
+   * @note  The dimension check is only compiled in when ARM_MATH_MATRIX_CHECK is defined.
+   * @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+                   - \ref ARM_MATH_DECOMPOSITION_FAILURE      : Input matrix cannot be decomposed (never produced by the code below)
+   * @par
+   *  Computes the LDL^t decomposition of a matrix A such that P A P^t = L D L^t.
+   */
+
+arm_status arm_mat_ldlt_f64(
+  const arm_matrix_instance_f64 * pSrc,
+  arm_matrix_instance_f64 * pl,
+  arm_matrix_instance_f64 * pd,
+  uint16_t * pp)
+{
+
+  arm_status status;                             /* status of the decomposition */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition: all three matrices must be square and of the same order */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pl->numRows != pl->numCols) ||
+      (pd->numRows != pd->numCols) ||
+      (pl->numRows != pd->numRows) || (pl->numRows != pSrc->numRows))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+
+    const int n=pSrc->numRows;
+    int fullRank = 1, diag,k;
+    float64_t *pA;
+    /* Factorize in place inside pl, starting from a copy of the input */
+    memcpy(pl->pData,pSrc->pData,n*n*sizeof(float64_t));
+    pA = pl->pData;
+    /* Fill pp with 0..n-1 */
+    for(int k=0;k < n; k++)
+    {
+      pp[k] = k;
+    }
+
+    /* Elimination: one pivot per iteration of k */
+    for(k=0;k < n; k++)
+    {
+        /* Find pivot */
+        float64_t m=F64_MIN,a;
+        int j=k; 
+
+        /* j = index of the largest remaining diagonal entry (assumes F64_MIN is the lowest finite double - TODO confirm) */
+        for(int r=k;r<n;r++)
+        {
+           if (pA[r*n+r] > m)
+           {
+             m = pA[r*n+r];
+             j = r;
+           }
+        }
+        /* Symmetric permutation: swap both rows and columns k and j */
+        if(j != k)
+        {
+          SWAP_ROWS_F64(pA,k,j);
+          SWAP_COLS_F64(pA,k,j);
+        }
+
+        /* Record the pivot index chosen at this step */
+        pp[k] = j;
+
+        a = pA[k*n+k];
+        /* Pivot too small: stop the elimination and flag the matrix as not full rank */
+        if (fabs(a) < 1.0e-18)
+        {
+
+            fullRank = 0;
+            break;
+        }
+        /* Subtract the pivot's contribution from the trailing block (rows and columns k+1..n-1) */
+        for(int w=k+1;w<n;w++)
+        {
+          for(int x=k+1;x<n;x++)
+          {
+             pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * pA[x*n+k] / a;
+          }
+        }
+        /* Scale column k below the pivot by 1/pivot */
+        for(int w=k+1;w<n;w++)
+        {
+               pA[w*n+k] = pA[w*n+k] / a;
+        }
+
+
+
+    }
+
+    /* NOTE(review): on an early exit at step k, diag becomes k-1, so diagonal entry k-1 is not moved */
+    /* into pd and keeps its pivot value in pl - confirm this is the intended behaviour.              */
+    diag=k;
+    if (!fullRank)   /* early exit at step k: clear columns k..n-1 of every row */
+    {
+      diag--;
+      for(int row=0; row < n;row++)
+      {
+        for(int col=k; col < n;col++)
+        {
+           pl->pData[row*n+col]=0.0;
+        }
+      }
+    }
+    /* Clear the strict upper triangle of pl */
+    for(int row=0; row < n;row++)
+    {
+       for(int col=row+1; col < n;col++)
+       {
+         pl->pData[row*n+col] = 0.0;
+       }
+    }
+    /* Move the first diag diagonal entries into pd and put ones on pl's diagonal */
+    for(int d=0; d < diag;d++)
+    {
+      pd->pData[d*n+d] = pl->pData[d*n+d];
+      pl->pData[d*n+d] = 1.0;
+    }
+
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+
+  /* Return to application */
+  return (status);
+}
+
+/**
+  @} end of MatrixChol group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..cba7c1ded2b9e8b956dedda32fba9f7f468897ae
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f16.c
@@ -0,0 +1,763 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_mult_f16.c
+ * Description:  Floating-point matrix multiplication
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ * @ingroup groupMatrix
+ */
+
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+/**
+ * @brief Half-precision floating-point matrix multiplication.
+ * @param[in]       *pSrcA points to the first input matrix structure
+ * @param[in]       *pSrcB points to the second input matrix structure
+ * @param[out]      *pDst points to output matrix structure
+ * @return     		The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* 2x2 float16 matrix product using MVE gather loads; the 2x2 size is hard-coded in the offset tables and the function always returns ARM_MATH_SUCCESS. */
+__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_2x2_mve(
+    const arm_matrix_instance_f16 *pSrcA,
+    const arm_matrix_instance_f16 *pSrcB,
+    arm_matrix_instance_f16 *pDst)
+{
+    static const uint16_t offsetA[8] = { 0, 0, 2, 2, 0, 0, 2, 2 };
+    /* offsetB reads one row of B and repeats it across the vector lanes */
+    static const uint16_t offsetB[8] = { 0, 1, 0, 1, 0, 1, 0, 1 };
+    uint16x8_t    vecOffsA, vecOffsB;
+    f16x8_t       vecInA, vecInB, vecDst;
+    float16_t      *pOut = pDst->pData;  /* output data matrix pointer */
+
+    /*
+     * load initial offsets
+     */
+    vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
+    vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
+    /*
+     * load {a00 a00 a10 a10 x x x x } (x: upper lanes, not stored)
+     */
+    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+    /*
+     * load {b00 b01 b00 b01 x x x x }
+     */
+    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+    /*
+     *  { a00 b00       a00 b01
+     *    a10 b00       a10 b01
+     *       x             x
+     *       x             x   }
+     */
+    vecDst = vmulq(vecInA, vecInB);
+    /*
+     * move to 2nd column of matrix A
+     */
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
+    /*
+     * load {a01 a01 a11 a11 x x x x}
+     */
+    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+    /*
+     * move to next B row
+     */
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 2);
+    /*
+     * load {b10, b11, b10, b11, x x x x }
+     */
+    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+    /*
+     *  { a00 b00 + a01 b10   a00 b01 + a01 b11
+     *    a10 b00 + a11 b10     a10 b01 + a11 b11
+     *             x                    x
+     *             x                    x       }
+     */
+    vecDst = vfmaq(vecDst, vecInA, vecInB);
+
+    mve_pred16_t p0 = vctp16q(2*2);
+    /*
+     * Store the result in the destination buffer
+     * (predicate p0 keeps only the first 2*2 lanes of the vector)
+     */
+    vstrhq_p(pOut, vecDst, p0);
+
+    return (ARM_MATH_SUCCESS);
+}
+
+
+
+/* 3x3 float16 matrix product: 8 of the 9 results come from one MVE vector, the last one from scalar code; always returns ARM_MATH_SUCCESS. */
+__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
+    const arm_matrix_instance_f16 *pSrcA,
+    const arm_matrix_instance_f16 *pSrcB,
+    arm_matrix_instance_f16 *pDst)
+{
+    static const uint16_t offsetA[8] = { 0, 0, 0, 3, 3, 3, 6, 6 };
+    /* offsetB reads one row of B and repeats it across the vector lanes */
+    static const uint16_t offsetB[8] = { 0, 1, 2, 0, 1, 2, 0, 1 };
+    uint16x8_t    vecOffsA, vecOffsB;
+    f16x8_t       vecInA, vecInB, vecDst;
+    float16_t      *pOut = pDst->pData;  /* output data matrix pointer */
+
+    /*
+     * load initial offsets
+     */
+    vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
+    vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
+
+    /*
+     * load {a00 a00 a00 a10 a10 a10 a20 a20}
+     */
+    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+    /*
+     * load {b00 b01 b02 b00 b01 b02 b00 b01}
+     */
+    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+    /*
+     *  { a00 b00       a00 b01     a00 b02
+     *    a10 b00       a10 b01     a10 b02
+     *    a20 b00       a20 b01}
+     */
+    vecDst = vmulq(vecInA, vecInB);
+
+    /*
+     * move to 2nd column of matrix A
+     */
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
+    /*
+     * load {a01 a01 a01 a11 a11 a11 a21 a21}
+     */
+    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+    /*
+     * move to next B row
+     */
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
+    /*
+     * load {b10, b11, b12, b10, b11, b12, b10, b11}
+     */
+    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+    /*
+     *  { a00 b00 + a01 b10   a00 b01 + a01 b11     a00 b02 + a01 b12
+     *    a10 b00 + a11 b10     a10 b01 + a11 b11     a10 b02 + a11 b12
+     *    a20 b00 + a21 b10     a20 b01 + a21 b11   }
+     */
+    vecDst = vfmaq(vecDst, vecInA, vecInB);
+    /*
+     * move to 3rd column of matrix A
+     */
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
+    /*
+     * load {a02 a02 a02 a12 a12 a12 a22 a22}
+     */
+    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+    /*
+     * move to next B row
+     */
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
+    /*
+     * load {b20, b21, b22, b20, b21, b22, b20, b21}
+     */
+    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+    /*
+     *  {a00 b00 + a01 b10 + a02 b20  a00 b01 + a01 b11 + a02 b21     a00 b02 + a01 b12 + a02 b22
+     *   a10 b00 + a11 b10 + a12 b20    a10 b01 + a11 b11 + a12 b21     a10 b02 + a11 b12 + a12 b22
+     *   a20 b00 + a21 b10 + a22 b20    a20 b01 + a21 b11 + a22 b21   }
+     */
+    vecDst = vfmaq(vecDst, vecInA, vecInB);
+
+    /*
+     * Store the first 8 results in the destination buffer
+     */
+    vst1q(pOut, vecDst); pOut += 8;
+
+    /* 9th (last) element does not fit in the 8-lane vector, so it is computed in scalar mode:
+     * a20 b02 + a21 b12 + a22 b22
+     */
+    _Float16 * pA = (_Float16 *)pSrcA->pData;
+    _Float16 * pB = (_Float16 *)pSrcB->pData;
+    *pOut = pA[2*3] * pB[2] + pA[2*3+1] * pB[3+2] + pA[2*3+2] * pB[2*3+2];
+
+    return (ARM_MATH_SUCCESS);
+}
+
+
+
+
+/* 4x4 float16 matrix product: vecDst0 accumulates output rows 0-1 and vecDst1 rows 2-3; always returns ARM_MATH_SUCCESS. */
+__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
+    const arm_matrix_instance_f16 *pSrcA,
+    const arm_matrix_instance_f16 *pSrcB,
+    arm_matrix_instance_f16 *pDst)
+{
+    /* offsetA reads 2 successive column elements of A, each repeated over 4 lanes */
+    static const uint16_t offsetA[8] = { 0, 0, 0, 0, 4, 4, 4, 4 };
+    /* offsetB reads one row of B and repeats it across the vector lanes */
+    static const uint16_t offsetB[8] = { 0, 1, 2, 3, 0, 1, 2, 3 };
+    uint16x8_t    vecOffsA, vecOffsB;
+    f16x8_t       vecInA, vecInB, vecDst0, vecDst1;
+    float16_t      *pOut = pDst->pData;  /* output data matrix pointer */
+
+    /*
+     * load initial offsets
+     */
+    vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
+    vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
+
+    /*
+     * load {a00 a00 a00 a00 a10 a10 a10 a10}
+     */
+    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+    /*
+     * load {b00 b01 b02 b03 b00 b01 b02 b03}
+     */
+    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+    /*
+     *  { a00 b00       a00 b01     a00 b02     a00 b03
+     *    a10 b00       a10 b01     a10 b02     a10 b03 }
+     */
+    vecDst0 = vmulq(vecInA, vecInB);
+    /*
+     * jump 2 x A rows (2nd half of matrix)
+     */
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
+    /*
+     * load {a20 a20 a20 a20 a30 a30 a30 a30}
+     */
+    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+    /*
+     *  { a20 b00       a20 b01     a20 b02     a20 b03
+     *    a30 b00       a30 b01     a30 b02     a30 b03 }
+     */
+    vecDst1 = vmulq(vecInA, vecInB);
+    /*
+     * rewind back to top half of the A matrix (2nd column): -8 rows offset, +1 column
+     */
+    vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
+    /*
+     * load {a01 a01 a01 a01 a11 a11 a11 a11}
+     */
+    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+    /*
+     * move to next B row
+     */
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
+    /*
+     * load {b10, b11, b12, b13, b10, b11, b12, b13}
+     */
+    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+    /*
+     *  { a00 b00 + a01 b10         a00 b01 + a01 b11       a00 b02 + a01 b12       a00 b03 + a01 b13
+     *    a10 b00 + a11 b10         a10 b01 + a11 b11       a10 b02 + a11 b12       a10 b03 + a11 b13 }
+     */
+    vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
+    /*
+     * jump 2 x A rows (2nd half of matrix)
+     */
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
+    /*
+     * load {a21 a21 a21 a21 a31 a31 a31 a31}
+     */
+    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+    /*
+     *  {a20 b00 + a21 b10      a20 b01 + a21 b11       a20 b02 + a21 b12       a20 b03 + a21 b13
+     *   a30 b00 + a31 b10      a30 b01 + a31 b11       a30 b02 + a31 b12       a30 b03 + a31 b13 }
+     */
+    vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
+
+    /*
+     * rewind back to top half of the A matrix (3rd column)
+     */
+    vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
+    /*
+     * load {a02 a02 a02 a02 a12 a12 a12 a12}
+     */
+    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+    /*
+     * move to next B row
+     */
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
+    /*
+     * load {b20, b21, b22, b23, b20, b21, b22, b23}
+     */
+    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+    /*
+     *  { a00 b00 + a01 b10 + a02 b20    a00 b01 + a01 b11 + a02 b21    a00 b02 + a01 b12 + a02 b22   a00 b03 + a01 b13 + a02 b23
+     *    a10 b00 + a11 b10 + a12 b20    a10 b01 + a11 b11 + a12 b21    a10 b02 + a11 b12 + a12 b22   a10 b03 + a11 b13 + a12 b23 }
+     */
+    vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
+    /*
+     * jump 2 x A rows
+     */
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
+
+    /*
+     * load {a22 a22 a22 a22 a32 a32 a32 a32}
+     */
+    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+    /*
+     *  {a20 b00 + a21 b10 + a22 b20   a20 b01 + a21 b11 + a22 b21  a20 b02 + a21 b12 + a22 b22    a20 b03 + a21 b13 + a22 b23
+     *   a30 b00 + a31 b10 + a32 b20   a30 b01 + a31 b11 + a32 b21  a30 b02 + a31 b12 + a32 b22    a30 b03 + a31 b13 + a32 b23 }
+     */
+    vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
+
+    /*
+     * rewind back to top half of the A matrix (4th column)
+     */
+    vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
+    /*
+     * load {a03 a03 a03 a03 a13 a13 a13 a13}
+     */
+    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+    /*
+     * move to next B row
+     */
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
+    /*
+     * load {b30, b31, b32, b33, b30, b31, b32, b33}
+     */
+    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+    /*
+     * { a00 b00 +...+ a03 b30,    a00 b01 +...+ a03 b31,   a00 b02 +...+ a03 b32,   a00 b03 +...+ a03 b33
+     *   a10 b00 +...+ a13 b30,    a10 b01 +...+ a13 b31,   a10 b02 +...+ a13 b32,   a10 b03 +...+ a13 b33 }
+     */
+    vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
+    /*
+     * jump 2 x A rows
+     */
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
+    /*
+     * load {a23 a23 a23 a23 a33 a33 a33 a33}
+     */
+    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+    /*
+     *  {a20 b00 +...+ a23 b30,   a20 b01 +...+ a23 b31,   a20 b02 +...+ a23 b32,   a20 b03 +...+ a23 b33
+     *   a30 b00 +...+ a33 b30,   a30 b01 +...+ a33 b31,   a30 b02 +...+ a33 b32,   a30 b03 +...+ a33 b33 }
+     */
+    vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
+
+    /*
+     * Store the result in the destination buffer (rows 0-1, then rows 2-3)
+     */
+    vst1q(pOut, vecDst0); pOut += 8;
+    vst1q(pOut, vecDst1);
+
+    return (ARM_MATH_SUCCESS);
+}
+
+/* MVE implementation: square 2x2/3x3/4x4 inputs use the specialized routines, other sizes are computed in blocks of 4 output rows x 8 output columns. */
+arm_status arm_mat_mult_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+  arm_matrix_instance_f16 * pDst)
+{
+       float16_t  *pInB = pSrcB->pData;        /* input data matrix pointer B */
+    float16_t  *pInA = pSrcA->pData;        /* input data matrix pointer A  */
+    float16_t  *pOut = pDst->pData;         /* output data matrix pointer */
+    int         numRowsA = pSrcA->numRows;  /* number of rows of input matrix A */
+    int         numColsB = pSrcB->numCols;  /* number of columns of input matrix B */
+    int         numColsA = pSrcA->numCols;  /* number of columns of input matrix A */
+    uint32_t    blkCnt;                     /* inner-dimension (numColsA) loop counter */
+    int         i;
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcB->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    return(ARM_MATH_SIZE_MISMATCH);
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+{
+    /* specialized routines for small square matrices */
+    if(numRowsA == numColsB && numColsB == numColsA) {
+        if(numRowsA == 2)
+            return arm_mat_mult_f16_2x2_mve(pSrcA, pSrcB, pDst);
+        else if(numRowsA == 3)
+            return arm_mat_mult_f16_3x3_mve(pSrcA, pSrcB, pDst);
+        else if(numRowsA == 4)
+            return arm_mat_mult_f16_4x4_mve(pSrcA, pSrcB, pDst);
+    }
+
+    /* main loop: processes 4 rows of A (4 output rows) per iteration */
+    i = numRowsA / 4;
+    while(i > 0)
+    {
+        float16_t   *pInA0, *pInA1, *pInA2, *pInA3;
+        float16_t   *pInB0;
+        float16_t   *pOut0, *pOut1, *pOut2, *pOut3;
+        f16x8_t    vecMac0, vecMac1, vecMac2, vecMac3;
+        f16x8_t    vecInB;
+
+        /* pointers to 4 consecutive output rows */
+        pOut0 = pOut;
+        pOut1 = pOut0 + numColsB;
+        pOut2 = pOut1 + numColsB;
+        pOut3 = pOut2 + numColsB;
+        pInB0 = pInB;
+
+        int       k = numColsB >> 3;
+        while(k > 0)
+        {
+            /* pointers to 4 consecutive Matrix A rows */
+            pInA0 = pInA;
+            pInA1 = pInA0 + numColsA;
+            pInA2 = pInA1 + numColsA;
+            pInA3 = pInA2 + numColsA;
+
+            vecMac0 = vdupq_n_f16(0.0f16);
+            vecMac1 = vdupq_n_f16(0.0f16);
+            vecMac2 = vdupq_n_f16(0.0f16);
+            vecMac3 = vdupq_n_f16(0.0f16);
+
+            blkCnt = numColsA;
+
+            while (blkCnt > 0U)
+            {
+                /*
+                 * load 8 consecutive elements of the current row of B
+                 */
+                vecInB = *(f16x8_t *)pInB0; /* vldrhq_f16(pInB0, 0); */
+
+                vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+                vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+                vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+                vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
+
+                pInB0 = pInB0 + numColsB;
+                /*
+                 * Decrement the inner-dimension loop counter
+                 */
+                blkCnt--;
+            }
+
+            /* Store the results (4 x 8 block) in the destination buffer */
+            vst1q(pOut0, vecMac0);  pOut0 += 8;
+            vst1q(pOut1, vecMac1);  pOut1 += 8;
+            vst1q(pOut2, vecMac2);  pOut2 += 8;
+            vst1q(pOut3, vecMac3);  pOut3 += 8;
+            /*
+             * rewind to the first row of B, 8 columns further
+             */
+            pInB0 -= (numColsB * numColsA) - 8;
+            k--;
+        }
+
+        int       colBLeft = numColsB & 7;
+        if (colBLeft)
+        {
+            pInA0 = pInA;
+            pInA1 = pInA0 + numColsA;
+            pInA2 = pInA1 + numColsA;
+            pInA3 = pInA2 + numColsA;
+            mve_pred16_t p0 = vctp16q(colBLeft);
+
+            vecMac0 = vdupq_n_f16(0.0f16);
+            vecMac1 = vdupq_n_f16(0.0f16);
+            vecMac2 = vdupq_n_f16(0.0f16);
+            vecMac3 = vdupq_n_f16(0.0f16);
+
+            blkCnt = numColsA;
+
+            while (blkCnt > 0U)
+            {
+                /*
+                 * load the colBLeft remaining elements of the current row of B (predicated, other lanes 0)
+                 */
+                vecInB = vldrhq_z_f16(pInB0, p0);
+
+                vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+                vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+                vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+                vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
+
+                pInB0 = pInB0 + numColsB;
+                /*
+                 * Decrement the inner-dimension loop counter
+                 */
+                blkCnt--;
+            }
+
+            /* Store the results (4 x colBLeft block) in the destination buffer */
+            vstrhq_p_f16(pOut0, vecMac0, p0);
+            vstrhq_p_f16(pOut1, vecMac1, p0);
+            vstrhq_p_f16(pOut2, vecMac2, p0);
+            vstrhq_p_f16(pOut3, vecMac3, p0);
+        }
+
+        pInA += 4 * numColsA;
+        pOut += 4 * numColsB;
+        i--;
+    }
+
+    /*
+     * number of rows of Matrix A is not a multiple of 4:
+     * process the remaining rows one at a time
+     */
+    if (numRowsA & 3)
+    {
+        i = numRowsA & 3;
+        do
+        {
+            float16_t   *pInA0;
+            float16_t   *pInB0;
+            float16_t   *pOut0;
+            f16x8_t    vecInB;
+            f16x8_t    vecMac0;
+
+            pOut0 = pOut;
+            pInB0 = pInB;
+
+            int       k = numColsB >> 3;
+            while(k > 0)
+            {
+                pInA0 = pInA;
+
+                vecMac0 = vdupq_n_f16(0.0f16);
+                blkCnt = numColsA;
+
+                while (blkCnt > 0U)
+                {
+                    /*
+                     * load 8 consecutive elements of the current row of B
+                     */
+                    vecInB = *(f16x8_t *)pInB0; /* vldrhq_f16(pInB0, 0); */
+
+                    vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+
+                    pInB0 = pInB0 + numColsB;
+                    /*
+                     * Decrement the inner-dimension loop counter
+                     */
+                    blkCnt--;
+                }
+                /* Store the results (1 x 8 block) in the destination buffer */
+                vst1q(pOut0, vecMac0);   pOut0 += 8;
+                /*
+                 * rewind to the first row of B, 8 columns further
+                 */
+                pInB0 -= (numColsB * numColsA) - 8;
+                k--;
+            }
+
+            int  colBLeft = numColsB & 7;
+            if (colBLeft)
+            {
+                pInA0 = pInA;
+                mve_pred16_t p0 = vctp16q(colBLeft);
+
+                vecMac0 = vdupq_n_f16(0.0f16);
+                blkCnt = numColsA;
+
+                while (blkCnt > 0U)
+                {
+                    /*
+                     * load the colBLeft remaining elements of the current row of B (predicated, other lanes 0)
+                     */
+                    vecInB = vldrhq_z_f16(pInB0, p0);
+
+                    vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+
+                    pInB0 = pInB0 + numColsB;
+                    /*
+                     * Decrement the inner-dimension loop counter
+                     */
+                    blkCnt--;
+                }
+                /* Store the results (1 x colBLeft block) in the destination buffer */
+                vstrhq_p_f16(pOut0, vecMac0, p0);
+            }
+
+            pInA += 1 * numColsA;
+            pOut += 1 * numColsB;
+        }
+        while (--i);
+    }
+    /*
+     * Return to application
+     */
+    return (ARM_MATH_SUCCESS);
+  }
+}
+#else
+
+/* Scalar implementation: each output element is the dot product of one row of A with one column of B, accumulated in _Float16. */
+arm_status arm_mat_mult_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+        arm_matrix_instance_f16 * pDst)
+{
+  float16_t *pIn1 = pSrcA->pData;                /* Input data matrix pointer A */
+  float16_t *pIn2 = pSrcB->pData;                /* Input data matrix pointer B */
+  float16_t *pInA = pSrcA->pData;                /* Input data matrix pointer A */
+  float16_t *pInB = pSrcB->pData;                /* Input data matrix pointer B */
+  float16_t *pOut = pDst->pData;                 /* Output data matrix pointer */
+  float16_t *px;                                 /* Temporary output data matrix pointer */
+  _Float16 sum;                                 /* Accumulator */
+  uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
+  uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
+  uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
+  uint32_t col, i = 0U, row = numRowsA, colCnt;  /* Loop counters */
+  arm_status status;                             /* Status of matrix multiplication */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcB->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+    /* row loop (do-while: the body runs at least once, so numRowsA and numColsB are assumed >= 1) */
+    do
+    {
+      /* Output pointer is set to starting address of row being processed */
+      px = pOut + i;
+
+      /* For every row wise process, column loop counter is to be initiated */
+      col = numColsB;
+
+      /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
+      pIn2 = pSrcB->pData;
+
+      /* column loop */
+      do
+      {
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sum = 0.0f16;
+
+        /* Initialize pointer pIn1 to point to starting address of the row of A being processed */
+        pIn1 = pInA;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+        /* Loop unrolling: Compute 4 MACs at a time. */
+        colCnt = numColsA >> 2U;
+
+        /* matrix multiplication */
+        while (colCnt > 0U)
+        {
+          /* c(m,p) = a(m,1) * b(1,p) + a(m,2) * b(2,p) + .... + a(m,n) * b(n,p) */
+
+          /* Perform the multiply-accumulates */
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          /* Decrement loop counter */
+          colCnt--;
+        }
+
+        /* Loop unrolling: Compute remaining MACs */
+        colCnt = numColsA % 0x4U;
+
+#else
+
+        /* Initialize colCnt with number of columns of A */
+        colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+        while (colCnt > 0U)
+        {
+          /* c(m,p) = a(m,1) * b(1,p) + a(m,2) * b(2,p) + .... + a(m,n) * b(n,p) */
+
+          /* Perform the multiply-accumulates */
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          /* Decrement loop counter */
+          colCnt--;
+        }
+
+        /* Store result in destination buffer */
+        *px++ = sum;
+
+        /* Decrement column loop counter */
+        col--;
+
+        /* Update pointer pIn2 to point to starting address of next column */
+        pIn2 = pInB + (numColsB - col);
+
+      } while (col > 0U);
+
+      /* Update pointer pInA to point to starting address of next row */
+      i = i + numColsB;
+      pInA = pInA + numColsA;
+
+      /* Decrement row loop counter */
+      row--;
+
+    } while (row > 0U);
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixMult group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f32.c
index 90b8e95204f014f3f70889a0997934cb1067503c..54481187a37a9239d283c06f0590a96f7e949587 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f32.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_mult_f32.c
  * Description:  Floating-point matrix multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
  * @ingroup groupMatrix
@@ -906,7 +906,7 @@ arm_status arm_mat_mult_f32(
         /* matrix multiplication */
         while (colCnt > 0U)
         {
-          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+          /* c(m,p) = a(m,1) * b(1,p) + a(m,2) * b(2,p) + .... + a(m,n) * b(n,p) */
 
           /* Perform the multiply-accumulates */
           sum += *pIn1++ * *pIn2;
@@ -937,7 +937,7 @@ arm_status arm_mat_mult_f32(
 
         while (colCnt > 0U)
         {
-          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+          /* c(m,p) = a(m,1) * b(1,p) + a(m,2) * b(2,p) + .... + a(m,n) * b(n,p) */
 
           /* Perform the multiply-accumulates */
           sum += *pIn1++ * *pIn2;
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f64.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f64.c
new file mode 100644
index 0000000000000000000000000000000000000000..cf59ef4e8e7f35954f5f4ab3c5dc8731f0a6cfbe
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f64.c
@@ -0,0 +1,202 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_mult_f64.c
+ * Description:  Floating-point matrix multiplication
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @defgroup MatrixMult Matrix Multiplication
+ *
+ * Multiplies two matrices.
+ *
+ * \image html MatrixMultiplication.gif "Multiplication of two 3 x 3 matrices"
+
+ * Matrix multiplication is only defined if the number of columns of the
+ * first matrix equals the number of rows of the second matrix.
+ * Multiplying an <code>M x N</code> matrix with an <code>N x P</code> matrix results
+ * in an <code>M x P</code> matrix.
+ * When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
+ * <code>pSrcA</code> and <code>pSrcB</code> are equal; and (2) that the size of the output
+ * matrix equals the outer dimensions of <code>pSrcA</code> and <code>pSrcB</code>.
+ */
+
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+/**
+ * @brief Double-precision floating-point matrix multiplication.
+ * @param[in]       *pSrcA points to the first input matrix structure
+ * @param[in]       *pSrcB points to the second input matrix structure
+ * @param[out]      *pDst points to output matrix structure
+ * @return     		The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ */
+
+
+arm_status arm_mat_mult_f64(
+  const arm_matrix_instance_f64 * pSrcA,
+  const arm_matrix_instance_f64 * pSrcB,
+        arm_matrix_instance_f64 * pDst)
+{
+  float64_t *pIn1 = pSrcA->pData;                /* Input data matrix pointer A */
+  float64_t *pIn2 = pSrcB->pData;                /* Input data matrix pointer B */
+  float64_t *pInA = pSrcA->pData;                /* Input data matrix pointer A */
+  float64_t *pInB = pSrcB->pData;                /* Input data matrix pointer B */
+  float64_t *pOut = pDst->pData;                 /* Output data matrix pointer */
+  float64_t *px;                                 /* Temporary output data matrix pointer */
+  float64_t sum;                                 /* Accumulator */
+  uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
+  uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
+  uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
+  uint64_t col, i = 0U, row = numRowsA, colCnt;  /* Loop counters */
+  arm_status status;                             /* Status of matrix multiplication */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcB->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+    /* row loop (do-while: the body runs at least once, so numRowsA and numColsB are assumed >= 1) */
+    do
+    {
+      /* Output pointer is set to starting address of row being processed */
+      px = pOut + i;
+
+      /* For every row wise process, column loop counter is to be initiated */
+      col = numColsB;
+
+      /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
+      pIn2 = pSrcB->pData;
+
+      /* column loop */
+      do
+      {
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sum = 0.0f;
+
+        /* Initialize pointer pIn1 to point to starting address of the row of A being processed */
+        pIn1 = pInA;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+        /* Loop unrolling: Compute 4 MACs at a time. */
+        colCnt = numColsA >> 2U;
+
+        /* matrix multiplication */
+        while (colCnt > 0U)
+        {
+          /* c(m,p) = a(m,1) * b(1,p) + a(m,2) * b(2,p) + .... + a(m,n) * b(n,p) */
+
+          /* Perform the multiply-accumulates */
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          /* Decrement loop counter */
+          colCnt--;
+        }
+
+        /* Loop unrolling: Compute remaining MACs */
+        colCnt = numColsA % 0x4U;
+
+#else
+
+        /* Initialize colCnt with number of columns of A */
+        colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+        while (colCnt > 0U)
+        {
+          /* c(m,p) = a(m,1) * b(1,p) + a(m,2) * b(2,p) + .... + a(m,n) * b(n,p) */
+
+          /* Perform the multiply-accumulates */
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          /* Decrement loop counter */
+          colCnt--;
+        }
+
+        /* Store result in destination buffer */
+        *px++ = sum;
+
+        /* Decrement column loop counter */
+        col--;
+
+        /* Update pointer pIn2 to point to starting address of next column */
+        pIn2 = pInB + (numColsB - col);
+
+      } while (col > 0U);
+
+      /* Update pointer pInA to point to starting address of next row */
+      i = i + numColsB;
+      pInA = pInA + numColsA;
+
+      /* Decrement row loop counter */
+      row--;
+
+    } while (row > 0U);
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+
+/**
+ * @} end of MatrixMult group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q15.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q15.c
index 670ace1f448eb6e46138186110a1598e1a8e1c84..62ddcaf8fc2f2c1cbad5df2ceef0d60f42935abe 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q15.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_mult_fast_q15.c
  * Description:  Q15 matrix multiplication (fast variant)
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -78,7 +78,7 @@ arm_status arm_mat_mult_fast_q15(
         uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
         uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
         uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
-        uint16_t numRowsB = pSrcB->numRows;            /* Number of rows of input matrix A */
+        uint16_t numRowsB = pSrcB->numRows;            /* Number of rows of input matrix B */
         uint32_t col, i = 0U, row = numRowsB, colCnt;  /* Loop counters */
         arm_status status;                             /* Status of matrix multiplication */
 
@@ -258,7 +258,7 @@ arm_status arm_mat_mult_fast_q15(
         pInA2 = pInA + numColsA;
         pInB2 = pInB + numRowsB;
 
-        /* Read in two elements at once - alows dual MAC instruction */
+        /* Read in two elements at once - allows dual MAC instruction */
         colCnt = numColsA >> 1U;
 #else
         colCnt = numColsA >> 2U;
@@ -277,7 +277,7 @@ arm_status arm_mat_mult_fast_q15(
           inA2 = read_q15x2_ia ((q15_t **) &pInA2);
           inB2 = read_q15x2_ia ((q15_t **) &pInB2);
 
-          /* Multiply and Accumlates */
+          /* Multiply and Accumulates */
           sum  = __SMLAD(inA1, inB1, sum);
           sum2 = __SMLAD(inA1, inB2, sum2);
           sum3 = __SMLAD(inA2, inB1, sum3);
@@ -286,7 +286,7 @@ arm_status arm_mat_mult_fast_q15(
           /* read real and imag values from pSrcA and pSrcB buffer */
           inA1 = *pInA++;
           inB1 = *pInB++;
-          /* Multiply and Accumlates */
+          /* Multiply and Accumulates */
           sum += inA1 * inB1;
 
           inA2 = *pInA++;
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q31.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q31.c
index 011959adb92202fbc0ed627a4540be859444ffe0..99a42328e072c6bec7ae4c6d37b6f3bf7b186c06 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q31.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_mult_fast_q31.c
  * Description:  Q31 matrix multiplication (fast variant)
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q15.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q15.c
index 0b531f2a87d1478f4e5d9e3a04081eb353c424ad..8eed6ee5d6b12944219e71f846f3a2d2da78a772 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q15.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_mult_q15.c
  * Description:  Q15 matrix multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -57,7 +57,7 @@
   @par
                    Refer to \ref arm_mat_mult_fast_q15() for a faster but less precise version of this function.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #define MVE_ASRL_SAT16(acc, shift)          ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
 
@@ -328,7 +328,7 @@ arm_status arm_mat_mult_q15(
     uint16_t  numRowsA = pSrcA->numRows;    /* number of rows of input matrix A    */
     uint16_t  numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
     uint16_t  numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
-    uint16_t  col, i = 0U, row = numRowsA, colCnt;  /* loop counters */
+    uint16_t  col, i = 0U, row = numRowsA;  /* loop counters */
     uint16x8_t vecOffs, vecColBOffs;
     uint32_t  blkCnt,rowCnt;           /* loop counters */
     arm_status status;                             /* Status of matrix multiplication */
@@ -345,7 +345,7 @@ arm_status arm_mat_mult_q15(
     status = ARM_MATH_SIZE_MISMATCH;
   }
   else
-#endif 
+#endif
   {
     /* small squared matrix specialized routines */
     if(numRowsA == numColsB && numColsB == numColsA) {
@@ -403,7 +403,6 @@ arm_status arm_mat_mult_q15(
             /*
              * Matrix A columns number of MAC operations are to be performed
              */
-            colCnt = numColsA;
 
             q15_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
             q15_t    *pInA0 = pInA;
@@ -519,7 +518,6 @@ arm_status arm_mat_mult_q15(
             /*
              * Matrix A columns number of MAC operations are to be performed
              */
-            colCnt = numColsA;
 
             q15_t const *pSrcA0Vec;
             q15_t    *pInA0 = pInA;
@@ -528,7 +526,7 @@ arm_status arm_mat_mult_q15(
             acc0 = 0LL;
 
             pSrcA0Vec = (q15_t const *) pInA0;
-           
+
             vecOffs = vecColBOffs;
 
             blkCnt = (numColsA) >> 3;
@@ -539,10 +537,10 @@ arm_status arm_mat_mult_q15(
                 vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
                 vecOffs = vecOffs + (uint16_t) (numColsB * 8);
 
-                vecA = vld1q(pSrcA0Vec);  
+                vecA = vld1q(pSrcA0Vec);
                 pSrcA0Vec += 8;
                 acc0 = vmlaldavaq(acc0, vecA, vecB);
-                
+
                 blkCnt--;
 
             }
@@ -560,11 +558,11 @@ arm_status arm_mat_mult_q15(
 
                 vecA = vld1q(pSrcA0Vec);
                 acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
-                
+
             }
 
             px[0]            = (q15_t)MVE_ASRL_SAT16(acc0, 15);
-          
+
             px++;
             /*
              * Decrement the column loop counter
@@ -608,10 +606,10 @@ arm_status arm_mat_mult_q15(
         uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
         uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
         uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
-        uint16_t numRowsB = pSrcB->numRows;            /* Number of rows of input matrix A */
+        uint16_t numRowsB = pSrcB->numRows;            /* Number of rows of input matrix B */
         uint32_t col, i = 0U, row = numRowsB, colCnt;  /* Loop counters */
         arm_status status;                             /* Status of matrix multiplication */
-        
+
         q31_t in;                                      /* Temporary variable to hold the input value */
         q31_t inA1, inB1, inA2, inB2;
 
@@ -750,7 +748,7 @@ arm_status arm_mat_mult_q15(
           inA2 = read_q15x2_ia ((q15_t **) &pInA);
           inB2 = read_q15x2_ia ((q15_t **) &pInB);
 
-          /* Multiply and Accumlates */
+          /* Multiply and Accumulates */
           sum = __SMLALD(inA1, inB1, sum);
           sum = __SMLALD(inA2, inB2, sum);
 
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q31.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q31.c
index 8f2e7d0db053c902beda00b921c114ee8cbe6d4a..1873827966dbec05145c82c6f743f242ae13aca4 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q31.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_mult_q31.c
  * Description:  Q31 matrix multiplication
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -58,7 +58,7 @@
   @remark
                    Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #define MATRIX_DIM2 2
 #define MATRIX_DIM3 3
@@ -344,7 +344,7 @@ arm_status arm_mat_mult_q31(
     uint16_t    numRowsA = pSrcA->numRows;    /* number of rows of input matrix A    */
     uint16_t    numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
     uint16_t    numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
-    uint16_t    col, i = 0U, row = numRowsA, colCnt;  /* loop counters */
+    uint16_t    col, i = 0U, row = numRowsA;  /* loop counters */
     arm_status  status;          /* status of matrix multiplication */
     uint32x4_t  vecOffs, vecColBOffs;
     uint32_t    blkCnt, rowCnt;           /* loop counters */
@@ -418,7 +418,6 @@ arm_status arm_mat_mult_q31(
             /*
              * Matrix A columns number of MAC operations are to be performed
              */
-            colCnt = numColsA;
 
             q31_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
             q31_t const   *pInA0 = pInA;
@@ -541,7 +540,6 @@ arm_status arm_mat_mult_q31(
             /*
              * Matrix A columns number of MAC operations are to be performed
              */
-            colCnt = numColsA;
 
             q31_t const *pSrcA0Vec;
             q31_t const   *pInA0 = pInA;
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q7.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q7.c
new file mode 100644
index 0000000000000000000000000000000000000000..3ce0fe6044d91b07af2f0e2c4a1d92610d51abc9
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q7.c
@@ -0,0 +1,678 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_mult_q7.c
+ * Description:  Q7 matrix multiplication
+ *
+ * $Date:        23 April 2021
+ *
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @addtogroup MatrixMult
+  @{
+ */
+
+/**
+ * @brief Q7 matrix multiplication
+ * @param[in]       *pSrcA points to the first input matrix structure
+ * @param[in]       *pSrcB points to the second input matrix structure
+ * @param[out]      *pDst points to output matrix structure
+ * @param[in]       *pState points to the array for storing intermediate results (Unused in some versions)
+ * @return          The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ *
+ * @details
+ * <b>Scaling and Overflow Behavior:</b>
+ *
+ * \par
+ * The function is implemented using a 32-bit internal accumulator saturated to 1.7 format.
+ *
+ *
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_2x2_mve(
+    const arm_matrix_instance_q7 * pSrcA,
+    const arm_matrix_instance_q7 * pSrcB,
+    arm_matrix_instance_q7 * pDst)
+{   /* 2x2 Q7 product pDst = pSrcA * pSrcB, computed one output column at a time; always returns ARM_MATH_SUCCESS */
+    const uint32_t MATRIX_DIM = 2;
+    q7_t const *pInB = (q7_t const *)pSrcB->pData;  /* input data matrix pointer B */
+    q7_t       *pInA = pSrcA->pData;  /* input data matrix pointer A */
+    q7_t       *pOut = pDst->pData;   /* output data matrix pointer */
+    uint8x16_t vecColBOffs;                         /* byte offsets used to gather one column of B */
+    q7_t       *pInA0 = pInA;                       /* row 0 of A */
+    q7_t       *pInA1 = pInA0 + MATRIX_DIM;         /* row 1 of A */
+    q31_t       acc0, acc1;                         /* one dot-product accumulator per row of A */
+    q7x16_t    vecB, vecA0, vecA1;
+    mve_pred16_t p0 = vctp8q(MATRIX_DIM);           /* predicate built for MATRIX_DIM byte lanes */
+    /* Offsets 0, 2, 4, ...: consecutive elements of a B column are MATRIX_DIM bytes apart */
+    vecColBOffs = vidupq_u8((uint32_t)0, 2); /* MATRIX_DIM */
+    /* Same value pInB was initialised with in its declaration */
+    pInB = pSrcB->pData;
+    /* Gather column 0 of B under p0; the _z form zeroes the lanes p0 leaves inactive */
+    vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
+    /* NOTE(review): unpredicated loads fetch a full vector from each row pointer, i.e. more than one row - confirm the over-read is acceptable */
+    vecA0 = vldrbq_s8(pInA0);
+    vecA1 = vldrbq_s8(pInA1);
+    /* Row-of-A times column-of-B dot products */
+    acc0 = vmladavq_s8(vecA0, vecB);
+    acc1 = vmladavq_s8(vecA1, vecB);
+    /* Shift each sum right by 7, saturate to 8 bits and store output column 0 (row stride MATRIX_DIM) */
+    pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
+    pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
+    pOut++;
+
+    /* move to next B column; the A row vectors loaded above are reused */
+    pInB = pInB + 1;
+
+    vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
+
+    acc0 = vmladavq_s8(vecA0, vecB);
+    acc1 = vmladavq_s8(vecA1, vecB);
+    /* Store output column 1 */
+    pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
+    pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
+    /*
+     * Both output columns are written; return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+
+__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_3x3_mve(
+    const arm_matrix_instance_q7 * pSrcA,
+    const arm_matrix_instance_q7 * pSrcB,
+    arm_matrix_instance_q7 * pDst)
+{   /* 3x3 Q7 product pDst = pSrcA * pSrcB, computed one output column at a time; always returns ARM_MATH_SUCCESS */
+    const uint8_t  MATRIX_DIM = 3;
+    q7_t const     *pInB = (q7_t const *)pSrcB->pData;  /* input data matrix pointer B */
+    q7_t           *pInA = pSrcA->pData;  /* input data matrix pointer A */
+    q7_t           *pOut = pDst->pData;   /* output data matrix pointer */
+    uint8x16_t     vecColBOffs;                         /* byte offsets used to gather one column of B */
+    q7_t           *pInA0 = pInA;                       /* row 0 of A */
+    q7_t           *pInA1 = pInA0 + MATRIX_DIM;         /* row 1 of A */
+    q7_t           *pInA2 = pInA1 + MATRIX_DIM;         /* row 2 of A */
+    q31_t           acc0, acc1, acc2;                   /* one dot-product accumulator per row of A */
+    q7x16_t        vecB, vecA0, vecA1, vecA2;
+    mve_pred16_t    p0 = vctp8q(MATRIX_DIM);            /* predicate built for MATRIX_DIM byte lanes */
+    /* Build 0, 1, 2, ... then scale by MATRIX_DIM so the offsets step down a B column */
+    vecColBOffs = vidupq_u8((uint32_t)0, 1);
+    vecColBOffs = vecColBOffs * MATRIX_DIM;
+    /* Same value pInB was initialised with in its declaration */
+    pInB = pSrcB->pData;
+    /* Gather column 0 of B under p0; the _z form zeroes the lanes p0 leaves inactive */
+    vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
+    /* NOTE(review): unpredicated loads fetch a full vector from each row pointer, i.e. more than one row - confirm the over-read is acceptable */
+    vecA0 = vldrbq_s8(pInA0);
+    vecA1 = vldrbq_s8(pInA1);
+    vecA2 = vldrbq_s8(pInA2);
+    /* Row-of-A times column-of-B dot products */
+    acc0 = vmladavq_s8(vecA0, vecB);
+    acc1 = vmladavq_s8(vecA1, vecB);
+    acc2 = vmladavq_s8(vecA2, vecB);
+    /* Shift each sum right by 7, saturate to 8 bits and store output column 0 (row stride MATRIX_DIM) */
+    pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
+    pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
+    pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
+    pOut++;
+
+    /* move to next B column; the A row vectors loaded above are reused */
+    pInB = pInB + 1;
+
+    vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
+
+    acc0 = vmladavq_s8(vecA0, vecB);
+    acc1 = vmladavq_s8(vecA1, vecB);
+    acc2 = vmladavq_s8(vecA2, vecB);
+    /* Store output column 1 */
+    pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
+    pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
+    pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
+    pOut++;
+
+    /* move to next B column */
+    pInB = pInB + 1;
+
+    vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
+
+    acc0 = vmladavq_s8(vecA0, vecB);
+    acc1 = vmladavq_s8(vecA1, vecB);
+    acc2 = vmladavq_s8(vecA2, vecB);
+    /* Store output column 2 */
+    pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
+    pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
+    pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
+    /*
+     * All three output columns are written; return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+
+__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_4x4_mve(
+    const arm_matrix_instance_q7 * pSrcA,
+    const arm_matrix_instance_q7 * pSrcB,
+    arm_matrix_instance_q7 * pDst)
+{   /* 4x4 Q7 product pDst = pSrcA * pSrcB, computed one output column at a time; always returns ARM_MATH_SUCCESS */
+    const uint32_t MATRIX_DIM = 4;
+    q7_t const *pInB = (q7_t const *)pSrcB->pData;  /* input data matrix pointer B */
+    q7_t       *pInA = pSrcA->pData;  /* input data matrix pointer A */
+    q7_t       *pOut = pDst->pData;   /* output data matrix pointer */
+    uint8x16_t vecColBOffs;                         /* byte offsets used to gather one column of B */
+    q7_t       *pInA0 = pInA;                       /* row 0 of A */
+    q7_t       *pInA1 = pInA0 + MATRIX_DIM;         /* row 1 of A */
+    q7_t       *pInA2 = pInA1 + MATRIX_DIM;         /* row 2 of A */
+    q7_t       *pInA3 = pInA2 + MATRIX_DIM;         /* row 3 of A */
+    q31_t       acc0, acc1, acc2, acc3;             /* one dot-product accumulator per row of A */
+    q7x16_t    vecB, vecA0, vecA1, vecA2, vecA3;
+    mve_pred16_t p0 = vctp8q(MATRIX_DIM);           /* predicate built for MATRIX_DIM byte lanes */
+    /* Offsets 0, 4, 8, ...: consecutive elements of a B column are MATRIX_DIM bytes apart */
+    vecColBOffs = vidupq_u8((uint32_t)0, 4);
+    /* Same value pInB was initialised with in its declaration */
+    pInB = pSrcB->pData;
+    /* Gather column 0 of B under p0; the _z form zeroes the lanes p0 leaves inactive */
+    vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
+    /* NOTE(review): unpredicated loads fetch a full vector from each row pointer, i.e. more than one row - confirm the over-read is acceptable */
+    vecA0 = vldrbq_s8(pInA0);
+    vecA1 = vldrbq_s8(pInA1);
+    vecA2 = vldrbq_s8(pInA2);
+    vecA3 = vldrbq_s8(pInA3);
+    /* Row-of-A times column-of-B dot products */
+    acc0 = vmladavq_s8(vecA0, vecB);
+    acc1 = vmladavq_s8(vecA1, vecB);
+    acc2 = vmladavq_s8(vecA2, vecB);
+    acc3 = vmladavq_s8(vecA3, vecB);
+    /* Shift each sum right by 7, saturate to 8 bits and store output column 0 (row stride MATRIX_DIM) */
+    pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
+    pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
+    pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
+    pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
+    pOut++;
+
+    /* move to next B column; the A row vectors loaded above are reused */
+    pInB = pInB + 1;
+
+    vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
+
+    acc0 = vmladavq_s8(vecA0, vecB);
+    acc1 = vmladavq_s8(vecA1, vecB);
+    acc2 = vmladavq_s8(vecA2, vecB);
+    acc3 = vmladavq_s8(vecA3, vecB);
+    /* Store output column 1 */
+    pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
+    pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
+    pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
+    pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
+    pOut++;
+
+    /* move to next B column */
+    pInB = pInB + 1;
+
+    vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
+
+    acc0 = vmladavq_s8(vecA0, vecB);
+    acc1 = vmladavq_s8(vecA1, vecB);
+    acc2 = vmladavq_s8(vecA2, vecB);
+    acc3 = vmladavq_s8(vecA3, vecB);
+    /* Store output column 2 */
+    pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
+    pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
+    pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
+    pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
+    pOut++;
+
+    /* move to next B column */
+    pInB = pInB + 1;
+
+    vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
+
+    acc0 = vmladavq_s8(vecA0, vecB);
+    acc1 = vmladavq_s8(vecA1, vecB);
+    acc2 = vmladavq_s8(vecA2, vecB);
+    acc3 = vmladavq_s8(vecA3, vecB);
+    /* Store output column 3 */
+    pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
+    pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
+    pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
+    pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
+    /*
+     * All four output columns are written; return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+arm_status arm_mat_mult_q7(
+    const arm_matrix_instance_q7 * pSrcA,
+    const arm_matrix_instance_q7 * pSrcB,
+    arm_matrix_instance_q7 * pDst,
+    q7_t * pState)
+{
+    q7_t    *pInA = pSrcA->pData;  /* current row of A */
+    q7_t    *pInB = pSrcB->pData;  /* current row of the transposed B (re-pointed into pSrcBT below) */
+    q7_t    *pInA2;                /* row of A following pInA */
+    q7_t    *pInB2;                /* row of the transposed B following pInB */
+    q7_t    *px;               /* output cursor for the first row of a row pair */
+    q7_t    *px2;              /* output cursor for the second row of a row pair */
+    uint32_t  numRowsA = pSrcA->numRows;    /* number of rows of input matrix A    */
+    uint32_t  numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
+    uint32_t  numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
+    uint32_t  numRowsB = pSrcB->numRows;    /* number of rows of input matrix B    */
+    uint32_t  col, i = 0u, j, row = numRowsB;   /* loop counters; i and row are reassigned before the main loop */
+    q7_t    *pSrcBT = pState;   /* caller-supplied scratch that receives the transpose of B */
+    uint32_t  blkCnt;           /* count of 16-element blocks, then the leftover element count */
+    arm_status status;                            /* status of matrix multiplication */
+    arm_matrix_instance_q7 BT;                    /* matrix header describing the transposed B */
+    /* MVE path: transpose B into pState so each dot product reads two contiguous rows, */
+    /* then fill the output in 2x2 tiles and finish with the odd last column / last row. */
+   #ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcB->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+  {
+    /* Square 2x2, 3x3 and 4x4 inputs return early through dedicated kernels that do not take pState */
+    if(numRowsA == numColsB && numColsB == numColsA) {
+        if(numRowsA == 2)
+            return arm_mat_mult_q7_2x2_mve(pSrcA, pSrcB, pDst);
+        else if(numRowsA == 3)
+            return arm_mat_mult_q7_3x3_mve(pSrcA, pSrcB, pDst);
+        else if (numRowsA == 4)
+            return arm_mat_mult_q7_4x4_mve(pSrcA, pSrcB, pDst);
+    }
+    /*
+     * Matrix transpose: BT is numColsB x numRowsB and stored in pState
+     */
+    /* NOTE(review): pState presumably needs room for numRowsB * numColsB q7_t values - confirm against the API documentation */
+    BT.numRows = numColsB;
+    BT.numCols = numRowsB;
+    BT.pData = pSrcBT;
+
+    arm_mat_trans_q7(pSrcB, &BT);
+
+    /*
+     * Reset the variables for the usage in the following multiplication process
+     */
+    i = 0;
+    row = numRowsA >> 1;
+    px = pDst->pData;
+    px2 = px + numColsB;
+
+    /*
+     * The following loop computes a 2x2 tile of the output per pass: two rows of pSrcA against two rows of the transposed pSrcB
+     */
+
+    /*
+     * row loop (one pass per pair of output rows)
+     */
+    while (row > 0u)
+    {
+        /*
+         * For every row wise process, the column loop counter is to be initiated
+         */
+        col = numColsB >> 1;
+        /*
+         * For every row wise process, the pInB pointer is set
+         * to the starting address of the transposed pSrcB data
+         */
+        pInB = pSrcBT;
+        pInB2 = pInB + numRowsB;
+        j = 0;
+
+        /*
+         * column loop (one pass per pair of output columns)
+         */
+        while (col > 0u)
+        {
+            q7_t const     *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
+            q7x16_t        vecA, vecA2, vecB, vecB2;
+            q31_t           acc0, acc1, acc2, acc3;
+
+            /*
+             * Point at the two rows of A (offset i) and the two rows of the transposed B (offset j) for this tile
+             */
+            pInA = pSrcA->pData + i;
+            pInA2 = pInA + numColsA;
+            pInB = pSrcBT + j;
+            pInB2 = pInB + numRowsB;
+
+            pSrcAVec = (q7_t const *) pInA;
+            pSrcA2Vec = (q7_t const *)pInA2;
+            pSrcBVec = (q7_t const *) pInB;
+            pSrcB2Vec = (q7_t const *)pInB2;
+            /* acc0: rowA x rowB, acc1: rowA2 x rowB, acc2: rowA x rowB2, acc3: rowA2 x rowB2 */
+            acc0 = 0L;
+            acc1 = 0L;
+            acc2 = 0L;
+            acc3 = 0L;
+
+            vecA = vld1q(pSrcAVec);  /* preload the first 16 elements of the first A row */
+            pSrcAVec += 16;
+
+            blkCnt = numColsA >> 4;
+            while (blkCnt > 0U)
+            {
+                vecB = vld1q(pSrcBVec);  /* 16 elements of the first transposed-B row */
+                pSrcBVec += 16;
+                acc0 = vmladavaq_s8(acc0, vecA, vecB);
+                vecA2 = vld1q(pSrcA2Vec);  /* 16 elements of the second A row */
+                pSrcA2Vec += 16;
+                acc1 = vmladavaq_s8(acc1, vecA2, vecB);
+                vecB2 = vld1q(pSrcB2Vec);  /* 16 elements of the second transposed-B row */
+                pSrcB2Vec += 16;
+                acc2 = vmladavaq_s8(acc2, vecA, vecB2);
+                vecA = vld1q(pSrcAVec);  /* fetch the first A row's next 16 elements for the following pass or the tail */
+                pSrcAVec += 16;
+                acc3 = vmladavaq_s8(acc3, vecA2, vecB2);
+
+                blkCnt--;
+            }
+            /*
+             * tail: the remaining numColsA % 16 elements, accumulated under predicate p0
+             * NOTE(review): the loads below are unpredicated full-vector loads - confirm reading past the row end is acceptable
+             */
+            blkCnt = numColsA & 0xF;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp8q(blkCnt);
+                vecB = vld1q(pSrcBVec);
+                acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
+                vecA2 = vld1q(pSrcA2Vec);
+                acc1 = vmladavaq_p_s8(acc1, vecA2, vecB, p0);
+                vecB2 = vld1q(pSrcB2Vec);
+                acc2 = vmladavaq_p_s8(acc2, vecA, vecB2, p0);
+                vecA = vld1q(pSrcAVec); /* NOTE(review): not read again before vecA is reloaded for the next tile - looks removable */
+                acc3 = vmladavaq_p_s8(acc3, vecA2, vecB2, p0);
+            }
+            /* Shift right by 7, saturate to 8 bits; px receives the first output row, px2 the second */
+            *px++ = (q7_t) __SSAT(acc0 >> 7, 8);
+            *px++ = (q7_t) __SSAT(acc2 >> 7, 8);
+            *px2++ = (q7_t) __SSAT(acc1 >> 7, 8);
+            *px2++ = (q7_t) __SSAT(acc3 >> 7, 8);
+            j += numRowsB * 2;
+            /*
+             * Decrement the column loop counter
+             */
+            col--;
+
+        }
+        /* Advance two rows of A; skip the odd last column (if any) so px / px2 start the next two output rows */
+        i = i + numColsA * 2;
+        px = px2 + (numColsB & 1u);
+        px2 = px + numColsB;
+        /*
+         * Decrement the row loop counter
+         */
+        row--;
+    }
+
+    /*
+     * Compute remaining row and/or column below
+     */
+
+    if (numColsB & 1u)
+    {
+        row = numRowsA & (~0x1);    /* an odd last row is left to the block below, which covers every column */
+        px = pDst->pData + numColsB - 1;
+        i = 0;
+
+        /*
+         * row loop: one element of the last output column per pass
+         */
+        while (row > 0)
+        {
+            q7_t const   *pSrcAVec, *pSrcBVec;
+            q7x16_t       vecA, vecB;
+            q63_t           acc0;
+
+            /*
+             * point to last column in matrix B (last row of the transposed copy)
+             */
+            pInB = pSrcBT + numRowsB * (numColsB - 1);
+            pInA = pSrcA->pData + i;
+
+            pSrcAVec = (q7_t const *) pInA;
+            pSrcBVec = (q7_t const *) pInB;
+
+            acc0 = 0LL;
+            blkCnt = (numColsA) >> 4;
+            while (blkCnt > 0U)
+            {
+                vecA = vld1q(pSrcAVec);  /* 16 elements of the current A row */
+                pSrcAVec += 16;
+                vecB = vld1q(pSrcBVec);  /* 16 elements of the last transposed-B row */
+                pSrcBVec += 16;
+                acc0 = vmladavaq_s8(acc0, vecA, vecB);
+
+                blkCnt--;
+            }
+            /*
+             * tail: the remaining numColsA % 16 elements, accumulated under predicate p0
+             * (loads are unpredicated, as in the main loop)
+             */
+            blkCnt = numColsA & 0xF;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp8q(blkCnt);
+                vecA = vld1q(pSrcAVec);
+                vecB = vld1q(pSrcBVec);
+                acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
+            }
+
+            *px = (q7_t) __SSAT(acc0 >> 7, 8);
+            /* Step down the last output column and on to the next row of A */
+            px += numColsB;
+
+            i += numColsA;
+            /*
+             * Decrement the row loop counter
+             */
+            row--;
+        }
+    }
+
+    if (numRowsA & 1u)
+    {
+        col = numColsB;
+        i = 0u;
+        /*
+         * point to last row in output matrix
+         */
+        px = pDst->pData + (numColsB) * (numRowsA - 1);
+        /*
+         * col loop: one element of the last output row per pass
+         */
+        while (col > 0)
+        {
+            q7_t const    *pSrcAVec, *pSrcBVec;
+            q7x16_t       vecA, vecB;
+            q63_t           acc0;
+
+            /*
+             * point to last row in matrix A and to the transposed-B row at offset i
+             */
+            pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
+            pInB = pSrcBT + i;
+
+            /*
+             * Set up the vector read pointers and clear the accumulator acc0
+             */
+            pSrcAVec = (q7_t const *) pInA;
+            pSrcBVec = (q7_t const *) pInB;
+            acc0 = 0LL;
+
+            blkCnt = (numColsA) >> 4;
+            while (blkCnt > 0U)
+            {
+                vecA = vld1q(pSrcAVec); /* 16 elements of the last A row */
+                pSrcAVec += 16;
+                vecB = vld1q(pSrcBVec); /* 16 elements of the current transposed-B row */
+                pSrcBVec += 16;
+                acc0 = vmladavaq_s8(acc0, vecA, vecB);
+
+                blkCnt--;
+            }
+            /*
+             * tail: the remaining numColsA % 16 elements, accumulated under predicate p0
+             * (loads are unpredicated, as in the main loop)
+             */
+            blkCnt = numColsA & 0xF;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp8q(blkCnt);
+                vecA = vld1q(pSrcAVec);
+                vecB = vld1q(pSrcBVec);
+                acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
+            }
+
+            *px++ = (q7_t) __SSAT(acc0 >> 7, 8);
+            /* Next transposed-B row; the stride is numColsA here (the size check requires it to equal numRowsB) */
+            i += numColsA;
+
+            /*
+             * Decrement the col loop counter
+             */
+            col--;
+        }
+    }
+    /*
+     * Set status as ARM_MATH_SUCCESS
+     */
+     status = ARM_MATH_SUCCESS;
+    }
+    return(status);
+}
+#else
+arm_status arm_mat_mult_q7(const arm_matrix_instance_q7 *pSrcA, const arm_matrix_instance_q7 *pSrcB, arm_matrix_instance_q7 *pDst, q7_t *pState)
+{
+    q31_t sum; /* accumulator */
+    q7_t *pIn1 = pSrcA->pData;                    /* read cursor along the current row of A */
+    q7_t *pIn2 = pSrcB->pData;                    /* read cursor down the current column of B */
+    q7_t *pInA = pSrcA->pData;                    /* start of the current row of A */
+    q7_t *pInB = pSrcB->pData;                    /* start of matrix B */
+    q7_t *pOut = pDst->pData;                     /* output data matrix pointer */
+    q7_t *px;                                     /* Temporary output data matrix pointer */
+    uint16_t numColsB = pSrcB->numCols;           /* number of columns of input matrix B */
+    uint16_t numColsA = pSrcA->numCols;           /* number of columns of input matrix A */
+    uint16_t numRowsA = pSrcA->numRows;           /* number of rows of input matrix A    */
+    uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters; i is the output offset of the current row */
+    arm_status status;                            /* status of matrix multiplication */
+    /* Scalar variant: pState is not needed, the cast only marks the parameter as used */
+    (void)pState;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcB->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+    {
+        /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+        /* row loop; NOTE(review): both do-while loops run their body before testing the counter, so zero dimensions are not handled - confirm callers never pass them */
+        do {
+            /* Output pointer is set to starting address of the row being processed */
+            px = pOut + i;
+
+            /* For every row wise process, the column loop counter is to be initiated */
+            col = numColsB;
+
+            /* For every row wise process, the pIn2 pointer is set
+             ** to the starting address of the pSrcB data */
+            pIn2 = pSrcB->pData;
+
+            /* column loop */
+            do {
+                /* Set the variable sum, that acts as accumulator, to zero */
+                sum = 0;
+
+                /* Initiate the pointer pIn1 to point to the starting address of the current row of pSrcA */
+                pIn1 = pInA;
+
+                /* Matrix A columns number of MAC operations are to be performed */
+                colCnt = numColsA;
+
+                /* matrix multiplication */
+                while (colCnt > 0U) {
+                    /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+                    /* Perform the multiply-accumulates; pIn2 steps one row down the column of B */
+                    sum += (q31_t)*pIn1++ * *pIn2;
+                    pIn2 += numColsB;
+
+                    /* Decrement the loop counter */
+                    colCnt--;
+                }
+
+                /* The 32-bit sum of Q7 x Q7 products is shifted right by 7 to return to Q7 scaling */
+                /* Saturate to 8 bits and store the result in the destination buffer */
+                *px++ = (q7_t)__SSAT((sum >> 7), 8);
+
+                /* Decrement the column loop counter */
+                col--;
+
+                /* Update the pointer pIn2 to point to the  starting address of the next column */
+                pIn2 = pInB + (numColsB - col);
+
+            } while (col > 0U);
+
+            /* Advance the output offset and pInA to the next row; NOTE(review): i is uint16_t, so the offset wraps past 65535 - confirm output sizes stay below that */
+            i = i + numColsB;
+            pInA = pInA + numColsA;
+
+            /* Decrement the row loop counter */
+            row--;
+
+        } while (row > 0U);
+
+        /* set status as ARM_MATH_SUCCESS */
+        status = ARM_MATH_SUCCESS;
+    }
+
+    /* Return to application */
+    return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of MatrixMult group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..5f09105de8403c01a2e4645d1911dacf57d9ff01
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_f16.c
@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_scale_f16.c
+ * Description:  Multiplies a floating-point matrix by a scalar
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixScale
+  @{
+ */
+
+/**
+  @brief         Floating-point matrix scaling.
+  @param[in]     pSrc       points to input matrix
+  @param[in]     scale      scale factor to be applied
+  @param[out]    pDst       points to output matrix structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+arm_status arm_mat_scale_f16(
+  const arm_matrix_instance_f16 * pSrc,
+  float16_t scale,
+  arm_matrix_instance_f16 * pDst)
+{
+  arm_status status;                             /* status of matrix scaling     */
+  #ifdef ARM_MATH_MATRIX_CHECK
+  /* Source and destination must have the same number of rows and columns */
+  if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
+  {
+    float16_t *pIn = pSrc->pData;   /* input data matrix pointer */
+    float16_t *pOut = pDst->pData;  /* output data matrix pointer */
+    uint32_t  numSamples;           /* total number of elements in the matrix */
+    uint32_t  blkCnt;               /* loop counters */
+    f16x8_t vecIn, vecOut, vecScale;
+    float16_t const *pInVec;
+
+    pInVec = (float16_t const *) pIn;
+
+    vecScale = vdupq_n_f16(scale);
+    /*
+     * Total number of samples in the input matrix
+     */
+    numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
+    blkCnt = numSamples >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C(m,n) = A(m,n) * scale
+         * Eight elements are scaled and stored per iteration.
+         */
+        vecIn = vld1q(pInVec);
+        pInVec += 8;
+
+        vecOut = vmulq_f16(vecIn, vecScale);
+
+        vst1q(pOut, vecOut);
+        pOut += 8;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * Tail: the last 1..7 elements. Both the load and the store are
+     * predicated so that no lane past the end of the matrix is accessed.
+     */
+    blkCnt = numSamples & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecIn = vld1q_z(pInVec, p0);
+        vecOut = vmulq_f16(vecIn, vecScale);
+        vstrhq_p(pOut, vecOut, p0);
+    }
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+
+}
+#else
+
+arm_status arm_mat_scale_f16(
+  const arm_matrix_instance_f16 * pSrc,
+        float16_t                 scale,
+        arm_matrix_instance_f16 * pDst)
+{
+  float16_t *pSrcData = pSrc->pData;             /* Read cursor over the input elements */
+  float16_t *pDstData = pDst->pData;             /* Write cursor over the output elements */
+  uint32_t total;                                /* Element count (rows * columns) */
+  uint32_t remaining;                            /* Iterations still to run */
+  arm_status status;                             /* Result reported to the caller */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Input and output matrices must share the same dimensions */
+  if ((pDst->numRows != pSrc->numRows) ||
+      (pDst->numCols != pSrc->numCols)   )
+  {
+    /* Dimensions differ: report the mismatch, no element is processed */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* The matrix is walked as one flat array of rows * columns values */
+    total = (uint32_t) pSrc->numRows * pSrc->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Unrolled pass: every iteration scales a group of 4 elements */
+    remaining = total / 4U;
+
+    for (; remaining > 0U; remaining--)
+    {
+      /* C(m,n) = A(m,n) * scale, applied to four consecutive elements */
+
+      pDstData[0] = pSrcData[0] * scale;
+      pDstData[1] = pSrcData[1] * scale;
+      pDstData[2] = pSrcData[2] * scale;
+      pDstData[3] = pSrcData[3] * scale;
+
+      /* Step both cursors past the group just written */
+      pSrcData += 4;
+      pDstData += 4;
+    }
+
+    /* Whatever is left (0..3 elements) goes through the generic loop */
+    remaining = total & 0x3U;
+
+#else
+
+    /* No unrolling: the generic loop handles every element */
+    remaining = total;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    /* Element-by-element pass over the outstanding values */
+    while (remaining != 0U)
+    {
+      /* C(m,n) = A(m,n) * scale */
+      *pDstData = *pSrcData * scale;
+      pDstData++;
+      pSrcData++;
+
+      remaining--;
+    }
+
+    /* Every element has been scaled */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Hand the status back to the application */
+  return status;
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of MatrixScale group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_f32.c
index 518d3dbc18519eb9b01c6fcde04cd0c1a86c10c1..daebbb34d73d57de371c24017f1be6a77f798981 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_f32.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_scale_f32.c
  * Description:  Multiplies a floating-point matrix by a scalar
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_q15.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_q15.c
index b413f8a8c88c7b1d27fc338fa4e484080432932a..8292880f626a50bc2099b6676c2ea89f674c3749 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_q15.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_scale_q15.c
  * Description:  Multiplies a Q15 matrix by a scalar
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -51,7 +51,7 @@
                    The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
                    These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 arm_status arm_mat_scale_q15(
   const arm_matrix_instance_q15 * pSrc,
         q15_t                     scaleFract,
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_q31.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_q31.c
index 0a52bfcc2796a6b23ab40eeeba26c405b83f1368..33c086807d47d401065ac65c9a0c7036b9b7799c 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_q31.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_scale_q31.c
  * Description:  Multiplies a Q31 matrix by a scalar
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -51,7 +51,7 @@
                    The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
                    These are multiplied to yield a 2.62 intermediate result which is shifted with saturation to 1.31 format.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 arm_status arm_mat_scale_q31(
   const arm_matrix_instance_q31 * pSrc,
         q31_t                     scaleFract,
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..4d0517f8792dc5e683e7b61722eadf0b0a54ff3f
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f16.c
@@ -0,0 +1,234 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_solve_lower_triangular_f16.c
+ * Description:  Solve linear system LT X = A with LT lower triangular matrix
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixInv
+  @{
+ */
+
+
+   /**
+   * @brief Solve LT . X = A where LT is a lower triangular matrix
+   * @param[in]  lt  The lower triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of LT . X = A
+   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+   */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+  arm_status arm_mat_solve_lower_triangular_f16(
+  const arm_matrix_instance_f16 * lt,
+  const arm_matrix_instance_f16 * a,
+  arm_matrix_instance_f16 * dst)
+  {
+  arm_status status;                             /* status of the triangular solve */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((lt->numRows != lt->numCols) ||
+      (a->numRows != a->numCols) ||
+      (lt->numRows != a->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Forward substitution, one row of X at a time:
+       l11            x1   a1
+       l21 l22        x2 = a2
+       l31 l32 l33    x3   a3
+
+       x1 = a1 / l11
+       x2 = (a2 - l21 x1) / l22
+    */
+    int i,j,k,n;
+
+    n = dst->numRows;
+
+    float16_t *pX = dst->pData;
+    float16_t *pLT = lt->pData;
+    float16_t *pA = a->pData;
+
+    float16_t *lt_row;
+    float16_t *a_col;
+
+    _Float16 invLT;
+
+    f16x8_t vecA;
+    f16x8_t vecX;
+
+    for(i=0; i < n ; i++)
+    {
+      lt_row = &pLT[n*i];
+
+      /* The pivot only depends on the row: test it and take its
+         reciprocal once instead of once per block of 8 columns */
+      if (lt_row[i]==0.0f16)
+      {
+        return(ARM_MATH_SINGULAR);
+      }
+      invLT = 1.0f16 / (_Float16)lt_row[i];
+
+      /* Columns in groups of 8 lanes */
+      for(j=0; j+7 < n; j += 8)
+      {
+            vecA = vld1q_f16(&pA[i * n + j]);
+
+            for(k=0; k < i; k++)
+            {
+                vecX = vld1q_f16(&pX[n*k+j]);
+                vecA = vfmsq(vecA,vdupq_n_f16(lt_row[k]),vecX);
+            }
+
+            vecA = vmulq(vecA,vdupq_n_f16(invLT));
+            vst1q(&pX[i*n+j],vecA);
+      }
+
+      /* Remaining columns (fewer than 8), one scalar at a time */
+      for(; j < n; j ++)
+      {
+            a_col = &pA[j];
+
+            _Float16 tmp=a_col[i * n];
+
+            for(k=0; k < i; k++)
+            {
+                tmp -= (_Float16)lt_row[k] * (_Float16)pX[n*k+j];
+            }
+
+            /* Divide by the pivot, already known to be non-zero */
+            tmp = tmp / (_Float16)lt_row[i];
+            pX[i*n+j] = tmp;
+      }
+
+    }
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+#else
+  arm_status arm_mat_solve_lower_triangular_f16(
+  const arm_matrix_instance_f16 * lt,
+  const arm_matrix_instance_f16 * a,
+  arm_matrix_instance_f16 * dst)
+  {
+  arm_status status;                             /* outcome of the triangular solve */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* LT and A must both be square and have the same order */
+  if ((lt->numCols != lt->numRows) ||
+      (a->numCols != a->numRows) ||
+      (a->numRows != lt->numRows)   )
+  {
+    /* Report the mismatch; nothing is computed */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Forward substitution, column by column of X:
+       l11            x1   a1
+       l21 l22        x2 = a2
+       l31 l32 l33    x3   a3
+
+       x1 = a1 / l11
+       x2 = (a2 - l21 x1) / l22
+    */
+    int row,col,m,dim;
+
+    dim = dst->numRows;
+
+    float16_t *pX = dst->pData;
+    float16_t *pLT = lt->pData;
+    float16_t *pA = a->pData;
+
+    float16_t *pRowLT;
+    float16_t pivot;
+
+    for(col=0; col < dim; col++)
+    {
+       /* Column `col` of X is computed from column `col` of A */
+
+       for(row=0; row < dim; row++)
+       {
+            pRowLT = pLT + dim*row;
+
+            float16_t acc = pA[row*dim + col];
+
+            for(m=0; m < row; m++)
+            {
+                acc -= pRowLT[m] * pX[dim*m + col];
+            }
+
+            pivot = pRowLT[row];
+            if (pivot==0.0f)
+            {
+              return(ARM_MATH_SINGULAR);
+            }
+            pX[row*dim + col] = acc / pivot;
+       }
+
+    }
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+  /* Hand the status back to the application */
+  return status;
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of MatrixInv group
+ */
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..bee02bc24b8e27c7af31e50a60891c72910b446a
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f32.c
@@ -0,0 +1,332 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_solve_lower_triangular_f32.c
+ * Description:  Solve linear system LT X = A with LT lower triangular matrix
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixInv
+  @{
+ */
+
+
+   /**
+   * @brief Solve LT . X = A where LT is a lower triangular matrix
+   * @param[in]  lt  The lower triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of LT . X = A
+   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+   */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+  arm_status arm_mat_solve_lower_triangular_f32(
+  const arm_matrix_instance_f32 * lt,
+  const arm_matrix_instance_f32 * a,
+  arm_matrix_instance_f32 * dst)
+  {
+  arm_status status;                             /* status of the triangular solve */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((lt->numRows != lt->numCols) ||
+      (a->numRows != a->numCols) ||
+      (lt->numRows != a->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Forward substitution, one row of X at a time:
+       l11            x1   a1
+       l21 l22        x2 = a2
+       l31 l32 l33    x3   a3
+
+       x1 = a1 / l11
+       x2 = (a2 - l21 x1) / l22
+    */
+    int i,j,k,n;
+
+    n = dst->numRows;
+
+    float32_t *pX = dst->pData;
+    float32_t *pLT = lt->pData;
+    float32_t *pA = a->pData;
+
+    float32_t *lt_row;
+    float32_t *a_col;
+
+    float32_t invLT;
+
+    f32x4_t vecA;
+    f32x4_t vecX;
+
+    for(i=0; i < n ; i++)
+    {
+      lt_row = &pLT[n*i];
+
+      /* The pivot only depends on the row: test it and take its
+         reciprocal once instead of once per block of 4 columns */
+      if (lt_row[i]==0.0f)
+      {
+        return(ARM_MATH_SINGULAR);
+      }
+      invLT = 1.0f / lt_row[i];
+
+      /* Columns in groups of 4 lanes */
+      for(j=0; j+3 < n; j += 4)
+      {
+            vecA = vld1q_f32(&pA[i * n + j]);
+
+            for(k=0; k < i; k++)
+            {
+                vecX = vld1q_f32(&pX[n*k+j]);
+                vecA = vfmsq(vecA,vdupq_n_f32(lt_row[k]),vecX);
+            }
+
+            vecA = vmulq(vecA,vdupq_n_f32(invLT));
+            vst1q(&pX[i*n+j],vecA);
+      }
+
+      /* Remaining columns (fewer than 4), one scalar at a time */
+      for(; j < n; j ++)
+      {
+            a_col = &pA[j];
+
+            float32_t tmp=a_col[i * n];
+
+            for(k=0; k < i; k++)
+            {
+                tmp -= lt_row[k] * pX[n*k+j];
+            }
+
+            /* Divide by the pivot, already known to be non-zero */
+            tmp = tmp / lt_row[i];
+            pX[i*n+j] = tmp;
+      }
+
+    }
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+  /* Return to application */
+  return (status);
+}
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+  arm_status arm_mat_solve_lower_triangular_f32(
+  const arm_matrix_instance_f32 * lt,
+  const arm_matrix_instance_f32 * a,
+  arm_matrix_instance_f32 * dst)
+  {
+  arm_status status;                             /* status of the triangular solve */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((lt->numRows != lt->numCols) ||
+      (a->numRows != a->numCols) ||
+      (lt->numRows != a->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Forward substitution, one row of X at a time:
+       l11            x1   a1
+       l21 l22        x2 = a2
+       l31 l32 l33    x3   a3
+
+       x1 = a1 / l11
+       x2 = (a2 - l21 x1) / l22
+    */
+    int i,j,k,n;
+
+    n = dst->numRows;
+
+    float32_t *pX = dst->pData;
+    float32_t *pLT = lt->pData;
+    float32_t *pA = a->pData;
+
+    float32_t *lt_row;
+    float32_t *a_col;
+
+    float32_t invLT;
+
+    f32x4_t vecA;
+    f32x4_t vecX;
+
+    for(i=0; i < n ; i++)
+    {
+      lt_row = &pLT[n*i];
+
+      /* The pivot only depends on the row: test it and take its
+         reciprocal once instead of once per block of 4 columns */
+      if (lt_row[i]==0.0f)
+      {
+        return(ARM_MATH_SINGULAR);
+      }
+      invLT = 1.0f / lt_row[i];
+
+      /* Columns in groups of 4 lanes */
+      for(j=0; j+3 < n; j += 4)
+      {
+            vecA = vld1q_f32(&pA[i * n + j]);
+
+            for(k=0; k < i; k++)
+            {
+                vecX = vld1q_f32(&pX[n*k+j]);
+                vecA = vfmsq_f32(vecA,vdupq_n_f32(lt_row[k]),vecX);
+            }
+
+            vecA = vmulq_f32(vecA,vdupq_n_f32(invLT));
+            vst1q_f32(&pX[i*n+j],vecA);
+      }
+
+      /* Remaining columns (fewer than 4), one scalar at a time */
+      for(; j < n; j ++)
+      {
+            a_col = &pA[j];
+
+            float32_t tmp=a_col[i * n];
+
+            for(k=0; k < i; k++)
+            {
+                tmp -= lt_row[k] * pX[n*k+j];
+            }
+
+            /* Divide by the pivot, already known to be non-zero */
+            tmp = tmp / lt_row[i];
+            pX[i*n+j] = tmp;
+      }
+
+    }
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+  /* Return to application */
+  return (status);
+}
+#else
+  arm_status arm_mat_solve_lower_triangular_f32(
+  const arm_matrix_instance_f32 * lt,
+  const arm_matrix_instance_f32 * a,
+  arm_matrix_instance_f32 * dst)
+  {
+  arm_status status;                             /* outcome of the triangular solve */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+  /* LT and A must both be square and have the same order */
+  if ((lt->numCols != lt->numRows) ||
+      (a->numCols != a->numRows) ||
+      (a->numRows != lt->numRows)   )
+  {
+    /* Report the mismatch; nothing is computed */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Forward substitution, column by column of X:
+       l11            x1   a1
+       l21 l22        x2 = a2
+       l31 l32 l33    x3   a3
+
+       x1 = a1 / l11
+       x2 = (a2 - l21 x1) / l22
+    */
+    int row,col,m,dim;
+
+    dim = dst->numRows;
+
+    float32_t *pX = dst->pData;
+    float32_t *pLT = lt->pData;
+    float32_t *pA = a->pData;
+
+    float32_t *pRowLT;
+    float32_t pivot;
+
+    for(col=0; col < dim; col++)
+    {
+       /* Column `col` of X is computed from column `col` of A */
+
+       for(row=0; row < dim; row++)
+       {
+            pRowLT = pLT + dim*row;
+
+            float32_t acc = pA[row*dim + col];
+
+            for(m=0; m < row; m++)
+            {
+                acc -= pRowLT[m] * pX[dim*m + col];
+            }
+
+            pivot = pRowLT[row];
+            if (pivot==0.0f)
+            {
+              return(ARM_MATH_SINGULAR);
+            }
+            pX[row*dim + col] = acc / pivot;
+       }
+
+    }
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+  /* Hand the status back to the application */
+  return status;
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of MatrixInv group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
new file mode 100644
index 0000000000000000000000000000000000000000..e389357ca3c113799216f2d22c9fec145d57297c
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
@@ -0,0 +1,124 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_solve_lower_triangular_f64.c
+ * Description:  Solve linear system LT X = A with LT lower triangular matrix
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixInv
+  @{
+ */
+
+
+   /**
+   * @brief Solve LT . X = A where LT is a lower triangular matrix
+   * @param[in]  lt  The lower triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of LT . X = A
+   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+   */
+  arm_status arm_mat_solve_lower_triangular_f64(
+  const arm_matrix_instance_f64 * lt,
+  const arm_matrix_instance_f64 * a,
+  arm_matrix_instance_f64 * dst)
+  {
+  arm_status status;                             /* outcome of the triangular solve */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* LT and A must both be square and have the same order */
+  if ((lt->numCols != lt->numRows) ||
+      (a->numCols != a->numRows) ||
+      (a->numRows != lt->numRows)   )
+  {
+    /* Report the mismatch; nothing is computed */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Forward substitution, column by column of X:
+       l11            x1   a1
+       l21 l22        x2 = a2
+       l31 l32 l33    x3   a3
+
+       x1 = a1 / l11
+       x2 = (a2 - l21 x1) / l22
+    */
+    int row,col,m,dim;
+
+    dim = dst->numRows;
+
+    float64_t *pX = dst->pData;
+    float64_t *pLT = lt->pData;
+    float64_t *pA = a->pData;
+
+    float64_t *pRowLT;
+    float64_t pivot;
+
+    for(col=0; col < dim; col++)
+    {
+       /* Column `col` of X is computed from column `col` of A */
+
+       for(row=0; row < dim; row++)
+       {
+            pRowLT = pLT + dim*row;
+
+            float64_t acc = pA[row*dim + col];
+
+            for(m=0; m < row; m++)
+            {
+                acc -= pRowLT[m] * pX[dim*m + col];
+            }
+
+            pivot = pRowLT[row];
+            if (pivot==0.0)
+            {
+              return(ARM_MATH_SINGULAR);
+            }
+            pX[row*dim + col] = acc / pivot;
+       }
+
+    }
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+  /* Hand the status back to the application */
+  return status;
+}
+/**
+  @} end of MatrixInv group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..e3ac42599c9142df17bd92d9d46fc11bf6260e73
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f16.c
@@ -0,0 +1,226 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_solve_upper_triangular_f16.c
+ * Description:  Solve linear system UT X = A with UT upper triangular matrix
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixInv
+  @{
+ */
+
+/**
+   * @brief Solve UT . X = A where UT is an upper triangular matrix
+   * @param[in]  ut  The upper triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of UT . X = A
+   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+  */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+  arm_status arm_mat_solve_upper_triangular_f16(
+  const arm_matrix_instance_f16 * ut,
+  const arm_matrix_instance_f16 * a,
+  arm_matrix_instance_f16 * dst)
+  {
+  arm_status status;                             /* status of the triangular solve */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((ut->numRows != ut->numCols) ||
+      (a->numRows != a->numCols) ||
+      (ut->numRows != a->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Back substitution: rows of X are solved from the last one upwards */
+    int i,j,k,n;
+
+    n = dst->numRows;
+
+    float16_t *pX = dst->pData;
+    float16_t *pUT = ut->pData;
+    float16_t *pA = a->pData;
+
+    float16_t *ut_row;
+    float16_t *a_col;
+
+    _Float16 invUT;
+
+    f16x8_t vecA;
+    f16x8_t vecX;
+
+    for(i=n-1; i >= 0 ; i--)
+    {
+      ut_row = &pUT[n*i];
+
+      /* The pivot only depends on the row: test it and take its
+         reciprocal once instead of once per block of 8 columns.
+         A zero pivot is reported as ARM_MATH_SINGULAR. */
+      if (ut_row[i]==0.0f16)
+      {
+        return(ARM_MATH_SINGULAR);
+      }
+      invUT = 1.0f16 / (_Float16)ut_row[i];
+
+      /* Columns in groups of 8 lanes */
+      for(j=0; j+7 < n; j +=8)
+      {
+            vecA = vld1q_f16(&pA[i * n + j]);
+
+            for(k=n-1; k > i; k--)
+            {
+                vecX = vld1q_f16(&pX[n*k+j]);
+                vecA = vfmsq(vecA,vdupq_n_f16(ut_row[k]),vecX);
+            }
+
+            vecA = vmulq(vecA,vdupq_n_f16(invUT));
+            vst1q(&pX[i*n+j],vecA);
+      }
+
+      /* Remaining columns (fewer than 8), one scalar at a time */
+      for(; j < n; j ++)
+      {
+            a_col = &pA[j];
+
+            _Float16 tmp=a_col[i * n];
+
+            for(k=n-1; k > i; k--)
+            {
+                tmp -= (_Float16)ut_row[k] * (_Float16)pX[n*k+j];
+            }
+
+            /* Divide by the pivot, already known to be non-zero */
+            tmp = tmp / (_Float16)ut_row[i];
+            pX[i*n+j] = tmp;
+      }
+
+    }
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+
+  /* Return to application */
+  return (status);
+}
+
+#else
+  arm_status arm_mat_solve_upper_triangular_f16(
+  const arm_matrix_instance_f16 * ut,
+  const arm_matrix_instance_f16 * a,
+  arm_matrix_instance_f16 * dst)
+  {
+  arm_status status;                             /* outcome of the triangular solve */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* UT and A must both be square and have the same order */
+  if ((ut->numCols != ut->numRows) ||
+      (a->numCols != a->numRows) ||
+      (a->numRows != ut->numRows)   )
+  {
+    /* Report the mismatch; nothing is computed */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Back substitution: each column of X is solved from the last row upwards */
+    int row,col,m,dim;
+
+    dim = dst->numRows;
+
+    float16_t *pX = dst->pData;
+    float16_t *pUT = ut->pData;
+    float16_t *pA = a->pData;
+
+    float16_t *pRowUT;
+    float16_t pivot;
+
+    for(col=0; col < dim; col++)
+    {
+       /* Column `col` of X is computed from column `col` of A */
+
+       for(row=dim-1; row >= 0; row--)
+       {
+            pRowUT = pUT + dim*row;
+
+            float16_t acc = pA[row*dim + col];
+
+            for(m=dim-1; m > row; m--)
+            {
+                acc -= pRowUT[m] * pX[dim*m + col];
+            }
+
+            pivot = pRowUT[row];
+            if (pivot==0.0f)
+            {
+              return(ARM_MATH_SINGULAR);
+            }
+            pX[row*dim + col] = acc / pivot;
+       }
+
+    }
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+
+  /* Hand the status back to the application */
+  return status;
+}
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of MatrixInv group
+ */
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..cdf17d2c759bc10aa2e8608be6f80d715602f457
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f32.c
@@ -0,0 +1,319 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_solve_upper_triangular_f32.c
+ * Description:  Solve linear system UT X = A with UT upper triangular matrix
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixInv
+  @{
+ */
+
+/**
+   * @brief Solve UT . X = A where UT is an upper triangular matrix
+   * @param[in]  ut  The upper triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of UT . X = A
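+   * @return ARM_MATH_SUCCESS if X was computed, ARM_MATH_SINGULAR if the system can't be solved (zero on the diagonal of UT), ARM_MATH_SIZE_MISMATCH if ARM_MATH_MATRIX_CHECK is defined and the dimensions are inconsistent.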
+  */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+  /* Helium (MVE) back-substitution for UT . X = A. Rows of X are produced
+     from the last one up to the first one; within a row, columns are
+     processed four at a time and the remaining ones with scalar code.
+     Returns ARM_MATH_SUCCESS, ARM_MATH_SINGULAR on a zero diagonal entry of
+     UT (dst may then be partly written) or, with ARM_MATH_MATRIX_CHECK
+     defined, ARM_MATH_SIZE_MISMATCH on inconsistent dimensions. */
+  arm_status arm_mat_solve_upper_triangular_f32(
+  const arm_matrix_instance_f32 * ut,
+  const arm_matrix_instance_f32 * a,
+  arm_matrix_instance_f32 * dst)
+  {
+arm_status status;                             /* status of the solve */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+  /* All three matrices must be square and of the same order; dst is
+     checked too because the order n used below is read from it */
+  if ((ut->numRows != ut->numCols) ||
+      (a->numRows != a->numCols) ||
+      (ut->numRows != a->numRows) ||
+      (dst->numRows != ut->numRows) ||
+      (dst->numCols != ut->numCols)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+  {
+    int i,j,k,n;
+
+    n = dst->numRows;
+
+    float32_t *pX = dst->pData;
+    float32_t *pUT = ut->pData;
+    float32_t *pA = a->pData;
+
+    float32_t *ut_row;
+    float32_t pivot;
+    float32_t invUT;
+
+    f32x4_t vecA;
+    f32x4_t vecX;
+
+    for(i=n-1; i >= 0 ; i--)
+    {
+      ut_row = &pUT[n*i];
+      pivot = ut_row[i];
+
+      /* The diagonal entry is the same for every column of the row: test
+         it and invert it once per row instead of once per column block */
+      if (pivot==0.0f)
+      {
+        return(ARM_MATH_SINGULAR);
+      }
+      invUT = 1.0f / pivot;
+
+      /* Four columns of X at a time */
+      for(j=0; j+3 < n; j +=4)
+      {
+            vecA = vld1q_f32(&pA[i * n + j]);
+
+            /* vecA -= UT(i,k) * X(k,j..j+3) for the rows k > i solved so far */
+            for(k=n-1; k > i; k--)
+            {
+                vecX = vld1q_f32(&pX[n*k+j]);
+                vecA = vfmsq(vecA,vdupq_n_f32(ut_row[k]),vecX);
+            }
+
+            vecA = vmulq(vecA,vdupq_n_f32(invUT));
+
+            vst1q(&pX[i*n+j],vecA);
+      }
+
+      /* Remaining columns (fewer than four), one at a time */
+      for(; j < n; j ++)
+      {
+            float32_t tmp=pA[i * n + j];
+
+            for(k=n-1; k > i; k--)
+            {
+                tmp -= ut_row[k] * pX[n*k+j];
+            }
+
+            /* Kept as a division, as before, so results are unchanged */
+            tmp = tmp / pivot;
+            pX[i*n+j] = tmp;
+       }
+    }
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+  /* Neon back-substitution for UT . X = A. Rows of X are produced from the
+     last one up to the first one; within a row, columns are processed
+     four at a time and the remaining ones with scalar code.
+     Returns ARM_MATH_SUCCESS, ARM_MATH_SINGULAR on a zero diagonal entry of
+     UT (dst may then be partly written) or, with ARM_MATH_MATRIX_CHECK
+     defined, ARM_MATH_SIZE_MISMATCH on inconsistent dimensions. */
+  arm_status arm_mat_solve_upper_triangular_f32(
+  const arm_matrix_instance_f32 * ut,
+  const arm_matrix_instance_f32 * a,
+  arm_matrix_instance_f32 * dst)
+  {
+arm_status status;                             /* status of the solve */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+  /* All three matrices must be square and of the same order; dst is
+     checked too because the order n used below is read from it */
+  if ((ut->numRows != ut->numCols) ||
+      (a->numRows != a->numCols) ||
+      (ut->numRows != a->numRows) ||
+      (dst->numRows != ut->numRows) ||
+      (dst->numCols != ut->numCols)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+  {
+    int i,j,k,n;
+
+    n = dst->numRows;
+
+    float32_t *pX = dst->pData;
+    float32_t *pUT = ut->pData;
+    float32_t *pA = a->pData;
+
+    float32_t *ut_row;
+    float32_t pivot;
+    float32_t invUT;
+
+    f32x4_t vecA;
+    f32x4_t vecX;
+
+    for(i=n-1; i >= 0 ; i--)
+    {
+      ut_row = &pUT[n*i];
+      pivot = ut_row[i];
+
+      /* The diagonal entry is the same for every column of the row: test
+         it and invert it once per row instead of once per column block */
+      if (pivot==0.0f)
+      {
+        return(ARM_MATH_SINGULAR);
+      }
+      invUT = 1.0f / pivot;
+
+      /* Four columns of X at a time */
+      for(j=0; j+3 < n; j +=4)
+      {
+            vecA = vld1q_f32(&pA[i * n + j]);
+
+            /* vecA -= UT(i,k) * X(k,j..j+3) for the rows k > i solved so far */
+            for(k=n-1; k > i; k--)
+            {
+                vecX = vld1q_f32(&pX[n*k+j]);
+                vecA = vfmsq_f32(vecA,vdupq_n_f32(ut_row[k]),vecX);
+            }
+
+            vecA = vmulq_f32(vecA,vdupq_n_f32(invUT));
+
+            vst1q_f32(&pX[i*n+j],vecA);
+      }
+
+      /* Remaining columns (fewer than four), one at a time */
+      for(; j < n; j ++)
+      {
+            float32_t tmp=pA[i * n + j];
+
+            for(k=n-1; k > i; k--)
+            {
+                tmp -= ut_row[k] * pX[n*k+j];
+            }
+
+            /* Kept as a division, as before, so results are unchanged */
+            tmp = tmp / pivot;
+            pX[i*n+j] = tmp;
+       }
+    }
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+#else
+  /* Scalar back-substitution for UT . X = A, one column of X at a time.
+     Returns ARM_MATH_SUCCESS, ARM_MATH_SINGULAR on a zero diagonal entry of
+     UT (dst may then be partly written) or, with ARM_MATH_MATRIX_CHECK
+     defined, ARM_MATH_SIZE_MISMATCH on inconsistent dimensions. */
+  arm_status arm_mat_solve_upper_triangular_f32(
+  const arm_matrix_instance_f32 * ut,
+  const arm_matrix_instance_f32 * a,
+  arm_matrix_instance_f32 * dst)
+  {
+arm_status status;                             /* status of the solve */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+  /* All three matrices must be square and of the same order; dst is
+     checked too because the order n used below is read from it */
+  if ((ut->numRows != ut->numCols) ||
+      (a->numRows != a->numCols) ||
+      (ut->numRows != a->numRows) ||
+      (dst->numRows != ut->numRows) ||
+      (dst->numCols != ut->numCols)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+  {
+    int i,j,k,n;
+
+    n = dst->numRows;
+    float32_t *pX = dst->pData;
+    float32_t *pUT = ut->pData;
+    float32_t *pA = a->pData;
+
+    float32_t *ut_row;
+    float32_t *a_col;
+
+    for(j=0; j < n; j ++)
+    {
+       /* a_col[i * n] is A(i,j) */
+       a_col = &pA[j];
+
+       for(i=n-1; i >= 0 ; i--)
+       {
+            ut_row = &pUT[n*i];
+            float32_t tmp=a_col[i * n];
+
+            /* Subtract UT(i,k) * X(k,j) for the rows k > i solved so far */
+            for(k=n-1; k > i; k--)
+            {
+                tmp -= ut_row[k] * pX[n*k+j];
+            }
+
+            /* A zero diagonal entry makes the system unsolvable */
+            if (ut_row[i]==0.0f)
+            {
+              return(ARM_MATH_SINGULAR);
+            }
+            tmp = tmp / ut_row[i];
+            pX[i*n+j] = tmp;
+       }
+    }
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of MatrixInv group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
new file mode 100644
index 0000000000000000000000000000000000000000..70320949ea649b37cafeebd57b197413d6346aa0
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
@@ -0,0 +1,120 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_solve_upper_triangular_f64.c
+ * Description:  Solve linear system UT X = A with UT upper triangular matrix
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixInv
+  @{
+ */
+
+/**
+   * @brief Solve UT . X = A where UT is an upper triangular matrix
+   * @param[in]  ut  The upper triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of UT . X = A
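+   * @return ARM_MATH_SUCCESS if X was computed, ARM_MATH_SINGULAR if the system can't be solved (zero on the diagonal of UT), ARM_MATH_SIZE_MISMATCH if ARM_MATH_MATRIX_CHECK is defined and the dimensions are inconsistent.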
+  */
+  arm_status arm_mat_solve_upper_triangular_f64(
+  const arm_matrix_instance_f64 * ut,
+  const arm_matrix_instance_f64 * a,
+  arm_matrix_instance_f64 * dst)
+  {
+  /* Scalar back-substitution for UT . X = A, one column of X at a time.
+     Returns ARM_MATH_SUCCESS, ARM_MATH_SINGULAR on a zero diagonal entry of
+     UT (dst may then be partly written) or, with ARM_MATH_MATRIX_CHECK
+     defined, ARM_MATH_SIZE_MISMATCH on inconsistent dimensions. */
+arm_status status;                             /* status of the solve */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+  /* All three matrices must be square and of the same order; dst is
+     checked too because the order n used below is read from it */
+  if ((ut->numRows != ut->numCols) ||
+      (a->numRows != a->numCols) ||
+      (ut->numRows != a->numRows) ||
+      (dst->numRows != ut->numRows) ||
+      (dst->numCols != ut->numCols)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+  {
+    int i,j,k,n;
+
+    n = dst->numRows;
+    float64_t *pX = dst->pData;
+    float64_t *pUT = ut->pData;
+    float64_t *pA = a->pData;
+
+    float64_t *ut_row;
+    float64_t *a_col;
+
+    for(j=0; j < n; j ++)
+    {
+       /* a_col[i * n] is A(i,j) */
+       a_col = &pA[j];
+
+       for(i=n-1; i >= 0 ; i--)
+       {
+            ut_row = &pUT[n*i];
+            float64_t tmp=a_col[i * n];
+
+            /* Subtract UT(i,k) * X(k,j) for the rows k > i solved so far */
+            for(k=n-1; k > i; k--)
+            {
+                tmp -= ut_row[k] * pX[n*k+j];
+            }
+
+            /* A zero diagonal entry makes the system unsolvable */
+            if (ut_row[i]==0.0f)
+            {
+              return(ARM_MATH_SINGULAR);
+            }
+            tmp = tmp / ut_row[i];
+            pX[i*n+j] = tmp;
+       }
+    }
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+
+/**
+  @} end of MatrixInv group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..559f2fde85ced0d428ebe6a6053329400a186add
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f16.c
@@ -0,0 +1,215 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_sub_f16.c
+ * Description:  Floating-point matrix subtraction
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixSub
+  @{
+ */
+
+/**
+  @brief         Floating-point matrix subtraction.
+  @param[in]     pSrcA      points to the first input matrix structure
+  @param[in]     pSrcB      points to the second input matrix structure
+  @param[out]    pDst       points to output matrix structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+arm_status arm_mat_sub_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+  arm_matrix_instance_f16 * pDst)
+{
+    arm_status status;                             /* status of matrix subtraction */
+    uint32_t  numSamples;       /* total number of elements in the matrix  */
+    float16_t *pDataA, *pDataB, *pDataDst;
+    f16x8_t vecA, vecB, vecDst;
+    float16_t const *pSrcAVec;
+    float16_t const *pSrcBVec;
+    uint32_t  blkCnt;           /* loop counters */
+    /* Helium (MVE) version: eight half-precision elements per vector */
+    pDataA = pSrcA->pData;
+    pDataB = pSrcB->pData;
+    pDataDst = pDst->pData;
+    pSrcAVec = (float16_t const *) pDataA;
+    pSrcBVec = (float16_t const *) pDataB;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numRows != pSrcB->numRows) ||
+     (pSrcA->numCols != pSrcB->numCols) ||
+     (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
+  {
+    /*
+     * Total number of samples in the input matrix
+     */
+    numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+    blkCnt = numSamples >> 3;
+    while (blkCnt > 0U)
+    {
+        /* C(m,n) = A(m,n) - B(m,n) */
+        /* sub and then store the results in the destination buffer. */
+        vecA = vld1q(pSrcAVec);
+        pSrcAVec += 8;
+        vecB = vld1q(pSrcBVec);
+        pSrcBVec += 8;
+        vecDst = vsubq(vecA, vecB);
+        vst1q(pDataDst, vecDst);
+        pDataDst += 8;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail: loads and subtraction are predicated like the store, so nothing
+     * past the last element is read and vecDst is never read uninitialized
+     */
+    blkCnt = numSamples & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecA = vld1q_z(pSrcAVec, p0);
+        vecB = vld1q_z(pSrcBVec, p0);
+        vecDst = vsubq_x(vecA, vecB, p0);
+        vstrhq_p(pDataDst, vecDst, p0);
+    }
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+#else
+
+arm_status arm_mat_sub_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+        arm_matrix_instance_f16 * pDst)
+{
+  /*
+   * Plain C implementation: the three data buffers are walked in lockstep
+   * and each output element is the difference of the two input elements
+   * found at the same position.
+   */
+
+  /* Read cursor over the data of the first operand (A) */
+  float16_t *pLhs = pSrcA->pData;
+  /* Read cursor over the data of the second operand (B) */
+  float16_t *pRhs = pSrcB->pData;
+  /* Write cursor over the data of the result */
+  float16_t *pRes = pDst->pData;
+  /* Number of elements in each matrix */
+  uint32_t total;
+  /* Elements left for the loop currently running */
+  uint32_t remaining;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* The three matrices must have identical dimensions */
+  if (!((pSrcA->numRows == pSrcB->numRows) &&
+        (pSrcA->numCols == pSrcB->numCols) &&
+        (pSrcA->numRows == pDst->numRows)  &&
+        (pSrcA->numCols == pDst->numCols)    ))
+  {
+    /* Dimensions disagree: nothing is computed */
+    return (ARM_MATH_SIZE_MISMATCH);
+  }
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  /* Element count of the input matrices */
+  total = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Unrolled part: four differences per iteration */
+  for (remaining = total >> 2U; remaining > 0U; remaining--)
+  {
+    /* C(m,n) = A(m,n) - B(m,n) */
+    pRes[0] = pLhs[0] - pRhs[0];
+    pRes[1] = pLhs[1] - pRhs[1];
+    pRes[2] = pLhs[2] - pRhs[2];
+    pRes[3] = pLhs[3] - pRhs[3];
+
+    /* Step all three cursors past the four elements just handled */
+    pLhs += 4;
+    pRhs += 4;
+    pRes += 4;
+  }
+
+  /* What the unrolled loop left over */
+  remaining = total % 0x4U;
+
+#else
+
+  /* No unrolling: every element goes through the loop below */
+  remaining = total;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* One element per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    /* C(m,n) = A(m,n) - B(m,n) */
+    *pRes = *pLhs - *pRhs;
+    pRes++;
+    pLhs++;
+    pRhs++;
+  }
+
+  /* Every element was produced */
+  return (ARM_MATH_SUCCESS);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of MatrixSub group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f32.c
index d76537bc8e1a01d7569dab36fd4a5761fb42d5a6..e9a17d1bd225497f5cbb93dac931dd3add8e82cd 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f32.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_sub_f32.c
  * Description:  Floating-point matrix subtraction
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f64.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f64.c
new file mode 100644
index 0000000000000000000000000000000000000000..245a76d075d3598f94704152b174dbfe28dc3edf
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f64.c
@@ -0,0 +1,143 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_sub_f64.c
+ * Description:  Floating-point matrix subtraction
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @defgroup MatrixSub Matrix Subtraction
+
+  Subtract two matrices.
+  \image html MatrixSubtraction.gif "Subtraction of two 3 x 3 matrices"
+
+  The functions check to make sure that
+  <code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
+  number of rows and columns.
+ */
+
+/**
+  @addtogroup MatrixSub
+  @{
+ */
+
+/**
+  @brief         Floating-point matrix subtraction.
+  @param[in]     pSrcA      points to the first input matrix structure
+  @param[in]     pSrcB      points to the second input matrix structure
+  @param[out]    pDst       points to output matrix structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+
+arm_status arm_mat_sub_f64(
+  const arm_matrix_instance_f64 * pSrcA,
+  const arm_matrix_instance_f64 * pSrcB,
+        arm_matrix_instance_f64 * pDst)
+{
+  /*
+   * Element-wise difference of two double precision matrices: output
+   * element number idx is A[idx] - B[idx], the three data buffers being
+   * addressed with one common index. Elements are produced in increasing
+   * index order, four per iteration first when ARM_MATH_LOOPUNROLL is
+   * defined, then one per iteration.
+   */
+
+  /* Data of the first operand (A), only read */
+  const float64_t *pLhs = pSrcA->pData;
+  /* Data of the second operand (B), only read */
+  const float64_t *pRhs = pSrcB->pData;
+  /* Data of the result */
+  float64_t *pRes = pDst->pData;
+
+  /* Number of elements in each matrix */
+  uint64_t total;
+  /* Index of the next element to compute */
+  uint64_t idx = 0U;
+  /* Value handed back to the caller */
+  arm_status status;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* The three matrices must have identical dimensions */
+  if (!((pSrcA->numRows == pSrcB->numRows) &&
+        (pSrcA->numCols == pSrcB->numCols) &&
+        (pSrcA->numRows == pDst->numRows)  &&
+        (pSrcA->numCols == pDst->numCols)    ))
+  {
+    /* Dimensions disagree: nothing is computed */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Element count of the input matrices */
+    total = (uint64_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Unrolled part: runs while at least four elements are left and
+       produces four differences per iteration */
+    while ((total - idx) >= 4U)
+    {
+      /* C(m,n) = A(m,n) - B(m,n) */
+      pRes[idx]      = pLhs[idx]      - pRhs[idx];
+      pRes[idx + 1U] = pLhs[idx + 1U] - pRhs[idx + 1U];
+      pRes[idx + 2U] = pLhs[idx + 2U] - pRhs[idx + 2U];
+      pRes[idx + 3U] = pLhs[idx + 3U] - pRhs[idx + 3U];
+
+      /* Move past the four elements just handled */
+      idx += 4U;
+    }
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    /* Everything when unrolling is disabled, otherwise the elements the
+       unrolled loop left over */
+    for (; idx < total; idx++)
+    {
+      /* C(m,n) = A(m,n) - B(m,n) */
+      pRes[idx] = pLhs[idx] - pRhs[idx];
+    }
+
+    /* Every element was produced */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+/**
+  @} end of MatrixSub group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_q15.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_q15.c
index b0c7bf37f4728229cda340792c81a3e824087e4c..c9f9c83059318599b4d0780177b188214b8e5583 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_q15.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_sub_q15.c
  * Description:  Q15 Matrix subtraction
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -50,7 +50,7 @@
                    The function uses saturating arithmetic.
                    Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 arm_status arm_mat_sub_q15(
   const arm_matrix_instance_q15 * pSrcA,
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_q31.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_q31.c
index 5356ce794d6f0050d872062b4983354690c4c59f..5045b298ed63d806012ef8ca81f67c15f9b3b655 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_q31.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_sub_q31.c
  * Description:  Q31 matrix subtraction
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -50,7 +50,7 @@
                    The function uses saturating arithmetic.
                    Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 arm_status arm_mat_sub_q31(
   const arm_matrix_instance_q31 * pSrcA,
   const arm_matrix_instance_q31 * pSrcB,
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..8a41ccbc14aa21adeec6c232620a5a31ee39c396
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f16.c
@@ -0,0 +1,202 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_trans_f16.c
+ * Description:  Floating-point matrix transpose
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @addtogroup MatrixTrans
+  @{
+ */
+
+/**
+  @brief         Floating-point matrix transpose.
+  @param[in]     pSrc      points to input matrix
+  @param[out]    pDst      points to output matrix
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+arm_status arm_mat_trans_f16(
+  const arm_matrix_instance_f16 * pSrc,
+  arm_matrix_instance_f16 * pDst)
+{
+  /* The transpose helpers called below take the element data as uint16_t pointers */
+  uint16_t *pIn16 = (uint16_t  *)pSrc->pData;
+  uint16_t *pOut16 = (uint16_t  *)pDst->pData;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* The output must have the dimensions of the input, swapped */
+  if (!((pSrc->numRows == pDst->numCols) &&
+        (pSrc->numCols == pDst->numRows)   ))
+  {
+    return (ARM_MATH_SIZE_MISMATCH);
+  }
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  /* Square matrices of order 1 to 4 are dispatched to dedicated code */
+  if (pDst->numRows == pDst->numCols)
+  {
+    switch (pDst->numCols)
+    {
+      case 1:
+        pDst->pData[0] = pSrc->pData[0];
+        return(ARM_MATH_SUCCESS);
+      case 2:
+        return arm_mat_trans_16bit_2x2(pIn16, pOut16);
+      case 3:
+        return arm_mat_trans_16bit_3x3_mve(pIn16, pOut16);
+      case 4:
+        return arm_mat_trans_16bit_4x4_mve(pIn16, pOut16);
+      default:
+        break;
+    }
+  }
+
+  /* Any other shape goes through the generic routine */
+  arm_mat_trans_16bit_generic(pSrc->numRows, pSrc->numCols, pIn16, pOut16);
+
+  return (ARM_MATH_SUCCESS);
+}
+
+#else
+
+arm_status arm_mat_trans_f16(
+  const arm_matrix_instance_f16 * pSrc,
+        arm_matrix_instance_f16 * pDst)
+{
+  /* Plain C implementation: the input is read sequentially, row by row, and
+     each input row is scattered down one column of the output. */
+  float16_t *pIn = pSrc->pData;                  /* input data matrix pointer */
+  float16_t *pOut = pDst->pData;                 /* output data matrix pointer */
+  float16_t *px;                                 /* Temporary output data matrix pointer */
+  uint16_t nRows = pSrc->numRows;                /* number of rows */
+  uint16_t nCols = pSrc->numCols;                /* number of columns */
+  uint32_t col, row = nRows, i = 0U;             /* Loop counters */
+  arm_status status;                             /* status of matrix transpose */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numCols) ||
+      (pSrc->numCols != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Matrix transpose by exchanging the rows with columns. The row loop is
+       entry-tested so that a matrix with no rows is left untouched. */
+    while (row > 0U)
+    {
+      /* Pointer px is set to starting address of column being processed */
+      px = pOut + i;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+      /* Loop unrolling: Compute 4 outputs at a time */
+      col = nCols >> 2U;
+
+      while (col > 0U)        /* column loop */
+      {
+        /* Read and store input element in destination */
+        *px = *pIn++;
+        /* Update pointer px to point to next row of transposed matrix */
+        px += nRows;
+
+        *px = *pIn++;
+        px += nRows;
+
+        *px = *pIn++;
+        px += nRows;
+
+        *px = *pIn++;
+        px += nRows;
+
+        /* Decrement column loop counter */
+        col--;
+      }
+
+      /* Loop unrolling: Compute remaining outputs */
+      col = nCols % 0x4U;
+
+#else
+
+      /* Initialize col with number of samples */
+      col = nCols;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+      while (col > 0U)
+      {
+        /* Read and store input element in destination */
+        *px = *pIn++;
+
+        /* Update pointer px to point to next row of transposed matrix */
+        px += nRows;
+
+        /* Decrement column loop counter */
+        col--;
+      }
+
+      /* The next input row goes to the next output column */
+      i++;
+      /* Decrement row loop counter */
+      row--;
+    }                            /* row loop end */
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixTrans group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f32.c
index 44047b96e5f30cb0ae7da32a5cc34d6f24ff3e77..b4116630d4933d1a7936ca246ce50b750f4e30ef 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f32.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_trans_f32.c
  * Description:  Floating-point matrix transpose
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f64.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f64.c
new file mode 100644
index 0000000000000000000000000000000000000000..57b5043fc5bd1f4e9d94a3f56ac4bdb553ddfb90
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f64.c
@@ -0,0 +1,155 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_trans_f64.c
+ * Description:  Floating-point matrix transpose
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @defgroup MatrixTrans Matrix Transpose
+
+  Transposes a matrix.
+
+  Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
+  \image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
+ */
+
+/**
+  @addtogroup MatrixTrans
+  @{
+ */
+
+/**
+  @brief         Floating-point matrix transpose.
+  @param[in]     pSrc      points to input matrix
+  @param[out]    pDst      points to output matrix
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+
+arm_status arm_mat_trans_f64(
+  const arm_matrix_instance_f64 * pSrc,
+        arm_matrix_instance_f64 * pDst)
+{
+  /*
+   * The source is read in storage order, one row after the other, and each
+   * element is scattered into the destination: element c of source row r is
+   * written to pDst->pData[c * numRows + r], numRows being the row count of
+   * the source.
+   *
+   * The dimension check is only compiled in when ARM_MATH_MATRIX_CHECK is
+   * defined. When ARM_MATH_LOOPUNROLL is defined the column loop is unrolled
+   * by four, with a plain loop for the remaining elements.
+   *
+   * The row loop is a do/while, so its body runs at least once whatever the
+   * value of numRows. ARM_MATH_SUCCESS is returned once it completes.
+   */
+
+  const uint16_t srcRows = pSrc->numRows;        /* rows of the source matrix */
+  const uint16_t srcCols = pSrc->numCols;        /* columns of the source matrix */
+
+  float64_t *pRead = pSrc->pData;                /* sequential read cursor */
+  float64_t *pDstBase = pDst->pData;             /* first element of the output */
+  float64_t *pWrite;                             /* strided write cursor */
+
+  uint64_t rowsLeft = srcRows;                   /* source rows still to copy */
+  uint64_t rowIdx = 0U;                          /* index of the current source row */
+  uint64_t colsLeft;                             /* elements left in the current row */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* The destination must have the transposed shape of the source */
+  if ((pSrc->numRows != pDst->numCols) ||
+      (pSrc->numCols != pDst->numRows)   )
+  {
+    /* Nothing is written to the destination in this case */
+    return (ARM_MATH_SIZE_MISMATCH);
+  }
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  /* One pass of this loop copies one source row into one destination column */
+  do
+  {
+    /* Source row rowIdx becomes destination column rowIdx */
+    pWrite = pDstBase + rowIdx;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Unrolled part: four elements per iteration */
+    for (colsLeft = srcCols >> 2U; colsLeft > 0U; colsLeft--)
+    {
+      /* Copy one element, then step down one destination row */
+      *pWrite = *pRead++;
+      pWrite += srcRows;
+
+      *pWrite = *pRead++;
+      pWrite += srcRows;
+
+      *pWrite = *pRead++;
+      pWrite += srcRows;
+
+      *pWrite = *pRead++;
+      pWrite += srcRows;
+    }
+
+    /* Up to three elements of the row remain after the unrolled part */
+    colsLeft = srcCols & 0x3U;
+
+#else
+
+    /* No unrolling: the loop below copies the whole row */
+    colsLeft = srcCols;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    for (; colsLeft > 0U; colsLeft--)
+    {
+      /* Copy one element, then step down one destination row */
+      *pWrite = *pRead++;
+      pWrite += srcRows;
+    }
+
+    /* Move on to the next source row / destination column */
+    rowIdx++;
+
+    /* Decrement row loop counter */
+    rowsLeft--;
+
+  } while (rowsLeft > 0U);
+
+  /* Return to application */
+  return (ARM_MATH_SUCCESS);
+}
+
+/**
+ * @} end of MatrixTrans group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q15.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q15.c
index 325511fe966834ffe568848ad691a4b0e024eb1a..e2c2c932a4fbd81ca252d2968b60232a77aa923b 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q15.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_trans_q15.c
  * Description:  Q15 matrix transpose
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -46,125 +46,10 @@
                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
  */
  
-#if defined(ARM_MATH_MVEI)
-
-__STATIC_INLINE arm_status arm_mat_trans_16bit_2x2(uint16_t * pDataSrc, uint16_t * pDataDest)
-{
-    pDataDest[0] = pDataSrc[0];
-    pDataDest[3] = pDataSrc[3];
-    pDataDest[2] = pDataSrc[1];
-    pDataDest[1] = pDataSrc[2];
-
-    return (ARM_MATH_SUCCESS);
-}
-
-static arm_status arm_mat_trans_16bit_3x3_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
-{
-    static const uint16_t stridesTr33[8] = { 0, 3, 6, 1, 4, 7, 2, 5 };
-    uint16x8_t    vecOffs1;
-    uint16x8_t    vecIn1;
-    /*
-     *
-     *  | 0   1   2 |       | 0   3   6 |  8 x 16 flattened version | 0   3   6   1   4   7   2   5 |
-     *  | 3   4   5 | =>    | 1   4   7 |            =>             | 8   .   .   .   .   .   .   . |
-     *  | 6   7   8 |       | 2   5   8 |       (row major)
-     *
-     */
-    vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr33);
-    vecIn1 = vldrhq_u16((uint16_t const *) pDataSrc);
-
-    vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
-
-    pDataDest[8] = pDataSrc[8];
-
-    return (ARM_MATH_SUCCESS);
-}
-
-
-static arm_status arm_mat_trans_16bit_4x4_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
-{
-    static const uint16_t stridesTr44_1[8] = { 0, 4, 8, 12, 1, 5, 9, 13 };
-    static const uint16_t stridesTr44_2[8] = { 2, 6, 10, 14, 3, 7, 11, 15 };
-    uint16x8_t    vecOffs1, vecOffs2;
-    uint16x8_t    vecIn1, vecIn2;
-    uint16_t const * pDataSrcVec = (uint16_t const *) pDataSrc;
-
-    /*
-     * 4x4 Matrix transposition
-     *
-     * | 0   1   2   3  |       | 0   4   8   12 |   8 x 16 flattened version
-     * | 4   5   6   7  |  =>   | 1   5   9   13 |   =>      [0   4   8   12  1   5   9   13]
-     * | 8   9   10  11 |       | 2   6   10  14 |           [2   6   10  14  3   7   11  15]
-     * | 12  13  14  15 |       | 3   7   11  15 |
-     */
-
-    vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr44_1);
-    vecOffs2 = vldrhq_u16((uint16_t const *) stridesTr44_2);
-    vecIn1 = vldrhq_u16(pDataSrcVec);
-    pDataSrcVec += 8;
-    vecIn2 = vldrhq_u16(pDataSrcVec);
-
-    vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
-    vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs2, vecIn2);
-
-
-    return (ARM_MATH_SUCCESS);
-}
-
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
+#include "arm_helium_utils.h"
 
-static arm_status arm_mat_trans_16bit_generic(
-    uint16_t    srcRows,
-    uint16_t    srcCols,
-    uint16_t  * pDataSrc,
-    uint16_t  * pDataDest)
-{
-    uint16x8_t    vecOffs;
-    uint32_t        i;
-    uint32_t        blkCnt;
-    uint16_t const *pDataC;
-    uint16_t       *pDataDestR;
-    uint16x8_t    vecIn;
-
-    vecOffs = vidupq_u16((uint32_t)0, 1);
-    vecOffs = vecOffs * srcCols;
-
-    i = srcCols;
-    while(i > 0U)
-    {
-        pDataC = (uint16_t const *) pDataSrc;
-        pDataDestR = pDataDest;
-
-        blkCnt = srcRows >> 3;
-        while (blkCnt > 0U)
-        {
-            vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
-            vstrhq_u16(pDataDestR, vecIn); 
-            pDataDestR += 8;
-            pDataC = pDataC + srcCols * 8;
-            /*
-             * Decrement the blockSize loop counter
-             */
-            blkCnt--;
-        }
-
-        /*
-         * tail
-         */
-        blkCnt = srcRows & 7;
-        if (blkCnt > 0U)
-        {
-            mve_pred16_t p0 = vctp16q(blkCnt);
-            vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
-            vstrhq_p_u16(pDataDestR, vecIn, p0);
-        }
-        pDataSrc += 1;
-        pDataDest += srcRows;
-        i--;
-    }
-
-    return (ARM_MATH_SUCCESS);
-}
 
 
 arm_status arm_mat_trans_q15(
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q31.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q31.c
index 9d92abde02bd69c628413168562be7f1f10c845a..2c77254704a16987a3d67de4ffd3094b24a8e8b1 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q31.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_mat_trans_q31.c
  * Description:  Q31 matrix transpose
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/matrix_functions.h"
 
 /**
   @ingroup groupMatrix
@@ -45,7 +45,7 @@
                    - \ref ARM_MATH_SUCCESS       : Operation successful
                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q7.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q7.c
new file mode 100644
index 0000000000000000000000000000000000000000..a500d055ebfc856af10deefd4a2ddaf52ed842a3
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q7.c
@@ -0,0 +1,171 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_trans_q7.c
+ * Description:  Q7 matrix transpose
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @addtogroup MatrixTrans
+  @{
+ */
+
+/**
+  @brief         Q7 matrix transpose.
+  @param[in]     pSrc      points to input matrix
+  @param[out]    pDst      points to output matrix
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+arm_status arm_mat_trans_q7(const arm_matrix_instance_q7 *pSrc, arm_matrix_instance_q7 *pDst)
+{
+    /* MVE variant: every outer pass gathers one source column (stride numCols
+     * bytes) and writes it out as one contiguous destination row. */
+    uint16x8_t    vecOffs;          /* byte offsets of 8 consecutive rows */
+    uint32_t        i;              /* source columns left to process */
+    uint32_t        blkCnt;         /* vector / tail element counter */
+    uint8_t const  *pDataC;         /* read cursor inside the current column */
+    uint8_t        *pDataDestR;     /* write cursor inside the current row */
+    uint16x8_t    vecIn;            /* gathered bytes, one per 16-bit lane */
+
+    const uint8_t   * pDataSrc=(const uint8_t  *)pSrc->pData;
+    uint8_t   * pDataDst=(uint8_t  *)pDst->pData;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+    /* Check for matrix mismatch condition */
+    if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+    {
+        /* Set status as ARM_MATH_SIZE_MISMATCH */
+        return ARM_MATH_SIZE_MISMATCH;
+    }
+#endif
+
+    /* Lane k = k * numCols (16-bit lanes): byte offset of the k-th following row */
+    vecOffs = vidupq_u16((uint32_t)0, 1);
+    vecOffs = vecOffs * pSrc->numCols;
+
+    i = pSrc->numCols;
+    do
+    {
+        pDataC = (uint8_t const *) pDataSrc;
+        pDataDestR = (uint8_t*)pDataDst;
+
+        /* Full vectors: 8 rows of the current column per iteration */
+        blkCnt = pSrc->numRows >> 3;
+        while (blkCnt > 0U)
+        {
+            /* widened loads, narrowing store */
+            vecIn = vldrbq_gather_offset_u16(pDataC, vecOffs);
+            vstrbq_u16(pDataDestR, vecIn);
+            pDataDestR += 8;
+            pDataC = pDataC + pSrc->numCols * 8;
+            blkCnt--;
+        }
+
+        /*
+         * Tail: fewer than 8 rows remain. The gather and the store are both
+         * predicated, so inactive lanes are neither loaded nor stored.
+         */
+        blkCnt = pSrc->numRows & 7;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp16q(blkCnt);
+            vecIn = vldrbq_gather_offset_z_u16(pDataC, vecOffs, p0);
+            vstrbq_p_u16(pDataDestR, vecIn, p0);
+        }
+        pDataSrc += 1;
+        pDataDst += pSrc->numRows;
+    }
+    while (--i);
+
+    return (ARM_MATH_SUCCESS);
+}
+#else
+arm_status arm_mat_trans_q7(const arm_matrix_instance_q7 *pSrc, arm_matrix_instance_q7 *pDst)
+{
+    /*
+     * Scalar variant. The source is read in storage order; element c of source
+     * row r is written to pDst->pData[c * numRows + r], numRows being the row
+     * count of the source.
+     *
+     * The dimension check is only compiled in when ARM_MATH_MATRIX_CHECK is
+     * defined. The row loop is a do/while, so its body runs at least once
+     * whatever the value of numRows.
+     */
+
+    const uint16_t srcRows = pSrc->numRows;   /* rows of the source matrix */
+    const uint16_t srcCols = pSrc->numCols;   /* columns of the source matrix */
+
+    q7_t *pRead = pSrc->pData;                /* sequential read cursor */
+    q7_t *pWrite;                             /* strided write cursor */
+    uint16_t rowsLeft = srcRows;              /* source rows still to copy */
+    uint16_t rowIdx = 0U;                     /* index of the current source row */
+    uint16_t colsLeft;                        /* elements left in the current row */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+    /* The destination must have the transposed shape of the source */
+    if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows)) {
+        /* Nothing is written to the destination in this case */
+        return (ARM_MATH_SIZE_MISMATCH);
+    }
+#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
+
+    /* Each pass copies one source row into one destination column */
+    do {
+        /* Source row rowIdx becomes destination column rowIdx */
+        pWrite = pDst->pData + rowIdx;
+
+        for (colsLeft = srcCols; colsLeft > 0U; colsLeft--) {
+            /* Copy one element */
+            *pWrite = *pRead++;
+
+            /* Step down one row of the transposed matrix */
+            pWrite += srcRows;
+        }
+
+        /* Move on to the next source row / destination column */
+        rowIdx++;
+
+        /* Decrement the row loop counter */
+        rowsLeft--;
+
+    } while (rowsLeft > 0U);
+
+    /* Return to application */
+    return (ARM_MATH_SUCCESS);
+}
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/**
+  @} end of MatrixTrans group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..badf530a0f1ac02e825b1d0c7fef538aad04bcfb
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_f16.c
@@ -0,0 +1,396 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_vec_mult_f16.c
+ * Description:  Floating-point matrix and vector multiplication
+ *
+ * $Date:        23 April 2021
+ *
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ * @ingroup groupMatrix
+ */
+
+
+/**
+ * @addtogroup MatrixVectMult
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix and vector multiplication.
+ * @param[in]       *pSrcMat points to the input matrix structure
+ * @param[in]       *pVec points to input vector
+ * @param[out]      *pDst points to output vector
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_mat_vec_mult_f16(
+    const arm_matrix_instance_f16   *pSrcMat,
+    const float16_t                 *pSrcVec,
+    float16_t                       *pDstVec)
+{
+    uint32_t         numRows = pSrcMat->numRows;
+    uint32_t         numCols = pSrcMat->numCols;
+    const float16_t *pSrcA = pSrcMat->pData;   /* first row not yet processed */
+    const float16_t *pInA0;
+    const float16_t *pInA1;
+    float16_t       *px;                       /* output write cursor */
+    int32_t          row;                      /* rows left to process */
+    uint32_t         blkCnt;           /* loop counters */
+
+    row = numRows;
+    px = pDstVec;
+
+    /*
+     * pDstVec[r] = dot(matrix row r, pSrcVec). First, 4 rows in parallel.
+     */
+    while (row >= 4)
+    {
+        const float16_t     *pInA2, *pInA3;
+        float16_t const    *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+        f16x8_t            vecIn, acc0, acc1, acc2, acc3;
+        float16_t const     *pSrcVecPtr = pSrcVec;
+
+        /*
+         * Initialize the pointers to 4 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec =  pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        acc2 = vdupq_n_f16(0.0f);
+        acc3 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+        pSrcA2Vec = pInA2;
+        pSrcA3Vec = pInA3;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U)
+        {
+            f16x8_t vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 8;
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 8;
+            acc3 = vfmaq(acc3, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: fewer than 8 columns remain; every load is predicated so that
+         * lanes past the end of the row / vector are zero and are not read
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp16q(blkCnt);
+            f16x8_t vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+            acc3 = vfmaq(acc3, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        *px++ = vecAddAcrossF16Mve(acc0);
+        *px++ = vecAddAcrossF16Mve(acc1);
+        *px++ = vecAddAcrossF16Mve(acc2);
+        *px++ = vecAddAcrossF16Mve(acc3);
+
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parallel
+     */
+    if (row >= 2)
+    {
+        float16_t const    *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+        f16x8_t            vecIn, acc0, acc1;
+        float16_t const     *pSrcVecPtr = pSrcVec;
+
+        /*
+         * Initialize the pointers to 2 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U)
+        {
+            f16x8_t vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: fewer than 8 columns remain; every load is predicated so that
+         * lanes past the end of the row / vector are zero and are not read
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp16q(blkCnt);
+            f16x8_t vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        *px++ = vecAddAcrossF16Mve(acc0);
+        *px++ = vecAddAcrossF16Mve(acc1);
+
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    if (row >= 1)
+    {
+        f16x8_t             vecIn, acc0;
+        float16_t const     *pSrcA0Vec, *pInVec;
+        float16_t const      *pSrcVecPtr = pSrcVec;
+        /*
+         * Initialize the pointers to last MatrixA row
+         */
+        pInA0 = pSrcA;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U)
+        {
+            f16x8_t vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: fewer than 8 columns remain; every load is predicated so that
+         * lanes past the end of the row / vector are zero and are not read
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp16q(blkCnt);
+            f16x8_t vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        *px++ = vecAddAcrossF16Mve(acc0);
+    }
+}
+#else
+void arm_mat_vec_mult_f16(const arm_matrix_instance_f16 *pSrcMat, const float16_t *pVec, float16_t *pDst)
+{
+    uint32_t numRows = pSrcMat->numRows;
+    uint32_t numCols = pSrcMat->numCols;
+    const float16_t *pSrcA = pSrcMat->pData;
+    const float16_t *pInA1;      /* read cursor in row 1 of the current row group */
+    const float16_t *pInA2;      /* read cursor in row 2 of the current row group */
+    const float16_t *pInA3;      /* read cursor in row 3 of the current row group */
+    const float16_t *pInA4;      /* read cursor in row 4 of the current row group */
+    const float16_t *pInVec;     /* read cursor in the input vector */
+    float16_t *px;               /* Temporary output data matrix pointer */
+    uint32_t i;                  /* element offset of the current row in pSrcA; 32-bit as it can exceed 65535 (e.g. 256 x 256) */
+    uint16_t row, colCnt;        /* loop counters */
+    float16_t matData, matData2, vecData, vecData2;
+
+    /* Process 4 rows at a time */
+    row = numRows >> 2;
+    i = 0u;
+    px = pDst;
+
+    /* The following loop performs the dot-product of each row in pSrcA with the vector */
+    /* row loop */
+    while (row > 0) {
+        /* For every row wise process, the pInVec pointer is set
+         ** to the starting address of the vector */
+        pInVec = pVec;
+
+        /* Initialize accumulators */
+        float16_t sum1 = 0.0f;
+        float16_t sum2 = 0.0f;
+        float16_t sum3 = 0.0f;
+        float16_t sum4 = 0.0f;
+
+        /* One column per iteration (the column loop is not unrolled here) */
+        colCnt = numCols;
+
+        /* Initialize pointers to the start of the 4 rows being processed */
+        pInA1 = pSrcA + i;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        pInA4 = pInA3 + numCols;
+
+
+        /* Main loop: matrix-vector multiplication */
+        while (colCnt > 0u) {
+            /* Read one value from the vector */
+            vecData = *(pInVec)++;
+            /* Multiply it with one value from each of the 4 rows and accumulate */
+            matData = *(pInA1)++;
+            sum1 += matData * vecData;
+            matData = *(pInA2)++;
+            sum2 += matData * vecData;
+            matData = *(pInA3)++;
+            sum3 += matData * vecData;
+            matData = *(pInA4)++;
+            sum4 += matData * vecData;
+
+            /* Decrement the loop counter */
+            colCnt--;
+        }
+
+        /* Store the four dot products in the destination buffer */
+        *px++ = sum1;
+        *px++ = sum2;
+        *px++ = sum3;
+        *px++ = sum4;
+
+        i = i + numCols * 4;
+
+        /* Decrement the row loop counter */
+        row--;
+    }
+
+    /* process any remaining rows (at most 3), one row at a time */
+    row = numRows & 3u;
+    while (row > 0) {
+
+        float16_t sum = 0.0f;
+        pInVec = pVec;
+        pInA1 = pSrcA + i;
+
+        colCnt = numCols >> 1;
+
+        while (colCnt > 0) {
+            vecData = *(pInVec)++;
+            vecData2 = *(pInVec)++;
+            matData = *(pInA1)++;
+            matData2 = *(pInA1)++;
+            sum += matData * vecData;
+            sum += matData2 * vecData2;
+            colCnt--;
+        }
+        /* process remainder of row (one element when numCols is odd) */
+        colCnt = numCols & 1u;
+        while (colCnt > 0) {
+            sum += *pInA1++ * *pInVec++;
+            colCnt--;
+        }
+
+        *px++ = sum;
+        i = i + numCols;
+        row--;
+    }
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixVectMult group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..03a94f6d7afa21ec91abbcc643bfc5e8de4273df
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_f32.c
@@ -0,0 +1,399 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_vec_mult_f32.c
+ * Description:  Floating-point matrix and vector multiplication
+ *
+ * $Date:        23 April 2021
+ *
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @defgroup MatrixVectMult Matrix Vector Multiplication
+ *
+ * Multiplies a matrix and a vector.
+ *
+ */
+
+/**
+ * @addtogroup MatrixVectMult
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix and vector multiplication.
+ * @param[in]       *pSrcMat points to the input matrix structure
+ * @param[in]       *pVec points to input vector
+ * @param[out]      *pDst points to output vector
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_mat_vec_mult_f32(
+    const arm_matrix_instance_f32   *pSrcMat,
+    const float32_t                 *pSrcVec,
+    float32_t                       *pDstVec)
+{
+    uint32_t         numRows = pSrcMat->numRows;
+    uint32_t         numCols = pSrcMat->numCols;
+    const float32_t *pSrcA = pSrcMat->pData;   /* first row not yet processed */
+    const float32_t *pInA0;
+    const float32_t *pInA1;
+    float32_t       *px;                       /* next output element to write */
+    int32_t          row;                      /* rows still to be processed */
+    uint32_t         blkCnt;                   /* column loop counter */
+
+    row = numRows;
+    px = pDstVec;
+
+    /*
+     * pDstVec[r] = dot(row r of pSrcMat, pSrcVec); main pass handles 4 rows at once
+     */
+    while (row >= 4)
+    {
+        const float32_t     *pInA2, *pInA3;
+        float32_t const    *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+        f32x4_t            vecIn, acc0, acc1, acc2, acc3;
+        float32_t const     *pSrcVecPtr = pSrcVec;
+
+        /*
+         * Initialize the pointers to 4 consecutive matrix rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        /*
+         * Rewind the vector pointer to the start of the input vector
+         */
+        pInVec =  pSrcVecPtr;
+        /*
+         * Reset the per-row vector accumulators
+         */
+        acc0 = vdupq_n_f32(0.0f);
+        acc1 = vdupq_n_f32(0.0f);
+        acc2 = vdupq_n_f32(0.0f);
+        acc3 = vdupq_n_f32(0.0f);
+
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+        pSrcA2Vec = pInA2;
+        pSrcA3Vec = pInA3;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U)
+        {
+            f32x4_t vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 4;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 4;
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 4;
+            acc3 = vfmaq(acc3, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * Tail: numCols & 3 leftover columns. Every load is predicated so the
+         * lanes past the end of a row are zeroed instead of being read
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+            f32x4_t vecA;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA2Vec, p0);
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA3Vec, p0);
+            acc3 = vfmaq(acc3, vecIn, vecA);
+        }
+        /*
+         * Reduce each vector accumulator to one scalar output
+         */
+        *px++ = vecAddAcrossF32Mve(acc0);
+        *px++ = vecAddAcrossF32Mve(acc1);
+        *px++ = vecAddAcrossF32Mve(acc2);
+        *px++ = vecAddAcrossF32Mve(acc3);
+
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parallel
+     */
+    if (row >= 2)
+    {
+        float32_t const    *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+        f32x4_t            vecIn, acc0, acc1;
+        float32_t const     *pSrcVecPtr = pSrcVec;
+
+        /*
+         * Initialize the pointers to 2 consecutive matrix rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        /*
+         * Rewind the vector pointer to the start of the input vector
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * Reset the per-row vector accumulators
+         */
+        acc0 = vdupq_n_f32(0.0f);
+        acc1 = vdupq_n_f32(0.0f);
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U)
+        {
+            f32x4_t vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 4;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * Tail: numCols & 3 leftover columns. Every load is predicated so the
+         * lanes past the end of a row are zeroed instead of being read
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+            f32x4_t vecA;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+        }
+        /*
+         * Reduce each vector accumulator to one scalar output
+         */
+        *px++ = vecAddAcrossF32Mve(acc0);
+        *px++ = vecAddAcrossF32Mve(acc1);
+
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    if (row >= 1)
+    {
+        f32x4_t             vecIn, acc0;
+        float32_t const     *pSrcA0Vec, *pInVec;
+        float32_t const      *pSrcVecPtr = pSrcVec;
+        /*
+         * Initialize the pointer to the last matrix row
+         */
+        pInA0 = pSrcA;
+        /*
+         * Rewind the vector pointer to the start of the input vector
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * Reset the vector accumulator
+         */
+        acc0 = vdupq_n_f32(0.0f);
+
+        pSrcA0Vec = pInA0;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U)
+        {
+            f32x4_t vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * Tail: numCols & 3 leftover columns. Every load is predicated so the
+         * lanes past the end of the row (and of the matrix) are zeroed, not read
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+            f32x4_t vecA;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+        }
+        /*
+         * Reduce the vector accumulator to one scalar output
+         */
+        *px++ = vecAddAcrossF32Mve(acc0);
+    }
+}
+#else
+
+void arm_mat_vec_mult_f32(const arm_matrix_instance_f32 *pSrcMat, const float32_t *pVec, float32_t *pDst)
+{
+    uint32_t numRows = pSrcMat->numRows;
+    uint32_t numCols = pSrcMat->numCols;
+    const float32_t *pSrcA = pSrcMat->pData;
+    const float32_t *pInA1;      /* read pointer into matrix row 1 of the current group */
+    const float32_t *pInA2;      /* read pointer into matrix row 2 of the current group */
+    const float32_t *pInA3;      /* read pointer into matrix row 3 of the current group */
+    const float32_t *pInA4;      /* read pointer into matrix row 4 of the current group */
+    const float32_t *pInVec;     /* read pointer into the input vector */
+    float32_t *px;               /* Temporary output vector pointer */
+    uint32_t i, row, colCnt; /* i is an element offset up to numRows*numCols: needs 32 bits */
+    float32_t matData, matData2, vecData, vecData2;
+
+
+    /* Process 4 rows at a time */
+    row = numRows >> 2;
+    i = 0u;
+    px = pDst;
+
+    /* The following loop performs the dot-product of each row in pSrcA with the vector */
+    /* row loop */
+    while (row > 0) {
+        /* For every row wise process, the pInVec pointer is set
+         ** to the starting address of the vector */
+        pInVec = pVec;
+
+        /* Initialize accumulators */
+        float32_t sum1 = 0.0f;
+        float32_t sum2 = 0.0f;
+        float32_t sum3 = 0.0f;
+        float32_t sum4 = 0.0f;
+
+        /* One column is processed per iteration (no column unrolling here) */
+        colCnt = numCols;
+
+        /* Initialize pointers to the start of the 4 rows being processed */
+        pInA1 = pSrcA + i;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        pInA4 = pInA3 + numCols;
+
+
+        // Main loop: matrix-vector multiplication
+        while (colCnt > 0u) {
+            // Read 1 value from the vector
+            vecData = *(pInVec)++;
+            // Read 1 value from each of the 4 rows and do multiply accumulate
+            matData = *(pInA1)++;
+            sum1 += matData * vecData;
+            matData = *(pInA2)++;
+            sum2 += matData * vecData;
+            matData = *(pInA3)++;
+            sum3 += matData * vecData;
+            matData = *(pInA4)++;
+            sum4 += matData * vecData;
+
+            // Decrement the loop counter
+            colCnt--;
+        }
+
+        /* Store the 4 results in the destination buffer */
+        *px++ = sum1;
+        *px++ = sum2;
+        *px++ = sum3;
+        *px++ = sum4;
+
+        i = i + numCols * 4;
+
+        /* Decrement the row loop counter */
+        row--;
+    }
+
+    /* process any remaining rows */
+    row = numRows & 3u;
+    while (row > 0) {
+
+        float32_t sum = 0.0f;
+        pInVec = pVec;
+        pInA1 = pSrcA + i;
+
+        /* Loop unrolling: process 2 columns per iteration */
+        colCnt = numCols >> 1;
+        while (colCnt > 0) {
+            vecData = *(pInVec)++;
+            vecData2 = *(pInVec)++;
+            matData = *(pInA1)++;
+            matData2 = *(pInA1)++;
+            sum += matData * vecData;
+            sum += matData2 * vecData2;
+            colCnt--;
+        }
+        // process remainder of row
+        colCnt = numCols & 1u;
+
+
+        while (colCnt > 0) {
+            sum += *pInA1++ * *pInVec++;
+            colCnt--;
+        }
+
+        *px++ = sum;
+        i = i + numCols;
+        row--;
+    }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixVectMult group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q15.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q15.c
new file mode 100644
index 0000000000000000000000000000000000000000..92be9aec6a69bea99f93db5701d75a2ca2772e41
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q15.c
@@ -0,0 +1,388 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_vec_mult_q15.c
+ * Description:  Q15 matrix and vector multiplication
+ *
+ * $Date:        23 April 2021
+ *
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+
+
+/**
+ * @addtogroup MatrixVectMult
+ * @{
+ */
+
+/**
+ * @brief Q15 matrix and vector multiplication.
+ * @param[in]       *pSrcMat points to the input matrix structure
+ * @param[in]       *pVec points to input vector
+ * @param[out]      *pDst points to output vector
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_mat_vec_mult_q15(
+    const arm_matrix_instance_q15 * pSrcMat,
+    const q15_t     *pSrcVec,
+    q15_t           *pDstVec)
+{
+    const q15_t *pMatSrc = pSrcMat->pData;   /* first row not yet processed */
+    const q15_t *pMat0, *pMat1;
+    uint32_t     numRows = pSrcMat->numRows;
+    uint32_t     numCols = pSrcMat->numCols;
+    q15_t       *px;                         /* next output element to write */
+    int32_t      row;                        /* rows still to be processed */
+    uint32_t     blkCnt;                     /* column loop counter */
+
+    row = numRows;
+    px = pDstVec;
+
+    /*
+     * compute 3 rows per loop, one 64-bit accumulator per row
+     */
+    while (row >= 3)
+    {
+        q15_t const *pMat0Vec, *pMat1Vec, *pMat2Vec, *pVec;
+        const q15_t  *pMat2;
+        q15_t const  *pSrcVecPtr = pSrcVec;
+        q63_t         acc0, acc1, acc2;
+        q15x8_t     vecMatA0, vecMatA1, vecMatA2, vecIn;
+
+
+        pVec = pSrcVec;
+        /*
+         * Point pMat0..pMat2 at the start of the 3 consecutive rows being processed
+         */
+        pMat0 = pMatSrc;
+        pMat1 = pMat0 + numCols;
+        pMat2 = pMat1 + numCols;
+
+        acc0 = 0LL;
+        acc1 = 0LL;
+        acc2 = 0LL;
+
+        pMat0Vec = pMat0;
+        pMat1Vec = pMat1;
+        pMat2Vec = pMat2;
+        pVec = pSrcVecPtr;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 8;
+            vecMatA1 = vld1q(pMat1Vec);
+            pMat1Vec += 8;
+            vecMatA2 = vld1q(pMat2Vec);
+            pMat2Vec += 8;
+            vecIn = vld1q(pVec);
+            pVec += 8;
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+            acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
+
+            blkCnt--;
+        }
+        /*
+         * Tail: numCols & 7 leftover columns. Every load is predicated so the
+         * lanes past the end of a row are zeroed instead of being read
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp16q(blkCnt);
+
+            vecMatA0 = vldrhq_z_s16(pMat0Vec, p0);
+            vecMatA1 = vldrhq_z_s16(pMat1Vec, p0);
+            vecMatA2 = vldrhq_z_s16(pMat2Vec, p0);
+            vecIn = vldrhq_z_s16(pVec, p0);
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+            acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
+        }
+
+        *px++ = MVE_ASRL_SAT16(acc0, 15);
+        *px++ = MVE_ASRL_SAT16(acc1, 15);
+        *px++ = MVE_ASRL_SAT16(acc2, 15);
+
+        pMatSrc += numCols * 3;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 3;
+    }
+
+    /*
+     * process a remaining pair of rows, if any
+     */
+    if (row >= 2)
+    {
+        q15_t const *pMat0Vec, *pMat1Vec, *pVec;
+        q15_t const  *pSrcVecPtr = pSrcVec;
+        q63_t         acc0, acc1;
+        q15x8_t     vecMatA0, vecMatA1, vecIn;
+
+        /*
+         * For every row wise process, the pVec pointer is set
+         * to the starting address of the vector
+         */
+        pVec = pSrcVec;
+
+        /*
+         * Point pMat0 and pMat1 at the start of the 2 consecutive rows being processed
+         */
+        pMat0 = pMatSrc;
+        pMat1 = pMat0 + numCols;
+
+        acc0 = 0LL;
+        acc1 = 0LL;
+
+        pMat0Vec = pMat0;
+        pMat1Vec = pMat1;
+        pVec = pSrcVecPtr;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 8;
+            vecMatA1 = vld1q(pMat1Vec);
+            pMat1Vec += 8;
+            vecIn = vld1q(pVec);
+            pVec += 8;
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+
+            blkCnt--;
+        }
+
+        /*
+         * Tail: numCols & 7 leftover columns. Every load is predicated so the
+         * lanes past the end of a row are zeroed instead of being read
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp16q(blkCnt);
+
+            vecMatA0 = vldrhq_z_s16(pMat0Vec, p0);
+            vecMatA1 = vldrhq_z_s16(pMat1Vec, p0);
+            vecIn = vldrhq_z_s16(pVec, p0);
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+        }
+
+        *px++ = MVE_ASRL_SAT16(acc0, 15);
+        *px++ = MVE_ASRL_SAT16(acc1, 15);
+
+        pMatSrc += numCols * 2;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 2;
+    }
+
+    if (row >= 1)
+    {
+        q15_t const *pMat0Vec, *pVec;
+        q15_t const  *pSrcVecPtr = pSrcVec;
+        q63_t         acc0;
+        q15x8_t     vecMatA0, vecIn;
+
+        /*
+         * For every row wise process, the pVec pointer is set
+         * to the starting address of the vector
+         */
+        pVec = pSrcVec;
+
+        /*
+         * Point pMat0 at the start of the last row
+         */
+        pMat0 = pMatSrc;
+
+        acc0 = 0LL;
+
+        pMat0Vec = pMat0;
+        pVec = pSrcVecPtr;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 8;
+            vecIn = vld1q(pVec);
+            pVec += 8;
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            blkCnt--;
+        }
+        /*
+         * Tail: numCols & 7 leftover columns. Every load is predicated so the
+         * lanes past the end of the row (and of the matrix) are zeroed, not read
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp16q(blkCnt);
+
+            vecMatA0 = vldrhq_z_s16(pMat0Vec, p0);
+            vecIn = vldrhq_z_s16(pVec, p0);
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+        }
+        *px++ = MVE_ASRL_SAT16(acc0, 15);
+    }
+}
+
+#else
+void arm_mat_vec_mult_q15(const arm_matrix_instance_q15 *pSrcMat, const q15_t *pVec, q15_t *pDst)
+{
+    uint32_t numRows = pSrcMat->numRows;
+    uint32_t numCols = pSrcMat->numCols;
+    const q15_t *pSrcA = pSrcMat->pData;
+    const q15_t *pInA1;      /* read pointer into matrix row 1 of the current group */
+    const q15_t *pInA2;      /* read pointer into matrix row 2 of the current group */
+    const q15_t *pInA3;      /* read pointer into matrix row 3 of the current group */
+    const q15_t *pInA4;      /* read pointer into matrix row 4 of the current group */
+    const q15_t *pInVec;     /* read pointer into the input vector */
+    q15_t *px;               /* Temporary output vector pointer */
+    uint32_t i, row, colCnt; /* i is an element offset up to numRows*numCols: needs 32 bits */
+    q31_t matData, matData2, vecData, vecData2;
+
+
+    /* Process 4 rows at a time */
+    row = numRows >> 2;
+    i = 0u;
+    px = pDst;
+
+    /* The following loop performs the dot-product of each row in pSrcA with the vector */
+    /* row loop */
+    while (row > 0) {
+        /* For every row wise process, the pInVec pointer is set
+         ** to the starting address of the vector */
+        pInVec = pVec;
+
+        /* Initialize accumulators */
+        q63_t sum1 = 0;
+        q63_t sum2 = 0;
+        q63_t sum3 = 0;
+        q63_t sum4 = 0;
+
+        /* Loop unrolling: process 2 columns per iteration */
+        colCnt = numCols >> 1;
+
+        /* Initialize pointers to the start of the 4 rows being processed */
+        pInA1 = pSrcA + i;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        pInA4 = pInA3 + numCols;
+
+        // Main loop: matrix-vector multiplication
+        while (colCnt > 0u) {
+            // Read 2 values from vector
+            vecData = read_q15x2_ia ((q15_t **) &pInVec);
+
+            // Read 8 values from the matrix - 2 values from each of 4 rows, and do multiply accumulate
+            matData =  read_q15x2_ia ((q15_t **) &pInA1);
+            sum1 = __SMLALD(matData, vecData, sum1);
+            matData = read_q15x2_ia ((q15_t **) &pInA2);
+            sum2 = __SMLALD(matData, vecData, sum2);
+            matData = read_q15x2_ia ((q15_t **) &pInA3);
+            sum3 = __SMLALD(matData, vecData, sum3);
+            matData = read_q15x2_ia ((q15_t **) &pInA4);
+            sum4 = __SMLALD(matData, vecData, sum4);
+
+            // Decrement the loop counter
+            colCnt--;
+        }
+
+        /* process the last column when numCols is odd */
+        colCnt = numCols & 1u;
+        if (colCnt > 0u) {
+            vecData = *pInVec++;
+            sum1 += (q63_t)*pInA1++ * vecData;
+            sum2 += (q63_t)*pInA2++ * vecData;
+            sum3 += (q63_t)*pInA3++ * vecData;
+            sum4 += (q63_t)*pInA4++ * vecData;
+        }
+
+        /* Shift right by 15, saturate to 16 bits and store in the destination buffer */
+        *px++ = (q15_t)(__SSAT((sum1 >> 15), 16));
+        *px++ = (q15_t)(__SSAT((sum2 >> 15), 16));
+        *px++ = (q15_t)(__SSAT((sum3 >> 15), 16));
+        *px++ = (q15_t)(__SSAT((sum4 >> 15), 16));
+
+        i = i + numCols * 4;
+
+        /* Decrement the row loop counter */
+        row--;
+    }
+
+    /* process any remaining rows */
+    row = numRows & 3u;
+    while (row > 0) {
+
+        q63_t sum = 0;
+        pInVec = pVec;
+        pInA1 = pSrcA + i;
+
+        // loop unrolling - process 4 elements at a time
+        colCnt = numCols >> 2;
+
+        while (colCnt > 0) {
+            vecData = read_q15x2_ia ((q15_t **) &pInVec);
+            vecData2 = read_q15x2_ia ((q15_t **) &pInVec);
+            matData = read_q15x2_ia ((q15_t **) &pInA1);
+            matData2 = read_q15x2_ia ((q15_t **) &pInA1);
+            sum = __SMLALD(matData, vecData, sum);
+            sum = __SMLALD(matData2, vecData2, sum);
+            colCnt--;
+        }
+
+        // process remainder of row
+        colCnt = numCols & 3u;
+        while (colCnt > 0) {
+            sum += (q63_t)*pInA1++ * *pInVec++;
+            colCnt--;
+        }
+        *px++ = (q15_t)(__SSAT((sum >> 15), 16));
+        i = i + numCols;
+        row--;
+    }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ * @} end of MatrixVectMult group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q31.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q31.c
new file mode 100644
index 0000000000000000000000000000000000000000..6d86e6f946dc60f7192fe9fbdb79a15f68bb0a2a
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q31.c
@@ -0,0 +1,376 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_vec_mult_q31.c
+ * Description:  Q31 matrix and vector multiplication
+ *
+ * $Date:        23 April 2021
+ *
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+
+
+/**
+ * @addtogroup MatrixVectMult
+ * @{
+ */
+
+/**
+ * @brief Q31 matrix and vector multiplication.
+ * @param[in]       *pSrcMat points to the input matrix structure
+ * @param[in]       *pVec points to the input vector
+ * @param[out]      *pDst points to the output vector
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_mat_vec_mult_q31(
+    const arm_matrix_instance_q31 * pSrcMat,
+    const q31_t     *pSrcVec,
+    q31_t           *pDstVec)
+{
+    const q31_t *pMatSrc = pSrcMat->pData;   /* first row not yet processed */
+    const q31_t *pMat0, *pMat1;
+    uint32_t     numRows = pSrcMat->numRows;
+    uint32_t     numCols = pSrcMat->numCols;
+    q31_t       *px;                         /* next output element to write */
+    int32_t      row;                        /* rows still to be processed */
+    uint32_t     blkCnt;                     /* column loop counter */
+
+    row = numRows;
+    px = pDstVec;
+
+    /*
+     * compute 3 rows per loop, one 64-bit accumulator per row
+     */
+    while (row >= 3)
+    {
+        q31_t const *pMat0Vec, *pMat1Vec, *pMat2Vec, *pVec;
+        const q31_t  *pMat2;
+        q31_t const  *pSrcVecPtr = pSrcVec;
+        q63_t         acc0, acc1, acc2;
+        q31x4_t     vecMatA0, vecMatA1, vecMatA2, vecIn;
+
+
+        pVec = pSrcVec;
+        /*
+         * Point pMat0..pMat2 at the start of the 3 consecutive rows being processed
+         */
+        pMat0 = pMatSrc;
+        pMat1 = pMat0 + numCols;
+        pMat2 = pMat1 + numCols;
+
+        acc0 = 0LL;
+        acc1 = 0LL;
+        acc2 = 0LL;
+
+        pMat0Vec = pMat0;
+        pMat1Vec = pMat1;
+        pMat2Vec = pMat2;
+        pVec = pSrcVecPtr;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 4;
+            vecMatA1 = vld1q(pMat1Vec);
+            pMat1Vec += 4;
+            vecMatA2 = vld1q(pMat2Vec);
+            pMat2Vec += 4;
+            vecIn = vld1q(pVec);
+            pVec += 4;
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+            acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
+
+            blkCnt--;
+        }
+        /*
+         * Tail: numCols & 3 leftover columns. Every load is predicated so the
+         * lanes past the end of a row are zeroed instead of being read
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+
+            vecMatA0 = vldrwq_z_s32(pMat0Vec, p0);
+            vecMatA1 = vldrwq_z_s32(pMat1Vec, p0);
+            vecMatA2 = vldrwq_z_s32(pMat2Vec, p0);
+            vecIn = vldrwq_z_s32(pVec, p0);
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+            acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
+        }
+
+        *px++ = asrl(acc0, 31);
+        *px++ = asrl(acc1, 31);
+        *px++ = asrl(acc2, 31);
+
+        pMatSrc += numCols * 3;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 3;
+    }
+
+    /*
+     * process a remaining pair of rows, if any
+     */
+    if (row >= 2)
+    {
+        q31_t const *pMat0Vec, *pMat1Vec, *pVec;
+        q31_t const  *pSrcVecPtr = pSrcVec;
+        q63_t         acc0, acc1;
+        q31x4_t     vecMatA0, vecMatA1, vecIn;
+
+        /*
+         * For every row wise process, the pVec pointer is set
+         * to the starting address of the vector
+         */
+        pVec = pSrcVec;
+
+        /*
+         * Point pMat0 and pMat1 at the start of the 2 consecutive rows being processed
+         */
+        pMat0 = pMatSrc;
+        pMat1 = pMat0 + numCols;
+
+        acc0 = 0LL;
+        acc1 = 0LL;
+
+        pMat0Vec = pMat0;
+        pMat1Vec = pMat1;
+        pVec = pSrcVecPtr;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 4;
+            vecMatA1 = vld1q(pMat1Vec);
+            pMat1Vec += 4;
+            vecIn = vld1q(pVec);
+            pVec += 4;
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+
+            blkCnt--;
+        }
+
+        /*
+         * Tail: numCols & 3 leftover columns. Every load is predicated so the
+         * lanes past the end of a row are zeroed instead of being read
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+
+            vecMatA0 = vldrwq_z_s32(pMat0Vec, p0);
+            vecMatA1 = vldrwq_z_s32(pMat1Vec, p0);
+            vecIn = vldrwq_z_s32(pVec, p0);
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+        }
+
+        *px++ = asrl(acc0, 31);
+        *px++ = asrl(acc1, 31);
+
+        pMatSrc += numCols * 2;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 2;
+    }
+
+    if (row >= 1)
+    {
+        q31_t const *pMat0Vec, *pVec;
+        q31_t const  *pSrcVecPtr = pSrcVec;
+        q63_t         acc0;
+        q31x4_t     vecMatA0, vecIn;
+
+        /*
+         * For every row wise process, the pVec pointer is set
+         * to the starting address of the vector
+         */
+        pVec = pSrcVec;
+
+        /*
+         * Point pMat0 at the start of the last row
+         */
+        pMat0 = pMatSrc;
+
+        acc0 = 0LL;
+
+        pMat0Vec = pMat0;
+        pVec = pSrcVecPtr;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 4;
+            vecIn = vld1q(pVec);
+            pVec += 4;
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            blkCnt--;
+        }
+        /*
+         * Tail: numCols & 3 leftover columns. Every load is predicated so the
+         * lanes past the end of the row (and of the matrix) are zeroed, not read
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+
+            vecMatA0 = vldrwq_z_s32(pMat0Vec, p0);
+            vecIn = vldrwq_z_s32(pVec, p0);
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+        }
+
+        *px++ = asrl(acc0, 31);
+    }
+}
+#else
+void arm_mat_vec_mult_q31(const arm_matrix_instance_q31 *pSrcMat, const q31_t *pVec, q31_t *pDst)
+{
+    uint32_t numRows = pSrcMat->numRows;
+    uint32_t numCols = pSrcMat->numCols;
+    const q31_t *pSrcA = pSrcMat->pData;
+    const q31_t *pInA1;      /* read pointer into matrix row 1 of the current group */
+    const q31_t *pInA2;      /* read pointer into matrix row 2 of the current group */
+    const q31_t *pInA3;      /* read pointer into matrix row 3 of the current group */
+    const q31_t *pInA4;      /* read pointer into matrix row 4 of the current group */
+    const q31_t *pInVec;     /* read pointer into the input vector */
+    q31_t *px;               /* Temporary output vector pointer */
+    uint32_t i, row, colCnt; /* i is an element offset up to numRows*numCols: needs 32 bits */
+    q31_t matData, matData2, vecData, vecData2;
+
+
+    /* Process 4 rows at a time */
+    row = numRows >> 2;
+    i = 0u;
+    px = pDst;
+
+    /* The following loop performs the dot-product of each row in pSrcA with the vector */
+    /* row loop */
+    while (row > 0) {
+        /* For every row wise process, the pInVec pointer is set
+         ** to the starting address of the vector */
+        pInVec = pVec;
+
+        /* Initialize accumulators */
+        q63_t sum1 = 0;
+        q63_t sum2 = 0;
+        q63_t sum3 = 0;
+        q63_t sum4 = 0;
+
+        /* One column is processed per iteration (no column unrolling here) */
+        colCnt = numCols;
+
+        /* Initialize pointers to the start of the 4 rows being processed */
+        pInA1 = pSrcA + i;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        pInA4 = pInA3 + numCols;
+
+
+        // Main loop: matrix-vector multiplication
+        while (colCnt > 0u) {
+            // Read 1 value from the vector
+            vecData = *(pInVec)++;
+
+            // Read 1 value from each of the 4 rows and do a 64-bit multiply accumulate
+            matData = *(pInA1)++;
+            sum1 += (q63_t)matData * vecData;
+            matData = *(pInA2)++;
+            sum2 += (q63_t)matData * vecData;
+            matData = *(pInA3)++;
+            sum3 += (q63_t)matData * vecData;
+            matData = *(pInA4)++;
+            sum4 += (q63_t)matData * vecData;
+
+            // Decrement the loop counter
+            colCnt--;
+        }
+
+        /* Shift right by 31 and truncate to q31_t (no saturation is applied), then store */
+        *px++ = (q31_t)(sum1 >> 31);
+        *px++ = (q31_t)(sum2 >> 31);
+        *px++ = (q31_t)(sum3 >> 31);
+        *px++ = (q31_t)(sum4 >> 31);
+
+        i = i + numCols * 4;
+
+        /* Decrement the row loop counter */
+        row--;
+    }
+
+    /* process any remaining rows */
+    row = numRows & 3u;
+    while (row > 0) {
+
+        q63_t sum = 0;
+        pInVec = pVec;
+        pInA1 = pSrcA + i;
+
+        colCnt = numCols >> 1;   /* loop unrolling: 2 columns per iteration */
+
+        while (colCnt > 0) {
+            vecData = *(pInVec)++;
+            vecData2 = *(pInVec)++;
+            matData = *(pInA1)++;
+            matData2 = *(pInA1)++;
+            sum += (q63_t)matData * vecData;
+            sum += (q63_t)matData2 * vecData2;
+            colCnt--;
+        }
+
+        // process remainder of row
+        colCnt = numCols & 1u;
+        while (colCnt > 0) {
+            sum += (q63_t)*pInA1++ * *pInVec++;
+            colCnt--;
+        }
+
+        *px++ = (q31_t)(sum >> 31);
+        i = i + numCols;
+        row--;
+    }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ * @} end of MatrixVectMult group
+ */
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q7.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q7.c
new file mode 100644
index 0000000000000000000000000000000000000000..79c41f7c0936ab45f11cd2af8246c389a05e87bd
--- /dev/null
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q7.c
@@ -0,0 +1,420 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_vec_mult_q7.c
+ * Description:  Q7 matrix and vector multiplication
+ *
+ * $Date:        23 April 2021
+ *
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+
+
+/**
+ * @addtogroup MatrixVectMult
+ * @{
+ */
+
+/**
+ * @brief Q7 matrix and vector multiplication.
+ * @param[in]       pSrcMat points to the input matrix structure
+ * @param[in]       pVec points to the input vector
+ * @param[out]      pDst points to the output vector
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/*
+ * Helium (MVE) variant.
+ *
+ * For every matrix row r this writes
+ *     pDstVec[r] = __SSAT((sum over c of row_r[c] * pSrcVec[c]) >> 7, 8)
+ * with the sum held in a 32-bit accumulator. Rows are stored back to back,
+ * numCols elements apart. They are processed four at a time, then as a pair,
+ * then singly, 16 columns per vector step. A trailing partial step loads the
+ * matrix rows and the vector under the same tail predicate, so nothing past
+ * the last valid column is read.
+ */
+void arm_mat_vec_mult_q7(
+    const arm_matrix_instance_q7 * pSrcMat,
+    const q7_t     *pSrcVec,
+    q7_t           *pDstVec)
+{
+    const q7_t *pMatSrc = pSrcMat->pData;  /* first row of the current row group */
+    const q7_t *pMat0, *pMat1;
+    uint32_t     numRows = pSrcMat->numRows;
+    uint32_t     numCols = pSrcMat->numCols;
+    q7_t       *px;                        /* write cursor in pDstVec */
+    int32_t      row;                      /* rows still to be processed */
+    uint16_t     blkCnt;                   /* loop counters */
+
+    row = numRows;
+    px = pDstVec;
+
+    /*
+     * Four rows per iteration, one 32-bit accumulator per row
+     */
+    while (row >= 4)
+    {
+        q7_t const *pMat0Vec, *pMat1Vec, *pMat2Vec, *pMat3Vec, *pVec;
+        const q7_t  *pMat2, *pMat3;
+        q31_t        acc0, acc1, acc2, acc3;
+        q7x16_t      vecMatA0, vecMatA1, vecMatA2, vecMatA3, vecIn;
+
+        /*
+         * Every row group is multiplied by the whole vector,
+         * so restart at its first element
+         */
+        pVec = pSrcVec;
+
+        /*
+         * Start addresses of the four rows of this group
+         */
+        pMat0 = pMatSrc;
+        pMat1 = pMat0 + numCols;
+        pMat2 = pMat1 + numCols;
+        pMat3 = pMat2 + numCols;
+
+        acc0 = 0;
+        acc1 = 0;
+        acc2 = 0;
+        acc3 = 0;
+
+        pMat0Vec = pMat0;
+        pMat1Vec = pMat1;
+        pMat2Vec = pMat2;
+        pMat3Vec = pMat3;
+
+        /*
+         * Full steps of 16 columns
+         */
+        blkCnt = numCols >> 4;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 16;
+            vecMatA1 = vld1q(pMat1Vec);
+            pMat1Vec += 16;
+            vecMatA2 = vld1q(pMat2Vec);
+            pMat2Vec += 16;
+            vecMatA3 = vld1q(pMat3Vec);
+            pMat3Vec += 16;
+            vecIn = vld1q(pVec);
+            pVec += 16;
+
+            acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmladavaq(acc1, vecIn, vecMatA1);
+            acc2 = vmladavaq(acc2, vecIn, vecMatA2);
+            acc3 = vmladavaq(acc3, vecIn, vecMatA3);
+
+            blkCnt--;
+        }
+        /*
+         * tail: remaining (numCols % 16) columns.
+         * All loads use the zeroing predicate: an unpredicated 16-byte
+         * load of the last row would read past the end of the matrix.
+         * Lanes beyond the tail load as zero and add nothing to the sums.
+         */
+        blkCnt = numCols & 0xF;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp8q(blkCnt);
+
+            vecMatA0 = vldrbq_z_s8(pMat0Vec, p0);
+            vecMatA1 = vldrbq_z_s8(pMat1Vec, p0);
+            vecMatA2 = vldrbq_z_s8(pMat2Vec, p0);
+            vecMatA3 = vldrbq_z_s8(pMat3Vec, p0);
+            vecIn = vldrbq_z_s8(pVec, p0);
+
+            acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmladavaq(acc1, vecIn, vecMatA1);
+            acc2 = vmladavaq(acc2, vecIn, vecMatA2);
+            acc3 = vmladavaq(acc3, vecIn, vecMatA3);
+        }
+
+        /* Scale the accumulators down by 7 bits and saturate to 8 bits */
+        *px++ = __SSAT(acc0 >> 7, 8);
+        *px++ = __SSAT(acc1 >> 7, 8);
+        *px++ = __SSAT(acc2 >> 7, 8);
+        *px++ = __SSAT(acc3 >> 7, 8);
+
+        pMatSrc += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * process a remaining pair of rows
+     */
+    if (row >= 2)
+    {
+        q7_t const  *pMat0Vec, *pMat1Vec, *pVec;
+        q31_t        acc0, acc1;
+        q7x16_t      vecMatA0, vecMatA1, vecIn;
+
+        /*
+         * Restart at the first element of the vector
+         */
+        pVec = pSrcVec;
+
+        /*
+         * Start addresses of the two rows
+         */
+        pMat0 = pMatSrc;
+        pMat1 = pMat0 + numCols;
+
+        acc0 = 0;
+        acc1 = 0;
+
+        pMat0Vec = pMat0;
+        pMat1Vec = pMat1;
+
+        blkCnt = numCols >> 4;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 16;
+            vecMatA1 = vld1q(pMat1Vec);
+            pMat1Vec += 16;
+            vecIn = vld1q(pVec);
+            pVec += 16;
+
+            acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmladavaq(acc1, vecIn, vecMatA1);
+
+            blkCnt--;
+        }
+
+        /*
+         * tail: predicated loads only, see the four-row loop above
+         */
+        blkCnt = numCols & 0xF;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp8q(blkCnt);
+
+            vecMatA0 = vldrbq_z_s8(pMat0Vec, p0);
+            vecMatA1 = vldrbq_z_s8(pMat1Vec, p0);
+            vecIn = vldrbq_z_s8(pVec, p0);
+
+            acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmladavaq(acc1, vecIn, vecMatA1);
+        }
+
+        *px++ = __SSAT(acc0 >> 7, 8);
+        *px++ = __SSAT(acc1 >> 7, 8);
+
+        pMatSrc += numCols * 2;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 2;
+    }
+
+    /*
+     * process the last remaining row, if any
+     */
+    if (row >= 1)
+    {
+        q7_t const  *pMat0Vec, *pVec;
+        q31_t        acc0;
+        q7x16_t      vecMatA0, vecIn;
+
+        /*
+         * Restart at the first element of the vector
+         */
+        pVec = pSrcVec;
+
+        /*
+         * Start address of the row
+         */
+        pMat0 = pMatSrc;
+
+        acc0 = 0;
+
+        pMat0Vec = pMat0;
+
+        blkCnt = numCols >> 4;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 16;
+            vecIn = vld1q(pVec);
+            pVec += 16;
+
+            acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+            blkCnt--;
+        }
+        /*
+         * tail: predicated loads only, so the read stops at the
+         * last element of the matrix
+         */
+        blkCnt = numCols & 0xF;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp8q(blkCnt);
+
+            vecMatA0 = vldrbq_z_s8(pMat0Vec, p0);
+            vecIn = vldrbq_z_s8(pVec, p0);
+            acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+        }
+        *px++ = __SSAT(acc0 >> 7, 8);
+    }
+}
+
+#else
+/*
+ * Variant used when the MVE implementation is not selected.
+ *
+ * Each output element is the dot product of one matrix row with pVec,
+ * accumulated in 32 bits, shifted right by 7 and saturated to 8 bits.
+ * Rows are handled in groups of four so that each unpacked vector word is
+ * reused for four rows; leftover rows are handled one at a time.
+ */
+void arm_mat_vec_mult_q7(const arm_matrix_instance_q7 *pSrcMat, const q7_t *pVec, q7_t *pDst)
+{
+    const uint32_t nRows = pSrcMat->numRows;
+    const uint32_t nCols = pSrcMat->numCols;
+    const q7_t *pMat = pSrcMat->pData;
+    q7_t *pOut = pDst;              /* write cursor in the destination vector */
+    uint32_t rowStart = 0u;         /* element offset of the next unprocessed row */
+
+    /* Groups of four rows */
+    for (uint32_t quad = nRows >> 2; quad > 0u; quad--) {
+        const q7_t *pV = pVec;      /* every row group starts over at the head of the vector */
+        const q7_t *pRow0 = pMat + rowStart;
+        const q7_t *pRow1 = pRow0 + nCols;
+        const q7_t *pRow2 = pRow1 + nCols;
+        const q7_t *pRow3 = pRow2 + nCols;
+        q31_t acc0 = 0;
+        q31_t acc1 = 0;
+        q31_t acc2 = 0;
+        q31_t acc3 = 0;
+
+        /* Four columns per step */
+        for (uint32_t step = nCols >> 2; step > 0u; step--) {
+            q31_t vecPairA, vecPairB, matPairA, matPairB;
+
+            /* Four packed vector entries, split into the two operands fed to __SMLAD */
+            vecPairA = read_q7x4_ia((q7_t **) &pV);
+            vecPairB = __SXTB16(__ROR(vecPairA, 8));
+            vecPairA = __SXTB16(vecPairA);
+
+            /* Row 0 */
+            matPairA = read_q7x4_ia((q7_t **) &pRow0);
+            matPairB = __SXTB16(__ROR(matPairA, 8));
+            matPairA = __SXTB16(matPairA);
+            acc0 = __SMLAD(matPairA, vecPairA, acc0);
+            acc0 = __SMLAD(matPairB, vecPairB, acc0);
+
+            /* Row 1 */
+            matPairA = read_q7x4_ia((q7_t **) &pRow1);
+            matPairB = __SXTB16(__ROR(matPairA, 8));
+            matPairA = __SXTB16(matPairA);
+            acc1 = __SMLAD(matPairA, vecPairA, acc1);
+            acc1 = __SMLAD(matPairB, vecPairB, acc1);
+
+            /* Row 2 */
+            matPairA = read_q7x4_ia((q7_t **) &pRow2);
+            matPairB = __SXTB16(__ROR(matPairA, 8));
+            matPairA = __SXTB16(matPairA);
+            acc2 = __SMLAD(matPairA, vecPairA, acc2);
+            acc2 = __SMLAD(matPairB, vecPairB, acc2);
+
+            /* Row 3 */
+            matPairA = read_q7x4_ia((q7_t **) &pRow3);
+            matPairB = __SXTB16(__ROR(matPairA, 8));
+            matPairA = __SXTB16(matPairA);
+            acc3 = __SMLAD(matPairA, vecPairA, acc3);
+            acc3 = __SMLAD(matPairB, vecPairB, acc3);
+        }
+
+        /* Columns left over after the steps of four */
+        for (uint32_t rest = nCols & 3u; rest > 0u; rest--) {
+            const q31_t vecElem = *pV++;
+
+            acc0 += *pRow0++ * vecElem;
+            acc1 += *pRow1++ * vecElem;
+            acc2 += *pRow2++ * vecElem;
+            acc3 += *pRow3++ * vecElem;
+        }
+
+        /* Scale down by 7 bits, saturate to 8 bits and store */
+        *px_store_guard_unused:
+        ;
+    }
+}
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixVectMult group
+ */
diff --git a/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion2rotation_f32.c b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion2rotation_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..f70dc606db22bb903b83a18a07cbb6bb407fda02
--- /dev/null
+++ b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion2rotation_f32.c
@@ -0,0 +1,180 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_quaternion2rotation_f32.c
+ * Description:  Floating-point quaternion to rotation conversion
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+/**
+  @ingroup groupQuaternionMath
+ */
+
+/**
+  @defgroup QuatConv Quaternion conversions
+
+  Conversions between quaternion and rotation representations.
+ */
+
+/**
+  @ingroup QuatConv
+ */
+
+/**
+  @defgroup QuatRot Quaternion to Rotation
+
+  Conversions from quaternion to rotation.
+ */
+
+/**
+  @addtogroup QuatRot
+  @{
+ */
+
+/**
+   @brief Conversion of quaternion to equivalent rotation matrix.
+   @param[in]       pInputQuaternions points to an array of normalized quaternions
+   @param[out]      pOutputRotations points to an array of 3x3 rotations (in row order)
+   @param[in]       nbQuaternions number of quaternions in the array
+   @return none.
+  
+   @par
+   Format of rotation matrix
+   
+   
+   The quaternion a + ib + jc + kd is converted into rotation matrix:
+   <pre>
+     a^2 + b^2 - c^2 - d^2                 2bc - 2ad                 2bd + 2ac
+                 2bc + 2ad     a^2 - b^2 + c^2 - d^2                 2cd - 2ab
+                 2bd - 2ac                 2cd + 2ab     a^2 - b^2 - c^2 + d^2
+   </pre>
+   Rotation matrix is saved in row order : R00 R01 R02 R10 R11 R12 R20 R21 R22
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/*
+ * Helium (MVE) variant: reads 4 floats per quaternion and writes the 9
+ * entries of the rotation matrix in row order, as two 4-lane vector stores
+ * (R00 R01 R02 R10, then R11 R12 R20 R21) followed by a scalar store of R22.
+ */
+void arm_quaternion2rotation_f32(const float32_t *pInputQuaternions,
+    float32_t *pOutputRotations,
+    uint32_t nbQuaternions)
+{
+  const float32_t *pQuat = pInputQuaternions;
+  float32_t *pRot = pOutputRotations;
+  uint32_t remaining = nbQuaternions;
+
+  while (remaining > 0U)
+  {
+    /* (q0, q1, q2, q3) */
+    const f32x4_t quat = vld1q(pQuat);
+
+    /* (q0^2, q1^2, q2^2, q3^2) */
+    const f32x4_t squares = vmulq(quat, quat);
+
+    /* 2 * (q0q0, q1q0, q2q0, q3q0) */
+    f32x4_t twiceByQ0 = vmulq_n_f32(quat, vgetq_lane(quat, 0));
+    twiceByQ0 = vmulq_n_f32(twiceByQ0, 2.0f);
+
+    /* 2 * (q0q1, q1q1, q2q1, q3q1) */
+    f32x4_t twiceByQ1 = vmulq_n_f32(quat, vgetq_lane(quat, 1));
+    twiceByQ1 = vmulq_n_f32(twiceByQ1, 2.0f);
+
+    /* 2 * q2q3 */
+    float32_t twiceQ2Q3 = vgetq_lane(quat, 2) * vgetq_lane(quat, 3);
+    twiceQ2Q3 = twiceQ2Q3 * 2.0f;
+
+    /* Partial sums shared by the diagonal entries R11 and R22 */
+    const float32_t diffSq01 = vgetq_lane(squares, 0) - vgetq_lane(squares, 1);
+    const float32_t diffSq23 = vgetq_lane(squares, 2) - vgetq_lane(squares, 3);
+
+    /* R00 = q0^2 + q1^2 - q2^2 - q3^2, built up left to right */
+    float32_t r00 = vgetq_lane(squares, 0) + vgetq_lane(squares, 1);
+    r00 = r00 - vgetq_lane(squares, 2);
+    r00 = r00 - vgetq_lane(squares, 3);
+
+    /* Every lane of 'packed' is overwritten before each store */
+    f32x4_t packed = quat;
+
+    /* R00 R01 R02 R10 */
+    packed = vsetq_lane(r00, packed, 0);
+    packed = vsetq_lane(vgetq_lane(twiceByQ1, 2) - vgetq_lane(twiceByQ0, 3), packed, 1);
+    packed = vsetq_lane(vgetq_lane(twiceByQ1, 3) + vgetq_lane(twiceByQ0, 2), packed, 2);
+    packed = vsetq_lane(vgetq_lane(twiceByQ1, 2) + vgetq_lane(twiceByQ0, 3), packed, 3);
+    vst1q(pRot, packed);
+    pRot += 4;
+
+    /* R11 R12 R20 R21 */
+    packed = vsetq_lane(diffSq01 + diffSq23, packed, 0);
+    packed = vsetq_lane(twiceQ2Q3 - vgetq_lane(twiceByQ0, 1), packed, 1);
+    packed = vsetq_lane(vgetq_lane(twiceByQ1, 3) - vgetq_lane(twiceByQ0, 2), packed, 2);
+    packed = vsetq_lane(twiceQ2Q3 + vgetq_lane(twiceByQ0, 1), packed, 3);
+    vst1q(pRot, packed);
+    pRot += 4;
+
+    /* R22 */
+    *pRot = diffSq01 - diffSq23;
+    pRot++;
+
+    pQuat += 4;
+    remaining--;
+  }
+}
+
+#else
+/*
+ * Variant used when the MVE implementation is not selected: reads 4 floats
+ * per quaternion and writes the 9 entries of the rotation matrix in row order.
+ */
+void arm_quaternion2rotation_f32(const float32_t *pInputQuaternions,
+    float32_t *pOutputRotations,
+    uint32_t nbQuaternions)
+{
+   uint32_t idx = 0;
+
+   while (idx < nbQuaternions)
+   {
+        const float32_t *quat = &pInputQuaternions[idx * 4];
+        float32_t *rot = &pOutputRotations[idx * 9];
+
+        /* Components of a + ib + jc + kd; all are read before anything is written */
+        const float32_t a = quat[0];
+        const float32_t b = quat[1];
+        const float32_t c = quat[2];
+        const float32_t d = quat[3];
+
+        /* Squares and pairwise products */
+        const float32_t aa = SQ(a);
+        const float32_t bb = SQ(b);
+        const float32_t cc = SQ(c);
+        const float32_t dd = SQ(d);
+        const float32_t ab = a * b;
+        const float32_t ac = a * c;
+        const float32_t ad = a * d;
+        const float32_t bc = b * c;
+        const float32_t bd = b * d;
+        const float32_t cd = c * d;
+
+        /* First row */
+        rot[0] = aa + bb - cc - dd;
+        rot[1] = 2*(bc - ad);
+        rot[2] = 2*(bd + ac);
+
+        /* Second row */
+        rot[3] = 2*(bc + ad);
+        rot[4] = aa - bb + cc - dd;
+        rot[5] = 2*(cd - ab);
+
+        /* Third row */
+        rot[6] = 2*(bd - ac);
+        rot[7] = 2*(cd + ab);
+        rot[8] = aa - bb - cc + dd;
+
+        idx++;
+   }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of QuatRot group
+ */
diff --git a/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_conjugate_f32.c b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_conjugate_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..d70faf476043d1b28875726b6607fdd9f323a66c
--- /dev/null
+++ b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_conjugate_f32.c
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_quaternion_conjugate_f32.c
+ * Description:  Floating-point quaternion conjugate
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+/**
+  @ingroup groupQuaternionMath
+ */
+
+/**
+  @defgroup QuatConjugate Quaternion Conjugate
+
+  Compute the conjugate of a quaternion.
+ */
+
+/**
+  @addtogroup QuatConjugate
+  @{
+ */
+
+/**
+  @brief         Floating-point quaternion conjugates.
+  @param[in]     pInputQuaternions            points to the input vector of quaternions
+  @param[out]    pConjugateQuaternions        points to the output vector of conjugate quaternions
+  @param[in]     nbQuaternions                number of quaternions in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+/*
+ * Helium (MVE) variant: for each quaternion, keeps component 0 and negates
+ * components 1 to 3.
+ */
+void arm_quaternion_conjugate_f32(const float32_t *pInputQuaternions,
+    float32_t *pConjugateQuaternions,
+    uint32_t nbQuaternions)
+{
+   const float32_t *pIn = pInputQuaternions;
+   float32_t *pOut = pConjugateQuaternions;
+   uint32_t remaining = nbQuaternions;
+
+   while (remaining > 0U)
+   {
+      const f32x4_t quat = vld1q(pIn);
+
+      /* Negate all four lanes, then put the untouched component 0 back */
+      f32x4_t conj = vnegq_f32(quat);
+      conj = vsetq_lane_f32(vgetq_lane(quat, 0), conj, 0);
+
+      vst1q(pOut, conj);
+
+      pIn += 4;
+      pOut += 4;
+      remaining--;
+   }
+}
+#else
+/*
+ * Variant used when the MVE implementation is not selected: for each
+ * quaternion, copies component 0 and negates components 1 to 3.
+ */
+void arm_quaternion_conjugate_f32(const float32_t *pInputQuaternions,
+    float32_t *pConjugateQuaternions,
+    uint32_t nbQuaternions)
+{
+   const float32_t *pIn = pInputQuaternions;
+   float32_t *pOut = pConjugateQuaternions;
+   uint32_t remaining = nbQuaternions;
+
+   while (remaining > 0U)
+   {
+      pOut[0] = pIn[0];
+      pOut[1] = -pIn[1];
+      pOut[2] = -pIn[2];
+      pOut[3] = -pIn[3];
+
+      pIn += 4;
+      pOut += 4;
+      remaining--;
+   }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of QuatConjugate group
+ */
diff --git a/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_inverse_f32.c b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_inverse_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..5362ff620c07870fb46dd1f1d511350ca2051da7
--- /dev/null
+++ b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_inverse_f32.c
@@ -0,0 +1,113 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_quaternion_inverse_f32.c
+ * Description:  Floating-point quaternion inverse
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+/**
+  @ingroup groupQuaternionMath
+ */
+
+/**
+  @defgroup QuatInverse Quaternion Inverse
+
+  Compute the inverse of a quaternion.
+ */
+
+/**
+  @addtogroup QuatInverse
+  @{
+ */
+
+/**
+  @brief         Floating-point quaternion inverse.
+  @param[in]     pInputQuaternions            points to the input vector of quaternions
+  @param[out]    pInverseQuaternions          points to the output vector of inverse quaternions
+  @param[in]     nbQuaternions                number of quaternions in each vector
+  @return        none
+ */
+
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/*
+ * Helium (MVE) variant: each quaternion is scaled by the reciprocal of the
+ * sum of its squared components, then components 1 to 3 are negated.
+ */
+void arm_quaternion_inverse_f32(const float32_t *pInputQuaternions,
+  float32_t *pInverseQuaternions,
+  uint32_t nbQuaternions)
+{
+   const float32_t *pIn = pInputQuaternions;
+   float32_t *pOut = pInverseQuaternions;
+   uint32_t remaining = nbQuaternions;
+
+   while (remaining > 0U)
+   {
+      const f32x4_t quat = vld1q(pIn);
+      const f32x4_t squares = vmulq(quat, quat);
+      const float32_t normSquared = vecAddAcrossF32Mve(squares);
+
+      /* One scalar reciprocal, then a vector multiply */
+      const f32x4_t scaled = vmulq_n_f32(quat, 1.0f / normSquared);
+
+      /* Negate all four lanes, then put the scaled component 0 back */
+      f32x4_t inverse = vnegq_f32(scaled);
+      inverse = vsetq_lane_f32(vgetq_lane(scaled, 0), inverse, 0);
+
+      vst1q(pOut, inverse);
+
+      pIn += 4;
+      pOut += 4;
+      remaining--;
+   }
+}
+
+#else
+/*
+ * Variant used when the MVE implementation is not selected: each quaternion
+ * is conjugated and divided by the sum of its squared components.
+ */
+void arm_quaternion_inverse_f32(const float32_t *pInputQuaternions,
+  float32_t *pInverseQuaternions,
+  uint32_t nbQuaternions)
+{
+   const float32_t *pIn = pInputQuaternions;
+   float32_t *pOut = pInverseQuaternions;
+   uint32_t remaining = nbQuaternions;
+
+   while (remaining > 0U)
+   {
+      /* The sum of squares is computed before any output element is written */
+      const float32_t normSquared = SQ(pIn[0]) +
+                                    SQ(pIn[1]) +
+                                    SQ(pIn[2]) +
+                                    SQ(pIn[3]);
+
+      pOut[0] = pIn[0] / normSquared;
+      pOut[1] = -pIn[1] / normSquared;
+      pOut[2] = -pIn[2] / normSquared;
+      pOut[3] = -pIn[3] / normSquared;
+
+      pIn += 4;
+      pOut += 4;
+      remaining--;
+   }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of QuatInverse group
+ */
diff --git a/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_norm_f32.c b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_norm_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..8494736ecbe9f66d594814ec83731d6ad11bf7d9
--- /dev/null
+++ b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_norm_f32.c
@@ -0,0 +1,101 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_quaternion_norm_f32.c
+ * Description:  Floating-point quaternion Norm
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+/**
+  @ingroup groupQuaternionMath
+ */
+
+/**
+  @defgroup QuatNorm Quaternion Norm
+
+  Compute the norm of a quaternion.
+ */
+
+/**
+  @addtogroup QuatNorm
+  @{
+ */
+
+/**
+  @brief         Floating-point quaternion Norm.
+  @param[in]     pInputQuaternions       points to the input vector of quaternions
+  @param[out]    pNorms                  points to the output vector of norms
+  @param[in]     nbQuaternions           number of quaternions in the input vector
+  @return        none
+ */
+
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/*
+ * Helium (MVE) variant: writes one norm per quaternion, computed as the
+ * square root (via arm_sqrt_f32) of the sum of the squared components.
+ */
+void arm_quaternion_norm_f32(const float32_t *pInputQuaternions,
+  float32_t *pNorms,
+  uint32_t nbQuaternions)
+{
+  const float32_t *pIn = pInputQuaternions;
+  float32_t *pOut = pNorms;
+  uint32_t remaining = nbQuaternions;
+
+  while (remaining > 0U)
+  {
+       const f32x4_t quat = vld1q(pIn);
+       const f32x4_t squares = vmulq(quat, quat);
+       const float32_t normSquared = vecAddAcrossF32Mve(squares);
+
+       /* The result is written straight into the output slot */
+       arm_sqrt_f32(normSquared, pOut);
+
+       pIn += 4;
+       pOut++;
+       remaining--;
+  }
+}
+
+#else
+
+/*
+ * Variant used when the MVE implementation is not selected: writes one norm
+ * per quaternion, computed as sqrtf of the sum of the squared components.
+ */
+void arm_quaternion_norm_f32(const float32_t *pInputQuaternions,
+  float32_t *pNorms,
+  uint32_t nbQuaternions)
+{
+   const float32_t *pIn = pInputQuaternions;
+   float32_t *pOut = pNorms;
+   uint32_t remaining = nbQuaternions;
+
+   while (remaining > 0U)
+   {
+      const float32_t normSquared = SQ(pIn[0]) +
+                                    SQ(pIn[1]) +
+                                    SQ(pIn[2]) +
+                                    SQ(pIn[3]);
+
+      *pOut = sqrtf(normSquared);
+
+      pIn += 4;
+      pOut++;
+      remaining--;
+   }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of QuatNorm group
+ */
diff --git a/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_normalize_f32.c b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_normalize_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..6ae96d4ea020808d73ad66cba2525aba5352235e
--- /dev/null
+++ b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_normalize_f32.c
@@ -0,0 +1,106 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_quaternion_normalize_f32.c
+ * Description:  Floating-point quaternion normalization
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+/**
+  @ingroup groupQuaternionMath
+ */
+
+/**
+  @defgroup QuatNormalized Quaternion normalization
+
+  Compute a normalized quaternion.
+ */
+
+/**
+  @addtogroup QuatNormalized
+  @{
+ */
+
+/**
+  @brief         Floating-point normalization of quaternions.
+  @param[in]     pInputQuaternions            points to the input vector of quaternions
+  @param[out]    pNormalizedQuaternions       points to the output vector of normalized quaternions
+  @param[in]     nbQuaternions                number of quaternions in each vector
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/*
+ * Helium (MVE) variant: each quaternion is multiplied by the reciprocal of
+ * its norm, the norm being obtained with arm_sqrt_f32.
+ */
+void arm_quaternion_normalize_f32(const float32_t *pInputQuaternions,
+    float32_t *pNormalizedQuaternions,
+    uint32_t nbQuaternions)
+{
+   const float32_t *pIn = pInputQuaternions;
+   float32_t *pOut = pNormalizedQuaternions;
+   uint32_t remaining = nbQuaternions;
+
+   while (remaining > 0U)
+   {
+      float32_t norm;
+      const f32x4_t quat = vld1q(pIn);
+      const f32x4_t squares = vmulq(quat, quat);
+      const float32_t normSquared = vecAddAcrossF32Mve(squares);
+
+      arm_sqrt_f32(normSquared, &norm);
+
+      /* One scalar reciprocal, then a vector multiply */
+      vst1q(pOut, vmulq_n_f32(quat, 1.0f / norm));
+
+      pIn += 4;
+      pOut += 4;
+      remaining--;
+   }
+}
+
+#else
+/*
+ * Variant used when the MVE implementation is not selected: each component
+ * of a quaternion is divided by the quaternion's norm.
+ */
+void arm_quaternion_normalize_f32(const float32_t *pInputQuaternions,
+    float32_t *pNormalizedQuaternions,
+    uint32_t nbQuaternions)
+{
+   const float32_t *pIn = pInputQuaternions;
+   float32_t *pOut = pNormalizedQuaternions;
+   uint32_t remaining = nbQuaternions;
+
+   while (remaining > 0U)
+   {
+      /* The norm is computed before any output element is written */
+      float32_t norm = SQ(pIn[0]) +
+                       SQ(pIn[1]) +
+                       SQ(pIn[2]) +
+                       SQ(pIn[3]);
+      norm = sqrtf(norm);
+
+      pOut[0] = pIn[0] / norm;
+      pOut[1] = pIn[1] / norm;
+      pOut[2] = pIn[2] / norm;
+      pOut[3] = pIn[3] / norm;
+
+      pIn += 4;
+      pOut += 4;
+      remaining--;
+   }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of QuatNormalized group
+ */
diff --git a/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_product_f32.c b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_product_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..ec4cdf4c5979a5581e05e34e517aec804522cad6
--- /dev/null
+++ b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_product_f32.c
@@ -0,0 +1,148 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_quaternion_product_f32.c
+ * Description:  Floating-point quaternion product
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+/**
+  @ingroup groupQuaternionMath
+ */
+
+/**
+  @defgroup QuatProd Quaternion Product
+
+  Compute the product of quaternions.
+ */
+
+/**
+  @ingroup QuatProd
+ */
+
+/**
+  @defgroup QuatProdVect Elementwise Quaternion Product
+
+  Compute the elementwise product of quaternions.
+ */
+
+/**
+  @addtogroup QuatProdVect
+  @{
+ */
+
+/**
+  @brief         Floating-point elementwise product of two arrays of quaternions.
+  @param[in]     qa                  first array of quaternions
+  @param[in]     qb                  second array of quaternions
+  @param[out]    qr                   elementwise product of quaternions
+  @param[in]     nbQuaternions       number of quaternions in the array
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_quaternion_product_f32(const float32_t *qa, 
+    const float32_t *qb, 
+    float32_t *qr,
+    uint32_t nbQuaternions)
+{
+    /* Read-only lane tables: word offsets for the gather loads and per-lane signs.
+       They are only ever read, hence const. */
+    static const uint32_t patternA[4] = { 0, 1, 0, 1 };
+    static const uint32_t patternB[4] = { 3, 2, 3, 2 };
+    static const uint32_t patternC[4] = { 3, 2, 1, 0 };
+    static const float32_t   signA[4] = { -1, -1, 1, 1 };
+
+    uint32x4_t vecA = vld1q_u32(patternA);
+    uint32x4_t vecB = vld1q_u32(patternB);
+    uint32x4_t vecC = vld1q_u32(patternC);
+    f32x4_t vecSignA = vld1q_f32(signA);
+
+    while (nbQuaternions > 0U)
+    {
+        f32x4_t vecTmpA, vecTmpB, vecAcc;
+
+        /* vecTmpA = [a1, a2, a1, a2] (gathered), vecTmpB = [b1, b2, b3, b4] */
+        vecTmpA = vldrwq_gather_shifted_offset_f32(qa, vecA);
+        vecTmpB = vld1q(qb);
+        /*
+         * vcmul(r, [a1, a2, a1, a2], [b1, b2, b3, b4], 0)
+         */
+        vecAcc = vcmulq(vecTmpA, vecTmpB);
+        /*
+         * vcmla(r, [a1, a2, a1, a2], [b1, b2, b3, b4], 90)
+         */
+        vecAcc = vcmlaq_rot90(vecAcc, vecTmpA, vecTmpB);
+
+        /* vecTmpA = [a4, a3, a4, a3], vecTmpB = [b4, b3, b2, b1] (both gathered) */
+        vecTmpA = vldrwq_gather_shifted_offset_f32(qa, vecB);
+        vecTmpB = vldrwq_gather_shifted_offset_f32(qb, vecC);
+        /* build [-b4, -b3, b2, b1] */
+        vecTmpB = vecTmpB * vecSignA;
+        /*
+         * vcmla(r, [a4, a3, a4, a3], [-b4, -b3, b2, b1], 270)
+         */
+        vecAcc = vcmlaq_rot270(vecAcc, vecTmpA, vecTmpB);
+        /*
+         * vcmla(r, [a4, a3, a4, a3], [-b4, -b3, b2, b1], 0)
+         */
+        vecAcc = vcmlaq(vecAcc, vecTmpA, vecTmpB);
+        /* store accumulator (issued after every load of this quaternion pair) */
+        vst1q_f32(qr, vecAcc);
+
+        /* move to next quaternion */
+        qa += 4;
+        qb += 4;
+        qr += 4;
+
+        nbQuaternions--;
+    }
+}
+
+#else
+
+void arm_quaternion_product_f32(const float32_t *qa, 
+    const float32_t *qb, 
+    float32_t *qr,
+    uint32_t nbQuaternions)
+{
+   /* Each quaternion occupies four consecutive floats in qa, qb and qr. */
+   uint32_t remaining = nbQuaternions;
+
+   while (remaining-- > 0U)
+   {
+     arm_quaternion_product_single_f32(qa, qb, qr);
+     qa += 4; qb += 4; qr += 4;
+   }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of QuatProdVect group
+ */
diff --git a/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_product_single_f32.c b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_product_single_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..96f23e99b10a4b427eef70ddb1d201e6be1387fe
--- /dev/null
+++ b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_product_single_f32.c
@@ -0,0 +1,107 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_quaternion_product_single_f32.c
+ * Description:  Floating-point quaternion product
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+
+/**
+  @ingroup QuatProd
+ */
+
+/**
+  @defgroup QuatProdSingle Quaternion Product
+
+  Compute the  product of two quaternions.
+ */
+
+/**
+  @addtogroup QuatProdSingle
+  @{
+ */
+
+/**
+  @brief         Floating-point product of two quaternions.
+  @param[in]     qa       first quaternion
+  @param[in]     qb       second quaternion
+  @param[out]    qr       product of two quaternions
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+void arm_quaternion_product_single_f32(const float32_t *qa, 
+    const float32_t *qb, 
+    float32_t *qr)
+{
+    /* Read-only lane tables (gather word offsets and lane signs); const because they are never written. */
+    static const uint32_t patternA[4] = { 0, 1, 0, 1 };
+    static const uint32_t patternB[4] = { 3, 2, 3, 2 };
+    static const uint32_t patternC[4] = { 3, 2, 1, 0 };
+    static const float32_t signA[4] = { -1, -1, 1, 1 };
+
+    uint32x4_t vecA = vld1q_u32(patternA);
+    uint32x4_t vecB = vld1q_u32(patternB);
+    uint32x4_t vecC = vld1q_u32(patternC);
+    f32x4_t vecSignA = vld1q_f32(signA);
+
+    f32x4_t vecTmpA, vecTmpB, vecAcc;
+    /* vecTmpA = [a1, a2, a1, a2] (gathered), vecTmpB = [b1, b2, b3, b4] */
+    vecTmpA = vldrwq_gather_shifted_offset_f32(qa, vecA);
+    vecTmpB = vld1q_f32(qb);
+    /* vcmul(..., 0) followed by vcmla(..., 90) on that pair */
+    vecAcc = vcmulq_f32(vecTmpA, vecTmpB);
+    vecAcc = vcmlaq_rot90_f32(vecAcc, vecTmpA, vecTmpB);
+
+    /* vecTmpA = [a4, a3, a4, a3], vecTmpB = [b4, b3, b2, b1] (both gathered) */
+    vecTmpA = vldrwq_gather_shifted_offset_f32(qa, vecB);
+    vecTmpB = vldrwq_gather_shifted_offset_f32(qb, vecC);
+    /* build [-b4, -b3, b2, b1], then vcmla(..., 270) and vcmla(..., 0) */
+    vecTmpB = vecTmpB * vecSignA;
+    vecAcc = vcmlaq_rot270_f32(vecAcc, vecTmpA, vecTmpB);
+    vecAcc = vcmlaq_f32(vecAcc, vecTmpA, vecTmpB);
+    /* single store, issued after every load from qa and qb */
+    vst1q_f32(qr, vecAcc);
+}
+
+#else
+void arm_quaternion_product_single_f32(const float32_t *qa, const float32_t *qb, float32_t *qr)
+{
+    /* Quaternion product qa*qb with the scalar part at index 0. All four components are computed before any store, so qr may alias qa or qb. */
+    const float32_t r0 = qa[0] * qb[0] - qa[1] * qb[1] - qa[2] * qb[2] - qa[3] * qb[3];
+    const float32_t r1 = qa[0] * qb[1] + qa[1] * qb[0] + qa[2] * qb[3] - qa[3] * qb[2];
+    const float32_t r2 = qa[0] * qb[2] + qa[2] * qb[0] + qa[3] * qb[1] - qa[1] * qb[3];
+    const float32_t r3 = qa[0] * qb[3] + qa[3] * qb[0] + qa[1] * qb[2] - qa[2] * qb[1];
+    qr[0] = r0; qr[1] = r1; qr[2] = r2; qr[3] = r3;
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of QuatProdSingle group
+ */
diff --git a/CMSIS/DSP/Source/QuaternionMathFunctions/arm_rotation2quaternion_f32.c b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_rotation2quaternion_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..b2930770b9a3a3129866e8bbe10b6e6aec01a31a
--- /dev/null
+++ b/CMSIS/DSP/Source/QuaternionMathFunctions/arm_rotation2quaternion_f32.c
@@ -0,0 +1,224 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rotation2quaternion_f32.c
+ * Description:  Floating-point rotation to quaternion conversion
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+#define RI(x,y) r[(3*(x) + (y))]
+
+
+/**
+  @ingroup QuatConv
+ */
+
+/**
+  @defgroup RotQuat Rotation to Quaternion
+
+  Conversions from rotation to quaternion.
+ */
+
+/**
+  @addtogroup RotQuat
+  @{
+ */
+
+/**
+ * @brief Conversion of a rotation matrix to an equivalent quaternion.
+ * @param[in]       pInputRotations points to an array of 3x3 rotation matrices (in row order)
+ * @param[out]      pOutputQuaternions points to an array of quaternions
+ * @param[in]       nbQuaternions number of quaternions in the array
+ * @return none.
+ *
+ * q and -q are representing the same rotation. This ambiguity must be taken into
+ * account when using the output of this function.
+ * 
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/* Matrix elements R<row><col>: q1 holds [r00, r01, r02, r10], q2 holds [r11, r12, r20, r21], r22 is the scalar ro22. */
+#define R00  vgetq_lane(q1,0)
+#define R01  vgetq_lane(q1,1)
+#define R02  vgetq_lane(q1,2)
+#define R10  vgetq_lane(q1,3)
+#define R11  vgetq_lane(q2,0)
+#define R12  vgetq_lane(q2,1)
+#define R20  vgetq_lane(q2,2)
+#define R21  vgetq_lane(q2,3)
+#define R22  ro22
+
+void arm_rotation2quaternion_f32(const float32_t *pInputRotations, 
+    float32_t *pOutputQuaternions,  
+    uint32_t nbQuaternions)
+{
+   float32_t ro22, trace;
+   f32x4_t q1,q2, q; 
+
+   float32_t doubler;
+   float32_t s;
+
+   q = vdupq_n_f32(0.0f);
+
+   for(uint32_t nb=0; nb < nbQuaternions; nb++)
+   {
+      q1 = vld1q(pInputRotations);   /* first four of the nine matrix elements */
+      pInputRotations += 4;
+
+      q2 = vld1q(pInputRotations);   /* next four matrix elements */
+      pInputRotations += 4;
+
+      ro22 = *pInputRotations++;     /* ninth element, kept as a scalar */
+
+      trace = R00 + R11 + R22;
+
+      /* Formula chosen from the trace and the largest diagonal element; all constants are single precision to avoid promotion to double. */
+      if (trace > 0.0f)
+      {
+        (void)arm_sqrt_f32(trace + 1.0f, &doubler) ; // invs=4*qw
+        doubler = 2.0f*doubler;
+        s = 1.0f / doubler;
+
+        q1 = vmulq_n_f32(q1,s);      /* the Rxx reads below see the scaled lanes */
+        q2 = vmulq_n_f32(q2,s);
+
+        q[0] = 0.25f * doubler;
+        q[1] = R21 - R12;
+        q[2] = R02 - R20;
+        q[3] = R10 - R01;
+      }
+      else if ((R00 > R11) && (R00 > R22) )
+      {
+        (void)arm_sqrt_f32(1.0f + R00 - R11 - R22,&doubler); // invs=4*qx
+        doubler = 2.0f*doubler;
+        s = 1.0f / doubler;
+
+        q1 = vmulq_n_f32(q1,s);
+        q2 = vmulq_n_f32(q2,s);
+
+        q[0] = R21 - R12;
+        q[1] = 0.25f * doubler;
+        q[2] = R01 + R10;
+        q[3] = R02 + R20;
+      }
+      else if (R11 > R22)
+      {
+        (void)arm_sqrt_f32(1.0f + R11 - R00 - R22,&doubler); // invs=4*qy
+        doubler = 2.0f*doubler;
+        s = 1.0f / doubler;
+
+        q1 = vmulq_n_f32(q1,s);
+        q2 = vmulq_n_f32(q2,s);
+
+        q[0] = R02 - R20;
+        q[1] = R01 + R10;
+        q[2] = 0.25f * doubler;
+        q[3] = R12 + R21;
+      }
+      else
+      {
+        (void)arm_sqrt_f32(1.0f + R22 - R00 - R11,&doubler); // invs=4*qz
+        doubler = 2.0f*doubler;
+        s = 1.0f / doubler;
+
+        q1 = vmulq_n_f32(q1,s);
+        q2 = vmulq_n_f32(q2,s);
+
+        q[0] = R10 - R01;
+        q[1] = R02 + R20;
+        q[2] = R12 + R21;
+        q[3] = 0.25f * doubler;
+      }
+
+      vst1q(pOutputQuaternions, q);
+      pOutputQuaternions += 4;
+   }
+}
+
+#else
+void arm_rotation2quaternion_f32(const float32_t *pInputRotations, 
+    float32_t *pOutputQuaternions,  
+    uint32_t nbQuaternions)
+{
+   for(uint32_t nb=0; nb < nbQuaternions; nb++)
+   {
+       const float32_t *r=&pInputRotations[nb*9];   /* current 3x3 matrix, read through RI(row,col) */
+       float32_t *q=&pOutputQuaternions[nb*4];      /* current output quaternion (four floats) */
+
+       float32_t trace = RI(0,0) + RI(1,1) + RI(2,2);
+
+       float32_t doubler;
+       float32_t s;
+
+      /* The formula is chosen from the trace and the largest diagonal element.
+         All constants are single precision so that no intermediate value is
+         promoted to double. */
+      if (trace > 0.0f)
+      {
+        doubler = sqrtf(trace + 1.0f) * 2.0f; // invs=4*qw
+        s = 1.0f / doubler;
+        q[0] = 0.25f * doubler;
+        q[1] = (RI(2,1) - RI(1,2)) * s;
+        q[2] = (RI(0,2) - RI(2,0)) * s;
+        q[3] = (RI(1,0) - RI(0,1)) * s;
+      }
+      else if ((RI(0,0) > RI(1,1)) && (RI(0,0) > RI(2,2)) )
+      {
+        doubler = sqrtf(1.0f + RI(0,0) - RI(1,1) - RI(2,2)) * 2.0f; // invs=4*qx
+        s = 1.0f / doubler;
+        q[0] = (RI(2,1) - RI(1,2)) * s;
+        q[1] = 0.25f * doubler;
+        q[2] = (RI(0,1) + RI(1,0)) * s;
+        q[3] = (RI(0,2) + RI(2,0)) * s;
+      }
+      else if (RI(1,1) > RI(2,2))
+      {
+        doubler = sqrtf(1.0f + RI(1,1) - RI(0,0) - RI(2,2)) * 2.0f; // invs=4*qy
+        s = 1.0f / doubler;
+        q[0] = (RI(0,2) - RI(2,0)) * s;
+        q[1] = (RI(0,1) + RI(1,0)) * s;
+        q[2] = 0.25f * doubler;
+        q[3] = (RI(1,2) + RI(2,1)) * s;
+      }
+      else
+      {
+        doubler = sqrtf(1.0f + RI(2,2) - RI(0,0) - RI(1,1)) * 2.0f; // invs=4*qz
+        s = 1.0f / doubler;
+        q[0] = (RI(1,0) - RI(0,1)) * s;
+        q[1] = (RI(0,2) + RI(2,0)) * s;
+        q[2] = (RI(1,2) + RI(2,1)) * s;
+        q[3] = 0.25f * doubler;
+      }
+
+    }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of RotQuat group
+ */
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..a3ebc7f712a4793b996dbc4286fbe4d35f0e5d34
--- /dev/null
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c
@@ -0,0 +1,98 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_linear_init_f16.c
+ * Description:  SVM Linear Instance Initialization
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @defgroup groupSVM SVM Functions
+ *
+ */
+
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup linearsvm Linear SVM
+
+  Linear SVM classifier
+ */
+
+/**
+ * @addtogroup linearsvm
+ * @{
+ */
+
+
+/**
+ * @brief        SVM linear instance init function
+ *
+ * Classes are integers used as the output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in]    S                      Parameters for the SVM function
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of 2 classes ID
+ * @return none.
+ *
+ */
+
+
+void arm_svm_linear_init_f16(arm_svm_linear_instance_f16 *S,
+  uint32_t nbOfSupportVectors, uint32_t vectorDimension,
+  float16_t intercept,
+  const float16_t *dualCoefficients, const float16_t *supportVectors,
+  const int32_t *classes)
+{
+   /* Model arrays: only the pointers are stored, the data is not copied. */
+   S->classes = classes;
+   S->supportVectors = supportVectors;
+   S->dualCoefficients = dualCoefficients;
+   /* Scalar model parameters. */
+   S->intercept = intercept;
+   S->vectorDimension = vectorDimension;
+   S->nbOfSupportVectors = nbOfSupportVectors;
+}
+
+
+
+/**
+ * @} end of linearsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c
index bddeafece982562a459dafd4bde6b4248db08b58..75395aa9c514c034badb33f2dac455995ab3f7f8 100644
--- a/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_svm_linear_init_f32.c
  * Description:  SVM Linear Instance Initialization
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/svm_functions.h"
 #include <limits.h>
 #include <math.h>
 
@@ -33,9 +35,18 @@
  *
  */
 
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup linearsvm Linear SVM
+
+  Linear SVM classifier
+ */
 
 /**
- * @addtogroup groupSVM
+ * @addtogroup linearsvm
  * @{
  */
 
@@ -77,5 +88,5 @@ void arm_svm_linear_init_f32(arm_svm_linear_instance_f32 *S,
 
 
 /**
- * @} end of groupSVM group
+ * @} end of linearsvm group
  */
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..494ef9a64fb03703412a88ed53ff7d273370543d
--- /dev/null
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c
@@ -0,0 +1,314 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_linear_predict_f16.c
+ * Description:  SVM Linear Classifier
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup linearsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM linear prediction
+ * @param[in]    S          Pointer to an instance of the linear SVM structure.
+ * @param[in]    in         Pointer to input vector
+ * @param[out]   pResult    Decision value
+ * @return none.
+ *
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_svm_linear_predict_f16(
+    const arm_svm_linear_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+    /* Inlined matrix x vector product (support vectors x in) interleaved with the dual-coefficient dot product; sum starts at the intercept. */
+    uint32_t        numRows = S->nbOfSupportVectors;
+    uint32_t        numCols = S->vectorDimension;
+    const float16_t *pSupport = S->supportVectors;
+    const float16_t *pSrcA = pSupport;
+    const float16_t *pInA0;
+    const float16_t *pInA1;
+    uint32_t         row;
+    uint32_t         blkCnt;     /* loop counters */
+    const float16_t *pDualCoef = S->dualCoefficients;
+    _Float16       sum = S->intercept;
+    row = numRows;
+
+    /*
+     * compute 4 rows (support vectors) in parallel
+     */
+    while (row >= 4) 
+    {
+        const float16_t *pInA2, *pInA3;
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1, acc2, acc3;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 4 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        acc2 = vdupq_n_f16(0.0f);
+        acc3 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+        pSrcA2Vec = pInA2;
+        pSrcA3Vec = pInA3;
+
+        blkCnt = numCols >> 3;   /* full blocks of 8 half-precision elements */
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 8;
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 8;
+            acc3 = vfmaq(acc3, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: the remaining (numCols & 7) elements
+         * (loaded with a zeroing predicate built by vctp16q)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+            acc3 = vfmaq(acc3, vecIn, vecA);
+        }
+        /*
+         * Weight each row accumulator by its dual coefficient, then reduce across lanes
+         */
+        acc0 = vmulq_n_f16(acc0,*pDualCoef++);
+        acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
+        acc0 = vfmaq_n_f16(acc0,acc2,*pDualCoef++);
+        acc0 = vfmaq_n_f16(acc0,acc3,*pDualCoef++);
+
+        sum += (_Float16)vecAddAcrossF16Mve(acc0);
+
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parallel (at most once: fewer than 4 rows remain here)
+     */
+    if (row >= 2) {
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 2 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: the remaining (numCols & 7) elements
+         * (loaded with a zeroing predicate built by vctp16q)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+        }
+        /*
+         * Weight each row accumulator by its dual coefficient, then reduce across lanes
+         */
+        acc0 = vmulq_n_f16(acc0,*pDualCoef++);
+        acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
+
+        sum += (_Float16)vecAddAcrossF16Mve(acc0);
+
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    if (row >= 1) {
+        f16x8_t         vecIn, acc0;
+        float16_t const *pSrcA0Vec, *pInVec;
+        float16_t const *pSrcVecPtr = in;
+        /*
+         * Initialize the pointers to last MatrixA row
+         */
+        pInA0 = pSrcA;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: the remaining (numCols & 7) elements
+         * (loaded with a zeroing predicate built by vctp16q)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+        }
+        /*
+         * Reduce across lanes first, then weight by the last dual coefficient
+         */
+        sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0);
+
+    }
+
+    *pResult = S->classes[STEP(sum)];   /* STEP(sum) is used as the index into classes */
+}
+
+#else
+void arm_svm_linear_predict_f16(
+    const arm_svm_linear_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+    /* The decision value starts at the intercept; every support vector then
+       adds dualCoefficient * dot(in, supportVector). */
+    const float16_t *pRow = S->supportVectors;
+    _Float16 decision = S->intercept;
+
+    for(uint32_t sv = 0; sv < S->nbOfSupportVectors; sv++, pRow += S->vectorDimension)
+    {
+        _Float16 acc = 0;
+        for(uint32_t k = 0; k < S->vectorDimension; k++)
+        {
+            acc = acc + in[k] * pRow[k];
+        }
+        decision += S->dualCoefficients[sv] * acc;
+    }
+    *pResult = S->classes[STEP(decision)];   /* STEP(decision) is used as the index into classes */
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of linearsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c
index 503ba16fe772cda61e6a7ea4f203c33b91ee5e11..caf09df07b5819005ce9be10aebad96edf8be083 100644
--- a/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_svm_linear_predict_f32.c
  * Description:  SVM Linear Classifier
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,13 +26,13 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/svm_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 /**
- * @addtogroup groupSVM
+ * @addtogroup linearsvm
  * @{
  */
 
@@ -142,10 +144,13 @@ void arm_svm_linear_predict_f32(
         /*
          * Sum the partial parts
          */
-        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);
-        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc1);
-        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc2);
-        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc3);
+
+        acc0 = vmulq_n_f32(acc0,*pDualCoef++);
+        acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);
+        acc0 = vfmaq_n_f32(acc0,acc2,*pDualCoef++);
+        acc0 = vfmaq_n_f32(acc0,acc3,*pDualCoef++);
+
+        sum += vecAddAcrossF32Mve(acc0);
 
         pSrcA += numCols * 4;
         /*
@@ -212,8 +217,11 @@ void arm_svm_linear_predict_f32(
         /*
          * Sum the partial parts
          */
-        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);
-        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc1);
+        acc0 = vmulq_n_f32(acc0,*pDualCoef++);
+        acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);
+
+        sum += vecAddAcrossF32Mve(acc0);
+
 
         pSrcA += numCols * 2;
         row -= 2;
@@ -449,5 +457,5 @@ void arm_svm_linear_predict_f32(
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
- * @} end of groupSVM group
+ * @} end of linearsvm group
  */
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..558ab450c52d2e259e448521b3dad565e96aed51
--- /dev/null
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c
@@ -0,0 +1,103 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_polynomial_init_f16.c
+ * Description:  SVM Polynomial Instance Initialization
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup polysvm Polynomial SVM
+
+  Polynomial SVM classifier
+ */
+
+/**
+ * @addtogroup polysvm
+ * @{
+ */
+
+
+/**
+ * @brief        SVM polynomial instance init function
+ *
+ * Classes are integers used as the output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in]    S                      points to an instance of the polynomial SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of 2 classes ID
+ * @param[in]    degree                 Polynomial degree
+ * @param[in]    coef0                  coeff0 (scikit-learn terminology)
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+
+void arm_svm_polynomial_init_f16(arm_svm_polynomial_instance_f16 *S,
+  uint32_t nbOfSupportVectors, uint32_t vectorDimension,
+  float16_t intercept,
+  const float16_t *dualCoefficients, const float16_t *supportVectors,
+  const int32_t *classes,
+  int32_t degree, float16_t coef0, float16_t gamma)
+{
+   /* Polynomial kernel parameters. */
+   S->gamma = gamma;
+   S->coef0 = coef0;
+   S->degree = degree;
+
+   /* Model arrays: only the pointers are stored, the data is not copied. */
+   S->classes = classes;
+   S->supportVectors = supportVectors;
+   S->dualCoefficients = dualCoefficients;
+
+   /* Model dimensions and intercept. */
+   S->intercept = intercept;
+   S->vectorDimension = vectorDimension;
+   S->nbOfSupportVectors = nbOfSupportVectors;
+}
+
+
+
+/**
+ * @} end of polysvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c
index 8828ea2a8eec1026dec5943c9daf72fc6dd7804f..7d33cd79cf999711b8d471e35d4a4b3cea0873dc 100644
--- a/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_svm_polynomial_init_f32.c
  * Description:  SVM Polynomial Instance Initialization
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,16 +26,24 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/svm_functions.h"
 #include <limits.h>
 #include <math.h>
 
+/**
+  @ingroup groupSVM
+ */
 
 /**
- * @addtogroup groupSVM
- * @{
+  @defgroup polysvm Polynomial SVM
+
+  Polynomial SVM classifier
  */
 
+/**
+ * @addtogroup polysvm
+ * @{
+ */
 
 
 /**
@@ -83,5 +93,5 @@ void arm_svm_polynomial_init_f32(arm_svm_polynomial_instance_f32 *S,
 
 
 /**
- * @} end of groupSVM group
+ * @} end of polysvm group
  */
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..e3e2d6a262331a21def700ab1095cdea259d3b56
--- /dev/null
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
@@ -0,0 +1,336 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_polynomial_predict_f16.c
+ * Description:  SVM Polynomial Classifier
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup polysvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM polynomial prediction
+ * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
+ * @param[in]    in         Pointer to input vector
+ * @param[out]   pResult    Predicted class ID (entry of S->classes selected from the decision value)
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+void arm_svm_polynomial_predict_f16(
+    const arm_svm_polynomial_instance_f16 *S,   /* in:  polynomial SVM instance */
+    const float16_t * in,                       /* in:  input vector (S->vectorDimension values are read) */
+    int32_t * pResult)                          /* out: predicted class, S->classes[STEP(sum)] */
+{
+    /* inlined Matrix x Vector function (support vectors x in) interleaved with the kernel evaluation */
+    uint32_t        numRows = S->nbOfSupportVectors;
+    uint32_t        numCols = S->vectorDimension;
+    const float16_t *pSupport = S->supportVectors;
+    const float16_t *pSrcA = pSupport;
+    const float16_t *pInA0;
+    const float16_t *pInA1;
+    uint32_t         row;
+    uint32_t         blkCnt;     /* loop counters */
+    const float16_t *pDualCoef = S->dualCoefficients;
+    _Float16       sum = S->intercept;
+    f16x8_t         vSum = vdupq_n_f16(0.0f);
+
+    row = numRows;
+
+    /*
+     * compute 4 rows in parallel
+     */
+    while (row >= 4) {
+        const float16_t *pInA2, *pInA3;
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1, acc2, acc3;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 4 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        acc2 = vdupq_n_f16(0.0f);
+        acc3 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+        pSrcA2Vec = pInA2;
+        pSrcA3Vec = pInA3;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 8;
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 8;
+            acc3 = vfmaq(acc3, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: remaining (numCols & 7) elements, loaded under predicate p0
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+            acc3 = vfmaq(acc3, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts: lane k of vtmp receives the reduced accumulator of row k
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
+        /* load/accumulate only the 4 valid lanes: no read past the end of the dual coefficients */
+        vSum = vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, vctp16q(4)),
+                             arm_vec_exponent_f16
+                             (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0),
+                                S->degree), vctp16q(4));
+
+        pDualCoef += 4;
+
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parallel
+     */
+    if (row >= 2) {
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 2 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: remaining (numCols & 7) elements, loaded under predicate p0
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts: lane k of vtmp receives the reduced accumulator of row k
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+        /* load/accumulate only the 2 valid lanes: no read past the end of the dual coefficients */
+        vSum = vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, vctp16q(2)),
+                             arm_vec_exponent_f16
+                             (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
+                             vctp16q(2));
+
+        pDualCoef += 2;
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    if (row >= 1) {
+        f16x8_t         vecIn, acc0;
+        float16_t const *pSrcA0Vec, *pInVec;
+        float16_t const *pSrcVecPtr = in;
+        /*
+         * Initialize the pointers to last MatrixA row
+         */
+        pInA0 = pSrcA;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: remaining (numCols & 7) elements, loaded under predicate p0
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts (single valid lane: load/accumulate lane 0 only, no over-read)
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        vSum = vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, vctp16q(1)),
+                             arm_vec_exponent_f16
+                             (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
+                             vctp16q(1));
+    }
+    sum += (_Float16)vecAddAcrossF16Mve(vSum);
+
+    /* STEP(sum) is used as the index of the class ID to return */
+    *pResult = S->classes[STEP(sum)];
+}
+
+#else
+void arm_svm_polynomial_predict_f16(
+    const arm_svm_polynomial_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+    /* Scalar path: intercept + sum of dualCoefficients[sv] * arm_exponent_f16(gamma * dot + coef0, degree) */
+    const float16_t *pVec = S->supportVectors;
+    _Float16 decision = S->intercept;
+    uint32_t sv, k;
+    for (sv = 0; sv < S->nbOfSupportVectors; sv++, pVec += S->vectorDimension)
+    {
+        /* dot product between the input and the current support vector */
+        _Float16 acc = 0;
+        for (k = 0; k < S->vectorDimension; k++)
+        {
+            acc = acc + (_Float16)in[k] * (_Float16)pVec[k];
+        }
+        decision += S->dualCoefficients[sv] * (_Float16)arm_exponent_f16(S->gamma * acc + S->coef0, S->degree);
+    }
+
+    *pResult = S->classes[STEP(decision)];
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/**
+ * @} end of polysvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c
index 0873bff6674eeb94917a0008353742b5f9ba708f..13b1f84c417ff0d471f8f1b6c83668b2060a9b94 100644
--- a/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_svm_polynomial_predict_f32.c
  * Description:  SVM Polynomial Classifier
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/svm_functions.h"
 #include <limits.h>
 #include <math.h>
 
@@ -33,7 +35,7 @@
 #endif
 
 /**
- * @addtogroup groupSVM
+ * @addtogroup polysvm
  * @{
  */
 
@@ -484,5 +486,5 @@ void arm_svm_polynomial_predict_f32(
 
 
 /**
- * @} end of groupSVM group
+ * @} end of polysvm group
  */
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..43de2498376ebe446fa7aafe11422ba09839bea9
--- /dev/null
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_rbf_init_f16.c
+ * Description:  SVM Radial Basis Function Instance Initialization
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup rbfsvm RBF SVM
+
+  RBF SVM classifier
+ */
+
+
+/**
+ * @addtogroup rbfsvm
+ * @{
+ */
+
+
+/**
+ * @brief        Initialize a radial basis function (RBF) SVM instance (f16)
+ *
+ * Classes are integers used as the output of the classifier (instead of
+ * having -1,1 as class values).
+ *
+ * @param[out]   S                      points to the RBF SVM instance to fill in
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients (stored by pointer)
+ * @param[in]    supportVectors         Array of support vectors (stored by pointer)
+ * @param[in]    classes                Array of 2 class IDs (stored by pointer)
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_rbf_init_f16(arm_svm_rbf_instance_f16 *S,
+                          uint32_t nbOfSupportVectors,
+                          uint32_t vectorDimension,
+                          float16_t intercept,
+                          const float16_t *dualCoefficients,
+                          const float16_t *supportVectors,
+                          const int32_t *classes,
+                          float16_t gamma)
+{
+    /* Plain field-by-field copy; the arrays are referenced, not duplicated. */
+    S->classes            = classes;
+    S->supportVectors     = supportVectors;
+    S->dualCoefficients   = dualCoefficients;
+    S->intercept          = intercept;
+    S->vectorDimension    = vectorDimension;
+    S->nbOfSupportVectors = nbOfSupportVectors;
+    S->gamma              = gamma;
+}
+
+
+
+/**
+ * @} end of rbfsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c
index 6128e983edd5e8ca1e97774762915435551559ff..77bf28246706cb45a5c6ad45911773fd464c11ca 100644
--- a/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_svm_rbf_init_f32.c
  * Description:  SVM Radial Basis Function Instance Initialization
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,13 +26,23 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/svm_functions.h"
 #include <limits.h>
 #include <math.h>
 
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup rbfsvm RBF SVM
+
+  RBF SVM classifier
+ */
+
 
 /**
- * @addtogroup groupSVM
+ * @addtogroup rbfsvm
  * @{
  */
 
@@ -75,5 +87,5 @@ void arm_svm_rbf_init_f32(arm_svm_rbf_instance_f32 *S,
 
 
 /**
- * @} end of groupSVM group
+ * @} end of rbfsvm group
  */
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..7724fda25ae0318603c4b656aad801a475b7fff2
--- /dev/null
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c
@@ -0,0 +1,352 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_rbf_predict_f16.c
+ * Description:  SVM Radial Basis Function Classifier
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup rbfsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM rbf prediction
+ * @param[in]    S         Pointer to an instance of the rbf SVM structure.
+ * @param[in]    in        Pointer to input vector
+ * @param[out]   pResult   Predicted class ID (entry of S->classes selected from the decision value)
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+void arm_svm_rbf_predict_f16(
+    const arm_svm_rbf_instance_f16 *S,   /* in:  RBF SVM instance */
+    const float16_t * in,                /* in:  input vector (S->vectorDimension values are read) */
+    int32_t * pResult)                   /* out: predicted class, S->classes[STEP(sum)] */
+{
+    /* inlined Matrix x Vector style loop (support vectors vs in) interleaved with the kernel evaluation */
+    uint32_t        numRows = S->nbOfSupportVectors;
+    uint32_t        numCols = S->vectorDimension;
+    const float16_t *pSupport = S->supportVectors;
+    const float16_t *pSrcA = pSupport;
+    const float16_t *pInA0;
+    const float16_t *pInA1;
+    uint32_t         row;
+    uint32_t         blkCnt;     /* loop counters */
+    const float16_t *pDualCoef = S->dualCoefficients;
+    _Float16       sum = S->intercept;
+    f16x8_t         vSum = vdupq_n_f16(0.0f);
+
+    row = numRows;
+
+    /*
+     * compute 4 rows in parallel
+     */
+    while (row >= 4) {
+        const float16_t *pInA2, *pInA3;
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1, acc2, acc3;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 4 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        acc2 = vdupq_n_f16(0.0f);
+        acc3 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+        pSrcA2Vec = pInA2;
+        pSrcA3Vec = pInA3;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+            f16x8_t         vecDif;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            vecDif = vsubq(vecIn, vecA);
+            acc1 = vfmaq(acc1, vecDif, vecDif);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 8;
+            vecDif = vsubq(vecIn, vecA);
+            acc2 = vfmaq(acc2, vecDif, vecDif);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 8;
+            vecDif = vsubq(vecIn, vecA);
+            acc3 = vfmaq(acc3, vecDif, vecDif);
+
+            blkCnt--;
+        }
+        /*
+         * tail: remaining (numCols & 7) elements, loaded under predicate p0
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+            f16x8_t         vecDif;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc1 = vfmaq(acc1, vecDif, vecDif);
+            vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc2 = vfmaq(acc2, vecDif, vecDif);
+            vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc3 = vfmaq(acc3, vecDif, vecDif);
+        }
+        /*
+         * Sum the partial parts: lane k of vtmp receives the reduced accumulator of row k
+         */
+
+        /* per row: vSum lane += dualCoef * vexpq_f16(-gamma * accumulated squared differences) */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
+        /* load/accumulate only the 4 valid lanes: no read past the end of the dual coefficients */
+        vSum =
+            vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, vctp16q(4)),
+                      vexpq_f16(vmulq_n_f16(vtmp, -S->gamma)), vctp16q(4));
+        pDualCoef += 4;
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parallel
+     */
+    if (row >= 2) {
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 2 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+            f16x8_t         vecDif;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            vecDif = vsubq(vecIn, vecA);
+            acc1 = vfmaq(acc1, vecDif, vecDif);
+
+            blkCnt--;
+        }
+        /*
+         * tail: remaining (numCols & 7) elements, loaded under predicate p0
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA, vecDif;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc1 = vfmaq(acc1, vecDif, vecDif);
+        }
+        /*
+         * Sum the partial parts: lane k of vtmp receives the reduced accumulator of row k
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+        /* load/accumulate only the 2 valid lanes: no read past the end of the dual coefficients */
+        vSum =
+            vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, vctp16q(2)),
+                        vexpq_f16(vmulq_n_f16(vtmp, -S->gamma)), vctp16q(2));
+        pDualCoef += 2;
+
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    if (row >= 1) {
+        f16x8_t         vecIn, acc0;
+        float16_t const *pSrcA0Vec, *pInVec;
+        float16_t const *pSrcVecPtr = in;
+        /*
+         * Initialize the pointers to last MatrixA row
+         */
+        pInA0 = pSrcA;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA, vecDif;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+
+            blkCnt--;
+        }
+        /*
+         * tail: remaining (numCols & 7) elements, loaded under predicate p0
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA, vecDif;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        /* single valid lane: load/accumulate lane 0 only, no read past the dual coefficients */
+        vSum =
+            vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, vctp16q(1)),
+                        vexpq_f16(vmulq_n_f16(vtmp, -S->gamma)), vctp16q(1));
+
+    }
+
+    /* STEP(sum) is used as the index of the class ID to return */
+    sum += vecAddAcrossF16Mve(vSum);
+    *pResult = S->classes[STEP(sum)];
+}
+
+#else
+void arm_svm_rbf_predict_f16(
+    const arm_svm_rbf_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+    /* Scalar path: intercept + sum of dualCoefficients[sv] * expf(-gamma * accumulated SQ differences) */
+    const float16_t *pVec = S->supportVectors;
+    _Float16 decision = S->intercept;
+    uint32_t sv, k;
+
+    for (sv = 0; sv < S->nbOfSupportVectors; sv++, pVec += S->vectorDimension)
+    {
+        /* accumulate SQ(in[k] - support[k]) over the current support vector */
+        _Float16 sqDist = 0.0f16;
+        for (k = 0; k < S->vectorDimension; k++)
+        {
+            sqDist = sqDist + SQ((_Float16)in[k] - (_Float16)pVec[k]);
+        }
+        decision += (_Float16)S->dualCoefficients[sv] * (_Float16)expf(-(_Float16)S->gamma * sqDist);
+    }
+    *pResult = S->classes[STEP(decision)];
+}
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of rbfsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c
index 8625ef49ae35d4cf34f3513bf83957ca64f066fc..d3c43bf63a1b01448241313efa1c1dea81b91e5f 100644
--- a/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_svm_rbf_predict_f32.c
  * Description:  SVM Radial Basis Function Classifier
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,13 +26,13 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/svm_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 /**
- * @addtogroup groupSVM
+ * @addtogroup rbfsvm
  * @{
  */
 
@@ -517,5 +519,5 @@ void arm_svm_rbf_predict_f32(
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
- * @} end of groupSVM group
+ * @} end of rbfsvm group
  */
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..7a27417f897537dadfae986515969c20591d8485
--- /dev/null
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c
@@ -0,0 +1,98 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_sigmoid_init_f16.c
+ * Description:  SVM Sigmoid Instance Initialization
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup sigmoidsvm Sigmoid SVM
+
+  Sigmoid SVM classifier
+ */
+
+/**
+ * @addtogroup sigmoidsvm
+ * @{
+ */
+
+
+/**
+ * @brief        Initialize a sigmoid SVM instance (f16)
+ *
+ * Classes are integers used as the output of the classifier (instead of
+ * having -1,1 as class values).
+ *
+ * @param[out]   S                      points to the sigmoid SVM instance to fill in
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients (stored by pointer)
+ * @param[in]    supportVectors         Array of support vectors (stored by pointer)
+ * @param[in]    classes                Array of 2 class IDs (stored by pointer)
+ * @param[in]    coef0                  coef0 (scikit-learn terminology)
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_sigmoid_init_f16(arm_svm_sigmoid_instance_f16 *S,
+                              uint32_t nbOfSupportVectors,
+                              uint32_t vectorDimension,
+                              float16_t intercept,
+                              const float16_t *dualCoefficients,
+                              const float16_t *supportVectors,
+                              const int32_t *classes,
+                              float16_t coef0,
+                              float16_t gamma)
+{
+    /* Plain field-by-field copy; the arrays are referenced, not duplicated. */
+    S->classes            = classes;
+    S->supportVectors     = supportVectors;
+    S->dualCoefficients   = dualCoefficients;
+    S->intercept          = intercept;
+    S->vectorDimension    = vectorDimension;
+    S->nbOfSupportVectors = nbOfSupportVectors;
+    S->gamma              = gamma;
+    S->coef0              = coef0;
+}
+
+
+/**
+ * @} end of sigmoidsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c
index 7715d909dbec4a9dd1496f0227c6a5a330a7b561..a7f16c289772b5e1935016f8f6a1980357d56331 100644
--- a/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_svm_sigmoid_predict_f32.c
  * Description:  SVM Sigmoid Instance Initialization
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,13 +26,22 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/svm_functions.h"
 #include <limits.h>
 #include <math.h>
 
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup sigmoidsvm Sigmoid SVM
+
+  Sigmoid SVM classifier
+ */
 
 /**
- * @addtogroup groupSVM
+ * @addtogroup sigmoidsvm
  * @{
  */
 
@@ -77,5 +88,5 @@ void arm_svm_sigmoid_init_f32(arm_svm_sigmoid_instance_f32 *S,
 
 
 /**
- * @} end of groupSVM group
+ * @} end of sigmoidsvm group
  */
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..670806b519985e6089881f2a805540be7744df90
--- /dev/null
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c
@@ -0,0 +1,333 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_sigmoid_predict_f16.c
+ * Description:  SVM Sigmoid Classifier
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @addtogroup sigmoidsvm
+ * @{
+ */
+
+
+
+/**
+ * @brief SVM sigmoid prediction
+ * @param[in]    S        Pointer to an instance of the sigmoid SVM structure.
+ * @param[in]    in       Pointer to input vector
+ * @param[out]   pResult  Decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+void arm_svm_sigmoid_predict_f16(
+    const arm_svm_sigmoid_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+    /* Support-vector matrix x input vector, fused with the dual-coefficient dot product */
+    uint32_t        numRows = S->nbOfSupportVectors;
+    uint32_t        numCols = S->vectorDimension;
+    const float16_t *pSupport = S->supportVectors;
+    const float16_t *pSrcA = pSupport;
+    const float16_t *pInA0;
+    const float16_t *pInA1;
+    uint32_t         row;
+    uint32_t         blkCnt;     /* loop counters */
+    const float16_t *pDualCoef = S->dualCoefficients;
+    _Float16       sum = S->intercept;
+    f16x8_t         vSum = vdupq_n_f16(0.0f);
+
+    row = numRows;
+
+    /*
+     * compute 4 rows in parallel
+     */
+    while (row >= 4) {
+        const float16_t *pInA2, *pInA3;
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1, acc2, acc3;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 4 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        acc2 = vdupq_n_f16(0.0f);
+        acc3 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+        pSrcA2Vec = pInA2;
+        pSrcA3Vec = pInA3;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 8;
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 8;
+            acc3 = vfmaq(acc3, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: the last (numCols % 8) columns,
+         * loaded with zeroing predication so inactive lanes contribute 0
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+            acc3 = vfmaq(acc3, vecIn, vecA);
+        }
+        /*
+         * Reduce each accumulator to one dot product per row (lanes 0..3 of vtmp)
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
+        /* vSum[0..3] += dualCoef * tanh(gamma * dot + coef0); only 4 coefficients are loaded */
+        vSum =
+            vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, vctp16q(4)),
+                      vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)), vctp16q(4));
+
+        pDualCoef += 4;
+
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parallel
+     */
+    if (row >= 2) {
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 2 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: the last (numCols % 8) columns,
+         * loaded with zeroing predication so inactive lanes contribute 0
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+        }
+        /*
+         * Reduce each accumulator to one dot product per row (lanes 0..1 of vtmp)
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+        /* vSum[0..1] += dualCoef * tanh(gamma * dot + coef0); only 2 coefficients are loaded */
+        vSum =
+            vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, vctp16q(2)),
+                        vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
+                        vctp16q(2));
+        pDualCoef += 2;   /* the single-row block below must see the coefficient of the last row */
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    if (row >= 1) {
+        f16x8_t         vecIn, acc0;
+        float16_t const *pSrcA0Vec, *pInVec;
+        float16_t const *pSrcVecPtr = in;
+        /*
+         * Initialize the pointers to last MatrixA row
+         */
+        pInA0 = pSrcA;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: the last (numCols % 8) columns,
+         * loaded with zeroing predication so inactive lanes contribute 0
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+        }
+        /*
+         * Reduce the accumulator to the dot product of the last row (lane 0 of vtmp)
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        /* vSum[0] += dualCoef * tanh(gamma * dot + coef0); only 1 coefficient is loaded */
+        vSum =
+            vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, vctp16q(1)),
+                        vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
+                        vctp16q(1));
+    }
+    sum += vecAddAcrossF16Mve(vSum);
+
+    *pResult = S->classes[STEP(sum)];
+}
+
+#else
+void arm_svm_sigmoid_predict_f16(
+    const arm_svm_sigmoid_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+    /* decision = intercept + sum over vectors of dualCoef * tanh(gamma * <in, vector> + coef0) */
+    const float16_t *pVec = S->supportVectors;   /* walks the contiguously stored support vectors */
+    _Float16 decision = S->intercept;
+    uint32_t sv, k;
+
+    for (sv = 0; sv < S->nbOfSupportVectors; sv++)
+    {
+        _Float16 acc = 0.0f16;
+        for (k = 0; k < S->vectorDimension; k++, pVec++)
+        {
+            acc = acc + (_Float16)in[k] * (_Float16)*pVec;
+        }
+        decision += (_Float16)S->dualCoefficients[sv] * (_Float16)tanhf((_Float16)S->gamma * acc + (_Float16)S->coef0);
+    }
+    *pResult = S->classes[STEP(decision)];
+}
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of sigmoidsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c
index f3b2f2f45c57f283541ef5a2d8641c914f9eae3c..37c8a080128f41d2245b6b7b807f32391b4a9d3c 100644
--- a/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_svm_sigmoid_predict_f32.c
  * Description:  SVM Sigmoid Classifier
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,12 +26,12 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/svm_functions.h"
 #include <limits.h>
 #include <math.h>
 
 /**
- * @addtogroup groupSVM
+ * @addtogroup sigmoidsvm
  * @{
  */
 
@@ -481,5 +483,5 @@ void arm_svm_sigmoid_predict_f32(
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
- * @} end of groupSVM group
+ * @} end of sigmoidsvm group
  */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_f16.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..473397f878c34771a34475c902462161b1325854
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_f16.c
@@ -0,0 +1,274 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_absmax_f16.c
+ * Description:  Maximum value of absolute values of a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+/**
+  @ingroup groupStats
+ */
+
+
+/**
+  @addtogroup AbsMax
+  @{
+ */
+
+/**
+  @brief         Maximum value of absolute values of a floating-point vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    maximum value returned here
+  @param[out]    pIndex     index of maximum value returned here
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+void arm_absmax_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult,
+        uint32_t * pIndex)
+{
+    uint32_t        blkCnt;           /* loop counter, 32-bit so blockSize >> 3 is not truncated */
+    f16x8_t       vecSrc;
+    float16_t const *pSrcVec;
+    f16x8_t       curExtremValVec = vdupq_n_f16(F16_ABSMIN);
+    float16_t       maxValue = F16_ABSMIN;
+    uint16_t        idx = blockSize;
+    uint16x8_t    indexVec;
+    uint16x8_t    curExtremIdxVec;
+    mve_pred16_t    p0;
+
+    /* Positions are tracked in 16-bit lanes, so indices above 65535 cannot be represented. */
+    indexVec = vidupq_u16((uint32_t)0, 1);
+    curExtremIdxVec = vdupq_n_u16(0);
+
+    pSrcVec = (float16_t const *) pSrc;
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        vecSrc = vldrhq_f16(pSrcVec);
+        pSrcVec += 8;
+        vecSrc = vabsq(vecSrc);
+        /*
+         * Get current max per lane and current index per lane
+         * when a max is selected
+         */
+        p0 = vcmpgeq(vecSrc, curExtremValVec);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+        indexVec = indexVec +  8;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail: the last (blockSize % 8) samples,
+     * handled under a predicate so nothing past the input is read
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        p0 = vctp16q(blkCnt);
+        /* Predicated load: lanes >= blkCnt are zeroed instead of being read from memory */
+        vecSrc = vldrhq_z_f16(pSrcVec, p0);
+        vecSrc = vabsq(vecSrc);
+
+        /*
+         * Get current max per lane and current index per lane
+         * when a max is selected (active lanes only)
+         */
+        p0 = vcmpgeq_m(vecSrc, curExtremValVec, p0);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+    }
+    /*
+     * Get max value across the vector
+     */
+    maxValue = vmaxnmvq(maxValue, curExtremValVec);
+    /*
+     * set index for lower values to max possible index
+     */
+    p0 = vcmpgeq(curExtremValVec, maxValue);
+    indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
+    /*
+     * Get min index which is thus for a max value
+     */
+    idx = vminvq(idx, indexVec);
+    /*
+     * Save result
+     */
+    *pIndex = idx;
+    *pResult = maxValue;
+}
+#else
+#if defined(ARM_MATH_LOOPUNROLL)
+void arm_absmax_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult,
+        uint32_t * pIndex)
+{
+  float16_t sample, best;        /* magnitude of the current sample / largest magnitude so far */
+  uint32_t remaining, bestIdx;   /* loop counter / position of 'best' */
+  uint32_t base;                 /* position of the sample just before the current group of four */
+
+  /* The first sample seeds the running maximum. */
+  best = *pSrc++;
+  best = (best > 0.0f16) ? best : -best;
+  bestIdx = 0U;
+  base = 0U;
+
+  /* Unrolled part: four samples per iteration. */
+  remaining = (blockSize - 1U) >> 2U;
+
+  while (remaining > 0U)
+  {
+    /* Sample base + 1; the strict '>' keeps the earliest position on ties. */
+    sample = *pSrc++;
+    sample = (sample > 0.0f16) ? sample : -sample;
+    if (sample > best)
+    {
+      best = sample;
+      bestIdx = base + 1U;
+    }
+
+    /* Sample base + 2 */
+    sample = *pSrc++;
+    sample = (sample > 0.0f16) ? sample : -sample;
+    if (sample > best)
+    {
+      best = sample;
+      bestIdx = base + 2U;
+    }
+
+    /* Sample base + 3 */
+    sample = *pSrc++;
+    sample = (sample > 0.0f16) ? sample : -sample;
+    if (sample > best)
+    {
+      best = sample;
+      bestIdx = base + 3U;
+    }
+
+    /* Sample base + 4 */
+    sample = *pSrc++;
+    sample = (sample > 0.0f16) ? sample : -sample;
+    if (sample > best)
+    {
+      best = sample;
+      bestIdx = base + 4U;
+    }
+
+    /* Step to the next group of four. */
+    base += 4U;
+
+    /* Decrement loop counter */
+    remaining--;
+  }
+
+  /* Leftover (blockSize - 1) % 4 samples, one at a time. */
+  remaining = (blockSize - 1U) % 4U;
+
+  while (remaining > 0U)
+  {
+    sample = *pSrc++;
+    sample = (sample > 0.0f16) ? sample : -sample;
+    if (sample > best)
+    {
+      best = sample;
+      /* Counting down: blockSize - remaining is this sample's position. */
+      bestIdx = blockSize - remaining;
+    }
+
+    /* Decrement loop counter */
+    remaining--;
+  }
+
+  /* Store the largest magnitude and its position through the output pointers. */
+  *pResult = best;
+  *pIndex = bestIdx;
+}
+#else
+void arm_absmax_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult,
+        uint32_t * pIndex)
+{
+  float16_t best;               /* largest magnitude seen so far */
+  uint32_t bestIdx = 0U;        /* position of 'best' in the input */
+  uint32_t remaining;           /* samples still to be examined */
+
+  /*
+   * Plain scalar path: a single pass that tracks the largest
+   * absolute value and the position where it was first seen.
+   * Magnitudes are taken with fabsf(), so each sample is handled
+   * as a float before being stored back as float16_t.
+   */
+
+  /* The first sample seeds the running maximum. */
+  best = fabsf(*pSrc++);
+
+  /* Visit the other blockSize - 1 samples, counting down. */
+  for (remaining = (blockSize - 1U); remaining > 0U; remaining--)
+  {
+    float16_t magnitude = fabsf(*pSrc++);
+
+    /* Strict comparison: on ties the earliest position is kept. */
+    if (best < magnitude)
+    {
+      best = magnitude;
+      /* blockSize - remaining is the position of the sample just read. */
+      bestIdx = blockSize - remaining;
+    }
+
+  }
+
+  /* Hand the maximum and its position back through the output pointers. */
+  *pResult = best;
+  *pIndex = bestIdx;
+}
+#endif /* defined(ARM_MATH_LOOPUNROLL) */
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of AbsMax group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_f32.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..a571ae0d7aeaed8591a460688f7a333887dcaf58
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_f32.c
@@ -0,0 +1,260 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_absmax_f32.c
+ * Description:  Maximum value of absolute values of a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+/**
+  @ingroup groupStats
+ */
+
+/**
+  @defgroup AbsMax Absolute Maximum
+
+  Computes the maximum value of absolute values of an array of data.
+  The function returns both the maximum value and its position within the array.
+  There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+  @addtogroup AbsMax
+  @{
+ */
+
+/**
+  @brief         Maximum value of absolute values of a floating-point vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    maximum value returned here
+  @param[out]    pIndex     index of maximum value returned here
+  @return        none
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_absmax_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex)
+{
+    int32_t blkSize = blockSize;                           /* samples left to process (signed: may go negative after the last step) */
+    f32x4_t vecSrc;                                        /* current group of 4 magnitudes */
+    f32x4_t curExtremValVec = vdupq_n_f32(F32_ABSMIN);     /* per-lane running maximum; F32_ABSMIN is presumably the smallest magnitude - TODO confirm */
+    float32_t maxValue = F32_ABSMIN;                       /* overall maximum, filled in after the loop */
+    uint32_t idx = blockSize;                              /* starts at blockSize, lowered by the final min-reduction */
+    uint32x4_t indexVec;                                   /* positions of the samples held in vecSrc */
+    uint32x4_t curExtremIdxVec;                            /* per-lane position of the running maximum */
+    uint32_t curIdx = 0;                                   /* start index handed to vidupq_wb_u32, updated by write-back */
+    mve_pred16_t p0;
+
+
+    indexVec = vidupq_wb_u32(&curIdx, 1);
+    curExtremIdxVec = vdupq_n_u32(0);
+
+    do {
+        mve_pred16_t p = vctp32q(blkSize);                 /* tail predicate for the samples that remain */
+
+        vecSrc = vldrwq_z_f32((float32_t const *) pSrc, p);
+        vecSrc = vabsq_m(vuninitializedq_f32(), vecSrc, p);
+        /*
+         * Per lane, under predicate p: where |sample| >= running maximum,
+         * take the new value and remember its position
+         */
+        p0 = vcmpgeq_m(vecSrc, curExtremValVec, p);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+        /* NOTE(review): the original author questioned whether tail-predication detection still works with this index update - unverified */
+        indexVec = vidupq_wb_u32(&curIdx, 1);
+
+        blkSize -= 4;
+        pSrc += 4;
+    }
+    while (blkSize > 0);
+
+    /*
+     * Reduce the per-lane maxima to a single maximum
+     */
+    maxValue = vmaxnmvq(maxValue, curExtremValVec);
+    /*
+     * Lanes that do not hold the maximum get blockSize as their index
+     */
+    p0 = vcmpgeq(curExtremValVec, maxValue);
+    indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
+    /*
+     * The smallest remaining index belongs to a lane holding the maximum
+     */
+    idx = vminvq(idx, indexVec);
+    /*
+     * Write the position and the maximum through the output pointers
+     */
+    *pIndex = idx;
+    *pResult = maxValue;
+}
+
+
+#else
+#if defined(ARM_MATH_LOOPUNROLL)
+/*
+ * Loop-unrolled scalar variant: finds the largest absolute value in pSrc and
+ * the index at which it occurs.
+ * The first element seeds the running maximum, so blockSize must be >= 1.
+ * A later element replaces the maximum only when it is strictly greater, so
+ * the lowest index wins on ties.
+ */
+void arm_absmax_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex)
+{
+        float32_t cur_absmax, out;                     /* Candidate magnitude and running maximum */
+        uint32_t blkCnt, outIndex;                     /* Loop counter and index of the running maximum */
+        uint32_t index;                                /* Index of the last element consumed */
+
+  /* The first sample is the reference value and sits at index 0. */
+  outIndex = 0U;
+  /* fabsf() clears the sign bit, so a zero input yields +0.0f, never -0.0f. */
+  out = fabsf(*pSrc++);
+  index = 0U;
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = (blockSize - 1U) >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Take the magnitude of the next sample */
+    cur_absmax = fabsf(*pSrc++);
+    /* Compare for the extrema value */
+    if (cur_absmax > out)
+    {
+      /* Update the extrema value and its index */
+      out = cur_absmax;
+      outIndex = index + 1U;
+    }
+
+    cur_absmax = fabsf(*pSrc++);
+    if (cur_absmax > out)
+    {
+      out = cur_absmax;
+      outIndex = index + 2U;
+    }
+
+    cur_absmax = fabsf(*pSrc++);
+    if (cur_absmax > out)
+    {
+      out = cur_absmax;
+      outIndex = index + 3U;
+    }
+
+    cur_absmax = fabsf(*pSrc++);
+    if (cur_absmax > out)
+    {
+      out = cur_absmax;
+      outIndex = index + 4U;
+    }
+
+    index += 4U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = (blockSize - 1U) % 4U;
+
+  while (blkCnt > 0U)
+  {
+    cur_absmax = fabsf(*pSrc++);
+    if (cur_absmax > out)
+    {
+      out = cur_absmax;
+      /* blkCnt counts down, so this is the position of the current sample */
+      outIndex = blockSize - blkCnt;
+    }
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store the extrema value and its index into destination pointers */
+  *pResult = out;
+  *pIndex = outIndex;
+}
+#else
+/*
+ * Plain scalar variant: walks pSrc once and reports the largest absolute
+ * value together with the index at which it was first seen.
+ * The first element is read unconditionally, so blockSize must be >= 1.
+ * Magnitudes are taken with fabsf().
+ */
+void arm_absmax_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex)
+{
+  const float32_t *pCur = pSrc;   /* Read cursor over the input */
+  float32_t best;                 /* Largest magnitude found so far */
+  float32_t candidate;            /* Magnitude of the sample under test */
+  uint32_t bestIdx = 0U;          /* Position of best; element 0 seeds it */
+  uint32_t remaining;             /* Samples still to be examined */
+
+  /* Seed the search with the magnitude of the first sample */
+  best = fabsf(*pCur);
+  pCur++;
+
+  /* Every other sample is compared against the running maximum */
+  for (remaining = blockSize - 1U; remaining > 0U; remaining--)
+  {
+    candidate = fabsf(*pCur);
+    pCur++;
+
+    /* Strict comparison: an equal magnitude does not displace the earlier one */
+    if (candidate > best)
+    {
+      best = candidate;
+
+      /* remaining counts down from blockSize - 1, giving the sample position */
+      bestIdx = blockSize - remaining;
+    }
+  }
+
+  /* Hand the maximum magnitude and its position back to the caller */
+  *pResult = best;
+  *pIndex = bestIdx;
+}
+#endif /* defined(ARM_MATH_LOOPUNROLL) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of AbsMax group
+ */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q15.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q15.c
new file mode 100644
index 0000000000000000000000000000000000000000..a898d1f3fa20a0fbf65846c51651465dbd13b2b8
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q15.c
@@ -0,0 +1,236 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_absmax_q15.c
+ * Description:  Maximum value of absolute values of a Q15 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+  @ingroup groupStats
+ */
+
+/**
+  @addtogroup AbsMax
+  @{
+ */
+
+/**
+  @brief         Maximum value of absolute values of a Q15 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    maximum absolute value returned here
+  @param[out]    pIndex     index of the maximum absolute value returned here
+  @return        none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/* Helium (MVE) variant: each lane tracks its own maximum of |x| and the index
+ * it came from; the lanes are reduced after the loop. Indices are held in
+ * 16-bit lanes, so positions beyond 65535 cannot be represented. */
+void arm_absmax_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex)
+{
+    int32_t         blkCnt;     /* loop counters */
+    q15x8_t         extremValVec = vdupq_n_s16(Q15_ABSMIN);
+    q15_t           maxValue = Q15_ABSMIN;
+    uint16x8_t      indexVec;
+    uint16x8_t      extremIdxVec;
+    mve_pred16_t    p0;
+    uint16_t        extremIdxArr[8];
+
+    indexVec = vidupq_u16(0U, 1);
+
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t    p = vctp16q(blkCnt);
+        q15x8_t         extremIdxVal = vld1q_z_s16(pSrc, p);
+
+        /* Saturating abs: -32768 becomes 32767, as in the scalar variants */
+        extremIdxVal = vqabsq(extremIdxVal);
+        /* Per lane, flag samples that are >= the current lane maximum */
+        p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
+
+        extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+        /* store per-lane extrema indexes */
+        vst1q_p_u16(extremIdxArr, indexVec, p0);
+
+        indexVec += 8;
+        pSrc += 8;
+        blkCnt -= 8;
+    }
+    while (blkCnt > 0);
+
+    /* Get max value across the vector   */
+    maxValue = vmaxvq(maxValue, extremValVec);
+
+    /* set index for lower values to max possible index   */
+    p0 = vcmpgeq(extremValVec, maxValue);
+    extremIdxVec = vld1q_u16(extremIdxArr);
+
+    indexVec = vpselq(extremIdxVec, vdupq_n_u16(blockSize - 1), p0);
+    *pIndex = vminvq(blockSize - 1, indexVec);
+    *pResult = maxValue;
+}
+
+#else
+#if defined(ARM_MATH_DSP)
+/*
+ * DSP-extension variant: four samples are examined per pass of the main loop
+ * and any leftover samples are handled one at a time afterwards.
+ * Magnitudes of non-positive samples are formed with __QSUB16(0, x); a sample
+ * replaces the running maximum only when it is strictly larger.
+ * The first sample is read unconditionally, so blockSize must be >= 1.
+ */
+void arm_absmax_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex)
+{
+  q15_t sample;                   /* Magnitude of the sample under test */
+  q15_t best;                     /* Largest magnitude found so far */
+  uint32_t bestIdx;               /* Position of best */
+  uint32_t base;                  /* Position of the last sample consumed */
+  uint32_t count;                 /* Loop counter */
+
+  /* Sample 0 is the initial maximum */
+  bestIdx = 0U;
+  base = 0U;
+  best = *pSrc++;
+  if (best <= 0) { best = (q15_t)__QSUB16(0, best); }
+
+  /* Main loop: one pass per group of four samples */
+  for (count = (blockSize - 1U) >> 2U; count > 0U; count--)
+  {
+    /* 1st sample of the group */
+    sample = *pSrc++;
+    if (sample <= 0) { sample = (q15_t)__QSUB16(0, sample); }
+    /* Strictly larger only, so the earliest position is kept on ties */
+    if (sample > best)
+    {
+      best = sample;
+      bestIdx = base + 1U;
+    }
+
+    /* 2nd sample of the group */
+    sample = *pSrc++;
+    if (sample <= 0) { sample = (q15_t)__QSUB16(0, sample); }
+    if (sample > best)
+    {
+      best = sample;
+      bestIdx = base + 2U;
+    }
+
+    /* 3rd sample of the group */
+    sample = *pSrc++;
+    if (sample <= 0) { sample = (q15_t)__QSUB16(0, sample); }
+    if (sample > best)
+    {
+      best = sample;
+      bestIdx = base + 3U;
+    }
+
+    /* 4th sample of the group */
+    sample = *pSrc++;
+    if (sample <= 0) { sample = (q15_t)__QSUB16(0, sample); }
+    if (sample > best)
+    {
+      best = sample;
+      bestIdx = base + 4U;
+    }
+
+    /* Four more samples consumed */
+    base += 4U;
+  }
+
+  /* Tail loop: the samples left over after the groups of four */
+  for (count = (blockSize - 1U) % 4U; count > 0U; count--)
+  {
+    sample = *pSrc++;
+    if (sample <= 0) { sample = (q15_t)__QSUB16(0, sample); }
+    if (sample > best)
+    {
+      best = sample;
+      /* count runs down to 1, so this is the position of the current sample */
+      bestIdx = blockSize - count;
+    }
+  }
+
+  /* Hand the maximum magnitude and its position back to the caller */
+  *pResult = best;
+  *pIndex = bestIdx;
+}
+#else
+/*
+ * Portable variant: single pass over pSrc, one sample at a time.
+ * The first sample is read unconditionally, so blockSize must be >= 1.
+ */
+void arm_absmax_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex)
+{
+  q15_t raw;                      /* Sample as read from the input */
+  q15_t mag;                      /* Magnitude of raw */
+  q15_t best;                     /* Largest magnitude found so far */
+  uint32_t bestIdx = 0U;          /* Position of best; sample 0 seeds it */
+  uint32_t left;                  /* Samples still to be examined */
+
+  /* (q15_t) 0x8000 is mapped to 0x7fff instead of being negated */
+  raw = *pSrc++;
+  if (raw > 0)                    { best = raw; }
+  else if (raw == (q15_t) 0x8000) { best = 0x7fff; }
+  else                            { best = (q15_t) -raw; }
+
+  for (left = blockSize - 1U; left > 0U; left--)
+  {
+    raw = *pSrc++;
+    if (raw > 0)                    { mag = raw; }
+    else if (raw == (q15_t) 0x8000) { mag = 0x7fff; }
+    else                            { mag = (q15_t) -raw; }
+    /* Only a strictly larger magnitude displaces the current maximum */
+    if (mag > best)
+    {
+      best = mag;
+      bestIdx = blockSize - left;
+    }
+  }
+
+  /* Hand the maximum magnitude and its position back to the caller */
+  *pResult = best;
+  *pIndex = bestIdx;
+}
+#endif /* defined(ARM_MATH_DSP) */
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of AbsMax group
+ */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q31.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q31.c
new file mode 100644
index 0000000000000000000000000000000000000000..a4e1e83ddcc344de34157c08d838f2cd2d63aaa7
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q31.c
@@ -0,0 +1,236 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_absmax_q31.c
+ * Description:  Maximum value of absolute values of a Q31 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+  @ingroup groupStats
+ */
+
+/**
+  @addtogroup AbsMax
+  @{
+ */
+
+/**
+  @brief         Maximum value of absolute values of a Q31 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    maximum absolute value returned here
+  @param[out]    pIndex     index of the maximum absolute value returned here
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+/* Helium (MVE) variant: each lane tracks its own maximum of |x| and the index
+ * it came from; the lanes are reduced to one value and one index after the
+ * loop. */
+void arm_absmax_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex)
+{
+    int32_t         blkCnt;     /* loop counters */
+    q31x4_t         extremValVec = vdupq_n_s32(Q31_ABSMIN);
+    q31_t           maxValue = Q31_ABSMIN;
+    uint32x4_t      indexVec;
+    uint32x4_t      extremIdxVec;
+    mve_pred16_t    p0;
+    uint32_t        extremIdxArr[4];
+
+    indexVec = vidupq_u32(0U, 1);
+
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t    p = vctp32q(blkCnt);
+        q31x4_t         extremIdxVal = vld1q_z_s32(pSrc, p);
+
+        /* Saturating abs, so the most negative input is not left negative */
+        extremIdxVal = vqabsq(extremIdxVal);
+        /* Per lane, flag samples that are >= the current lane maximum */
+        p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
+
+        extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+        /* store per-lane extrema indexes */
+        vst1q_p_u32(extremIdxArr, indexVec, p0);
+
+        indexVec += 4;
+        pSrc += 4;
+        blkCnt -= 4;
+    }
+    while (blkCnt > 0);
+
+    /* Get max value across the vector   */
+    maxValue = vmaxvq(maxValue, extremValVec);
+
+    /* set index for lower values to max possible index   */
+    p0 = vcmpgeq(extremValVec, maxValue);
+    extremIdxVec = vld1q_u32(extremIdxArr);
+
+    indexVec = vpselq(extremIdxVec, vdupq_n_u32(blockSize - 1), p0);
+    *pIndex = vminvq(blockSize - 1, indexVec);
+    *pResult = maxValue;
+}
+#else
+#if defined(ARM_MATH_DSP)
+void arm_absmax_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex)
+{
+        q31_t cur_absmax, out;                     /* Temporary variables to store the output value. */\
+        uint32_t blkCnt, outIndex;                     /* Loop counter */                                   \
+        uint32_t index;                                /* index of maximum value */                         \
+                                                                                                            \
+  /* Initialize index value to zero. */                                                                     \
+  outIndex = 0U;                                                                                            \
+  /* Load first input value that act as reference value for comparision */                                  \
+  out = *pSrc++;                                                                                            \
+  out = (out > 0) ? out : (q31_t)__QSUB(0, out);                                                                           \
+  /* Initialize index of extrema value. */                                                                  \
+  index = 0U;                                                                                               \
+                                                                                                            \
+  /* Loop unrolling: Compute 4 outputs at a time */                                                         \
+  blkCnt = (blockSize - 1U) >> 2U;                                                                          \
+                                                                                                            \
+  while (blkCnt > 0U)                                                                                       \
+  {                                                                                                         \
+    /* Initialize cur_absmax to next consecutive values one by one */                                         \
+    cur_absmax = *pSrc++;                                                                                     \
+    cur_absmax = (cur_absmax > 0) ? cur_absmax : (q31_t)__QSUB(0, cur_absmax);                                                                \
+    /* compare for the extrema value */                                                                     \
+    if (cur_absmax > out)                                                                         \
+    {                                                                                                       \
+      /* Update the extrema value and it's index */                                                         \
+      out = cur_absmax;                                                                                       \
+      outIndex = index + 1U;                                                                                \
+    }                                                                                                       \
+                                                                                                            \
+    cur_absmax = *pSrc++;                                                                                     \
+    cur_absmax = (cur_absmax > 0) ? cur_absmax : (q31_t)__QSUB(0, cur_absmax);                                                                \
+    if (cur_absmax > out)                                                                         \
+    {                                                                                                       \
+      out = cur_absmax;                                                                                       \
+      outIndex = index + 2U;                                                                                \
+    }                                                                                                       \
+                                                                                                            \
+    cur_absmax = *pSrc++;                                                                                     \
+    cur_absmax = (cur_absmax > 0) ? cur_absmax : (q31_t)__QSUB(0, cur_absmax);                                                                \
+    if (cur_absmax > out)                                                                          \
+    {                                                                                                       \
+      out = cur_absmax;                                                                                       \
+      outIndex = index + 3U;                                                                                \
+    }                                                                                                       \
+                                                                                                            \
+    cur_absmax = *pSrc++;                                                                                     \
+    cur_absmax = (cur_absmax > 0) ? cur_absmax : (q31_t)__QSUB(0, cur_absmax);                                                                 \
+    if (cur_absmax > out)                                                                          \
+    {                                                                                                       \
+      out = cur_absmax;                                                                                       \
+      outIndex = index + 4U;                                                                                \
+    }                                                                                                       \
+                                                                                                            \
+    index += 4U;                                                                                            \
+                                                                                                            \
+    /* Decrement loop counter */                                                                            \
+    blkCnt--;                                                                                               \
+  }                                                                                                         \
+                                                                                                            \
+  /* Loop unrolling: Compute remaining outputs */                                                           \
+  blkCnt = (blockSize - 1U) % 4U;                                                                           \
+                                                                                                            \
+                                                                                                            \
+  while (blkCnt > 0U)                                                                                       \
+  {                                                                                                         \
+    cur_absmax = *pSrc++;                                                                                     \
+    cur_absmax = (cur_absmax > 0) ? cur_absmax : (q31_t)__QSUB(0, cur_absmax);                                                                 \
+    if (cur_absmax > out)                                                                         \
+    {                                                                                                       \
+      out = cur_absmax;                                                                                       \
+      outIndex = blockSize - blkCnt;                                                                        \
+    }                                                                                                       \
+                                                                                                            \
+    /* Decrement loop counter */                                                                            \
+    blkCnt--;                                                                                               \
+  }                                                                                                         \
+                                                                                                            \
+  /* Store the extrema value and it's index into destination pointers */                                    \
+  *pResult = out;                                                                                           \
+  *pIndex = outIndex;  
+}
+#else
+/* Portable C version: walks the vector one sample at a time. */
+void arm_absmax_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex)
+{
+        q31_t sample, mag;                             /* Raw input sample and its magnitude */
+        q31_t best;                                    /* Largest magnitude seen so far */
+        uint32_t remaining;                            /* Samples still to be compared */
+        uint32_t bestIdx = 0U;                         /* Index at which best was found */
+
+  /* The first sample seeds the search. INT32_MIN cannot be negated, so it maps to INT32_MAX. */
+  sample = *pSrc++;
+  best = (sample == INT32_MIN) ? INT32_MAX : ((sample > 0) ? sample : -sample);
+
+  /* Compare each remaining magnitude against the running maximum */
+  for (remaining = blockSize - 1U; remaining > 0U; remaining--)
+  {
+    sample = *pSrc++;
+    if (sample == INT32_MIN)
+    {
+      mag = INT32_MAX;
+    }
+    else
+    {
+      mag = (sample > 0) ? sample : -sample;
+    }
+
+    /* Strict comparison: when magnitudes tie, the earliest index is kept */
+    if (mag > best)
+    {
+      best = mag;
+      bestIdx = blockSize - remaining;
+    }
+  }
+
+  /* Return the maximum magnitude and the index where it was found */
+  *pResult = best;
+  *pIndex = bestIdx;
+}
+#endif /* defined(ARM_MATH_DSP) */
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of AbsMax group
+ */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q7.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q7.c
new file mode 100644
index 0000000000000000000000000000000000000000..f21b57a4353f360616c8a85cc4d552b6c921523c
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q7.c
@@ -0,0 +1,294 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_absmax_q7.c
+ * Description:  Maximum value of absolute values of a Q7 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+  @ingroup groupStats
+ */
+
+/**
+  @addtogroup AbsMax
+  @{
+ */
+
+/**
+  @brief         Maximum value of absolute values of a Q7 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    maximum value returned here
+  @param[out]    pIndex     index of maximum value returned here
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include <stdint.h>
+#include "arm_helium_utils.h"
+
+#define MAX_BLKSZ_S8  (UINT8_MAX+1)   /* largest block whose sample indexes still fit in a uint8_t */
+/* Absolute maximum (value and index) of a block of at most MAX_BLKSZ_S8 samples. */
+static void arm_small_blk_absmax_q7(
+    const q7_t * pSrc,
+    uint16_t blockSize,
+    q7_t * pResult,
+    uint32_t * pIndex)
+{
+    int32_t        blkCnt;     /* samples left; signed because the last step may take it below zero */
+    q7x16_t        extremValVec = vdupq_n_s8(Q7_ABSMIN);   /* running maximum of each lane */
+    q7_t           maxValue = Q7_ABSMIN;
+    uint8x16_t     indexVec;                               /* sample index currently held by each lane */
+    uint8x16_t     extremIdxVec;
+    mve_pred16_t   p0;
+    uint8_t        extremIdxArr[16];                       /* index at which each lane met its maximum */
+
+    indexVec = vidupq_u8(0U, 1);
+
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t    p = vctp8q(blkCnt);                 /* only lanes still inside the block */
+        q7x16_t         extremIdxVal = vld1q_z_s8(pSrc, p);
+
+        /* Saturating absolute value: -128 becomes 127, as in the scalar implementation */
+        extremIdxVal = vqabsq(extremIdxVal);
+        /*
+         * Get current max per lane and current index per lane
+         * when a max is selected
+         */
+        p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
+
+        extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+        /* store per-lane extrema indexes */
+        vst1q_p_u8(extremIdxArr, indexVec, p0);
+
+        indexVec += 16;
+        pSrc += 16;
+        blkCnt -= 16;
+    }
+    while (blkCnt > 0);
+
+    /* Get max value across the vector   */
+    maxValue = vmaxvq(maxValue, extremValVec);
+
+    /* Lanes that do not hold the maximum are given the largest possible index ... */
+    p0 = vcmpgeq(extremValVec, maxValue);
+    extremIdxVec = vld1q_u8(extremIdxArr);
+    indexVec = vpselq(extremIdxVec, vdupq_n_u8(blockSize - 1), p0);
+    /* ... so the minimum over all lanes is the smallest index recorded for the maximum */
+    *pIndex = vminvq_u8(blockSize - 1, indexVec);
+    *pResult = maxValue;
+}
+
+/* MVE version: inputs longer than MAX_BLKSZ_S8 are cut into chunks for the 8-bit-index helper. */
+void arm_absmax_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * pResult,
+        uint32_t * pIndex)
+{
+    int32_t   totalSize = blockSize;
+
+    if (totalSize <= MAX_BLKSZ_S8)
+    {
+        arm_small_blk_absmax_q7(pSrc, blockSize, pResult, pIndex);
+    }
+    else
+    {
+        uint32_t  curIdx = 0;               /* number of the chunk being processed */
+        q7_t      curBlkExtr = Q7_MIN;      /* largest magnitude found so far */
+        uint32_t  curBlkPos = 0;            /* position of that magnitude inside its chunk */
+        uint32_t  curBlkIdx = 0;            /* number of the chunk that holds it */
+        /* process blocks of MAX_BLKSZ_S8 elts */
+        while (totalSize >= MAX_BLKSZ_S8)
+        {
+            /* pResult / pIndex double as scratch storage for the per-chunk result */
+            arm_small_blk_absmax_q7(pSrc, MAX_BLKSZ_S8, pResult, pIndex);
+            if (*pResult > curBlkExtr)
+            {
+                /* strict comparison: on ties the earlier chunk is kept */
+                curBlkExtr = *pResult;
+                curBlkPos = *pIndex;
+                curBlkIdx = curIdx;
+            }
+            curIdx++;
+            pSrc += MAX_BLKSZ_S8;
+            totalSize -= MAX_BLKSZ_S8;
+        }
+        /*
+         * remainder: skipped when empty, since the helper always runs one vector iteration
+         */
+        if (totalSize > 0)
+        {
+            arm_small_blk_absmax_q7(pSrc, totalSize, pResult, pIndex);
+            if (*pResult > curBlkExtr)
+            {
+                curBlkExtr = *pResult;
+                curBlkPos = *pIndex;
+                curBlkIdx = curIdx;
+            }
+        }
+        /* global index = chunk number * chunk length + position inside the chunk */
+        *pIndex = curBlkIdx * MAX_BLKSZ_S8 + curBlkPos;
+        *pResult = curBlkExtr;
+    }
+}
+#else
+#if defined(ARM_MATH_DSP)
+/* ARM_MATH_DSP version: four samples per loop pass, magnitudes formed with __QSUB8. */
+void arm_absmax_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * pResult,
+        uint32_t * pIndex)
+{
+        q7_t mag;                                      /* Magnitude of the sample being examined */
+        q7_t best;                                     /* Largest magnitude seen so far */
+        uint32_t bestIdx = 0U;                         /* Index at which best was found */
+        uint32_t base = 0U;                            /* Index of the sample just before the current group */
+        uint32_t loops;                                /* Passes left in the loop being run */
+
+  /* The magnitude of the first sample is the reference for every later comparison */
+  best = *pSrc++;
+  best = (best > 0) ? best : (q7_t)__QSUB8(0, best);
+
+  /* Main loop: groups of four samples */
+  for (loops = (blockSize - 1U) >> 2U; loops > 0U; loops--)
+  {
+    mag = pSrc[0];
+    if (mag <= 0)
+    {
+      mag = (q7_t)__QSUB8(0, mag);
+    }
+    if (mag > best)
+    {
+      best = mag;
+      bestIdx = base + 1U;
+    }
+
+    mag = pSrc[1];
+    if (mag <= 0)
+    {
+      mag = (q7_t)__QSUB8(0, mag);
+    }
+    if (mag > best)
+    {
+      best = mag;
+      bestIdx = base + 2U;
+    }
+
+    mag = pSrc[2];
+    if (mag <= 0)
+    {
+      mag = (q7_t)__QSUB8(0, mag);
+    }
+    if (mag > best)
+    {
+      best = mag;
+      bestIdx = base + 3U;
+    }
+
+    mag = pSrc[3];
+    if (mag <= 0)
+    {
+      mag = (q7_t)__QSUB8(0, mag);
+    }
+    if (mag > best)
+    {
+      best = mag;
+      bestIdx = base + 4U;
+    }
+    pSrc += 4;
+    base += 4U;
+  }
+
+  /* Tail: the up to three samples that do not fill a group */
+  for (loops = (blockSize - 1U) % 4U; loops > 0U; loops--)
+  {
+    mag = *pSrc++;
+    if (mag <= 0)
+    {
+      mag = (q7_t)__QSUB8(0, mag);
+    }
+    if (mag > best)
+    {
+      best = mag;
+      bestIdx = blockSize - loops;
+    }
+  }
+
+  /* Store the extrema value and its index into the destination pointers */
+  *pResult = best;
+  *pIndex = bestIdx;
+}
+#else
+/* Portable C version: walks the vector one sample at a time. */
+void arm_absmax_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * pResult,
+        uint32_t * pIndex)
+{
+        q7_t sample, mag;                              /* Raw input sample and its magnitude */
+        q7_t best;                                     /* Largest magnitude seen so far */
+        uint32_t remaining;                            /* Samples still to be compared */
+        uint32_t bestIdx = 0U;                         /* Index at which best was found */
+
+  /* The first sample seeds the search. (q7_t) 0x80 is not negated; it maps to (q7_t) 0x7f. */
+  sample = *pSrc++;
+  best = (sample == (q7_t) 0x80) ? (q7_t) 0x7f : ((sample > 0) ? sample : -sample);
+
+  /* Compare each remaining magnitude against the running maximum */
+  for (remaining = blockSize - 1U; remaining > 0U; remaining--)
+  {
+    sample = *pSrc++;
+    if (sample == (q7_t) 0x80)
+    {
+      mag = (q7_t) 0x7f;
+    }
+    else
+    {
+      mag = (sample > 0) ? sample : -sample;
+    }
+
+    /* Strict comparison: when magnitudes tie, the earliest index is kept */
+    if (mag > best)
+    {
+      best = mag;
+      bestIdx = blockSize - remaining;
+    }
+  }
+
+  /* Return the maximum magnitude and the index where it was found */
+  *pResult = best;
+  *pIndex = bestIdx;
+}
+#endif /* defined(ARM_MATH_DSP) */
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of AbsMax group
+ */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_f16.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..45aec4900bef20827f90ff9fbe15ab5333e3f6d6
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_f16.c
@@ -0,0 +1,276 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_absmin_f16.c
+ * Description:  Minimum value of absolute values of a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+
+/**
+  @ingroup groupStats
+ */
+
+/**
+  @addtogroup AbsMin
+  @{
+ */
+
+/**
+  @brief         Minimum value of absolute values of a half-precision floating-point vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    minimum value returned here
+  @param[out]    pIndex     index of minimum value returned here
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+/* MVE version: eight half-precision lanes per iteration, lane indexes kept in 16-bit elements. */
+void arm_absmin_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult,
+        uint32_t * pIndex)
+{
+    uint16_t  blkCnt;           /* loop counters */
+    f16x8_t vecSrc;
+    float16_t const *pSrcVec;
+    f16x8_t curExtremValVec = vdupq_n_f16(F16_ABSMAX);
+    float16_t minValue = F16_ABSMAX;
+    uint16_t  idx = blockSize;
+    uint16x8_t indexVec;
+    uint16x8_t curExtremIdxVec;
+    mve_pred16_t p0;
+
+    indexVec = vidupq_u16((uint32_t)0, 1);
+    curExtremIdxVec = vdupq_n_u16(0);
+
+    pSrcVec = (float16_t const *) pSrc;
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        vecSrc = vldrhq_f16(pSrcVec);
+        pSrcVec += 8;
+        vecSrc = vabsq(vecSrc);
+        /*
+         * Get current min per lane and current index per lane
+         * when a min is selected
+         */
+        p0 = vcmpleq(vecSrc, curExtremValVec);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+        indexVec = indexVec +  8;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail
+     * (will be merged thru tail predication)
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        p0 = vctp16q(blkCnt);
+
+        /* Predicated load: lanes beyond the end of the input are zeroed instead of read */
+        vecSrc = vldrhq_z_f16(pSrcVec, p0);
+        vecSrc = vabsq(vecSrc);
+        /*
+         * Get current min per lane and current index per lane
+         * when a min is selected (active tail lanes only)
+         */
+        p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+    }
+    /*
+     * Get min value across the vector
+     */
+    minValue = vminnmvq(minValue, curExtremValVec);
+    /*
+     * set index for larger values to max possible index
+     */
+    p0 = vcmpleq(curExtremValVec, minValue);
+    indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
+    /*
+     * Get min index, which thus belongs to a lane holding the min value
+     */
+    idx = vminvq(idx, indexVec);
+    /*
+     * Save result
+     */
+    *pIndex = idx;
+    *pResult = minValue;
+}
+
+#else
+#if defined(ARM_MATH_LOOPUNROLL)
+/*
+ * ARM_MATH_LOOPUNROLL version: after the first sample, the input is consumed
+ * four samples per pass; the leftover samples are then handled one at a time.
+ */
+void arm_absmin_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult,
+        uint32_t * pIndex)
+{
+        float16_t mag;                                 /* Magnitude of the sample being examined */
+        float16_t best;                                /* Smallest magnitude seen so far */
+        uint32_t bestIdx = 0U;                         /* Index at which best was found */
+        uint32_t base = 0U;                            /* Index of the sample just before the current group */
+        uint32_t loops;                                /* Passes left in the loop being run */
+
+  /*
+   * The magnitude of the first sample is the reference for every later comparison
+   */
+  best = *pSrc++;
+  best = (best > 0.0f16) ? best : -best;
+
+  /*
+   * Main loop: groups of four samples
+   */
+  for (loops = (blockSize - 1U) >> 2U; loops > 0U; loops--)
+  {
+    /* Sample at index base + 1 */
+    mag = pSrc[0];
+    mag = (mag > 0.0f16) ? mag : -mag;
+    if (mag < best)
+    {
+      best = mag;
+      bestIdx = base + 1U;
+    }
+
+    /* Sample at index base + 2 */
+    mag = pSrc[1];
+    mag = (mag > 0.0f16) ? mag : -mag;
+    if (mag < best)
+    {
+      best = mag;
+      bestIdx = base + 2U;
+    }
+
+    /* Sample at index base + 3 */
+    mag = pSrc[2];
+    mag = (mag > 0.0f16) ? mag : -mag;
+    if (mag < best)
+    {
+      best = mag;
+      bestIdx = base + 3U;
+    }
+
+    /* Sample at index base + 4 */
+    mag = pSrc[3];
+    mag = (mag > 0.0f16) ? mag : -mag;
+    if (mag < best)
+    {
+      best = mag;
+      bestIdx = base + 4U;
+    }
+
+    /* Step past the four samples just handled */
+    pSrc += 4;
+    base += 4U;
+  }
+
+  /*
+   * Tail: the up to three samples that do not fill a group
+   */
+  for (loops = (blockSize - 1U) % 4U; loops > 0U; loops--)
+  {
+    mag = *pSrc++;
+    mag = (mag > 0.0f16) ? mag : -mag;
+    if (mag < best)
+    {
+      best = mag;
+      bestIdx = blockSize - loops;
+    }
+  }
+
+  /* Store the extrema value and its index into the destination pointers */
+  *pResult = best;
+  *pIndex = bestIdx;
+}
+#else
+/*
+ * Plain C version: walks the vector one sample at a time.
+ */
+void arm_absmin_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult,
+        uint32_t * pIndex)
+{
+        float16_t mag;                                 /* Magnitude of the sample being examined */
+        float16_t best;                                /* Smallest magnitude seen so far */
+        uint32_t remaining;                            /* Samples still to be compared */
+        uint32_t bestIdx = 0U;                         /* Index at which best was found */
+
+  /*
+   * The magnitude of the first sample is the reference for every later comparison.
+   * fabsf() works in single precision; its result is converted back on assignment.
+   */
+  best = fabsf(*pSrc);
+  pSrc++;
+
+  /* Compare each remaining magnitude against the running minimum */
+  for (remaining = blockSize - 1U; remaining > 0U; remaining--)
+  {
+    mag = fabsf(*pSrc);
+    pSrc++;
+
+    /* Strict comparison: when magnitudes tie, the earliest index is kept */
+    if (mag < best)
+    {
+      best = mag;
+      bestIdx = blockSize - remaining;
+    }
+  }
+
+  /* Return the minimum magnitude and the index where it was found */
+  *pResult = best;
+  *pIndex = bestIdx;
+}
+#endif /* defined(ARM_MATH_LOOPUNROLL) */
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of AbsMin group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_f32.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..3a963227f3a672b24caa97b620fe9aec451797c4
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_f32.c
@@ -0,0 +1,279 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_absmin_f32.c
+ * Description:  Minimum value of absolute values of a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+
+/**
+  @ingroup groupStats
+ */
+
+/**
+  @defgroup AbsMin Absolute Minimum
+
+  Computes the minimum value of absolute values of an array of data.
+  The function returns both the minimum value and its position within the array.
+  There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+  @addtogroup AbsMin
+  @{
+ */
+
+/**
+  @brief         Minimum value of absolute values of a floating-point vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    minimum value returned here
+  @param[out]    pIndex     index of minimum value returned here
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+void arm_absmin_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex)
+{
+    int32_t  blkCnt;           /* loop counters */
+    f32x4_t vecSrc;
+    float32_t const *pSrcVec;
+    f32x4_t curExtremValVec = vdupq_n_f32(F32_ABSMAX);
+    float32_t minValue = F32_ABSMAX;
+    uint32_t  idx = blockSize;
+    uint32x4_t indexVec;
+    uint32x4_t curExtremIdxVec;
+    mve_pred16_t p0;
+    /* MVE variant: four magnitudes are compared per iteration, one per lane. */
+    /* Lane i starts at index i; every lane's best index starts at 0. */
+    indexVec = vidupq_u32((uint32_t)0, 1);
+    curExtremIdxVec = vdupq_n_u32(0);
+
+    pSrcVec = (float32_t const *) pSrc;
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0)
+    {
+        vecSrc = vldrwq_f32(pSrcVec);
+        pSrcVec += 4;
+        vecSrc = vabsq(vecSrc);
+        /*
+         * Per lane, keep the smaller magnitude and the index it came from.
+         * Strict '<': on ties the earlier index survives, as in the scalar variants.
+         */
+        p0 = vcmpltq(vecSrc, curExtremValVec);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+        indexVec = indexVec +  4;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail: 1 to 3 leftover samples, loaded and compared under a lane
+     * predicate so that nothing past pSrc[blockSize - 1] is read
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0)
+    {
+        p0 = vctp32q(blkCnt);
+
+        vecSrc = vldrwq_z_f32(pSrcVec, p0);
+        pSrcVec += 4;
+        vecSrc = vabsq(vecSrc);
+        /*
+         * Same selection as in the main loop, restricted to the active
+         * lanes (inactive lanes were loaded as zero and must not compete)
+         */
+        p0 = vcmpltq_m(vecSrc, curExtremValVec, p0);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+    }
+    /*
+     * Get min value across the vector
+     */
+    minValue = vminnmvq(minValue, curExtremValVec);
+    /*
+     * Lanes that do not hold the minimum get the out-of-range index blockSize
+     */
+    p0 = vcmpleq(curExtremValVec, minValue);
+    indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
+    /*
+     * The smallest remaining index is the first position of the minimum
+     */
+    idx = vminvq(idx, indexVec);
+    /*
+     * Save result
+     */
+    *pIndex = idx;
+    *pResult = minValue;
+}
+
+#else
+#if defined(ARM_MATH_LOOPUNROLL)
+void arm_absmin_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex)
+{
+        float32_t cur_absmin, out;                     /* Magnitude under test / smallest magnitude so far */
+        uint32_t blkCnt, outIndex;                     /* Loop counter / index of the smallest magnitude */
+        uint32_t index;                                /* Index of the last sample examined before the current group of four */
+
+  /*
+   * Sample 0 seeds the search.  Magnitudes are taken with fabsf(), like in
+   * the non-unrolled variant, so that the magnitude of +0.0f is +0.0f.
+   */
+  out = fabsf(*pSrc++);
+  outIndex = 0U;
+  index = 0U;
+
+  /* Loop unrolling: the samples after the seed are examined 4 at a time */
+  blkCnt = (blockSize - 1U) >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* First sample of the group */
+    cur_absmin = *pSrc++;
+    cur_absmin = fabsf(cur_absmin);
+    /* Strict comparison: on equal magnitudes the earlier index is kept */
+    if (cur_absmin < out)
+    {
+      /* Update the extrema value and its index */
+      out = cur_absmin;
+      outIndex = index + 1U;
+    }
+
+    cur_absmin = *pSrc++;
+    cur_absmin = fabsf(cur_absmin);
+    if (cur_absmin < out)
+    {
+      out = cur_absmin;
+      outIndex = index + 2U;
+    }
+
+    cur_absmin = *pSrc++;
+    cur_absmin = fabsf(cur_absmin);
+    if (cur_absmin < out)
+    {
+      out = cur_absmin;
+      outIndex = index + 3U;
+    }
+
+    cur_absmin = *pSrc++;
+    cur_absmin = fabsf(cur_absmin);
+    if (cur_absmin < out)
+    {
+      out = cur_absmin;
+      outIndex = index + 4U;
+    }
+
+    index += 4U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: up to three samples are left over, one per iteration */
+  blkCnt = (blockSize - 1U) % 4U;
+
+
+  while (blkCnt > 0U)
+  {
+    cur_absmin = *pSrc++;
+    cur_absmin = fabsf(cur_absmin);
+    if (cur_absmin < out)
+    {
+      out = cur_absmin;
+      outIndex = blockSize - blkCnt;
+    }
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store the extrema value and its index into destination pointers */
+  *pResult = out;
+  *pIndex = outIndex;
+}
+#else
+void arm_absmin_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex)
+{
+  /*
+   * Reference scalar variant: one sample per iteration, no unrolling.
+   */
+  const float32_t *pIn = pSrc;         /* read cursor over the input */
+  float32_t smallest;                  /* smallest |x| found so far */
+  float32_t magnitude;                 /* |x| of the sample under test */
+  uint32_t smallestAt = 0U;            /* position of 'smallest' in the input */
+  uint32_t left;                       /* samples not yet examined */
+
+  /*
+   * Sample 0 seeds the search, which leaves blockSize - 1 samples to scan.
+   */
+  smallest = fabsf(*pIn++);
+
+  for (left = blockSize - 1U; left > 0U; left--)
+  {
+    magnitude = fabsf(*pIn++);
+
+    /*
+     * Strict comparison: on equal magnitudes the earlier position is kept.
+     */
+    if (magnitude < smallest)
+    {
+      smallest = magnitude;
+      /* (blockSize - left) is the index of the sample just read. */
+      smallestAt = blockSize - left;
+    }
+  }
+
+  /* Hand both results back through the caller-supplied pointers. */
+  *pResult = smallest;
+  *pIndex = smallestAt;
+}
+
+#endif /* defined(ARM_MATH_LOOPUNROLL) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of AbsMin group
+ */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q15.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q15.c
new file mode 100644
index 0000000000000000000000000000000000000000..e4d0704a835c45f9d0fa0b62a60a03f0eebc179b
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q15.c
@@ -0,0 +1,269 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_absmin_q15.c
+ * Description:  Minimum value of absolute values of a Q15 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+  @ingroup groupStats
+ */
+
+
+/**
+  @addtogroup AbsMin
+  @{
+ */
+
+/**
+  @brief         Minimum value of absolute values of a Q15 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    minimum value returned here
+  @param[out]    pIndex     index of minimum value returned here
+  @return        none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_absmin_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex)
+{
+      uint16_t        blkCnt;           /* loop counters */
+    q15x8_t       vecSrc;
+    q15_t const   *pSrcVec;
+    q15x8_t       curExtremValVec = vdupq_n_s16(Q15_ABSMAX);
+    q15_t           minValue = Q15_ABSMAX;
+    uint16_t        idx = blockSize;
+    uint16x8_t    indexVec;
+    uint16x8_t    curExtremIdxVec;
+    uint32_t        startIdx = 0;
+    mve_pred16_t    p0;
+    /* MVE variant: eight magnitudes are compared per iteration, one per lane. */
+    /* Indices live in 16-bit lanes, so positions >= 65536 cannot be represented. */
+    indexVec = vidupq_wb_u16(&startIdx, 1);
+    curExtremIdxVec = vdupq_n_u16(0);
+
+    pSrcVec = (q15_t const *) pSrc;
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        vecSrc = vld1q(pSrcVec);
+        pSrcVec += 8;
+        vecSrc = vqabsq(vecSrc);
+        /*
+         * Saturating abs above: 0x8000 becomes 0x7fff instead of staying negative.
+         * Strict '<': on ties the earlier index survives, as in the scalar variants.
+         */
+        p0 = vcmpltq(vecSrc, curExtremValVec);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+        indexVec = vidupq_wb_u16(&startIdx, 1);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail: 1 to 7 leftover samples, loaded and compared under a lane
+     * predicate so that nothing past pSrc[blockSize - 1] is read
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        p0 = vctp16q(blkCnt);
+
+        vecSrc = vld1q_z(pSrcVec, p0);
+        pSrcVec += 8;
+        vecSrc = vqabsq(vecSrc);
+        /*
+         * Same selection as in the main loop, restricted to the active
+         * lanes (inactive lanes were loaded as zero and must not compete)
+         */
+        p0 = vcmpltq_m(vecSrc, curExtremValVec, p0);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+    }
+    /*
+     * Get min value across the vector
+     */
+    minValue = vminvq(minValue, curExtremValVec);
+    /*
+     * Lanes that do not hold the minimum get the out-of-range index blockSize
+     */
+    p0 = vcmpleq(curExtremValVec, minValue);
+    indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
+    /*
+     * The smallest remaining index is the first position of the minimum
+     */
+    idx = vminvq(idx, indexVec);
+    /*
+     * Save result
+     */
+    *pIndex = idx;
+    *pResult = minValue;
+}
+
+#else
+#if defined(ARM_MATH_DSP)
+void arm_absmin_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex)
+{
+  q15_t curAbs, bestAbs;               /* magnitude under test / smallest magnitude so far */
+  uint32_t loopCnt, bestPos;           /* iteration counter / position of bestAbs */
+  uint32_t base;                       /* position of the last sample examined before the current group */
+
+  /* Sample 0 seeds the search, so it is also the initial best position. */
+  bestPos = 0U;
+  /* Non-positive samples are replaced by __QSUB16(0, x) narrowed to q15_t. */
+  bestAbs = *pSrc++;
+  if (bestAbs <= 0) { bestAbs = (q15_t)__QSUB16(0, bestAbs); }
+  /* Nothing has been consumed by the unrolled loop yet. */
+  base = 0U;
+
+  /* The samples after the seed are handled four at a time. */
+  loopCnt = (blockSize - 1U) >> 2U;
+
+  while (loopCnt > 0U)
+  {
+    /* First sample of the group */
+    curAbs = *pSrc++;
+    if (curAbs <= 0) { curAbs = (q15_t)__QSUB16(0, curAbs); }
+    /* Strict comparison: on equal magnitudes the earlier position is kept. */
+    if (curAbs < bestAbs)
+    {
+      /* Remember the new minimum together with its position. */
+      bestAbs = curAbs;
+      bestPos = base + 1U;
+    }
+
+    curAbs = *pSrc++;
+    if (curAbs <= 0) { curAbs = (q15_t)__QSUB16(0, curAbs); }
+    if (curAbs < bestAbs)
+    {
+      bestAbs = curAbs;
+      bestPos = base + 2U;
+    }
+
+    curAbs = *pSrc++;
+    if (curAbs <= 0) { curAbs = (q15_t)__QSUB16(0, curAbs); }
+    if (curAbs < bestAbs)
+    {
+      bestAbs = curAbs;
+      bestPos = base + 3U;
+    }
+
+    curAbs = *pSrc++;
+    if (curAbs <= 0) { curAbs = (q15_t)__QSUB16(0, curAbs); }
+    if (curAbs < bestAbs)
+    {
+      bestAbs = curAbs;
+      bestPos = base + 4U;
+    }
+
+    base += 4U;
+
+    /* One group of four done. */
+    loopCnt--;
+  }
+
+  /* Up to three samples are left over; take them one at a time. */
+  loopCnt = (blockSize - 1U) % 4U;
+
+
+  while (loopCnt > 0U)
+  {
+    curAbs = *pSrc++;
+    if (curAbs <= 0) { curAbs = (q15_t)__QSUB16(0, curAbs); }
+    if (curAbs < bestAbs)
+    {
+      bestAbs = curAbs;
+      bestPos = blockSize - loopCnt;
+    }
+
+    /* One leftover sample done. */
+    loopCnt--;
+  }
+
+  /* Hand both results back through the caller-supplied pointers. */
+  *pResult = bestAbs;
+  *pIndex = bestPos;
+}
+#else
+void arm_absmin_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex)
+{
+  /*
+   * Plain scalar variant: a single pass that remembers the smallest
+   * magnitude seen so far and the position where it first occurred.
+   */
+  q15_t sample;                        /* raw input sample */
+  q15_t curAbs;                        /* magnitude of the sample under test */
+  q15_t bestAbs;                       /* smallest magnitude found so far */
+  uint32_t bestPos = 0U;               /* position of bestAbs in the input */
+  uint32_t remaining;                  /* samples not yet examined */
+
+  /*
+   * Sample 0 seeds the search.  In every magnitude computation 0x8000 is
+   * special-cased to 0x7fff instead of being negated.
+   */
+  sample = *pSrc++;
+  bestAbs = (sample > 0) ? sample : ((sample == (q15_t) 0x8000) ? 0x7fff : -sample);
+
+  for (remaining = blockSize - 1U; remaining > 0U; remaining--)
+  {
+    sample = *pSrc++;
+    curAbs = (sample > 0) ? sample : ((sample == (q15_t) 0x8000) ? 0x7fff : -sample);
+
+    /* Strict comparison: on equal magnitudes the earlier position is kept. */
+    if (curAbs < bestAbs)
+    {
+      bestAbs = curAbs;
+      /* (blockSize - remaining) is the index of the sample just read. */
+      bestPos = blockSize - remaining;
+    }
+  }
+
+  /* Hand both results back through the caller-supplied pointers. */
+  *pResult = bestAbs;
+  *pIndex = bestPos;
+}
+#endif /* defined(ARM_MATH_DSP) */
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of AbsMin group
+ */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q31.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q31.c
new file mode 100644
index 0000000000000000000000000000000000000000..21c769b86ec432088d40ca6acc55b4b9b91d773e
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q31.c
@@ -0,0 +1,269 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_absmin_q31.c
+ * Description:  Minimum value of absolute values of a Q31 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+  @ingroup groupStats
+ */
+
+
+/**
+  @addtogroup AbsMin
+  @{
+ */
+
+/**
+  @brief         Minimum value of absolute values of a Q31 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    minimum value returned here
+  @param[out]    pIndex     index of minimum value returned here
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_absmin_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex)
+{
+    uint32_t        blkCnt;           /* loop counter; 32-bit so that blockSize >> 2 always fits */
+    q31x4_t       vecSrc;
+    q31_t const   *pSrcVec;
+    q31x4_t       curExtremValVec = vdupq_n_s32(Q31_ABSMAX);
+    q31_t           minValue = Q31_ABSMAX;
+    uint32_t        idx = blockSize;  /* 32-bit so that indexes above 65535 are not truncated */
+    uint32x4_t    indexVec;
+    uint32x4_t    curExtremIdxVec;
+    uint32_t        startIdx = 0;
+    mve_pred16_t    p0;
+
+    /*
+     * Each of the 4 lanes tracks its own minimum magnitude and the index
+     * where it was found; the lanes are reduced once all samples are seen
+     */
+    indexVec = vidupq_wb_u32(&startIdx, 1);
+    curExtremIdxVec = vdupq_n_u32(0);
+
+    pSrcVec = (q31_t const *) pSrc;
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        vecSrc = vldrwq_s32(pSrcVec);
+        pSrcVec += 4;
+        /* Saturating abs: the non-saturating vabsq leaves INT32_MIN negative */
+        vecSrc = vqabsq(vecSrc);
+        /*
+         * Get current min per lane and current index per lane
+         * when a min is selected
+         */
+        p0 = vcmpleq(vecSrc, curExtremValVec);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+        indexVec = vidupq_wb_u32(&startIdx, 1);
+        blkCnt--;
+    }
+    /*
+     * tail: the last 1 to 3 samples, handled through tail predication
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        p0 = vctp32q(blkCnt);
+        /* Zeroing predicated load: lanes beyond the end of pSrc are not read */
+        vecSrc = vldrwq_z_s32(pSrcVec, p0);
+        vecSrc = vqabsq(vecSrc);
+        /*
+         * Get current min per lane and current index per lane
+         * when a min is selected (active lanes only)
+         */
+        p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+    }
+    /*
+     * Get min value across the vector
+     */
+    minValue = vminvq(minValue, curExtremValVec);
+    /*
+     * Lanes holding the min keep their index; the other lanes get
+     * blockSize so that they cannot win the index reduction
+     */
+    p0 = vcmpleq(curExtremValVec, minValue);
+    indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
+    /*
+     * Get min index which is thus for a min value
+     */
+    idx = vminvq(idx, indexVec);
+    /*
+     * Save result
+     */
+    *pIndex = idx;
+    *pResult = minValue;
+}
+
+#else
+#if defined(ARM_MATH_DSP)
+void arm_absmin_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex)
+{
+        q31_t mag, best;                 /* Magnitude of the current sample, smallest magnitude so far */
+        uint32_t loops, bestPos;         /* Loop counter, index of the smallest magnitude */
+        uint32_t base;                   /* Index of the last sample consumed by the unrolled loop */
+  /* The first sample seeds the search; non-positive values are negated with __QSUB (CMSIS saturating subtract) */
+  bestPos = 0U;
+  best = *pSrc++;
+  if (best <= 0)
+  {
+    best = (q31_t)__QSUB(0, best);
+  }
+  base = 0U;
+
+  /* Unrolled part: four samples per iteration */
+  for (loops = (blockSize - 1U) >> 2U; loops > 0U; loops--)
+  {
+    mag = *pSrc++;
+    if (mag <= 0)
+    {
+      mag = (q31_t)__QSUB(0, mag);
+    }
+    if (mag < best)
+    {
+      best = mag;
+      bestPos = base + 1U;
+    }
+
+    mag = *pSrc++;
+    if (mag <= 0)
+    {
+      mag = (q31_t)__QSUB(0, mag);
+    }
+    if (mag < best)
+    {
+      best = mag;
+      bestPos = base + 2U;
+    }
+
+    mag = *pSrc++;
+    if (mag <= 0)
+    {
+      mag = (q31_t)__QSUB(0, mag);
+    }
+    if (mag < best)
+    {
+      best = mag;
+      bestPos = base + 3U;
+    }
+
+    mag = *pSrc++;
+    if (mag <= 0)
+    {
+      mag = (q31_t)__QSUB(0, mag);
+    }
+    if (mag < best)
+    {
+      best = mag;
+      bestPos = base + 4U;
+    }
+    base += 4U;
+  }
+
+  /* Leftover: the up to three samples that did not fill a group of four */
+  for (loops = (blockSize - 1U) % 4U; loops > 0U; loops--)
+  {
+    mag = *pSrc++;
+    if (mag <= 0)
+    {
+      mag = (q31_t)__QSUB(0, mag);
+    }
+    if (mag < best)
+    {
+      best = mag;
+      bestPos = blockSize - loops;
+    }
+  }
+
+  /* Hand the smallest magnitude and its index back to the caller */
+  *pResult = best;
+  *pIndex = bestPos;
+}
+#else
+void arm_absmin_q31(
+  const q31_t * pSrc,
+        uint32_t blockSize,
+        q31_t * pResult,
+        uint32_t * pIndex)
+{
+        q31_t sample, absVal, best;                    /* Raw sample, its magnitude, smallest magnitude so far */
+        uint32_t remaining, pos, bestPos;              /* Samples left, current index, index of the minimum */
+
+  /* The first sample seeds the search; INT32_MIN has no positive counterpart and saturates to INT32_MAX */
+  sample = *pSrc++;
+  if (sample > 0)
+  {
+    best = sample;
+  }
+  else
+  {
+    best = (sample == INT32_MIN) ? INT32_MAX : -sample;
+  }
+  bestPos = 0U;
+  pos = 1U;
+
+  /* Visit the remaining blockSize - 1 samples in order */
+  for (remaining = blockSize - 1U; remaining > 0U; remaining--)
+  {
+    sample = *pSrc++;
+    absVal = (sample > 0) ? sample : ((sample == INT32_MIN) ? INT32_MAX : -sample);
+    /* Strict comparison: when magnitudes tie, the earliest index is kept */
+    if (absVal < best)
+    {
+      best = absVal;
+      bestPos = pos;
+    }
+    pos++;
+  }
+
+  /* Hand the smallest magnitude and its index back to the caller */
+  *pResult = best;
+  *pIndex = bestPos;
+}
+#endif /* defined(ARM_MATH_DSP) */
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of AbsMin group
+ */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q7.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q7.c
new file mode 100644
index 0000000000000000000000000000000000000000..2889891ee11836a1f49750594433b52a04581868
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q7.c
@@ -0,0 +1,322 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_absmin_q7.c
+ * Description:  Minimum value of absolute values of a Q7 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+  @ingroup groupStats
+ */
+
+
+/**
+  @addtogroup AbsMin
+  @{
+ */
+
+/**
+  @brief         Minimum value of absolute values of a Q7 vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    minimum value returned here
+  @param[out]    pIndex     index of minimum value returned here
+  @return        none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include <stdint.h>
+#include "arm_helium_utils.h"
+
+#define MAX_BLKSZ_S8  (UINT8_MAX+1)
+
+static void arm_small_blk_absmin_q7(
+    const q7_t        *pSrc,
+    uint32_t     blockSize,
+    q7_t        *pResult,
+    uint32_t    *pIndex)
+{
+    uint16_t        blkCnt;           /* loop counters */
+    q7x16_t       vecSrc;
+    q7_t const   *pSrcVec;
+    q7x16_t       curExtremValVec = vdupq_n_s8(Q7_ABSMAX);
+    q7_t           minValue = Q7_ABSMAX;
+    uint16_t       idx = blockSize - 1;
+    uint8x16_t    indexVec;
+    uint8x16_t    curExtremIdxVec;
+    uint32_t       startIdx = 0;
+    mve_pred16_t   p0;
+
+    /*
+     * Lane indexes are 8-bit, so one call can only label a limited number
+     * of samples; callers are expected to pass at most MAX_BLKSZ_S8 samples
+     */
+    indexVec = vidupq_wb_u8(&startIdx, 1);
+    curExtremIdxVec = vdupq_n_u8(0);
+
+    pSrcVec = (q7_t const *) pSrc;
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        vecSrc = vld1q(pSrcVec);
+        pSrcVec += 16;
+        /* Saturating abs: the non-saturating vabsq leaves -128 negative */
+        vecSrc = vqabsq(vecSrc);
+        /*
+         * Get current min per lane and current index per lane
+         * when a min is selected
+         */
+        p0 = vcmpleq(vecSrc, curExtremValVec);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+        indexVec = vidupq_wb_u8(&startIdx, 1);
+        blkCnt--;
+    }
+    /*
+     * tail: the last 1 to 15 samples, handled through tail predication
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        p0 = vctp8q(blkCnt);
+        /* Zeroing predicated load: lanes beyond the end of pSrc are not read */
+        vecSrc = vld1q_z_s8(pSrcVec, p0);
+        vecSrc = vqabsq(vecSrc);
+        /*
+         * Get current min per lane and current index per lane
+         * when a min is selected (active lanes only)
+         */
+        p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+    }
+    /*
+     * Get min value across the vector
+     */
+    minValue = vminvq(minValue, curExtremValVec);
+    /*
+     * Reduce the indexes of the lanes holding the min value only;
+     * idx starts at blockSize - 1, the largest index a sample can have
+     */
+    p0 = vcmpleq(curExtremValVec, minValue);
+    idx = vminvq_p_u8(idx, curExtremIdxVec, p0);
+    /*
+     * Save result
+     */
+    *pIndex = idx;
+    *pResult = minValue;
+}
+
+
+void arm_absmin_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * pResult,
+        uint32_t * pIndex)
+{
+    int32_t   remaining = blockSize;   /* samples not scanned yet */
+    uint32_t  chunk = 0;               /* number of the chunk being scanned */
+    uint32_t  bestChunk = 0;           /* chunk holding the smallest magnitude so far */
+    uint32_t  bestOffset = 0;          /* index of that magnitude inside its chunk */
+    q7_t      bestVal = Q7_MAX;        /* smallest magnitude so far */
+
+    /*
+     * Inputs of up to MAX_BLKSZ_S8 samples are handled by a single call
+     * to the small-block helper
+     */
+    if (remaining <= MAX_BLKSZ_S8)
+    {
+        arm_small_blk_absmin_q7(pSrc, blockSize, pResult, pIndex);
+        return;
+    }
+
+    /*
+     * Longer inputs: scan full chunks of MAX_BLKSZ_S8 samples, using
+     * pResult and pIndex as scratch storage for each chunk's result
+     */
+    for (; remaining >= MAX_BLKSZ_S8; remaining -= MAX_BLKSZ_S8)
+    {
+        arm_small_blk_absmin_q7(pSrc, MAX_BLKSZ_S8, pResult, pIndex);
+        if (*pResult < bestVal)
+        {
+            /* strictly smaller only, so on ties the earliest chunk is kept */
+            bestVal = *pResult;
+            bestOffset = *pIndex;
+            bestChunk = chunk;
+        }
+        chunk++;
+        pSrc += MAX_BLKSZ_S8;
+    }
+
+    /* Scan what is left (possibly zero samples) as a last, shorter chunk */
+    arm_small_blk_absmin_q7(pSrc, remaining, pResult, pIndex);
+    if (*pResult < bestVal)
+    {
+        bestVal = *pResult;
+        bestOffset = *pIndex;
+        bestChunk = chunk;
+    }
+
+    /* Turn (chunk, offset) back into an index into the caller's vector */
+    *pIndex = bestChunk * MAX_BLKSZ_S8 + bestOffset;
+    *pResult = bestVal;
+}
+
+#else
+#if defined(ARM_MATH_DSP)
+void arm_absmin_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * pResult,
+        uint32_t * pIndex)
+{
+        q7_t mag, best;                  /* Magnitude of the current sample, smallest magnitude so far */
+        uint32_t loops, bestPos;         /* Loop counter, index of the smallest magnitude */
+        uint32_t base;                   /* Index of the last sample consumed by the unrolled loop */
+  /* The first sample seeds the search; non-positive values are negated with __QSUB8, keeping the low byte */
+  bestPos = 0U;
+  best = *pSrc++;
+  if (best <= 0)
+  {
+    best = (q7_t)__QSUB8(0, best);
+  }
+  base = 0U;
+
+  /* Unrolled part: four samples per iteration */
+  for (loops = (blockSize - 1U) >> 2U; loops > 0U; loops--)
+  {
+    mag = *pSrc++;
+    if (mag <= 0)
+    {
+      mag = (q7_t)__QSUB8(0, mag);
+    }
+    if (mag < best)
+    {
+      best = mag;
+      bestPos = base + 1U;
+    }
+
+    mag = *pSrc++;
+    if (mag <= 0)
+    {
+      mag = (q7_t)__QSUB8(0, mag);
+    }
+    if (mag < best)
+    {
+      best = mag;
+      bestPos = base + 2U;
+    }
+
+    mag = *pSrc++;
+    if (mag <= 0)
+    {
+      mag = (q7_t)__QSUB8(0, mag);
+    }
+    if (mag < best)
+    {
+      best = mag;
+      bestPos = base + 3U;
+    }
+
+    mag = *pSrc++;
+    if (mag <= 0)
+    {
+      mag = (q7_t)__QSUB8(0, mag);
+    }
+    if (mag < best)
+    {
+      best = mag;
+      bestPos = base + 4U;
+    }
+    base += 4U;
+  }
+
+  /* Leftover: the up to three samples that did not fill a group of four */
+  for (loops = (blockSize - 1U) % 4U; loops > 0U; loops--)
+  {
+    mag = *pSrc++;
+    if (mag <= 0)
+    {
+      mag = (q7_t)__QSUB8(0, mag);
+    }
+    if (mag < best)
+    {
+      best = mag;
+      bestPos = blockSize - loops;
+    }
+  }
+
+  /* Hand the smallest magnitude and its index back to the caller */
+  *pResult = best;
+  *pIndex = bestPos;
+}
+#else
+void arm_absmin_q7(
+  const q7_t * pSrc,
+        uint32_t blockSize,
+        q7_t * pResult,
+        uint32_t * pIndex)
+{
+        q7_t sample, absVal, best;                     /* Raw sample, its magnitude, smallest magnitude so far */
+        uint32_t remaining, pos, bestPos;              /* Samples left, current index, index of the minimum */
+
+  /* The first sample seeds the search; -128 has no positive Q7 counterpart and saturates to 0x7f */
+  sample = *pSrc++;
+  if (sample > 0)
+  {
+    best = sample;
+  }
+  else
+  {
+    best = (sample == (q7_t) 0x80) ? (q7_t) 0x7f : -sample;
+  }
+  bestPos = 0U;
+  pos = 1U;
+
+  /* Visit the remaining blockSize - 1 samples in order */
+  for (remaining = blockSize - 1U; remaining > 0U; remaining--)
+  {
+    sample = *pSrc++;
+    absVal = (sample > 0) ? sample : ((sample == (q7_t) 0x80) ? (q7_t) 0x7f : -sample);
+    /* Strict comparison: when magnitudes tie, the earliest index is kept */
+    if (absVal < best)
+    {
+      best = absVal;
+      bestPos = pos;
+    }
+    pos++;
+  }
+
+  /* Hand the smallest magnitude and its index back to the caller */
+  *pResult = best;
+  *pIndex = bestPos;
+}
+#endif /* defined(ARM_MATH_DSP) */
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+  @} end of AbsMin group
+ */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f16.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..ffe08f42f8cd955a5b26afa34943dcf82d86f826
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f16.c
@@ -0,0 +1,140 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_entropy_f16.c
+ * Description:  Entropy
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup groupStats
+ */
+
+/**
+  @defgroup Entropy Entropy
+
+  Computes the entropy of a distribution
+
+ */
+
+/**
+ * @addtogroup Entropy
+ * @{
+ */
+
+
+/**
+ * @brief Entropy
+ *
+ * @param[in]  pSrcA        Array of input values.
+ * @param[in]  blockSize    Number of samples in the input array.
+ * @return     Entropy      -Sum(p ln p)
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize)
+{
+    uint32_t        blkCnt;
+    _Float16       accum,p;
+    f16x8_t         vSum = vdupq_n_f16(0.0f16);
+
+    /*
+     * Vector part: each iteration consumes 8 half-precision samples and adds
+     * vecIn * vlogq_f16(vecIn) per lane (vector form of p * logf(p) below).
+     */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        f16x8_t         vecIn = vld1q(pSrcA);
+
+        vSum = vaddq_f16(vSum, vmulq(vecIn, vlogq_f16(vecIn)));
+
+        /*
+         * Advance the source pointer past the 8 samples just consumed
+         * and decrement the vector loop counter
+         */
+        pSrcA += 8;
+        blkCnt --;
+    }
+
+    /* Fold the vector of partial sums into the scalar accumulator */
+    accum = vecAddAcrossF16Mve(vSum);
+
+    /* Tail: the remaining blockSize % 8 samples, one at a time */
+    blkCnt = blockSize & 0x7;
+    while(blkCnt > 0)
+    {
+       p = *pSrcA++;
+       accum += p * logf(p);
+       blkCnt--;
+    }
+
+    /* Entropy is the negated sum: -Sum(p * ln(p)) */
+    return (-accum);
+}
+
+#else
+
+float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize)
+{
+    /*
+     * Running value of Sum(p * ln(p)). Each update is stored back
+     * into a _Float16, exactly one store per input sample.
+     */
+    _Float16 sumPLogP = 0.0f;
+    _Float16 prob;
+    uint32_t i;
+
+    /* Visit every sample in order; no work is done when blockSize is 0. */
+    for (i = 0U; i < blockSize; i++)
+    {
+        prob = pSrcA[i];
+
+        /* logf() is the single-precision natural logarithm. */
+        sumPLogP += prob * logf(prob);
+    }
+
+    /* Entropy is the negated accumulation: -Sum(p * ln(p)). */
+    return(-sumPLogP);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of Entropy group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f32.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f32.c
index 01f7bc7883ce72ea275c66628a113d6cf3e56500..a2160c164ac59d9dd47c1a8fe9c0b670b57a80a6 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f32.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_logsumexp_f32.c
  * Description:  LogSumExp
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,13 +26,13 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 /**
- * @addtogroup groupStats
+ * @addtogroup Entropy
  * @{
  */
 
@@ -168,5 +170,5 @@ float32_t arm_entropy_f32(const float32_t * pSrcA,uint32_t blockSize)
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
- * @} end of groupStats group
+ * @} end of Entropy group
  */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f64.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f64.c
index 7638ba706e257faa075c884ecab153b0eac9380a..c208ff48e92d9669495df42c243f0a3aec64e6b9 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f64.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f64.c
@@ -3,11 +3,13 @@
  * Title:        arm_logsumexp_f64.c
  * Description:  LogSumExp
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,12 +26,12 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 #include <limits.h>
 #include <math.h>
 
 /**
- * @addtogroup groupStats
+ * @addtogroup Entropy
  * @{
  */
 
@@ -67,5 +69,5 @@ float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize)
 }
 
 /**
- * @} end of groupStats group
+ * @} end of Entropy group
  */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f16.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..a7da249447bddc635f5efa8b6d3851609caecbae
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f16.c
@@ -0,0 +1,152 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_kullback_leibler_f16.c
+ * Description:  Kullback-Leibler divergence
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup groupStats
+ */
+
+/**
+  @defgroup Kullback-Leibler Kullback-Leibler divergence
+
+  Computes the Kullback-Leibler divergence between two distributions
+
+ */
+
+
+/**
+ * @addtogroup Kullback-Leibler
+ * @{
+ */
+
+
+/**
+ * @brief Kullback-Leibler
+ *
+ * Distribution A may contain 0 with Neon version.
+ * Result will be right but some exception flags will be set.
+ *
+ * Distribution B must not contain 0 probability.
+ *
+ * @param[in]  *pSrcA         points to an array of input values for probability distribution A.
+ * @param[in]  *pSrcB         points to an array of input values for probability distribution B.
+ * @param[in]  blockSize      number of samples in the input array.
+ * @return Kullback-Leibler divergence D(A || B)
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+float16_t arm_kullback_leibler_f16(const float16_t * pSrcA,const float16_t * pSrcB,uint32_t blockSize)
+{
+    uint32_t blkCnt;
+    _Float16 accum, pA,pB;
+    f16x8_t         vSum = vdupq_n_f16(0.0f16);
+
+    /*
+     * Vector part: 8 half-precision samples of each distribution per
+     * iteration, mirroring pA * logf(pB / pA) of the tail loop per lane.
+     */
+    blkCnt = blockSize >> 3;
+    while(blkCnt > 0)
+    {
+        f16x8_t         vecA = vld1q(pSrcA);
+        f16x8_t         vecB = vld1q(pSrcB);
+        f16x8_t         vRatio;
+
+        vRatio = vdiv_f16(vecB, vecA);
+        vSum = vaddq_f16(vSum, vmulq(vecA, vlogq_f16(vRatio)));
+
+        /*
+         * Advance both source pointers past the 8 samples just consumed
+         * and decrement the vector loop counter
+         */
+        pSrcA += 8;
+        pSrcB += 8;
+        blkCnt --;
+    }
+
+    /* Fold the vector of partial sums into the scalar accumulator */
+    accum = vecAddAcrossF16Mve(vSum);
+
+    /* Tail: the remaining blockSize % 8 sample pairs, one at a time */
+    blkCnt = blockSize & 7;
+    while(blkCnt > 0)
+    {
+       pA = *pSrcA++;
+       pB = *pSrcB++;
+       accum += pA * logf(pB / pA);
+       blkCnt--;
+    }
+
+    /* D(A || B) = Sum(pA * ln(pA / pB)) = -Sum(pA * ln(pB / pA)) */
+    return(-accum);
+}
+
+#else
+float16_t arm_kullback_leibler_f16(const float16_t * pSrcA,const float16_t * pSrcB,uint32_t blockSize)
+{
+    /*
+     * Running value of Sum(pA * ln(pB / pA)). Each update is stored
+     * back into a _Float16, exactly one store per pair of samples.
+     */
+    _Float16 weightedLogRatio = 0.0f;
+    _Float16 probA;
+    _Float16 probB;
+    uint32_t i;
+
+    /* Visit the two distributions in lockstep; no work when blockSize is 0. */
+    for (i = 0U; i < blockSize; i++)
+    {
+        probA = pSrcA[i];
+        probB = pSrcB[i];
+
+        /* logf() is the single-precision natural logarithm. */
+        weightedLogRatio += probA * logf(probB / probA);
+    }
+
+    /* D(A || B) = Sum(pA * ln(pA / pB)) = -Sum(pA * ln(pB / pA)). */
+    return(-weightedLogRatio);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of Kullback-Leibler group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f32.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f32.c
index c406e84d25deee92027e003d16009e90aaf92575..45a9624c61870712c13d37be9b82bb8ee143afe3 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f32.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_logsumexp_f32.c
  * Description:  LogSumExp
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,13 +26,13 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 /**
- * @addtogroup groupStats
+ * @addtogroup Kullback-Leibler
  * @{
  */
 
@@ -187,5 +189,5 @@ float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSr
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
- * @} end of groupStats group
+ * @} end of Kullback-Leibler group
  */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f64.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
index 8f2f9667c4c878d81c868ebb93799ce8bbf53d47..b22d0473dab7d968e894837644247a32ce352f3e 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
@@ -3,11 +3,13 @@
  * Title:        arm_logsumexp_f64.c
  * Description:  LogSumExp
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,12 +26,12 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 #include <limits.h>
 #include <math.h>
 
 /**
- * @addtogroup groupStats
+ * @addtogroup Kullback-Leibler
  * @{
  */
 
@@ -69,5 +71,5 @@ float64_t arm_kullback_leibler_f64(const float64_t * pSrcA, const float64_t * pS
 }
 
 /**
- * @} end of groupStats group
+ * @} end of Kullback-Leibler group
  */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_dot_prod_f16.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_dot_prod_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..a35ac0e855cadab111eb208bde95b6bb4a47fec6
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_dot_prod_f16.c
@@ -0,0 +1,84 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_logsumexp_dot_prod_f16.c
+ * Description:  Dot product with log arithmetic (LogSumExp)
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup groupStats
+ */
+
+/**
+  @defgroup LogSumExp LogSumExp
+
+  LogSumExp optimizations to compute sum of probabilities with Gaussian distributions
+
+ */
+
+/**
+ * @addtogroup LogSumExp
+ * @{
+ */
+
+
+/**
+ * @brief Dot product with log arithmetic
+ *
+ * Vectors are containing the log of the samples
+ *
+ * @param[in]       *pSrcA points to the first input vector
+ * @param[in]       *pSrcB points to the second input vector
+ * @param[in]       blockSize number of samples in each vector
+ * @param[in]       *pTmpBuffer temporary buffer of length blockSize
+ * @return The log of the dot product.
+ *
+ */
+
+
+float16_t arm_logsumexp_dot_prod_f16(const float16_t * pSrcA,
+  const float16_t * pSrcB,
+  uint32_t blockSize,
+  float16_t *pTmpBuffer)
+{
+    /* The inputs hold logs, so adding them yields the logs of the products. */
+    arm_add_f16((float16_t*)pSrcA, (float16_t*)pSrcB, pTmpBuffer, blockSize);
+
+    /* The log of the sum of those products is a LogSumExp over the buffer. */
+    return(arm_logsumexp_f16(pTmpBuffer, blockSize));
+}
+
+/**
+ * @} end of LogSumExp group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_dot_prod_f32.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_dot_prod_f32.c
index 3f0c5548a05604bfc0be49f8b31e383d689873de..d2a94dacd17e808391a271664a119eaf9f99d09a 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_dot_prod_f32.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_dot_prod_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_logsumexp_f32.c
  * Description:  LogSumExp
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,13 +26,13 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 /**
- * @addtogroup groupStats
+ * @addtogroup LogSumExp
  * @{
  */
 
@@ -62,5 +64,5 @@ float32_t arm_logsumexp_dot_prod_f32(const float32_t * pSrcA,
 }
 
 /**
- * @} end of groupStats group
+ * @} end of LogSumExp group
  */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_f16.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..81272d56a11a101da3fa5a308781acbffe530bf3
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_f16.c
@@ -0,0 +1,172 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_logsumexp_f16.c
+ * Description:  LogSumExp
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup LogSumExp
+ * @{
+ */
+
+
+/**
+ * @brief Computation of the LogSumExp
+ *
+ * In probabilistic computations, the dynamic of the probability values can be very
+ * wide because they come from gaussian functions.
+ * To avoid underflow and overflow issues, the values are represented by their log.
+ * In this representation, multiplying the original exp values is easy : their logs are added.
+ * But adding the original exp values is requiring some special handling and it is the
+ * goal of the LogSumExp function.
+ *
+ * If the values are x1...xn, the function is computing:
+ *
+ * ln(exp(x1) + ... + exp(xn)) and the computation is done in such a way that
+ * rounding issues are minimised.
+ *
+ * The max xm of the values is extracted and the function is computing:
+ * xm + ln(exp(x1 - xm) + ... + exp(xn - xm))
+ *
+ * @param[in]  *in         Pointer to an array of input values.
+ * @param[in]  blockSize   Number of samples in the input array.
+ * @return LogSumExp
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
+{
+    float16_t       maxVal;
+    const float16_t *pIn = in;
+    uint32_t        blkCnt;
+    _Float16       accum;
+    _Float16       tmp;
+    f16x8_t         vSum = vdupq_n_f16(0.0f16);
+
+    /*
+     * Find the maximum xm first; the exponentials below are taken on
+     * x - xm. arm_max_no_idx_f16 takes a const pointer and only reads
+     * the input, so it is passed on without a cast.
+     */
+    arm_max_no_idx_f16(in, blockSize, &maxVal);
+
+    /* Vector part: 8 half-precision samples per iteration */
+    blkCnt = blockSize >> 3;
+    while(blkCnt > 0)
+    {
+        f16x8_t         vecIn = vld1q(pIn);
+        f16x8_t         vecExp;
+
+        vecExp = vexpq_f16(vsubq_n_f16(vecIn, maxVal));
+        vSum = vaddq_f16(vSum, vecExp);
+
+        /*
+         * Advance the source pointer past the 8 samples just consumed
+         * and decrement the vector loop counter
+         */
+        pIn += 8;
+        blkCnt --;
+    }
+
+    /* Fold the vector of partial sums into the scalar accumulator */
+    accum = vecAddAcrossF16Mve(vSum);
+
+    /* Tail: the remaining blockSize % 8 samples, one at a time */
+    blkCnt = blockSize & 0x7;
+    while(blkCnt > 0)
+    {
+       tmp = *pIn++;
+       accum += expf(tmp - maxVal);
+       blkCnt--;
+    }
+
+    /* Result: xm + ln(exp(x1 - xm) + ... + exp(xn - xm)) */
+    accum = maxVal + logf(accum);
+
+    return (accum);
+}
+
+#else
+float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
+{
+    const float16_t *cursor = in;   /* Read position inside the input */
+    uint32_t remaining;             /* Samples left in the current pass */
+    _Float16 sample;                /* Value fetched from the input */
+    _Float16 peak;                  /* Largest input value, xm */
+    _Float16 total = 0;             /* Sum of exp(x - xm), then the result */
+
+    /*
+     * Pass 1: locate the maximum xm. The first sample seeds the search
+     * and the other blockSize - 1 samples are compared against it.
+     */
+    peak = *cursor++;
+    for (remaining = blockSize - 1U; remaining > 0; remaining--)
+    {
+        sample = *cursor++;
+        if (sample > peak)
+        {
+            peak = sample;
+        }
+    }
+
+    /*
+     * Pass 2: restart from the first sample and accumulate exp(x - xm);
+     * expf() is the single-precision exponential.
+     */
+    cursor = in;
+    for (remaining = blockSize; remaining > 0; remaining--)
+    {
+        sample = *cursor++;
+        total += expf(sample - peak);
+    }
+
+    /*
+     * Undo the shift by xm:
+     * ln(exp(x1) + ... + exp(xn)) = xm + ln(exp(x1 - xm) + ... + exp(xn - xm))
+     */
+    total = peak + logf(total);
+    return(total);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of LogSumExp group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_f32.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_f32.c
index 03f778da9b0887f063bd1f438a632ab59294bc05..25daaf05b22954c23881527e646d16afb39aa8e7 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_f32.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_logsumexp_f32.c
  * Description:  LogSumExp
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,13 +26,13 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 /**
- * @addtogroup groupStats
+ * @addtogroup LogSumExp
  * @{
  */
 
@@ -271,5 +273,5 @@ float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize)
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
- * @} end of groupStats group
+ * @} end of LogSumExp group
  */
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_max_f16.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_max_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..b9b64f0bf49f67abba7df9a70440a0998a1ab07d
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_max_f16.c
@@ -0,0 +1,246 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_max_f16.c
+ * Description:  Maximum value of a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+/**
+  @ingroup groupStats
+ */
+
+
+/**
+  @addtogroup Max
+  @{
+ */
+
+/**
+  @brief         Maximum value of a floating-point vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    maximum value returned here
+  @param[out]    pIndex     index of maximum value returned here
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_max_f16(
+  const float16_t * pSrc,
+  uint32_t blockSize,
+  float16_t * pResult,
+  uint32_t * pIndex)
+{
+     int32_t blkCnt;
+    f16x8_t vecSrc;
+    f16x8_t curExtremValVec = vdupq_n_f16(F16_MIN);
+    float16_t maxValue = F16_MIN;
+    uint32_t idx = blockSize;
+    uint16x8_t indexVec;
+    uint16x8_t curExtremIdxVec;
+    uint32_t curIdx = 0;
+    mve_pred16_t p0;
+    float16_t tmp;
+
+    /* Per-lane sample indices for the first vector; curIdx is updated through its address */
+    indexVec = vidupq_wb_u16(&curIdx, 1);
+    curExtremIdxVec = vdupq_n_u16(0);
+
+    /* Vector part: 8 half-precision samples per iteration */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0)
+    {
+        vecSrc = vldrhq_f16(pSrc);
+        /*
+         * Lanes where the new sample is >= the running maximum take the
+         * new sample and its index; the other lanes keep what they had
+         */
+        p0 = vcmpgeq(vecSrc, curExtremValVec);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+        /* Per-lane sample indices for the next vector */
+        indexVec = vidupq_wb_u16(&curIdx, 1);
+
+        pSrc += 8;
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* NOTE(review): indices live in uint16x8_t lanes, so blockSize is narrowed to 16 bits below - confirm the supported range */
+    /*
+     * Largest value over maxValue and all lanes of the running maxima
+     */
+    maxValue = vmaxnmvq(maxValue, curExtremValVec);
+    /*
+     * Lanes not holding that maximum get blockSize as index so they lose the minimum search
+     */
+    p0 = vcmpgeq(curExtremValVec, maxValue);
+    indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
+    /*
+     * Smallest surviving index, which therefore belongs to a lane holding the maximum
+     */
+    idx = vminvq(idx, indexVec);
+
+    /* Tail: the remaining blockSize % 8 samples, one at a time */
+    blkCnt = blockSize & 7;
+
+    while (blkCnt > 0)
+    {
+      /* Fetch the next sample */
+      tmp = *pSrc++;
+
+      /* Strict comparison: a tail sample must exceed the current maximum */
+      if (maxValue < tmp)
+      {
+        /* Update the maximum value and its index (blockSize - blkCnt is the sample position) */
+        maxValue = tmp;
+        idx = blockSize - blkCnt;
+      }
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /*
+     * Store the maximum value and its index through the output pointers
+     */
+    *pIndex = idx;
+    *pResult = maxValue;
+}
+
+#else
+void arm_max_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult,
+        uint32_t * pIndex)
+{
+  float16_t candidate;           /* Sample compared against the best one */
+  float16_t best;                /* Largest value found so far */
+  uint32_t bestPos = 0U;         /* Position of that largest value */
+  uint32_t remaining;            /* Iterations left in the current loop */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+  uint32_t base = 0U;            /* Position of the last sample already examined */
+#endif
+
+  /*
+   * The first sample is the initial reference; every later sample has to
+   * be strictly greater to replace it, so ties keep the earliest position.
+   */
+  best = *pSrc++;
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /*
+   * Unrolled part: four comparisons per iteration. pSrc points at the
+   * sample at position base + 1 when an iteration starts.
+   */
+  for (remaining = (blockSize - 1U) >> 2U; remaining > 0U; remaining--)
+  {
+    /* Sample at position base + 1 */
+    candidate = pSrc[0];
+    if (best < candidate)
+    {
+      best = candidate;
+      bestPos = base + 1U;
+    }
+
+    /* Sample at position base + 2 */
+    candidate = pSrc[1];
+    if (best < candidate)
+    {
+      best = candidate;
+      bestPos = base + 2U;
+    }
+
+    /* Sample at position base + 3 */
+    candidate = pSrc[2];
+    if (best < candidate)
+    {
+      best = candidate;
+      bestPos = base + 3U;
+    }
+
+    /* Sample at position base + 4 */
+    candidate = pSrc[3];
+    if (best < candidate)
+    {
+      best = candidate;
+      bestPos = base + 4U;
+    }
+
+    /* Step over the four samples just examined */
+    pSrc += 4;
+    base += 4U;
+  }
+
+  /* Whatever does not fill a group of four is left for the loop below */
+  remaining = (blockSize - 1U) % 4U;
+
+#else
+
+  /* No unrolling: the loop below visits every sample after the first */
+  remaining = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /*
+   * One comparison per iteration. The counter runs down, so
+   * blockSize - remaining is the position of the sample being read.
+   */
+  while (remaining > 0U)
+  {
+    candidate = *pSrc++;
+
+    if (best < candidate)
+    {
+      best = candidate;
+      bestPos = blockSize - remaining;
+    }
+
+    remaining--;
+  }
+
+  /* Report the maximum and where it was found */
+  *pResult = best;
+  *pIndex = bestPos;
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of Max group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_max_f32.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_max_f32.c
index 677ad1dca6164c4d7f5a176b44c5309d0433db9d..d82b03905e615a9bdc97053ab26c43a11fdb1605 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_max_f32.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_max_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_max_f32.c
  * Description:  Maximum value of a floating-point vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include <limits.h>
 #endif
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_max_no_idx_f16.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_max_no_idx_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..3a95b4bea2dce9f015c18442254cde3462715a45
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_max_no_idx_f16.c
@@ -0,0 +1,144 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_max_no_idx_f16.c
+ * Description:  Maximum value of a floating-point vector without returning the index
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+/**
+  @ingroup groupStats
+ */
+
+
+/**
+  @addtogroup Max
+  @{
+ */
+
+/**
+  @brief         Maximum value of a floating-point vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    maximum value returned here
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_max_no_idx_f16(
+    const float16_t *pSrc,
+    uint32_t   blockSize,
+    float16_t *pResult)
+{
+   f16x8_t     vecSrc;                                 /* 8 samples loaded per iteration */
+   f16x8_t     curExtremValVec = vdupq_n_f16(F16_MIN); /* running per-lane maxima, seeded with F16_MIN */
+   float16_t   maxValue = F16_MIN;                     /* running scalar maximum */
+   float16_t   newVal;                                 /* current sample in the scalar tail loop */
+   uint32_t    blkCnt;                                 /* loop counter (vectors, then tail samples) */
+
+   /* Vector loop: process 8 samples per iteration (blockSize / 8 full vectors) */
+   blkCnt = blockSize >> 3U;
+
+   while (blkCnt > 0U)
+   {
+
+        vecSrc = vldrhq_f16(pSrc);
+        /*
+         * update per-lane max.
+         */
+        curExtremValVec = vmaxnmq(vecSrc, curExtremValVec);
+        /*
+         * Advance the source pointer by one vector (8 samples)
+         * and decrement the vector loop counter
+         */
+        pSrc += 8;
+        blkCnt --;
+    }
+    /*
+     * Reduce the per-lane maxima (together with maxValue) to one scalar
+     */
+    maxValue = vmaxnmvq(maxValue, curExtremValVec);
+
+    blkCnt = blockSize & 7; /* tail: samples left after the full vectors */
+
+    while (blkCnt > 0U)
+    {
+        newVal = *pSrc++;
+
+        /* compare for the maximum value */
+        if (maxValue < newVal)
+        {
+            /* Update the maximum value (this variant tracks no index) */
+            maxValue = newVal;
+        }
+
+        blkCnt --;
+    }
+
+    *pResult = maxValue; /* store the maximum found */
+}
+
+#else
+
+void arm_max_no_idx_f16(
+    const float16_t *pSrc,
+    uint32_t   blockSize,
+    float16_t *pResult)
+{
+   float16_t   maxValue = F16_MIN; /* running maximum; stays F16_MIN when blockSize is 0 */
+   float16_t   newVal;             /* sample currently being examined */
+
+   while (blockSize > 0U)
+   {
+       newVal = *pSrc++;
+   
+       /* Keep the larger of the running maximum and the new sample */
+       if (maxValue < newVal)
+       {
+           /* Update the maximum value (this variant tracks no index) */
+           maxValue = newVal;
+       }
+   
+       blockSize --;
+   }
+    
+   *pResult = maxValue; /* store the maximum found */
+}
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of Max group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_max_no_idx_f32.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_max_no_idx_f32.c
index ccb31c71441947fbc700ee1a1b45f152d089ed17..9a2a015a1ecc6f69970717568dd5ed34b1d33693 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_max_no_idx_f32.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_max_no_idx_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_max_no_idx_f32.c
  * Description:  Maximum value of a floating-point vector without returning the index
  *
- * $Date:        16. October 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include <limits.h>
 #endif
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q15.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q15.c
index e06a912c9ddd1416d3dcc9d0061e275151afb680..5715e37bdeec881b0f6ef2a1bf16aa81e1bdec51 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q15.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_max_q15.c
  * Description:  Maximum value of a Q15 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -45,7 +45,7 @@
   @param[out]    pIndex     index of maximum value returned here
   @return        none
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
@@ -55,78 +55,49 @@ void arm_max_q15(
         q15_t * pResult,
         uint32_t * pIndex)
 {
-    uint32_t blkCnt;           /* loop counters */
-    q15x8_t vecSrc;
-    q15x8_t curExtremValVec = vdupq_n_s16(Q15_MIN);
-    q15_t maxValue = Q15_MIN, temp;
-    uint32_t  idx = blockSize;
-    uint16x8_t indexVec;
-    uint16x8_t curExtremIdxVec;
-    mve_pred16_t p0;
-
-
-    indexVec = vidupq_u16((uint32_t)0, 1);
-    curExtremIdxVec = vdupq_n_u16(0);
-
-    blkCnt = blockSize >> 3;
-    while (blkCnt > 0U)
-    {
-        vecSrc = vldrhq_s16(pSrc);  
-        pSrc += 8;
+    int32_t         blkCnt;     /* loop counters */
+    q15x8_t         extremValVec = vdupq_n_s16(Q15_MIN);
+    q15_t           maxValue = Q15_MIN;
+    uint16x8_t      indexVec;
+    uint16x8_t      extremIdxVec;
+    mve_pred16_t    p0;
+    uint16_t        extremIdxArr[8];
+
+    indexVec = vidupq_u16(0U, 1);
+
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t    p = vctp16q(blkCnt);
+        q15x8_t         extremIdxVal = vld1q_z_s16(pSrc, p);
         /*
          * Get current max per lane and current index per lane
          * when a max is selected
          */
-        p0 = vcmpgeq(vecSrc, curExtremValVec);
-        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
-        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+        p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
 
-        indexVec = indexVec +  8;
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
-   
-    /*
-     * Get max value across the vector
-     */
-    maxValue = vmaxvq(maxValue, curExtremValVec);
-    /*
-     * set index for lower values to max possible index
-     */
-    p0 = vcmpgeq(curExtremValVec, maxValue);
-    indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
-    /*
-     * Get min index which is thus for a max value
-     */
-    idx = vminvq(idx, indexVec);
-
-    /* Tail */
-    blkCnt = blockSize & 0x7;
-    while (blkCnt > 0U)
-    {
-      /* Initialize temp to the next consecutive values one by one */
-      temp = *pSrc++;
-  
-      /* compare for the maximum value */
-      if (maxValue < temp)
-      {
-        /* Update the maximum value and it's index */
-        maxValue = temp;
-        idx = blockSize - blkCnt;
-      }
-  
-      /* Decrement loop counter */
-      blkCnt--;
+        extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+        /* store per-lane extrema indexes */
+        vst1q_p_u16(extremIdxArr, indexVec, p0);
+
+        indexVec += 8;
+        pSrc += 8;
+        blkCnt -= 8;
     }
+    while (blkCnt > 0);
+
 
-    /*
-     * Save result
-     */
-    *pIndex = idx;
+    /* Get max value across the vector   */
+    maxValue = vmaxvq(maxValue, extremValVec);
+
+    /* set index for lower values to max possible index   */
+    p0 = vcmpgeq(extremValVec, maxValue);
+    extremIdxVec = vld1q_u16(extremIdxArr);
+
+    indexVec = vpselq(extremIdxVec, vdupq_n_u16(blockSize - 1), p0);
+    *pIndex = vminvq(blockSize - 1, indexVec);
     *pResult = maxValue;
 }
+
 #else
 void arm_max_q15(
   const q15_t * pSrc,
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q31.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q31.c
index e1832b47f251eeb7aa596cd499d7f12e466dcf7a..fed900b8f50b8b8c2daaf074eac360f1e69d17a6 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q31.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_max_q31.c
  * Description:  Maximum value of a Q31 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -45,91 +45,59 @@
   @param[out]    pIndex     index of maximum value returned here
   @return        none
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
 void arm_max_q31(
-  const q31_t * pSrc,
-        uint32_t blockSize,
-        q31_t * pResult,
-        uint32_t * pIndex)
+    const q31_t * pSrc,
+    uint32_t blockSize,
+    q31_t * pResult,
+    uint32_t * pIndex)
 {
-    uint32_t  blkCnt;           /* loop counters */
-    q31x4_t vecSrc;
-    q31x4_t curExtremValVec = vdupq_n_s32( Q31_MIN);
-    q31_t maxValue = Q31_MIN;
-    q31_t temp;
-    uint32_t  idx = blockSize;
-    uint32x4_t indexVec;
-    uint32x4_t curExtremIdxVec;
-    mve_pred16_t p0;
-
-
-    indexVec = vidupq_u32((uint32_t)0, 1);
-    curExtremIdxVec = vdupq_n_u32(0);
-
-    /* Compute 4 outputs at a time */
-    blkCnt = blockSize >> 2U;
-    while (blkCnt > 0U)
-    {
-        vecSrc = vldrwq_s32(pSrc);  
-        pSrc += 4;
+    int32_t         blkCnt;     /* samples left to process; signed as it can go negative */
+    q31x4_t         extremValVec = vdupq_n_s32(Q31_MIN);    /* running per-lane maxima */
+    q31_t           maxValue = Q31_MIN;
+    uint32x4_t      indexVec;   /* sample index carried by each lane */
+    uint32x4_t      extremIdxVec;
+    mve_pred16_t    p0;
+    uint32_t        extremIdxArr[4];    /* per-lane index of the current maximum */
+
+    indexVec = vidupq_u32(0U, 1);
+
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t    p = vctp32q(blkCnt);    /* predicate built from the remaining count */
+        q31x4_t         extremIdxVal = vld1q_z_s32(pSrc, p);    /* input samples of this pass */
         /*
          * Get current max per lane and current index per lane
          * when a max is selected
          */
-        p0 = vcmpgeq(vecSrc, curExtremValVec);
-        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
-        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+        p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
 
-        indexVec = indexVec +  4;
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
-   
-    /*
-     * Get max value across the vector
-     */
-    maxValue = vmaxvq(maxValue, curExtremValVec);
-    /*
-     * set index for lower values to max possible index
-     */
-    p0 = vcmpgeq(curExtremValVec, maxValue);
-    indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
-    /*
-     * Get min index which is thus for a max value
-     */
-    idx = vminvq(idx, indexVec);
-
-    /* Tail */
-    blkCnt = blockSize & 0x3;
-
-    while (blkCnt > 0U)
-    {
-       /* Initialize maxVal to the next consecutive values one by one */
-       temp = *pSrc++;
-   
-       /* compare for the maximum value */
-       if (maxValue < temp)
-       {
-         /* Update the maximum value and it's index */
-         maxValue = temp;
-         idx = blockSize - blkCnt;
-       }
-
-       /* Decrement loop counter */
-       blkCnt--;
+        extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+        /* record the current index in the lanes selected by p0 */
+        vst1q_p_u32(extremIdxArr, indexVec, p0);
+
+        indexVec += 4;
+        pSrc += 4;
+        blkCnt -= 4;
     }
+    while (blkCnt > 0);
 
-    /*
-     * Save result
-     */
-    *pIndex = idx;
+
+    /* Get max value across the vector   */
+    maxValue = vmaxvq(maxValue, extremValVec);
+
+    /* lanes not holding the maximum get the largest valid index (blockSize - 1) */
+    p0 = vcmpgeq(extremValVec, maxValue);
+    extremIdxVec = vld1q_u32(extremIdxArr);
+
+    indexVec = vpselq(extremIdxVec, vdupq_n_u32(blockSize - 1), p0);
+    *pIndex = vminvq(blockSize - 1, indexVec);  /* smallest index among the candidate lanes */
     *pResult = maxValue;
 }
+
 #else
 void arm_max_q31(
   const q31_t * pSrc,
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c
index 705f914945f0e6222d7d513248207d0912808538..5deae648a335b13014fe7c300b6ff500d004abf5 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_max_q7.c
  * Description:  Maximum value of a Q7 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -45,89 +45,56 @@
   @param[out]    pIndex     index of maximum value returned here
   @return        none
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
 static void arm_small_blk_max_q7(
     const q7_t * pSrc,
-    uint8_t blockSize,
+    uint16_t blockSize,
     q7_t * pResult,
     uint32_t * pIndex)
 {
-    uint32_t        blkCnt;           /* loop counters */
-    q7x16_t         vecSrc;
-    q7x16_t         curExtremValVec = vdupq_n_s8( Q7_MIN);
-    q7_t            maxValue = Q7_MIN, temp;
-    uint32_t        idx = blockSize;
-    uint8x16_t      indexVec;
-    uint8x16_t      curExtremIdxVec;
-    mve_pred16_t    p0;
-
-
-    indexVec = vidupq_u8((uint32_t)0, 1);
-    curExtremIdxVec = vdupq_n_u8(0);
-
-    blkCnt = blockSize >> 4;
-    while (blkCnt > 0U)
-    {
-        vecSrc = vldrbq_s8(pSrc);  
-        pSrc += 16;
+    int32_t        blkCnt;     /* samples left to process; signed as it can go negative */
+    q7x16_t        extremValVec = vdupq_n_s8(Q7_MIN);  /* running per-lane maxima */
+    q7_t           maxValue = Q7_MIN;
+    uint8x16_t     indexVec;   /* sample index carried by each lane (8-bit) */
+    uint8x16_t     extremIdxVec;
+    mve_pred16_t   p0;
+    uint8_t        extremIdxArr[16];   /* per-lane index of the current maximum */
+
+    indexVec = vidupq_u8(0U, 1);
+
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t    p = vctp8q(blkCnt);    /* predicate built from the remaining count */
+        q7x16_t         extremIdxVal = vld1q_z_s8(pSrc, p);    /* input samples of this pass */
         /*
          * Get current max per lane and current index per lane
          * when a max is selected
          */
-        p0 = vcmpgeq(vecSrc, curExtremValVec);
-        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
-        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+        p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
 
-        indexVec = indexVec +  16;
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
-   
-    
-    /*
-     * Get max value across the vector
-     */
-    maxValue = vmaxvq(maxValue, curExtremValVec);
-    /*
-     * set index for lower values to max possible index
-     */
-    p0 = vcmpgeq(curExtremValVec, maxValue);
-    indexVec = vpselq(curExtremIdxVec, vdupq_n_u8(blockSize), p0);
-    /*
-     * Get min index which is thus for a max value
-     */
-    idx = vminvq(idx, indexVec);
-
-    /*
-     * tail
-     */
-    blkCnt = blockSize & 0xF;
-
-    while (blkCnt > 0U)
-    {
-      /* Initialize temp to the next consecutive values one by one */
-      temp = *pSrc++;
-  
-      /* compare for the maximum value */
-      if (maxValue < temp)
-      {
-        /* Update the maximum value and it's index */
-        maxValue = temp;
-        idx = blockSize - blkCnt;
-      }
-  
-      /* Decrement loop counter */
-      blkCnt--;
+        extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+        /* record the current index in the lanes selected by p0 */
+        vst1q_p_u8(extremIdxArr, indexVec, p0);
+
+        indexVec += 16;
+        pSrc += 16;
+        blkCnt -= 16;
     }
-    /*
-     * Save result
-     */
-    *pIndex = idx;
+    while (blkCnt > 0);
+
+
+    /* Get max value across the vector   */
+    maxValue = vmaxvq(maxValue, extremValVec);
+
+    /* lanes not holding the maximum get the largest valid index (blockSize - 1) */
+    p0 = vcmpgeq(extremValVec, maxValue);
+    extremIdxVec = vld1q_u8(extremIdxArr);
+
+    indexVec = vpselq(extremIdxVec, vdupq_n_u8(blockSize - 1), p0);
+    *pIndex = vminvq_u8(blockSize - 1, indexVec);  /* smallest index among the candidate lanes */
     *pResult = maxValue;
 }
 
@@ -138,8 +105,9 @@ void arm_max_q7(
         uint32_t * pIndex)
 {
     int32_t   totalSize = blockSize;
+    const uint16_t sub_blk_sz = UINT8_MAX + 1;
 
-    if (totalSize <= UINT8_MAX)
+    if (totalSize <= sub_blk_sz)
     {
         arm_small_blk_max_q7(pSrc, blockSize, pResult, pIndex);
     }
@@ -152,11 +120,11 @@ void arm_max_q7(
         /*
          * process blocks of 255 elts
          */
-        while (totalSize >= UINT8_MAX)
+        while (totalSize >= sub_blk_sz)
         {
             const q7_t     *curSrc = pSrc;
 
-            arm_small_blk_max_q7(curSrc, UINT8_MAX, pResult, pIndex);
+            arm_small_blk_max_q7(curSrc, sub_blk_sz, pResult, pIndex);
             if (*pResult > curBlkExtr)
             {
                 /*
@@ -167,8 +135,8 @@ void arm_max_q7(
                 curBlkIdx = curIdx;
             }
             curIdx++;
-            pSrc += UINT8_MAX;
-            totalSize -= UINT8_MAX;
+            pSrc += sub_blk_sz;
+            totalSize -= sub_blk_sz;
         }
         /*
          * remainder
@@ -180,7 +148,7 @@ void arm_max_q7(
             curBlkPos = *pIndex;
             curBlkIdx = curIdx;
         }
-        *pIndex = curBlkIdx * UINT8_MAX + curBlkPos;
+        *pIndex = curBlkIdx * sub_blk_sz + curBlkPos;
         *pResult = curBlkExtr;
     }
 }
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_f16.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..02f495d68d4aaae05c42cf328bdd1ca2c96a713e
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_f16.c
@@ -0,0 +1,152 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mean_f16.c
+ * Description:  Mean value of a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupStats
+ */
+
+/**
+  @defgroup mean Mean
+
+  Calculates the mean of the input vector. Mean is defined as the average of the elements in the vector.
+  The following algorithm is used:
+
+  <pre>
+      Result = (pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]) / blockSize;
+  </pre>
+
+  There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+  @addtogroup mean
+  @{
+ */
+
+/**
+  @brief         Mean value of a floating-point vector.
+  @param[in]     pSrc       points to the input vector.
+  @param[in]     blockSize  number of samples in input vector.
+  @param[out]    pResult    mean value returned here.
+  @return        none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_mean_f16(
+  const float16_t * pSrc,
+  uint32_t blockSize,
+  float16_t * pResult)
+{
+    int32_t  blkCnt;           /* samples left to process; signed as it can go negative */
+    f16x8_t vecSrc;            /* samples loaded for the current pass */
+    f16x8_t sumVec = vdupq_n_f16(0.0f16); /* per-lane partial sums, held as float16 */
+
+    blkCnt = blockSize;
+    do { /* the body runs at least once, even when blockSize is 0 */
+        mve_pred16_t p = vctp16q(blkCnt); /* predicate built from the remaining count */
+
+        vecSrc = vldrhq_z_f16((float16_t const *) pSrc, p);
+        sumVec = vaddq_m_f16(sumVec, sumVec, vecSrc, p); /* accumulate under predicate p */
+
+        blkCnt -= 8;
+        pSrc += 8;
+    }
+    while (blkCnt > 0);
+
+    *pResult = vecAddAcrossF16Mve(sumVec) / (float16_t) blockSize; /* divides by zero if blockSize is 0 */
+}
+
+
+#else
+
+void arm_mean_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        float16_t sum = 0.0f;                          /* Running sum, kept in a float16_t */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: accumulate 4 samples per iteration */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    sum += *pSrc++;
+
+    sum += *pSrc++;
+
+    sum += *pSrc++;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: handle the remaining (blockSize % 4) samples */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize  */
+  /* Store result to destination (divides by zero if blockSize is 0) */
+  *pResult = (sum / (float16_t)blockSize);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of mean group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_f32.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_f32.c
index 439a5975f81b7ebd22b5100b3aad5dac4e8f09dc..dd6d817aaf4028802b15421d7391007463ccb524 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_f32.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_mean_f32.c
  * Description:  Mean value of a floating-point vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,24 +26,12 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
  */
 
-/**
-  @defgroup mean Mean
-
-  Calculates the mean of the input vector. Mean is defined as the average of the elements in the vector.
-  The underlying algorithm is used:
-
-  <pre>
-      Result = (pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]) / blockSize;
-  </pre>
-
-  There are separate functions for floating-point, Q31, Q15, and Q7 data types.
- */
 
 /**
   @addtogroup mean
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q15.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q15.c
index 6920869610c4f5f321c42c60f70d85c71f076558..f8af0edad8d900201a12abe36c0c6a59059a550d 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q15.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_mean_q15.c
  * Description:  Mean value of a Q15 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -53,7 +53,7 @@
                    Finally, the accumulator is truncated to yield a result of 1.15 format.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_mean_q15(
   const q15_t * pSrc,
         uint32_t blockSize,
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q31.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q31.c
index 2694ef5c090d26454bf7d359a392b421eea59299..b33ed00c5e137f93267c86110b67dd69f3b2a793 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q31.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_mean_q31.c
  * Description:  Mean value of a Q31 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -52,7 +52,7 @@
                    full precision of intermediate result is preserved.
                    Finally, the accumulator is truncated to yield a result of 1.31 format.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_mean_q31(
   const q31_t * pSrc,
         uint32_t blockSize,
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q7.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q7.c
index 26879eb9aeccfaf7575de523a557a563191d5b2f..8cb68b26f8ea7fc89df4799ec10f5f9750057fab 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q7.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_mean_q7.c
  * Description:  Mean value of a Q7 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -53,7 +53,7 @@
                    Finally, the accumulator is truncated to yield a result of 1.7 format.
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 void arm_mean_q7(
   const q7_t * pSrc,
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_min_f16.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_min_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..0d123f547efc885ce2ff468071e26bcf09deedf2
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_min_f16.c
@@ -0,0 +1,240 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_min_f16.c
+ * Description:  Minimum value of a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+
+/**
+  @ingroup groupStats
+ */
+
+/**
+  @addtogroup Min
+  @{
+ */
+
+/**
+  @brief         Minimum value of a floating-point vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    minimum value returned here
+  @param[out]    pIndex     index of minimum value returned here
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_min_f16(
+  const float16_t * pSrc,
+  uint32_t blockSize,
+  float16_t * pResult,
+  uint32_t * pIndex)
+{
+    int32_t  blkCnt;           /* loop counters */
+    f16x8_t vecSrc;
+    float16_t const *pSrcVec;
+    f16x8_t curExtremValVec = vdupq_n_f16(F16_MAX);
+    float16_t minValue = F16_MAX;
+    uint32_t  idx = blockSize;
+    uint16x8_t indexVec;
+    uint16x8_t curExtremIdxVec;
+    mve_pred16_t p0;
+
+    indexVec = vidupq_u16((uint32_t)0, 1);
+    curExtremIdxVec = vdupq_n_u16(0);
+
+    pSrcVec = (float16_t const *) pSrc;
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0)
+    {
+        vecSrc = vldrhq_f16(pSrcVec);  pSrcVec += 8;
+        /*
+         * Per lane: keep the smaller value together with its index.
+         * With "<=" a later equal value in the same lane takes over.
+         */
+        p0 = vcmpleq(vecSrc, curExtremValVec);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+        indexVec = indexVec + 8;
+        /*
+         * One full vector of 8 samples has been consumed
+         */
+        blkCnt--;
+    }
+    /*
+     * Tail: the blockSize % 8 samples left after the full vectors.
+     * Only the first blkCnt lanes take part in the load and compare.
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0)
+    {
+        p0 = vctp16q(blkCnt);
+        vecSrc = vldrhq_z_f16(pSrcVec, p0);
+        /*
+         * The zeroing-predicated load keeps the access inside the
+         * blkCnt valid samples; a full load would read past pSrc's end.
+         */
+        p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+    }
+    /*
+     * Reduce the per-lane minima to a single value
+     */
+    minValue = vminnmvq(minValue, curExtremValVec);
+    /*
+     * Lanes that do not hold the minimum get blockSize as a sentinel index
+     */
+    p0 = vcmpleq(curExtremValVec, minValue);
+    indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
+    /*
+     * Smallest index among the lanes that hold the minimum
+     */
+    idx = vminvq(idx, indexVec);
+    /*
+     * NOTE(review): indices live in 16-bit lanes; confirm callers keep blockSize within uint16_t range
+     */
+    *pIndex = idx;
+    *pResult = minValue;
+}
+
+#else
+
+void arm_min_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult,
+        uint32_t * pIndex)
+{
+        float16_t sample, best;                        /* Sample under test and running minimum */
+        uint32_t remaining, bestIdx;                   /* Loop passes left, index of running minimum */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+        uint32_t base;                                 /* Index of the last sample already examined */
+#endif
+
+  /*
+   * The first sample seeds the running minimum (index 0).
+   */
+  best = *pSrc++;
+  bestIdx = 0U;
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+  /* Only the seed sample (index 0) has been examined so far. */
+  base = 0U;
+
+  /*
+   * Unrolled scan: each pass examines four of the blockSize - 1
+   * samples that follow the seed.
+   */
+  for (remaining = (blockSize - 1U) >> 2U; remaining > 0U; remaining--)
+  {
+    /* Sample at index base + 1 */
+    sample = pSrc[0];
+    if (sample < best)
+    {
+      /* Strictly smaller: record the new minimum and where it was found */
+      best = sample;
+      bestIdx = base + 1U;
+    }
+
+    /* Sample at index base + 2 */
+    sample = pSrc[1];
+    if (sample < best)
+    {
+      best = sample;
+      bestIdx = base + 2U;
+    }
+
+    /* Sample at index base + 3 */
+    sample = pSrc[2];
+    if (sample < best)
+    {
+      best = sample;
+      bestIdx = base + 3U;
+    }
+
+    /* Sample at index base + 4 */
+    sample = pSrc[3];
+    if (sample < best)
+    {
+      best = sample;
+      bestIdx = base + 4U;
+    }
+
+    /* Step past the four samples just examined */
+    pSrc += 4;
+    base += 4U;
+  }
+
+  /* The (blockSize - 1) % 4 leftover samples go through the scalar loop below */
+  remaining = (blockSize - 1U) % 4U;
+
+#else
+
+  /* No unrolling: every sample after the seed goes through the scalar loop below */
+  remaining = (blockSize - 1U);
+
+#endif /* defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  for (; remaining > 0U; remaining--)
+  {
+    /* Next sample in input order */
+    sample = *pSrc++;
+
+    if (sample < best)
+    {
+      /* The sample just read sits at index blockSize - remaining */
+      best = sample;
+      bestIdx = blockSize - remaining;
+    }
+  }
+
+  /*
+   * Ties never replace the running minimum, so the reported index
+   * is that of the first occurrence of the minimum value.
+   */
+  *pResult = best;
+  *pIndex = bestIdx;
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of Min group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_min_f32.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_min_f32.c
index 28deff25ef3ee158e6f499eef1b34f7bc9993b91..ad8a4720b47627a03666b0f89f97137f56d019a4 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_min_f32.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_min_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_min_f32.c
  * Description:  Minimum value of a floating-point vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include <limits.h>
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q15.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q15.c
index aacefa932dfc202dbbab5c3f5a28c20ca7d2491b..f31019dbcea04c33bfef3454d6d4afa10562b365 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q15.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_min_q15.c
  * Description:  Minimum value of a Q15 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -46,7 +46,7 @@
   @param[out]    pIndex     index of minimum value returned here
   @return        none
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
@@ -56,79 +56,48 @@ void arm_min_q15(
         q15_t * pResult,
         uint32_t * pIndex)
 {
-    uint32_t  blkCnt;           /* loop counters */
-    q15x8_t vecSrc;
-    q15x8_t curExtremValVec = vdupq_n_s16(Q15_MAX);
-    q15_t minValue = Q15_MAX,temp;
-    uint32_t  idx = blockSize;
-    uint16x8_t indexVec;
-    uint16x8_t curExtremIdxVec;
-    mve_pred16_t p0;
 
+    int32_t         blkCnt;     /* loop counters */
+    q15x8_t         extremValVec = vdupq_n_s16(Q15_MAX);
+    q15_t           minValue = Q15_MAX;
+    uint16x8_t      indexVec;
+    uint16x8_t      extremIdxVec;
+    mve_pred16_t    p0;
+    uint16_t        extremIdxArr[8];
 
-    indexVec = vidupq_u16((uint32_t)0, 1);
-    curExtremIdxVec = vdupq_n_u16(0);
+    indexVec = vidupq_u16(0U, 1);
 
-    blkCnt = blockSize >> 3;
-    while (blkCnt > 0U)
-    {
-        vecSrc = vldrhq_s16(pSrc);  
-        pSrc += 8;
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t    p = vctp16q(blkCnt);
+        q15x8_t         extremIdxVal = vld1q_z_s16(pSrc, p);
         /*
          * Get current min per lane and current index per lane
          * when a min is selected
          */
-        p0 = vcmpleq(vecSrc, curExtremValVec);
-        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
-        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+        p0 = vcmpleq_m(extremIdxVal, extremValVec, p);
 
-        indexVec = indexVec +  8;
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
-   
-    /*
-     * Get min value across the vector
-     */
-    minValue = vminvq(minValue, curExtremValVec);
-    /*
-     * set index for lower values to min possible index
-     */
-    p0 = vcmpleq(curExtremValVec, minValue);
-    indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
-    /*
-     * Get min index which is thus for a min value
-     */
-    idx = vminvq(idx, indexVec);
-
-    /*
-     * tail
-    */
-    blkCnt = blockSize & 7;
-    while (blkCnt > 0U)
-    {
-      /* Initialize minVal to the next consecutive values one by one */
-      temp = *pSrc++;
-  
-      /* compare for the minimum value */
-      if (minValue > temp)
-      {
-        /* Update the minimum value and it's index */
-        minValue = temp;
-        idx = blockSize - blkCnt;
-      }
-  
-      /* Decrement loop counter */
-      blkCnt--;
+        extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+        /* store per-lane extrema indexes */
+        vst1q_p_u16(extremIdxArr, indexVec, p0);
+
+        indexVec += 8;
+        pSrc += 8;
+        blkCnt -= 8;
     }
+    while (blkCnt > 0);
+
+    /* Get min value across the vector   */
+    minValue = vminvq(minValue, extremValVec);
+
+    /* set index for lower values to min possible index   */
+    p0 = vcmpleq(extremValVec, minValue);
+    extremIdxVec = vld1q_u16(extremIdxArr);
 
-    /*
-     * Save result
-     */
-    *pIndex = idx;
+    indexVec = vpselq(extremIdxVec, vdupq_n_u16(blockSize - 1), p0);
+    *pIndex = vminvq(blockSize - 1, indexVec);
     *pResult = minValue;
+ 
 }
 #else
 void arm_min_q15(
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q31.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q31.c
index c38622bcbd243b432a080c15118f1f395fe6ff31..c993004cbef357d297fae89a3fcdb461aba27ea8 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q31.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_min_q31.c
  * Description:  Minimum value of a Q31 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -46,7 +46,7 @@
   @param[out]    pIndex     index of minimum value returned here
   @return        none
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
@@ -56,79 +56,49 @@ void arm_min_q31(
         q31_t * pResult,
         uint32_t * pIndex)
 {
-    uint32_t  blkCnt;           /* loop counters */
-    q31x4_t vecSrc;
-    q31x4_t curExtremValVec = vdupq_n_s32(Q31_MAX);
-    q31_t minValue = Q31_MAX, temp;
-    uint32_t  idx = blockSize;
-    uint32x4_t indexVec;
-    uint32x4_t curExtremIdxVec;
-    mve_pred16_t p0;
-
-
-    indexVec = vidupq_u32((uint32_t)0, 1);
-    curExtremIdxVec = vdupq_n_u32(0);
-
-    /* Compute 4 outputs at a time */
-    blkCnt = blockSize >> 2U;
-    while (blkCnt > 0U)
-    {
-        vecSrc = vldrwq_s32(pSrc);  
-        pSrc += 4;
+    int32_t         blkCnt;     /* loop counters */
+    q31x4_t         extremValVec = vdupq_n_s32(Q31_MAX);
+    q31_t           minValue = Q31_MAX;
+    uint32x4_t      indexVec;
+    uint32x4_t      extremIdxVec;
+    mve_pred16_t    p0;
+    uint32_t        extremIdxArr[4];
+
+    indexVec = vidupq_u32(0U, 1);
+
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t    p = vctp32q(blkCnt);
+        q31x4_t         extremIdxVal = vld1q_z_s32(pSrc, p);
         /*
          * Get current min per lane and current index per lane
          * when a min is selected
          */
-        p0 = vcmpleq(vecSrc, curExtremValVec);
-        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
-        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+        p0 = vcmpleq_m(extremIdxVal, extremValVec, p);
 
-        indexVec = indexVec +  4;
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
-    
-    /*
-     * Get min value across the vector
-     */
-    minValue = vminvq(minValue, curExtremValVec);
-    /*
-     * set index for lower values to min possible index
-     */
-    p0 = vcmpleq(curExtremValVec, minValue);
-    indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
-    /*
-     * Get min index which is thus for a min value
-     */
-    idx = vminvq(idx, indexVec);
-
-
-    /* Tail */
-    blkCnt = blockSize & 0x3;
-    while (blkCnt > 0U)
-    {
-      /* Initialize temp to the next consecutive values one by one */
-      temp = *pSrc++;
-  
-      /* compare for the minimum value */
-      if (minValue > temp)
-      {
-        /* Update the minimum value and it's index */
-        minValue = temp;
-        idx = blockSize - blkCnt;
-      }
-  
-      /* Decrement loop counter */
-      blkCnt--;
+        extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+        /* store per-lane extrema indexes */
+        vst1q_p_u32(extremIdxArr, indexVec, p0);
+
+        indexVec += 4;
+        pSrc += 4;
+        blkCnt -= 4;
     }
-    /*
-     * Save result
-     */
-    *pIndex = idx;
+    while (blkCnt > 0);
+
+
+    /* Get min value across the vector   */
+    minValue = vminvq(minValue, extremValVec);
+
+    /* set index for lower values to min possible index   */
+    p0 = vcmpleq(extremValVec, minValue);
+    extremIdxVec = vld1q_u32(extremIdxArr);
+
+    indexVec = vpselq(extremIdxVec, vdupq_n_u32(blockSize - 1), p0);
+    *pIndex = vminvq(blockSize - 1, indexVec);
     *pResult = minValue;
 }
+
 #else
 void arm_min_q31(
   const q31_t * pSrc,
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q7.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q7.c
index 619f7b6cbb962c2b131877ceec02cf2d39ec3c97..3e5aae536afdfb21616c6a4016bda36f9b5e67a5 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q7.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_min_q7.c
  * Description:  Minimum value of a Q7 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -46,7 +46,7 @@
   @param[out]    pIndex     index of minimum value returned here
   @return        none
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_power_f16.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_power_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..fd135ebffba413d7718f751613c181c3fe847559
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_power_f16.c
@@ -0,0 +1,152 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_power_f16.c
+ * Description:  Sum of the squares of the elements of a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupStats
+ */
+
+
+
+/**
+  @addtogroup power
+  @{
+ */
+
+/**
+  @brief         Sum of the squares of the elements of a floating-point vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    sum of the squares value returned here
+  @return        none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_power_f16(
+  const float16_t * pSrc,
+  uint32_t blockSize,
+  float16_t * pResult)
+{
+    f16x8_t         acc = vdupq_n_f16(0.0f);   /* per-lane partial sums of squares */
+    f16x8_t         samples;                   /* up to 8 input samples */
+    int32_t         remaining = blockSize;     /* samples not yet accumulated */
+
+    /*
+     * Each pass covers up to 8 samples. The body always runs once;
+     * the predicate is built from the count of samples still left.
+     */
+    do {
+        mve_pred16_t    active = vctp16q(remaining);
+
+        samples = vldrhq_z_f16((float16_t const *) pSrc, active);
+        /* acc += samples * samples, under the same predicate as the load */
+        acc = vfmaq_m(acc, samples, samples, active);
+
+        pSrc += 8;
+        remaining -= 8;
+    }
+    while (remaining > 0);
+
+    *pResult = vecAddAcrossF16Mve(acc);
+}
+#else
+
+void arm_power_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult)
+{
+        _Float16 acc = 0.0f16;                         /* Running sum of squares */
+        _Float16 x;                                    /* Current input sample */
+        uint32_t remaining;                            /* Loop passes still to run */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /*
+   * Unrolled part: blockSize / 4 passes, each adding the squares of
+   * four consecutive samples to acc in input order.
+   */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+    /* First sample of the group */
+    x = pSrc[0];
+    acc += x * x;
+
+    /* Second sample of the group */
+    x = pSrc[1];
+    acc += x * x;
+
+    /* Third sample of the group */
+    x = pSrc[2];
+    acc += x * x;
+
+    /* Fourth sample of the group */
+    x = pSrc[3];
+    acc += x * x;
+
+    /* Step past the four samples just accumulated */
+    pSrc += 4;
+  }
+
+  /* The blockSize % 4 leftover samples go through the scalar loop below */
+  remaining = blockSize % 0x4U;
+
+#else
+
+  /* No unrolling: the scalar loop below handles all blockSize samples */
+  remaining = blockSize;
+
+#endif /* defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  for (; remaining > 0U; remaining--)
+  {
+    /* acc += A[i] * A[i] for one sample */
+    x = *pSrc++;
+    acc += x * x;
+  }
+
+  /*
+   * Result is A[0]^2 + A[1]^2 + ... + A[blockSize-1]^2; it is not
+   * divided by blockSize.
+   */
+  *pResult = acc;
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of power group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_power_f32.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_power_f32.c
index afe40cd4d3c068a2999ce96e2615b5cf4a313b9e..ce12e57e9ea4dc1a58393c023cf5a139653d87ef 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_power_f32.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_power_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_power_f32.c
  * Description:  Sum of the squares of the elements of a floating-point vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -43,6 +43,10 @@
   </pre>
 
   There are separate functions for floating point, Q31, Q15, and Q7 data types.
+
+  Because the result is not divided by the length, these functions actually
+  compute an energy rather than a power.
+
  */
 
 /**
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q15.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q15.c
index c7f99411a07cdee84004085470ea6e1e279c5684..37a02c061e5ce3160ab6d8543aa7d237e3bc13a4 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q15.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_power_q15.c
  * Description:  Sum of the squares of the elements of a Q15 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -53,7 +53,7 @@
                    full precision of the intermediate multiplication is preserved.
                    Finally, the return result is in 34.30 format.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 void arm_power_q15(
   const q15_t * pSrc,
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q31.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q31.c
index 4de4c9ecad9f68a0d8df41e21bf3b756e99ff6f1..a39b3a709cbb5f043dc8d8ad520becc112c83930 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q31.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_power_q31.c
  * Description:  Sum of the squares of the elements of a Q31 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -54,7 +54,7 @@
                    full precision of the intermediate multiplication is preserved.
                    Finally, the return result is in 16.48 format.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_power_q31(
   const q31_t * pSrc,
         uint32_t blockSize,
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q7.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q7.c
index 0e1b9da10fd9a2e0873ac700031f050cd2393b2c..1f2f6628c0c5a7bc9bb41225a0fdb303f90e1e27 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q7.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_power_q7.c
  * Description:  Sum of the squares of the elements of a Q7 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -53,7 +53,7 @@
                    full precision of the intermediate multiplication is preserved.
                    Finally, the return result is in 18.14 format.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_power_q7(
   const q7_t * pSrc,
         uint32_t blockSize,
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_f16.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..1cb18c279892c6391f789253f4173de7d5d45a68
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_f16.c
@@ -0,0 +1,147 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rms_f16.c
+ * Description:  Root mean square value of the elements of a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupStats
+ */
+
+/**
+  @defgroup RMS Root mean square (RMS)
+
+  Calculates the Root Mean Square of the elements in the input vector.
+  The following algorithm is used:
+
+  <pre>
+      Result = sqrt(((pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]) / blockSize));
+  </pre>
+
+  There are separate functions for floating point, Q31, and Q15 data types.
+ */
+
+/**
+  @addtogroup RMS
+  @{
+ */
+
+/**
+  @brief         Root Mean Square of the elements of a floating-point vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    root mean square value returned here
+  @return        none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_rms_f16(
+  const float16_t * pSrc,
+  uint32_t blockSize,
+  float16_t * pResult)
+{
+    float16_t sumOfSquares = 0.0f;   /* written by arm_power_f16 through its output pointer */
+
+    /* Delegate the accumulation of the squared samples to arm_power_f16 */
+    arm_power_f16(pSrc, blockSize, &sumOfSquares);
+
+    arm_sqrt_f16(sumOfSquares / (float16_t) blockSize, pResult);
+}
+#else
+
+void arm_rms_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult)
+{
+        _Float16 acc = 0.0f16;                         /* Running sum of squares */
+        _Float16 x;                                    /* Current input sample */
+        uint32_t remaining;                            /* Loop passes still to run */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /*
+   * Unrolled part: blockSize / 4 passes, each adding the squares of
+   * four consecutive samples to acc in input order.
+   */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+    /* First sample of the group */
+    x = pSrc[0];
+    acc += x * x;
+
+    /* Second sample of the group */
+    x = pSrc[1];
+    acc += x * x;
+
+    /* Third sample of the group */
+    x = pSrc[2];
+    acc += x * x;
+
+    /* Fourth sample of the group */
+    x = pSrc[3];
+    acc += x * x;
+
+    /* Step past the four samples just accumulated */
+    pSrc += 4;
+  }
+
+  /* The blockSize % 4 leftover samples go through the scalar loop below */
+  remaining = blockSize % 0x4U;
+
+#else
+
+  /* No unrolling: the scalar loop below handles all blockSize samples */
+  remaining = blockSize;
+
+#endif /* defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  for (; remaining > 0U; remaining--)
+  {
+    /* acc += A[i] * A[i] for one sample */
+    x = *pSrc++;
+    acc += x * x;
+  }
+
+  /*
+   * Mean square = acc / blockSize; arm_sqrt_f16 receives it together
+   * with pResult as the destination pointer.
+   */
+  arm_sqrt_f16(acc / (float16_t) blockSize, pResult);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of RMS group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_f32.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_f32.c
index 7f26aff23b4f4717da20f98558b5e18a02d74f59..cb45752bca559d4043ff95a6df8b8abad6d8c2fe 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_f32.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_rms_f32.c
  * Description:  Root mean square value of the elements of a floating-point vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_q15.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_q15.c
index 80977713baf328fa4d013b2f2959ae0dbbaf5307..da925eb063b66ee705163b8e6956914879be9370 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_q15.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_rms_q15.c
  * Description:  Root Mean Square of the elements of a Q15 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -54,7 +54,7 @@
                    Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
                    15 bits, and then saturated to yield a result in 1.15 format.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_rms_q15(
   const q15_t * pSrc,
         uint32_t blockSize,
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_q31.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_q31.c
index 3c1c69226166b22cd23484d1d5da5993d841981d..ba39b7b7226ae9497f10ccd6413f7b9ed0d3cb2a 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_q31.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_rms_q31.c
  * Description:  Root Mean Square of the elements of a Q31 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -56,7 +56,7 @@
                    log2(blockSize) bits, as a total of blockSize additions are performed internally.
                    Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 void arm_rms_q31(
   const q31_t * pSrc,
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_std_f16.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_std_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..951d1651009737eb28b92c532e6a223b7f962bb6
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_std_f16.c
@@ -0,0 +1,67 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_std_f16.c
+ * Description:  Standard deviation of the elements of a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupStats
+ */
+
+
+
+/**
+  @addtogroup STD
+  @{
+ */
+
+/**
+  @brief         Computes the standard deviation of a half-precision floating-point vector.
+  @param[in]     pSrc       address of the first input sample
+  @param[in]     blockSize  how many samples the input vector holds
+  @param[out]    pResult    receives the square root of the value produced by arm_var_f16()
+  @return        none
+ */
+void arm_std_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult)
+{
+  float16_t variance;                        /* Filled in by arm_var_f16() */
+  arm_var_f16(pSrc, blockSize, &variance);
+  arm_sqrt_f16(variance, pResult);           /* Standard deviation = sqrt(variance) */
+}
+
+/**
+  @} end of STD group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_std_f32.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_std_f32.c
index b584dcdfd53105d6a9a4d7a3dd8554ffe2fb8f2d..682443d911da8776ac8476d779c1ef269ba531cd 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_std_f32.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_std_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_std_f32.c
  * Description:  Standard deviation of the elements of a floating-point vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_std_q15.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_std_q15.c
index 8003f83e8e431d34f109d9eaaa70388a78fb0915..74ee4f12ce8d12a65dbfaf8f97a5ee127c56b477 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_std_q15.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_std_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_std_q15.c
  * Description:  Standard deviation of an array of Q15 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -54,7 +54,7 @@
                    Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
                    15 bits, and then saturated to yield a result in 1.15 format.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_std_q15(
   const q15_t * pSrc,
         uint32_t blockSize,
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_std_q31.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_std_q31.c
index 190e5954470813c10c15b6bd2077280a37f9dab5..63170e79888200ff2e6e5881d1fec315f37523d2 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_std_q31.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_std_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_std_q31.c
  * Description:  Standard deviation of the elements of a Q31 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -57,7 +57,7 @@
                    After division, internal variables should be Q18.46
                    Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_std_q31(
   const q31_t * pSrc,
         uint32_t blockSize,
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_var_f16.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_var_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..ad151d2b19f5e59d8fc08e8a5ef14496277b439f
--- /dev/null
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_var_f16.c
@@ -0,0 +1,218 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_var_f16.c
+ * Description:  Variance of the elements of a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupStats
+ */
+
+
+/**
+  @addtogroup variance
+  @{
+ */
+
+/**
+  @brief         Variance of the elements of a floating-point vector.
+  @param[in]     pSrc       points to the input vector
+  @param[in]     blockSize  number of samples in input vector
+  @param[out]    pResult    variance value returned here
+  @return        none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+
+void arm_var_f16(
+           const float16_t * pSrc,
+                 uint32_t blockSize,
+                 float16_t * pResult)
+{
+    int32_t         blkCnt;     /* samples still to process; signed so it can drop to <= 0 and end the loop */
+    f16x8_t         vecSrc;     /* current group of 8 samples, then their deviations from the mean */
+    f16x8_t         sumVec = vdupq_n_f16((float16_t) 0.0);  /* per-lane sums of squared deviations */
+    float16_t       fMean;      /* written by arm_mean_f16() */
+
+    if (blockSize <= 1U) {      /* the (blockSize - 1) divisor below needs at least two samples */
+        *pResult = 0;
+        return;
+    }
+
+    /* First pass over the data: fMean is expected to receive the mean of the block */
+    arm_mean_f16(pSrc, blockSize, &fMean);
+
+/* "6.14 bug": for the compiler versions selected below, sumVec is zeroed again with an explicit vmov */
+#if defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100) && (__ARMCC_VERSION < 6150001)
+    __asm volatile(
+        "   vmov.i32                     %[acc], #0 \n"
+        : [acc] "+t"(sumVec)
+        : 
+        : );
+#endif
+
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t    p = vctp16q(blkCnt);    /* tail predicate built from the remaining sample count */
+
+        vecSrc = vldrhq_z_f16((float16_t const *) pSrc, p);
+        /*
+         * subtract the mean, then accumulate the squares per lane (both predicated by p)
+         */
+        vecSrc = vsubq_m(vuninitializedq_f16(), vecSrc, fMean, p);
+        sumVec = vfmaq_m(sumVec, vecSrc, vecSrc, p);
+
+        blkCnt -= 8;
+        pSrc += 8;
+    }
+    while (blkCnt > 0);
+    
+    /* Variance: vecAddAcrossF16Mve() is expected to add up the lanes; the total is divided by (blockSize - 1) */
+    *pResult = vecAddAcrossF16Mve(sumVec) / (float16_t) (blockSize - 1.0f);
+}
+#else
+
+void arm_var_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult)
+{
+  /*
+   * Two-pass computation:
+   *
+   *   pass 1:  mean   = (A[0] + A[1] + ... + A[blockSize-1]) / blockSize
+   *   pass 2:  result = ((A[0]-mean)^2 + ... + (A[blockSize-1]-mean)^2) / (blockSize - 1)
+   *
+   * Samples are consumed in input order in both passes and the running
+   * totals are kept in _Float16 variables.
+   *
+   * A block of 0 or 1 samples yields 0.
+   */
+  const float16_t * pCur;                        /* Read pointer of the current pass */
+        uint32_t remaining;                      /* Iterations left in the current loop */
+        _Float16 total = 0.0f;                   /* Pass 1: running sum of the samples */
+        _Float16 sqTotal = 0.0f;                 /* Pass 2: running sum of squared deviations */
+        _Float16 mean;                           /* Mean of the block */
+        _Float16 dev;                            /* Deviation of one sample from the mean */
+
+  /* The (blockSize - 1) divisor needs at least two samples */
+  if (blockSize <= 1U)
+  {
+    *pResult = 0;
+    return;
+  }
+
+  /* ------------------------------ Pass 1: mean ------------------------------ */
+
+  pCur = pSrc;
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Unrolled part: four samples per iteration */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+    total += *pCur++;
+    total += *pCur++;
+    total += *pCur++;
+    total += *pCur++;
+  }
+
+  /* The loop below only has to pick up the blockSize % 4 leftover samples */
+  remaining = blockSize % 0x4U;
+
+#else
+
+  /* No unrolling: the loop below walks over every sample */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  /* One sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    total += *pCur++;
+  }
+
+  /* mean = (A[0] + A[1] + ... + A[blockSize-1]) / blockSize */
+  mean = total / (float16_t) blockSize;
+
+  /* -------------- Pass 2: sum of squared deviations from the mean -------------- */
+
+  pCur = pSrc;
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Unrolled part: four samples per iteration */
+  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
+  {
+    dev = *pCur++ - mean;
+    sqTotal += dev * dev;
+
+    dev = *pCur++ - mean;
+    sqTotal += dev * dev;
+
+    dev = *pCur++ - mean;
+    sqTotal += dev * dev;
+
+    dev = *pCur++ - mean;
+    sqTotal += dev * dev;
+  }
+
+  /* The loop below only has to pick up the blockSize % 4 leftover samples */
+  remaining = blockSize % 0x4U;
+
+#else
+
+  /* No unrolling: the loop below walks over every sample */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  /* One sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    dev = *pCur++ - mean;
+    sqTotal += dev * dev;
+  }
+
+  /* Variance: divide by (blockSize - 1), not by blockSize */
+  *pResult = sqTotal / (float16_t)(blockSize - 1.0f);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of variance group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_var_f32.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_var_f32.c
index fd3950016abddff694b17e658fabd879bd12059d..7ff344c7cf6a1c126e549d31c61797821735fba2 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_var_f32.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_var_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_var_f32.c
  * Description:  Variance of the elements of a floating-point vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_var_q15.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_var_q15.c
index c9ddfd189a4a51949d6b5088821fb230e2964185..e15c6aa6252e560160b8e9ffa7e34ffee6e64b90 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_var_q15.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_var_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_var_q15.c
  * Description:  Variance of an array of Q15 type
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -54,7 +54,7 @@
                    Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
                    15 bits, and then saturated to yield a result in 1.15 format.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_var_q15(
   const q15_t * pSrc,
         uint32_t blockSize,
@@ -81,8 +81,8 @@ void arm_var_q15(
         /* Compute Sum of squares of the input samples
          * and then store the result in a temporary variable, sumOfSquares. */
 
-        sumOfSquares = vmlaldavaq(sumOfSquares, vecSrc, vecSrc);
-        sum = vaddvaq(sum, vecSrc);
+        sumOfSquares = vmlaldavaq_s16(sumOfSquares, vecSrc, vecSrc);
+        sum = vaddvaq_s16(sum, vecSrc);
 
         blkCnt --;
         pSrc += 8;
diff --git a/CMSIS/DSP/Source/StatisticsFunctions/arm_var_q31.c b/CMSIS/DSP/Source/StatisticsFunctions/arm_var_q31.c
index 0b7d45353d35f741324faa5a38b3eb340de2f5db..45b6b66b4796b1ef1b7b18faec790105f6921ea4 100644
--- a/CMSIS/DSP/Source/StatisticsFunctions/arm_var_q31.c
+++ b/CMSIS/DSP/Source/StatisticsFunctions/arm_var_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_var_q31.c
  * Description:  Variance of an array of Q31 type
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/statistics_functions.h"
 
 /**
   @ingroup groupStats
@@ -57,7 +57,7 @@
                    After division, internal variables should be Q18.46
                    Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_var_q31(
   const q31_t * pSrc,
         uint32_t blockSize,
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c b/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..9dc8c18dc63381250e637298bc608187170f16b2
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c
@@ -0,0 +1,274 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_barycenter_f16.c
+ * Description:  Barycenter
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @defgroup barycenter Barycenter
+
+  Barycenter of weighted vectors
+ */
+
+/**
+  @addtogroup barycenter
+  @{
+ */
+
+
+/**
+ * @brief Barycenter
+ *
+ *
+ * @param[in]    *in         List of vectors
+ * @param[in]    *weights    Weights of the vectors
+ * @param[out]   *out        Barycenter
+ * @param[in]    nbVectors   Number of vectors
+ * @param[in]    vecDim      Dimension of space (vector dimension)
+ * @return       None
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_barycenter_f16(const float16_t *in, 
+  const float16_t *weights, 
+  float16_t *out, 
+  uint32_t nbVectors,
+  uint32_t vecDim)
+{
+    const float16_t *pIn, *pW;                      /* single-vector read pointer, weight read pointer */
+    const float16_t *pIn1, *pIn2, *pIn3, *pIn4;     /* read pointers for four consecutive input vectors */
+    float16_t      *pOut;                           /* walks over out, which doubles as the accumulator */
+    uint32_t        blkCntVector, blkCntSample;     /* loop counters: vectors, samples */
+    float16_t       accum, w;                       /* sum of the weights, weight of the current vector */
+
+    blkCntVector = nbVectors;                       /* both counters are reassigned before they are read */
+    blkCntSample = vecDim;
+
+    accum = 0.0f;
+
+    pW = weights;
+    pIn = in;
+
+    /* out is the accumulator: arm_fill_f16() is expected to set its vecDim samples to 0 */
+    arm_fill_f16(0.0f, out, vecDim);
+
+
+    /* Sum, part 1: input vectors are consumed four at a time; vector k starts at in + k * vecDim */
+    pIn1 = pIn;
+    pIn2 = pIn1 + vecDim;
+    pIn3 = pIn2 + vecDim;
+    pIn4 = pIn3 + vecDim;
+
+    blkCntVector = nbVectors >> 2;
+    while (blkCntVector > 0) 
+    {
+        f16x8_t         outV, inV1, inV2, inV3, inV4;
+        float16_t       w1, w2, w3, w4;
+
+        pOut = out;
+        w1 = *pW++;
+        w2 = *pW++;
+        w3 = *pW++;
+        w4 = *pW++;
+        accum += w1 + w2 + w3 + w4;
+
+        blkCntSample = vecDim >> 3;                 /* groups of 8 samples (one f16x8_t each) */
+        while (blkCntSample > 0) {
+            outV = vld1q((const float16_t *) pOut);
+            inV1 = vld1q(pIn1);
+            inV2 = vld1q(pIn2);
+            inV3 = vld1q(pIn3);
+            inV4 = vld1q(pIn4);
+            outV = vfmaq(outV, inV1, w1);
+            outV = vfmaq(outV, inV2, w2);
+            outV = vfmaq(outV, inV3, w3);
+            outV = vfmaq(outV, inV4, w4);
+            vst1q(pOut, outV);
+
+            pOut += 8;
+            pIn1 += 8;
+            pIn2 += 8;
+            pIn3 += 8;
+            pIn4 += 8;
+
+            blkCntSample--;
+        }
+
+        blkCntSample = vecDim & 7;                  /* leftover samples, one at a time */
+        while (blkCntSample > 0) {
+            *pOut = *pOut + *pIn1++ * w1;
+            *pOut = *pOut + *pIn2++ * w2;
+            *pOut = *pOut + *pIn3++ * w3;
+            *pOut = *pOut + *pIn4++ * w4;
+            pOut++;
+            blkCntSample--;
+        }
+        /* each pointer moved by vecDim above; skip the three vectors read through the other pointers */
+        pIn1 += 3 * vecDim;
+        pIn2 += 3 * vecDim;
+        pIn3 += 3 * vecDim;
+        pIn4 += 3 * vecDim;
+
+        blkCntVector--;
+    }
+
+    pIn = pIn1;                                     /* first vector the four-at-a-time loop did not consume */
+
+    blkCntVector = nbVectors & 3;                   /* Sum, part 2: the remaining 0..3 vectors, one at a time */
+    while (blkCntVector > 0) 
+    {
+        f16x8_t         inV, outV;
+
+        pOut = out;
+        w = *pW++;
+        accum += w;
+
+        blkCntSample = vecDim >> 3;                 /* groups of 8 samples */
+        while (blkCntSample > 0) 
+        {
+            outV = vld1q_f16(pOut);
+            inV = vld1q_f16(pIn);
+            outV = vfmaq(outV, inV, w);
+            vst1q_f16(pOut, outV);
+            pOut += 8;
+            pIn += 8;
+
+            blkCntSample--;
+        }
+
+        blkCntSample = vecDim & 7;                  /* leftover samples, one at a time */
+        while (blkCntSample > 0) 
+        {
+            *pOut = *pOut + *pIn++ * w;
+            pOut++;
+            blkCntSample--;
+        }
+
+        blkCntVector--;
+    }
+
+    /* Normalize: every output sample is multiplied by the reciprocal of the weight sum */
+    pOut = out;
+    accum = 1.0f / accum;                           /* from here on accum holds 1 / (sum of the weights) */
+
+    blkCntSample = vecDim >> 3;
+    while (blkCntSample > 0) 
+    {
+        f16x8_t         tmp;
+
+        tmp = vld1q((const float16_t *) pOut);
+        tmp = vmulq(tmp, accum);
+        vst1q(pOut, tmp);
+        pOut += 8;
+        blkCntSample--;
+    }
+
+    blkCntSample = vecDim & 7;
+    while (blkCntSample > 0) 
+    {
+        *pOut = *pOut * accum;
+        pOut++;
+        blkCntSample--;
+    }
+}
+#else
+void arm_barycenter_f16(const float16_t *in, const float16_t *weights, float16_t *out, uint32_t nbVectors,uint32_t vecDim)
+{
+
+   /*
+    * Weighted average of nbVectors vectors of vecDim samples each:
+    *
+    *    out[d] = (weights[0] * V0[d] + ... + weights[n-1] * Vn-1[d]) / (weights[0] + ... + weights[n-1])
+    *
+    * with n = nbVectors and d = 0 .. vecDim-1.
+    *
+    *    in         vectors stored back to back: vector k starts at in + k * vecDim
+    *    weights    one weight per vector
+    *    out        accumulator and result, vecDim samples
+    *
+    * 'out' is cleared first, accumulated into, and scaled last.
+    * There is no guard against the weights adding up to zero.
+    */
+   const float16_t *pSample = in;    /* Next input sample, advancing through all vectors */
+   float16_t weightSum = 0.0f;       /* Sum of the weights seen so far */
+   float16_t weight;                 /* Weight of the vector being accumulated */
+   uint32_t vecIdx;                  /* Index of the vector being accumulated */
+   uint32_t dim;                     /* Index of the component being processed */
+
+   /*
+    * Step 1: clear the accumulator
+    */
+   for (dim = 0; dim < vecDim; dim++)
+   {
+      out[dim] = 0.0f;
+   }
+
+   /*
+    * Step 2: add every vector, scaled by its weight, to the accumulator
+    */
+   for (vecIdx = 0; vecIdx < nbVectors; vecIdx++)
+   {
+      /* Fetch the weight of this vector and add it to the running total */
+      weight = weights[vecIdx];
+      weightSum += weight;
+
+      /* out[d] += V[d] * weight for every component of this vector */
+      for (dim = 0; dim < vecDim; dim++)
+      {
+         out[dim] = out[dim] + *pSample++ * weight;
+      }
+   }
+
+   /*
+    * Step 3: divide every component by the sum of the weights
+    * (a division per component; no reciprocal is precomputed)
+    */
+   for (dim = 0; dim < vecDim; dim++)
+   {
+      out[dim] = out[dim] / weightSum;
+   }
+
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of barycenter group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c
index 68dd8df41cbdfc01ebea4308c14cddda1987d5b8..817bb5883cb5d22fe61f6fd1acdfd0c2195d7c49 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_barycenter_f32.c
  * Description:  Barycenter
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,13 +26,13 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 #include <limits.h>
 #include <math.h>
 
 
 /**
-  @ingroup groupSupport
+  @ingroup barycenter
  */
 
 
@@ -408,5 +410,5 @@ void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
- * @} end of groupSupport group
+ * @} end of barycenter group
  */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c
index ca8a3f931037bc36e2c30628264f0a6da6613523..e9612b1bc9eda463d56057f279e9bde4d08fd459 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_bitonic_sort_f32.c
  * Description:  Floating point bitonic sort
  *
- * $Date:        2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 #include "arm_sorting.h"
 
 
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c
index 125799843e56eaa93d7854d620c93ded058c94e6..640778de4a69c82578fd7e381bbaa2efbbd70c45 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_bubble_sort_f32.c
  * Description:  Floating point bubble sort
  *
- * $Date:        2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 #include "arm_sorting.h"
 
 /**
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c b/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..d441332a828d9a7ca06d5ef4bb2f00de6148db8c
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c
@@ -0,0 +1,130 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_copy_f16.c
+ * Description:  Copies the elements of a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupSupport
+ */
+
+
+/**
+  @addtogroup copy
+  @{
+ */
+
+/**
+  @brief         Copies the elements of a f16 vector.
+  @param[in]     pSrc       points to input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_copy_f16(
+  const float16_t * pSrc,
+  float16_t * pDst,
+  uint32_t blockSize)
+{
+    do {
+        mve_pred16_t    p = vctp16q(blockSize);  /* predicate for the samples still to copy (at most 8 lanes) */
+        /* Load one vector of 8 f16 lanes under p and store it under the same predicate */
+        vstrhq_p_f16(pDst,
+        vldrhq_z_f16((float16_t const *) pSrc, p), p);
+        /*
+         * Advance the source and destination pointers by one vector (8 samples)
+         * and decrement the remaining sample count by the same amount
+         */
+        pSrc += 8;
+        pDst += 8;
+        blockSize -= 8;
+    }
+    while ((int32_t) blockSize > 0);  /* signed compare: the unsigned count wraps once fewer than 8 samples were left */
+}
+
+#else
+
+void arm_copy_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t remaining;                            /* Samples handled by the final loop */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  uint32_t quads;                                /* Number of four-sample groups */
+
+  /* Unrolled part: blockSize / 4 iterations,
+     each copying four consecutive samples */
+  for (quads = blockSize >> 2U; quads != 0U; quads--)
+  {
+    /* pDst[n] = pSrc[n] for the four samples of this group */
+    pDst[0] = pSrc[0];
+    pDst[1] = pSrc[1];
+    pDst[2] = pSrc[2];
+    pDst[3] = pSrc[3];
+
+    /* Step both pointers past the group just copied */
+    pSrc += 4;
+    pDst += 4;
+  }
+
+  /* Zero to three samples are left for the loop below */
+  remaining = blockSize & 0x3U;
+
+#else
+
+  /* No unrolling: every sample goes through the loop below */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* Copy the outstanding samples one at a time */
+  for (; remaining != 0U; remaining--)
+  {
+    /* pDst[n] = pSrc[n] */
+    *pDst = *pSrc;
+
+    /* Move on to the next sample */
+    pSrc++;
+    pDst++;
+  }
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of copy group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c
index 5ce3ec6109b80469dd42efe69e98a1256ada275d..d739f7cb5446d3014bcc4ef47ee48796bdfa7648 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_copy_f32.c
  * Description:  Copies the elements of a floating-point vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c b/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c
index d99f7250fba6077a26761ee639ea47d44e60c657..c657b5418631b77b791b8f67cd7ce2325917bb7b 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_copy_q15.c
  * Description:  Copies the elements of a Q15 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
@@ -44,7 +44,7 @@
   @param[in]     blockSize  number of samples in each vector
   @return        none
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_copy_q15(
   const q15_t * pSrc,
         q15_t * pDst,
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c b/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c
index 012113b5cfb11e8981ba4e62486bf9d443110a98..8e06bdac028491c4157a5740077a627d22cb0bcf 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_copy_q31.c
  * Description:  Copies the elements of a Q31 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
@@ -44,7 +44,7 @@
   @param[in]     blockSize  number of samples in each vector
   @return        none
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_copy_q31(
   const q31_t * pSrc,
         q31_t * pDst,
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c b/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c
index d56176f0e201abe3b445b07d622a34b02c81be7b..63f1cb947c624a63a8a480de3c317429b2d4ff42 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_copy_q7.c
  * Description:  Copies the elements of a Q7 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
@@ -44,7 +44,7 @@
   @param[in]     blockSize  number of samples in each vector
   @return        none
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_copy_q7(
   const q7_t * pSrc,
         q7_t * pDst,
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c b/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c
new file mode 100644
index 0000000000000000000000000000000000000000..a00435394fd808ed6e165f4df8152fc2a7cbe659
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c
@@ -0,0 +1,134 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_f16_to_float.c
+ * Description:  Converts the elements of the f16 vector to floating-point (f32) vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+ * @defgroup f16_to_x  Convert 16-bit floating point value
+ */
+
+/**
+  @addtogroup f16_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the f16 vector to f32 vector.
+  @param[in]     pSrc       points to the f16 input vector
+  @param[out]    pDst       points to the f32 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_f16_to_float built. Helium version has build issues with gcc."
+#endif 
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(__CMSIS_GCC_H)
+
+void arm_f16_to_float(
+  const float16_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    int32_t  blkCnt;           /* loop counters */
+    float16x8_t vecDst;        /* one vector of 8 f16 input samples */
+    float32x4x2_t tmp;         /* the same 8 samples widened to f32, as two 4-lane vectors */
+
+    blkCnt = blockSize >> 3;   /* number of complete groups of 8 samples */
+    while (blkCnt > 0)
+    {
+        vecDst = vldrhq_f16(pSrc);          /* load 8 consecutive f16 samples */
+        pSrc += 8;
+        /* widen the bottom (even) lanes into val[0] and the top (odd) lanes into val[1] */
+        tmp.val[0] = vcvtbq_f32_f16(vecDst);
+        tmp.val[1] = vcvttq_f32_f16(vecDst);
+        vst2q(pDst,tmp);       /* interleaving store: puts even/odd samples back in their original order */
+        
+        pDst += 8;
+        /*
+         * Decrement the count of 8-sample groups still to convert
+         */
+        blkCnt--;
+    }
+    /*
+     * tail: the remaining (blockSize & 7) samples, converted one per iteration
+     * (will be merged thru tail predication)
+     */
+    blkCnt = blockSize & 7;
+    while (blkCnt > 0)
+    {
+
+        *pDst++ = (float32_t) *pSrc++;
+        /*
+         * Decrement the loop counter
+         */
+        blkCnt--;
+    }
+}
+
+#else
+void arm_f16_to_float(
+  const float16_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t  idx;              /* index of the sample being converted */
+
+    /*
+     * Scalar path: widen each half-precision input sample to
+     * single precision, walking the vectors in ascending order
+     */
+    for (idx = 0U; idx < blockSize; idx++)
+    {
+        /*
+         * Read one f16 sample
+         */
+        const float16_t sample = pSrc[idx];
+        /*
+         * pDst[n] = (float32_t) pSrc[n]
+         */
+        pDst[idx] = (float32_t) sample;
+    }
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H) */
+
+/**
+  @} end of f16_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c b/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c
new file mode 100644
index 0000000000000000000000000000000000000000..5c8575a2fa44f5d9786f6db39a97f2022930d59b
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c
@@ -0,0 +1,157 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_f16_to_q15.c
+ * Description:  Converts the elements of the f16 vector to Q15 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup f16_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the f16 vector to Q15 vector.
+  @param[in]     pSrc       points to the f16 input vector
+  @param[out]    pDst       points to the Q15 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (q15_t)(pSrc[n] * 32768);   0 <= n < blockSize.
+  </pre>
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+
+  @note
+                   In order to apply rounding in scalar version, the library should be rebuilt with the ARM_MATH_ROUNDING macro
+                   defined in the preprocessor section of project options.
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_f16_to_q15(
+  const float16_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+    float16_t       maxQ = (float16_t) Q15_MAX;  /* scale factor: Q15_MAX converted to f16 */
+    float16x8_t         vecDst;                  /* 8 input samples, then the 8 scaled samples */
+
+
+    do {
+        mve_pred16_t    p = vctp16q(blockSize);  /* predicate for the samples still to convert (at most 8 lanes) */
+
+        vecDst = vldrhq_z_f16((float16_t const *) pSrc, p);
+        /* C = A * maxQ -- NOTE(review): the scalar path scales by 32768.0; confirm Q15_MAX is exactly representable in f16 */
+        /* scale in f16, convert the active lanes to 16-bit integers (vcvtaq_m) and store them under the same predicate */
+        vecDst = vmulq_m(vuninitializedq_f16(), vecDst, maxQ, p);
+
+        vstrhq_p_s16(pDst,
+            vcvtaq_m(vuninitializedq_s16(), vecDst, p), p);
+        /*
+         * Advance the source and destination pointers by one vector (8 samples)
+         * and decrement the remaining sample count by the same amount
+         */
+        pSrc += 8;
+        pDst += 8;
+        blockSize -= 8;
+    }
+    while ((int32_t) blockSize > 0);  /* signed compare: the unsigned count wraps once fewer than 8 samples were left */
+}
+
+#else
+
+void arm_f16_to_q15(
+  const float16_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+    const float16_t *pIn = pSrc;      /* Src pointer */
+    uint32_t  blkCnt;           /* loop counter */
+#ifdef ARM_MATH_ROUNDING
+    float32_t in;               /* f32 temporary: f16 cannot hold the scaled value or the +/-0.5 offset */
+#endif                          /*      #ifdef ARM_MATH_ROUNDING        */
+
+    /*
+     * Loop over blockSize number of values
+     */
+    blkCnt = blockSize;
+
+    while (blkCnt > 0U)
+    {
+
+#ifdef ARM_MATH_ROUNDING
+
+        /*
+         * C = A * 32768, rounded to nearest (ties away from zero)
+         */
+        /*
+         * widen to f32 first: scaling in f16 overflows for |A| >= 2 and misrounds the 0.5 offset
+         */
+        in = (float32_t) *pIn++;
+        in = (in * 32768.0f);
+        in += in > 0.0f ? 0.5f : -0.5f;
+        *pDst++ = clip_q31_to_q15((q31_t) (in));
+
+#else
+
+        /*
+         * C = A * 32768
+         */
+        /*
+         * scale, truncate toward zero to q31_t, then clip to Q15 with clip_q31_to_q15 and store
+         */
+        *pDst++ = clip_q31_to_q15((q31_t) (*pIn++ * 32768.0));
+
+#endif                          /*      #ifdef ARM_MATH_ROUNDING        */
+
+        /*
+         * Decrement the loop counter
+         */
+        blkCnt--;
+    }
+
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of f16_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c b/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..0b08f12c95b96fd01241655ed8980318a6d2b5a5
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c
@@ -0,0 +1,127 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_fill_f16.c
+ * Description:  Fills a constant value into a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupSupport
+ */
+
+
+/**
+  @addtogroup Fill
+  @{
+ */
+
+/**
+  @brief         Fills a constant value into a f16 vector.
+  @param[in]     value      input value to be filled
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_fill_f16(
+  float16_t value,
+  float16_t * pDst,
+  uint32_t blockSize)
+{
+     do {
+        mve_pred16_t    p = vctp16q(blockSize);  /* predicate for the samples still to fill (at most 8 lanes) */
+        /* Broadcast value into the active lanes and store them under the same predicate */
+        vstrhq_p_f16(pDst,
+            vdupq_m_n_f16(vuninitializedq_f16(), value, p), p);
+        /*
+         * Advance the destination pointer by one vector (8 samples)
+         * and decrement the remaining sample count by the same amount
+         */
+        pDst += 8;
+        blockSize -= 8;
+    }
+    while ((int32_t) blockSize > 0);  /* signed compare: the unsigned count wraps once fewer than 8 samples were left */
+}
+#else
+void arm_fill_f16(
+  float16_t value,
+  float16_t * pDst,
+  uint32_t blockSize)
+{
+  uint32_t remaining;                            /* Samples handled by the final loop */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  uint32_t quads;                                /* Number of four-sample groups */
+
+  /* Unrolled part: blockSize / 4 iterations,
+     each writing four consecutive samples */
+  for (quads = blockSize >> 2U; quads != 0U; quads--)
+  {
+    /* pDst[n] = value for the four samples of this group */
+    pDst[0] = value;
+    pDst[1] = value;
+    pDst[2] = value;
+    pDst[3] = value;
+
+    /* Step the destination pointer past the group just written */
+    pDst += 4;
+  }
+
+  /* Zero to three samples are left for the loop below */
+  remaining = blockSize & 0x3U;
+
+#else
+
+  /* No unrolling: every sample goes through the loop below */
+  remaining = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /*
+   * Write the outstanding samples one at a time
+   */
+  for (; remaining != 0U; remaining--)
+  {
+    /* pDst[n] = value */
+    *pDst = value;
+
+    /* Move on to the next sample */
+    pDst++;
+  }
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of Fill group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c
index 2c40773ef14082b6db81f33fdcdc6595984d9670..50cdd8fc78ea206ee14ecd319c834c00266991f7 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_fill_f32.c
  * Description:  Fills a constant value into a floating-point vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c b/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c
index 0ffba933551133d733b24cc266f9e7205894ac84..997a728dc18b4f806a358a1bc6751a3664b05218 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_fill_q15.c
  * Description:  Fills a constant value into a Q15 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
@@ -44,7 +44,7 @@
   @param[in]     blockSize  number of samples in each vector
   @return        none
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_fill_q15(
   q15_t value,
   q15_t * pDst,
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c b/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c
index c5200fb8d04390fdf9c18ddabe613cd28a7ed16c..7da5fb6ac3b9b7cb24b9829fdfc698aab9d4cade 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_fill_q31.c
  * Description:  Fills a constant value into a Q31 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
@@ -44,7 +44,7 @@
   @param[in]     blockSize  number of samples in each vector
   @return        none
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_fill_q31(
         q31_t value,
         q31_t * pDst,
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c b/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c
index 4614db5e9227d5f624fc1a3bd65710458e2e353f..830fc7324954da0b63a668454abb3b6bb3265450 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_fill_q7.c
  * Description:  Fills a constant value into a Q7 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
@@ -44,7 +44,7 @@
   @param[in]     blockSize  number of samples in each vector
   @return        none
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_fill_q7(
   q7_t value,
   q7_t * pDst,
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c b/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..d627a8ada5afe87bc123475f69364ae2d1e7d5bf
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c
@@ -0,0 +1,131 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_float_to_f16.c
+ * Description:  Converts the elements of the floating-point (f32) vector to f16 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup float_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the floating-point vector to f16 vector.
+  @param[in]     pSrc       points to the f32 input vector
+  @param[out]    pDst       points to the f16 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_float_to_f16 built. Helium version has build issues with gcc."
+#endif 
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(__CMSIS_GCC_H)
+
+void arm_float_to_f16(
+  const float32_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    int32_t  blkCnt;           /* loop counters */
+    float32x4x2_t tmp;
+    float16x8_t vecDst = vuninitializedq_f16();  /* both halves are overwritten before every store */
+    float32_t const *pSrcVec;
+
+
+    pSrcVec = (float32_t const *) pSrc;
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0)
+    {
+        /* de-interleaving load of 8 f32 samples: val[0] = even samples, val[1] = odd samples */
+        tmp = vld2q(pSrcVec);   pSrcVec += 8;
+        /* narrow / merge: even samples into the bottom f16 lanes, odd samples into the top lanes */
+        vecDst = vcvtbq_f16_f32(vecDst, tmp.val[0]);
+        vecDst = vcvttq_f16_f32(vecDst, tmp.val[1]);
+        vst1q(pDst, vecDst);    pDst += 8;
+        /*
+         * Decrement the count of 8-sample groups still to convert
+         */
+        blkCnt--;
+    }
+
+    /*
+     * tail: fewer than 8 samples remain, so a full vld2q would read
+     * past the end of pSrc; convert the leftovers one at a time
+     */
+    blkCnt = blockSize & 7;
+    while (blkCnt > 0)
+    {
+        /* pDst[n] = (float16_t) pSrc[n] */
+        *pDst++ = (float16_t) *pSrcVec++;
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+}
+
+#else
+
+void arm_float_to_f16(
+  const float32_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t  idx;              /* index of the sample being converted */
+
+    /*
+     * Scalar path: narrow each single-precision input sample to
+     * half precision, walking the vectors in ascending order
+     */
+    for (idx = 0U; idx < blockSize; idx++)
+    {
+        /*
+         * Read one f32 sample
+         */
+        const float32_t sample = pSrc[idx];
+        /*
+         * pDst[n] = (float16_t) pSrc[n]
+         */
+        pDst[idx] = (float16_t) sample;
+    }
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H) */
+
+/**
+  @} end of float_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c b/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c
index 76736d5bf4e4ec97fb9ee2ec8a4428765eb8af8b..8061c9f2cc682fa88c65826704aa69c7d5c8f968 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_float_to_q15.c
  * Description:  Converts the elements of the floating-point vector to Q15 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c b/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c
index a4580f2ba47e0e75a29cb1e17fba3a94b85d3ef3..a222e499c627adbada08ba4b992dbc8acdf27c76 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_float_to_q31.c
  * Description:  Converts the elements of the floating-point vector to Q31 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c b/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c
index 80eb294a36dbfff696fdf9bf2ce83cb930e17a7a..27af5206988dfc7cc47e830baf87bda06249362a 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_float_to_q7.c
  * Description:  Converts the elements of the floating-point vector to Q7 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c
index 3fc594afe733f29622d8c94a0663f62df0e8edb3..5a46caa3b5b452fbbf63e09eb7c45ee959f1a44a 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_heap_sort_f32.c
  * Description:  Floating point heap sort
  *
- * $Date:        2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 #include "arm_sorting.h"
 
 
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c
index f253d72b79537bc17cf24c4e5c1d6a4ac854fa7e..4e85043864187eff057c5238719d6eb4c0b13816 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_insertion_sort_f32.c
  * Description:  Floating point insertion sort
  *
- * $Date:        2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 #include "arm_sorting.h"
 
 /**
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c
index 2e026a3935a7e8b7bb6c2b856adc3e8525a490b4..5c212013b6c4127178d9f22d34b7f3b5575dd6cb 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_merge_sort_f32.c
  * Description:  Floating point merge sort
  *
- * $Date:        2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 #include "arm_sorting.h"
 
 
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c
index 31431f315eb9db624c38487b65950abb6eaa0058..bd93a00fb3566d16b5d4e15b215299df8a343f1b 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_merge_sort_init_f32.c
  * Description:  Floating point merge sort initialization function
  *
- * $Date:        2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c b/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..960af8fa00dd2adb14d9d19ca6457cb78855eb0e
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c
@@ -0,0 +1,155 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_q15_to_f16.c
+ * Description:  Converts the elements of the Q15 vector to f16 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+ * @defgroup q15_to_x  Convert 16-bit Integer value
+ */
+
+/**
+  @addtogroup q15_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the Q15 vector to f16 vector.
+  @param[in]     pSrc       points to the Q15 input vector
+  @param[out]    pDst       points to the f16 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (float16_t) pSrc[n] / 32768;   0 <= n < blockSize.
+  </pre>
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_q15_to_f16(
+  const q15_t * pSrc,
+  float16_t * pDst,
+  uint32_t blockSize)
+{
+    int32_t  blkCnt;           /* loop counters */
+    q15x8_t vecDst;            /* 8 Q15 samples loaded from the source */
+    q15_t const *pSrcVec;      /* read cursor over the source vector */
+
+    pSrcVec = (q15_t const *) pSrc;
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0)
+    {
+        /* C = (float16_t) A / 32768 */
+        /* convert 8 q15 samples to f16 (15 fractional bits) and store them in the destination buffer */
+        vecDst = vld1q(pSrcVec);   pSrcVec += 8;
+        vstrhq(pDst, vcvtq_n_f16_s16(vecDst, 15));  pDst += 8;
+        /*
+         * Decrement the block loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail: the remaining (blockSize & 7) samples.
+     * Load and store are both predicated so no lane beyond blockSize is touched.
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecDst = vld1q_z(pSrcVec, p0);   /* inactive lanes read as zero, not from memory */
+        vstrhq_p(pDst, vcvtq_n_f16_s16(vecDst, 15), p0);
+    }
+}
+#else
+
+void arm_q15_to_f16(
+  const q15_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Remaining iterations of the current loop */
+  const q15_t *pIn = pSrc;                             /* Read cursor over the Q15 input */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: each pass of this loop converts 4 samples */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (float16_t) A / 32768 */
+
+    /* Divide each Q15 sample by 32768 and store the result as f16 in the destination buffer */
+    *pDst++ = ((float16_t) * pIn++ / 32768.0f);
+    *pDst++ = ((float16_t) * pIn++ / 32768.0f);
+    *pDst++ = ((float16_t) * pIn++ / 32768.0f);
+    *pDst++ = ((float16_t) * pIn++ / 32768.0f);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: the 0..3 samples left over are converted one at a time below */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* No unrolling: the loop below converts every sample */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (float16_t) A / 32768 */
+
+    /* Divide one Q15 sample by 32768 and store the result as f16 in the destination buffer */
+    *pDst++ = ((float16_t) *pIn++ / 32768.0f);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of q15_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c b/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c
index 6f2927bac1e8962ea44801b2ece8fa2668feea6e..2dded65b2f0157302eeb223dd26c7698c41e6bb6 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c
@@ -3,13 +3,13 @@
  * Title:        arm_q15_to_float.c
  * Description:  Converts the elements of the Q15 vector to floating-point vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c b/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c
index a33bcb7b913e73ad4305af2976e31564243c767d..468b997b29cec2f79e36f7242268eb173c62ab5e 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_q15_to_q31.c
  * Description:  Converts the elements of the Q15 vector to Q31 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
@@ -50,7 +50,7 @@
       pDst[n] = (q31_t) pSrc[n] << 16;   0 <= n < blockSize.
   </pre>
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_q15_to_q31(
   const q15_t * pSrc,
         q31_t * pDst,
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c b/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c
index 023410af560a89ae931838d468623742157cb49a..8fbd4710c4f1cf1fc011adaa869f936b590dd0f6 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_q15_to_q7.c
  * Description:  Converts the elements of the Q15 vector to Q7 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
@@ -50,7 +50,7 @@
       pDst[n] = (q7_t) pSrc[n] >> 8;   0 <= n < blockSize.
   </pre>
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_q15_to_q7(
   const q15_t * pSrc,
         q7_t * pDst,
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c b/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c
index 0598b3bb716863c872b2ec1fc7e58aa7b441069c..32ff400ca63984d419d4b33940c6b7603d07a859 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c
@@ -3,13 +3,13 @@
  * Title:        arm_q31_to_float.c
  * Description:  Converts the elements of the Q31 vector to floating-point vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c b/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c
index b3eeaca9b6bf8056f6b067cc8b3583f7d967303d..a25a2bb1c7016059026c1f2e6f8628d1b00b3bee 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_q31_to_q15.c
  * Description:  Converts the elements of the Q31 vector to Q15 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
@@ -50,7 +50,7 @@
       pDst[n] = (q15_t) pSrc[n] >> 16;   0 <= n < blockSize.
   </pre>
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_q31_to_q15(
   const q31_t * pSrc,
         q15_t * pDst,
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c b/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c
index 870163e8eb40a746e9ae2be84361bb0744a3991c..16fbe1861565599d3f32f032d5ae9daf8c276fbe 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c
@@ -3,13 +3,13 @@
  * Title:        arm_q31_to_q7.c
  * Description:  Converts the elements of the Q31 vector to Q7 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
@@ -50,7 +50,7 @@
       pDst[n] = (q7_t) pSrc[n] >> 24;   0 <= n < blockSize.
   </pre>
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_q31_to_q7(
   const q31_t * pSrc,
         q7_t * pDst,
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c b/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c
index 8c844aa6e2c3e6261f8bb979572b0789a177cc73..258309ea8a3df750c237151021016a1feccd1a5b 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c
@@ -3,13 +3,13 @@
  * Title:        arm_q7_to_float.c
  * Description:  Converts the elements of the Q7 vector to floating-point vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c b/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c
index 71f392c2a93465c25bc199871ba17f0903897c84..afe3e7995e4ae4b6465cc276e145a2d8506723e0 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_q7_to_q15.c
  * Description:  Converts the elements of the Q7 vector to Q15 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
@@ -51,7 +51,7 @@
   </pre>
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_q7_to_q15(
   const q7_t * pSrc,
         q15_t * pDst,
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c b/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c
index 69ad0b829d3e370c7561a2286b7d46a2c964b2e9..f48affdbc43c6f5630371c3c8867fa0c6f353dc0 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_q7_to_q31.c
  * Description:  Converts the elements of the Q7 vector to Q31 vector
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/support_functions.h"
 
 /**
   @ingroup groupSupport
@@ -50,7 +50,7 @@
       pDst[n] = (q31_t) pSrc[n] << 24;   0 <= n < blockSize.
   </pre>
  */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_q7_to_q31(
   const q7_t * pSrc,
         q31_t * pDst,
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c
index 7c2fa2403aa60843df29e5e8dd992a86872aeafe..c6c44d966ca3a31938cdb95b1cc0d74a8e4e4bfb 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_quick_sort_f32.c
  * Description:  Floating point quick sort
  *
- * $Date:        2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,6 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
 #include "arm_sorting.h"
 
 static uint32_t arm_quick_sort_partition_f32(float32_t *pSrc, int32_t first, int32_t last, uint8_t dir)
@@ -144,7 +143,7 @@ static void arm_quick_sort_core_f32(float32_t *pSrc, int32_t first, int32_t last
    * @par
    *                In this implementation the Hoare partition scheme has been 
    *                used [Hoare, C. A. R. (1 January 1962). "Quicksort". The Computer
-   *                Journal. 5 (1): 10–16.] The first element has always been chosen
+   *                Journal. 5 (1): 10...16.] The first element has always been chosen
    *                as the pivot. The partition algorithm guarantees that the returned
    *                pivot is never placed outside the vector, since it is returned only 
    *                when the pointers crossed each other. In this way it isn't 
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c
index 9819570c278f62a935e214c96724162dabbc0323..ad534bc0aaf33638df905892a34ff833a9001b46 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_selection_sort_f32.c
  * Description:  Floating point selection sort
  *
- * $Date:        2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,6 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
 #include "arm_sorting.h"
 
 /**
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c
index 1e73f5a786d46babe7d48c3f8cdebc9a435e72c4..43786dcbd69269dac0e4ec0b87a0a17f773b59ce 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_sort_f32.c
  * Description:  Floating point sort
  *
- * $Date:        2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,6 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
 #include "arm_sorting.h"
 
 /**
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c
index 264b5c3804b9a083d3d5ab0b5cc9da1419eb858c..72ad9c55755468f7f7453e825ac064ae6c59482e 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_sort_init_f32.c
  * Description:  Floating point sort initialization function
  *
- * $Date:        2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,6 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
 #include "arm_sorting.h"
 
 /**
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c b/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..f9aae6b03c3e124ccdb690fbae21353324e6f287
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c
@@ -0,0 +1,146 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_weighted_sum_f16.c
+ * Description:  Weighted Sum
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @defgroup weightedsum Weighted Sum
+
+  Weighted sum of values
+ */
+
+
+/**
+ * @addtogroup weightedsum
+ * @{
+ */
+
+
+/**
+ * @brief Weighted sum
+ *
+ *
+ * @param[in]    *in           Array of input values.
+ * @param[in]    *weigths      Weights
+ * @param[in]    blockSize     Number of samples in the input array.
+ * @return       Weighted sum
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+float16_t arm_weighted_sum_f16(const float16_t *in,const float16_t *weigths, uint32_t blockSize)
+{
+    _Float16       accum1, accum2;     /* sum of in[i] * weigths[i], and sum of weigths[i] */
+    float16x8_t    accum1V, accum2V;   /* per-lane partial sums of the same two quantities */
+    float16x8_t    inV, wV;
+    const float16_t *pIn, *pW;
+    uint32_t        blkCnt;
+
+    /* Returns sum(in[i] * weigths[i]) / sum(weigths[i]); a zero weight sum is not checked. */
+    pIn = in;
+    pW = weigths;
+
+    /* Start both lane-wise accumulators at zero. */
+    accum1V = vdupq_n_f16(0.0f16);
+    accum2V = vdupq_n_f16(0.0f16);
+
+    blkCnt = blockSize >> 3;           /* number of full 8-sample vectors */
+    while (blkCnt > 0)
+    {
+        inV = vld1q(pIn);
+        wV = vld1q(pW);
+        /* A float16x8_t load consumes 8 samples: advance by 8 so vectors do not overlap. */
+        pIn += 8;
+        pW += 8;
+
+        accum1V = vfmaq(accum1V, inV, wV);
+        accum2V = vaddq(accum2V, wV);
+        blkCnt--;
+    }
+    /* Reduce the lane-wise partial sums to scalars. */
+    accum1 = vecAddAcrossF16Mve(accum1V);
+    accum2 = vecAddAcrossF16Mve(accum2V);
+
+    blkCnt = blockSize & 7;            /* 0..7 samples left after the vector loop */
+    while(blkCnt > 0)
+    {
+        accum1 += (_Float16)*pIn++ * (_Float16)*pW;
+        accum2 += (_Float16)*pW++;
+        blkCnt--;
+    }
+
+    /* Normalize the weighted total by the total weight. */
+    return (accum1 / accum2);
+}
+
+#else
+
+float16_t arm_weighted_sum_f16(const float16_t *in, const float16_t *weigths, uint32_t blockSize)
+{
+    /* Returns sum(in[i] * weigths[i]) / sum(weigths[i]) over blockSize samples. */
+    _Float16 accum1, accum2;
+    const float16_t *pIn, *pW;
+    uint32_t blkCnt;
+
+    /* Read cursors over the inputs and the weights. */
+    pIn = in;
+    pW = weigths;
+    /* accum1: running sum of in * weight; accum2: running sum of the weights. */
+    accum1=0.0f16;
+    accum2=0.0f16;
+
+    blkCnt = blockSize;
+    while(blkCnt > 0)
+    {
+        accum1 += (_Float16)*pIn++ * (_Float16)*pW;
+        accum2 += (_Float16)*pW++;
+        blkCnt--;
+    }
+    /* No check is made for a zero weight sum (including blockSize == 0) before dividing. */
+    return(accum1 / accum2);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of weightedsum group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c
index ac3d4819eff39d375a4c10ce1e6f6fbfff3cd034..dd378615227d89e52ad86ed6efa2c77882b24f02 100644
--- a/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c
@@ -3,11 +3,13 @@
  * Title:        arm_weighted_sum_f32.c
  * Description:  Weighted Sum
  *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
  * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -24,13 +26,13 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
 #include <limits.h>
 #include <math.h>
 
+#include "dsp/support_functions.h"
 
 /**
- * @addtogroup groupSupport
+ * @addtogroup weightedsum
  * @{
  */
 
@@ -181,5 +183,5 @@ float32_t arm_weighted_sum_f32(const float32_t *in, const float32_t *weigths, ui
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
- * @} end of groupSupport group
+ * @} end of weightedsum group
  */
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c b/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c
index c60812938a52cfbe680b08a8dd3be735a52587ba..687a9e86a7c654d5affbfcb63148288acbb2e35f 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c
@@ -3,13 +3,13 @@
  * Title:        arm_bitreversal.c
  * Description:  Bitreversal functions
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,9 +26,10 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
+
 /**
   @brief         In-place floating-point bit reversal function.
   @param[in,out] pSrc         points to in-place floating-point data buffer
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.c b/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.c
index 802129d56817ae31e8e6837dd26c62b6791e37cc..77fac1f8347633539bace225bbb2456c35deff72 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.c
@@ -3,10 +3,10 @@
  * Title:        arm_bitreversal2.c
  * Description:  Bitreversal functions
  *
- * $Date:        18. March 2019
- * $Revision:    V1.0.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
  * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal_f16.c b/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..77b15a790615c43b85c576b91f22e77737a99fdd
--- /dev/null
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal_f16.c
@@ -0,0 +1,102 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_bitreversal_f16.c
+ * Description:  Bitreversal functions
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+
+/*
+* @brief  In-place bit reversal function.
+* @param[in, out] *pSrc        points to the in-place buffer of half-precision floating-point (float16_t) data.
+* @param[in]      fftSize      length of the FFT.
+* @param[in]      bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table.
+* @param[in]      *pBitRevTab  points to the bit reversal table.
+* @return none.
+*/
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+void arm_bitreversal_f16(
+float16_t * pSrc,
+uint16_t fftSize,
+uint16_t bitRevFactor,
+const uint16_t * pBitRevTab)
+{
+   uint16_t fftLenBy2, fftLenBy2p1;
+   uint16_t i, j;
+   float16_t in;
+
+   /*  Initializations: j is the index read from the bit reversal table, starting at 0 */
+   j = 0U;
+   fftLenBy2 = fftSize >> 1U;
+   fftLenBy2p1 = (fftSize >> 1U) + 1U;
+
+   /* Bit Reversal Implementation: "pair k" below means pSrc[2k] and pSrc[2k+1] */
+   for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U) /* NOTE(review): (fftLenBy2 - 2U) is unsigned and wraps if fftSize < 4 - assumes callers never pass that */
+   {
+      if (i < j)
+      {
+         /*  pair i <-> pair j, first value */
+         in = pSrc[2U * i];
+         pSrc[2U * i] = pSrc[2U * j];
+         pSrc[2U * j] = in;
+
+         /*  pair i <-> pair j, second value */
+         in = pSrc[(2U * i) + 1U];
+         pSrc[(2U * i) + 1U] = pSrc[(2U * j) + 1U];
+         pSrc[(2U * j) + 1U] = in;
+
+         /*  pair (i+fftLenBy2p1) <-> pair (j+fftLenBy2p1), first value */
+         in = pSrc[2U * (i + fftLenBy2p1)];
+         pSrc[2U * (i + fftLenBy2p1)] = pSrc[2U * (j + fftLenBy2p1)];
+         pSrc[2U * (j + fftLenBy2p1)] = in;
+
+         /*  pair (i+fftLenBy2p1) <-> pair (j+fftLenBy2p1), second value */
+         in = pSrc[(2U * (i + fftLenBy2p1)) + 1U];
+         pSrc[(2U * (i + fftLenBy2p1)) + 1U] =
+         pSrc[(2U * (j + fftLenBy2p1)) + 1U];
+         pSrc[(2U * (j + fftLenBy2p1)) + 1U] = in;
+
+      }
+
+      /*  pair (i+1U) <-> pair (j+fftLenBy2), first value; done on every pass, not only when i < j */
+      in = pSrc[2U * (i + 1U)];
+      pSrc[2U * (i + 1U)] = pSrc[2U * (j + fftLenBy2)];
+      pSrc[2U * (j + fftLenBy2)] = in;
+
+      /*  pair (i+1U) <-> pair (j+fftLenBy2), second value */
+      in = pSrc[(2U * (i + 1U)) + 1U];
+      pSrc[(2U * (i + 1U)) + 1U] = pSrc[(2U * (j + fftLenBy2)) + 1U];
+      pSrc[(2U * (j + fftLenBy2)) + 1U] = in;
+
+      /*  Reading the index for the bit reversal (used by the next pass) */
+      j = *pBitRevTab;
+
+      /*  Updating the bit reversal index depending on the fft length  */
+      pBitRevTab += bitRevFactor;
+   }
+}
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f16.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..239e0031eddd4b1f625facec41bba0a76e57a50f
--- /dev/null
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f16.c
@@ -0,0 +1,842 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_f16.c
+ * Description:  Combined Radix Decimation in Frequency CFFT Floating point processing function
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+#include "arm_common_tables_f16.h"
+
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_fft.h"
+#include "arm_mve_tables_f16.h"
+
+
+static float16_t arm_inverse_fft_length_f16(uint16_t fftLen)
+{
+  float16_t retValue=1.0;
+  /* Look up 1/fftLen for the power-of-two lengths 16..4096; every constant below is exactly 1/fftLen. */
+  switch (fftLen)
+  {
+
+  case 4096U:
+    retValue = (float16_t)0.000244140625f;
+    break;
+
+  case 2048U:
+    retValue = (float16_t)0.00048828125f;
+    break;
+
+  case 1024U:
+    retValue = (float16_t)0.0009765625f;
+    break;
+
+  case 512U:
+    retValue = (float16_t)0.001953125f;
+    break;
+
+  case 256U:
+    retValue = (float16_t)0.00390625f;
+    break;
+
+  case 128U:
+    retValue = (float16_t)0.0078125f;
+    break;
+
+  case 64U:
+    retValue = (float16_t)0.015625f;
+    break;
+
+  case 32U:
+    retValue = (float16_t)0.03125f;
+    break;
+
+  case 16U:
+    retValue = (float16_t)0.0625f;
+    break;
+
+  /* Any other length keeps the initial value 1.0 (no scaling). */
+  default:
+    break;
+  }
+  return(retValue);
+}
+
+
+static void _arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen)
+{
+    f16x8_t vecTmp0, vecTmp1;
+    f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
+    f16x8_t vecA, vecB, vecC, vecD;
+    uint32_t  blkCnt;
+    uint32_t  n1, n2;
+    uint32_t  stage = 0;
+    int32_t  iter = 1;
+    static const uint32_t strides[4] = /* offsets added to pSrc for the last-stage gathers; NOTE(review): scaled by a pointer size, presumably 4 on the target - confirm */
+       {(0 - 16) * sizeof(float16_t *)
+       , (4 - 16) * sizeof(float16_t *)
+       , (8 - 16) * sizeof(float16_t *)
+       , (12 - 16) * sizeof(float16_t *)};
+    /* In-place radix-4 butterflies on pSrc: twiddled stages using the rearranged tables in S, then a twiddle-free last stage. */
+    n2 = fftLen;
+    n1 = n2;
+    n2 >>= 2u;
+    for (int k = fftLen / 4u; k > 1; k >>= 2) /* every stage except the last one */
+    {
+        for (int i = 0; i < iter; i++) /* iter groups per stage; iter is multiplied by 4 after each stage */
+        {
+            float16_t const     *p_rearranged_twiddle_tab_stride1 =
+                                &S->rearranged_twiddle_stride1[
+                                S->rearranged_twiddle_tab_stride1_arr[stage]];
+            float16_t const     *p_rearranged_twiddle_tab_stride2 =
+                                &S->rearranged_twiddle_stride2[
+                                S->rearranged_twiddle_tab_stride2_arr[stage]];
+            float16_t const     *p_rearranged_twiddle_tab_stride3 =
+                                &S->rearranged_twiddle_stride3[
+                                S->rearranged_twiddle_tab_stride3_arr[stage]];
+            float16_t const    *pW1, *pW2, *pW3;
+            float16_t           *inA = pSrc + CMPLX_DIM * i * n1;
+            float16_t           *inB = inA + n2 * CMPLX_DIM;
+            float16_t           *inC = inB + n2 * CMPLX_DIM;
+            float16_t           *inD = inC + n2 * CMPLX_DIM;
+            f16x8_t            vecW;
+
+
+            pW1 = p_rearranged_twiddle_tab_stride1;
+            pW2 = p_rearranged_twiddle_tab_stride2;
+            pW3 = p_rearranged_twiddle_tab_stride3;
+
+            blkCnt = n2 / 4;
+            /*
+             * pre-load 8 f16 values (4 complex samples) from A and C
+             */
+            vecA = vldrhq_f16(inA);
+            vecC = vldrhq_f16(inC);
+            while (blkCnt > 0U)
+            {
+                vecB = vldrhq_f16(inB);
+                vecD = vldrhq_f16(inD);
+
+                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
+                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */
+
+                vecSum1 = vecB + vecD;
+                vecDiff1 = vecB - vecD;
+                /*
+                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
+                 */
+                vecTmp0 = vecSum0 + vecSum1;
+                vst1q(inA, vecTmp0);
+                inA += 8;
+
+                /*
+                 * [ 1 -1 1 -1 ] * [ A B C D ]'
+                 */
+                vecTmp0 = vecSum0 - vecSum1;
+                /*
+                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
+                 */
+                vecW = vld1q(pW2);
+                pW2 += 8;
+                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
+                vst1q(inB, vecTmp1);
+                inB += 8;
+
+                /*
+                 * [ 1 -i -1 +i ] * [ A B C D ]'
+                 */
+                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
+                /*
+                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
+                 */
+                vecW = vld1q(pW1);
+                pW1 +=8;
+                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
+                vst1q(inC, vecTmp1);
+                inC += 8;
+
+                /*
+                 * [ 1 +i -1 -i ] * [ A B C D ]'
+                 */
+                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
+                /*
+                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
+                 */
+                vecW = vld1q(pW3);
+                pW3 += 8;
+                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
+                vst1q(inD, vecTmp1);
+                inD += 8;
+
+                vecA = vldrhq_f16(inA); /* pre-load A and C for the next pass */
+                vecC = vldrhq_f16(inC);
+
+                blkCnt--;
+            }
+        }
+        n1 = n2;
+        n2 >>= 2u;
+        iter = iter << 2;
+        stage++;
+    }
+
+    /*
+     * start of Last stage process (no twiddle multiplication)
+     */
+    uint32x4_t vecScGathAddr = vld1q_u32(strides);
+    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
+
+    /* load scheduling */
+    vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
+    vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
+
+    blkCnt = (fftLen >> 4);
+    while (blkCnt > 0U)
+    {
+        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
+        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */
+
+        vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4);
+        vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12);
+
+        vecSum1 = vecB + vecD;
+        vecDiff1 = vecB - vecD;
+
+        /* pre-load for next iteration; NOTE(review): also issued on the final pass, where it appears to address data past the last block - confirm this is acceptable */
+        vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
+        vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
+
+        vecTmp0 = vecSum0 + vecSum1;
+        vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0);
+
+        vecTmp0 = vecSum0 - vecSum1;
+        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0);
+
+        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
+        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0);
+
+        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
+        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0);
+
+        blkCnt--;
+    }
+
+    /*
+     * End of last stage process
+     */
+}
+
+static void arm_cfft_radix4by2_f16_mve(const arm_cfft_instance_f16 * S, float16_t *pSrc, uint32_t fftLen)
+{
+    float16_t const *pCoefVec;
+    float16_t const  *pCoef = S->pTwiddle;
+    float16_t        *pIn0, *pIn1;
+    uint32_t          n2;
+    uint32_t          blkCnt;
+    f16x8_t         vecIn0, vecIn1, vecSum, vecDiff;
+    f16x8_t         vecCmplxTmp, vecTw;
+
+    /* One in-place radix-2 pass over pSrc, then a radix-4 butterfly of length n2 on each half. */
+    n2 = fftLen >> 1;
+    pIn0 = pSrc;
+    pIn1 = pSrc + fftLen;
+    pCoefVec = pCoef;
+    /* Each pass handles 8 f16 values per half: sum goes to the first half, twiddled difference to the second. */
+    blkCnt = n2 / 4;
+    while (blkCnt > 0U)
+    {
+        vecIn0 = *(f16x8_t *) pIn0;
+        vecIn1 = *(f16x8_t *) pIn1;
+        vecTw = vld1q(pCoefVec);
+        pCoefVec += 8;
+
+        vecSum = vaddq(vecIn0, vecIn1);
+        vecDiff = vsubq(vecIn0, vecIn1);
+
+        vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);
+
+        vst1q(pIn0, vecSum);
+        pIn0 += 8;
+        vst1q(pIn1, vecCmplxTmp);
+        pIn1 += 8;
+
+        blkCnt--;
+    }
+
+    _arm_radix4_butterfly_f16_mve(S, pSrc, n2);
+
+    _arm_radix4_butterfly_f16_mve(S, pSrc + fftLen, n2);
+
+    /* The former trailing "pIn0 = pSrc;" was a dead store (never read) and has been dropped. */
+}
+
+static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen, float16_t onebyfftLen)
+{
+    f16x8_t vecTmp0, vecTmp1;
+    f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
+    f16x8_t vecA, vecB, vecC, vecD;
+    f16x8_t vecW;
+    uint32_t  blkCnt;
+    uint32_t  n1, n2;
+    uint32_t  stage = 0;
+    int32_t  iter = 1;
+    static const uint32_t strides[4] = { /* offsets added to pSrc for the last-stage gathers; NOTE(review): scaled by a pointer size (q31_t * here, float16_t * in the forward routine) - confirm */
+        (0 - 16) * sizeof(q31_t *),
+        (4 - 16) * sizeof(q31_t *),
+        (8 - 16) * sizeof(q31_t *),
+        (12 - 16) * sizeof(q31_t *)
+    };
+    /* In-place inverse radix-4 butterflies on pSrc; the last stage also multiplies every output by onebyfftLen. */
+    n2 = fftLen;
+    n1 = n2;
+    n2 >>= 2u;
+    for (int k = fftLen / 4; k > 1; k >>= 2) /* every stage except the last one */
+    {
+        for (int i = 0; i < iter; i++) /* iter groups per stage; iter is multiplied by 4 after each stage */
+        {
+            float16_t const *p_rearranged_twiddle_tab_stride1 =
+                    &S->rearranged_twiddle_stride1[
+                    S->rearranged_twiddle_tab_stride1_arr[stage]];
+            float16_t const *p_rearranged_twiddle_tab_stride2 =
+                    &S->rearranged_twiddle_stride2[
+                    S->rearranged_twiddle_tab_stride2_arr[stage]];
+            float16_t const *p_rearranged_twiddle_tab_stride3 =
+                    &S->rearranged_twiddle_stride3[
+                    S->rearranged_twiddle_tab_stride3_arr[stage]];
+            float16_t const *pW1, *pW2, *pW3;
+            float16_t *inA = pSrc + CMPLX_DIM * i * n1;
+            float16_t *inB = inA + n2 * CMPLX_DIM;
+            float16_t *inC = inB + n2 * CMPLX_DIM;
+            float16_t *inD = inC + n2 * CMPLX_DIM;
+
+            pW1 = p_rearranged_twiddle_tab_stride1;
+            pW2 = p_rearranged_twiddle_tab_stride2;
+            pW3 = p_rearranged_twiddle_tab_stride3;
+
+            blkCnt = n2 / 4;
+            /*
+             * pre-load 8 f16 values (4 complex samples) from A and C
+             */
+            vecA = vldrhq_f16(inA);
+            vecC = vldrhq_f16(inC);
+            while (blkCnt > 0U)
+            {
+                vecB = vldrhq_f16(inB);
+                vecD = vldrhq_f16(inD);
+
+                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
+                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */
+
+                vecSum1 = vecB + vecD;
+                vecDiff1 = vecB - vecD;
+                /*
+                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
+                 */
+                vecTmp0 = vecSum0 + vecSum1;
+                vst1q(inA, vecTmp0);
+                inA += 8;
+                /*
+                 * [ 1 -1 1 -1 ] * [ A B C D ]'
+                 */
+                vecTmp0 = vecSum0 - vecSum1;
+                /*
+                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
+                 */
+                vecW = vld1q(pW2);
+                pW2 += 8;
+                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
+                vst1q(inB, vecTmp1);
+                inB += 8;
+
+                /*
+                 * [ 1 +i -1 -i ] * [ A B C D ]' (sign of i is opposite to the forward routine)
+                 */
+                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
+                /*
+                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W1
+                 */
+                vecW = vld1q(pW1);
+                pW1 += 8;
+                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
+                vst1q(inC, vecTmp1);
+                inC += 8;
+
+                /*
+                 * [ 1 -i -1 +i ] * [ A B C D ]'
+                 */
+                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
+                /*
+                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W3
+                 */
+                vecW = vld1q(pW3);
+                pW3 += 8;
+                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
+                vst1q(inD, vecTmp1);
+                inD += 8;
+
+                vecA = vldrhq_f16(inA); /* pre-load A and C for the next pass */
+                vecC = vldrhq_f16(inC);
+
+                blkCnt--;
+            }
+        }
+        n1 = n2;
+        n2 >>= 2u;
+        iter = iter << 2;
+        stage++;
+    }
+
+    /*
+     * start of Last stage process (no twiddle multiplication, outputs scaled)
+     */
+    uint32x4_t vecScGathAddr = vld1q_u32(strides);
+    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
+
+    /*
+     * load scheduling
+     */
+    vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
+    vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
+
+    blkCnt = (fftLen >> 4);
+    while (blkCnt > 0U)
+    {
+        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
+        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */
+
+        vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4);
+        vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12);
+
+        vecSum1 = vecB + vecD;
+        vecDiff1 = vecB - vecD;
+
+        vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); /* pre-load for the next pass; NOTE(review): also issued on the final pass - confirm the addressed data is readable */
+        vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
+
+        vecTmp0 = vecSum0 + vecSum1;
+        vecTmp0 = vecTmp0 * onebyfftLen;
+        vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0);
+
+        vecTmp0 = vecSum0 - vecSum1;
+        vecTmp0 = vecTmp0 * onebyfftLen;
+        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0);
+
+        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
+        vecTmp0 = vecTmp0 * onebyfftLen;
+        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0);
+
+        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
+        vecTmp0 = vecTmp0 * onebyfftLen;
+        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0);
+
+        blkCnt--;
+    }
+
+    /*
+     * End of last stage process
+     */
+}
+
+static void arm_cfft_radix4by2_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t *pSrc, uint32_t fftLen)
+{
+    float16_t const *pCoefVec;
+    float16_t const  *pCoef = S->pTwiddle;
+    float16_t        *pIn0, *pIn1;
+    uint32_t          n2;
+    float16_t         onebyfftLen = arm_inverse_fft_length_f16(fftLen); /* scale for the full length, applied inside each half-length butterfly */
+    uint32_t          blkCnt;
+    f16x8_t         vecIn0, vecIn1, vecSum, vecDiff;
+    f16x8_t         vecCmplxTmp, vecTw;
+
+    /* One in-place radix-2 pass over pSrc, then an inverse radix-4 butterfly of length n2 on each half. */
+    n2 = fftLen >> 1;
+    pIn0 = pSrc;
+    pIn1 = pSrc + fftLen;
+    pCoefVec = pCoef;
+    /* Each pass handles 8 f16 values per half: sum goes to the first half, twiddled difference to the second. */
+    blkCnt = n2 / 4;
+    while (blkCnt > 0U)
+    {
+        vecIn0 = *(f16x8_t *) pIn0;
+        vecIn1 = *(f16x8_t *) pIn1;
+        vecTw = vld1q(pCoefVec);
+        pCoefVec += 8;
+
+        vecSum = vaddq(vecIn0, vecIn1);
+        vecDiff = vsubq(vecIn0, vecIn1);
+
+        vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff);
+
+        vst1q(pIn0, vecSum);
+        pIn0 += 8;
+        vst1q(pIn1, vecCmplxTmp);
+        pIn1 += 8;
+
+        blkCnt--;
+    }
+
+    _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, n2, onebyfftLen);
+
+    _arm_radix4_butterfly_inverse_f16_mve(S, pSrc + fftLen, n2, onebyfftLen);
+}
+
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Processing function for the floating-point complex FFT.
+  @param[in]     S              points to an instance of the floating-point CFFT structure
+  @param[in,out] pSrc           points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
+  @param[in]     ifftFlag       flag that selects transform direction
+                   - value = 0: forward transform
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output
+  @return        none
+ */
+
+
+void arm_cfft_f16(
+  const arm_cfft_instance_f16 * S,
+        float16_t * pSrc,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag)
+{
+        uint32_t fftLen = S->fftLen;
+        /* Only ifftFlag == 1 selects the inverse path; any other value runs the forward transform. */
+        if (ifftFlag == 1U) {
+            /* Powers of 4 go straight to the radix-4 routine; the other powers of 2 use the radix4by2 routine. */
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
+                break;
+
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
+                break;
+            }
+        } else {
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
+                break;
+
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
+                break;
+            }
+        }
+        /* Neither switch has a default: a length not listed above runs no butterfly at all. */
+
+        if (bitReverseFlag)
+        {
+            /* Any non-zero flag enables the in-place reordering, driven by the table held in S. */
+            arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
+
+        }
+}
+
+#else
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+extern void arm_bitreversal_16(
+        uint16_t * pSrc,
+  const uint16_t bitRevLen,
+  const uint16_t * pBitRevTable);
+
+/* Helpers called by the scalar arm_cfft_f16 in this file; presumably defined in other source files. */
+extern void arm_cfft_radix4by2_f16(
+    float16_t * pSrc,
+    uint32_t fftLen,
+    const float16_t * pCoef);
+
+extern void arm_radix4_butterfly_f16(
+        float16_t * pSrc,
+        uint16_t fftLen,
+  const float16_t * pCoef,
+        uint16_t twidCoefModifier);
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @defgroup ComplexFFT Complex FFT Functions
+
+  @par
+                   The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
+                   Discrete Fourier Transform (DFT).  The FFT can be orders of magnitude faster
+                   than the DFT, especially for long lengths.
+                   The algorithms described in this section
+                   operate on complex data.  A separate set of functions is devoted to handling
+                   of real sequences.
+  @par
+                   There are separate algorithms for handling floating-point, Q15, and Q31 data
+                   types.  The algorithms available for each data type are described next.
+  @par
+                   The FFT functions operate in-place.  That is, the array holding the input data
+                   will also be used to hold the corresponding result.  The input data is complex
+                   and contains <code>2*fftLen</code> interleaved values as shown below.
+                   <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
+                   The FFT result will be contained in the same array and the frequency domain
+                   values will have the same interleaving.
+
+  @par Floating-point
+                   The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-8
+                   stages are performed along with a single radix-2 or radix-4 stage, as needed.
+                   The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
+                   a different twiddle factor table.
+  @par
+                   The function uses the standard FFT definition and output values may grow by a
+                   factor of <code>fftLen</code> when computing the forward transform.  The
+                   inverse transform includes a scale of <code>1/fftLen</code> as part of the
+                   calculation and this matches the textbook definition of the inverse FFT.
+  @par
+                   For the MVE version, the new arm_cfft_init_f32 initialization function is
+                   <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
+                   needed FFTs.</b> Other FFT versions can continue to be initialized as
+                   explained below.
+  @par
+                   For non-MVE versions, pre-initialized data structures containing twiddle factors
+                   and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>.  Include
+                   this header in your function and then pass one of the constant structures as
+                   an argument to arm_cfft_f32.  For example:
+  @par
+                   <code>arm_cfft_f32(&arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
+  @par
+                   computes a 64-point inverse complex FFT including bit reversal.
+                   The data structures are treated as constant data and not modified during the
+                   calculation.  The same data structure can be reused for multiple transforms
+                   including mixing forward and inverse transforms.
+  @par
+                   Earlier releases of the library provided separate radix-2 and radix-4
+                   algorithms that operated on floating-point data.  These functions are still
+                   provided but are deprecated.  The older functions are slower and less general
+                   than the new functions.
+  @par
+                   An example of initialization of the constants for the arm_cfft_f32 function follows:
+  @code
+                   const static arm_cfft_instance_f32 *S;
+                   ...
+                     switch (length) {
+                       case 16:
+                         S = &arm_cfft_sR_f32_len16;
+                         break;
+                       case 32:
+                         S = &arm_cfft_sR_f32_len32;
+                         break;
+                       case 64:
+                         S = &arm_cfft_sR_f32_len64;
+                         break;
+                       case 128:
+                         S = &arm_cfft_sR_f32_len128;
+                         break;
+                       case 256:
+                         S = &arm_cfft_sR_f32_len256;
+                         break;
+                       case 512:
+                         S = &arm_cfft_sR_f32_len512;
+                         break;
+                       case 1024:
+                         S = &arm_cfft_sR_f32_len1024;
+                         break;
+                       case 2048:
+                         S = &arm_cfft_sR_f32_len2048;
+                         break;
+                       case 4096:
+                         S = &arm_cfft_sR_f32_len4096;
+                         break;
+                     }
+  @endcode
+  @par
+                   The new arm_cfft_init_f32 can also be used.
+  @par Q15 and Q31
+                   The fixed-point complex FFT uses a mixed-radix algorithm.  Multiple radix-4
+                   stages are performed along with a single radix-2 stage, as needed.
+                   The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
+                   a different twiddle factor table.
+  @par
+                   The function uses the standard FFT definition and output values may grow by a
+                   factor of <code>fftLen</code> when computing the forward transform.  The
+                   inverse transform includes a scale of <code>1/fftLen</code> as part of the
+                   calculation and this matches the textbook definition of the inverse FFT.
+  @par
+                   Pre-initialized data structures containing twiddle factors and bit reversal
+                   tables are provided and defined in <code>arm_const_structs.h</code>.  Include
+                   this header in your function and then pass one of the constant structures as
+                   an argument to arm_cfft_q31. For example:
+  @par
+                   <code>arm_cfft_q31(&arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
+  @par
+                   computes a 64-point inverse complex FFT including bit reversal.
+                   The data structures are treated as constant data and not modified during the
+                   calculation.  The same data structure can be reused for multiple transforms
+                   including mixing forward and inverse transforms.
+  @par
+                   Earlier releases of the library provided separate radix-2 and radix-4
+                   algorithms that operated on fixed-point data.  These functions are still
+                   provided but are deprecated.  The older functions are slower and less general
+                   than the new functions.
+  @par
+                   An example of initialization of the constants for the arm_cfft_q31 function follows:
+  @code
+                   const static arm_cfft_instance_q31 *S;
+                   ...
+                     switch (length) {
+                       case 16:
+                         S = &arm_cfft_sR_q31_len16;
+                         break;
+                       case 32:
+                         S = &arm_cfft_sR_q31_len32;
+                         break;
+                       case 64:
+                         S = &arm_cfft_sR_q31_len64;
+                         break;
+                       case 128:
+                         S = &arm_cfft_sR_q31_len128;
+                         break;
+                       case 256:
+                         S = &arm_cfft_sR_q31_len256;
+                         break;
+                       case 512:
+                         S = &arm_cfft_sR_q31_len512;
+                         break;
+                       case 1024:
+                         S = &arm_cfft_sR_q31_len1024;
+                         break;
+                       case 2048:
+                         S = &arm_cfft_sR_q31_len2048;
+                         break;
+                       case 4096:
+                         S = &arm_cfft_sR_q31_len4096;
+                         break;
+                     }
+  @endcode
+
+ */
+
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Processing function for the floating-point complex FFT.
+  @param[in]     S              points to an instance of the floating-point CFFT structure
+  @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
+  @param[in]     ifftFlag       flag that selects transform direction
+                   - value = 0: forward transform
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output
+  @return        none
+ */
+
+void arm_cfft_f16(
+    const arm_cfft_instance_f16 * S,
+    float16_t * p1,
+    uint8_t ifftFlag,
+    uint8_t bitReverseFlag)
+{
+    uint32_t  L = S->fftLen, l;
+    float16_t invL, * pSrc;
+    /* The inverse transform is built from the forward one: conjugate the input, transform, then conjugate and scale by 1/L. */
+    if (ifftFlag == 1U)
+    {
+        /*  Conjugate input data: negate every odd-indexed value of the interleaved buffer  */
+        pSrc = p1 + 1;
+        for(l=0; l<L; l++)
+        {
+            *pSrc = -*pSrc;
+            pSrc += 2;
+        }
+    }
+
+    switch (L)
+    {
+        /* Powers of 4: radix-4 routine with a twiddle coefficient modifier of 1 */
+        case 16:
+        case 64:
+        case 256:
+        case 1024:
+        case 4096:
+        arm_radix4_butterfly_f16  (p1, L, (float16_t*)S->pTwiddle, 1U);
+        break;
+        /* Remaining powers of 2: radix4by2 routine */
+        case 32:
+        case 128:
+        case 512:
+        case 2048:
+        arm_cfft_radix4by2_f16  ( p1, L, (float16_t*)S->pTwiddle);
+        break;
+        /* No default: any other length skips the butterflies, but the steps around the switch still run */
+    }
+
+    if ( bitReverseFlag )
+        arm_bitreversal_16((uint16_t*)p1, S->bitRevLength,(uint16_t*)S->pBitRevTable);
+
+    if (ifftFlag == 1U)
+    {
+        invL = 1.0f/(float16_t)L;
+        /*  Conjugate and scale output data: even-indexed values are scaled, odd-indexed values are negated and scaled */
+        pSrc = p1;
+        for(l=0; l<L; l++)
+        {
+            *pSrc++ *=   invL ;
+            *pSrc  = -(*pSrc) * invL;
+            pSrc++;
+        }
+    }
+}
+#endif /* if defined(ARM_FLOAT16_SUPPORTED) */
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of ComplexFFT group
+ */
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f32.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f32.c
index c7f26fdca5ca61ff3722df5b09a2a64b601c1919..f7f1fde12a52a7a9f1e061bde7290feb6c140a60 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f32.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_f32.c
  * Description:  Combined Radix Decimation in Frequency CFFT Floating point processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
@@ -39,87 +39,56 @@
 static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
 {
   float32_t retValue=1.0;
-                                                      
-  switch (fftLen)                                     
-  {                                                   
-                                                      
-  case 4096U:                                         
-    retValue = 0.000244140625;                        
-    break;                                            
-                                                      
-  case 2048U:                                         
-    retValue = 0.00048828125;                         
-    break;                                            
-                                                      
-  case 1024U:                                         
-    retValue = 0.0009765625f;                         
-    break;                                            
-                                                      
-  case 512U:                                          
-    retValue = 0.001953125;                           
-    break;                                            
-                                                      
-  case 256U:                                          
-    retValue = 0.00390625f;                           
-    break;                                            
-                                                      
-  case 128U:                                          
-    retValue = 0.0078125;                             
-    break;                                            
-                                                      
-  case 64U:                                           
-    retValue = 0.015625f;                             
-    break;                                            
-                                                      
-  case 32U:                                           
-    retValue = 0.03125;                               
-    break;                                            
-                                                      
-  case 16U:                                           
-    retValue = 0.0625f;                               
-    break;                                            
-                                                      
-                                                      
-  default:                                            
-    break;                                            
-  }                                                   
-  return(retValue); 
-}
 
+  switch (fftLen)
+  {
 
-static void arm_bitreversal_f32_inpl_mve(
-        uint32_t *pSrc,
-  const uint16_t  bitRevLen,
-  const uint16_t *pBitRevTab)
+  case 4096U:
+    retValue = 0.000244140625;
+    break;
 
-{
-    uint64_t       *src = (uint64_t *) pSrc;
-    uint32_t        blkCnt;     /* loop counters */
-    uint32x4_t      bitRevTabOff;
-    uint32x4_t      one = vdupq_n_u32(1);
+  case 2048U:
+    retValue = 0.00048828125;
+    break;
+
+  case 1024U:
+    retValue = 0.0009765625f;
+    break;
+
+  case 512U:
+    retValue = 0.001953125;
+    break;
 
-    blkCnt = (bitRevLen / 2) / 2;
-    while (blkCnt > 0U) {
-        bitRevTabOff = vldrhq_u32(pBitRevTab);
-        pBitRevTab += 4;
+  case 256U:
+    retValue = 0.00390625f;
+    break;
 
-        uint64x2_t      bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one);
-        uint64x2_t      bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one);
+  case 128U:
+    retValue = 0.0078125;
+    break;
 
-        uint64x2_t      in1 = vldrdq_gather_offset_u64(src, bitRevOff1);
-        uint64x2_t      in2 = vldrdq_gather_offset_u64(src, bitRevOff2);
+  case 64U:
+    retValue = 0.015625f;
+    break;
 
-        vstrdq_scatter_offset_u64(src, bitRevOff1, in2);
-        vstrdq_scatter_offset_u64(src, bitRevOff2, in1);
+  case 32U:
+    retValue = 0.03125;
+    break;
 
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
+  case 16U:
+    retValue = 0.0625f;
+    break;
+
+
+  default:
+    break;
+  }
+  return(retValue);
 }
 
 
+
+
 static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
 {
     f32x4_t vecTmp0, vecTmp1;
@@ -241,7 +210,7 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
     /*
      * start of Last stage process
      */
-    uint32x4_t vecScGathAddr = *(uint32x4_t *) strides;
+    uint32x4_t vecScGathAddr = vld1q_u32(strides);
     vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
 
     /* load scheduling */
@@ -447,7 +416,7 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
     /*
      * start of Last stage process
      */
-    uint32x4_t vecScGathAddr = *(uint32x4_t *) strides;
+    uint32x4_t vecScGathAddr = vld1q_u32 (strides);
     vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
 
     /*
@@ -563,53 +532,53 @@ void arm_cfft_f32(
         float32_t * pSrc,
         uint8_t ifftFlag,
         uint8_t bitReverseFlag)
-{                                                                                
-        uint32_t fftLen = S->fftLen;     
-
-        if (ifftFlag == 1U) {                                                            
-                                                                                         
-            switch (fftLen) {                                                            
-            case 16:                                                                     
-            case 64:                                                                     
-            case 256:                                                                    
-            case 1024:                                                                   
-            case 4096:                                                                   
-                _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen)); 
-                break;                                                                   
-                                                                                         
-            case 32:                                                                     
-            case 128:                                                                    
-            case 512:                                                                    
-            case 2048:                                                                   
-                arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);              
-                break;                                                                   
-            }  
-        } else {                                                                         
-            switch (fftLen) {                                                            
-            case 16:                                                                     
-            case 64:                                                                     
-            case 256:                                                                    
-            case 1024:                                                                   
-            case 4096:                                                                   
-                _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);         
-                break;                                                                   
-                                                                                         
-            case 32:                                                                     
-            case 128:                                                                    
-            case 512:                                                                    
-            case 2048:                                                                   
-                arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);                      
-                break;                                                                   
-            }                                                                            
-        }                                                                                
-                                                                                         
-                                                                                         
-        if (bitReverseFlag) 
-        {                                                            
-            
-            arm_bitreversal_f32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
-                    
-        } 
+{
+        uint32_t fftLen = S->fftLen;
+
+        if (ifftFlag == 1U) {
+
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
+                break;
+
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
+                break;
+            }
+        } else {
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
+                break;
+
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
+                break;
+            }
+        }
+
+
+        if (bitReverseFlag)
+        {
+
+            arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
+
+        }
 }
 
 
@@ -631,7 +600,7 @@ extern void arm_bitreversal_32(
 
 /**
   @defgroup ComplexFFT Complex FFT Functions
- 
+
   @par
                    The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
                    Discrete Fourier Transform (DFT).  The FFT can be orders of magnitude faster
@@ -649,7 +618,7 @@ extern void arm_bitreversal_32(
                    <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
                    The FFT result will be contained in the same array and the frequency domain
                    values will have the same interleaving.
- 
+
   @par Floating-point
                    The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-8
                    stages are performed along with a single radix-2 or radix-4 stage, as needed.
@@ -661,12 +630,12 @@ extern void arm_bitreversal_32(
                    inverse transform includes a scale of <code>1/fftLen</code> as part of the
                    calculation and this matches the textbook definition of the inverse FFT.
   @par
-                   For the MVE version, the new arm_cfft_init_f32 initialization function is 
+                   For the MVE version, the new arm_cfft_init_f32 initialization function is
                    <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
-                   needed FFTs.</b> Other FFT versions can continue to be initialized as 
+                   needed FFTs.</b> Other FFT versions can continue to be initialized as
                    explained below.
   @par
-                   For not MVE versions, pre-initialized data structures containing twiddle factors 
+                   For non-MVE versions, pre-initialized data structures containing twiddle factors
                    and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>.  Include
                    this header in your function and then pass one of the constant structures as
                    an argument to arm_cfft_f32.  For example:
@@ -781,7 +750,7 @@ extern void arm_bitreversal_32(
                          break;
                      }
   @endcode
- 
+
  */
 
 void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f64.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f64.c
index 8c6d3e6aa97856b51efbb9d8a02f6261b72e37d2..d686eb8c495624a292d5fb3fde0b5722a1fe3a75 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f64.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f64.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_f64.c
  * Description:  Combined Radix Decimation in Frequency CFFT Double Precision Floating point processing function
  *
- * $Date:        29. November 2019
- * $Revision:    V1.0.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f16.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..fdf887ba51e5f799d72341667c767bec0d799849
--- /dev/null
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f16.c
@@ -0,0 +1,363 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_init_f16.c
+ * Description:  Initialization function for cfft f16 instance
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define FFTINIT(EXT,SIZE) do { /* copy the arm_cfft_sR_<EXT>_len<SIZE> tables into *S */ \
+  S->bitRevLength = arm_cfft_sR_##EXT##_len##SIZE.bitRevLength;        \
+  S->pBitRevTable = arm_cfft_sR_##EXT##_len##SIZE.pBitRevTable;         \
+  S->pTwiddle = arm_cfft_sR_##EXT##_len##SIZE.pTwiddle; } while (0)
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Initialization function for the cfft f16 function
+  @param[in,out] S              points to an instance of the floating-point CFFT structure
+  @param[in]     fftLen         fft length (number of complex samples)
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+
+  @par          Use of this function is mandatory only for the MVE version of the FFT.
+                Other versions can still initialize the data structure directly using
+                variables declared in arm_const_structs_f16.h
+ */
+
+#include "dsp/transform_functions_f16.h"
+#include "arm_common_tables_f16.h"
+#include "arm_const_structs_f16.h"
+
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_vec_fft.h"
+#include "arm_mve_tables_f16.h"
+
+/*
+ * MVE build only: store in the instance the addresses of the rearranged
+ * twiddle tables (strides 1, 2 and 3) matching the length
+ * S->fftLen >> (twidCoefModifier - 1).
+ *
+ * twidCoefModifier must be at least 1, otherwise the shift count is negative;
+ * the callers in this file pass 1 or 2.
+ *
+ * Returns ARM_MATH_SUCCESS once the six fields are set, or
+ * ARM_MATH_ARGUMENT_ERROR when no case is compiled in for that length.
+ */
+arm_status arm_cfft_radix4by2_rearrange_twiddles_f16(arm_cfft_instance_f16 *S, int twidCoefModifier)
+{
+        arm_status result = ARM_MATH_SUCCESS;
+
+        switch (S->fftLen >> (twidCoefModifier - 1)) {
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
+            || defined(ARM_TABLE_TWIDDLECOEF_F16_4096)
+        case 4096U:
+            S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_4096_f16;
+            S->rearranged_twiddle_stride1         = rearranged_twiddle_stride1_4096_f16;
+            S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_4096_f16;
+            S->rearranged_twiddle_stride2         = rearranged_twiddle_stride2_4096_f16;
+            S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_4096_f16;
+            S->rearranged_twiddle_stride3         = rearranged_twiddle_stride3_4096_f16;
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
+            || defined(ARM_TABLE_TWIDDLECOEF_F16_1024) || defined(ARM_TABLE_TWIDDLECOEF_F16_2048)
+        case 1024U:
+            S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_1024_f16;
+            S->rearranged_twiddle_stride1         = rearranged_twiddle_stride1_1024_f16;
+            S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_1024_f16;
+            S->rearranged_twiddle_stride2         = rearranged_twiddle_stride2_1024_f16;
+            S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_1024_f16;
+            S->rearranged_twiddle_stride3         = rearranged_twiddle_stride3_1024_f16;
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
+            || defined(ARM_TABLE_TWIDDLECOEF_F16_256) || defined(ARM_TABLE_TWIDDLECOEF_F16_512)
+        case 256U:
+            S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_256_f16;
+            S->rearranged_twiddle_stride1         = rearranged_twiddle_stride1_256_f16;
+            S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_256_f16;
+            S->rearranged_twiddle_stride2         = rearranged_twiddle_stride2_256_f16;
+            S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_256_f16;
+            S->rearranged_twiddle_stride3         = rearranged_twiddle_stride3_256_f16;
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
+            || defined(ARM_TABLE_TWIDDLECOEF_F16_64) || defined(ARM_TABLE_TWIDDLECOEF_F16_128)
+        case 64U:
+            S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_64_f16;
+            S->rearranged_twiddle_stride1         = rearranged_twiddle_stride1_64_f16;
+            S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_64_f16;
+            S->rearranged_twiddle_stride2         = rearranged_twiddle_stride2_64_f16;
+            S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_64_f16;
+            S->rearranged_twiddle_stride3         = rearranged_twiddle_stride3_64_f16;
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
+            || defined(ARM_TABLE_TWIDDLECOEF_F16_16) || defined(ARM_TABLE_TWIDDLECOEF_F16_32)
+        case 16U:
+            S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_16_f16;
+            S->rearranged_twiddle_stride1         = rearranged_twiddle_stride1_16_f16;
+            S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_16_f16;
+            S->rearranged_twiddle_stride2         = rearranged_twiddle_stride2_16_f16;
+            S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_16_f16;
+            S->rearranged_twiddle_stride3         = rearranged_twiddle_stride3_16_f16;
+            break;
+#endif
+
+        default:
+            /* No table set is compiled in for this length. */
+            result = ARM_MATH_ARGUMENT_ERROR;
+            break;
+        }
+
+        return(result);
+}
+
+/*
+ * MVE variant of the f16 CFFT instance initialisation: records fftLen, then
+ * for a supported length stores the bit-reversal table, its length and the
+ * twiddle table, and selects the rearranged twiddle tables. Lengths with no
+ * case compiled in yield ARM_MATH_ARGUMENT_ERROR.
+ */
+arm_status arm_cfft_init_f16(
+  arm_cfft_instance_f16 * S,
+  uint16_t fftLen)
+{
+        /* Stays an argument error unless a case below matches the length. */
+        arm_status result = ARM_MATH_ARGUMENT_ERROR;
+
+        S->fftLen = fftLen;
+        S->pTwiddle = NULL;
+
+        /* Modifier 1 selects rearranged twiddles for fftLen, modifier 2 for fftLen / 2. */
+        switch (S->fftLen) {
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_F16_4096))
+        /* 4096-point FFT */
+        case 4096U:
+            S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH;
+            S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_4096;
+            S->pTwiddle     = (float16_t *)twiddleCoefF16_4096;
+            result = arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_F16_2048))
+        /* 2048-point FFT */
+        case 2048U:
+            S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH;
+            S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_2048;
+            S->pTwiddle     = (float16_t *)twiddleCoefF16_2048;
+            result = arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_F16_1024))
+        /* 1024-point FFT */
+        case 1024U:
+            S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH;
+            S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_1024;
+            S->pTwiddle     = (float16_t *)twiddleCoefF16_1024;
+            result = arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_F16_512))
+        /* 512-point FFT */
+        case 512U:
+            S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH;
+            S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_512;
+            S->pTwiddle     = (float16_t *)twiddleCoefF16_512;
+            result = arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_F16_256))
+        /* 256-point FFT */
+        case 256U:
+            S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH;
+            S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256;
+            S->pTwiddle     = (float16_t *)twiddleCoefF16_256;
+            result = arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_F16_128))
+        /* 128-point FFT */
+        case 128U:
+            S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH;
+            S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128;
+            S->pTwiddle     = (float16_t *)twiddleCoefF16_128;
+            result = arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_F16_64))
+        /* 64-point FFT */
+        case 64U:
+            S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH;
+            S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64;
+            S->pTwiddle     = (float16_t *)twiddleCoefF16_64;
+            result = arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_F16_32))
+        /* 32-point FFT */
+        case 32U:
+            S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH;
+            S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32;
+            S->pTwiddle     = (float16_t *)twiddleCoefF16_32;
+            result = arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_F16_16))
+        /* 16-point FFT */
+        case 16U:
+            S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH;
+            S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_16;
+            S->pTwiddle     = (float16_t *)twiddleCoefF16_16;
+            result = arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
+            break;
+#endif
+
+        default:
+            /* Length not supported or not compiled in: keep the error status. */
+            break;
+        }
+
+        return (result);
+}
+#else
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+arm_status arm_cfft_init_f16(
+  arm_cfft_instance_f16 * S,
+  uint16_t fftLen)
+{
+        /*  Returned to the caller: starts as ARM_MATH_SUCCESS, set to ARM_MATH_ARGUMENT_ERROR by the default case */
+        arm_status status = ARM_MATH_SUCCESS;
+
+        /*  Record the requested FFT length in the instance */
+        S->fftLen = fftLen;
+
+        /*  Clear the twiddle coefficient pointer before the length-specific setup */
+        S->pTwiddle = NULL;
+        /*  NOTE(review): FFTINIT is defined outside this function; presumably it fills the  */
+        /*  bit-reversal and twiddle fields of S for the given type and length - confirm.    */
+        /*  Dispatch on the FFT length; each case is compiled in only if its #if guard holds */
+        switch (S->fftLen) {
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
+            /*  4096 point FFT */
+        case 4096U:
+            /*  Length-specific setup for f16 / 4096 points via FFTINIT */
+            FFTINIT(f16,4096);
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
+            /*  2048 point FFT */
+        case 2048U:
+            /*  Length-specific setup for f16 / 2048 points via FFTINIT */
+            FFTINIT(f16,2048);
+
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
+            /*  1024 point FFT */
+        case 1024U:
+            /*  Length-specific setup for f16 / 1024 points via FFTINIT */
+            FFTINIT(f16,1024);
+
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
+            /*  512 point FFT */
+        case 512U:
+            /*  Length-specific setup for f16 / 512 points via FFTINIT */
+            FFTINIT(f16,512);
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
+        case 256U:
+            FFTINIT(f16,256);
+            break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
+        case 128U:
+            FFTINIT(f16,128);
+            break;
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
+        case 64U:
+            FFTINIT(f16,64);
+            break;
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
+        case 32U:
+            FFTINIT(f16,32);
+            break;
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
+        case 16U:
+            /*  Length-specific setup for f16 / 16 points via FFTINIT */
+            FFTINIT(f16,16);
+            break;
+#endif
+
+        default:
+            /*  No compiled-in case matched fftLen (unsupported length, or its case was excluded by its #if guard) */
+            status = ARM_MATH_ARGUMENT_ERROR;
+            break;
+        }
+
+
+        return (status);
+}
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of ComplexFFT group
+ */
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f32.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f32.c
index 97ba22e936c8ef5f9030e449eadd24c3296bf882..abfab4af90ccae929a818b19a8f3b1dc297d85a6 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f32.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_init_f32.c
  * Description:  Initialization function for cfft f32 instance
  *
- * $Date:        07. January 2020
- * $Revision:    V1.7.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -49,7 +49,7 @@
                 variables declared in arm_const_structs.h
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 #include "arm_const_structs.h"
 
@@ -63,7 +63,8 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_f32(arm_cfft_instance_f32 *S, i
                                                                   
         switch (S->fftLen >> (twidCoefModifier - 1)) {  
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_4096)
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
+            || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)                                                                                                
         case 4096U:                                                                                
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_4096_f32;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_4096_f32;     
@@ -76,7 +77,8 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_f32(arm_cfft_instance_f32 *S, i
             break; 
 #endif                                  
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_1024) || defined(ARM_TABLE_BITREVIDX_FXT_2048)                                                                                                   
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
+            || defined(ARM_TABLE_TWIDDLECOEF_F32_1024) || defined(ARM_TABLE_TWIDDLECOEF_F32_2048)                                                                                                  
         case 1024U:                                                                                
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_1024_f32;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_1024_f32;     
@@ -89,7 +91,8 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_f32(arm_cfft_instance_f32 *S, i
             break;                                                                                 
  #endif 
 
- #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_256) || defined(ARM_TABLE_BITREVIDX_FXT_512)                                                                                                  
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
+            || defined(ARM_TABLE_TWIDDLECOEF_F32_256) || defined(ARM_TABLE_TWIDDLECOEF_F32_512)                                                                                                 
         case 256U:                                                                                 
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_256_f32;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_256_f32;     
@@ -103,7 +106,8 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_f32(arm_cfft_instance_f32 *S, i
             break;                     
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_64) || defined(ARM_TABLE_BITREVIDX_FXT_128)
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
+            || defined(ARM_TABLE_TWIDDLECOEF_F32_64) || defined(ARM_TABLE_TWIDDLECOEF_F32_128)
         case 64U:                                                                                  
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_64_f32;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_64_f32;     
@@ -116,7 +120,8 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_f32(arm_cfft_instance_f32 *S, i
             break;  
 #endif                                                                               
               
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_16) || defined(ARM_TABLE_BITREVIDX_FXT_32)                                                                                                                                                                                                                
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
+            || defined(ARM_TABLE_TWIDDLECOEF_F32_16) || defined(ARM_TABLE_TWIDDLECOEF_F32_32)                                                                                                                                                                                                               
         case 16U:                                                                                  
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_16_f32;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_16_f32;     
@@ -157,7 +162,7 @@ arm_status arm_cfft_init_f32(
         /*  Initializations of Instance structure depending on the FFT length */
         switch (S->fftLen) {                                                    
             /*  Initializations of structure parameters for 4096 point FFT */   
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_4096)                                                           
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_F32_4096))                     
         case 4096U:  
             /*  Initialise the bit reversal table modifier */                   
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH;      
@@ -167,7 +172,7 @@ arm_status arm_cfft_init_f32(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_2048)                                                                                                        
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_F32_2048))                                                
             /*  Initializations of structure parameters for 2048 point FFT */   
         case 2048U:                                                             
             /*  Initialise the bit reversal table modifier */                   
@@ -178,7 +183,7 @@ arm_status arm_cfft_init_f32(
             break;     
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_1024)                                                                                                                  
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_F32_1024))                                                           
             /*  Initializations of structure parameters for 1024 point FFT */   
         case 1024U:                                                             
             /*  Initialise the bit reversal table modifier */                   
@@ -189,7 +194,7 @@ arm_status arm_cfft_init_f32(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_512)                                                           
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_F32_512))                     
             /*  Initializations of structure parameters for 512 point FFT */    
         case 512U:                                                              
             /*  Initialise the bit reversal table modifier */                   
@@ -200,7 +205,7 @@ arm_status arm_cfft_init_f32(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_256)                                                           
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_F32_256))                                                           
         case 256U:                                                              
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH;       
             S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256; 
@@ -209,7 +214,7 @@ arm_status arm_cfft_init_f32(
             break;  
 #endif                                                            
                  
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_128)                                                                                                                          
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_F32_128))                                                                                                                          
         case 128U:                                                              
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH;       
             S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128; 
@@ -218,7 +223,7 @@ arm_status arm_cfft_init_f32(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_64)                                                                                                                      
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_F32_64))                                                                                                                      
         case 64U:                                                               
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH;        
             S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64;  
@@ -227,7 +232,7 @@ arm_status arm_cfft_init_f32(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_32)                                                                                                                           
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_F32_32))                                                                                                                           
         case 32U:                                                               
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH;        
             S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32;  
@@ -236,7 +241,7 @@ arm_status arm_cfft_init_f32(
             break;                                                              
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_16)                                                                                                                                
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_F32_16))                                                                                                                                
         case 16U:                                                               
             /*  Initializations of structure parameters for 16 point FFT */     
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH;        
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f64.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f64.c
index 939488bb4f88e37d14278748e7249e7ee91887a3..26bee3e47d1a2b481234acbfd656e8dd3126d945 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f64.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f64.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_init_f64.c
  * Description:  Initialization function for cfft f64 instance
  *
- * $Date:        23. January 2020
- * $Revision:    V1.7.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -49,7 +49,7 @@
                 variables declared in arm_const_structs.h
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 #include "arm_const_structs.h"
 
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q15.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q15.c
index 99af18cd5a7cdcddb51bec33cd4ce1d2de70ecff..3cf6e3a6e52f155977ac4327931a520a7ef85f63 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q15.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_init_q15.c
  * Description:  Initialization function for cfft q15 instance
  *
- * $Date:        07. January 2020
- * $Revision:    V1.7.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -49,11 +49,11 @@
                 variables declared in arm_const_structs.h
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 #include "arm_const_structs.h"
 
-#if defined(ARM_MATH_MVEI) 
+#if defined(ARM_MATH_MVEI)  && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_vec_fft.h"
 #include "arm_mve_tables.h"
@@ -64,7 +64,7 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_q15(arm_cfft_instance_q15 *S, i
                                                                   
         switch (S->fftLen >> (twidCoefModifier - 1)) {  
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_4096)
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_Q15_4096))
         case 4096U:                                                                                
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_4096_q15;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_4096_q15;     
@@ -77,7 +77,7 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_q15(arm_cfft_instance_q15 *S, i
             break; 
 #endif                                  
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_1024) || defined(ARM_TABLE_BITREVIDX_FXT_2048)                                                                                                   
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_Q15_1024)) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_Q15_2048))                                                                                                   
         case 1024U:                                                                                
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_1024_q15;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_1024_q15;     
@@ -90,7 +90,7 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_q15(arm_cfft_instance_q15 *S, i
             break;                                                                                 
  #endif 
 
- #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_256) || defined(ARM_TABLE_BITREVIDX_FXT_512)                                                                                                  
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_Q15_256)) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_Q15_512))                                                                                                  
         case 256U:                                                                                 
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_256_q15;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_256_q15;     
@@ -104,7 +104,7 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_q15(arm_cfft_instance_q15 *S, i
             break;                     
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_64) || defined(ARM_TABLE_BITREVIDX_FXT_128)
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_Q15_64)) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_Q15_128))
         case 64U:                                                                                  
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_64_q15;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_64_q15;     
@@ -117,7 +117,7 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_q15(arm_cfft_instance_q15 *S, i
             break;  
 #endif                                                                               
               
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_16) || defined(ARM_TABLE_BITREVIDX_FXT_32)                                                                                                                                                                                                                
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_Q15_16)) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_Q15_32))                                                                                                                                                                                                                
         case 16U:                                                                                  
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_16_q15;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_16_q15;     
@@ -160,7 +160,7 @@ arm_status arm_cfft_init_q15(
         /*  Initializations of Instance structure depending on the FFT length */
         switch (S->fftLen) {                                                    
             /*  Initializations of structure parameters for 4096 point FFT */   
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_4096)                                                           
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_Q15_4096))                                                           
         case 4096U:  
             /*  Initialise the bit reversal table modifier */                   
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH;      
@@ -170,7 +170,7 @@ arm_status arm_cfft_init_q15(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_2048)                                                                                                        
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_Q15_2048))                                                                                                        
             /*  Initializations of structure parameters for 2048 point FFT */   
         case 2048U:                                                             
             /*  Initialise the bit reversal table modifier */                   
@@ -181,7 +181,7 @@ arm_status arm_cfft_init_q15(
             break;     
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_1024)                                                                                                                  
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_Q15_1024))                                                                                                                  
             /*  Initializations of structure parameters for 1024 point FFT */   
         case 1024U:                                                             
             /*  Initialise the bit reversal table modifier */                   
@@ -192,7 +192,7 @@ arm_status arm_cfft_init_q15(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_512)                                                           
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_Q15_512))                                                           
             /*  Initializations of structure parameters for 512 point FFT */    
         case 512U:                                                              
             /*  Initialise the bit reversal table modifier */                   
@@ -203,7 +203,7 @@ arm_status arm_cfft_init_q15(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_256)                                                           
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_Q15_256))                                                           
         case 256U:                                                              
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH;       
             S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256; 
@@ -212,7 +212,7 @@ arm_status arm_cfft_init_q15(
             break;  
 #endif                                                            
                  
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_128)                                                                                                                          
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_Q15_128))                                                                                                                          
         case 128U:                                                              
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH;       
             S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128; 
@@ -221,7 +221,7 @@ arm_status arm_cfft_init_q15(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_64)                                                                                                                      
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_Q15_64))                                                                                                                      
         case 64U:                                                               
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH;        
             S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64;  
@@ -230,7 +230,7 @@ arm_status arm_cfft_init_q15(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_32)                                                                                                                           
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_Q15_32))                                                                                                                           
         case 32U:                                                               
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH;        
             S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32;  
@@ -239,7 +239,7 @@ arm_status arm_cfft_init_q15(
             break;                                                              
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_16)                                                                                                                                
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_Q15_16))                                                                                                                                
         case 16U:                                                               
             /*  Initializations of structure parameters for 16 point FFT */     
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH;        
@@ -275,7 +275,7 @@ arm_status arm_cfft_init_q15(
 
         /*  Initializations of Instance structure depending on the FFT length */
         switch (S->fftLen) {
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
             /*  Initializations of structure parameters for 4096 point FFT */
         case 4096U:
             /*  Initialise the bit reversal table modifier */
@@ -283,7 +283,7 @@ arm_status arm_cfft_init_q15(
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
             /*  Initializations of structure parameters for 2048 point FFT */
         case 2048U:
             /*  Initialise the bit reversal table modifier */
@@ -292,7 +292,7 @@ arm_status arm_cfft_init_q15(
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
             /*  Initializations of structure parameters for 1024 point FFT */
         case 1024U:
             /*  Initialise the bit reversal table modifier */
@@ -301,7 +301,7 @@ arm_status arm_cfft_init_q15(
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
             /*  Initializations of structure parameters for 512 point FFT */
         case 512U:
             /*  Initialise the bit reversal table modifier */
@@ -309,31 +309,31 @@ arm_status arm_cfft_init_q15(
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
         case 256U:
             FFTINIT(q15,256);
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
         case 128U:
             FFTINIT(q15,128);
             break;
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
         case 64U:
             FFTINIT(q15,64);
             break;
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
         case 32U:
             FFTINIT(q15,32);
             break;
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
         case 16U:
             /*  Initializations of structure parameters for 16 point FFT */
             FFTINIT(q15,16);
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q31.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q31.c
index 0511af02e33a2e53e85f4d6a423a393b58b004f3..3722a680473bce86dd21a9130539b0bd17eceda9 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q31.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_init_q31.c
  * Description:  Initialization function for cfft q31 instance
  *
- * $Date:        07. January 2020
- * $Revision:    V1.7.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -49,11 +49,11 @@
                 variables declared in arm_const_structs.h
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 #include "arm_const_structs.h"
 
-#if defined(ARM_MATH_MVEI) 
+#if defined(ARM_MATH_MVEI)  && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_vec_fft.h"
 #include "arm_mve_tables.h"
@@ -64,7 +64,7 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_q31(arm_cfft_instance_q31 *S, i
                                                                   
         switch (S->fftLen >> (twidCoefModifier - 1)) {  
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_4096)
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_Q31_4096))
         case 4096U:                                                                                
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_4096_q31;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_4096_q31;     
@@ -77,7 +77,7 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_q31(arm_cfft_instance_q31 *S, i
             break; 
 #endif                                  
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_1024) || defined(ARM_TABLE_BITREVIDX_FXT_2048)                                                                                                   
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_Q31_1024)) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_Q31_2048))                                                                                                   
         case 1024U:                                                                                
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_1024_q31;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_1024_q31;     
@@ -90,7 +90,7 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_q31(arm_cfft_instance_q31 *S, i
             break;                                                                                 
  #endif 
 
- #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_256) || defined(ARM_TABLE_BITREVIDX_FXT_512)                                                                                                  
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_Q31_256)) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_Q31_512))                                                                                                  
         case 256U:                                                                                 
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_256_q31;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_256_q31;     
@@ -104,7 +104,7 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_q31(arm_cfft_instance_q31 *S, i
             break;                     
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_64) || defined(ARM_TABLE_BITREVIDX_FXT_128)
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_Q31_64)) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_Q31_128))
         case 64U:                                                                                  
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_64_q31;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_64_q31;     
@@ -117,7 +117,7 @@ arm_status arm_cfft_radix4by2_rearrange_twiddles_q31(arm_cfft_instance_q31 *S, i
             break;  
 #endif                                                                               
               
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_16) || defined(ARM_TABLE_BITREVIDX_FXT_32)                                                                                                                                                                                                                
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_Q31_16)) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_Q31_32))                                                                                                                                                                                                                
         case 16U:                                                                                  
             S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_16_q31;
             S->rearranged_twiddle_stride1  =  rearranged_twiddle_stride1_16_q31;     
@@ -160,7 +160,7 @@ arm_status arm_cfft_init_q31(
         /*  Initializations of Instance structure depending on the FFT length */
         switch (S->fftLen) {                                                    
             /*  Initializations of structure parameters for 4096 point FFT */   
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_4096)                                                           
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_Q31_4096))                                                           
         case 4096U:  
             /*  Initialise the bit reversal table modifier */                   
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH;      
@@ -170,7 +170,7 @@ arm_status arm_cfft_init_q31(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_2048)                                                                                                        
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_Q31_2048))                                                                                                        
             /*  Initializations of structure parameters for 2048 point FFT */   
         case 2048U:                                                             
             /*  Initialise the bit reversal table modifier */                   
@@ -181,7 +181,7 @@ arm_status arm_cfft_init_q31(
             break;     
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_1024)                                                                                                                  
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_Q31_1024))                                                                                                                  
             /*  Initializations of structure parameters for 1024 point FFT */   
         case 1024U:                                                             
             /*  Initialise the bit reversal table modifier */                   
@@ -192,7 +192,7 @@ arm_status arm_cfft_init_q31(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_512)                                                           
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_Q31_512))                                                           
             /*  Initializations of structure parameters for 512 point FFT */    
         case 512U:                                                              
             /*  Initialise the bit reversal table modifier */                   
@@ -203,7 +203,7 @@ arm_status arm_cfft_init_q31(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_256)                                                           
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_Q31_256))                                                           
         case 256U:                                                              
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH;       
             S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256; 
@@ -212,7 +212,7 @@ arm_status arm_cfft_init_q31(
             break;  
 #endif                                                            
                  
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_128)                                                                                                                          
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_Q31_128))                                                                                                                          
         case 128U:                                                              
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH;       
             S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128; 
@@ -221,7 +221,7 @@ arm_status arm_cfft_init_q31(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_64)                                                                                                                      
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_Q31_64))                                                                                                                      
         case 64U:                                                               
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH;        
             S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64;  
@@ -230,7 +230,7 @@ arm_status arm_cfft_init_q31(
             break;                                                              
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_32)                                                                                                                           
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_Q31_32))                                                                                                                           
         case 32U:                                                               
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH;        
             S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32;  
@@ -239,7 +239,7 @@ arm_status arm_cfft_init_q31(
             break;                                                              
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREVIDX_FXT_16)                                                                                                                                
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_Q31_16))                                                                                                                                
         case 16U:                                                               
             /*  Initializations of structure parameters for 16 point FFT */     
             S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH;        
@@ -275,7 +275,7 @@ arm_status arm_cfft_init_q31(
 
         /*  Initializations of Instance structure depending on the FFT length */
         switch (S->fftLen) {
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
             /*  Initializations of structure parameters for 4096 point FFT */
         case 4096U:
             /*  Initialise the bit reversal table modifier */
@@ -283,7 +283,7 @@ arm_status arm_cfft_init_q31(
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
             /*  Initializations of structure parameters for 2048 point FFT */
         case 2048U:
             /*  Initialise the bit reversal table modifier */
@@ -292,7 +292,7 @@ arm_status arm_cfft_init_q31(
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
             /*  Initializations of structure parameters for 1024 point FFT */
         case 1024U:
             /*  Initialise the bit reversal table modifier */
@@ -301,7 +301,7 @@ arm_status arm_cfft_init_q31(
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
             /*  Initializations of structure parameters for 512 point FFT */
         case 512U:
             /*  Initialise the bit reversal table modifier */
@@ -309,31 +309,31 @@ arm_status arm_cfft_init_q31(
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
         case 256U:
             FFTINIT(q31,256);
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
         case 128U:
             FFTINIT(q31,128);
             break;
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
         case 64U:
             FFTINIT(q31,64);
             break;
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
         case 32U:
             FFTINIT(q31,32);
             break;
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
         case 16U:
             /*  Initializations of structure parameters for 16 point FFT */
             FFTINIT(q31,16);
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c
index a25fc824a3c81ab5f1f8d63001dc7d0a08af4474..9d4eb96cc477647f41f646780bc39ef332f420ef 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_q15.c
  * Description:  Combined Radix Decimation in Q15 Frequency CFFT processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,72 +26,13 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_vec_fft.h"
 
 
-static void arm_bitreversal_16_inpl_mve(
-        uint16_t *pSrc,
-  const uint16_t bitRevLen,
-  const uint16_t *pBitRevTab)
-
-{
-    uint32_t       *src = (uint32_t *)pSrc;
-    uint32_t        blkCnt;     /* loop counters */
-    uint32x4_t      bitRevTabOff;
-    uint16x8_t      one = vdupq_n_u16(1);
-
-    blkCnt = (bitRevLen / 2) / 4;
-    while (blkCnt > 0U) {
-        bitRevTabOff = vldrhq_u16(pBitRevTab);
-        pBitRevTab += 8;
-
-        uint32x4_t      bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
-        uint32x4_t      bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
-
-        bitRevOff1 = bitRevOff1 >> 3;
-        bitRevOff2 = bitRevOff2 >> 3;
-
-        uint32x4_t      in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1);
-        uint32x4_t      in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2);
-
-        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2);
-        vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1);
-
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
-
-
-    /*
-     * tail
-     * (will be merged thru tail predication)
-     */
-    blkCnt = bitRevLen & 7;
-    if (blkCnt > 0U) {
-        mve_pred16_t    p0 = vctp16q(blkCnt);
-
-        bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0);
-
-        uint32x4_t      bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
-        uint32x4_t      bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
-
-        bitRevOff1 = bitRevOff1 >> 3;
-        bitRevOff2 = bitRevOff2 >> 3;
-
-        uint32x4_t      in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0);
-        uint32x4_t      in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0);
-
-        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0);
-        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0);
-    }
-}
-
 static void _arm_radix4_butterfly_q15_mve(
     const arm_cfft_instance_q15 * S,
     q15_t   *pSrc,
@@ -216,7 +157,7 @@ static void _arm_radix4_butterfly_q15_mve(
     /*
      * start of Last stage process
      */
-    uint32x4_t vecScGathAddr = *(uint32x4_t *) strides;
+    uint32x4_t vecScGathAddr = vld1q_u32 (strides);
     vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
 
     /*
@@ -451,7 +392,7 @@ static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S
     /*
      * start of Last stage process
      */
-    uint32x4_t vecScGathAddr = *(uint32x4_t *) strides;
+    uint32x4_t vecScGathAddr = vld1q_u32(strides);
     vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
 
     /*
@@ -592,53 +533,53 @@ void arm_cfft_q15(
         q15_t * pSrc,
         uint8_t ifftFlag,
         uint8_t bitReverseFlag)
-{                                                                             
-        uint32_t fftLen = S->fftLen;     
-
-        if (ifftFlag == 1U) {                                                            
-                                                                                         
-            switch (fftLen) {                                                            
-            case 16:                                                                     
-            case 64:                                                                     
-            case 256:                                                                    
-            case 1024:                                                                   
-            case 4096:                                                                   
-                _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen); 
-                break;                                                                   
-                                                                                         
-            case 32:                                                                     
-            case 128:                                                                    
-            case 512:                                                                    
-            case 2048:                                                                   
-                arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen);              
-                break;                                                                   
-            }  
-        } else {                                                                         
-            switch (fftLen) {                                                            
-            case 16:                                                                     
-            case 64:                                                                     
-            case 256:                                                                    
-            case 1024:                                                                   
-            case 4096:    
-                _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen);         
-                break;                                                                   
-                                                                                         
-            case 32:                                                                     
-            case 128:                                                                    
-            case 512:                                                                    
-            case 2048:                                                                   
-                arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen);                      
-                break;                                                                   
-            }                                                                            
-        }                                                                                
-                                                                                         
-                                                                                         
-        if (bitReverseFlag) 
-        {                                                            
-            
+{
+        uint32_t fftLen = S->fftLen;
+
+        if (ifftFlag == 1U) {
+
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen);
+                break;
+
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen);
+                break;
+            }
+        } else {
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen);
+                break;
+
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen);
+                break;
+            }
+        }
+
+
+        if (bitReverseFlag)
+        {
+
             arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
-       
-        } 
+
+        }
 }
 
 #else
@@ -794,7 +735,7 @@ void arm_cfft_radix4by2_q15(
       out2 = __SMUAD(coeff, R);
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 
-      write_q15x2_ia (&pSl, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+      write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ) );
   }
 
 #else /* #if defined (ARM_MATH_DSP) */
@@ -893,7 +834,7 @@ void arm_cfft_radix4by2_inverse_q15(
      out2 = __SMUSD(__QSUB(0, coeff), R);
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 
-     write_q15x2_ia (&pSl, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+     write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ));
   }
 
 #else /* #if defined (ARM_MATH_DSP) */
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q31.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q31.c
index f8a46bd542a64e404aa3f3839cc605232321783b..a26927e3278c581ce4397906c8d15ccaed7e6abf 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q31.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_q31.c
  * Description:  Combined Radix Decimation in Frequency CFFT fixed point processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,45 +26,14 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_vec_fft.h"
 
-static void arm_bitreversal_32_inpl_mve(
-        uint32_t *pSrc,
-  const uint16_t  bitRevLen,
-  const uint16_t *pBitRevTab)
-
-{
-    uint64_t       *src = (uint64_t *) pSrc;
-    uint32_t        blkCnt;     /* loop counters */
-    uint32x4_t      bitRevTabOff;
-    uint32x4_t      one = vdupq_n_u32(1);
-
-    blkCnt = (bitRevLen / 2) / 2;
-    while (blkCnt > 0U) {
-        bitRevTabOff = vldrhq_u32(pBitRevTab);
-        pBitRevTab += 4;
-
-        uint64x2_t      bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one);
-        uint64x2_t      bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one);
-
-        uint64x2_t      in1 = vldrdq_gather_offset_u64(src, bitRevOff1);
-        uint64x2_t      in2 = vldrdq_gather_offset_u64(src, bitRevOff2);
-
-        vstrdq_scatter_offset_u64(src, bitRevOff1, in2);
-        vstrdq_scatter_offset_u64(src, bitRevOff2, in1);
-
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
-}
 
 static void _arm_radix4_butterfly_q31_mve(
     const arm_cfft_instance_q31 * S,
@@ -198,7 +167,7 @@ static void _arm_radix4_butterfly_q31_mve(
     /*
      * start of Last stage process
      */
-    uint32x4_t vecScGathAddr = *(uint32x4_t *) strides;
+    uint32x4_t vecScGathAddr = vld1q_u32(strides);
     vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
 
     /*
@@ -448,7 +417,7 @@ static void _arm_radix4_butterfly_inverse_q31_mve(
     /*
      * start of Last stage process
      */
-    uint32x4_t vecScGathAddr = *(uint32x4_t *) strides;
+    uint32x4_t vecScGathAddr = vld1q_u32(strides);
     vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
 
     /*
@@ -598,55 +567,55 @@ void arm_cfft_q31(
         q31_t * pSrc,
         uint8_t ifftFlag,
         uint8_t bitReverseFlag)
-{                                                                             
-        uint32_t fftLen = S->fftLen;     
-
-        if (ifftFlag == 1U) {                                                            
-                                                                                         
-            switch (fftLen) {                                                            
-            case 16:                                                                     
-            case 64:                                                                     
-            case 256:                                                                    
-            case 1024:                                                                   
-            case 4096:                                                                   
-                _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen); 
-                break;                                                                   
-                                                                                         
-            case 32:                                                                     
-            case 128:                                                                    
-            case 512:                                                                    
-            case 2048:                                                                   
-                arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);              
-                break;                                                                   
-            }  
-        } else {                                                                         
-            switch (fftLen) {                                                            
-            case 16:                                                                     
-            case 64:                                                                     
-            case 256:                                                                    
-            case 1024:                                                                   
-            case 4096:    
-                _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);         
-                break;                                                                   
-                                                                                         
-            case 32:                                                                     
-            case 128:                                                                    
-            case 512:                                                                    
-            case 2048:                                                                   
-                arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);                      
-                break;                                                                   
-            }                                                                            
-        }                                                                                
-                                                                                         
-                                                                                         
-        if (bitReverseFlag) 
-        {                                                            
-            
+{
+        uint32_t fftLen = S->fftLen;
+
+        if (ifftFlag == 1U) {
+
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen);
+                break;
+
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);
+                break;
+            }
+        } else {
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);
+                break;
+
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);
+                break;
+            }
+        }
+
+
+        if (bitReverseFlag)
+        {
+
             arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
-       
-        } 
+
+        }
 }
-#else 
+#else
 
 extern void arm_radix4_butterfly_q31(
         q31_t * pSrc,
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f16.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..de21de99021db03ebc342149e3ce1fc0406a6291
--- /dev/null
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f16.c
@@ -0,0 +1,475 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix2_f16.c
+ * Description:  Radix-2 Decimation in Frequency CFFT & CIFFT Floating point processing function
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+void arm_radix2_butterfly_f16(
+        float16_t * pSrc,
+        uint32_t fftLen,
+  const float16_t * pCoef,
+        uint16_t twidCoefModifier);
+
+void arm_radix2_butterfly_inverse_f16(
+        float16_t * pSrc,
+        uint32_t fftLen,
+  const float16_t * pCoef,
+        uint16_t twidCoefModifier,
+        float16_t onebyfftLen);
+
+extern void arm_bitreversal_f16(
+        float16_t * pSrc,
+        uint16_t fftSize,
+        uint16_t bitRevFactor,
+  const uint16_t * pBitRevTab);
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Radix-2 CFFT/CIFFT.
+  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_f16 and will be removed in the future
+  @param[in]     S    points to an instance of the floating-point Radix-2 CFFT/CIFFT structure
+  @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
+  @return        none
+ */
+
+void arm_cfft_radix2_f16(
+const arm_cfft_radix2_instance_f16 * S,
+      float16_t * pSrc)
+{
+   /* Run the in-place butterfly pass selected by S->ifftFlag, then bit-reverse pSrc if S->bitReverseFlag is set. */
+   if (S->ifftFlag == 1U)
+   {
+      /* Complex IFFT radix-2: the inverse butterflies also scale the result by S->onebyfftLen */
+      arm_radix2_butterfly_inverse_f16(pSrc, S->fftLen, S->pTwiddle,
+      S->twidCoefModifier, S->onebyfftLen);
+   }
+   else
+   {
+      /* Complex FFT radix-2: forward butterflies, no scaling factor is passed */
+      arm_radix2_butterfly_f16(pSrc, S->fftLen, S->pTwiddle,
+      S->twidCoefModifier);
+   }
+
+   if (S->bitReverseFlag == 1U)
+   {
+      /* Bit Reversal of pSrc, delegated to arm_bitreversal_f16 (declared extern above, defined elsewhere) */
+      arm_bitreversal_f16(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+   }
+
+}
+
+
+/**
+  @} end of ComplexFFT group
+ */
+
+
+
+/* ----------------------------------------------------------------------
+** Internal helper function used by the FFTs
+** ------------------------------------------------------------------- */
+
+/*
+* @brief  Core function for the floating-point CFFT butterfly process.
+* @param[in, out] *pSrc            points to the in-place buffer of floating-point data type.
+* @param[in]      fftLen           length of the FFT.
+* @param[in]      *pCoef           points to the twiddle coefficient buffer.
+* @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+* @return none.
+*/
+
+void arm_radix2_butterfly_f16(
+float16_t * pSrc,
+uint32_t fftLen,
+const float16_t * pCoef,
+uint16_t twidCoefModifier)
+{
+   /* Forward radix-2 butterflies computed in place: pSrc[2*n] and pSrc[2*n+1] are used as one complex sample. */
+   uint32_t i, j, k, l;
+   uint32_t n1, n2, ia;
+   float16_t xt, yt, cosVal, sinVal;
+   float16_t p0, p1, p2, p3;
+   float16_t a0, a1;
+
+#if defined (ARM_MATH_DSP)
+
+   /*  Initializations for the first stage */
+   n2 = fftLen >> 1;
+   ia = 0;
+   i = 0;
+
+   // first stage: one butterfly per twiddle factor, pairing sample i with sample i + fftLen/2
+   for (k = n2; k > 0; k--)
+   {
+      cosVal = pCoef[ia * 2];
+      sinVal = pCoef[(ia * 2) + 1];
+
+      /*  Twiddle coefficients index modifier */
+      ia += twidCoefModifier;
+
+      /*  index calculation for the input as, */
+      /*  pSrc[i + 0], pSrc[i + fftLen/2] */
+      l = i + n2;
+
+      /*  Butterfly implementation: (a0, a1) is the sum, (xt, yt) the difference of the two samples */
+      a0 = pSrc[2 * i] + pSrc[2 * l];
+      xt = pSrc[2 * i] - pSrc[2 * l];
+
+      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+      a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+      p0 = xt * cosVal;
+      p1 = yt * sinVal;
+      p2 = yt * cosVal;
+      p3 = xt * sinVal;
+
+      pSrc[2 * i]     = a0;
+      pSrc[2 * i + 1] = a1;
+
+      pSrc[2 * l]     = p0 + p1;   // real part of (xt + j*yt) * (cosVal - j*sinVal)
+      pSrc[2 * l + 1] = p2 - p3;   // imaginary part of the same product
+
+      i++;
+   }                             // first stage loop end
+
+   twidCoefModifier <<= 1U;      // the next stage steps through pCoef twice as fast
+
+   // loop for the middle stages (the last stage is handled separately below)
+   for (k = n2; k > 2; k = k >> 1)
+   {
+      n1 = n2;
+      n2 = n2 >> 1;
+      ia = 0;
+
+      // loop for groups
+      j = 0;
+      do
+      {
+         cosVal = pCoef[ia * 2];
+         sinVal = pCoef[(ia * 2) + 1];
+         ia += twidCoefModifier;
+
+         // loop for butterfly: every n1-th sample starting at j shares this twiddle factor
+         i = j;
+         do
+         {
+            l = i + n2;
+            a0 = pSrc[2 * i] + pSrc[2 * l];
+            xt = pSrc[2 * i] - pSrc[2 * l];
+
+            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+            a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+            p0 = xt * cosVal;
+            p1 = yt * sinVal;
+            p2 = yt * cosVal;
+            p3 = xt * sinVal;
+
+            pSrc[2 * i] = a0;
+            pSrc[2 * i + 1] = a1;
+
+            pSrc[2 * l]     = p0 + p1;
+            pSrc[2 * l + 1] = p2 - p3;
+
+            i += n1;
+         } while ( i < fftLen );                        // butterfly loop end
+         j++;
+      } while ( j < n2);                          // groups loop end
+      twidCoefModifier <<= 1U;
+   }                             // stages loop end
+
+   // last stage: butterflies on adjacent samples i and i + 1, no twiddle multiplication is performed
+   for (i = 0; i < fftLen; i += 2)
+   {
+      a0 = pSrc[2 * i] + pSrc[2 * i + 2];
+      xt = pSrc[2 * i] - pSrc[2 * i + 2];
+
+      yt = pSrc[2 * i + 1] - pSrc[2 * i + 3];
+      a1 = pSrc[2 * i + 3] + pSrc[2 * i + 1];
+
+      pSrc[2 * i] = a0;
+      pSrc[2 * i + 1] = a1;
+      pSrc[2 * i + 2] = xt;
+      pSrc[2 * i + 3] = yt;
+   }                             // butterfly loop end
+
+#else
+
+   n2 = fftLen;
+
+   // loop for stage: without ARM_MATH_DSP every stage, including the last, goes through the twiddle loop
+   for (k = fftLen; k > 1; k = k >> 1)
+   {
+      n1 = n2;
+      n2 = n2 >> 1;
+      ia = 0;
+
+      // loop for groups
+      j = 0;
+      do
+      {
+         cosVal = pCoef[ia * 2];
+         sinVal = pCoef[(ia * 2) + 1];
+         ia += twidCoefModifier;
+
+         // loop for butterfly
+         i = j;
+         do
+         {
+            l = i + n2;
+            a0 = pSrc[2 * i] + pSrc[2 * l];
+            xt = pSrc[2 * i] - pSrc[2 * l];
+
+            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+            a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+            p0 = xt * cosVal;
+            p1 = yt * sinVal;
+            p2 = yt * cosVal;
+            p3 = xt * sinVal;
+
+            pSrc[2 * i] = a0;
+            pSrc[2 * i + 1] = a1;
+
+            pSrc[2 * l]     = p0 + p1;
+            pSrc[2 * l + 1] = p2 - p3;
+
+            i += n1;
+         } while (i < fftLen);
+         j++;
+      } while (j < n2);
+      twidCoefModifier <<= 1U;
+   }
+
+#endif //    #if defined (ARM_MATH_DSP)
+
+}
+
+
+void arm_radix2_butterfly_inverse_f16(
+float16_t * pSrc,
+uint32_t fftLen,
+const float16_t * pCoef,
+uint16_t twidCoefModifier,
+float16_t onebyfftLen)
+{
+   /* Inverse radix-2 butterflies computed in place; the last stage multiplies every output by onebyfftLen. */
+   uint32_t i, j, k, l;
+   uint32_t n1, n2, ia;
+   float16_t xt, yt, cosVal, sinVal;
+   float16_t p0, p1, p2, p3;
+   float16_t a0, a1;
+
+#if defined (ARM_MATH_DSP)
+
+   n2 = fftLen >> 1;
+   ia = 0;
+
+   // first stage: one butterfly per twiddle factor, pairing sample i with sample i + fftLen/2
+   for (i = 0; i < n2; i++)
+   {
+      cosVal = pCoef[ia * 2];
+      sinVal = pCoef[(ia * 2) + 1];
+      ia += twidCoefModifier;
+
+      l = i + n2;
+      a0 = pSrc[2 * i] + pSrc[2 * l];
+      xt = pSrc[2 * i] - pSrc[2 * l];
+
+      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+      a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+      p0 = xt * cosVal;
+      p1 = yt * sinVal;
+      p2 = yt * cosVal;
+      p3 = xt * sinVal;
+
+      pSrc[2 * i] = a0;
+      pSrc[2 * i + 1] = a1;
+
+      pSrc[2 * l]     = p0 - p1;   // real part of (xt + j*yt) * (cosVal + j*sinVal)
+      pSrc[2 * l + 1] = p2 + p3;   // imaginary part of the same product
+   }                             // first stage loop end
+
+   twidCoefModifier <<= 1U;      // the next stage steps through pCoef twice as fast
+
+   // loop for the middle stages (the last stage is handled separately below)
+   for (k = fftLen / 2; k > 2; k = k >> 1)
+   {
+      n1 = n2;
+      n2 = n2 >> 1;
+      ia = 0;
+
+      // loop for groups
+      j = 0;
+      do
+      {
+         cosVal = pCoef[ia * 2];
+         sinVal = pCoef[(ia * 2) + 1];
+         ia += twidCoefModifier;
+
+         // loop for butterfly: every n1-th sample starting at j shares this twiddle factor
+         i = j;
+         do
+         {
+            l = i + n2;
+            a0 = pSrc[2 * i] + pSrc[2 * l];
+            xt = pSrc[2 * i] - pSrc[2 * l];
+
+            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+            a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+            p0 = xt * cosVal;
+            p1 = yt * sinVal;
+            p2 = yt * cosVal;
+            p3 = xt * sinVal;
+
+            pSrc[2 * i] = a0;
+            pSrc[2 * i + 1] = a1;
+
+            pSrc[2 * l]     = p0 - p1;
+            pSrc[2 * l + 1] = p2 + p3;
+
+            i += n1;
+         } while ( i < fftLen );                 // butterfly loop end
+         j++;
+      } while (j < n2);                      // groups loop end
+
+      twidCoefModifier <<= 1U;
+   }                             // stages loop end
+
+   // last stage: butterflies on adjacent samples i and i + 1, scaled by onebyfftLen instead of a twiddle factor
+   for (i = 0; i < fftLen; i += 2)
+   {
+      a0 = pSrc[2 * i] + pSrc[2 * i + 2];
+      xt = pSrc[2 * i] - pSrc[2 * i + 2];
+
+      a1 = pSrc[2 * i + 3] + pSrc[2 * i + 1];
+      yt = pSrc[2 * i + 1] - pSrc[2 * i + 3];
+
+      p0 = a0 * onebyfftLen;
+      p2 = xt * onebyfftLen;
+      p1 = a1 * onebyfftLen;
+      p3 = yt * onebyfftLen;
+
+      pSrc[2 * i] = p0;
+      pSrc[2 * i + 1] = p1;
+      pSrc[2 * i + 2] = p2;
+      pSrc[2 * i + 3] = p3;
+   }                             // butterfly loop end
+
+#else
+
+   n2 = fftLen;
+
+   // loop for stage: stops at k > 2, leaving the final stage to the scaled loop after it
+   for (k = fftLen; k > 2; k = k >> 1)
+   {
+      n1 = n2;
+      n2 = n2 >> 1;
+      ia = 0;
+
+      // loop for groups
+      j = 0;
+      do
+      {
+         cosVal = pCoef[ia * 2];
+         sinVal = pCoef[(ia * 2) + 1];
+         ia = ia + twidCoefModifier;
+
+         // loop for butterfly
+         i = j;
+         do
+         {
+            l = i + n2;
+            a0 = pSrc[2 * i] + pSrc[2 * l];
+            xt = pSrc[2 * i] - pSrc[2 * l];
+
+            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+            a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+            p0 = xt * cosVal;
+            p1 = yt * sinVal;
+            p2 = yt * cosVal;
+            p3 = xt * sinVal;
+
+            pSrc[2 * i] = a0;
+            pSrc[2 * i + 1] = a1;
+
+            pSrc[2 * l]     = p0 - p1;
+            pSrc[2 * l + 1] = p2 + p3;
+
+            i += n1;
+         } while ( i < fftLen );                    // butterfly loop end
+         j++;
+      } while ( j < n2 );                      // groups loop end
+
+      twidCoefModifier = twidCoefModifier << 1U;
+   }                             // stages loop end
+
+   n1 = n2;                      // halve the spacing once more for the final stage
+   n2 = n2 >> 1;
+
+   // final stage: butterflies without twiddle multiplication, results scaled by onebyfftLen
+   for (i = 0; i < fftLen; i += n1)
+   {
+      l = i + n2;
+
+      a0 = pSrc[2 * i] + pSrc[2 * l];
+      xt = pSrc[2 * i] - pSrc[2 * l];
+
+      a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+
+      p0 = a0 * onebyfftLen;
+      p2 = xt * onebyfftLen;
+      p1 = a1 * onebyfftLen;
+      p3 = yt * onebyfftLen;
+
+      pSrc[2 * i] = p0;
+      pSrc[2U * l] = p2;
+
+      pSrc[2 * i + 1] = p1;
+      pSrc[2U * l + 1U] = p3;
+   }                             // butterfly loop end
+
+#endif //      #if defined (ARM_MATH_DSP)
+
+}
+
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c
index f75e329091d4f284e4e1aee224dec317172cfe50..ab218a50bdd340d393629d11b5175b8858cf6362 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_radix2_f32.c
  * Description:  Radix-2 Decimation in Frequency CFFT & CIFFT Floating point processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 void arm_radix2_butterfly_f32(
         float32_t * pSrc,
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f16.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..e021a0ea90fcd006cbdecd8c90822fda4786e781
--- /dev/null
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f16.c
@@ -0,0 +1,214 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix2_init_f16.c
+ * Description:  Radix-2 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+#include "arm_common_tables.h"
+#include "arm_common_tables_f16.h"
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Initialization function for the floating-point CFFT/CIFFT.
+  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_f16 and will be removed in the future.
+  @param[in,out] S              points to an instance of the floating-point CFFT/CIFFT structure
+  @param[in]     fftLen         length of the FFT
+  @param[in]     ifftFlag       flag that selects transform direction
+                   - value = 0: forward transform
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length
+
+  @par           Details
+                   The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed.
+                   Set(=1) ifftFlag for calculation of CIFFT otherwise  CFFT is calculated
+  @par
+                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
+                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+  @par
+                   The parameter <code>fftLen</code> specifies the length of the CFFT/CIFFT process. Supported FFT lengths are 16, 32, 64, 128, 256, 512, 1024, 2048 and 4096.
+  @par
+                   This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+*/
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+arm_status arm_cfft_radix2_init_f16(
+  arm_cfft_radix2_instance_f16 * S,
+  uint16_t fftLen,
+  uint8_t ifftFlag,
+  uint8_t bitReverseFlag)
+{
+   /*  Default to an argument error: this is what is returned when the table guards below compile the body out */
+  arm_status status = ARM_MATH_ARGUMENT_ERROR;
+  
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_4096)
+
+
+  /*  The twiddle table is available: report success unless the switch below rejects fftLen */
+  status = ARM_MATH_SUCCESS;
+
+  /*  Initialise the FFT length */
+  S->fftLen = fftLen;
+
+  /*  Initialise the Twiddle coefficient pointer: always the 4096-point table, strided by twidCoefModifier */
+  S->pTwiddle = (float16_t *) twiddleCoefF16_4096;
+
+  /*  Initialise the Flag for selection of CFFT or CIFFT */
+  S->ifftFlag = ifftFlag;
+
+  /*  Initialise the Flag for calculation Bit reversal or not */
+  S->bitReverseFlag = bitReverseFlag;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
+  /* NOTE(review): if this guard is false the switch is compiled out and status stays ARM_MATH_SUCCESS - confirm intended */
+  /*  Initializations of structure parameters depending on the FFT length */
+  switch (S->fftLen)
+  {
+
+  case 4096U:
+    /*  Initializations of structure parameters for 4096 point FFT */
+
+    /*  Initialise the twiddle coef modifier value */
+    S->twidCoefModifier = 1U;
+    /*  Initialise the bit reversal table modifier */
+    S->bitRevFactor = 1U;
+    /*  Initialise the bit reversal table pointer */
+    S->pBitRevTable = (uint16_t *) armBitRevTable;
+    /*  Initialise the 1/fftLen Value (1/4096) */
+    S->onebyfftLen = 0.000244140625;
+    break;
+
+  case 2048U:
+    /*  Initializations of structure parameters for 2048 point FFT */
+
+    /*  Initialise the twiddle coef modifier value */
+    S->twidCoefModifier = 2U;
+    /*  Initialise the bit reversal table modifier */
+    S->bitRevFactor = 2U;
+    /*  Initialise the bit reversal table pointer */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[1];
+    /*  Initialise the 1/fftLen Value (1/2048) */
+    S->onebyfftLen = 0.00048828125;
+    break;
+
+  case 1024U:
+    /*  Initializations of structure parameters for 1024 point FFT */
+
+    /*  Initialise the twiddle coef modifier value */
+    S->twidCoefModifier = 4U;
+    /*  Initialise the bit reversal table modifier */
+    S->bitRevFactor = 4U;
+    /*  Initialise the bit reversal table pointer */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+    /*  Initialise the 1/fftLen Value (1/1024) */
+    S->onebyfftLen = 0.0009765625f;
+    break;
+
+  case 512U:
+    /*  Initializations of structure parameters for 512 point FFT */
+
+    /*  Initialise the twiddle coef modifier value */
+    S->twidCoefModifier = 8U;
+    /*  Initialise the bit reversal table modifier */
+    S->bitRevFactor = 8U;
+    /*  Initialise the bit reversal table pointer */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[7];
+    /*  Initialise the 1/fftLen Value (1/512) */
+    S->onebyfftLen = 0.001953125;
+    break;
+
+  case 256U:
+    /*  Initializations of structure parameters for 256 point FFT */
+    S->twidCoefModifier = 16U;
+    S->bitRevFactor = 16U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+    S->onebyfftLen = 0.00390625f;   /* 1/256 */
+    break;
+
+  case 128U:
+    /*  Initializations of structure parameters for 128 point FFT */
+    S->twidCoefModifier = 32U;
+    S->bitRevFactor = 32U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[31];
+    S->onebyfftLen = 0.0078125;     /* 1/128 */
+    break;
+
+  case 64U:
+    /*  Initializations of structure parameters for 64 point FFT */
+    S->twidCoefModifier = 64U;
+    S->bitRevFactor = 64U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+    S->onebyfftLen = 0.015625f;     /* 1/64 */
+    break;
+
+  case 32U:
+    /*  Initializations of structure parameters for 32 point FFT */
+    S->twidCoefModifier = 128U;
+    S->bitRevFactor = 128U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[127];
+    S->onebyfftLen = 0.03125;       /* 1/32 */
+    break;
+
+  case 16U:
+    /*  Initializations of structure parameters for 16 point FFT */
+    S->twidCoefModifier = 256U;
+    S->bitRevFactor = 256U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+    S->onebyfftLen = 0.0625f;       /* 1/16 */
+    break;
+
+
+  default:
+    /*  Reporting argument error if fftSize is not valid value; the fields assigned before the switch are left as set */
+    status = ARM_MATH_ARGUMENT_ERROR;
+    break;
+  }
+
+#endif /* bit reversal table guard (ARM_TABLE_BITREV_1024) */
+#endif /* twiddle table guard (ARM_TABLE_TWIDDLECOEF_F16_4096) */
+#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
+  return (status);
+}
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+/**
+  @} end of ComplexFFT group
+ */
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c
index 417ad917015629dc6de6abc8a9ee540c1bf434b2..ae9f29a8a7705e1c2b9ddf6040da1b304fc00cd5 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_radix2_init_f32.c
  * Description:  Radix-2 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 /**
@@ -71,8 +71,15 @@ arm_status arm_cfft_radix2_init_f32(
   uint8_t ifftFlag,
   uint8_t bitReverseFlag)
 {
+   /*  Initialise the default arm status */
+  arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)
+
   /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_SUCCESS;
+  status = ARM_MATH_SUCCESS;
 
   /*  Initialise the FFT length */
   S->fftLen = fftLen;
@@ -86,6 +93,8 @@ arm_status arm_cfft_radix2_init_f32(
   /*  Initialise the Flag for calculation Bit reversal or not */
   S->bitReverseFlag = bitReverseFlag;
 
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)
+
   /*  Initializations of structure parameters depending on the FFT length */
   switch (S->fftLen)
   {
@@ -189,6 +198,9 @@ arm_status arm_cfft_radix2_init_f32(
     break;
   }
 
+#endif
+#endif
+#endif
   return (status);
 }
 
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c
index 3d865d03c30a79084b41e624d25492983d725983..68c99308d94a1919bbd73d0706698826a0793702 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_radix2_init_q15.c
  * Description:  Radix-2 Decimation in Frequency Q15 FFT & IFFT initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 /**
@@ -72,8 +72,15 @@ arm_status arm_cfft_radix2_init_q15(
   uint8_t ifftFlag,
   uint8_t bitReverseFlag)
 {
+   /*  Initialise the default arm status */
+  arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096)
+
   /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_SUCCESS;
+  status = ARM_MATH_SUCCESS;
 
   /*  Initialise the FFT length */
   S->fftLen = fftLen;
@@ -85,6 +92,8 @@ arm_status arm_cfft_radix2_init_q15(
   /*  Initialise the Flag for calculation Bit reversal or not */
   S->bitReverseFlag = bitReverseFlag;
 
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
+
   /*  Initializations of structure parameters depending on the FFT length */
   switch (S->fftLen)
   {
@@ -174,6 +183,9 @@ arm_status arm_cfft_radix2_init_q15(
     break;
   }
 
+#endif
+#endif
+#endif
   return (status);
 }
 
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c
index f4a20d65a8713b91630ee527018fde6e4e0a39ca..73b8a39f557265c965323da826de111d8ab07b74 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_radix2_init_q31.c
  * Description:  Radix-2 Decimation in Frequency Fixed-point CFFT & CIFFT Initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 /**
@@ -72,7 +72,14 @@ arm_status arm_cfft_radix2_init_q31(
   uint8_t bitReverseFlag)
 {
   /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_SUCCESS;
+  arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096)
+
+  /*  Initialise the default arm status */
+  status = ARM_MATH_SUCCESS;
 
   /*  Initialise the FFT length */
   S->fftLen = fftLen;
@@ -86,6 +93,8 @@ arm_status arm_cfft_radix2_init_q31(
   /*  Initialise the Flag for calculation Bit reversal or not */
   S->bitReverseFlag = bitReverseFlag;
 
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
+
   /*  Initializations of Instance structure depending on the FFT length */
   switch (S->fftLen)
   {
@@ -171,6 +180,9 @@ arm_status arm_cfft_radix2_init_q31(
     break;
   }
 
+#endif
+#endif 
+#endif
   return (status);
 }
 
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c
index 2a03b5771c7683ea17a7d25594975fda92dacb51..ca15ea18b79a567208b3eb695a41e69a7330c652 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_radix2_q15.c
  * Description:  Radix-2 Decimation in Frequency CFFT & CIFFT Fixed point processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 void arm_radix2_butterfly_q15(
         q15_t * pSrc,
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c
index 6c79a65b59bfdb21ee99048cbddb93ddbaba077f..996e91d3a76add3ce10a99309e6c81d80c1dacc8 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_radix2_q31.c
  * Description:  Radix-2 Decimation in Frequency CFFT & CIFFT Fixed point processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 void arm_radix2_butterfly_q31(
         q31_t * pSrc,
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f16.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..fad2b2112a45605bb104c68887fb07ff1131d5f6
--- /dev/null
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f16.c
@@ -0,0 +1,1272 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix4_f16.c
+ * Description:  Radix-4 Decimation in Frequency CFFT & CIFFT Floating point processing function
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+extern void arm_bitreversal_f16(
+        float16_t * pSrc,
+        uint16_t fftSize,
+        uint16_t bitRevFactor,
+  const uint16_t * pBitRevTab);
+
+void arm_radix4_butterfly_f16(
+        float16_t * pSrc,
+        uint16_t fftLen,
+  const float16_t * pCoef,
+        uint16_t twidCoefModifier);
+
+void arm_radix4_butterfly_inverse_f16(
+        float16_t * pSrc,
+        uint16_t fftLen,
+  const float16_t * pCoef,
+        uint16_t twidCoefModifier,
+        float16_t onebyfftLen);
+
+
+void arm_cfft_radix4by2_f16(
+    float16_t * pSrc,
+    uint32_t fftLen,
+    const float16_t * pCoef);
+
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/*
+* @brief  Radix-4-by-2 CFFT step: one radix-2 butterfly pass over the buffer, then a radix-4 butterfly on each half.
+* @param[in, out] *pSrc            points to the in-place buffer of floating-point data type (interleaved pairs, 2 values per sample).
+* @param[in]      fftLen           length of the FFT.
+* @param[in]      *pCoef           points to the twiddle coefficient buffer (read as consecutive cos/sin pairs).
+* @note           Both half-length radix-4 passes are called with a twiddle coefficient modifier of 2.
+* @return none.
+*/
+
+void arm_cfft_radix4by2_f16(
+    float16_t * pSrc,
+    uint32_t fftLen,
+    const float16_t * pCoef)
+{
+    uint32_t i, l;
+    uint32_t n2, ia;
+    float16_t xt, yt, cosVal, sinVal;
+    float16_t p0, p1, p2, p3, a0, a1;
+
+    n2 = fftLen >> 1;
+    ia = 0;
+    for (i = 0; i < n2; i++)
+    {
+        cosVal = pCoef[2*ia];
+        sinVal = pCoef[2*ia + 1];
+        ia++;
+
+        l = i + n2;
+
+        /*  Butterfly implementation: sample i is paired with sample i + fftLen/2 */
+        a0 = pSrc[2 * i] + pSrc[2 * l];
+        xt = pSrc[2 * i] - pSrc[2 * l];
+
+        yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+        a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+        p0 = xt * cosVal;
+        p1 = yt * sinVal;
+        p2 = yt * cosVal;
+        p3 = xt * sinVal;
+
+        pSrc[2 * i]     = a0;
+        pSrc[2 * i + 1] = a1;
+
+        pSrc[2 * l]     = p0 + p1;
+        pSrc[2 * l + 1] = p2 - p3;
+
+    }
+
+    /* first half; pCoef is passed as-is because the callee already takes a const pointer */
+    arm_radix4_butterfly_f16(pSrc, n2, pCoef, 2U);
+    /* second half: starts fftLen values (fftLen/2 interleaved samples) further into the buffer */
+    arm_radix4_butterfly_f16(pSrc + fftLen, n2, pCoef, 2U);
+
+}
+
+
+/**
+  @brief         Radix-4 CFFT/CIFFT processing function for float16_t data.
+  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_f16 and will be removed in the future.
+  @param[in]     S    points to an instance of the floating-point Radix-4 CFFT/CIFFT structure
+  @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. The buffer is transformed in-place
+  @return        none
+ */
+
+void arm_cfft_radix4_f16(
+  const arm_cfft_radix4_instance_f16 * S,
+        float16_t * pSrc)
+{
+   /* Forward transform unless the instance is flagged as inverse (ifftFlag == 1) */
+   if (S->ifftFlag != 1U)
+   {
+      arm_radix4_butterfly_f16(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
+   }
+   else
+   {
+      /* The inverse butterfly additionally receives the instance's onebyfftLen value */
+      arm_radix4_butterfly_inverse_f16(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier, S->onebyfftLen);
+   }
+
+   /* Reordering is applied only when bitReverseFlag is exactly 1 */
+   if (S->bitReverseFlag == 1U)
+   {
+      arm_bitreversal_f16(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+   }
+
+}
+
+/**
+  @} end of ComplexFFT group
+ */
+
+/* ----------------------------------------------------------------------
+ * Internal helper function used by the FFTs
+ * ---------------------------------------------------------------------- */
+
+/*
+* @brief  Core function for the forward floating-point (float16_t) radix-4 CFFT butterfly process.
+* @param[in, out] *pSrc            points to the in-place buffer; values 2*i and 2*i+1 are read as the x and y parts of sample i.
+* @param[in]      fftLen           length of the FFT.
+* @param[in]      *pCoef           points to the twiddle coefficient buffer; values 2*ia and 2*ia+1 are read as (co, si).
+* @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+* @return none.
+*/
+
+void arm_radix4_butterfly_f16(
+float16_t * pSrc,
+uint16_t fftLen,
+const float16_t * pCoef,
+uint16_t twidCoefModifier)
+{
+
+   float16_t co1, co2, co3, si1, si2, si3;
+   uint32_t ia1, ia2, ia3;
+   uint32_t i0, i1, i2, i3;
+   uint32_t n1, n2, j, k;
+
+#if defined (ARM_MATH_DSP)
+
+   /* Variant compiled when ARM_MATH_DSP is defined: first, middle and last stages are coded separately */
+
+   float16_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
+   float16_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
+   Ybminusd;
+   float16_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
+   float16_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
+   float16_t *ptr1;
+   float16_t p0,p1,p2,p3,p4,p5;
+   float16_t a0,a1,a2,a3,a4,a5,a6,a7;
+
+   /*  Initializations for the first stage */
+   n2 = fftLen;
+   n1 = n2;
+
+   /* n2 = fftLen/4 */
+   n2 >>= 2U;
+   i0 = 0U;
+   ia1 = 0U;
+
+   j = n2;
+
+   /*  Calculation of first stage */
+   do
+   {
+      /*  index calculation for the input as, */
+      /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+      i1 = i0 + n2;
+      i2 = i1 + n2;
+      i3 = i2 + n2;
+
+      xaIn = pSrc[(2U * i0)];
+      yaIn = pSrc[(2U * i0) + 1U];
+
+      xbIn = pSrc[(2U * i1)];
+      ybIn = pSrc[(2U * i1) + 1U];
+
+      xcIn = pSrc[(2U * i2)];
+      ycIn = pSrc[(2U * i2) + 1U];
+
+      xdIn = pSrc[(2U * i3)];
+      ydIn = pSrc[(2U * i3) + 1U];
+
+      /* xa + xc */
+      Xaplusc = xaIn + xcIn;
+      /* xb + xd */
+      Xbplusd = xbIn + xdIn;
+      /* ya + yc */
+      Yaplusc = yaIn + ycIn;
+      /* yb + yd */
+      Ybplusd = ybIn + ydIn;
+
+      /*  index calculation for the coefficients */
+      ia2 = ia1 + ia1;
+      co2 = pCoef[ia2 * 2U];
+      si2 = pCoef[(ia2 * 2U) + 1U];
+
+      /* xa - xc */
+      Xaminusc = xaIn - xcIn;
+      /* xb - xd */
+      Xbminusd = xbIn - xdIn;
+      /* ya - yc */
+      Yaminusc = yaIn - ycIn;
+      /* yb - yd */
+      Ybminusd = ybIn - ydIn;
+
+      /* xa' = xa + xb + xc + xd */
+      pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+      /* ya' = ya + yb + yc + yd */
+      pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+      /* (xa - xc) + (yb - yd) */
+      Xb12C_out = (Xaminusc + Ybminusd);
+      /* (ya - yc) - (xb - xd) */
+      Yb12C_out = (Yaminusc - Xbminusd);
+      /* (xa + xc) - (xb + xd) */
+      Xc12C_out = (Xaplusc - Xbplusd);
+      /* (ya + yc) - (yb + yd) */
+      Yc12C_out = (Yaplusc - Ybplusd);
+      /* (xa - xc) - (yb - yd) */
+      Xd12C_out = (Xaminusc - Ybminusd);
+      /* (ya - yc) + (xb - xd) */
+      Yd12C_out = (Xbminusd + Yaminusc);
+
+      co1 = pCoef[ia1 * 2U];
+      si1 = pCoef[(ia1 * 2U) + 1U];
+
+      /*  index calculation for the coefficients */
+      ia3 = ia2 + ia1;
+      co3 = pCoef[ia3 * 2U];
+      si3 = pCoef[(ia3 * 2U) + 1U];
+
+      Xb12_out = Xb12C_out * co1;
+      Yb12_out = Yb12C_out * co1;
+      Xc12_out = Xc12C_out * co2;
+      Yc12_out = Yc12C_out * co2;
+      Xd12_out = Xd12C_out * co3;
+      Yd12_out = Yd12C_out * co3;
+
+      /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+      /* p0 is the si1 term; it is added to Xb12_out below */
+      p0 = Yb12C_out * si1;
+      /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+      /* p1 is the si1 term; it is subtracted from Yb12_out below */
+      p1 = Xb12C_out * si1;
+      /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+      /* p2 is the si2 term; it is added to Xc12_out below */
+      p2 = Yc12C_out * si2;
+      /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+      /* p3 is the si2 term; it is subtracted from Yc12_out below */
+      p3 = Xc12C_out * si2;
+      /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+      /* p4 is the si3 term; it is added to Xd12_out below */
+      p4 = Yd12C_out * si3;
+      /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+      /* p5 is the si3 term; it is subtracted from Yd12_out below */
+      p5 = Xd12C_out * si3;
+
+      Xb12_out += p0;
+      Yb12_out -= p1;
+      Xc12_out += p2;
+      Yc12_out -= p3;
+      Xd12_out += p4;
+      Yd12_out -= p5;
+
+      /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+      pSrc[2U * i1] = Xc12_out;
+
+      /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+      pSrc[(2U * i1) + 1U] = Yc12_out;
+
+      /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+      pSrc[2U * i2] = Xb12_out;
+
+      /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+      pSrc[(2U * i2) + 1U] = Yb12_out;
+
+      /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+      pSrc[2U * i3] = Xd12_out;
+
+      /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+      pSrc[(2U * i3) + 1U] = Yd12_out;
+
+      /*  Twiddle coefficients index modifier */
+      ia1 += twidCoefModifier;
+
+      /*  Updating input index */
+      i0++;
+
+   }
+   while (--j);
+
+   twidCoefModifier <<= 2U;
+
+   /*  Calculation of the second stage up to, but excluding, the last stage */
+   for (k = fftLen >> 2U; k > 4U; k >>= 2U)
+   {
+      /*  Initializations for the current stage */
+      n1 = n2;
+      n2 >>= 2U;
+      ia1 = 0U;
+
+      /*  Calculation of the current stage */
+      j = 0;
+      do
+      {
+         /*  index calculation for the coefficients */
+         ia2 = ia1 + ia1;
+         ia3 = ia2 + ia1;
+         co1 = pCoef[ia1 * 2U];
+         si1 = pCoef[(ia1 * 2U) + 1U];
+         co2 = pCoef[ia2 * 2U];
+         si2 = pCoef[(ia2 * 2U) + 1U];
+         co3 = pCoef[ia3 * 2U];
+         si3 = pCoef[(ia3 * 2U) + 1U];
+
+         /*  Twiddle coefficients index modifier */
+         ia1 += twidCoefModifier;
+
+         i0 = j;
+         do
+         {
+            /*  index calculation for the input as, */
+            /*  pSrc[i0 + 0], pSrc[i0 + n2], pSrc[i0 + 2*n2], pSrc[i0 + 3*n2] */
+            i1 = i0 + n2;
+            i2 = i1 + n2;
+            i3 = i2 + n2;
+
+            xaIn = pSrc[(2U * i0)];
+            yaIn = pSrc[(2U * i0) + 1U];
+
+            xbIn = pSrc[(2U * i1)];
+            ybIn = pSrc[(2U * i1) + 1U];
+
+            xcIn = pSrc[(2U * i2)];
+            ycIn = pSrc[(2U * i2) + 1U];
+
+            xdIn = pSrc[(2U * i3)];
+            ydIn = pSrc[(2U * i3) + 1U];
+
+            /* xa - xc */
+            Xaminusc = xaIn - xcIn;
+            /* (xb - xd) */
+            Xbminusd = xbIn - xdIn;
+            /* ya - yc */
+            Yaminusc = yaIn - ycIn;
+            /* (yb - yd) */
+            Ybminusd = ybIn - ydIn;
+
+            /* xa + xc */
+            Xaplusc = xaIn + xcIn;
+            /* xb + xd */
+            Xbplusd = xbIn + xdIn;
+            /* ya + yc */
+            Yaplusc = yaIn + ycIn;
+            /* yb + yd */
+            Ybplusd = ybIn + ydIn;
+
+            /* (xa - xc) + (yb - yd) */
+            Xb12C_out = (Xaminusc + Ybminusd);
+            /* (ya - yc) -  (xb - xd) */
+            Yb12C_out = (Yaminusc - Xbminusd);
+            /* xa + xc -(xb + xd) */
+            Xc12C_out = (Xaplusc - Xbplusd);
+            /* (ya + yc) - (yb + yd) */
+            Yc12C_out = (Yaplusc - Ybplusd);
+            /* (xa - xc) - (yb - yd) */
+            Xd12C_out = (Xaminusc - Ybminusd);
+            /* (ya - yc) +  (xb - xd) */
+            Yd12C_out = (Xbminusd + Yaminusc);
+
+            pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+            pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+            Xb12_out = Xb12C_out * co1;
+            Yb12_out = Yb12C_out * co1;
+            Xc12_out = Xc12C_out * co2;
+            Yc12_out = Yc12C_out * co2;
+            Xd12_out = Xd12C_out * co3;
+            Yd12_out = Yd12C_out * co3;
+
+            /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+            /* p0 is the si1 term; it is added to Xb12_out below */
+            p0 = Yb12C_out * si1;
+            /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+            /* p1 is the si1 term; it is subtracted from Yb12_out below */
+            p1 = Xb12C_out * si1;
+            /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+            /* p2 is the si2 term; it is added to Xc12_out below */
+            p2 = Yc12C_out * si2;
+            /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+            /* p3 is the si2 term; it is subtracted from Yc12_out below */
+            p3 = Xc12C_out * si2;
+            /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+            /* p4 is the si3 term; it is added to Xd12_out below */
+            p4 = Yd12C_out * si3;
+            /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+            /* p5 is the si3 term; it is subtracted from Yd12_out below */
+            p5 = Xd12C_out * si3;
+
+            Xb12_out += p0;
+            Yb12_out -= p1;
+            Xc12_out += p2;
+            Yc12_out -= p3;
+            Xd12_out += p4;
+            Yd12_out -= p5;
+
+            /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+            pSrc[2U * i1] = Xc12_out;
+
+            /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+            pSrc[(2U * i1) + 1U] = Yc12_out;
+
+            /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+            pSrc[2U * i2] = Xb12_out;
+
+            /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+            pSrc[(2U * i2) + 1U] = Yb12_out;
+
+            /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+            pSrc[2U * i3] = Xd12_out;
+
+            /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+            pSrc[(2U * i3) + 1U] = Yd12_out;
+
+            i0 += n1;
+         } while (i0 < fftLen);
+         j++;
+      } while (j <= (n2 - 1U));
+      twidCoefModifier <<= 2U;
+   }
+
+   j = fftLen >> 2;
+   ptr1 = &pSrc[0];
+
+   /*  Calculations of last stage: no twiddle products, 8 consecutive values (4 samples) per pass */
+   do
+   {
+      xaIn = ptr1[0];
+      yaIn = ptr1[1];
+      xbIn = ptr1[2];
+      ybIn = ptr1[3];
+      xcIn = ptr1[4];
+      ycIn = ptr1[5];
+      xdIn = ptr1[6];
+      ydIn = ptr1[7];
+
+      /* xa + xc */
+      Xaplusc = xaIn + xcIn;
+
+      /* xa - xc */
+      Xaminusc = xaIn - xcIn;
+
+      /* ya + yc */
+      Yaplusc = yaIn + ycIn;
+
+      /* ya - yc */
+      Yaminusc = yaIn - ycIn;
+
+      /* xb + xd */
+      Xbplusd = xbIn + xdIn;
+
+      /* yb + yd */
+      Ybplusd = ybIn + ydIn;
+
+      /* (xb-xd) */
+      Xbminusd = xbIn - xdIn;
+
+      /* (yb-yd) */
+      Ybminusd = ybIn - ydIn;
+
+      /* xa' = xa + xb + xc + xd */
+      a0 = (Xaplusc + Xbplusd);
+      /* ya' = ya + yb + yc + yd */
+      a1 = (Yaplusc + Ybplusd);
+      /* xc' = (xa-xb+xc-xd) */
+      a2 = (Xaplusc - Xbplusd);
+      /* yc' = (ya-yb+yc-yd) */
+      a3 = (Yaplusc - Ybplusd);
+      /* xb' = (xa+yb-xc-yd) */
+      a4 = (Xaminusc + Ybminusd);
+      /* yb' = (ya-xb-yc+xd) */
+      a5 = (Yaminusc - Xbminusd);
+      /* xd' = (xa-yb-xc+yd) */
+      a6 = (Xaminusc - Ybminusd);
+      /* yd' = (ya+xb-yc-xd) */
+      a7 = (Xbminusd + Yaminusc);
+
+      ptr1[0] = a0;
+      ptr1[1] = a1;
+      ptr1[2] = a2;
+      ptr1[3] = a3;
+      ptr1[4] = a4;
+      ptr1[5] = a5;
+      ptr1[6] = a6;
+      ptr1[7] = a7;
+
+      /* increment pointer by 8 */
+      ptr1 += 8U;
+   } while (--j);
+
+#else
+
+   float16_t t1, t2, r1, r2, s1, s2;
+
+   /* Variant compiled when ARM_MATH_DSP is not defined: every stage runs through the same loop */
+
+   /*  Initializations for the fft calculation */
+   n2 = fftLen;
+   n1 = n2;
+   for (k = fftLen; k > 1U; k >>= 2U)
+   {
+      /*  Initializations for the fft calculation */
+      n1 = n2;
+      n2 >>= 2U;
+      ia1 = 0U;
+
+      /*  FFT Calculation */
+      j = 0;
+      do
+      {
+         /*  index calculation for the coefficients */
+         ia2 = ia1 + ia1;
+         ia3 = ia2 + ia1;
+         co1 = pCoef[ia1 * 2U];
+         si1 = pCoef[(ia1 * 2U) + 1U];
+         co2 = pCoef[ia2 * 2U];
+         si2 = pCoef[(ia2 * 2U) + 1U];
+         co3 = pCoef[ia3 * 2U];
+         si3 = pCoef[(ia3 * 2U) + 1U];
+
+         /*  Twiddle coefficients index modifier */
+         ia1 = ia1 + twidCoefModifier;
+
+         i0 = j;
+         do
+         {
+            /*  index calculation for the input as, */
+            /*  pSrc[i0 + 0], pSrc[i0 + n2], pSrc[i0 + 2*n2], pSrc[i0 + 3*n2] */
+            i1 = i0 + n2;
+            i2 = i1 + n2;
+            i3 = i2 + n2;
+
+            /* xa + xc */
+            r1 = pSrc[(2U * i0)] + pSrc[(2U * i2)];
+
+            /* xa - xc */
+            r2 = pSrc[(2U * i0)] - pSrc[(2U * i2)];
+
+            /* ya + yc */
+            s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+
+            /* ya - yc */
+            s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+            /* xb + xd */
+            t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+            /* xa' = xa + xb + xc + xd */
+            pSrc[2U * i0] = r1 + t1;
+
+            /* xa + xc -(xb + xd) */
+            r1 = r1 - t1;
+
+            /* yb + yd */
+            t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+
+            /* ya' = ya + yb + yc + yd */
+            pSrc[(2U * i0) + 1U] = s1 + t2;
+
+            /* (ya + yc) - (yb + yd) */
+            s1 = s1 - t2;
+
+            /* (yb - yd) */
+            t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+
+            /* (xb - xd) */
+            t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+            /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+            pSrc[2U * i1] = (r1 * co2) + (s1 * si2);
+
+            /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+            pSrc[(2U * i1) + 1U] = (s1 * co2) - (r1 * si2);
+
+            /* (xa - xc) + (yb - yd) */
+            r1 = r2 + t1;
+
+            /* (xa - xc) - (yb - yd) */
+            r2 = r2 - t1;
+
+            /* (ya - yc) -  (xb - xd) */
+            s1 = s2 - t2;
+
+            /* (ya - yc) +  (xb - xd) */
+            s2 = s2 + t2;
+
+            /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+            pSrc[2U * i2] = (r1 * co1) + (s1 * si1);
+
+            /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+            pSrc[(2U * i2) + 1U] = (s1 * co1) - (r1 * si1);
+
+            /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+            pSrc[2U * i3] = (r2 * co3) + (s2 * si3);
+
+            /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+            pSrc[(2U * i3) + 1U] = (s2 * co3) - (r2 * si3);
+
+            i0 += n1;
+         } while ( i0 < fftLen);
+         j++;
+      } while (j <= (n2 - 1U));
+      twidCoefModifier <<= 2U;
+   }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+
+/*
+* @brief  Core function for the floating-point CIFFT butterfly process.
+* @param[in, out] *pSrc            points to the in-place buffer of floating-point data type.
+* @param[in]      fftLen           length of the FFT.
+* @param[in]      *pCoef           points to twiddle coefficient buffer.
+* @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+* @param[in]      onebyfftLen      value of 1/fftLen.
+* @return none.
+*/
+
+void arm_radix4_butterfly_inverse_f16(
+float16_t * pSrc,
+uint16_t fftLen,
+const float16_t * pCoef,
+uint16_t twidCoefModifier,
+float16_t onebyfftLen)
+{
+   float16_t co1, co2, co3, si1, si2, si3;
+   uint32_t ia1, ia2, ia3;
+   uint32_t i0, i1, i2, i3;
+   uint32_t n1, n2, j, k;
+
+#if defined (ARM_MATH_DSP)
+
+   float16_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
+   float16_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
+   Ybminusd;
+   float16_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
+   float16_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
+   float16_t *ptr1;
+   float16_t p0,p1,p2,p3,p4,p5,p6,p7;
+   float16_t a0,a1,a2,a3,a4,a5,a6,a7;
+
+
+   /*  Initializations for the first stage */
+   n2 = fftLen;
+   n1 = n2;
+
+   /* n2 = fftLen/4 */
+   n2 >>= 2U;
+   i0 = 0U;
+   ia1 = 0U;
+
+   j = n2;
+
+   /*  Calculation of first stage */
+   do
+   {
+      /*  index calculation for the input as, */
+      /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+      i1 = i0 + n2;
+      i2 = i1 + n2;
+      i3 = i2 + n2;
+
+      /*  Butterfly implementation */
+      xaIn = pSrc[(2U * i0)];
+      yaIn = pSrc[(2U * i0) + 1U];
+
+      xcIn = pSrc[(2U * i2)];
+      ycIn = pSrc[(2U * i2) + 1U];
+
+      xbIn = pSrc[(2U * i1)];
+      ybIn = pSrc[(2U * i1) + 1U];
+
+      xdIn = pSrc[(2U * i3)];
+      ydIn = pSrc[(2U * i3) + 1U];
+
+      /* xa + xc */
+      Xaplusc = xaIn + xcIn;
+      /* xb + xd */
+      Xbplusd = xbIn + xdIn;
+      /* ya + yc */
+      Yaplusc = yaIn + ycIn;
+      /* yb + yd */
+      Ybplusd = ybIn + ydIn;
+
+      /*  index calculation for the coefficients */
+      ia2 = ia1 + ia1;
+      co2 = pCoef[ia2 * 2U];
+      si2 = pCoef[(ia2 * 2U) + 1U];
+
+      /* xa - xc */
+      Xaminusc = xaIn - xcIn;
+      /* xb - xd */
+      Xbminusd = xbIn - xdIn;
+      /* ya - yc */
+      Yaminusc = yaIn - ycIn;
+      /* yb - yd */
+      Ybminusd = ybIn - ydIn;
+
+      /* xa' = xa + xb + xc + xd */
+      pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+
+      /* ya' = ya + yb + yc + yd */
+      pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+      /* (xa - xc) - (yb - yd) */
+      Xb12C_out = (Xaminusc - Ybminusd);
+      /* (ya - yc) + (xb - xd) */
+      Yb12C_out = (Yaminusc + Xbminusd);
+      /* (xa + xc) - (xb + xd) */
+      Xc12C_out = (Xaplusc - Xbplusd);
+      /* (ya + yc) - (yb + yd) */
+      Yc12C_out = (Yaplusc - Ybplusd);
+      /* (xa - xc) + (yb - yd) */
+      Xd12C_out = (Xaminusc + Ybminusd);
+      /* (ya - yc) - (xb - xd) */
+      Yd12C_out = (Yaminusc - Xbminusd);
+
+      co1 = pCoef[ia1 * 2U];
+      si1 = pCoef[(ia1 * 2U) + 1U];
+
+      /*  index calculation for the coefficients */
+      ia3 = ia2 + ia1;
+      co3 = pCoef[ia3 * 2U];
+      si3 = pCoef[(ia3 * 2U) + 1U];
+
+      Xb12_out = Xb12C_out * co1;
+      Yb12_out = Yb12C_out * co1;
+      Xc12_out = Xc12C_out * co2;
+      Yc12_out = Yc12C_out * co2;
+      Xd12_out = Xd12C_out * co3;
+      Yd12_out = Yd12C_out * co3;
+
+      /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+      //Xb12_out -= Yb12C_out * si1;
+      p0 = Yb12C_out * si1;
+      /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+      //Yb12_out += Xb12C_out * si1;
+      p1 = Xb12C_out * si1;
+      /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+      //Xc12_out -= Yc12C_out * si2;
+      p2 = Yc12C_out * si2;
+      /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+      //Yc12_out += Xc12C_out * si2;
+      p3 = Xc12C_out * si2;
+      /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+      //Xd12_out -= Yd12C_out * si3;
+      p4 = Yd12C_out * si3;
+      /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+      //Yd12_out += Xd12C_out * si3;
+      p5 = Xd12C_out * si3;
+
+      Xb12_out -= p0;
+      Yb12_out += p1;
+      Xc12_out -= p2;
+      Yc12_out += p3;
+      Xd12_out -= p4;
+      Yd12_out += p5;
+
+      /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+      pSrc[2U * i1] = Xc12_out;
+
+      /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+      pSrc[(2U * i1) + 1U] = Yc12_out;
+
+      /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+      pSrc[2U * i2] = Xb12_out;
+
+      /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+      pSrc[(2U * i2) + 1U] = Yb12_out;
+
+      /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+      pSrc[2U * i3] = Xd12_out;
+
+      /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+      pSrc[(2U * i3) + 1U] = Yd12_out;
+
+      /*  Twiddle coefficients index modifier */
+      ia1 = ia1 + twidCoefModifier;
+
+      /*  Updating input index */
+      i0 = i0 + 1U;
+
+   } while (--j);
+
+   twidCoefModifier <<= 2U;
+
+   /*  Calculation of second stage to excluding last stage */
+   for (k = fftLen >> 2U; k > 4U; k >>= 2U)
+   {
+      /*  Initializations for the first stage */
+      n1 = n2;
+      n2 >>= 2U;
+      ia1 = 0U;
+
+      /*  Calculation of first stage */
+      j = 0;
+      do
+      {
+         /*  index calculation for the coefficients */
+         ia2 = ia1 + ia1;
+         ia3 = ia2 + ia1;
+         co1 = pCoef[ia1 * 2U];
+         si1 = pCoef[(ia1 * 2U) + 1U];
+         co2 = pCoef[ia2 * 2U];
+         si2 = pCoef[(ia2 * 2U) + 1U];
+         co3 = pCoef[ia3 * 2U];
+         si3 = pCoef[(ia3 * 2U) + 1U];
+
+         /*  Twiddle coefficients index modifier */
+         ia1 = ia1 + twidCoefModifier;
+
+         i0 = j;
+         do
+         {
+            /*  index calculation for the input as, */
+            /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+            i1 = i0 + n2;
+            i2 = i1 + n2;
+            i3 = i2 + n2;
+
+            xaIn = pSrc[(2U * i0)];
+            yaIn = pSrc[(2U * i0) + 1U];
+
+            xbIn = pSrc[(2U * i1)];
+            ybIn = pSrc[(2U * i1) + 1U];
+
+            xcIn = pSrc[(2U * i2)];
+            ycIn = pSrc[(2U * i2) + 1U];
+
+            xdIn = pSrc[(2U * i3)];
+            ydIn = pSrc[(2U * i3) + 1U];
+
+            /* xa - xc */
+            Xaminusc = xaIn - xcIn;
+            /* (xb - xd) */
+            Xbminusd = xbIn - xdIn;
+            /* ya - yc */
+            Yaminusc = yaIn - ycIn;
+            /* (yb - yd) */
+            Ybminusd = ybIn - ydIn;
+
+            /* xa + xc */
+            Xaplusc = xaIn + xcIn;
+            /* xb + xd */
+            Xbplusd = xbIn + xdIn;
+            /* ya + yc */
+            Yaplusc = yaIn + ycIn;
+            /* yb + yd */
+            Ybplusd = ybIn + ydIn;
+
+            /* (xa - xc) - (yb - yd) */
+            Xb12C_out = (Xaminusc - Ybminusd);
+            /* (ya - yc) +  (xb - xd) */
+            Yb12C_out = (Yaminusc + Xbminusd);
+            /* xa + xc -(xb + xd) */
+            Xc12C_out = (Xaplusc - Xbplusd);
+            /* (ya + yc) - (yb + yd) */
+            Yc12C_out = (Yaplusc - Ybplusd);
+            /* (xa - xc) + (yb - yd) */
+            Xd12C_out = (Xaminusc + Ybminusd);
+            /* (ya - yc) -  (xb - xd) */
+            Yd12C_out = (Yaminusc - Xbminusd);
+
+            pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+            pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+            Xb12_out = Xb12C_out * co1;
+            Yb12_out = Yb12C_out * co1;
+            Xc12_out = Xc12C_out * co2;
+            Yc12_out = Yc12C_out * co2;
+            Xd12_out = Xd12C_out * co3;
+            Yd12_out = Yd12C_out * co3;
+
+            /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+            //Xb12_out -= Yb12C_out * si1;
+            p0 = Yb12C_out * si1;
+            /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+            //Yb12_out += Xb12C_out * si1;
+            p1 = Xb12C_out * si1;
+            /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+            //Xc12_out -= Yc12C_out * si2;
+            p2 = Yc12C_out * si2;
+            /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+            //Yc12_out += Xc12C_out * si2;
+            p3 = Xc12C_out * si2;
+            /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+            //Xd12_out -= Yd12C_out * si3;
+            p4 = Yd12C_out * si3;
+            /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+            //Yd12_out += Xd12C_out * si3;
+            p5 = Xd12C_out * si3;
+
+            Xb12_out -= p0;
+            Yb12_out += p1;
+            Xc12_out -= p2;
+            Yc12_out += p3;
+            Xd12_out -= p4;
+            Yd12_out += p5;
+
+            /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+            pSrc[2U * i1] = Xc12_out;
+
+            /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+            pSrc[(2U * i1) + 1U] = Yc12_out;
+
+            /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+            pSrc[2U * i2] = Xb12_out;
+
+            /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+            pSrc[(2U * i2) + 1U] = Yb12_out;
+
+            /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+            pSrc[2U * i3] = Xd12_out;
+
+            /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+            pSrc[(2U * i3) + 1U] = Yd12_out;
+
+            i0 += n1;
+         } while (i0 < fftLen);
+         j++;
+      } while (j <= (n2 - 1U));
+      twidCoefModifier <<= 2U;
+   }
+   /*  Initializations of last stage */
+
+   j = fftLen >> 2;
+   ptr1 = &pSrc[0];
+
+   /*  Calculations of last stage */
+   do
+   {
+      xaIn = ptr1[0];
+      yaIn = ptr1[1];
+      xbIn = ptr1[2];
+      ybIn = ptr1[3];
+      xcIn = ptr1[4];
+      ycIn = ptr1[5];
+      xdIn = ptr1[6];
+      ydIn = ptr1[7];
+
+      /*  Butterfly implementation */
+      /* xa + xc */
+      Xaplusc = xaIn + xcIn;
+
+      /* xa - xc */
+      Xaminusc = xaIn - xcIn;
+
+      /* ya + yc */
+      Yaplusc = yaIn + ycIn;
+
+      /* ya - yc */
+      Yaminusc = yaIn - ycIn;
+
+      /* xb + xd */
+      Xbplusd = xbIn + xdIn;
+
+      /* yb + yd */
+      Ybplusd = ybIn + ydIn;
+
+      /* (xb-xd) */
+      Xbminusd = xbIn - xdIn;
+
+      /* (yb-yd) */
+      Ybminusd = ybIn - ydIn;
+
+      /* xa' = (xa+xb+xc+xd) * onebyfftLen */
+      a0 = (Xaplusc + Xbplusd);
+      /* ya' = (ya+yb+yc+yd) * onebyfftLen */
+      a1 = (Yaplusc + Ybplusd);
+      /* xc' = (xa-xb+xc-xd) * onebyfftLen */
+      a2 = (Xaplusc - Xbplusd);
+      /* yc' = (ya-yb+yc-yd) * onebyfftLen  */
+      a3 = (Yaplusc - Ybplusd);
+      /* xb' = (xa-yb-xc+yd) * onebyfftLen */
+      a4 = (Xaminusc - Ybminusd);
+      /* yb' = (ya+xb-yc-xd) * onebyfftLen */
+      a5 = (Yaminusc + Xbminusd);
+      /* xd' = (xa-yb-xc+yd) * onebyfftLen */
+      a6 = (Xaminusc + Ybminusd);
+      /* yd' = (ya-xb-yc+xd) * onebyfftLen */
+      a7 = (Yaminusc - Xbminusd);
+
+      p0 = a0 * onebyfftLen;
+      p1 = a1 * onebyfftLen;
+      p2 = a2 * onebyfftLen;
+      p3 = a3 * onebyfftLen;
+      p4 = a4 * onebyfftLen;
+      p5 = a5 * onebyfftLen;
+      p6 = a6 * onebyfftLen;
+      p7 = a7 * onebyfftLen;
+
+      /* xa' = (xa+xb+xc+xd) * onebyfftLen */
+      ptr1[0] = p0;
+      /* ya' = (ya+yb+yc+yd) * onebyfftLen */
+      ptr1[1] = p1;
+      /* xc' = (xa-xb+xc-xd) * onebyfftLen */
+      ptr1[2] = p2;
+      /* yc' = (ya-yb+yc-yd) * onebyfftLen  */
+      ptr1[3] = p3;
+      /* xb' = (xa-yb-xc+yd) * onebyfftLen */
+      ptr1[4] = p4;
+      /* yb' = (ya+xb-yc-xd) * onebyfftLen */
+      ptr1[5] = p5;
+      /* xd' = (xa-yb-xc+yd) * onebyfftLen */
+      ptr1[6] = p6;
+      /* yd' = (ya-xb-yc+xd) * onebyfftLen */
+      ptr1[7] = p7;
+
+      /* increment source pointer by 8 for next calculations */
+      ptr1 = ptr1 + 8U;
+
+   } while (--j);
+
+#else
+
+   float16_t t1, t2, r1, r2, s1, s2;
+
+   /* Run the below code for Cortex-M0 */
+
+   /*  Initializations for the first stage */
+   n2 = fftLen;
+   n1 = n2;
+
+   /*  Calculation of first stage */
+   for (k = fftLen; k > 4U; k >>= 2U)
+   {
+      /*  Initializations for the first stage */
+      n1 = n2;
+      n2 >>= 2U;
+      ia1 = 0U;
+
+      /*  Calculation of first stage */
+      j = 0;
+      do
+      {
+         /*  index calculation for the coefficients */
+         ia2 = ia1 + ia1;
+         ia3 = ia2 + ia1;
+         co1 = pCoef[ia1 * 2U];
+         si1 = pCoef[(ia1 * 2U) + 1U];
+         co2 = pCoef[ia2 * 2U];
+         si2 = pCoef[(ia2 * 2U) + 1U];
+         co3 = pCoef[ia3 * 2U];
+         si3 = pCoef[(ia3 * 2U) + 1U];
+
+         /*  Twiddle coefficients index modifier */
+         ia1 = ia1 + twidCoefModifier;
+
+         i0 = j;
+         do
+         {
+            /*  index calculation for the input as, */
+            /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+            i1 = i0 + n2;
+            i2 = i1 + n2;
+            i3 = i2 + n2;
+
+            /* xa + xc */
+            r1 = pSrc[(2U * i0)] + pSrc[(2U * i2)];
+
+            /* xa - xc */
+            r2 = pSrc[(2U * i0)] - pSrc[(2U * i2)];
+
+            /* ya + yc */
+            s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+
+            /* ya - yc */
+            s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+            /* xb + xd */
+            t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+            /* xa' = xa + xb + xc + xd */
+            pSrc[2U * i0] = r1 + t1;
+
+            /* xa + xc -(xb + xd) */
+            r1 = r1 - t1;
+
+            /* yb + yd */
+            t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+
+            /* ya' = ya + yb + yc + yd */
+            pSrc[(2U * i0) + 1U] = s1 + t2;
+
+            /* (ya + yc) - (yb + yd) */
+            s1 = s1 - t2;
+
+            /* (yb - yd) */
+            t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+
+            /* (xb - xd) */
+            t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+            /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+            pSrc[2U * i1] = (r1 * co2) - (s1 * si2);
+
+            /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+            pSrc[(2U * i1) + 1U] = (s1 * co2) + (r1 * si2);
+
+            /* (xa - xc) - (yb - yd) */
+            r1 = r2 - t1;
+
+            /* (xa - xc) + (yb - yd) */
+            r2 = r2 + t1;
+
+            /* (ya - yc) +  (xb - xd) */
+            s1 = s2 + t2;
+
+            /* (ya - yc) -  (xb - xd) */
+            s2 = s2 - t2;
+
+            /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+            pSrc[2U * i2] = (r1 * co1) - (s1 * si1);
+
+            /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+            pSrc[(2U * i2) + 1U] = (s1 * co1) + (r1 * si1);
+
+            /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+            pSrc[2U * i3] = (r2 * co3) - (s2 * si3);
+
+            /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+            pSrc[(2U * i3) + 1U] = (s2 * co3) + (r2 * si3);
+
+            i0 += n1;
+         } while ( i0 < fftLen);
+         j++;
+      } while (j <= (n2 - 1U));
+      twidCoefModifier <<= 2U;
+   }
+   /*  Initializations of last stage */
+   n1 = n2;
+   n2 >>= 2U;
+
+   /*  Calculations of last stage */
+   for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
+   {
+      /*  index calculation for the input as, */
+      /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+      i1 = i0 + n2;
+      i2 = i1 + n2;
+      i3 = i2 + n2;
+
+      /*  Butterfly implementation */
+      /* xa + xc */
+      r1 = pSrc[2U * i0] + pSrc[2U * i2];
+
+      /* xa - xc */
+      r2 = pSrc[2U * i0] - pSrc[2U * i2];
+
+      /* ya + yc */
+      s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+
+      /* ya - yc */
+      s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+      /* xc + xd */
+      t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+      /* xa' = xa + xb + xc + xd */
+      pSrc[2U * i0] = (r1 + t1) * onebyfftLen;
+
+      /* (xa + xb) - (xc + xd) */
+      r1 = r1 - t1;
+
+      /* yb + yd */
+      t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+
+      /* ya' = ya + yb + yc + yd */
+      pSrc[(2U * i0) + 1U] = (s1 + t2) * onebyfftLen;
+
+      /* (ya + yc) - (yb + yd) */
+      s1 = s1 - t2;
+
+      /* (yb-yd) */
+      t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+
+      /* (xb-xd) */
+      t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+      /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+      pSrc[2U * i1] = r1 * onebyfftLen;
+
+      /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+      pSrc[(2U * i1) + 1U] = s1 * onebyfftLen;
+
+      /* (xa - xc) - (yb-yd) */
+      r1 = r2 - t1;
+
+      /* (xa - xc) + (yb-yd) */
+      r2 = r2 + t1;
+
+      /* (ya - yc) + (xb-xd) */
+      s1 = s2 + t2;
+
+      /* (ya - yc) - (xb-xd) */
+      s2 = s2 - t2;
+
+      /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+      pSrc[2U * i2] = r1 * onebyfftLen;
+
+      /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+      pSrc[(2U * i2) + 1U] = s1 * onebyfftLen;
+
+      /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+      pSrc[2U * i3] = r2 * onebyfftLen;
+
+      /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+      pSrc[(2U * i3) + 1U] = s2 * onebyfftLen;
+   }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+}
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c
index 96291458eb2caf2e199dc529e1e16c0b1c03e0a6..9d9d4d5638d39525acfecbe40948b1c766e86b55 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_radix4_f32.c
  * Description:  Radix-4 Decimation in Frequency CFFT & CIFFT Floating point processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 extern void arm_bitreversal_f32(
         float32_t * pSrc,
@@ -48,6 +48,8 @@ void arm_radix4_butterfly_inverse_f32(
         float32_t onebyfftLen);
 
 
+
+
 /**
   @ingroup groupTransforms
  */
@@ -57,6 +59,7 @@ void arm_radix4_butterfly_inverse_f32(
   @{
  */
 
+
 /**
   @brief         Processing function for the floating-point Radix-4 CFFT/CIFFT.
   @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future.
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f16.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..cb45cd8075296ffbd2174d413bc0cd54c4bfc584
--- /dev/null
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f16.c
@@ -0,0 +1,171 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix4_init_f16.c
+ * Description:  Radix-4 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+#include "arm_common_tables.h"
+#include "arm_common_tables_f16.h"
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Initialization function for the half-precision floating-point Radix-4 CFFT/CIFFT.
+  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_f16 and will be removed in the future.
+  @param[in,out] S              points to an instance of the floating-point CFFT/CIFFT structure
+  @param[in]     fftLen         length of the FFT
+  @param[in]     ifftFlag       flag that selects transform direction
+                   - value = 0: forward transform
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length, or the required tables are not compiled in
+
+  @par           Details
+                   The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed.
+                   Set <code>ifftFlag</code> to 1 to compute the CIFFT; otherwise the CFFT is computed.
+  @par
+                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
+                   Set <code>bitReverseFlag</code> to 1 for output in normal order; otherwise output is in bit reversed order.
+  @par
+                   The parameter <code>fftLen</code> specifies the length of the CFFT/CIFFT process. Supported FFT lengths are 16, 64, 256, 1024 and 4096.
+  @par
+                   This function also initializes the twiddle factor table pointer and the bit reversal table pointer.
+ */
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+arm_status arm_cfft_radix4_init_f16(
+  arm_cfft_radix4_instance_f16 * S,
+  uint16_t fftLen,
+  uint8_t ifftFlag,
+  uint8_t bitReverseFlag)
+{
+  /*  Stay pessimistic until the per-length parameters have actually been set */
+  arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_4096)
+
+  /*  Record the requested length and the option flags in the instance */
+  S->fftLen = fftLen;
+  S->ifftFlag = ifftFlag;
+  S->bitReverseFlag = bitReverseFlag;
+
+  /*  One twiddle table is used for every supported length */
+  S->pTwiddle = (float16_t *) twiddleCoefF16;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
+
+  /*  Success may only be reported here: without the bit reversal table the switch
+      below is compiled out and the instance would be left half-initialised */
+  status = ARM_MATH_SUCCESS;
+
+  /*  Per-length parameters; both modifiers equal 4096 / fftLen in every case */
+  switch (S->fftLen)
+  {
+
+  case 4096U:
+    /*  Twiddle coef modifier for 4096 point FFT */
+    S->twidCoefModifier = 1U;
+    /*  Bit reversal table modifier */
+    S->bitRevFactor = 1U;
+    /*  Bit reversal table pointer: start of the table */
+    S->pBitRevTable = (uint16_t *) armBitRevTable;
+    /*  1/4096; 'f' suffix keeps the literal single precision like the other cases */
+    S->onebyfftLen = 0.000244140625f;
+    break;
+
+  case 1024U:
+    /*  Twiddle coef modifier for 1024 point FFT */
+    S->twidCoefModifier = 4U;
+    /*  Bit reversal table modifier */
+    S->bitRevFactor = 4U;
+    /*  Bit reversal table pointer: entry (modifier - 1) */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+    /*  1/1024 */
+    S->onebyfftLen = 0.0009765625f;
+    break;
+
+  case 256U:
+    /*  Twiddle coef modifier for 256 point FFT */
+    S->twidCoefModifier = 16U;
+    /*  Bit reversal table modifier */
+    S->bitRevFactor = 16U;
+    /*  Bit reversal table pointer: entry (modifier - 1) */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+    /*  1/256 */
+    S->onebyfftLen = 0.00390625f;
+    break;
+
+  case 64U:
+    /*  Twiddle coef modifier for 64 point FFT */
+    S->twidCoefModifier = 64U;
+    /*  Bit reversal table modifier */
+    S->bitRevFactor = 64U;
+    /*  Bit reversal table pointer: entry (modifier - 1) */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+    /*  1/64 */
+    S->onebyfftLen = 0.015625f;
+    break;
+
+  case 16U:
+    /*  Twiddle coef modifier for 16 point FFT */
+    S->twidCoefModifier = 256U;
+    /*  Bit reversal table modifier */
+    S->bitRevFactor = 256U;
+    /*  Bit reversal table pointer: entry (modifier - 1) */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+    /*  1/16 */
+    S->onebyfftLen = 0.0625f;
+    break;
+
+  default:
+    /*  Reporting argument error if fftLen is not a supported value */
+    status = ARM_MATH_ARGUMENT_ERROR;
+    break;
+  }
+
+#endif
+#endif
+#endif
+  return (status);
+}
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+/**
+  @} end of ComplexFFT group
+ */
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c
index 930c2c1e1e68000248e3841ffb2daeadd3c1d1b7..b3aabbb2daa629039976e857fece3550d6e5c577 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_radix4_init_f32.c
  * Description:  Radix-4 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 /**
@@ -71,8 +71,15 @@ arm_status arm_cfft_radix4_init_f32(
   uint8_t ifftFlag,
   uint8_t bitReverseFlag)
 {
+   /*  Initialise the default arm status */
+  arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)
+
   /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_SUCCESS;
+  status = ARM_MATH_SUCCESS;
 
   /*  Initialise the FFT length */
   S->fftLen = fftLen;
@@ -86,6 +93,8 @@ arm_status arm_cfft_radix4_init_f32(
   /*  Initialise the Flag for calculation Bit reversal or not */
   S->bitReverseFlag = bitReverseFlag;
 
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)
+
   /*  Initializations of structure parameters depending on the FFT length */
   switch (S->fftLen)
   {
@@ -147,7 +156,10 @@ arm_status arm_cfft_radix4_init_f32(
     status = ARM_MATH_ARGUMENT_ERROR;
     break;
   }
-
+#endif
+#endif
+#endif
+  
   return (status);
 }
 
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c
index 0090688ab53509bc42ef1becb120dce130f0f7c2..77742439ee1198cf2586195c006ab5719f3c455f 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_radix4_init_q15.c
  * Description:  Radix-4 Decimation in Frequency Q15 FFT & IFFT initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 /**
@@ -74,7 +74,14 @@ arm_status arm_cfft_radix4_init_q15(
   uint8_t bitReverseFlag)
 {
   /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_SUCCESS;
+  arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096)
+
+  /*  Initialise the default arm status */
+  status = ARM_MATH_SUCCESS;
   /*  Initialise the FFT length */
   S->fftLen = fftLen;
   /*  Initialise the Twiddle coefficient pointer */
@@ -84,6 +91,8 @@ arm_status arm_cfft_radix4_init_q15(
   /*  Initialise the Flag for calculation Bit reversal or not */
   S->bitReverseFlag = bitReverseFlag;
 
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
+
   /*  Initializations of structure parameters depending on the FFT length */
   switch (S->fftLen)
   {
@@ -137,6 +146,9 @@ arm_status arm_cfft_radix4_init_q15(
     break;
   }
 
+#endif
+#endif 
+#endif
   return (status);
 }
 
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c
index 17d16b7c03215311813cbaa587052f636fd61e8f..04ba393ad33b142fde204d549fd123fbc7ba0ca9 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_radix4_init_q31.c
  * Description:  Radix-4 Decimation in Frequency Q31 FFT & IFFT initialization function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 /**
@@ -72,8 +72,16 @@ arm_status arm_cfft_radix4_init_q31(
   uint8_t ifftFlag,
   uint8_t bitReverseFlag)
 {
+
+  /*  Initialise the default arm status */
+  arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096)
+
   /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_SUCCESS;
+  status = ARM_MATH_SUCCESS;
   /*  Initialise the FFT length */
   S->fftLen = fftLen;
   /*  Initialise the Twiddle coefficient pointer */
@@ -83,6 +91,8 @@ arm_status arm_cfft_radix4_init_q31(
   /*  Initialise the Flag for calculation Bit reversal or not */
   S->bitReverseFlag = bitReverseFlag;
 
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
+
   /*  Initializations of Instance structure depending on the FFT length */
   switch (S->fftLen)
   {
@@ -133,6 +143,9 @@ arm_status arm_cfft_radix4_init_q31(
     break;
   }
 
+#endif
+#endif
+#endif
   return (status);
 }
 
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c
index b4cabb1321ddb4b87a902530f7b94a760a2bffb3..159f1a8bb620ebdb873c5b663bc3868fb38cfbdc 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c
@@ -4,13 +4,13 @@
  * Description:  This file has function definition of Radix-4 FFT & IFFT function and
  *               In-place bit reversal using bit reversal table
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -27,7 +27,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 
 void arm_radix4_butterfly_q15(
@@ -260,7 +260,7 @@ void arm_radix4_butterfly_q15(
 
     /* writing the butterfly processed i0 + fftLen/4 sample */
     /* writing output(xc', yc') in little endian format */
-    write_q15x2_ia (&pSi1, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+    write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 ));
 
     /*  Butterfly calculations */
     /* U = packed(yd, xd) */
@@ -300,7 +300,7 @@ void arm_radix4_butterfly_q15(
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 
     /* writing output(xb', yb') in little endian format */
-    write_q15x2_ia (&pSi2, ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF));
+    write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 ));
 
     /* co3 & si3 are read from SIMD Coefficient pointer */
     C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
@@ -319,7 +319,7 @@ void arm_radix4_butterfly_q15(
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 
     /* writing output(xd', yd') in little endian format */
-    write_q15x2_ia (&pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+    write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 ));
 
     /*  Twiddle coefficients index modifier */
     ic = ic + twidCoefModifier;
@@ -417,7 +417,7 @@ void arm_radix4_butterfly_q15(
         /*  writing the butterfly processed i0 + fftLen/4 sample */
         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-        write_q15x2 (pSi1, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+        write_q15x2 (pSi1, __PKHBT( out1, out2, 0 ));
         pSi1 += 2 * n1;
 
         /*  Butterfly calculations */
@@ -454,7 +454,7 @@ void arm_radix4_butterfly_q15(
 
         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
-        write_q15x2 (pSi2, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+        write_q15x2 (pSi2, __PKHBT( out1, out2, 0 ));
         pSi2 += 2 * n1;
 
         /*  Butterfly process for the i0+3fftLen/4 sample */
@@ -469,7 +469,7 @@ void arm_radix4_butterfly_q15(
 
         /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
         /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
-        write_q15x2 (pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+        write_q15x2 (pSi3, __PKHBT( out1, out2, 0 ));
         pSi3 += 2 * n1;
       }
     }
@@ -1126,7 +1126,7 @@ void arm_radix4_butterfly_inverse_q15(
 
     /* writing the butterfly processed i0 + fftLen/4 sample */
     /* writing output(xc', yc') in little endian format */
-    write_q15x2_ia (&pSi1, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+    write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 ));
 
     /*  Butterfly calculations */
     /* U = packed(yd, xd) */
@@ -1166,7 +1166,7 @@ void arm_radix4_butterfly_inverse_q15(
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 
     /* writing output(xb', yb') in little endian format */
-    write_q15x2_ia (&pSi2, ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF));
+    write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 ));
 
     /* co3 & si3 are read from SIMD Coefficient pointer */
     C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
@@ -1185,7 +1185,7 @@ void arm_radix4_butterfly_inverse_q15(
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
 
     /* writing output(xd', yd') in little endian format */
-    write_q15x2_ia (&pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+    write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 ));
 
     /*  Twiddle coefficients index modifier */
     ic = ic + twidCoefModifier;
@@ -1283,7 +1283,7 @@ void arm_radix4_butterfly_inverse_q15(
         /*  writing the butterfly processed i0 + fftLen/4 sample */
         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-        write_q15x2 (pSi1, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+        write_q15x2 (pSi1, __PKHBT( out1, out2, 0 ));
         pSi1 += 2 * n1;
 
         /*  Butterfly calculations */
@@ -1318,7 +1318,7 @@ void arm_radix4_butterfly_inverse_q15(
 
         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
-        write_q15x2 (pSi2, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+        write_q15x2 (pSi2, __PKHBT( out1, out2, 0 ));
         pSi2 += 2 * n1;
 
         /*  Butterfly process for the i0+3fftLen/4 sample */
@@ -1333,7 +1333,7 @@ void arm_radix4_butterfly_inverse_q15(
 
         /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
         /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
-        write_q15x2 (pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+        write_q15x2 (pSi3, __PKHBT( out1, out2, 0 ));
         pSi3 += 2 * n1;
       }
     }
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c
index a9a59dd46eaa7b450e173203b0183095842c67c6..46c1e478d5785f882a1b8a7c5b2f5cbc885afcd5 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c
@@ -4,13 +4,13 @@
  * Description:  This file has function definition of Radix-4 FFT & IFFT function and
  *               In-place bit reversal using bit reversal table
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -27,7 +27,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 void arm_radix4_butterfly_inverse_q31(
         q31_t * pSrc,
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f16.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..79f9311289ccf42fdebf4a685c6bf92b9002fc4b
--- /dev/null
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f16.c
@@ -0,0 +1,289 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix8_f16.c
+ * Description:  Radix-8 Decimation in Frequency CFFT & CIFFT Floating point processing function
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/* ----------------------------------------------------------------------
+ * Internal helper function used by the FFTs
+ * -------------------------------------------------------------------- */
+
+/**
+  brief         Core function for the half-precision floating-point radix-8 CFFT butterfly process.
+  param[in,out] pSrc             points to the in-place buffer of float16_t data, accessed as interleaved pairs (pSrc[2*i], pSrc[2*i+1]).
+  param[in]     fftLen           length of the FFT.
+  param[in]     pCoef            points to the twiddle coefficient buffer, read as interleaved pairs (pCoef[2*i], pCoef[2*i+1]).
+  param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+  return        none
+*/
+
+void arm_radix8_butterfly_f16(
+  float16_t * pSrc,
+  uint16_t fftLen,
+  const float16_t * pCoef,
+  uint16_t twidCoefModifier)
+{
+   uint32_t ia1, ia2, ia3, ia4, ia5, ia6, ia7;   /* twiddle indices: ia(k) = k * id */
+   uint32_t i1, i2, i3, i4, i5, i6, i7, i8;      /* indices of the 8 butterfly inputs, spaced n2 apart */
+   uint32_t id;
+   uint32_t n1, n2, j;
+
+   float16_t r1, r2, r3, r4, r5, r6, r7, r8;
+   float16_t t1, t2;
+   float16_t s1, s2, s3, s4, s5, s6, s7, s8;
+   float16_t p1, p2, p3, p4;
+   float16_t co2, co3, co4, co5, co6, co7, co8;
+   float16_t si2, si3, si4, si5, si6, si7, si8;
+   const float16_t C81 = 0.70710678118f;         /* 1/sqrt(2) */
+
+   n2 = fftLen;
+
+   do
+   {
+      n1 = n2;        /* span of one butterfly group in this stage */
+      n2 = n2 >> 3;   /* spacing between the 8 inputs of a butterfly */
+      i1 = 0;
+
+      do              /* butterflies at offset 0 of every group: no twiddle multiply is applied */
+      {
+         i2 = i1 + n2;
+         i3 = i2 + n2;
+         i4 = i3 + n2;
+         i5 = i4 + n2;
+         i6 = i5 + n2;
+         i7 = i6 + n2;
+         i8 = i7 + n2;
+         r1 = pSrc[2 * i1] + pSrc[2 * i5];
+         r5 = pSrc[2 * i1] - pSrc[2 * i5];
+         r2 = pSrc[2 * i2] + pSrc[2 * i6];
+         r6 = pSrc[2 * i2] - pSrc[2 * i6];
+         r3 = pSrc[2 * i3] + pSrc[2 * i7];
+         r7 = pSrc[2 * i3] - pSrc[2 * i7];
+         r4 = pSrc[2 * i4] + pSrc[2 * i8];
+         r8 = pSrc[2 * i4] - pSrc[2 * i8];
+         t1 = r1 - r3;
+         r1 = r1 + r3;
+         r3 = r2 - r4;
+         r2 = r2 + r4;
+         pSrc[2 * i1] = r1 + r2;
+         pSrc[2 * i5] = r1 - r2;
+         r1 = pSrc[2 * i1 + 1] + pSrc[2 * i5 + 1];   /* r1, r2, r4 are reused here for the odd (2*i+1) elements */
+         s5 = pSrc[2 * i1 + 1] - pSrc[2 * i5 + 1];
+         r2 = pSrc[2 * i2 + 1] + pSrc[2 * i6 + 1];
+         s6 = pSrc[2 * i2 + 1] - pSrc[2 * i6 + 1];
+         s3 = pSrc[2 * i3 + 1] + pSrc[2 * i7 + 1];
+         s7 = pSrc[2 * i3 + 1] - pSrc[2 * i7 + 1];
+         r4 = pSrc[2 * i4 + 1] + pSrc[2 * i8 + 1];
+         s8 = pSrc[2 * i4 + 1] - pSrc[2 * i8 + 1];
+         t2 = r1 - s3;
+         r1 = r1 + s3;
+         s3 = r2 - r4;
+         r2 = r2 + r4;
+         pSrc[2 * i1 + 1] = r1 + r2;
+         pSrc[2 * i5 + 1] = r1 - r2;
+         pSrc[2 * i3]     = t1 + s3;
+         pSrc[2 * i7]     = t1 - s3;
+         pSrc[2 * i3 + 1] = t2 - r3;
+         pSrc[2 * i7 + 1] = t2 + r3;
+         r1 = (r6 - r8) * C81;
+         r6 = (r6 + r8) * C81;
+         r2 = (s6 - s8) * C81;
+         s6 = (s6 + s8) * C81;
+         t1 = r5 - r1;
+         r5 = r5 + r1;
+         r8 = r7 - r6;
+         r7 = r7 + r6;
+         t2 = s5 - r2;
+         s5 = s5 + r2;
+         s8 = s7 - s6;
+         s7 = s7 + s6;
+         pSrc[2 * i2]     = r5 + s7;
+         pSrc[2 * i8]     = r5 - s7;
+         pSrc[2 * i6]     = t1 + s8;
+         pSrc[2 * i4]     = t1 - s8;
+         pSrc[2 * i2 + 1] = s5 - r7;
+         pSrc[2 * i8 + 1] = s5 + r7;
+         pSrc[2 * i6 + 1] = t2 - r8;
+         pSrc[2 * i4 + 1] = t2 + r8;
+
+         i1 += n1;    /* same offset in the next group */
+      } while (i1 < fftLen);
+
+      if (n2 < 8)     /* no twiddled pass once n2 < 8; NOTE(review): presumably fftLen is a power of 8 so n2 == 1 here - confirm against callers */
+         break;
+
+      ia1 = 0;
+      j = 1;          /* offset 0 was handled by the loop above */
+
+      do
+      {
+         /*  index calculation for the coefficients: id = j * twidCoefModifier, ia(k) = k * id */
+         id  = ia1 + twidCoefModifier;
+         ia1 = id;
+         ia2 = ia1 + id;
+         ia3 = ia2 + id;
+         ia4 = ia3 + id;
+         ia5 = ia4 + id;
+         ia6 = ia5 + id;
+         ia7 = ia6 + id;
+
+         co2 = pCoef[2 * ia1];       /* co(k+1)/si(k+1) are the even/odd elements of twiddle pair ia(k) */
+         co3 = pCoef[2 * ia2];
+         co4 = pCoef[2 * ia3];
+         co5 = pCoef[2 * ia4];
+         co6 = pCoef[2 * ia5];
+         co7 = pCoef[2 * ia6];
+         co8 = pCoef[2 * ia7];
+         si2 = pCoef[2 * ia1 + 1];
+         si3 = pCoef[2 * ia2 + 1];
+         si4 = pCoef[2 * ia3 + 1];
+         si5 = pCoef[2 * ia4 + 1];
+         si6 = pCoef[2 * ia5 + 1];
+         si7 = pCoef[2 * ia6 + 1];
+         si8 = pCoef[2 * ia7 + 1];
+
+         i1 = j;
+
+         do
+         {
+            /*  index calculation for the input */
+            i2 = i1 + n2;
+            i3 = i2 + n2;
+            i4 = i3 + n2;
+            i5 = i4 + n2;
+            i6 = i5 + n2;
+            i7 = i6 + n2;
+            i8 = i7 + n2;
+            r1 = pSrc[2 * i1] + pSrc[2 * i5];
+            r5 = pSrc[2 * i1] - pSrc[2 * i5];
+            r2 = pSrc[2 * i2] + pSrc[2 * i6];
+            r6 = pSrc[2 * i2] - pSrc[2 * i6];
+            r3 = pSrc[2 * i3] + pSrc[2 * i7];
+            r7 = pSrc[2 * i3] - pSrc[2 * i7];
+            r4 = pSrc[2 * i4] + pSrc[2 * i8];
+            r8 = pSrc[2 * i4] - pSrc[2 * i8];
+            t1 = r1 - r3;
+            r1 = r1 + r3;
+            r3 = r2 - r4;
+            r2 = r2 + r4;
+            pSrc[2 * i1] = r1 + r2;
+            r2 = r1 - r2;
+            s1 = pSrc[2 * i1 + 1] + pSrc[2 * i5 + 1];
+            s5 = pSrc[2 * i1 + 1] - pSrc[2 * i5 + 1];
+            s2 = pSrc[2 * i2 + 1] + pSrc[2 * i6 + 1];
+            s6 = pSrc[2 * i2 + 1] - pSrc[2 * i6 + 1];
+            s3 = pSrc[2 * i3 + 1] + pSrc[2 * i7 + 1];
+            s7 = pSrc[2 * i3 + 1] - pSrc[2 * i7 + 1];
+            s4 = pSrc[2 * i4 + 1] + pSrc[2 * i8 + 1];
+            s8 = pSrc[2 * i4 + 1] - pSrc[2 * i8 + 1];
+            t2 = s1 - s3;
+            s1 = s1 + s3;
+            s3 = s2 - s4;
+            s2 = s2 + s4;
+            r1 = t1 + s3;
+            t1 = t1 - s3;
+            pSrc[2 * i1 + 1] = s1 + s2;
+            s2 = s1 - s2;
+            s1 = t2 - r3;
+            t2 = t2 + r3;
+            p1 = co5 * r2;
+            p2 = si5 * s2;
+            p3 = co5 * s2;
+            p4 = si5 * r2;
+            pSrc[2 * i5]     = p1 + p2;   /* real part of (r2 + j*s2) * (co5 - j*si5) */
+            pSrc[2 * i5 + 1] = p3 - p4;   /* imaginary part of the same product; the blocks below repeat this pattern */
+            p1 = co3 * r1;
+            p2 = si3 * s1;
+            p3 = co3 * s1;
+            p4 = si3 * r1;
+            pSrc[2 * i3]     = p1 + p2;
+            pSrc[2 * i3 + 1] = p3 - p4;
+            p1 = co7 * t1;
+            p2 = si7 * t2;
+            p3 = co7 * t2;
+            p4 = si7 * t1;
+            pSrc[2 * i7]     = p1 + p2;
+            pSrc[2 * i7 + 1] = p3 - p4;
+            r1 = (r6 - r8) * C81;
+            r6 = (r6 + r8) * C81;
+            s1 = (s6 - s8) * C81;
+            s6 = (s6 + s8) * C81;
+            t1 = r5 - r1;
+            r5 = r5 + r1;
+            r8 = r7 - r6;
+            r7 = r7 + r6;
+            t2 = s5 - s1;
+            s5 = s5 + s1;
+            s8 = s7 - s6;
+            s7 = s7 + s6;
+            r1 = r5 + s7;
+            r5 = r5 - s7;
+            r6 = t1 + s8;
+            t1 = t1 - s8;
+            s1 = s5 - r7;
+            s5 = s5 + r7;
+            s6 = t2 - r8;
+            t2 = t2 + r8;
+            p1 = co2 * r1;
+            p2 = si2 * s1;
+            p3 = co2 * s1;
+            p4 = si2 * r1;
+            pSrc[2 * i2]     = p1 + p2;
+            pSrc[2 * i2 + 1] = p3 - p4;
+            p1 = co8 * r5;
+            p2 = si8 * s5;
+            p3 = co8 * s5;
+            p4 = si8 * r5;
+            pSrc[2 * i8]     = p1 + p2;
+            pSrc[2 * i8 + 1] = p3 - p4;
+            p1 = co6 * r6;
+            p2 = si6 * s6;
+            p3 = co6 * s6;
+            p4 = si6 * r6;
+            pSrc[2 * i6]     = p1 + p2;
+            pSrc[2 * i6 + 1] = p3 - p4;
+            p1 = co4 * t1;
+            p2 = si4 * t2;
+            p3 = co4 * t2;
+            p4 = si4 * t1;
+            pSrc[2 * i4]     = p1 + p2;
+            pSrc[2 * i4 + 1] = p3 - p4;
+
+            i1 += n1;   /* same offset j in the next group */
+         } while (i1 < fftLen);
+
+         j++;
+      } while (j < n2);
+
+      twidCoefModifier <<= 3;   /* the twiddle table stride grows 8x for the next stage */
+   } while (n2 > 7);
+}
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c
index 50048f67d7d71939012f272a651f31e72bf3c286..7990d4797c06d63a462c1b797b53eb59e72c59a5 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_cfft_radix8_f32.c
  * Description:  Radix-8 Decimation in Frequency CFFT & CIFFT Floating point processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 
 /* ----------------------------------------------------------------------
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c b/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c
index 87455dc0fd8daaf7f496c4d8a8a9e77a9ffde4db..b9dff3fb7ed5a3a5617dc4e9226e3d948be34918 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_dct4_f32.c
  * Description:  Processing function of DCT4 & IDCT4 F32
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 /**
   @ingroup groupTransforms
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_f32.c b/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_f32.c
index 42b9df0e9496381a23ec0af620bf6e1fccc2bd25..e1d80c0e42d6b457ea3c9c6c9bc1f53a84dd3dd5 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_f32.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_dct4_init_f32.c
  * Description:  Initialization function of DCT-4 & IDCT4 F32
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 /**
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q15.c b/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q15.c
index 52ba8cc725557ae07aeb79a02f25059ad2875538..5390da3c84b67609d6836ab43205452a83f2da5e 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q15.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_dct4_init_q15.c
  * Description:  Initialization function of DCT-4 & IDCT4 Q15
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 /**
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q31.c b/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q31.c
index 954ebcf331d55204828a4cfdf4a976448e9c470d..4c7622a1bab423607034fa53106a166ce36f6130 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q31.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_dct4_init_q31.c
  * Description:  Initialization function of DCT-4 & IDCT4 Q31
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 /**
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c b/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c
index f926a1d19158cd2069b46518915781ac6f239d14..a4650da273e3409516a13b6c9ecce347f829e977 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_dct4_q15.c
  * Description:  Processing function of DCT4 & IDCT4 Q15
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 /**
   @addtogroup DCT4_IDCT4
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c b/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c
index 369a5c3ae87ff34c3fc48d6fda186f334f1178c3..6cbccff81fbc3deeac70ee36ebf96e4ededf73d7 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_dct4_q31.c
  * Description:  Processing function of DCT4 & IDCT4 Q31
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 /**
   @addtogroup DCT4_IDCT4
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_f32.c b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_f32.c
index a1fc70ffdc60642cdb2655117fc29f45cfb522a5..8844b73a71d32595ddee66c5fa3eca6787b12dc7 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_f32.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_rfft_f32.c
  * Description:  RFFT & RIFFT Floating point process function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 /* ----------------------------------------------------------------------
  * Internal functions prototypes
@@ -85,6 +85,13 @@ void arm_split_rifft_f32(
   @param[in]     pSrc points to the input buffer
   @param[out]    pDst points to the output buffer
   @return        none
+
+  @par
+                   For the RIFFT, the source buffer must have a length of at least
+                   fftLenReal + 2.
+                   The last two elements must be equal to what would be generated
+                   by the RFFT:
+                     (pSrc[0] - pSrc[1]) and 0.0f
  */
 
 void arm_rfft_f32(
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f16.c b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..f5e6f15c9791ab0f0c0f39121faada992af36bb1
--- /dev/null
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f16.c
@@ -0,0 +1,612 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rfft_fast_f16.c
+ * Description:  RFFT & RIFFT Floating point process function
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+#include "arm_common_tables_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void stage_rfft_f16(
+  const arm_rfft_fast_instance_f16 * S,
+        float16_t * p,
+        float16_t * pOut)
+{   /* MVE path of the forward split step: combines samples from the front of p with their mirrors from the back using the RFFT twiddles; 8 complex values per vector iteration, then a scalar tail. */
+        int32_t  k;                                /* fftLen - 1: complex samples left after the first one */
+        float16_t twR, twI;                         /* RFFT Twiddle coefficients */
+  const float16_t * pCoeff = S->pTwiddleRFFT;       /* Points to RFFT Twiddle factors */
+        float16_t *pA = p;                          /* increasing pointer */
+        float16_t *pB = p;                          /* decreasing pointer */
+        float16_t xAR, xAI, xBR, xBI;               /* the two elements of the pA sample and of the pB sample */
+        float16_t t1a, t1b;                         /* temporary variables */
+        float16_t p0, p1, p2, p3;                   /* twiddle products used by the scalar tail */
+
+        float16x8x2_t tw,xA,xB;
+        float16x8x2_t tmp1, tmp2, res;
+
+        uint16x8_t     vecStridesBkwd;
+
+        vecStridesBkwd = vddupq_u16((uint16_t)14, 2);   /* lane offsets 14, 12, ..., 0: gathers 8 pairs in reverse order */
+
+
+        int blockCnt;
+
+
+   k = (S->Sint).fftLen - 1;
+
+   /* Pack first and last sample of the frequency domain together */
+
+   xBR = pB[0];
+   xBI = pB[1];
+   xAR = pA[0];
+   xAI = pA[1];
+
+   twR = *pCoeff++ ;   /* twR/twI are not used for this first pair; the reads only step pCoeff past it */
+   twI = *pCoeff++ ;
+
+   // U1 = XA(1) + XB(1); % It is real
+   t1a = xBR + xAR  ;
+
+   // U2 = XB(1) - XA(1); % It is imaginary
+   t1b = xBI + xAI  ;
+
+   // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+   // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+   *pOut++ = 0.5f * ( t1a + t1b );
+   *pOut++ = 0.5f * ( t1a - t1b );
+
+   // XA(1) = 1/2*( U1 - imag(U2) +  i*( U1 +imag(U2) ));
+   pB  = p + 2*k - 14;   /* pB[0..15] now covers the last 8 complex samples of p */
+   pA += 2;
+
+   blockCnt = k >> 3;    /* number of full 8-sample vector iterations */
+   while (blockCnt > 0)
+   {
+      /*
+         function X = my_split_rfft(X, ifftFlag)
+         % X is a series of real numbers
+         L  = length(X);
+         XC = X(1:2:end) +i*X(2:2:end);
+         XA = fft(XC);
+         XB = conj(XA([1 end:-1:2]));
+         TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
+         for l = 2:L/2
+            XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
+         end
+         XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
+         X = XA;
+      */
+
+
+      xA = vld2q_f16(pA);   /* de-interleaving load of 8 pairs: val[0] = even elements, val[1] = odd elements */
+      pA += 16;
+
+      xB = vld2q_f16(pB);   /* NOTE(review): both xB.val[] members are overwritten by the gathers below, so this load looks redundant */
+
+      xB.val[0] = vldrhq_gather_shifted_offset_f16(pB, vecStridesBkwd);
+      xB.val[1] = vldrhq_gather_shifted_offset_f16(&pB[1], vecStridesBkwd);
+
+      xB.val[1] = vnegq_f16(xB.val[1]);   /* negate the odd elements of the mirrored samples (the conj() of the reference above) */
+      pB -= 16;
+
+
+      tw = vld2q_f16(pCoeff);
+      pCoeff += 16;
+
+
+      tmp1.val[0] = vaddq_f16(xA.val[0],xB.val[0]);   /* tmp1 = xA + xB */
+      tmp1.val[1] = vaddq_f16(xA.val[1],xB.val[1]);
+
+      tmp2.val[0] = vsubq_f16(xB.val[0],xA.val[0]);   /* tmp2 = xB - xA */
+      tmp2.val[1] = vsubq_f16(xB.val[1],xA.val[1]);
+
+      res.val[0] = vmulq(tw.val[0], tmp2.val[0]);     /* res = tw * tmp2 (complex product) */
+      res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
+
+      res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
+      res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
+
+      res.val[0] = vaddq_f16(res.val[0],tmp1.val[0] );   /* res += tmp1 */
+      res.val[1] = vaddq_f16(res.val[1],tmp1.val[1] );
+
+      res.val[0] = vmulq_n_f16(res.val[0], 0.5f);        /* res *= 1/2 */
+      res.val[1] = vmulq_n_f16(res.val[1], 0.5f);
+
+
+      vst2q_f16(pOut, res);   /* store re-interleaved */
+      pOut += 16;
+
+    
+      blockCnt--;
+   } 
+
+   pB += 14;             /* undo the -14 bias: pB addresses a single complex sample again */
+   blockCnt = k & 7;     /* samples left over after the vector loop */
+   while (blockCnt > 0)
+   {
+      /*
+         function X = my_split_rfft(X, ifftFlag)
+         % X is a series of real numbers
+         L  = length(X);
+         XC = X(1:2:end) +i*X(2:2:end);
+         XA = fft(XC);
+         XB = conj(XA([1 end:-1:2]));
+         TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
+         for l = 2:L/2
+            XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
+         end
+         XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
+         X = XA;
+      */
+
+      xBI = pB[1];
+      xBR = pB[0];
+      xAR = pA[0];
+      xAI = pA[1];
+
+      twR = *pCoeff++;
+      twI = *pCoeff++;
+
+      t1a = xBR - xAR ;
+      t1b = xBI + xAI ;
+
+      // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+      // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+      p0 = twR * t1a;
+      p1 = twI * t1a;
+      p2 = twR * t1b;
+      p3 = twI * t1b;
+
+      *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
+      *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
+
+      pA += 2;
+      pB -= 2;
+      blockCnt--;
+   }
+}
+
+/* Prepares data for inverse cfft (MVE path): reads the spectrum in p and writes the merged complex sequence to pOut. */
+void merge_rfft_f16(
+  const arm_rfft_fast_instance_f16 * S,
+        float16_t * p,
+        float16_t * pOut)
+{
+        int32_t  k;                                /* fftLen - 1: complex samples left after the first one */
+        float16_t twR, twI;                         /* RFFT Twiddle coefficients */
+  const float16_t *pCoeff = S->pTwiddleRFFT;        /* Points to RFFT Twiddle factors */
+        float16_t *pA = p;                          /* increasing pointer */
+        float16_t *pB = p;                          /* decreasing pointer */
+        float16_t xAR, xAI, xBR, xBI;               /* the two elements of the pA sample and of the pB sample */
+        float16_t t1a, t1b, r, s, t, u;             /* temporary variables */
+
+        float16x8x2_t tw,xA,xB;
+        float16x8x2_t tmp1, tmp2, res;
+        uint16x8_t     vecStridesBkwd;
+
+        vecStridesBkwd = vddupq_u16((uint16_t)14, 2);   /* lane offsets 14, 12, ..., 0: gathers 8 pairs in reverse order */
+
+        int blockCnt;
+        
+
+   k = (S->Sint).fftLen - 1;
+
+   xAR = pA[0];
+   xAI = pA[1];
+
+   pCoeff += 2 ;          /* the first twiddle pair is not used: the first output pair needs only p[0] and p[1] */
+
+   *pOut++ = 0.5f * ( xAR + xAI );
+   *pOut++ = 0.5f * ( xAR - xAI );
+
+   pB  =  p + 2*k - 14;   /* pB[0..15] now covers the last 8 complex samples of p */
+   pA +=  2    ;
+
+   blockCnt = k >> 3;     /* number of full 8-sample vector iterations */
+   while (blockCnt > 0)
+   {
+      /* G is half of the frequency complex spectrum */
+      //for k = 2:N
+      //    Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
+      xA = vld2q_f16(pA);   /* de-interleaving load of 8 pairs: val[0] = even elements, val[1] = odd elements */
+      pA += 16;
+
+      xB = vld2q_f16(pB);   /* NOTE(review): both xB.val[] members are overwritten by the gathers below, so this load looks redundant */
+
+      xB.val[0] = vldrhq_gather_shifted_offset_f16(pB, vecStridesBkwd);
+      xB.val[1] = vldrhq_gather_shifted_offset_f16(&pB[1], vecStridesBkwd);
+
+      xB.val[1] = vnegq_f16(xB.val[1]);   /* negate the odd elements of the mirrored samples (conjugate) */
+      pB -= 16;
+
+
+      tw = vld2q_f16(pCoeff);
+      tw.val[1] = vnegq_f16(tw.val[1]);   /* negate the odd twiddle elements (conjugate twiddle) */
+      pCoeff += 16;
+
+
+      tmp1.val[0] = vaddq_f16(xA.val[0],xB.val[0]);   /* tmp1 = xA + xB */
+      tmp1.val[1] = vaddq_f16(xA.val[1],xB.val[1]);
+
+      tmp2.val[0] = vsubq_f16(xB.val[0],xA.val[0]);   /* tmp2 = xB - xA */
+      tmp2.val[1] = vsubq_f16(xB.val[1],xA.val[1]);
+
+      res.val[0] = vmulq(tw.val[0], tmp2.val[0]);     /* res = tw * tmp2 (complex product) */
+      res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
+
+      res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
+      res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
+
+      res.val[0] = vaddq_f16(res.val[0],tmp1.val[0] );   /* res += tmp1 */
+      res.val[1] = vaddq_f16(res.val[1],tmp1.val[1] );
+
+      res.val[0] = vmulq_n_f16(res.val[0], 0.5f);        /* res *= 1/2 */
+      res.val[1] = vmulq_n_f16(res.val[1], 0.5f);
+
+
+      vst2q_f16(pOut, res);   /* store re-interleaved */
+      pOut += 16;
+
+    
+      blockCnt--;
+   }
+
+   pB += 14;              /* undo the -14 bias: pB addresses a single complex sample again */
+   blockCnt = k & 7;      /* samples left over after the vector loop */
+   while (blockCnt > 0)
+   {
+      /* G is half of the frequency complex spectrum */
+      //for k = 2:N
+      //    Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
+      xBI =   pB[1]    ;
+      xBR =   pB[0]    ;
+      xAR =  pA[0];
+      xAI =  pA[1];
+
+      twR = *pCoeff++;
+      twI = *pCoeff++;
+
+      t1a = xAR - xBR ;
+      t1b = xAI + xBI ;
+
+      r = twR * t1a;
+      s = twI * t1b;
+      t = twI * t1a;
+      u = twR * t1b;
+
+      // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
+      // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
+      *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR
+      *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI
+
+      pA += 2;
+      pB -= 2;
+      blockCnt--;
+   }
+
+}
+#else
+void stage_rfft_f16(
+  const arm_rfft_fast_instance_f16 * S,
+        float16_t * p,
+        float16_t * pOut)
+{   /* Scalar path of the forward split step: combines each sample pA[] from the front of p with its mirror pB[] from the back using the RFFT twiddles, writing to pOut. */
+        int32_t  k;                                /* fftLen - 1: complex samples left after the first one */
+        float16_t twR, twI;                         /* RFFT Twiddle coefficients */
+  const float16_t * pCoeff = S->pTwiddleRFFT;       /* Points to RFFT Twiddle factors */
+        float16_t *pA = p;                          /* increasing pointer */
+        float16_t *pB = p;                          /* decreasing pointer */
+        float16_t xAR, xAI, xBR, xBI;               /* the two elements of the pA sample and of the pB sample */
+        float16_t t1a, t1b;                         /* temporary variables */
+        float16_t p0, p1, p2, p3;                   /* twiddle products */
+
+
+   k = (S->Sint).fftLen - 1;
+
+   /* Pack first and last sample of the frequency domain together */
+
+   xBR = pB[0];
+   xBI = pB[1];
+   xAR = pA[0];
+   xAI = pA[1];
+
+   twR = *pCoeff++ ;   /* twR/twI are not used for this first pair; the reads only step pCoeff past it */
+   twI = *pCoeff++ ;
+
+
+   // U1 = XA(1) + XB(1); % It is real
+   t1a = xBR + xAR  ;
+
+   // U2 = XB(1) - XA(1); % It is imaginary
+   t1b = xBI + xAI  ;
+
+   // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+   // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+   *pOut++ = 0.5f * ( t1a + t1b );
+   *pOut++ = 0.5f * ( t1a - t1b );
+
+   // XA(1) = 1/2*( U1 - imag(U2) +  i*( U1 +imag(U2) ));
+   pB  = p + 2*k;   /* last complex sample of p */
+   pA += 2;         /* second complex sample of p */
+
+   do
+   {
+      /*
+         function X = my_split_rfft(X, ifftFlag)
+         % X is a series of real numbers
+         L  = length(X);
+         XC = X(1:2:end) +i*X(2:2:end);
+         XA = fft(XC);
+         XB = conj(XA([1 end:-1:2]));
+         TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
+         for l = 2:L/2
+            XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
+         end
+         XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
+         X = XA;
+      */
+
+      xBI = pB[1];
+      xBR = pB[0];
+      xAR = pA[0];
+      xAI = pA[1];
+
+      twR = *pCoeff++;
+      twI = *pCoeff++;
+
+      t1a = xBR - xAR ;
+      t1b = xBI + xAI ;
+
+      // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+      // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+      p0 = twR * t1a;
+      p1 = twI * t1a;
+      p2 = twR * t1b;
+      p3 = twI * t1b;
+
+      *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
+      *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
+
+
+      pA += 2;
+      pB -= 2;
+      k--;
+   } while (k > 0);   /* NOTE(review): post-tested, so the body runs once even if k starts at 0 (fftLen == 1); merge_rfft_f16 uses a pre-tested while */
+}
+
+/* Prepares data for inverse cfft (scalar path): reads the spectrum in p and writes the merged complex sequence to pOut. */
+void merge_rfft_f16(
+  const arm_rfft_fast_instance_f16 * S,
+        float16_t * p,
+        float16_t * pOut)
+{
+        int32_t  k;                                /* fftLen - 1: complex samples left after the first one */
+        float16_t twR, twI;                         /* RFFT Twiddle coefficients */
+  const float16_t *pCoeff = S->pTwiddleRFFT;        /* Points to RFFT Twiddle factors */
+        float16_t *pA = p;                          /* increasing pointer */
+        float16_t *pB = p;                          /* decreasing pointer */
+        float16_t xAR, xAI, xBR, xBI;               /* the two elements of the pA sample and of the pB sample */
+        float16_t t1a, t1b, r, s, t, u;             /* temporary variables */
+
+   k = (S->Sint).fftLen - 1;
+
+   xAR = pA[0];
+   xAI = pA[1];
+
+   pCoeff += 2 ;   /* the first twiddle pair is not used: the first output pair needs only p[0] and p[1] */
+
+   *pOut++ = 0.5f * ( xAR + xAI );
+   *pOut++ = 0.5f * ( xAR - xAI );
+
+   pB  =  p + 2*k ;   /* last complex sample of p */
+   pA +=  2	   ;
+
+   while (k > 0)
+   {
+      /* G is half of the frequency complex spectrum */
+      //for k = 2:N
+      //    Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
+      xBI =   pB[1]    ;
+      xBR =   pB[0]    ;
+      xAR =  pA[0];
+      xAI =  pA[1];
+
+      twR = *pCoeff++;
+      twI = *pCoeff++;
+
+      t1a = xAR - xBR ;
+      t1b = xAI + xBI ;
+
+      r = twR * t1a;   /* the four cross products of (twR, twI) with (t1a, t1b) */
+      s = twI * t1b;
+      t = twI * t1a;
+      u = twR * t1b;
+
+      // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
+      // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
+      *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR
+      *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI
+
+      pA += 2;   /* pA walks forward and pB backward, one complex sample per iteration */
+      pB -= 2;
+      k--;
+   }
+
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @ingroup groupTransforms
+*/
+
+/**
+  @defgroup RealFFT Real FFT Functions
+ 
+  @par
+                   The CMSIS DSP library includes specialized algorithms for computing the
+                   FFT of real data sequences.  The FFT is defined over complex data but
+                   in many applications the input is real.  Real FFT algorithms take advantage
+                   of the symmetry properties of the FFT and have a speed advantage over complex
+                   algorithms of the same length.
+  @par
+                   The Fast RFFT algorithm relies on the mixed radix CFFT that saves processor usage.
+  @par
+                   The real length N forward FFT of a sequence is computed using the steps shown below.
+  @par
+                   \image html RFFT.gif "Real Fast Fourier Transform"
+  @par
+                   The real sequence is initially treated as if it were complex to perform a CFFT.
+                   Later, a processing stage reshapes the data to obtain half of the frequency spectrum
+                   in complex format. Except the first complex number that contains the two real numbers
+                   X[0] and X[N/2] all the data is complex. In other words, the first complex sample
+                   contains two real values packed.
+  @par
+                   The input for the inverse RFFT should keep the same format as the output of the
+                   forward RFFT. A first processing stage pre-processes the data to later perform an
+                   inverse CFFT.
+  @par
+                   \image html RIFFT.gif "Real Inverse Fast Fourier Transform"
+  @par
+                   The algorithms for floating-point, Q15, and Q31 data are slightly different
+                   and we describe each algorithm in turn.
+  @par           Floating-point
+                   The main functions are \ref arm_rfft_fast_f16() and \ref arm_rfft_fast_init_f16().
+                   
+  @par
+                   The FFT of a real N-point sequence has conjugate symmetry in the frequency domain.
+                   The second half of the data equals the conjugate of the first half flipped in frequency. 
+                   Looking at the data, we see that we can uniquely represent the FFT using only N/2 complex numbers.
+                   These are packed into the output array in alternating real and imaginary components:
+  @par
+                   X = { real[0], imag[0], real[1], imag[1], real[2], imag[2] ...
+                   real[(N/2)-1], imag[(N/2)-1] }
+  @par
+                   Because the FFT of a real sequence is purely real at X[0] and at X[N/2], the first
+                   complex slot packs both: real[0] holds the DC offset X[0] and imag[0] holds X[N/2].
+                   (real[1], imag[1]) is the fundamental frequency, (real[2], imag[2]) is
+                   the first harmonic and so on.
+  @par
+                   The real FFT functions pack the frequency domain data in this fashion.
+                   The forward transform outputs the data in this form and the inverse
+                   transform expects input data in this form. The function always performs
+                   the needed bitreversal so that the input and output data is always in
+                   normal order. The functions support lengths of [32, 64, 128, ..., 4096]
+                   samples.
+  @par           Q15 and Q31
+                   The real algorithms are defined in a similar manner and utilize N/2 complex
+                   transforms behind the scenes.
+  @par
+                   The complex transforms used internally include scaling to prevent fixed-point
+                   overflows.  The overall scaling equals 1/(fftLen/2).
+                   Due to the use of complex transform internally, the source buffer is
+                   modified by the rfft.
+  @par
+                   A separate instance structure must be defined for each transform used but
+                   twiddle factor and bit reversal tables can be reused.
+  @par
+                   There is also an associated initialization function for each data type.
+                   The initialization function performs the following operations:
+                    - Sets the values of the internal structure fields.
+                    - Initializes twiddle factor table and bit reversal table pointers.
+                    - Initializes the internal complex FFT data structure.
+  @par
+                   Use of the initialization function is optional **except for MVE versions where it is mandatory**.
+                   If you don't use the initialization functions, then the structures should be initialized with code
+                   similar to the one below:
+  <pre>
+      arm_rfft_instance_q31 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft};
+      arm_rfft_instance_q15 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft};
+  </pre>
+                   where <code>fftLenReal</code> is the length of the real transform;
+                   <code>fftLenBy2</code> is the length of the internal complex transform (fftLenReal/2);
+                   <code>ifftFlagR</code> selects forward (=0) or inverse (=1) transform;
+                   <code>bitReverseFlagR</code> selects bit reversed output (=0) or normal order
+                   output (=1);
+                   <code>twidCoefRModifier</code> is the stride modifier for the twiddle factor table.
+                   The value is based on the FFT length;
+                   <code>pTwiddleAReal</code> points to the A array of twiddle coefficients;
+                   <code>pTwiddleBReal</code> points to the B array of twiddle coefficients;
+                   <code>pCfft</code> points to the CFFT Instance structure. The CFFT structure
+                   must also be initialized.
+@par
+                   Note that with MVE versions you can't initialize instance structures directly and **must
+                   use the initialization function**.
+ */
+
+/**
+  @addtogroup RealFFT
+  @{
+*/
+
+/**
+  @brief         Processing function for the half-precision floating-point real FFT.
+  @param[in]     S         points to an arm_rfft_fast_instance_f16 structure
+  @param[in,out] p         points to input buffer (Source buffer is modified by this function.)
+  @param[out]    pOut      points to output buffer
+  @param[in]     ifftFlag
+                   - value = 0: RFFT
+                   - value = 1: RIFFT
+  @return        none
+*/
+
+void arm_rfft_fast_f16(
+  const arm_rfft_fast_instance_f16 * S,
+  float16_t * p,
+  float16_t * pOut,
+  uint8_t ifftFlag)
+{
+   const arm_cfft_instance_f16 * Sint = &(S->Sint);
+   /* Sint is the embedded complex FFT instance used for both directions */
+
+   /* Select the inverse (non-zero ifftFlag) or the forward transform */
+   if (ifftFlag)
+   {
+      /* Inverse: build the complex FFT input in pOut from the packed spectrum in p */
+      merge_rfft_f16(S, p, pOut);
+      /* Inverse complex FFT, run on pOut */
+      arm_cfft_f16( Sint, pOut, ifftFlag, 1);
+   }
+   else
+   {
+
+      /* Forward: complex FFT run directly on the source buffer p */
+      arm_cfft_f16( Sint, p, ifftFlag, 1);
+
+      /* Extract the packed real-FFT spectrum from p into pOut */
+      stage_rfft_f16(S, p, pOut);
+   }
+}
+
+/**
+* @} end of RealFFT group
+*/
+
+#endif /*  #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f32.c b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f32.c
index dde22d1f588d6bed40b0566c4ad1a8fd526a6c46..671250439fe22b6f737d99912481674c61563690 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f32.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f32.c
@@ -1,15 +1,15 @@
 /* ----------------------------------------------------------------------
  * Project:      CMSIS DSP Library
- * Title:        arm_rfft_f32.c
+ * Title:        arm_rfft_fast_f32.c
  * Description:  RFFT & RIFFT Floating point process function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 void stage_rfft_f32(
@@ -34,7 +34,7 @@ void stage_rfft_f32(
         float32_t * p,
         float32_t * pOut)
 {
-        uint32_t  k;                                /* Loop Counter */
+        int32_t  k;                                /* Loop Counter */
         float32_t twR, twI;                         /* RFFT Twiddle coefficients */
   const float32_t * pCoeff = S->pTwiddleRFFT;       /* Points to RFFT Twiddle factors */
         float32_t *pA = p;                          /* increasing pointer */
@@ -193,7 +193,7 @@ void merge_rfft_f32(
         float32_t * p,
         float32_t * pOut)
 {
-        uint32_t  k;                                /* Loop Counter */
+        int32_t  k;                                /* Loop Counter */
         float32_t twR, twI;                         /* RFFT Twiddle coefficients */
   const float32_t *pCoeff = S->pTwiddleRFFT;        /* Points to RFFT Twiddle factors */
         float32_t *pA = p;                          /* increasing pointer */
@@ -312,7 +312,7 @@ void stage_rfft_f32(
         float32_t * p,
         float32_t * pOut)
 {
-        uint32_t  k;                                /* Loop Counter */
+        int32_t  k;                                /* Loop Counter */
         float32_t twR, twI;                         /* RFFT Twiddle coefficients */
   const float32_t * pCoeff = S->pTwiddleRFFT;       /* Points to RFFT Twiddle factors */
         float32_t *pA = p;                          /* increasing pointer */
@@ -392,7 +392,7 @@ void stage_rfft_f32(
       pA += 2;
       pB -= 2;
       k--;
-   } while (k > 0U);
+   } while (k > 0);
 }
 
 /* Prepares data for inverse cfft */
@@ -401,7 +401,7 @@ void merge_rfft_f32(
         float32_t * p,
         float32_t * pOut)
 {
-        uint32_t  k;                                /* Loop Counter */
+        int32_t  k;                                /* Loop Counter */
         float32_t twR, twI;                         /* RFFT Twiddle coefficients */
   const float32_t *pCoeff = S->pTwiddleRFFT;        /* Points to RFFT Twiddle factors */
         float32_t *pA = p;                          /* increasing pointer */
@@ -422,7 +422,7 @@ void merge_rfft_f32(
    pB  =  p + 2*k ;
    pA +=  2	   ;
 
-   while (k > 0U)
+   while (k > 0)
    {
       /* G is half of the frequency complex spectrum */
       //for k = 2:N
@@ -471,7 +471,7 @@ void merge_rfft_f32(
                    of the symmetry properties of the FFT and have a speed advantage over complex
                    algorithms of the same length.
   @par
-                   The Fast RFFT algorith relays on the mixed radix CFFT that save processor usage.
+                   The Fast RFFT algorithm relies on the mixed radix CFFT that saves processor usage.
   @par
                    The real length N forward FFT of a sequence is computed using the steps shown below.
   @par
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f64.c b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f64.c
index 16a7a03c78ec48b7ded4e7e02c50e51e180c664b..a1e4ed01655763b6296d4b24f733370fddca7d55 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f64.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f64.c
@@ -1,15 +1,15 @@
 /* ----------------------------------------------------------------------
  * Project:      CMSIS DSP Library
- * Title:        arm_rfft_f64.c
+ * Title:        arm_rfft_fast_f64.c
  * Description:  RFFT & RIFFT Double precision Floating point process function
  *
- * $Date:        29. November 2019
- * $Revision:    V1.0.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 void stage_rfft_f64(
   const arm_rfft_fast_instance_f64 * S,
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f16.c b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..c74f1ffc9203a95a9e23056cfc6675152d8e3127
--- /dev/null
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f16.c
@@ -0,0 +1,357 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rfft_fast_init_f16.c
+ * Description:  Initialization functions for the half-precision floating-point RFFT
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+#include "arm_common_tables_f16.h"
+#include "arm_const_structs_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup RealFFT
+  @{
+ */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_32))
+
+/**
+  @private
+  @brief         Initialization function for the 32pt half-precision floating-point real FFT.
+  @param[in,out] S  points to an arm_rfft_fast_instance_f16 structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : S is NULL (any failing arm_cfft_init_f16() status is passed through)
+ */
+
+static arm_status arm_rfft_32_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+  /* Status reported by the complex FFT initialization */
+  arm_status status;
+
+  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+  /* The embedded complex FFT instance is set up with half the real length (16) */
+  status=arm_cfft_init_f16(&(S->Sint),16);
+  if (status != ARM_MATH_SUCCESS)
+  {
+    return(status);
+  }
+  /* Record the real length and select the matching RFFT twiddle table */
+  S->fftLenRFFT = 32U;
+  S->pTwiddleRFFT    = (float16_t *) twiddleCoefF16_rfft_32;
+
+  return ARM_MATH_SUCCESS;
+}
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_64))
+
+/**
+  @private
+  @brief         Initialization function for the 64pt half-precision floating-point real FFT.
+  @param[in,out] S  points to an arm_rfft_fast_instance_f16 structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : S is NULL (any failing arm_cfft_init_f16() status is passed through)
+ */
+
+static arm_status arm_rfft_64_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+  /* Status reported by the complex FFT initialization */
+  arm_status status;
+
+  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+  /* The embedded complex FFT instance is set up with half the real length (32) */
+  status=arm_cfft_init_f16(&(S->Sint),32);
+  if (status != ARM_MATH_SUCCESS)
+  {
+    return(status);
+  }
+  S->fftLenRFFT = 64U;
+  /* RFFT twiddle table matching the 64-point real length */
+  S->pTwiddleRFFT    = (float16_t *) twiddleCoefF16_rfft_64;
+
+  return ARM_MATH_SUCCESS;
+}
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_128))
+
+/**
+  @private
+  @brief         Initialization function for the 128pt half-precision floating-point real FFT.
+  @param[in,out] S  points to an arm_rfft_fast_instance_f16 structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : S is NULL (any failing arm_cfft_init_f16() status is passed through)
+ */
+
+static arm_status arm_rfft_128_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+  /* Status reported by the complex FFT initialization */
+  arm_status status;
+
+  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+  /* The embedded complex FFT instance is set up with half the real length (64) */
+  status=arm_cfft_init_f16(&(S->Sint),64);
+  if (status != ARM_MATH_SUCCESS)
+  {
+    return(status);
+  }
+  S->fftLenRFFT = 128;
+  /* RFFT twiddle table matching the 128-point real length */
+  S->pTwiddleRFFT    = (float16_t *) twiddleCoefF16_rfft_128;
+
+  return ARM_MATH_SUCCESS;
+}
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_256))
+
+/**
+  @private
+  @brief         Initialization function for the 256pt half-precision floating-point real FFT.
+  @param[in,out] S  points to an arm_rfft_fast_instance_f16 structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : S is NULL (any failing arm_cfft_init_f16() status is passed through)
+ */
+
+static arm_status arm_rfft_256_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+  /* Status reported by the complex FFT initialization */
+  arm_status status;
+
+  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+  /* The embedded complex FFT instance is set up with half the real length (128) */
+  status=arm_cfft_init_f16(&(S->Sint),128);
+  if (status != ARM_MATH_SUCCESS)
+  {
+    return(status);
+  }
+  S->fftLenRFFT = 256U;
+  /* RFFT twiddle table matching the 256-point real length */
+  S->pTwiddleRFFT    = (float16_t *) twiddleCoefF16_rfft_256;
+
+  return ARM_MATH_SUCCESS;
+}
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_512))
+
+/**
+  @private
+  @brief         Initialization function for the 512pt half-precision floating-point real FFT.
+  @param[in,out] S  points to an arm_rfft_fast_instance_f16 structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : S is NULL (any failing arm_cfft_init_f16() status is passed through)
+ */
+
+static arm_status arm_rfft_512_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+  /* Status reported by the complex FFT initialization */
+  arm_status status;
+
+  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+  /* The embedded complex FFT instance is set up with half the real length (256) */
+  status=arm_cfft_init_f16(&(S->Sint),256);
+  if (status != ARM_MATH_SUCCESS)
+  {
+    return(status);
+  }
+  S->fftLenRFFT = 512U;
+  /* RFFT twiddle table matching the 512-point real length */
+  S->pTwiddleRFFT    = (float16_t *) twiddleCoefF16_rfft_512;
+
+  return ARM_MATH_SUCCESS;
+}
+#endif 
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_1024))
+/**
+  @private
+  @brief         Initialization function for the 1024pt half-precision floating-point real FFT.
+  @param[in,out] S  points to an arm_rfft_fast_instance_f16 structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : S is NULL (any failing arm_cfft_init_f16() status is passed through)
+ */
+
+static arm_status arm_rfft_1024_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+  /* Status reported by the complex FFT initialization */
+  arm_status status;
+
+  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+  /* The embedded complex FFT instance is set up with half the real length (512) */
+  status=arm_cfft_init_f16(&(S->Sint),512);
+  if (status != ARM_MATH_SUCCESS)
+  {
+    return(status);
+  }
+  S->fftLenRFFT = 1024U;
+  /* RFFT twiddle table matching the 1024-point real length */
+  S->pTwiddleRFFT    = (float16_t *) twiddleCoefF16_rfft_1024;
+
+  return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_2048))
+/**
+  @private
+  @brief         Initialization function for the 2048pt half-precision floating-point real FFT.
+  @param[in,out] S  points to an arm_rfft_fast_instance_f16 structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : S is NULL (any failing arm_cfft_init_f16() status is passed through)
+ */
+static arm_status arm_rfft_2048_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+  /* Status reported by the complex FFT initialization */
+  arm_status status;
+
+  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+  /* The embedded complex FFT instance is set up with half the real length (1024) */
+  status=arm_cfft_init_f16(&(S->Sint),1024);
+  if (status != ARM_MATH_SUCCESS)
+  {
+    return(status);
+  }
+  S->fftLenRFFT = 2048U;
+  /* RFFT twiddle table matching the 2048-point real length */
+  S->pTwiddleRFFT    = (float16_t *) twiddleCoefF16_rfft_2048;
+
+  return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_4096))
+/**
+  @private
+  @brief         Initialization function for the 4096pt half-precision floating-point real FFT.
+  @param[in,out] S  points to an arm_rfft_fast_instance_f16 structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : S is NULL (any failing arm_cfft_init_f16() status is passed through)
+ */
+
+static arm_status arm_rfft_4096_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+  /* Status reported by the complex FFT initialization */
+  arm_status status;
+
+  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+  /* The embedded complex FFT instance is set up with half the real length (2048) */
+  status=arm_cfft_init_f16(&(S->Sint),2048);
+  if (status != ARM_MATH_SUCCESS)
+  {
+    return(status);
+  }
+  S->fftLenRFFT = 4096U;
+  /* RFFT twiddle table matching the 4096-point real length */
+  S->pTwiddleRFFT    = (float16_t *) twiddleCoefF16_rfft_4096;
+
+  return ARM_MATH_SUCCESS;
+}
+#endif 
+
+/**
+  @brief         Initialization function for the half-precision floating-point real FFT.
+  @param[in,out] S       points to an arm_rfft_fast_instance_f16 structure
+  @param[in]     fftLen  length of the Real Sequence
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length (or its tables are not compiled in), or S is NULL
+
+  @par           Description
+                   The parameter <code>fftLen</code> specifies the length of RFFT/RIFFT process.
+                   Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096.
+  @par
+                   This function also sets the RFFT twiddle factor table pointer and initializes the internal complex FFT instance through arm_cfft_init_f16().
+ */
+
+arm_status arm_rfft_fast_init_f16(
+  arm_rfft_fast_instance_f16 * S,
+  uint16_t fftLen)
+{
+  typedef arm_status(*fft_init_ptr)( arm_rfft_fast_instance_f16 *);
+  fft_init_ptr fptr = 0x0;
+  /* Pick the length-specific initializer; lengths whose tables are configured out fall through to default */
+  switch (fftLen)
+  {
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_4096))
+  case 4096U:
+    fptr = arm_rfft_4096_fast_init_f16;
+    break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_2048))
+  case 2048U:
+    fptr = arm_rfft_2048_fast_init_f16;
+    break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_1024))
+  case 1024U:
+    fptr = arm_rfft_1024_fast_init_f16;
+    break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_512))
+  case 512U:
+    fptr = arm_rfft_512_fast_init_f16;
+    break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_256))
+  case 256U:
+    fptr = arm_rfft_256_fast_init_f16;
+    break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_128))
+  case 128U:
+    fptr = arm_rfft_128_fast_init_f16;
+    break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_64))
+  case 64U:
+    fptr = arm_rfft_64_fast_init_f16;
+    break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_32))
+  case 32U:
+    fptr = arm_rfft_32_fast_init_f16;
+    break;
+#endif
+  default:
+    return ARM_MATH_ARGUMENT_ERROR;
+  }
+  /* Defensive check: every case above assigns fptr, so this is not expected to trigger */
+  if( ! fptr ) return ARM_MATH_ARGUMENT_ERROR;
+  return fptr( S );
+
+}
+
+/**
+  @} end of RealFFT group
+ */
+
+#endif /*  #if defined(ARM_FLOAT16_SUPPORTED) */
\ No newline at end of file
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f32.c b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f32.c
index 72a6768ddd604e165dbf778f3b75f07228db57d5..9050aa0136d2d8a7216f22aa65d99aa8d17b6327 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f32.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f32.c
@@ -1,15 +1,15 @@
 /* ----------------------------------------------------------------------
  * Project:      CMSIS DSP Library
- * Title:        arm_cfft_init_f32.c
+ * Title:        arm_rfft_fast_init_f32.c
  * Description:  Split Radix Decimation in Frequency CFFT Floating point processing function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 /**
@@ -38,7 +38,7 @@
   @{
  */
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
 
 /**
   @private
@@ -68,7 +68,7 @@ static arm_status arm_rfft_32_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
 }
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
 
 /**
   @private
@@ -98,7 +98,7 @@ static arm_status arm_rfft_64_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
 }
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
 
 /**
   @private
@@ -128,7 +128,7 @@ static arm_status arm_rfft_128_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
 }
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
 
 /**
   @private
@@ -158,7 +158,7 @@ static arm_status arm_rfft_256_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
 }
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
 
 /**
   @private
@@ -188,7 +188,7 @@ static arm_status arm_rfft_512_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
 }
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
 /**
   @private
   @brief         Initialization function for the 1024pt floating-point real FFT.
@@ -217,7 +217,7 @@ static arm_status arm_rfft_1024_fast_init_f32( arm_rfft_fast_instance_f32 * S )
 }
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
 /**
   @private
   @brief         Initialization function for the 2048pt floating-point real FFT.
@@ -245,7 +245,7 @@ static arm_status arm_rfft_2048_fast_init_f32( arm_rfft_fast_instance_f32 * S )
 }
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
 /**
   @private
 * @brief         Initialization function for the 4096pt floating-point real FFT.
@@ -298,42 +298,42 @@ arm_status arm_rfft_fast_init_f32(
 
   switch (fftLen)
   {
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
   case 4096U:
     fptr = arm_rfft_4096_fast_init_f32;
     break;
 #endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
   case 2048U:
     fptr = arm_rfft_2048_fast_init_f32;
     break;
 #endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
   case 1024U:
     fptr = arm_rfft_1024_fast_init_f32;
     break;
 #endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
   case 512U:
     fptr = arm_rfft_512_fast_init_f32;
     break;
 #endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
   case 256U:
     fptr = arm_rfft_256_fast_init_f32;
     break;
 #endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
   case 128U:
     fptr = arm_rfft_128_fast_init_f32;
     break;
 #endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
   case 64U:
     fptr = arm_rfft_64_fast_init_f32;
     break;
 #endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
   case 32U:
     fptr = arm_rfft_32_fast_init_f32;
     break;
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f64.c b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f64.c
index 3ea02ff1cf430f77de157210f6668ede1d5bc29c..97c4fb31e22292303078b4e3a2a35645c3bf8317 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f64.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f64.c
@@ -1,15 +1,15 @@
 /* ----------------------------------------------------------------------
  * Project:      CMSIS DSP Library
- * Title:        arm_cfft_init_f64.c
+ * Title:        arm_rfft_fast_init_f64.c
  * Description:  Split Radix Decimation in Frequency CFFT Double Precision Floating point processing function
  *
- * $Date:        29. November 2019
- * $Revision:    V1.0.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 /**
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_f32.c b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_f32.c
index 141f8b62cb600cdfeac7203d8ee0bca01a3c4b20..0a32da661729f94e314855a39fe16c5bb827e952 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_f32.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_f32.c
@@ -3,13 +3,13 @@
  * Title:        arm_rfft_init_f32.c
  * Description:  RFFT & RIFFT Floating point initialisation function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 
 
@@ -71,9 +71,15 @@ arm_status arm_rfft_init_f32(
   uint32_t ifftFlagR,
   uint32_t bitReverseFlag)
 {
+  /* Default to ARM_MATH_ARGUMENT_ERROR: returned unchanged when the table-configuration guards below compile out the initialisation */
+  arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_F32)
 
   /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_SUCCESS;
+  status = ARM_MATH_SUCCESS;
 
   /*  Initialize the Real FFT length */
   S->fftLenReal = (uint16_t) fftLenReal;
@@ -129,6 +135,8 @@ arm_status arm_rfft_init_f32(
     arm_cfft_radix4_init_f32(S->pCfft, S->fftLenBy2, 0U, 0U);
   }
 
+#endif
+#endif
   /* return the status of RFFT Init function */
   return (status);
 
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c
index 11e175cf68ea88f6c025824ccd502fccae49c3aa..e70f8af9dd8edf49e03a47127f24345147745b8e 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_rfft_init_q15.c
  * Description:  RFFT & RIFFT Q15 initialisation function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 #include "arm_const_structs.h"
 
@@ -68,8 +68,15 @@ arm_status arm_rfft_init_q15(
     uint32_t ifftFlagR,
     uint32_t bitReverseFlag)
 {
+    /* Default to ARM_MATH_ARGUMENT_ERROR: returned unchanged when the table-configuration guards below compile out the initialisation */
+    arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_Q15)
+
     /*  Initialise the default arm status */
-    arm_status status = ARM_MATH_SUCCESS;
+    status = ARM_MATH_SUCCESS;
 
     /*  Initialize the Real FFT length */
     S->fftLenReal = (uint16_t) fftLenReal;
@@ -93,7 +100,7 @@ arm_status arm_rfft_init_q15(
     case 8192U:
         S->twidCoefRModifier = 1U;
 
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q15(&(S->cfftInst),4096);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -108,7 +115,7 @@ arm_status arm_rfft_init_q15(
     case 4096U:
         S->twidCoefRModifier = 2U;
 
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q15(&(S->cfftInst),2048);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -123,7 +130,7 @@ arm_status arm_rfft_init_q15(
     case 2048U:
         S->twidCoefRModifier = 4U;
 
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q15(&(S->cfftInst),1024);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -138,7 +145,7 @@ arm_status arm_rfft_init_q15(
     case 1024U:
         S->twidCoefRModifier = 8U;
 
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q15(&(S->cfftInst),512);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -153,7 +160,7 @@ arm_status arm_rfft_init_q15(
     case 512U:
         S->twidCoefRModifier = 16U;
 
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q15(&(S->cfftInst),256);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -168,7 +175,7 @@ arm_status arm_rfft_init_q15(
     case 256U:
         S->twidCoefRModifier = 32U;
 
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q15(&(S->cfftInst),128);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -183,7 +190,7 @@ arm_status arm_rfft_init_q15(
     case 128U:
         S->twidCoefRModifier = 64U;
 
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q15(&(S->cfftInst),64);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -198,7 +205,7 @@ arm_status arm_rfft_init_q15(
     case 64U:
         S->twidCoefRModifier = 128U;
 
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q15(&(S->cfftInst),32);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -213,7 +220,7 @@ arm_status arm_rfft_init_q15(
     case 32U:
         S->twidCoefRModifier = 256U;
 
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q15(&(S->cfftInst),16);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -230,6 +237,8 @@ arm_status arm_rfft_init_q15(
         break;
     }
 
+#endif
+#endif
     /* return the status of RFFT Init function */
     return (status);
 }
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q31.c b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q31.c
index 80e83f6a105d41f9d1a7ad58fe09a02d327dc480..0a28719e5206cda4dae3a82397139746b7eb3081 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q31.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_rfft_init_q31.c
  * Description:  RFFT & RIFFT Q31 initialisation function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 #include "arm_common_tables.h"
 #include "arm_const_structs.h"
 
@@ -70,8 +70,15 @@ arm_status arm_rfft_init_q31(
     uint32_t ifftFlagR,
     uint32_t bitReverseFlag)
 {
+    /* Default to ARM_MATH_ARGUMENT_ERROR: returned unchanged when the table-configuration guards below compile out the initialisation */
+    arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_Q31)
+
     /*  Initialise the default arm status */
-    arm_status status = ARM_MATH_SUCCESS;
+    status = ARM_MATH_SUCCESS;
 
     /*  Initialize the Real FFT length */
     S->fftLenReal = (uint16_t) fftLenReal;
@@ -97,7 +104,7 @@ arm_status arm_rfft_init_q31(
 
         S->twidCoefRModifier = 1U;
 
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q31(&(S->cfftInst),4096);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -112,7 +119,7 @@ arm_status arm_rfft_init_q31(
     case 4096U:
         S->twidCoefRModifier = 2U;
 
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q31(&(S->cfftInst),2048);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -127,7 +134,7 @@ arm_status arm_rfft_init_q31(
     case 2048U:
         S->twidCoefRModifier = 4U;
 
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q31(&(S->cfftInst),1024);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -141,7 +148,7 @@ arm_status arm_rfft_init_q31(
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
     case 1024U:
         S->twidCoefRModifier = 8U;
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q31(&(S->cfftInst),512);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -155,7 +162,7 @@ arm_status arm_rfft_init_q31(
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
     case 512U:
         S->twidCoefRModifier = 16U;
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q31(&(S->cfftInst),256);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -169,7 +176,7 @@ arm_status arm_rfft_init_q31(
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
     case 256U:
         S->twidCoefRModifier = 32U;
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q31(&(S->cfftInst),128);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -183,7 +190,7 @@ arm_status arm_rfft_init_q31(
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
     case 128U:
         S->twidCoefRModifier = 64U;
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q31(&(S->cfftInst),64);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -197,7 +204,7 @@ arm_status arm_rfft_init_q31(
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
     case 64U:
         S->twidCoefRModifier = 128U;
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q31(&(S->cfftInst),32);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -211,7 +218,7 @@ arm_status arm_rfft_init_q31(
 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
     case 32U:
         S->twidCoefRModifier = 256U;
-        #if defined(ARM_MATH_MVEI)
+        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            status=arm_cfft_init_q31(&(S->cfftInst),16);
            if (status != ARM_MATH_SUCCESS)
            {
@@ -228,6 +235,8 @@ arm_status arm_rfft_init_q31(
         break;
     }
 
+#endif
+#endif
     /* return the status of RFFT Init function */
     return (status);
 }
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c
index 8f19e8de653a587bfe2a78069d92c2ee61a47f09..f7086bec17f99151edb8098261e649111d9ef6b6 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c
@@ -3,13 +3,13 @@
  * Title:        arm_rfft_q15.c
  * Description:  RFFT & RIFFT Q15 process function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 /* ----------------------------------------------------------------------
  * Internal functions prototypes
@@ -71,6 +71,12 @@ void arm_split_rifft_q15(
   @par
                    If the input buffer is of length N, the output buffer must have length 2*N.
                    The input buffer is modified by this function.
+  @par
+                   For the RIFFT, the source buffer must have a length of at least
+                   fftLenReal + 2.
+                   The last two elements must be equal to what would be generated
+                   by the RFFT:
+                     (pSrc[0] - pSrc[1]) >> 1 and 0
  */
 
 void arm_rfft_q15(
@@ -78,13 +84,12 @@ void arm_rfft_q15(
         q15_t * pSrc,
         q15_t * pDst)
 {
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
   const arm_cfft_instance_q15 *S_CFFT = &(S->cfftInst);
 #else
   const arm_cfft_instance_q15 *S_CFFT = S->pCfft;
 #endif
         uint32_t L2 = S->fftLenReal >> 1U;
-        uint32_t i;
 
   /* Calculation of RIFFT of input */
   if (S->ifftFlagR == 1U)
@@ -95,10 +100,7 @@ void arm_rfft_q15(
      /* Complex IFFT process */
      arm_cfft_q15 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
 
-     for(i = 0; i < S->fftLenReal; i++)
-     {
-        pDst[i] = pDst[i] << 1U;
-     }
+     arm_shift_q15(pDst, 1, pDst, S->fftLenReal);
   }
   else
   {
@@ -131,7 +133,17 @@ void arm_rfft_q15(
                    The function implements a Real FFT
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_fft.h"
+
+#if defined(__CMSIS_GCC_H)
+#define MVE_CMPLX_MULT_FX_AxB_S16(A,B)          vqdmladhxq_s16(vqdmlsdhq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B)
+#define MVE_CMPLX_MULT_FX_AxConjB_S16(A,B)      vqdmladhq_s16(vqdmlsdhxq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B)
+
+#endif 
+
 void arm_split_rfft_q15(
         q15_t * pSrc,
         uint32_t fftLen,
@@ -140,101 +152,58 @@ void arm_split_rfft_q15(
         q15_t * pDst,
         uint32_t modifier)
 {
-    q15_t const     *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
-    q15_t           *pDst1 = &pDst[2], *pDst2 = &pDst[(4U * fftLen) - 1U - 14]; /* temp pointers for output buffer */
-    q15_t const     *pSrc1 = &pSrc[2], *pSrc2 = &pSrc[(2U * fftLen) - 1U - 14]; /* temp pointers for input buffer */
-    q15_t const    *pVecSrc1;
-    q15_t          *pVecDst1;
-    q15x8x2_t      vecIn, vecSum;
-    uint32_t         blkCnt;
-    uint16x8_t     vecStridesFwd, vecStridesBkwd;
-    q15x8_t        vecInBkwd, vecCoefFwd0, vecCoefFwd1;
-
-    /*
-     * Init coefficient pointers
-     */
-    pCoefA = &pATable[modifier * 2U];
-    pCoefB = &pBTable[modifier * 2U];
-    /*
-     * scatter / gather offsets
-     * for ascending & descending addressing
-     */
-    vecStridesFwd = vidupq_u16((uint32_t)0, 2);    // 0, 2, 4, 6, 8, 10, 12, 14
-    vecStridesBkwd = vddupq_u16(14, 2);   // 14, 12, 10, 8, 6, 4, 2, 0
-    vecStridesFwd = vecStridesFwd * (uint16_t)  modifier;
-
-    pVecSrc1 = (q15_t const *) pSrc1;
-    pVecDst1 = pDst1;
-
-    blkCnt = fftLen >> 3;
-    while (blkCnt > 0U)
-    {
-        vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefA, vecStridesFwd);
-        vecCoefFwd1 = vldrhq_gather_shifted_offset(&pCoefA[1], vecStridesFwd);
-        vecIn = vld2q(pVecSrc1);
-        pVecSrc1 += 16;
-        /*
-         * outR = *pSrc1 * CoefA1;
-         */
-        vecSum.val[0] = vrmulhq(vecIn.val[0], vecCoefFwd0);
-        /*
-         * outI = *pSrc1++ * CoefA2;
-         */
-        vecSum.val[1] = vrmulhq(vecIn.val[0], vecCoefFwd1);
-
-        vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd);
-        /*
-         * outR -= (*pSrc1 + *pSrc2) * CoefA2;
-         */
-        vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd);
-        vecSum.val[0] = vqsubq(vecSum.val[0], vrmulhq(vecInBkwd, vecCoefFwd1));
-
-        vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd);
-        /*
-         * outI += *pSrc1++ * CoefA1;
-         */
-        vecSum.val[1] = vqaddq(vecSum.val[1], vrmulhq(vecIn.val[1], vecCoefFwd0));
-
-        vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefB, vecStridesFwd);
-        /*
-         * outI -= *pSrc2-- * CoefB1;
-         */
-        vecSum.val[1] = vqsubq(vecSum.val[1], vrmulhq(vecInBkwd, vecCoefFwd0));
-
-        vecInBkwd = vldrhq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd);
-        /*
-         * outI -= *pSrc2 * CoefA2;
-         */
-        vecSum.val[1] = vqsubq(vecSum.val[1], vrmulhq(vecInBkwd, vecCoefFwd1));
-        /*
-         * outR += *pSrc2-- * CoefB1;
-         */
-        vecSum.val[0] = vqaddq(vecSum.val[0], vrmulhq(vecInBkwd, vecCoefFwd0));
-
-        vst2q(pVecDst1, vecSum);
-        pVecDst1 += 16;
-        /*
-         * write complex conjugate output
-         */
-        vecSum.val[1] = -vecSum.val[1];
-        vstrhq_scatter_shifted_offset(pDst2, vecStridesBkwd, vecSum.val[1]);
-        vstrhq_scatter_shifted_offset(&pDst2[-1], vecStridesBkwd, vecSum.val[0]);
-        /*
-         * update fwd and backwd offsets
-         */
-        vecStridesFwd = vecStridesFwd + (uint16_t)(modifier * 16U);
-        /* cannot use negative 16-bit offsets (would lead to positive 32-65K jump*/
-        //vecStridesBkwd = vecStridesBkwd - (uint16_t)16;
-        pSrc2 = pSrc2 - 16;
-        pDst2 = pDst2 - 16;
+   uint32_t        i;          /* Loop Counter */
+    const q15_t    *pCoefA, *pCoefB;    /* Temporary pointers for twiddle factors */
+    q15_t          *pOut1 = &pDst[2];
+    q15_t          *pIn1 = &pSrc[2];
+    uint16x8_t      offsetIn = { 6, 7, 4, 5, 2, 3, 0, 1 };
+    uint16x8_t      offsetCoef;
+    const uint16_t  offsetCoefArr[16] = {
+        0, 0, 2, 2, 4, 4, 6, 6,
+        0, 1, 0, 1, 0, 1, 0, 1
+    };
+
+    offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8);
+    offsetIn = vaddq_n_u16(offsetIn, (2 * fftLen - 8));
+
+    /* Init coefficient pointers */
+    pCoefA = &pATable[modifier * 2];
+    pCoefB = &pBTable[modifier * 2];
+
+    const q15_t    *pCoefAb, *pCoefBb;
+    pCoefAb = pCoefA;
+    pCoefBb = pCoefB;
+
+    pIn1 = &pSrc[2];
+
+    i = fftLen - 1U;
+    i = i / 4 + 1;
+    while (i > 0U) {
+        q15x8_t         in1 = vld1q_s16(pIn1);
+        q15x8_t         in2 = vldrhq_gather_shifted_offset_s16(pSrc, offsetIn);
+        q15x8_t         coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef);
+        q15x8_t         coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef);
+
+#if defined(__CMSIS_GCC_H)
+        q15x8_t         out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB_S16(in1, coefA),
+                                     MVE_CMPLX_MULT_FX_AxConjB_S16(coefB, in2));
+#else
+        q15x8_t         out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB(in1, coefA),
+                                     MVE_CMPLX_MULT_FX_AxConjB(coefB, in2));
+#endif
+        vst1q_s16(pOut1, out);
+        pOut1 += 8;
 
-        blkCnt--;
+        offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
+        offsetIn -= 8;
+        pIn1 += 8;
+        i -= 1;
     }
 
-    pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
-    pDst[(2U * fftLen) + 1U] = 0;
+    pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
+    pDst[2 * fftLen + 1] = 0;
 
-    pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
+    pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
     pDst[1] = 0;
 }
 #else
@@ -396,7 +365,10 @@ void arm_split_rfft_q15(
                    The function implements a Real IFFT
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_fft.h"
 
 void arm_split_rifft_q15(
         q15_t * pSrc,
@@ -406,90 +378,52 @@ void arm_split_rifft_q15(
         q15_t * pDst,
         uint32_t modifier)
 {
-    q15_t const     *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
-    q15_t const     *pSrc1 = &pSrc[0], *pSrc2 = &pSrc[(2U * fftLen) + 1U - 14U];
-    q15_t           *pDst1 = &pDst[0];
-    q15_t const    *pVecSrc1;
-    q15_t          *pVecDst1;
-    q15x8x2_t      vecIn, vecSum;
-    uint32_t         blkCnt;
-    uint16x8_t     vecStridesFwd, vecStridesBkwd;
-    q15x8_t        vecInBkwd, vecCoefFwd0, vecCoefFwd1;
-
-    /*
-     * Init coefficient pointers
-     */
+   uint32_t        i;                  /* Loop Counter */
+    const q15_t    *pCoefA, *pCoefB;    /* Temporary pointers for twiddle factors */
+    q15_t          *pIn1;
+    uint16x8_t      offset = { 6, 7, 4, 5, 2, 3, 0, 1 };
+    uint16x8_t      offsetCoef;
+    int16x8_t       conj = { 1, -1, 1, -1, 1, -1, 1, -1 }; /* conjugate */
+    const uint16_t  offsetCoefArr[16] = {
+        0, 0, 2, 2, 4, 4, 6, 6,
+        0, 1, 0, 1, 0, 1, 0, 1
+    };
+
+    offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8);
+
+    offset = vaddq_n_u16(offset, (2 * fftLen - 6));
+
+    /* Init coefficient pointers */
     pCoefA = &pATable[0];
     pCoefB = &pBTable[0];
-    /*
-     * scatter / gather offsets
-     * for ascending & descending addressing
-     */
-    vecStridesFwd = vidupq_u16((uint32_t)0, 2);    // 0, 2, 4, 6, 8, 10, 12, 14
-    vecStridesBkwd = vddupq_u16(14, 2);   // 14, 12, 10, 8, 6, 4, 2, 0
-    vecStridesFwd = vecStridesFwd * (uint16_t)  modifier;
-
 
-    pVecSrc1 = (q15_t const *) pSrc1;
-    pVecDst1 = pDst1;
+    const q15_t    *pCoefAb, *pCoefBb;
+    pCoefAb = pCoefA;
+    pCoefBb = pCoefB;
 
-    blkCnt = fftLen >> 3;
-    while (blkCnt > 0U)
-    {
-        vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefA, vecStridesFwd);
-        vecCoefFwd1 = vldrhq_gather_shifted_offset(&pCoefA[1], vecStridesFwd);
-        vecIn = vld2q(pVecSrc1);
-        pVecSrc1 += 16;
-        /*
-         * outR = *pSrc1 * CoefA1;
-         */
-        vecSum.val[0] = vmulhq(vecIn.val[0], vecCoefFwd0);
-        /*
-         * outI = -(*pSrc1++) * CoefA2;
-         */
-        vecIn.val[0] = vnegq(vecIn.val[0]);
-        vecSum.val[1] = vmulhq(vecIn.val[0], vecCoefFwd1);
+    pIn1 = &pSrc[0];
 
-        vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd);
-        /*
-         * outR += (*pSrc1 + *pSrc2) * CoefA2;
-         */
-        vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd);
-        vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd1));
+    i = fftLen;
+    i = i / 4;
 
-        vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd);
-        /*
-         * outI += *pSrc1++ * CoefA1;
-         */
-        vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecIn.val[1], vecCoefFwd0));
+    while (i > 0U) {
+        q15x8_t         in1 = vld1q_s16(pIn1);
+        q15x8_t         in2 = vldrhq_gather_shifted_offset_s16(pSrc, offset);
+        q15x8_t         coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef);
+        q15x8_t         coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef);
 
-        vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefB, vecStridesFwd);
-        /*
-         * outI -= *pSrc2-- * CoefB1;
-         */
-        vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd0));
+        /* can we avoid the conjugate here ? */
+        q15x8_t         out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA),
+                                     vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB)));
 
-        vecInBkwd = vldrhq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd);
-        /*
-         * outI += *pSrc2 * CoefA2;
-         */
-        vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd1));
-        /*
-         * outR += *pSrc2-- * CoefB1;
-         */
-        vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd0));
+        vst1q_s16(pDst, out);
+        pDst += 8;
 
-        vst2q(pVecDst1, vecSum);
-        pVecDst1 += 16;
-        /*
-         * update fwd and backwd offsets
-         */
-        vecStridesFwd = vecStridesFwd + (uint16_t)(modifier * 16U);
+        offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
+        offset -= 8;
 
-        /* cannot use negative 16-bit offsets (would lead to positive 32-65K jump*/
-        //vecStridesBkwd = vecStridesBkwd - (uint16_t)16;
-        pSrc2 = pSrc2 - 16;
-        blkCnt--;
+        pIn1 += 8;
+        i -= 1;
     }
 }
 #else
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q31.c b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q31.c
index bb11f1bbe5eeeb888f630c1c3d22511237d4d2e2..9f57011c3b068bf4b18c15c451845efb1398b1dc 100644
--- a/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q31.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_rfft_q31.c
  * Description:  FFT & RIFFT Q31 process function
  *
- * $Date:        18. March 2019
- * $Revision:    V1.6.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
  *
- * Target Processor: Cortex-M cores
+ * Target Processor: Cortex-M and Cortex-A cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -26,7 +26,7 @@
  * limitations under the License.
  */
 
-#include "arm_math.h"
+#include "dsp/transform_functions.h"
 
 /* ----------------------------------------------------------------------
  * Internal functions prototypes
@@ -71,6 +71,13 @@ void arm_split_rifft_q31(
   @par
                    If the input buffer is of length N, the output buffer must have length 2*N.
                    The input buffer is modified by this function.
+  @par
+                   For the RIFFT, the source buffer must have a length of at least
+                   fftLenReal + 2.
+                   The last two elements must be equal to what would be generated
+                   by the RFFT:
+                     (pSrc[0] - pSrc[1]) >> 1 and 0
+
  */
 
 void arm_rfft_q31(
@@ -78,13 +85,12 @@ void arm_rfft_q31(
         q31_t * pSrc,
         q31_t * pDst)
 {
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
   const arm_cfft_instance_q31 *S_CFFT = &(S->cfftInst);
 #else
   const arm_cfft_instance_q31 *S_CFFT = S->pCfft;
 #endif
         uint32_t L2 = S->fftLenReal >> 1U;
-        uint32_t i;
 
   /* Calculation of RIFFT of input */
   if (S->ifftFlagR == 1U)
@@ -95,10 +101,7 @@ void arm_rfft_q31(
      /* Complex IFFT process */
      arm_cfft_q31 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
 
-     for(i = 0; i < S->fftLenReal; i++)
-     {
-        pDst[i] = pDst[i] << 1U;
-     }
+     arm_shift_q31(pDst, 1, pDst, S->fftLenReal);
   }
   else
   {
@@ -128,7 +131,17 @@ void arm_rfft_q31(
   @return        none
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_fft.h"
+
+#if defined(__CMSIS_GCC_H)
+/* Q31-only variants selected by the `#if defined(__CMSIS_GCC_H)` branches in this file. Each chains two dual-multiply intrinsics, seeding the first one with an uninitialized vector cast to the type of A. NOTE(review): presumably these are complex A*B and A*conj(B) as the names say, and exist because the generic macros are not usable with GCC - confirm. */
+#define MVE_CMPLX_MULT_FX_AxB_S32(A,B)          vqdmladhxq_s32(vqdmlsdhq_s32((__typeof(A))vuninitializedq_s32(), A, B), A, B)
+#define MVE_CMPLX_MULT_FX_AxConjB_S32(A,B)      vqdmladhq_s32(vqdmlsdhxq_s32((__typeof(A))vuninitializedq_s32(), A, B), A, B)
+
+#endif 
 
 void arm_split_rfft_q31(
     q31_t       *pSrc,
@@ -138,98 +151,52 @@ void arm_split_rfft_q31(
     q31_t       *pDst,
     uint32_t     modifier)
 {
-    q31_t const     *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
-    q31_t           *pDst1 = &pDst[2], *pDst2 = &pDst[(4U * fftLen) - 1U];    /* temp pointers for output buffer */
-    q31_t const     *pSrc1 = &pSrc[2], *pSrc2 = &pSrc[(2U * fftLen) - 1U];    /* temp pointers for input buffer */
-    q31_t const    *pVecSrc1;
-    q31_t          *pVecDst1;
-    q31x4x2_t      vecIn, vecSum;
-    uint32_t         blkCnt;
-    uint32x4_t     vecStridesFwd, vecStridesBkwd;
-    q31x4_t        vecInBkwd, vecCoefFwd0, vecCoefFwd1;
-
-    /*
-     * Init coefficient pointers
-     */
-    pCoefA = &pATable[modifier * 2U];
-    pCoefB = &pBTable[modifier * 2U];
-    /*
-     * scatter / gather offsets
-     * for ascending & descending addressing
-     */
-    vecStridesFwd = vidupq_u32((uint32_t)0, 2);
-    vecStridesBkwd = -vecStridesFwd;
-    vecStridesFwd = vecStridesFwd * modifier;
-
-    pVecSrc1 = (q31_t const *) pSrc1;
-    pVecDst1 = pDst1;
-
-    blkCnt = fftLen >> 2;
-    while (blkCnt > 0U)
-    {
-        vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefA, vecStridesFwd);
-        vecCoefFwd1 = vldrwq_gather_shifted_offset(&pCoefA[1], vecStridesFwd);
-        vecIn = vld2q(pVecSrc1);
-        pVecSrc1 += 8;
-        /*
-         * outR = *pSrc1 * CoefA1;
-         */
-        vecSum.val[0] = vmulhq(vecIn.val[0], vecCoefFwd0);
-        /*
-         * outI = *pSrc1++ * CoefA2;
-         */
-        vecSum.val[1] = vmulhq(vecIn.val[0], vecCoefFwd1);
-
-        vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd);
-        /*
-         * outR -= (*pSrc1 + *pSrc2) * CoefA2;
-         */
-        vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd);
-        vecSum.val[0] = vqsubq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd1));
-
-        vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd);
-        /*
-         * outI += *pSrc1++ * CoefA1;
-         */
-        vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecIn.val[1], vecCoefFwd0));
-
-        vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefB, vecStridesFwd);
-        /*
-         * outI -= *pSrc2-- * CoefB1;
-         */
-        vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd0));
-
-        vecInBkwd = vldrwq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd);
-        /*
-         * outI -= *pSrc2 * CoefA2;
-         */
-        vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd1));
-        /*
-         * outR += *pSrc2-- * CoefB1;
-         */
-        vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd0));
-
-        vst2q(pVecDst1, vecSum);
-        pVecDst1 += 8;
-        /*
-         * write complex conjugate output
-         */
-        vecSum.val[1] = -vecSum.val[1];
-        vstrwq_scatter_shifted_offset(pDst2, vecStridesBkwd, vecSum.val[1]);
-        vstrwq_scatter_shifted_offset(&pDst2[-1], vecStridesBkwd, vecSum.val[0]);
-        /*
-         * update fwd and backwd offsets
-         */
-        vecStridesFwd = vecStridesFwd + (modifier * 8U);
-        vecStridesBkwd = vecStridesBkwd - 8;
-
-        blkCnt--;
+    uint32_t        i;          /* Loop Counter */
+    const q31_t    *pCoefA, *pCoefB;    /* Temporary pointers for twiddle factors */
+    q31_t          *pOut1 = &pDst[2];   /* Output cursor; pDst[0] and pDst[1] are written after the loop */
+    q31_t          *pIn1 = &pSrc[2];    /* Forward input cursor */
+    uint32x4_t      offset = { 2, 3, 0, 1 };    /* Element indices for the backward gather from pSrc */
+    uint32x4_t      offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 };  /* Element indices for the twiddle gathers */
+    /* Start the backward gather at pSrc[2N-2], pSrc[2N-1], pSrc[2N-4], pSrc[2N-3] (N = fftLen) */
+    offset = offset + (2 * fftLen - 4);
+
+
+    /* Init coefficient pointers */
+    pCoefA = &pATable[modifier * 2];
+    pCoefB = &pBTable[modifier * 2];
+    /* Base addresses passed to the coefficient gathers (copies of pCoefA / pCoefB) */
+    const q31_t    *pCoefAb, *pCoefBb;
+    pCoefAb = pCoefA;
+    pCoefBb = pCoefB;
+
+    pIn1 = &pSrc[2];    /* Same value as the initializer above */
+    /* (fftLen - 1) / 2 + 1 iterations, each loading and storing four q31 values */
+    i = fftLen - 1U;
+    i = i / 2 + 1;
+    while (i > 0U) {
+        q31x4_t         in1 = vld1q_s32(pIn1);  /* Four consecutive values from the forward cursor */
+        q31x4_t         in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset);   /* Four values gathered from pSrc at the current offset indices */
+        q31x4_t         coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef);
+        q31x4_t         coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef);
+#if defined(__CMSIS_GCC_H)
+        q31x4_t         out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxB_S32(in1, coefA),MVE_CMPLX_MULT_FX_AxConjB_S32(coefB, in2));
+#else
+        q31x4_t         out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxB(in1, coefA),MVE_CMPLX_MULT_FX_AxConjB(coefB, in2));
+#endif
+        vst1q(pOut1, out);  /* out is the halving add (vhaddq_s32) of the two macro products computed above */
+        pOut1 += 4;
+        /* Advance the twiddle indices by modifier * 4 and step the backward gather back by 4 */
+        offsetCoef += modifier * 4;
+        offset -= 4;
+
+        pIn1 += 4;
+        i -= 1;
     }
 
-    pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
-    pDst[(2U * fftLen) + 1U] = 0;
+    pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
+    pDst[2 * fftLen + 1] = 0;
 
-    pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
+    pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
     pDst[1] = 0;
 }
 #else
@@ -331,7 +298,7 @@ void arm_split_rfft_q31(
   @return        none
  */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 void arm_split_rifft_q31(
         q31_t * pSrc,
@@ -341,87 +308,49 @@ void arm_split_rifft_q31(
         q31_t * pDst,
         uint32_t modifier)
 {
-    q31_t const     *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
-    q31_t const     *pSrc1 = &pSrc[0], *pSrc2 = &pSrc[(2U * fftLen) + 1U];
-    q31_t const    *pVecSrc1;
-    q31_t          *pVecDst;
-    q31x4x2_t      vecIn, vecSum;
-    uint32_t         blkCnt;
-    uint32x4_t     vecStridesFwd, vecStridesBkwd;
-    q31x4_t        vecInBkwd, vecCoefFwd0, vecCoefFwd1;
-
-
-    /*
-     * Init coefficient pointers
-     */
+    uint32_t        i;          /* Loop Counter */
+    const q31_t    *pCoefA, *pCoefB;    /* Temporary pointers for twiddle factors */
+    q31_t          *pIn1;       /* Forward input cursor */
+    uint32x4_t      offset = { 2, 3, 0, 1 };    /* Element indices for the backward gather from pSrc */
+    uint32x4_t      offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 };  /* Element indices for the twiddle gathers */
+    int32x4_t       conj = { 1, -1, 1, -1 };    /* Multiplier that flips the sign of lanes 1 and 3 */
+    /* Start the backward gather at pSrc[2N], pSrc[2N+1], pSrc[2N-2], pSrc[2N-1] (N = fftLen): the first iteration reads two elements past index 2N-1 */
+    offset = offset + (2 * fftLen - 2);
+
+    /* Init coefficient pointers */
     pCoefA = &pATable[0];
     pCoefB = &pBTable[0];
-    /*
-     * scatter / gather offsets
-     * for ascending & descending addressing
-     */
-    vecStridesFwd = vidupq_u32((uint32_t)0, 2);
-    vecStridesBkwd = -vecStridesFwd;
-    vecStridesFwd = vecStridesFwd * modifier;
-
-    pVecSrc1 = (q31_t const *) pSrc1;
-    pVecDst = pDst;
-
-    blkCnt = fftLen >> 2;
-    while (blkCnt > 0U)
-    {
-        vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefA, vecStridesFwd);
-        vecCoefFwd1 = vldrwq_gather_shifted_offset(&pCoefA[1], vecStridesFwd);
-        vecIn = vld2q(pVecSrc1);
-        pVecSrc1 += 8;
-        /*
-         * outR = *pSrc1 * CoefA1;
-         */
-        vecSum.val[0] = vmulhq(vecIn.val[0], vecCoefFwd0);
-        /*
-         * outI = -(*pSrc1++) * CoefA2;
-         */
-        vecIn.val[0] =  (-vecIn.val[0]);
-        vecSum.val[1] = vmulhq(vecIn.val[0], vecCoefFwd1);
-
-        vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd);
-        /*
-         * outR += (*pSrc1 + *pSrc2) * CoefA2;
-         */
-        vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd);
-        vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd1));
-
-        vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd);
-        /*
-         * outI += *pSrc1++ * CoefA1;
-         */
-        vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecIn.val[1], vecCoefFwd0));
-
-        vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefB, vecStridesFwd);
-        /*
-         * outI -= *pSrc2-- * CoefB1;
-         */
-        vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd0));
-
-        vecInBkwd = vldrwq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd);
-        /*
-         * outI += *pSrc2-- * CoefA2;;
-         */
-        vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd1));
-        /*
-         * outR += *pSrc2-- * CoefB1;
-         */
-        vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd0));
-
-        vst2q(pVecDst, vecSum);
-        pVecDst += 8;
-        /*
-         * update fwd and backwd offsets
-         */
-        vecStridesFwd = vecStridesFwd + (modifier * 8U);
-        vecStridesBkwd = vecStridesBkwd - 8;
-
-        blkCnt--;
+    /* Base addresses passed to the coefficient gathers (copies of pCoefA / pCoefB) */
+    const q31_t    *pCoefAb, *pCoefBb;
+    pCoefAb = pCoefA;
+    pCoefBb = pCoefB;
+
+    pIn1 = &pSrc[0];
+    /* fftLen >> 1 iterations, each loading and storing four q31 values */
+    i = fftLen;
+    i = i >> 1;
+    while (i > 0U) {
+        q31x4_t         in1 = vld1q_s32(pIn1);  /* Four consecutive values from the forward cursor */
+        q31x4_t         in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset);   /* Four values gathered from pSrc at the current offset indices */
+        q31x4_t         coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef);
+        q31x4_t         coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef);
+
+        /* out = halving add of the in1/coefA product and the in2/coefB product, the latter multiplied lane-wise by conj. Can we avoid the conjugate here ? */
+#if defined(__CMSIS_GCC_H)
+        q31x4_t         out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxConjB_S32(in1, coefA),
+                                     vmulq_s32(conj, MVE_CMPLX_MULT_FX_AxB_S32(in2, coefB)));
+#else
+        q31x4_t         out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA),
+                                     vmulq_s32(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB)));
+#endif
+        vst1q_s32(pDst, out);
+        pDst += 4;
+        /* Advance the twiddle indices by modifier * 4 and step the backward gather back by 4 */
+        offsetCoef += modifier * 4;
+        offset -= 4;
+
+        pIn1 += 4;
+        i -= 1;
     }
 }
 #else