/******************************************************************************
 * @file     arm_sorting.h
 * @version  v2.0.9
 * @date     2022-06-28
 * @brief    Private header file for CMSIS DSP Library
 ******************************************************************************/
/*
 * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _ARM_SORTING_H_
#define _ARM_SORTING_H_

#include "arm_math.h"

#ifdef   __cplusplus
extern "C"
{
#endif

  /**
   * @brief Bubble sort routine for float32_t sample blocks.
   *
   * Declaration only; the implementation is not part of this header.
   *
   * @param[in]  S          pointer to the sorting instance structure.
   * @param[in]  pSrc       pointer to the input data block.
   * @param[out] pDst       pointer to the output data block.
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_bubble_sort_f32(const arm_sort_instance_f32 *S,
                           float32_t *pSrc,
                           float32_t *pDst,
                           uint32_t blockSize);

   /**
    * @brief Heap sort routine for float32_t sample blocks.
    *
    * Declaration only; the implementation is not part of this header.
    *
    * @param[in]  S          pointer to the sorting instance structure.
    * @param[in]  pSrc       pointer to the input data block.
    * @param[out] pDst       pointer to the output data block.
    * @param[in]  blockSize  number of samples to process.
    */
  void arm_heap_sort_f32(const arm_sort_instance_f32 *S,
                         float32_t *pSrc,
                         float32_t *pDst,
                         uint32_t blockSize);

  /**
   * @brief Insertion sort routine for float32_t sample blocks.
   *
   * Declaration only; the implementation is not part of this header.
   *
   * @param[in]  S          pointer to the sorting instance structure.
   * @param[in]  pSrc       pointer to the input data block.
   * @param[out] pDst       pointer to the output data block.
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_insertion_sort_f32(const arm_sort_instance_f32 *S,
                              float32_t *pSrc,
                              float32_t *pDst,
                              uint32_t blockSize);

  /**
   * @brief Quick sort routine for float32_t sample blocks.
   *
   * Declaration only; the implementation is not part of this header.
   *
   * @param[in]  S          pointer to the sorting instance structure.
   * @param[in]  pSrc       pointer to the input data block.
   * @param[out] pDst       pointer to the output data block.
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_quick_sort_f32(const arm_sort_instance_f32 *S,
                          float32_t *pSrc,
                          float32_t *pDst,
                          uint32_t blockSize);

  /**
   * @brief Selection sort routine for float32_t sample blocks.
   *
   * Declaration only; the implementation is not part of this header.
   *
   * @param[in]  S          pointer to the sorting instance structure.
   * @param[in]  pSrc       pointer to the input data block.
   * @param[out] pDst       pointer to the output data block.
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_selection_sort_f32(const arm_sort_instance_f32 *S,
                              float32_t *pSrc,
                              float32_t *pDst,
                              uint32_t blockSize);

  /**
   * @brief Bitonic sort routine for float32_t sample blocks.
   *
   * Declaration only; the implementation is not part of this header.
   *
   * @param[in]  S          pointer to the sorting instance structure.
   * @param[in]  pSrc       pointer to the input data block.
   * @param[out] pDst       pointer to the output data block.
   * @param[in]  blockSize  number of samples to process.
   */
  void arm_bitonic_sort_f32(const arm_sort_instance_f32 *S,
                            float32_t *pSrc,
                            float32_t *pDst,
                            uint32_t blockSize);

#if defined(ARM_MATH_NEON)

/*
 * vtrn256_128q(a, b): exchange a.val[1] with b.val[0], in place.
 *
 * a and b must be modifiable lvalues exposing a val[] array whose elements
 * are float32x4_t (the temporary below fixes the element type).
 * a.val[0] and b.val[1] are not touched.
 *
 * The parameters are parenthesized so that arguments such as *ptr expand
 * correctly.  Each argument is expanded twice, so it must be free of side
 * effects.
 */
#define vtrn256_128q(a, b)                     \
do {                                           \
	float32x4_t vtrn128_temp = (a).val[1]; \
	(a).val[1] = (b).val[0];               \
	(b).val[0] = vtrn128_temp;             \
} while (0)

/*
 * vtrn128_64q(a, b): regroup the 64-bit halves of two float32x4_t operands,
 * in place.
 *
 * On exit:
 *   a = vcombine_f32(vget_low_f32(a),  vget_low_f32(b))
 *   b = vcombine_f32(vget_high_f32(a), vget_high_f32(b))
 * where the right-hand sides refer to the values a and b had on entry.
 *
 * a and b must be modifiable lvalues.  Each is expanded three times, so the
 * arguments must be free of side effects.  The temporaries carry a
 * macro-specific prefix so they cannot capture caller variables with short
 * names such as "ab" or "cd".
 */
#define vtrn128_64q(a, b)                             \
do {                                                  \
	float32x2_t vtrn64_a_lo = vget_low_f32(a);    \
	float32x2_t vtrn64_b_lo = vget_low_f32(b);    \
	float32x2_t vtrn64_a_hi = vget_high_f32(a);   \
	float32x2_t vtrn64_b_hi = vget_high_f32(b);   \
	(a) = vcombine_f32(vtrn64_a_lo, vtrn64_b_lo); \
	(b) = vcombine_f32(vtrn64_a_hi, vtrn64_b_hi); \
} while (0)

/*
 * vtrn256_64q(a, b): regroup the 64-bit halves of two two-vector operands,
 * in place.  a and b must be modifiable lvalues with a val[2] array of
 * float32x4_t.
 *
 * Writing lo(x)/hi(x) for vget_low_f32(x)/vget_high_f32(x) of the values on
 * entry, the result is:
 *   a.val[0] = vcombine_f32(lo(a.val[0]), lo(b.val[0]))
 *   a.val[1] = vcombine_f32(lo(a.val[1]), lo(b.val[1]))
 *   b.val[0] = vcombine_f32(hi(a.val[0]), hi(b.val[0]))
 *   b.val[1] = vcombine_f32(hi(a.val[1]), hi(b.val[1]))
 *
 * All eight halves are captured before anything is written back.  The
 * parameters are parenthesized so arguments such as *ptr expand correctly,
 * and the temporaries use a macro-specific prefix to avoid capturing caller
 * variables.  Arguments are expanded several times and must be free of side
 * effects.
 */
#define vtrn256_64q(a, b)                                      \
do {                                                           \
	float32x2_t vtrn256_a0 = vget_low_f32((a).val[0]);     \
	float32x2_t vtrn256_a1 = vget_high_f32((a).val[0]);    \
	float32x2_t vtrn256_a2 = vget_low_f32((a).val[1]);     \
	float32x2_t vtrn256_a3 = vget_high_f32((a).val[1]);    \
	float32x2_t vtrn256_b0 = vget_low_f32((b).val[0]);     \
	float32x2_t vtrn256_b1 = vget_high_f32((b).val[0]);    \
	float32x2_t vtrn256_b2 = vget_low_f32((b).val[1]);     \
	float32x2_t vtrn256_b3 = vget_high_f32((b).val[1]);    \
	(a).val[0] = vcombine_f32(vtrn256_a0, vtrn256_b0);     \
	(a).val[1] = vcombine_f32(vtrn256_a2, vtrn256_b2);     \
	(b).val[0] = vcombine_f32(vtrn256_a1, vtrn256_b1);     \
	(b).val[1] = vcombine_f32(vtrn256_a3, vtrn256_b3);     \
} while (0)

/*
 * vtrn128_32q(a, b): replace a and b with the two result vectors of
 * vtrnq_f32(a, b) -- a receives val[0], b receives val[1].
 *
 * a and b must be modifiable float32x4_t lvalues; each is expanded twice,
 * so the arguments must be free of side effects.
 */
#define vtrn128_32q(a, b)                                      \
do {                                                           \
	const float32x4x2_t vtrn32_pair = vtrnq_f32((a), (b)); \
	(a) = vtrn32_pair.val[0];                              \
	(b) = vtrn32_pair.val[1];                              \
} while (0)

/*
 * vtrn256_32q(a, b): apply vtrnq_f32 to the matching vectors of two
 * two-vector operands and write the results back in place.
 *
 * With t0 = vtrnq_f32(a.val[0], b.val[0]) and
 *      t1 = vtrnq_f32(a.val[1], b.val[1]):
 *   a.val[0] = t0.val[0]    a.val[1] = t1.val[0]
 *   b.val[0] = t0.val[1]    b.val[1] = t1.val[1]
 *
 * a and b must be modifiable lvalues with a val[2] array of float32x4_t.
 * The parameters themselves are parenthesized (the previous form wrapped
 * only the whole member access) so arguments such as *ptr expand correctly.
 * Arguments are expanded several times and must be free of side effects.
 */
#define vtrn256_32q(a, b)                                                   \
do {                                                                        \
	float32x4x2_t vtrn32_tmp_1 = vtrnq_f32((a).val[0], (b).val[0]);     \
	float32x4x2_t vtrn32_tmp_2 = vtrnq_f32((a).val[1], (b).val[1]);     \
	(a).val[0] = vtrn32_tmp_1.val[0];                                   \
	(a).val[1] = vtrn32_tmp_2.val[0];                                   \
	(b).val[0] = vtrn32_tmp_1.val[1];                                   \
	(b).val[1] = vtrn32_tmp_2.val[1];                                   \
} while (0)

/*
 * vminmaxq(a, b): in-place min/max of two float32x4_t operands.
 *
 * On exit a holds vminq_f32(a, b) and b holds vmaxq_f32(a, b), both
 * computed from the values a and b had on entry.
 *
 * a and b must be modifiable lvalues; each is expanded three times, so the
 * arguments must be free of side effects.
 */
#define vminmaxq(a, b)                                      \
do {                                                        \
	const float32x4_t vminmax_lo = vminq_f32((a), (b)); \
	const float32x4_t vminmax_hi = vmaxq_f32((a), (b)); \
	(a) = vminmax_lo;                                   \
	(b) = vminmax_hi;                                   \
} while (0)

/*
 * vminmax256q(a, b): in-place min/max of two two-vector operands.
 *
 * For i in {0, 1}, on exit a.val[i] holds vminq_f32(a.val[i], b.val[i]) and
 * b.val[i] holds vmaxq_f32(a.val[i], b.val[i]), both computed from the
 * values on entry (a is snapshotted before it is overwritten).
 *
 * a and b must be modifiable float32x4x2_t lvalues.  The parameters are
 * parenthesized so arguments such as *ptr expand correctly.  Arguments are
 * expanded several times and must be free of side effects.
 */
#define vminmax256q(a, b)                                           \
do {                                                                \
	float32x4x2_t minmax256_tmp = (a);                          \
	(a).val[0] = vminq_f32((a).val[0], (b).val[0]);             \
	(a).val[1] = vminq_f32((a).val[1], (b).val[1]);             \
	(b).val[0] = vmaxq_f32(minmax256_tmp.val[0], (b).val[0]);   \
	(b).val[1] = vmaxq_f32(minmax256_tmp.val[1], (b).val[1]);   \
} while (0)

/*
 * vrev128q_f32(a): expression macro yielding
 *   vcombine_f32(vrev64_f32(high half of a), vrev64_f32(low half of a)),
 * i.e. the two 64-bit halves are swapped and vrev64_f32 is applied to each.
 * Going by the name, this presumably reverses the four lanes of a
 * float32x4_t.
 *
 * a is expanded twice, so the argument must be free of side effects.
 */
#define vrev128q_f32(a)                       \
	(vcombine_f32(                        \
		vrev64_f32(vget_high_f32(a)), \
		vrev64_f32(vget_low_f32(a))))

/*
 * vrev256q_f32(a): in-place reversal step for a two-vector operand.
 *
 * On exit a.val[0] holds vrev128q_f32 of the incoming a.val[1], and
 * a.val[1] holds vrev128q_f32 of the incoming a.val[0].
 *
 * a must be a modifiable lvalue with a val[2] array of float32x4_t.  The
 * parameter is parenthesized so arguments such as *ptr expand correctly,
 * and the per-vector step reuses vrev128q_f32 instead of repeating its
 * expansion.  a is expanded several times and must be free of side effects.
 */
#define vrev256q_f32(a)                                 \
do {                                                    \
	float32x4_t rev_tmp = vrev128q_f32((a).val[0]); \
	(a).val[0] = vrev128q_f32((a).val[1]);          \
	(a).val[1] = rev_tmp;                           \
} while (0)

/*
 * vldrev128q_f32(a, p): load a float32x4_t from p with vld1q_f32 and store
 * its vrev128q_f32 form into a.
 *
 * The loaded vector goes into a private temporary first, so the destination
 * a is expanded and written exactly once (the previous form assigned it
 * twice and re-read it inside vrev128q_f32).  p is expanded once.
 * a must be a modifiable float32x4_t lvalue.
 */
#define vldrev128q_f32(a, p)                         \
do {                                                 \
	const float32x4_t vldrev_tmp = vld1q_f32(p); \
	(a) = vrev128q_f32(vldrev_tmp);              \
} while (0)

#endif /* ARM_MATH_NEON */

#ifdef   __cplusplus
}
#endif

#endif /* _ARM_SORTING_H_ */