stm32f407-openocd/Drivers/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c

/*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_svdf_s8.c
 * Description:  S8 basic SVDF layer function
 *
 * $Date:        15. April 2021
 * $Revision:    V.1.5.0
 *
 * Target Processor:  Cortex-M processors
 *
 * -------------------------------------------------------------------- */

#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
 * @ingroup groupNN
 */

/**
 * @addtogroup SVDF
 * @{
 */

/*
 * S8 SVDF layer function for TensorFlow Lite
 *
 * Refer to header file for details.
 *
 */

/*
 * Processing steps, as implemented below:
 *   1. Shift the whole state buffer down by one q15 element to make room
 *      for the newest time step.
 *   2. For every batch, call arm_nn_vec_mat_mult_t_svdf_s8() on the input
 *      and the feature weights; its destination pointer starts at element
 *      (time_batches - 1) of that batch's region of the state buffer.
 *   3. For every (batch, feature) pair, take the dot product of
 *      time_batches consecutive state values with the matching
 *      time_batches consecutive time weights. Results go to input_ctx->buf.
 *   4. Sum groups of 'rank' consecutive results into one value per unit,
 *      adding the bias when bias_data is non-NULL. Results go to
 *      output_ctx->buf.
 *   5. Requantize, add the output offset, clamp to the output activation
 *      range and store as q7 in output_data.
 *
 * Scratch buffer use (derived from the loops below):
 *   input_ctx->buf  : input_batches * feature_batches q31_t elements
 *   output_ctx->buf : input_batches * unit_count q31_t elements
 *
 * svdf_params->rank is used as a divisor and therefore must be non-zero.
 *
 * Returns ARM_MATH_SUCCESS, or the first non-success status reported by
 * arm_nn_vec_mat_mult_t_svdf_s8().
 */
arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
                       const cmsis_nn_context *output_ctx,
                       const cmsis_nn_svdf_params *svdf_params,
                       const cmsis_nn_per_tensor_quant_params *input_quant_params,
                       const cmsis_nn_per_tensor_quant_params *output_quant_params,
                       const cmsis_nn_dims *input_dims,
                       const q7_t *input_data,
                       const cmsis_nn_dims *state_dims,
                       q15_t *state_data,
                       const cmsis_nn_dims *weights_feature_dims,
                       const q7_t *weights_feature_data,
                       const cmsis_nn_dims *weights_time_dims,
                       const q15_t *weights_time_data,
                       const cmsis_nn_dims *bias_dims,
                       const q31_t *bias_data,
                       const cmsis_nn_dims *output_dims,
                       q7_t *output_data)
{
    (void)bias_dims;
    (void)state_dims;
    (void)output_dims;

    const q31_t multiplier_in = input_quant_params->multiplier;
    const q31_t shift_in = input_quant_params->shift;
    const q31_t multiplier_out = output_quant_params->multiplier;
    const q31_t shift_2 = output_quant_params->shift;
    const int32_t zp_in = svdf_params->input_offset;
    const int32_t zp_out = svdf_params->output_offset;
    const int32_t in_activation_min = svdf_params->input_activation.min;
    const int32_t in_activation_max = svdf_params->input_activation.max;
    const int32_t out_activation_min = svdf_params->output_activation.min;
    const int32_t out_activation_max = svdf_params->output_activation.max;
    const int16_t rank = svdf_params->rank;

    const int32_t input_batches = input_dims->n;
    const int32_t input_height = input_dims->h;
    const int32_t feature_batches = weights_feature_dims->n;
    const int32_t time_batches = weights_time_dims->h;
    const int32_t unit_count = feature_batches / rank;

    q31_t *buffer_a = (q31_t *)input_ctx->buf;
    q31_t *buffer_b = (q31_t *)output_ctx->buf;

    /* Left-shift the state by one element. The source starts at element 1,
     * so only (state_len - 1) elements may be copied; copying state_len
     * elements would read one q15 past the end of the state buffer.
     * Nothing to move when the state holds fewer than two elements (this
     * also keeps the length from wrapping around when state_len is 0). */
    const int32_t state_len = input_batches * feature_batches * time_batches;
    if (state_len > 1)
    {
        memmove(state_data, state_data + 1, (size_t)(state_len - 1) * sizeof(*state_data));
    }

    /* Feature-weights stage: one call per batch, writing into the state. */
    for (int i_batch = 0; i_batch < input_batches; i_batch++)
    {
        q15_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1);
        const q7_t *weight = weights_feature_data;
        const q7_t *input = input_data + i_batch * input_height;

        arm_status res = arm_nn_vec_mat_mult_t_svdf_s8(input,
                                                       weight,
                                                       res_ptr,
                                                       -zp_in,
                                                       0,
                                                       time_batches,
                                                       multiplier_in,
                                                       shift_in,
                                                       input_height,
                                                       feature_batches,
                                                       in_activation_min,
                                                       in_activation_max);

        if (res != ARM_MATH_SUCCESS)
        {
            return res;
        }
    }

    /* Time-weights stage: dot product over the time axis for every
     * (batch, feature) pair. v2 walks the state buffer linearly across all
     * batches; v1 restarts at the beginning of the time weights per batch. */
    {
        q31_t *ptr_a = buffer_a;
        const q15_t *v2 = state_data;
        for (int i_batch = 0; i_batch < input_batches; i_batch++)
        {
            const q15_t *v1 = weights_time_data;

            for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++)
            {
                int32_t sum = 0;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
                /* Consume two q15 values per operand per iteration. */
                int j = 0;
                int32_t block_count = time_batches >> 1;
                for (int i = 0; i < block_count; i++)
                {
                    j += 2;
                    q31_t r1 = arm_nn_read_q15x2_ia(&v1);
                    q31_t r2 = arm_nn_read_q15x2_ia(&v2);

                    sum = __SMLAD(r1, r2, sum);
                }

                // Process the remaining data
                for (; j < time_batches; j++)
                {
                    sum += *v1 * *v2;
                    v1++;
                    v2++;
                }
#else
                for (int j = 0; j < time_batches; j++)
                {
                    sum += *v1 * *v2;
                    v1++;
                    v2++;
                }
#endif

                *ptr_a = sum;
                ptr_a++;
            }
        }
    }

    /* Reduction stage: fold 'rank' consecutive results into one value per
     * unit, starting from the bias when one is supplied. */
    if (bias_data)
    {
        if (unit_count == feature_batches)
        {
            /* One result per unit: only the bias has to be added. */
            for (int i = 0; i < input_batches; i++)
            {
                q31_t *output_temp = buffer_b + i * feature_batches;
                const q31_t *ptr_a = buffer_a + i * feature_batches;

                const int32_t *bi = bias_data;
                for (int j = 0; j < feature_batches; j++)
                {
                    output_temp[j] = ptr_a[j] + bi[j];
                }
            }
        }
        else
        {
            for (int i_batch = 0; i_batch < input_batches; i_batch++)
            {
                q31_t *output_data_temp = buffer_b + i_batch * unit_count;
                q31_t *ptr_a = buffer_a + i_batch * feature_batches;

                for (int i = 0; i < unit_count; i++)
                {
                    int32_t sum = bias_data[i];
                    for (int j = 0; j < rank; j++)
                    {
                        sum += *ptr_a;
                        ptr_a++;
                    }
                    output_data_temp[i] = sum;
                }
            }
        }
    }
    else
    {
        for (int i_batch = 0; i_batch < input_batches; i_batch++)
        {
            q31_t *output_data_temp = buffer_b + i_batch * unit_count;
            q31_t *ptr_a = buffer_a + i_batch * feature_batches;

            for (int i = 0; i < unit_count; i++)
            {
                int32_t sum = 0;
                for (int j = 0; j < rank; j++)
                {
                    sum += *ptr_a;
                    ptr_a++;
                }
                output_data_temp[i] = sum;
            }
        }
    }

    /* Output stage: requantize, add the output offset, clamp, store as q7. */
#if defined(ARM_MATH_MVEI)
    int32_t num_elements = input_batches * unit_count;
    const int32_t loop_count = (num_elements + 3) / 4;
    for (int i_op = 0; i_op < loop_count; i_op++)
    {
        /* The predicate limits the load and store to the remaining elements. */
        mve_pred16_t p = vctp32q((uint32_t)num_elements);
        int32x4_t op = vldrwq_z_s32(buffer_b, p);
        op = arm_requantize_mve(op, multiplier_out, shift_2);
        op = vaddq_n_s32(op, zp_out);
        const int32x4_t min_vec = vdupq_n_s32((int8_t)out_activation_min);
        const int32x4_t max_vec = vdupq_n_s32((int8_t)out_activation_max);
        op = vmaxq_s32(op, min_vec);
        op = vminq_s32(op, max_vec);
        vstrbq_p_s32(output_data, op, p);
        output_data += 4;
        buffer_b += 4;
        num_elements -= 4;
    }
#else
    for (int i = 0; i < input_batches * unit_count; i++)
    {
        /* Evaluate once; CLAMP is a macro and may expand its argument
         * several times. */
        const int32_t requantized = arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out;
        output_data[i] = (q7_t)CLAMP(requantized, out_activation_max, out_activation_min);
    }
#endif

    return (ARM_MATH_SUCCESS);
}

/**
 * @} end of SVDF group
 */
first 2024-06-12 08:32:58 +00:00			`/*`
			`* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.`
			`*`
			`* SPDX-License-Identifier: Apache-2.0`
			`*`
			`* Licensed under the Apache License, Version 2.0 (the License); you may`
			`* not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at`
			`*`
			`* www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an AS IS BASIS, WITHOUT`
			`* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*/`

			`/* ----------------------------------------------------------------------`
			`* Project: CMSIS NN Library`
			`* Title: arm_svdf_s8.c`
			`* Description: S8 basic SVDF layer function`
			`*`
			`* $Date: 15. April 2021`
			`* $Revision: V.1.5.0`
			`*`
			`* Target Processor: Cortex-M processors`
			`*`
			`* -------------------------------------------------------------------- */`

			`#include "arm_nnfunctions.h"`
			`#include "arm_nnsupportfunctions.h"`

			`/**`
			`* @ingroup groupNN`
			`*/`

			`/**`
			`* @addtogroup SVDF`
			`* @{`
			`*/`

			`/*`
			`* S8 SVDF layer function for TensorFlow Lite`
			`*`
			`* Refer to header file for details.`
			`*`
			`*/`

			`arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx,`
			`const cmsis_nn_context *output_ctx,`
			`const cmsis_nn_svdf_params *svdf_params,`
			`const cmsis_nn_per_tensor_quant_params *input_quant_params,`
			`const cmsis_nn_per_tensor_quant_params *output_quant_params,`
			`const cmsis_nn_dims *input_dims,`
			`const q7_t *input_data,`
			`const cmsis_nn_dims *state_dims,`
			`q15_t *state_data,`
			`const cmsis_nn_dims *weights_feature_dims,`
			`const q7_t *weights_feature_data,`
			`const cmsis_nn_dims *weights_time_dims,`
			`const q15_t *weights_time_data,`
			`const cmsis_nn_dims *bias_dims,`
			`const q31_t *bias_data,`
			`const cmsis_nn_dims *output_dims,`
			`q7_t *output_data)`
			`{`
			`(void)bias_dims;`
			`(void)state_dims;`
			`(void)output_dims;`

			`const q31_t multiplier_in = input_quant_params->multiplier;`
			`const q31_t shift_in = input_quant_params->shift;`
			`const q31_t multiplier_out = output_quant_params->multiplier;`
			`const q31_t shift_2 = output_quant_params->shift;`
			`const int32_t zp_in = svdf_params->input_offset;`
			`const int32_t zp_out = svdf_params->output_offset;`
			`const int32_t in_activation_min = svdf_params->input_activation.min;`
			`const int32_t in_activation_max = svdf_params->input_activation.max;`
			`const int32_t out_activation_min = svdf_params->output_activation.min;`
			`const int32_t out_activation_max = svdf_params->output_activation.max;`
			`const int16_t rank = svdf_params->rank;`

			`const int32_t input_batches = input_dims->n;`
			`const int32_t input_height = input_dims->h;`
			`const int32_t feature_batches = weights_feature_dims->n;`
			`const int32_t time_batches = weights_time_dims->h;`
			`const int32_t unit_count = feature_batches / rank;`

			`q31_t *buffer_a = (q31_t *)input_ctx->buf;`
			`q31_t *buffer_b = (q31_t *)output_ctx->buf;`

			`memmove((q15_t *)state_data,`
			`(q15_t *)state_data + 1,`
			`(size_t)(input_batches * feature_batches * time_batches * (int32_t)sizeof(int16_t)));`

			`for (int i_batch = 0; i_batch < input_batches; i_batch++)`
			`{`
			`q15_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1);`
			`const q7_t *weight = weights_feature_data;`
			`const q7_t *input = input_data + i_batch * input_height;`

			`arm_status res = arm_nn_vec_mat_mult_t_svdf_s8(input,`
			`weight,`
			`res_ptr,`
			`-zp_in,`
			`0,`
			`time_batches,`
			`multiplier_in,`
			`shift_in,`
			`input_height,`
			`feature_batches,`
			`in_activation_min,`
			`in_activation_max);`

			`if (res != ARM_MATH_SUCCESS)`
			`{`
			`return res;`
			`}`
			`}`

			`{`
			`q31_t *ptr_a = buffer_a;`
			`const q15_t *v2 = state_data;`
			`for (int i_batch = 0; i_batch < input_batches; i_batch++)`
			`{`
			`const q15_t *v1 = weights_time_data;`

			`for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++)`
			`{`
			`*ptr_a = 0;`
			`int32_t sum = 0;`
			`#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)`
			`int j = 0;`
			`int32_t block_count = time_batches >> 1;`
			`for (int i = 0; i < block_count; i++)`
			`{`
			`j += 2;`
			`q31_t r1 = arm_nn_read_q15x2_ia(&v1);`
			`q31_t r2 = arm_nn_read_q15x2_ia(&v2);`

			`sum = __SMLAD(r1, r2, sum);`
			`}`

			`// Process the remaining data`
			`for (; j < time_batches; j++)`
			`{`
			`sum += *v1 * *v2;`
			`v1++;`
			`v2++;`
			`}`
			`#else`
			`for (int j = 0; j < time_batches; j++)`
			`{`
			`sum += *v1 * *v2;`
			`v1++;`
			`v2++;`
			`}`
			`#endif`

			`*ptr_a = sum;`
			`ptr_a++;`
			`}`
			`}`
			`}`

			`if (bias_data)`
			`{`
			`if (unit_count == feature_batches)`
			`{`
			`for (int i = 0; i < input_batches; i++)`
			`{`
			`q31_t *output_temp = buffer_b + i * feature_batches;`
			`const q31_t *ptr_a = buffer_a + i * feature_batches;`

			`const int32_t *bi = bias_data;`
			`for (int j = 0; j < feature_batches; j++)`
			`{`
			`output_temp[j] = ptr_a[j] + bi[j];`
			`}`
			`}`
			`}`
			`else`
			`{`
			`for (int i_batch = 0; i_batch < input_batches; i_batch++)`
			`{`
			`q31_t *output_data_temp = buffer_b + i_batch * unit_count;`
			`q31_t *ptr_a = buffer_a + i_batch * feature_batches;`

			`for (int i = 0; i < unit_count; i++)`
			`{`
			`int32_t sum = bias_data[i];`
			`for (int j = 0; j < rank; j++)`
			`{`
			`sum += *ptr_a;`
			`ptr_a++;`
			`}`
			`output_data_temp[i] = sum;`
			`}`
			`}`
			`}`
			`}`
			`else`
			`{`
			`for (int i_batch = 0; i_batch < input_batches; i_batch++)`
			`{`
			`q31_t *output_data_temp = buffer_b + i_batch * unit_count;`
			`q31_t *ptr_a = buffer_a + i_batch * feature_batches;`

			`for (int i = 0; i < unit_count; i++)`
			`{`
			`int32_t sum = 0;`
			`for (int j = 0; j < rank; j++)`
			`{`
			`sum += *ptr_a;`
			`ptr_a++;`
			`}`
			`output_data_temp[i] = sum;`
			`}`
			`}`
			`}`

			`#if defined(ARM_MATH_MVEI)`
			`int32_t num_elements = input_batches * unit_count;`
			`const int32_t loop_count = (num_elements + 3) / 4;`
			`for (int i_op = 0; i_op < loop_count; i_op++)`
			`{`
			`mve_pred16_t p = vctp32q((uint32_t)num_elements);`
			`int32x4_t op = vldrwq_z_s32(buffer_b, p);`
			`op = arm_requantize_mve(op, multiplier_out, shift_2);`
			`op = vaddq_n_s32(op, zp_out);`
			`const int32x4_t min_vec = vdupq_n_s32((int8_t)out_activation_min);`
			`const int32x4_t max_vec = vdupq_n_s32((int8_t)out_activation_max);`
			`op = vmaxq_s32(op, min_vec);`
			`op = vminq_s32(op, max_vec);`
			`vstrbq_p_s32(output_data, op, p);`
			`output_data += 4;`
			`buffer_b += 4;`
			`num_elements -= 4;`
			`}`
			`#else`
			`for (int i = 0; i < input_batches * unit_count; i++)`
			`{`
			`output_data[i] = (q7_t)CLAMP(`
			`arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min);`
			`}`
			`#endif`

			`return (ARM_MATH_SUCCESS);`
			`}`

			`/**`
			`* @} end of SVDF group`
			`*/`