/* stm32f407-openocd/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c */

/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mat_inverse_f32.c
 * Description:  Floating-point matrix inverse
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/matrix_functions.h"


/**
  @ingroup groupMatrix
 */

/**
  @defgroup MatrixInv Matrix Inverse

  Computes the inverse of a matrix.

  The inverse is defined only if the input matrix is square and non-singular (the determinant is non-zero).
  The function checks that the input and output matrices are square and of the same size.

  Matrix inversion is numerically sensitive and the CMSIS DSP library only supports matrix
  inversion of floating-point matrices.

  @par Algorithm
  The Gauss-Jordan method is used to find the inverse.
  The algorithm performs a sequence of elementary row-operations until it
  reduces the input matrix to an identity matrix. Applying the same sequence
  of elementary row-operations to an identity matrix yields the inverse matrix.
  If the input matrix is singular, then the algorithm terminates and returns error status
  <code>ARM_MATH_SINGULAR</code>.
  \image html MatrixInverse.gif "Matrix Inverse of a 3 x 3 matrix using Gauss-Jordan Method"
 */

/**
  @addtogroup MatrixInv
  @{
 */

/**
  @brief         Floating-point matrix inverse.
  @param[in]     pSrc      points to input matrix structure. The source matrix is modified by the function.
  @param[out]    pDst      points to output matrix structure
  @return        execution status
                   - \ref ARM_MATH_SUCCESS       : Operation successful
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
                   - \ref ARM_MATH_SINGULAR      : Input matrix is found to be singular (non-invertible)
 */
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)

/*
 * Helium (MVE) implementation of the Gauss-Jordan matrix inverse.
 *
 * pSrc is reduced in place to the identity while the same row operations are
 * applied to pDst, which starts out as the identity and ends up holding the
 * inverse.
 *
 * Returns ARM_MATH_SIZE_MISMATCH (only when ARM_MATH_MATRIX_CHECK is defined)
 * if the matrices are not square or differ in size, ARM_MATH_SINGULAR when a
 * column has no usable pivot, and ARM_MATH_SUCCESS otherwise.
 */
arm_status arm_mat_inverse_f32(
  const arm_matrix_instance_f32 * pSrc,
  arm_matrix_instance_f32 * pDst)
{
    float32_t *pIn = pSrc->pData;   /* source data; advanced by one element per processed column */
    float32_t *pOut = pDst->pData;  /* destination data (start of row 0) */
    float32_t *pInT1, *pInT2;       /* working pointers into the source matrix */
    float32_t *pOutT1, *pOutT2;     /* working pointers into the destination matrix */
    float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;    /* pivot row pointers */

    uint32_t  numRows = pSrc->numRows;  /* Number of rows in the matrix  */
    uint32_t  numCols = pSrc->numCols;  /* Number of columns in the matrix  */
    float32_t *pTmpA, *pTmpB;           /* cursors used by the vector loops */

    float32_t in = 0.0f;        /* pivot value / elimination factor */
    uint32_t  i, rowCnt, flag = 0U, j, loopCnt, l;   /* loop counters */
    arm_status status;          /* status of matrix inverse */
    uint32_t  blkCnt;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Both matrices must be square and of the same size */
    if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
        || (pSrc->numRows != pDst->numRows))
    {
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
    {

    /*--------------------------------------------------------------------------------------------------------------
     * Matrix Inverse can be solved using elementary row operations.
     *
     *  Gauss-Jordan Method:
     *
     *     1. First combine the identity matrix and the input matrix separated by a bar to form an
     *        augmented matrix as follows:
     *                      _                  _         _         _
     *                     |  a11  a12 | 1   0  |       |  X11 X12  |
     *                     |           |        |   =   |           |
     *                     |_ a21  a22 | 0   1 _|       |_ X21 X22 _|
     *
     *      2. In our implementation, pDst Matrix is used as identity matrix.
     *
     *      3. Begin with the first row. Let i = 1.
     *
     *      4. Check to see if the pivot for row i is zero.
     *         The pivot is the element of the main diagonal that is on the current row.
     *         For instance, if working with row i, then the pivot element is aii.
     *         If the pivot is zero, exchange that row with a row below it that does not
     *         contain a zero in column i. If this is not possible, then an inverse
     *         to that matrix does not exist.
     *
     *      5. Divide every element of row i by the pivot.
     *
     *      6. For every row other than row i, replace that row with the sum of that row and
     *         a multiple of row i so that each new element in column i is zero.
     *
     *      7. Move to the next row and column and repeat steps 4 through 6 until you have zeros
     *         for every element below and above the main diagonal.
     *
     *      8. Now an identity matrix is formed to the left of the bar (input matrix, pSrc).
     *         Therefore, the matrix to the right of the bar is our solution (pDst).
     *----------------------------------------------------------------------------------------------------------------*/

        /*
         * Initialise the destination matrix as an identity matrix,
         * one row at a time: zeros, a single one on the diagonal, zeros.
         */
        pOutT1 = pOut;
        rowCnt = numRows;
        while (rowCnt > 0U)
        {
            /* Zeros to the left of the diagonal */
            j = numRows - rowCnt;
            while (j > 0U)
            {
                *pOutT1++ = 0.0f;
                j--;
            }

            /* One on the diagonal */
            *pOutT1++ = 1.0f;

            /* Zeros to the right of the diagonal */
            j = rowCnt - 1U;
            while (j > 0U)
            {
                *pOutT1++ = 0.0f;
                j--;
            }

            rowCnt--;
        }

        /*
         * Process one column per iteration. l is the current column (and
         * pivot row) index; pIn has been advanced l elements, so
         * pIn + l * numCols addresses the diagonal element (l, l).
         */
        loopCnt = numCols;
        l = 0U;
        while (loopCnt > 0U)
        {
            /*
             * The exchange flag must describe the current column only.
             * Leaving it set from an earlier column would hide a zero pivot
             * here and lead to a division by zero below.
             */
            flag = 0U;

            /* Diagonal element of the source and start of row l of the destination */
            pInT1 = pIn + (l * numCols);
            pOutT1 = pOut + (l * numCols);

            /* Candidate pivot value */
            in = *pInT1;

            /*
             * A zero pivot has to be replaced by exchanging row l with the
             * first row below it that is non-zero in this column.
             */
            if (*pInT1 == 0.0f)
            {
                for (i = 1U; i < numRows-l; i++)
                {
                    /* Same column, i rows further down */
                    pInT2 = pInT1 + (numCols * i);
                    pOutT2 = pOutT1 + (numCols * i);

                    if (*pInT2 != 0.0f)
                    {
                        f32x4_t vecA, vecB;

                        /*
                         * Exchange the source rows from column l to the end
                         * (the columns to the left are already zero in both).
                         */
                        pTmpA = pInT1;
                        pTmpB = pInT2;
                        blkCnt = (numCols - l) >> 2;
                        while (blkCnt > 0U)
                        {
                            vecA = vldrwq_f32(pTmpA);
                            vecB = vldrwq_f32(pTmpB);
                            vstrwq_f32(pTmpB, vecA);
                            vstrwq_f32(pTmpA, vecB);

                            pTmpA += 4;
                            pTmpB += 4;
                            blkCnt--;
                        }
                        /*
                         * Tail: predicate the loads as well as the stores so
                         * nothing beyond the end of the row is accessed.
                         */
                        blkCnt = (numCols - l) & 3;
                        if (blkCnt > 0U)
                        {
                            mve_pred16_t p0 = vctp32q(blkCnt);

                            vecA = vldrwq_z_f32(pTmpA, p0);
                            vecB = vldrwq_z_f32(pTmpB, p0);
                            vstrwq_p_f32(pTmpB, vecA, p0);
                            vstrwq_p_f32(pTmpA, vecB, p0);
                        }

                        /* Exchange the complete destination rows */
                        pTmpA = pOutT1;
                        pTmpB = pOutT2;
                        blkCnt = numCols >> 2;
                        while (blkCnt > 0U)
                        {
                            vecA = vldrwq_f32(pTmpA);
                            vecB = vldrwq_f32(pTmpB);
                            vstrwq_f32(pTmpB, vecA);
                            vstrwq_f32(pTmpA, vecB);

                            pTmpA += 4;
                            pTmpB += 4;
                            blkCnt--;
                        }
                        /* Tail (predicated loads and stores) */
                        blkCnt = numCols & 3;
                        if (blkCnt > 0U)
                        {
                            mve_pred16_t p0 = vctp32q(blkCnt);

                            vecA = vldrwq_z_f32(pTmpA, p0);
                            vecB = vldrwq_z_f32(pTmpB, p0);
                            vstrwq_p_f32(pTmpB, vecA, p0);
                            vstrwq_p_f32(pTmpA, vecB, p0);
                        }

                        /* Record that a usable pivot row was found */
                        flag = 1U;

                        /* Only one exchange is needed */
                        break;
                    }
                }
            }

            /*
             * Zero pivot and no row to exchange with: the matrix is singular.
             */
            if ((flag != 1U) && (in == 0.0f))
            {
                return ARM_MATH_SINGULAR;
            }

            /* Pivot row of the source (from the diagonal) and of the destination */
            pPivotRowIn = pIn + (l * numCols);
            pPivotRowDst = pOut + (l * numCols);

            /* Pivot value after a possible exchange */
            in = *(pIn + (l * numCols));

            /* Rows are scaled by multiplying with the reciprocal of the pivot */
            f32x4_t invIn = vdupq_n_f32(1.0f / in);
            f32x4_t vecA;

            /*
             * Normalise the source pivot row from the diagonal to the end.
             */
            pTmpA = pPivotRowIn;
            blkCnt = (numCols - l) >> 2;
            while (blkCnt > 0U)
            {
                vecA = vldrwq_f32(pTmpA);
                vecA = vecA * invIn;
                vstrwq_f32(pTmpA, vecA);
                pTmpA += 4;
                blkCnt--;
            }
            /* Tail (predicated loads and stores) */
            blkCnt = (numCols - l) & 3;
            if (blkCnt > 0U)
            {
                mve_pred16_t p0 = vctp32q(blkCnt);

                vecA = vldrwq_z_f32(pTmpA, p0);
                vecA = vecA * invIn;
                vstrwq_p_f32(pTmpA, vecA, p0);
            }

            /*
             * Normalise the complete destination pivot row.
             */
            pTmpA = pPivotRowDst;
            blkCnt = numCols >> 2;
            while (blkCnt > 0U)
            {
                vecA = vldrwq_f32(pTmpA);
                vecA = vecA * invIn;
                vstrwq_f32(pTmpA, vecA);
                pTmpA += 4;
                blkCnt--;
            }
            /* Tail (predicated loads and stores) */
            blkCnt = numCols & 3;
            if (blkCnt > 0U)
            {
                mve_pred16_t p0 = vctp32q(blkCnt);

                vecA = vldrwq_z_f32(pTmpA, p0);
                vecA = vecA * invIn;
                vstrwq_p_f32(pTmpA, vecA, p0);
            }

            /*
             * Eliminate column l from every other row:
             * row = row - row[l] * pivotRow, applied to source and destination.
             * pInT1 starts at element (0, l), pOutT1 at the start of row 0.
             */
            pInT1 = pIn;
            pOutT1 = pOut;

            for (i = 0U; i < numRows; i++)
            {
                if (i == l)
                {
                    /*
                     * The pivot row itself is left untouched; just step
                     * both pointers over it.
                     */
                    pInT1 += numCols - l;
                    pOutT1 += numCols;
                }
                else
                {
                    /* Read cursors over the pivot rows */
                    pPRT_in = pPivotRowIn;
                    pPRT_pDst = pPivotRowDst;

                    /* Elimination factor: this row's element in column l */
                    in = *pInT1;
                    f32x4_t tmpV = vdupq_n_f32(in);

                    /* Source row, from column l to the end */
                    blkCnt = (numCols - l) >> 2;
                    while (blkCnt > 0U)
                    {
                        f32x4_t vec1, vec2;

                        vec1 = vldrwq_f32(pInT1);
                        vec2 = vldrwq_f32(pPRT_in);
                        vec1 = vfmsq_f32(vec1, tmpV, vec2);
                        vstrwq_f32(pInT1, vec1);
                        pPRT_in += 4;
                        pInT1 += 4;
                        blkCnt--;
                    }
                    /* Tail (predicated loads and stores) */
                    blkCnt = (numCols - l) & 3;
                    if (blkCnt > 0U)
                    {
                        f32x4_t vec1, vec2;
                        mve_pred16_t p0 = vctp32q(blkCnt);

                        vec1 = vldrwq_z_f32(pInT1, p0);
                        vec2 = vldrwq_z_f32(pPRT_in, p0);
                        vec1 = vfmsq_f32(vec1, tmpV, vec2);
                        vstrwq_p_f32(pInT1, vec1, p0);
                        pInT1 += blkCnt;
                    }

                    /* Complete destination row */
                    blkCnt = numCols >> 2;
                    while (blkCnt > 0U)
                    {
                        f32x4_t vec1, vec2;

                        vec1 = vldrwq_f32(pOutT1);
                        vec2 = vldrwq_f32(pPRT_pDst);
                        vec1 = vfmsq_f32(vec1, tmpV, vec2);
                        vstrwq_f32(pOutT1, vec1);
                        pPRT_pDst += 4;
                        pOutT1 += 4;
                        blkCnt--;
                    }
                    /* Tail (predicated loads and stores) */
                    blkCnt = numCols & 3;
                    if (blkCnt > 0U)
                    {
                        f32x4_t vec1, vec2;
                        mve_pred16_t p0 = vctp32q(blkCnt);

                        vec1 = vldrwq_z_f32(pOutT1, p0);
                        vec2 = vldrwq_z_f32(pPRT_pDst, p0);
                        vec1 = vfmsq_f32(vec1, tmpV, vec2);
                        vstrwq_p_f32(pOutT1, vec1, p0);

                        pOutT1 += blkCnt;
                    }
                }
                /*
                 * Skip the l already-processed columns of the next row so
                 * that pInT1 lands on column l again.
                 */
                pInT1 = pInT1 + l;
            }

            /* Next column: pIn moves one element to the right */
            pIn++;
            loopCnt--;
            l++;
        }

        status = ARM_MATH_SUCCESS;

        /*
         * Final safeguard kept from the original implementation: if the last
         * factor read was zero and no exchange took place, report a singular
         * matrix when the source contains no non-zero element (this is also
         * the outcome when the matrix has no elements at all).
         */
        if ((flag != 1U) && (in == 0.0f))
        {
            pIn = pSrc->pData;
            for (i = 0; i < numRows * numCols; i++)
            {
                if (pIn[i] != 0.0f)
                    break;
            }

            if (i == numRows * numCols)
                status = ARM_MATH_SINGULAR;
        }
    }
    /* Return to application */
    return (status);
}

#else
#if defined(ARM_MATH_NEON)
/*
 * NEON implementation of the Gauss-Jordan matrix inverse.
 *
 * pSrc is reduced in place to the identity while the same row operations are
 * applied to pDst, which starts out as the identity and ends up holding the
 * inverse.
 *
 * Returns ARM_MATH_SIZE_MISMATCH (only when ARM_MATH_MATRIX_CHECK is defined)
 * if the matrices are not square or differ in size, ARM_MATH_SINGULAR when a
 * column has no usable pivot, and ARM_MATH_SUCCESS otherwise.
 */
arm_status arm_mat_inverse_f32(
  const arm_matrix_instance_f32 * pSrc,
  arm_matrix_instance_f32 * pDst)
{
  float32_t *pIn = pSrc->pData;                  /* source data; advanced by one element per processed column */
  float32_t *pOut = pDst->pData;                 /* destination data (start of row 0) */
  float32_t *pInT1, *pInT2;                      /* working pointers (pInT2 doubles as destination cursor) */
  float32_t *pOutT1, *pOutT2;                    /* destination pointers used during the row exchange */
  float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;  /* pivot row pointers */
  uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
  uint32_t numCols = pSrc->numCols;              /* Number of columns in the matrix  */


  float32_t Xchg, in = 0.0f, in1;                /* Temporary input values  */
  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l;      /* loop counters */
  arm_status status;                             /* status of matrix inverse */
  float32x4_t vec1;
  float32x4_t vec2;
  float32x4_t tmpV;

#ifdef ARM_MATH_MATRIX_CHECK

  /* Both matrices must be square and of the same size */
  if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
     || (pSrc->numRows != pDst->numRows))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */

  {
   /*--------------------------------------------------------------------------------------------------------------
   * Matrix Inverse can be solved using elementary row operations.
   *
   *  Gauss-Jordan Method:
   *
   *     1. First combine the identity matrix and the input matrix separated by a bar to form an
   *        augmented matrix as follows:
   *              _                  _         _         _
   *             |  a11  a12 | 1   0  |       |  X11 X12  |
   *             |           |        |   =   |           |
   *             |_ a21  a22 | 0   1 _|       |_ X21 X22 _|
   *
   *    2. In our implementation, pDst Matrix is used as identity matrix.
   *
   *    3. Begin with the first row. Let i = 1.
   *
   *    4. Check to see if the pivot for row i is zero.
   *       The pivot is the element of the main diagonal that is on the current row.
   *       For instance, if working with row i, then the pivot element is aii.
   *       If the pivot is zero, exchange that row with a row below it that does not
   *       contain a zero in column i. If this is not possible, then an inverse
   *       to that matrix does not exist.
   *
   *    5. Divide every element of row i by the pivot.
   *
   *    6. For every row other than row i, replace that row with the sum of that row and
   *       a multiple of row i so that each new element in column i is zero.
   *
   *    7. Move to the next row and column and repeat steps 4 through 6 until you have zeros
   *       for every element below and above the main diagonal.
   *
   *    8. Now an identity matrix is formed to the left of the bar (input matrix, pSrc).
   *       Therefore, the matrix to the right of the bar is our solution (pDst).
   *----------------------------------------------------------------------------------------------------------------*/

    /* Initialise the destination matrix as an identity matrix, row by row */
    pOutT1 = pOut;
    rowCnt = numRows;

    while (rowCnt > 0U)
    {
      /* Zeros to the left of the diagonal */
      j = numRows - rowCnt;
      while (j > 0U)
      {
        *pOutT1++ = 0.0f;
        j--;
      }

      /* One on the diagonal */
      *pOutT1++ = 1.0f;

      /* Zeros to the right of the diagonal */
      j = rowCnt - 1U;

      while (j > 0U)
      {
        *pOutT1++ = 0.0f;
        j--;
      }

      rowCnt--;
    }

    /* Process one column per iteration. l is the current column (and pivot
     * row) index; pIn has been advanced l elements, so pIn + l * numCols
     * addresses the diagonal element (l, l). */
    loopCnt = numCols;
    l = 0U;

    while (loopCnt > 0U)
    {
      /* The exchange flag must describe the current column only. Leaving it
       * set from an earlier column would hide a zero pivot here and lead to
       * a division by zero below. */
      flag = 0U;

      /* Diagonal element of the source */
      pInT1 = pIn + (l * numCols);

      /* Start of row l of the destination */
      pOutT1 = pOut + (l * numCols);

      /* Candidate pivot value */
      in = *pInT1;

      /* A zero pivot has to be replaced by exchanging row l with the first
       * row below it that is non-zero in this column. */
      if (*pInT1 == 0.0f)
      {
        for (i = 1U; i < numRows - l; i++)
        {
          /* Same column, i rows further down */
          pInT2 = pInT1 + (numCols * i);
          pOutT2 = pOutT1 + (numCols * i);

          if (*pInT2 != 0.0f)
          {
            /* Exchange the source rows from column l to the end */
            j = numCols - l;

            while (j > 0U)
            {
              Xchg = *pInT2;
              *pInT2++ = *pInT1;
              *pInT1++ = Xchg;

              j--;
            }

            /* Exchange the complete destination rows */
            j = numCols;

            while (j > 0U)
            {
              Xchg = *pOutT2;
              *pOutT2++ = *pOutT1;
              *pOutT1++ = Xchg;

              j--;
            }

            /* Record that a usable pivot row was found */
            flag = 1U;

            /* Only one exchange is needed */
            break;
          }
        }
      }

      /* Zero pivot and no row to exchange with: the matrix is singular */
      if ((flag != 1U) && (in == 0.0f))
      {
        return ARM_MATH_SINGULAR;
      }

      /* Pivot row of the source (from the diagonal) and of the destination */
      pPivotRowIn = pIn + (l * numCols);
      pPivotRowDst = pOut + (l * numCols);

      /* Write cursors over both pivot rows */
      pInT1 = pPivotRowIn;
      pInT2 = pPivotRowDst;

      /* Pivot value after a possible exchange; vector lanes are scaled by
       * its reciprocal */
      in = *pPivotRowIn;
      tmpV = vdupq_n_f32(1.0f/in);

      /* Normalise the source pivot row from the diagonal to the end */
      j = (numCols - l) >> 2;

      while (j > 0U)
      {
        vec1 = vld1q_f32(pInT1);

        vec1 = vmulq_f32(vec1, tmpV);
        vst1q_f32(pInT1, vec1);
        pInT1 += 4;

        j--;
      }

      /* Tail: remaining elements are divided by the pivot one at a time */
      j = (numCols - l) & 3;

      while (j > 0U)
      {
        in1 = *pInT1;
        *pInT1++ = in1 / in;

        j--;
      }

      /* Normalise the complete destination pivot row */
      j = numCols >> 2;

      while (j > 0U)
      {
        vec1 = vld1q_f32(pInT2);

        vec1 = vmulq_f32(vec1, tmpV);
        vst1q_f32(pInT2, vec1);
        pInT2 += 4;

        j--;
      }

      /* Tail */
      j = numCols & 3;

      while (j > 0U)
      {
        in1 = *pInT2;
        *pInT2++ = in1 / in;

        j--;
      }

      /* Eliminate column l from every other row:
       * row = row - row[l] * pivotRow, applied to source and destination.
       * pInT1 starts at element (0, l), pInT2 at the start of destination row 0. */
      pInT1 = pIn;
      pInT2 = pOut;

      /* Index of the row being processed */
      i = 0U;

      /* Number of rows still to process */
      k = numRows;

      while (k > 0U)
      {
        if (i == l)
        {
          /* The pivot row itself is left untouched; just step both
             pointers over it */
          pInT1 += numCols - l;

          pInT2 += numCols;
        }
        else
        {
          /* Elimination factor: this row's element in column l */
          in = *pInT1;
          tmpV = vdupq_n_f32(in);

          /* Read cursors over the pivot rows */
          pPRT_in = pPivotRowIn;
          pPRT_pDst = pPivotRowDst;

          /* Source row, from column l to the end */
          j = (numCols - l) >> 2;

          while (j > 0U)
          {
            vec1 = vld1q_f32(pInT1);
            vec2 = vld1q_f32(pPRT_in);
            vec1 = vmlsq_f32(vec1, tmpV, vec2);
            vst1q_f32(pInT1, vec1);
            pPRT_in += 4;
            pInT1 += 4;

            j--;
          }

          /* Tail */
          j = (numCols - l) & 3;

          while (j > 0U)
          {
            in1 = *pInT1;
            *pInT1++ = in1 - (in * *pPRT_in++);

            j--;
          }

          /* Complete destination row */
          j = numCols >> 2;

          while (j > 0U)
          {
            vec1 = vld1q_f32(pInT2);
            vec2 = vld1q_f32(pPRT_pDst);
            vec1 = vmlsq_f32(vec1, tmpV, vec2);
            vst1q_f32(pInT2, vec1);
            pPRT_pDst += 4;
            pInT2 += 4;

            j--;
          }

          /* Tail */
          j = numCols & 3;

          while (j > 0U)
          {
            in1 = *pInT2;
            *pInT2++ = in1 - (in * *pPRT_pDst++);

            j--;
          }

        }

        /* Skip the l already-processed columns of the next row so that
           pInT1 lands on column l again */
        pInT1 = pInT1 + l;

        k--;

        i++;
      }

      /* Next column: pIn moves one element to the right */
      pIn++;

      loopCnt--;

      l++;
    }

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;

    /* Final safeguard kept from the original implementation: if the last
     * factor read was zero and no exchange took place, report a singular
     * matrix when the source contains no non-zero element (this is also the
     * outcome when the matrix has no elements at all). */
    if ((flag != 1U) && (in == 0.0f))
    {
      pIn = pSrc->pData;
      for (i = 0; i < numRows * numCols; i++)
      {
        if (pIn[i] != 0.0f)
            break;
      }

      if (i == numRows * numCols)
        status = ARM_MATH_SINGULAR;
    }
  }
  /* Return to application */
  return (status);
}
#else
/*
 * Scalar implementation of the floating-point matrix inverse (the fallback
 * branch of the enclosing ARM_MATH_MVEF / ARM_MATH_NEON conditionals).
 *
 * The inverse is computed with the Gauss-Jordan method:
 *
 *   1. pDst is first overwritten with the identity matrix so that
 *      [pSrc | pDst] forms the augmented matrix
 *                      _                  _
 *                     |  a11  a12 | 1   0  |
 *                     |_ a21  a22 | 0   1 _|
 *
 *   2. For every column l, starting with the first one, the pivot is the
 *      diagonal element a(l,l). If the pivot is exactly zero, row l is
 *      exchanged with the first row below it that holds a non-zero element
 *      in column l. If no such row exists the matrix is singular.
 *
 *   3. Row l is divided by the pivot.
 *
 *   4. A multiple of row l is subtracted from every other row so that
 *      column l becomes zero above and below the diagonal.
 *
 *   5. When all columns have been processed, pSrc holds the identity matrix
 *      and pDst holds the inverse.
 *
 * In pSrc only the elements at or to the right of the pivot column are
 * updated in steps 2 to 4; pDst rows are always processed over their full
 * width.
 *
 * pSrc  points to the input matrix instance. Although the instance is const,
 *       the buffer behind pSrc->pData is used as working storage and is
 *       overwritten.
 * pDst  points to the output matrix instance that receives the inverse.
 *
 * Returns
 *   ARM_MATH_SUCCESS        the elimination ran to completion.
 *   ARM_MATH_SINGULAR       a zero pivot could not be replaced by a row
 *                           exchange (both buffers are then left partially
 *                           processed), or the working matrix is entirely
 *                           zero after the loop (e.g. a 0 x 0 matrix).
 *   ARM_MATH_SIZE_MISMATCH  only when ARM_MATH_MATRIX_CHECK is defined: the
 *                           matrices are not square or differ in size.
 */
arm_status arm_mat_inverse_f32(
  const arm_matrix_instance_f32 * pSrc,
        arm_matrix_instance_f32 * pDst)
{
  float32_t *pIn = pSrc->pData;                  /* input data matrix pointer */
  float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
  float32_t *pInT1, *pInT2;                      /* Temporary input data matrix pointer */
  float32_t *pOutT1, *pOutT2;                    /* Temporary output data matrix pointer */
  float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;  /* Temporary input and output data matrix pointer */
  uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
  uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */

#if defined (ARM_MATH_DSP)

  float32_t Xchg, in = 0.0f, in1;                /* Temporary input values  */
  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l;      /* loop counters */
  arm_status status;                             /* status of matrix inverse */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Check for matrix mismatch condition */
  if ((pSrc->numRows != pSrc->numCols) ||
      (pDst->numRows != pDst->numCols) ||
      (pSrc->numRows != pDst->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else

#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Working pointer for destination matrix */
    pOutT1 = pOut;

    /* Loop over the number of rows */
    rowCnt = numRows;

    /* Turn the destination matrix into the identity matrix */
    while (rowCnt > 0U)
    {
      /* Zeroes to the left of the diagonal (lower triangle) */
      j = numRows - rowCnt;
      while (j > 0U)
      {
        *pOutT1++ = 0.0f;
        j--;
      }

      /* One on the diagonal */
      *pOutT1++ = 1.0f;

      /* Zeroes to the right of the diagonal (upper triangle) */
      j = rowCnt - 1U;
      while (j > 0U)
      {
        *pOutT1++ = 0.0f;
        j--;
      }

      /* Decrement loop counter */
      rowCnt--;
    }

    /* Loop over the number of columns of the input matrix.
       All the elements in each column are processed by the row operations */
    loopCnt = numCols;

    /* Index of the column (and pivot row) being processed */
    l = 0U;

    while (loopCnt > 0U)
    {
      /* pIn has been advanced by l elements, so pIn + l * numCols points
       * to the diagonal (pivot) element of row l */
      pInT1 = pIn + (l * numCols);

      /* Start of row l of the destination matrix */
      pOutT1 = pOut + (l * numCols);

      /* Pivot value before any row exchange */
      in = *pInT1;

      /* The exchange flag is per column: without this reset an exchange
       * done for an earlier column would mask a zero pivot found here and
       * the row would be divided by zero instead of reporting a singular
       * matrix. */
      flag = 0U;

      /* If the pivot is zero, exchange row l with a row below it that has
       * a non-zero element in this column */
      if (*pInT1 == 0.0f)
      {
        /* Loop over the rows present below */
        for (i = 1U; i < numRows - l; i++)
        {
          /* Candidate row in the input and destination matrices */
          pInT2 = pInT1 + (numCols * i);
          pOutT2 = pOutT1 + (numCols * i);

          /* Check if this row offers a non-zero pivot */
          if (*pInT2 != 0.0f)
          {
            /* Exchange the input rows, from the pivot column
             * to the last column */
            j = numCols - l;

            while (j > 0U)
            {
              Xchg = *pInT2;
              *pInT2++ = *pInT1;
              *pInT1++ = Xchg;

              /* Decrement the loop counter */
              j--;
            }

            /* Exchange the complete destination rows */
            j = numCols;

            while (j > 0U)
            {
              Xchg = *pOutT2;
              *pOutT2++ = *pOutT1;
              *pOutT1++ = Xchg;

              /* Decrement loop counter */
              j--;
            }

            /* Remember that an exchange was done for this column */
            flag = 1U;

            /* Break after exchange is done */
            break;
          }
        }
      }

      /* Zero pivot and no row to exchange it with: the matrix is singular */
      if ((flag != 1U) && (in == 0.0f))
      {
        return ARM_MATH_SINGULAR;
      }

      /* Points to the pivot row of input and destination matrices */
      pPivotRowIn = pIn + (l * numCols);
      pPivotRowDst = pOut + (l * numCols);

      /* Temporary pointers to the pivot row pointers */
      pInT1 = pPivotRowIn;
      pInT2 = pPivotRowDst;

      /* Pivot element of the row (re-read: the row may have been exchanged) */
      in = *pPivotRowIn;

      /* Divide the input row by the pivot, from the pivot column
       * to the last column */
      j = (numCols - l);

      while (j > 0U)
      {
        in1 = *pInT1;
        *pInT1++ = in1 / in;

        /* Decrement the loop counter */
        j--;
      }

      /* Divide the complete destination row by the pivot */
      j = numCols;

      while (j > 0U)
      {
        in1 = *pInT2;
        *pInT2++ = in1 / in;

        /* Decrement the loop counter */
        j--;
      }

      /* Subtract a multiple of the pivot row from every other row so that
       * column l becomes zero above and below the diagonal */

      /* pInT1 walks column l .. numCols-1 of each input row,
       * pInT2 walks the complete destination rows */
      pInT1 = pIn;
      pInT2 = pOut;

      /* Index of the row being processed */
      i = 0U;

      /* Loop over all rows */
      k = numRows;

      while (k > 0U)
      {
        /* Check for the pivot row */
        if (i == l)
        {
          /* The pivot row itself is left untouched: just skip over it */
          pInT1 += numCols - l;

          pInT2 += numCols;
        }
        else
        {
          /* Element of this row in the pivot column: the multiplier */
          in = *pInT1;

          /* Working pointers for input and destination pivot rows */
          pPRT_in = pPivotRowIn;
          pPRT_pDst = pPivotRowDst;

          /* Update the input row from the pivot column to the last column */
          j = (numCols - l);

          while (j > 0U)
          {
            in1 = *pInT1;
            *pInT1++ = in1 - (in * *pPRT_in++);

            /* Decrement the loop counter */
            j--;
          }

          /* Update the complete destination row */
          j = numCols;

          while (j > 0U)
          {
            in1 = *pInT2;
            *pInT2++ = in1 - (in * *pPRT_pDst++);

            /* Decrement loop counter */
            j--;
          }

        }

        /* Skip the l leading columns of the next input row */
        pInT1 = pInT1 + l;

        /* Decrement loop counter */
        k--;

        /* Increment row index */
        i++;
      }

      /* Move the input base pointer to the next column */
      pIn++;

      /* Decrement the loop counter */
      loopCnt--;

      /* Increment the column index */
      l++;
    }


#else

  float32_t Xchg, in = 0.0f;                     /* Temporary input values  */
  uint32_t i, rowCnt, flag = 0U, j, loopCnt, l;      /* loop counters */
  arm_status status;                             /* status of matrix inverse */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Check for matrix mismatch condition */
  if ((pSrc->numRows != pSrc->numCols) ||
      (pDst->numRows != pDst->numCols) ||
      (pSrc->numRows != pDst->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else

#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Working pointer for destination matrix */
    pOutT1 = pOut;

    /* Loop over the number of rows */
    rowCnt = numRows;

    /* Turn the destination matrix into the identity matrix */
    while (rowCnt > 0U)
    {
      /* Zeroes to the left of the diagonal (lower triangle) */
      j = numRows - rowCnt;
      while (j > 0U)
      {
        *pOutT1++ = 0.0f;
        j--;
      }

      /* One on the diagonal */
      *pOutT1++ = 1.0f;

      /* Zeroes to the right of the diagonal (upper triangle) */
      j = rowCnt - 1U;
      while (j > 0U)
      {
        *pOutT1++ = 0.0f;
        j--;
      }

      /* Decrement loop counter */
      rowCnt--;
    }

    /* Loop over the number of columns of the input matrix.
       All the elements in each column are processed by the row operations */
    loopCnt = numCols;

    /* Index of the column (and pivot row) being processed */
    l = 0U;

    while (loopCnt > 0U)
    {
      /* pIn has been advanced by l elements, so pIn + l * numCols points
       * to the diagonal (pivot) element of row l */
      pInT1 = pIn + (l * numCols);

      /* Start of row l of the destination matrix */
      pOutT1 = pOut + (l * numCols);

      /* Pivot value before any row exchange */
      in = *pInT1;

      /* The exchange flag is per column: without this reset an exchange
       * done for an earlier column would mask a zero pivot found here and
       * the row would be divided by zero instead of reporting a singular
       * matrix. */
      flag = 0U;

      /* If the pivot is zero, exchange row l with a row below it that has
       * a non-zero element in this column */
      if (*pInT1 == 0.0f)
      {
        /* Loop over the rows present below */
        for (i = 1U; i < numRows-l; i++)
        {
          /* Candidate row in the input and destination matrices */
          pInT2 = pInT1 + (numCols * i);
          pOutT2 = pOutT1 + (numCols * i);

          /* Check if this row offers a non-zero pivot */
          if (*pInT2 != 0.0f)
          {
            /* Exchange the input rows, from the pivot column
             * to the last column */
            for (j = 0U; j < (numCols - l); j++)
            {
              Xchg = *pInT2;
              *pInT2++ = *pInT1;
              *pInT1++ = Xchg;
            }

            /* Exchange the complete destination rows */
            for (j = 0U; j < numCols; j++)
            {
              Xchg = *pOutT2;
              *pOutT2++ = *pOutT1;
              *pOutT1++ = Xchg;
            }

            /* Remember that an exchange was done for this column */
            flag = 1U;

            /* Break after exchange is done */
            break;
          }
        }
      }


      /* Zero pivot and no row to exchange it with: the matrix is singular */
      if ((flag != 1U) && (in == 0.0f))
      {
        return ARM_MATH_SINGULAR;
      }

      /* Points to the pivot row of input and destination matrices */
      pPivotRowIn = pIn + (l * numCols);
      pPivotRowDst = pOut + (l * numCols);

      /* Temporary pointers to the pivot row pointers */
      pInT1 = pPivotRowIn;
      pOutT1 = pPivotRowDst;

      /* Pivot element of the row (re-read: the row may have been exchanged) */
      in = *(pIn + (l * numCols));

      /* Divide the input row by the pivot, from the pivot column
       * to the last column */
      for (j = 0U; j < (numCols - l); j++)
      {
        *pInT1 = *pInT1 / in;
        pInT1++;
      }

      /* Divide the complete destination row by the pivot */
      for (j = 0U; j < numCols; j++)
      {
        *pOutT1 = *pOutT1 / in;
        pOutT1++;
      }

      /* Subtract a multiple of the pivot row from every other row so that
       * column l becomes zero above and below the diagonal */

      /* pInT1 walks column l .. numCols-1 of each input row,
       * pOutT1 walks the complete destination rows */
      pInT1 = pIn;
      pOutT1 = pOut;

      for (i = 0U; i < numRows; i++)
      {
        /* Check for the pivot row */
        if (i == l)
        {
          /* The pivot row itself is left untouched: just skip over it */
          pInT1 += numCols - l;
          pOutT1 += numCols;
        }
        else
        {
          /* Element of this row in the pivot column: the multiplier */
          in = *pInT1;

          /* Working pointers for input and destination pivot rows */
          pPRT_in = pPivotRowIn;
          pPRT_pDst = pPivotRowDst;

          /* Update the input row from the pivot column to the last column */
          for (j = 0U; j < (numCols - l); j++)
          {
            *pInT1 = *pInT1 - (in * *pPRT_in++);
            pInT1++;
          }

          /* Update the complete destination row */
          for (j = 0U; j < numCols; j++)
          {
            *pOutT1 = *pOutT1 - (in * *pPRT_pDst++);
            pOutT1++;
          }

        }

        /* Skip the l leading columns of the next input row */
        pInT1 = pInT1 + l;
      }

      /* Move the input base pointer to the next column */
      pIn++;

      /* Decrement the loop counter */
      loopCnt--;

      /* Increment the column index */
      l++;
    }

#endif /* #if defined (ARM_MATH_DSP) */

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;

    /* Guard kept from the original implementation: if no exchange was done
     * for the last column and the last multiplier was zero, report a
     * singular matrix when the working input matrix is entirely zero
     * (this includes the degenerate 0 x 0 case, where the loop never ran). */
    if ((flag != 1U) && (in == 0.0f))
    {
      pIn = pSrc->pData;
      for (i = 0; i < numRows * numCols; i++)
      {
        if (pIn[i] != 0.0f)
            break;
      }

      if (i == numRows * numCols)
        status = ARM_MATH_SINGULAR;
    }
  }

  /* Return to application */
  return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

/**
  @} end of MatrixInv group
 */