/* **COPYRIGHT******************************************************************
    INTEL CONFIDENTIAL
    Copyright (C) 2017 Intel Corporation
    Copyright (C), 1994-2005 Aware Inc. All Rights Reserved.
******************************************************************COPYRIGHT** */
/* **DISCLAIMER*****************************************************************
    The source code contained or described herein and all documents related
    to the source code ("Material") are owned by Intel Corporation or its
    suppliers or licensors. Title to the Material remains with Intel
    Corporation or its suppliers and licensors. The Material may contain
    trade secrets and proprietary and confidential information of Intel
    Corporation and its suppliers and licensors, and is protected by
    worldwide copyright and trade secret laws and treaty provisions. No part
    of the Material may be used, copied, reproduced, modified, published,
    uploaded, posted, transmitted, distributed, or disclosed in any way
    without Intel's prior express written permission.

    No license under any patent, copyright, trade secret or other
    intellectual property right is granted to or conferred upon you by
    disclosure or delivery of the Materials, either expressly, by
    implication, inducement, estoppel or otherwise. Any license under
    such intellectual property rights must be express and approved by
    Intel in writing.
*****************************************************************DISCLAIMER** */
/*
*-------------------------------------------------------------------------------
*
*   Aware DMT Technology. Proprietary and Confidential.
*
*   40 Middlesex Turnpike, Bedford, MA 01730-1413
*   Phone (781) 276 - 4000
*   Fax   (781) 276 - 4001
*
*   filename: MfdqHandler_b.c
*
*   This file contains background functions used in MfdqHandler().
*
*-------------------------------------------------------------------------------
*/

#include <string.h>
#include "common.h"
#include "gdata.h"
#include "dsp_op.h"
#include "acc48_ops.h"
#include "ComplexMult.h"
#include "ieee_flt.h"
#include "matrix.h"
#include "MfdqHandler.h"
#include "MfdqHandler_b.h"
#include "MfdqSupport.h"

#define USE_CHOLESKY

/*
*-------------------------------------------------------------------------------
*
*   Prototype: void BgInitMfdqVars(void)
*
*   This function selects MFDQ tones and associated feedback tones, and
*   initializes other variables used in MfdqHandler().
*
*   Processing steps:
*   1.   SelectMfdqTones() fills psa_MfdqTones[] / psa_FbckTones[] and may
*      reduce gt_MfdqConfig.s_NumMfdqTones
*   2.   each (feedback tone, MFDQ tone) pair is packed into pla_ToneTable[]
*      (feedback tone in bits 31..16, MFDQ tone in bits 15..0)
*   3.   the tone window read back from HW is derived from the lowest feedback
*      tone and the highest MFDQ tone
*   4.   accumulation buffers and MFDQ coefficient buffers are cleared
*   5.   MFDQ_TONE_SELECT_FAIL is raised when no tone was selected or when the
*      read window does not fit in MAX_MFDQ_BUF_SIZE
*
*   Input Arguments:
*      none
*
*   Output Arguments:
*      none
*
*   Returns:
*      none
*
*   Global Variables:
*      gt_MfdqConfig.s_NumMfdqTones: # of MFDQ tones (I/O, via SelectMfdqTones())
*      gt_MfdqConfig.psa_MfdqTones, psa_FbckTones: selected tones (I)
*      gt_MfdqConfig.pla_ToneTable: packed tone table (O)
*      gt_MfdqConfig.s_ReadToneStart, s_NumReadTone: HW read window (O)
*      gt_MfdqConfig.pla_CovMatrix, pla_CorVector: cleared (O)
*      gt_MfdqConfig.psa_sf_coef, puca_sf_exp, psa_ff_coef, psa_fb_coef,
*         puca_fb_exp: cleared (O)
*      gt_MfdqConfig.s_ErrorCode: MFDQ_TONE_SELECT_FAIL may be set (I/O)
*      guc_MfdqTrainingState: set to TRAINING_DONE (O)
*
*-------------------------------------------------------------------------------
*/

void BgInitMfdqVars(void)
{
   int16 i, s_min = (int16)0x7FFF, s_max = 0;

   // select MFDQ tones and associated feedback tones
   // this is done here every time since buffers may be shared with other processes
   SelectMfdqTones();

   for (i=0; i<gt_MfdqConfig.s_NumMfdqTones; i++)
   {
      // logical tone index should be loaded to MFDQ tone table if TRT is disabled
      // use physical tone order here and make sure TRT is disabled at this point
      gt_MfdqConfig.pla_ToneTable[i] = ((uint32)gt_MfdqConfig.psa_FbckTones[i] << 16)
                                       | (uint32)gt_MfdqConfig.psa_MfdqTones[i];
      // note that gt_MfdqConfig.psa_MfdqTones[i] > gt_MfdqConfig.psa_FbckTones[i],
      // so the window minimum is searched in the feedback tones only and the
      // window maximum in the MFDQ tones only
      if (s_min > gt_MfdqConfig.psa_FbckTones[i])
      {
         s_min = gt_MfdqConfig.psa_FbckTones[i];
      }
      if (s_max < gt_MfdqConfig.psa_MfdqTones[i])
      {
         s_max = gt_MfdqConfig.psa_MfdqTones[i];
      }
   }

   if (gt_MfdqConfig.s_NumMfdqTones > 0)
   {
      // determine the start tone index to read from HW
      gt_MfdqConfig.s_ReadToneStart = s_min;

      // determine # of tones to read from HW
      gt_MfdqConfig.s_NumReadTone = s_max - s_min + 1;
   }
   else
   {
      // no tone selected: s_min/s_max still hold their search sentinels
      // (0x7FFF / 0), so publish an empty window instead of leaking them
      gt_MfdqConfig.s_ReadToneStart = 0;
      gt_MfdqConfig.s_NumReadTone = 0;
   }

   // clear accumulation buffers for covariance matrix and correlation vector
   memset(gt_MfdqConfig.pla_CovMatrix, 0, sizeof(int32)*MAX_MFDQ_TONES*MFDQ_COV_MATRIX_SIZE);
   memset(gt_MfdqConfig.pla_CorVector, 0, sizeof(int32)*MAX_MFDQ_TONES*MFDQ_COR_VECTOR_SIZE);

   // clear MFDQ coefficients
   memset(gt_MfdqConfig.psa_sf_coef, 0, sizeof(int16)*MAX_MFDQ_TONES*2);
   memset(gt_MfdqConfig.puca_sf_exp, 0, sizeof(uint8)*MAX_MFDQ_TONES);
   memset(gt_MfdqConfig.psa_ff_coef, 0, sizeof(int16)*MAX_MFDQ_TONES*2);
   memset(gt_MfdqConfig.psa_fb_coef, 0, sizeof(int16)*MAX_MFDQ_TONES*2);
   memset(gt_MfdqConfig.puca_fb_exp, 0, sizeof(uint8)*MAX_MFDQ_TONES);

   // a non-positive tone count or a read window larger than the HW read
   // buffer means tone selection failed
   if ((gt_MfdqConfig.s_NumMfdqTones <= 0) ||
         (gt_MfdqConfig.s_NumReadTone > MAX_MFDQ_BUF_SIZE))
   {
      gt_MfdqConfig.s_ErrorCode |= MFDQ_TONE_SELECT_FAIL;
   }

   guc_MfdqTrainingState = TRAINING_DONE;
}

/*
*-------------------------------------------------------------------------------
*
*   Prototype: void SelectMfdqTones(void)
*
*   This function selects MFDQ tones and associated feedback tones.
*
*   Default build (USE_MFDQ_HW_WORK_AROUND not defined):
*   1.   scan the first RX band from its (leftmost + 1) tone up to its
*      rightmost tone
*   2.   a tone becomes an MFDQ tone when it is flagged in
*      guca_RxSupportedToneSet and its measured SNR is above
*      gsa_ConstellationSNR[2]
*   3.   for each MFDQ tone, the previous physical tone is its feedback tone
*   4.   the scan ends after gt_MfdqConfig.s_NumMfdqTones selections or at the
*      end of the band, so # of MFDQ tones can be reduced
*
*   Work-around build (USE_MFDQ_HW_WORK_AROUND defined):
*   1.   anchor tones are picked in the first RX band with the configured
*      offset/spacing, skipping tones whose SNR is below
*      gsa_ConstellationSNR[2] (at most 16 anchors)
*   2.   every non-anchor tone of the band that has an anchor below it becomes
*      an MFDQ tone, with the nearest lower anchor as its feedback tone
*   3.   anchors above the last MFDQ tone are dropped from the anchor count
*
*   Input Arguments:
*      none
*
*   Output Arguments:
*      none
*
*   Returns:
*      none
*
*   Global Variables:
*      gt_MfdqConfig.s_NumMfdqTones: # of MFDQ tones (I/O)
*      gt_MfdqConfig.psa_MfdqTones: selected MFDQ tones (O)
*      gt_MfdqConfig.psa_FbckTones: associated feedback tones (O)
*      gsa_RxBandLeftChannel[0], gsa_RxBandRightChannel[0]: first RX band (I)
*      guca_RxSupportedToneSet, gpsa_MeasuredSnrBuf, gsa_ConstellationSNR (I)
*      gt_MfdqHwWkArndConfig: anchor configuration (I/O, work-around build only)
*
*-------------------------------------------------------------------------------
*/

void SelectMfdqTones(void)
{

//#define USE_MFDQ_HW_WORK_AROUND

#ifdef USE_MFDQ_HW_WORK_AROUND

   int16 i,s_ch;
   int16 s_d,s_min_d,k;
   int16 k_best = 0, s_is_anchor;   // k_best is only read after a match assigned it;
                                    // the initial value just keeps it defined

   // Determine which tones will be used as feedforward anchors.
   // Take anchors only from first band, with specified spacing, and subject to
   // snr-based qualification.
   gt_MfdqHwWkArndConfig.s_num_anchors = 0;
   s_ch = gsa_RxBandLeftChannel[0] + gt_MfdqHwWkArndConfig.s_first_cand_anchor_offset;
   while ((gt_MfdqHwWkArndConfig.s_num_anchors<gt_MfdqHwWkArndConfig.s_max_num_anchors) &&
          (gt_MfdqHwWkArndConfig.s_num_anchors<16) &&
          (s_ch <= gsa_RxBandRightChannel[0]) )
   {
      if (gpsa_MeasuredSnrBuf[s_ch] < gsa_ConstellationSNR[2])
      {
         // candidate does not qualify: try the next tone without applying the spacing
         s_ch++;
         continue;
      }
      gt_MfdqHwWkArndConfig.sa_anchors[gt_MfdqHwWkArndConfig.s_num_anchors] = s_ch;
      gt_MfdqHwWkArndConfig.s_num_anchors++;
      s_ch += gt_MfdqHwWkArndConfig.s_anchor_spacing;
   }

   i=0;
   //Determine MFDQ tones.  For each candidate MFDQ tone, take feedforward from nearest anchor tone that
   //is before the candidate MFDQ tone.  If there is no such anchor tone, mfdq is not used on the
   //candidate mfdq tone. i counts the number of tones used for mfdq.
   for (s_ch=gsa_RxBandLeftChannel[0]; s_ch<=gsa_RxBandRightChannel[0]; s_ch++)
   {
      if (i==gt_MfdqConfig.s_NumMfdqTones)
      {
         break;
      }
      s_min_d = 0x7FFF;
      s_is_anchor = 0;
      for (k=0; k<gt_MfdqHwWkArndConfig.s_num_anchors; k++)
      {
         s_d = s_ch - gt_MfdqHwWkArndConfig.sa_anchors[k];
         if (s_d==0)
         {
            s_is_anchor = 1;
         }
         if ((s_d<s_min_d)&&(s_d>0))
         {
            s_min_d = s_d;
            k_best = k;
         }
      }
      // Don't use mfdq on tone s_ch, if there is no anchor preceding it, or
      // if it is itself an anchor.
      if ((s_is_anchor!=1) && (s_min_d<0x7FFF))
      {
         gt_MfdqConfig.psa_MfdqTones[i] = s_ch;
         gt_MfdqConfig.psa_FbckTones[i] = gt_MfdqHwWkArndConfig.sa_anchors[k_best];
         i++;
      }
   }
   gt_MfdqConfig.s_NumMfdqTones = i;

   // Eliminate any anchors that do not provide feedforward to any mfdq tone, so that
   // they can be bitloaded in showtime.
   if (gt_MfdqConfig.s_NumMfdqTones == 0)
   {
      gt_MfdqHwWkArndConfig.s_num_anchors = 0;
   }
   else
   {
      // count the anchors located above the last MFDQ tone and drop them
      i=0;
      for (k=0; k<gt_MfdqHwWkArndConfig.s_num_anchors; k++)
      {
         if (gt_MfdqHwWkArndConfig.sa_anchors[k]>gt_MfdqConfig.psa_MfdqTones[gt_MfdqConfig.s_NumMfdqTones-1])
         {
            i++;
         }
      }
      gt_MfdqHwWkArndConfig.s_num_anchors = gt_MfdqHwWkArndConfig.s_num_anchors - i;
   }

#else // !USE_MFDQ_HW_WORK_AROUND

   int16 s_ch;
   int16 s_NumSelected = 0;
   // for now, MFDQ tones are taken from the first RX band only
   int16 s_LastCh = gsa_RxBandRightChannel[0];

   // determine MFDQ tone
   // start from the (leftmost + 1) tone in the first band so that the previous
   // physical tone (the feedback tone) is still inside the band
   // if # of active tones in the 1st band is less than the requested # of MFDQ
   // tones, then reduce # of MFDQ tones (this is temporary)
   s_ch = gsa_RxBandLeftChannel[0] + 1;

   // the band limit is tested BEFORE a tone is examined, so a band with fewer
   // than two tones selects nothing instead of looking at a tone past its edge
   while ((s_NumSelected < gt_MfdqConfig.s_NumMfdqTones) && (s_ch <= s_LastCh))
   {
      // consider in-band active RX tones
      if (IS_TONEFLAGSET(guca_RxSupportedToneSet, s_ch) &&
            (gpsa_MeasuredSnrBuf[s_ch] > gsa_ConstellationSNR[2]))
      {
         // select MFDQ tone
         gt_MfdqConfig.psa_MfdqTones[s_NumSelected] = s_ch;

         // assume we can always find adjacent tone for feedback tone
         gt_MfdqConfig.psa_FbckTones[s_NumSelected] = s_ch - 1;

         s_NumSelected++;
      }

      s_ch++;
   }

   // MFDQ does not work over 2 bands: report how many tones were really found
   gt_MfdqConfig.s_NumMfdqTones = s_NumSelected;

#endif // USE_MFDQ_HW_WORK_AROUND
}

/*
*-------------------------------------------------------------------------------
*
*   Prototype: void BgAccumCovarMatrix(void)
*
*   This function accumulates all the elements of covariance matrix R = X*X' and
*   correlation vector p = X*Di' over multiple frames. 48-bit accumulation is
*   used for correlation between FDQ outputs. 32-bit accumulation is used for
*   correlation between FDQ output and decision output since decision output
*   during training is either +1 or -1 (i.e., +128 or -128 in Q9.7 format).
*   There is no precision loss in this routine.
*
*   where   X = [Yi Ek Dk]
*         Yi:   SFDQ output for MFDQ tone i, Yi = Ci*Xi - Q3.13 format
*         Di:   final decision output for MFDQ tone i - Q9.7 format
*         Ek:   error output for feedback tone k (to MFDQ tone i) - Q3.13 format
*            based on SFDQ output, Ek = Ck*Xk - Dk
*         Dk:   final decision output for feedback tone k - Q9.7 format
*
*   Input Arguments:
*
*   Output Arguments:
*
*   Returns:
*
*   Global Variables:
*
*-------------------------------------------------------------------------------
*/

void BgAccumCovarMatrix(void)
{
   int16 i, s_cov_idx, s_cor_idx;
   int16 s_mt_idx, s_ft_idx, s_mt_r_idx, s_mt_i_idx, s_ft_r_idx, s_ft_i_idx;
   int16 sa_Yi[2], sa_Di[2], sa_Yk[2], sa_Ek[2], sa_Dk[2];
   int32 la_CovM11[2], la_CovM12[2], la_CovM13[2], la_CovM22[2], la_CovM23[2];
   int32 la_CorV11[2], la_CorV21[2], la_CorV31[2];

   for (i=0; i<gt_MfdqConfig.s_NumMfdqTones; i++)
   {

      // get MFDQ tone index
      s_mt_idx = gt_MfdqConfig.psa_MfdqTones[i] - gt_MfdqConfig.s_ReadToneStart;

      // get feedback tone index
      s_ft_idx = gt_MfdqConfig.psa_FbckTones[i] - gt_MfdqConfig.s_ReadToneStart;

      s_mt_r_idx = s_mt_idx << 1;
      s_mt_i_idx = s_mt_r_idx + 1;
      s_ft_r_idx = s_ft_idx << 1;
      s_ft_i_idx = s_ft_r_idx + 1;

      sa_Yi[0] = gt_MfdqConfig.psa_FdqOutput[s_mt_r_idx]; // Yi real (Q3.13)
      sa_Yi[1] = gt_MfdqConfig.psa_FdqOutput[s_mt_i_idx]; // Yi imag (Q3.13)
      sa_Di[0] = gt_MfdqConfig.psa_DecOutput[s_mt_r_idx] >> 7; // Di real (Q9.7 -> Q2.0)
      sa_Di[1] = gt_MfdqConfig.psa_DecOutput[s_mt_i_idx] >> 7; // Di imag (Q9.7 -> Q2.0)
      sa_Yk[0] = gt_MfdqConfig.psa_FdqOutput[s_ft_r_idx]; // Yk real (Q3.13)
      sa_Yk[1] = gt_MfdqConfig.psa_FdqOutput[s_ft_i_idx]; // Yk imag (Q3.13)
      sa_Dk[0] = gt_MfdqConfig.psa_DecOutput[s_ft_r_idx] >> 7; // Dk real (Q9.7 -> Q2.0)
      sa_Dk[1] = gt_MfdqConfig.psa_DecOutput[s_ft_i_idx] >> 7; // Dk imag (Q9.7 -> Q2.0)

      sa_Ek[0] = sa_Yk[0] - (sa_Dk[0] << 13); // Ek real (Q3.13)
      sa_Ek[1] = sa_Yk[1] - (sa_Dk[1] << 13); // Ek imag (Q3.13)

      // covariance matrix calculation
      ComplexMult(la_CovM11, sa_Yi, sa_Yi, 1, 0); // Q7.26
      ComplexMult(la_CovM12, sa_Yi, sa_Ek, 1, 0); // Q7.26
      ComplexMult(la_CovM13, sa_Yi, sa_Dk, 1, 0); // Q6.13
      ComplexMult(la_CovM22, sa_Ek, sa_Ek, 1, 0); // Q7.26
      ComplexMult(la_CovM23, sa_Ek, sa_Dk, 1, 0); // Q6.13

      // correlation vector calculation
      ComplexMult(la_CorV11, sa_Yi, sa_Di, 1, 0); // Q6.13
      ComplexMult(la_CorV21, sa_Ek, sa_Di, 1, 0); // Q6.13
      ComplexMult(la_CorV31, sa_Dk, sa_Di, 1, 0); // Q5.0

      s_cov_idx = i * MFDQ_COV_MATRIX_SIZE;

      // 48-bit accumulation for Cov(1,1) - always real
      mAcc48(&gt_MfdqConfig.pla_CovMatrix[s_cov_idx+COV_M11_R_H], &gt_MfdqConfig.pla_CovMatrix[s_cov_idx+COV_M11_R_L], la_CovM11[0]);
      // 48-bit accumulation for Cov(1,2) - real part
      mAcc48(&gt_MfdqConfig.pla_CovMatrix[s_cov_idx+COV_M12_R_H], &gt_MfdqConfig.pla_CovMatrix[s_cov_idx+COV_M12_R_L], la_CovM12[0]);
      // 48-bit accumulation for Cov(1,2) - imaginary part
      mAcc48(&gt_MfdqConfig.pla_CovMatrix[s_cov_idx+COV_M12_I_H], &gt_MfdqConfig.pla_CovMatrix[s_cov_idx+COV_M12_I_L], la_CovM12[1]);
      // 32-bit accumulation for Cov(1,3) - real part
      gt_MfdqConfig.pla_CovMatrix[s_cov_idx+COV_M13_R] += la_CovM13[0];
      // 32-bit accumulation for Cov(1,3) - imaginary part
      gt_MfdqConfig.pla_CovMatrix[s_cov_idx+COV_M13_I] += la_CovM13[1];
      // 48-bit accumulation for Cov(2,2) - always real
      mAcc48(&gt_MfdqConfig.pla_CovMatrix[s_cov_idx+COV_M22_R_H], &gt_MfdqConfig.pla_CovMatrix[s_cov_idx+COV_M22_R_L], la_CovM22[0]);
      // 32-bit accumulation for Cov(2,3) - real part
      gt_MfdqConfig.pla_CovMatrix[s_cov_idx+COV_M23_R] += la_CovM23[0];
      // 32-bit accumulation for Cov(2,3) - imaginary part
      gt_MfdqConfig.pla_CovMatrix[s_cov_idx+COV_M23_I] += la_CovM23[1];

      s_cor_idx = i * MFDQ_COR_VECTOR_SIZE;

      // 32-bit accumulation for all components of correlation vector
      gt_MfdqConfig.pla_CorVector[s_cor_idx+COR_V11_R] += la_CorV11[0];
      gt_MfdqConfig.pla_CorVector[s_cor_idx+COR_V11_I] += la_CorV11[1];
      gt_MfdqConfig.pla_CorVector[s_cor_idx+COR_V21_R] += la_CorV21[0];
      gt_MfdqConfig.pla_CorVector[s_cor_idx+COR_V21_I] += la_CorV21[1];
      gt_MfdqConfig.pla_CorVector[s_cor_idx+COR_V31_R] += la_CorV31[0];
      gt_MfdqConfig.pla_CorVector[s_cor_idx+COR_V31_I] += la_CorV31[1];
   }

   guc_MfdqTrainingState = TRAINING_DONE;
}

/*
*-------------------------------------------------------------------------------
*
*   Prototype: void BgCalcMfdqCoef(void)
*
*   This function calculates MFDQ coefficients - SFDQ coefficient (Si),
*   Feedforward coefficient (Ak), and Feedback coefficient (Bk) - for every
*   selected MFDQ tone.
*
*   For each tone, CalcMfdqCoef() is called. When it returns SUCCEED, its nine
*   outputs are copied to the per-tone coefficient arrays with the imaginary
*   parts negated. When it fails, the coefficient entries of that tone are
*   left as they were; only puca_ff_exp[i] has been preset to puca_sf_exp[i].
*
*   Input Arguments:
*      none
*
*   Output Arguments:
*      none
*
*   Returns:
*      none
*
*   Global Variables:
*      gt_MfdqConfig.s_NumMfdqTones: # of MFDQ tones (I)
*      gt_MfdqConfig.psa_sf_coef, puca_sf_exp: SFDQ mantissa/exponent (I/O)
*      gt_MfdqConfig.psa_ff_coef, puca_ff_exp: feedforward mantissa/exponent (O)
*      gt_MfdqConfig.psa_fb_coef, puca_fb_exp: feedback mantissa/exponent (O)
*      guc_MfdqTrainingState: set to TRAINING_DONE (O)
*
*-------------------------------------------------------------------------------
*/

void BgCalcMfdqCoef(void)
{
   int16 i, j;
   int16 sa_MfdqCoef[9];

   // save original SFDQ coefficients read from HW for MFDQ calculation
   SaveSfdqCoef();

   for (i=0; i<gt_MfdqConfig.s_NumMfdqTones; i++)
   {

      // coefficient arrays hold interleaved (real, imag) pairs
      j = i << 1;

      // initialize with SFDQ coefficients
      gt_MfdqConfig.puca_ff_exp[i] = gt_MfdqConfig.puca_sf_exp[i];

      // compute MFDQ coefficients by solving LS solution
      if (CalcMfdqCoef(i, sa_MfdqCoef) == SUCCEED)
      {
         // prepare coefficients ready for HW
         // CalcMfdqCoef() returns the complex conjugate of the coefficients,
         // so the imaginary parts are negated here
         gt_MfdqConfig.psa_sf_coef[j] = sa_MfdqCoef[0];
         gt_MfdqConfig.psa_sf_coef[j+1] = -sa_MfdqCoef[1];
         gt_MfdqConfig.puca_sf_exp[i] = (uint8)sa_MfdqCoef[2];
         gt_MfdqConfig.psa_ff_coef[j] = sa_MfdqCoef[3];
         gt_MfdqConfig.psa_ff_coef[j+1] = -sa_MfdqCoef[4];
         gt_MfdqConfig.puca_ff_exp[i] = (uint8)sa_MfdqCoef[5];
         gt_MfdqConfig.psa_fb_coef[j] = sa_MfdqCoef[6];
         gt_MfdqConfig.psa_fb_coef[j+1] = -sa_MfdqCoef[7];
         gt_MfdqConfig.puca_fb_exp[i] = (uint8)sa_MfdqCoef[8];
      }
      // on FAIL nothing is written for this tone
   }

   // adjust SFDQ exponent so that SFDQ and Feedforward terms share the same exponent
   AdjustMfdqExponent();

   // prepare an array for new SFDQ coefficient writes
   UpdateSfdqCoef();

   guc_MfdqTrainingState = TRAINING_DONE;
}

/*
*-------------------------------------------------------------------------------
*
*   Prototype: int16 CalcMfdqCoef(int16 s_tidx, int16 *psa_MfdqCoef)
*
*   This function computes complex conjugate of MFDQ coefficients W = inv(R)*P
*   where W = [Si'Gi Ak'Gi Bk'Gi], R = E{X*X'} and P = E{X*Di'}.
*
*   Processing steps:
*   1.   the accumulated statistics of tone s_tidx are averaged and converted
*      to FloatG; the error terms are up-scaled by 2^s_up_lsh, where s_up_lsh
*      is derived from the difference of the normalization counts of the
*      accumulated Ek*Ek' and Yi*Yi' terms
*   2.   the 3x3 complex system is solved with CalcComplexLinearEq3x3()
*   3.   the up-scaling is undone and the taps are combined with the original
*      SFDQ coefficients of the feedback tone (Ck) and the MFDQ tone (Ci)
*   4.   each tap is converted to a 16-bit mantissa pair plus a shift count
*
*   Input Arguments:
*      s_tidx: MFDQ tone index (0 to gt_MfdqConfig.s_NumMfdqTones-1)
*
*   Output Arguments:
*      psa_MfdqCoef: pointer to MFDQ coefficients (9 entries are written)
*         psa_MfdqCoef[0]: SFDQ (Si) real component (Q1.15)
*         psa_MfdqCoef[1]: SFDQ (Si) imaginary component (Q1.15)
*         psa_MfdqCoef[2]: SFDQ (Si) exponent component (0 to 15)
*         psa_MfdqCoef[3]: Feedforward (Ak) real component (Q1.15)
*         psa_MfdqCoef[4]: Feedforward (Ak) imaginary component (Q1.15)
*         psa_MfdqCoef[5]: Feedforward (Ak) exponent component (0 to 15)
*         psa_MfdqCoef[6]: Feedback (Bk) real component (Q1.15)
*         psa_MfdqCoef[7]: Feedback (Bk) imaginary component (Q1.15)
*         psa_MfdqCoef[8]: Feedback (Bk) exponent component (-9 to +6)
*            NOTE(review): the value stored is the non-negative shift count
*            found for a mantissa pre-scaled by 2^MFDQ_MAX_FB_EXP_RSHFT;
*            presumably the consumer applies the offset - confirm
*
*   Returns:
*      s_return: SUCCEED or FAIL
*         FAIL is returned when the up-scale shift exceeds
*         MFDQ_MAX_UP_SCALE_SHIFT (MFDQ_UPSCALE_FAIL) or when a tap needs a
*         shift count above MFDQ_MAX_EXP_SHFT (MFDQ_COEFF_OVFLOW). In the
*         latter case the stored mantissas still belong to the unclamped
*         shift count, so the outputs must not be used on FAIL.
*
*   Global Variables:
*      gt_MfdqConfig.pla_CovMatrix, pla_CorVector: accumulated statistics (I)
*      gt_MfdqConfig.s_AlgLog2NumFramesToAccum: log2(# of accumulated frames) (I)
*      gt_MfdqConfig.psa_FbckTones, psa_MfdqTones, s_ReadToneStart (I)
*      gt_MfdqConfig.psa_SfdqCoef, puca_SfdqExp: original SFDQ coefficients (I)
*      gt_MfdqConfig.s_ErrorCode: error bits may be set (I/O)
*
*-------------------------------------------------------------------------------
*/

int16 CalcMfdqCoef(int16 s_tidx, int16 *psa_MfdqCoef)
{
   int16 i, j, s_exp, s_rsh, s_return = SUCCEED;
   int16 s_up_lsh, s_hi_rsh, s_lo_rsh, s_acc_rsh;
   int16 s_ft_idx, s_mt_idx, sa_Ck[2], s_Ck_exp, sa_Ci[2], s_Ci_exp;
   int32 *pla_CovMatrix, *pla_CorVector;
   int32 l_real, l_imag, l_tmp_real, l_tmp_imag, l_scale;
   FloatG fa_R[9], fa_P[6], fa_W[7]; // size of fa_W is increased by one to use Cholesky() & Backsubstitution()
   FloatG f_temp, f_scale, fa_Ck[2], fa_Ci[2];

   pla_CovMatrix = &gt_MfdqConfig.pla_CovMatrix[s_tidx*MFDQ_COV_MATRIX_SIZE];
   pla_CorVector = &gt_MfdqConfig.pla_CorVector[s_tidx*MFDQ_COR_VECTOR_SIZE];

   //==========================================================================
   // convert covariance matrix R (int32 to FloatG)
   //==========================================================================

   // get up-scale factor which will be applied to error output
   s_acc_rsh = gt_MfdqConfig.s_AlgLog2NumFramesToAccum;
   s_up_lsh = GetUpScaleFactor(pla_CovMatrix[COV_M22_R_H], pla_CovMatrix[COV_M22_R_L], s_acc_rsh);
   s_up_lsh -= GetUpScaleFactor(pla_CovMatrix[COV_M11_R_H], pla_CovMatrix[COV_M11_R_L], s_acc_rsh);
   if (s_up_lsh > MFDQ_MAX_UP_SCALE_SHIFT)
   {
      s_up_lsh = MFDQ_MAX_UP_SCALE_SHIFT;
      gt_MfdqConfig.s_ErrorCode |= MFDQ_UPSCALE_FAIL;
      s_return = FAIL;
   }
   else if (s_up_lsh < 0)
   {
      // the error term is already at least as large as the SFDQ output term:
      // do not scale at all. s_up_lsh is used as a shift count below
      // (1 << s_up_lsh), which is undefined for a negative count.
      s_up_lsh = 0;
   }
   // the factor above relates to power terms; halve it to get the amplitude shift
   s_up_lsh >>= 1;

   // R(1,1) - always real
   s_hi_rsh = MFDQ_YY_SHIFT - 16;
   s_lo_rsh = MFDQ_YY_SHIFT;
   fa_R[0] = ConvMfdqStat(pla_CovMatrix[COV_M11_R_H], s_hi_rsh, pla_CovMatrix[COV_M11_R_L], s_lo_rsh, s_acc_rsh);

   // R(1,2) - real & imaginary
   s_hi_rsh = MFDQ_YE_SHIFT - s_up_lsh - 16;
   s_lo_rsh = MFDQ_YE_SHIFT - s_up_lsh;
   fa_R[1] = ConvMfdqStat(pla_CovMatrix[COV_M12_R_H], s_hi_rsh, pla_CovMatrix[COV_M12_R_L], s_lo_rsh, s_acc_rsh);
   fa_R[2] = ConvMfdqStat(pla_CovMatrix[COV_M12_I_H], s_hi_rsh, pla_CovMatrix[COV_M12_I_L], s_lo_rsh, s_acc_rsh);

   // R(1,3) - real & imaginary (32-bit accumulation: no high word)
   s_lo_rsh = MFDQ_YD_SHIFT;
   fa_R[3] = ConvMfdqStat(0, 1, pla_CovMatrix[COV_M13_R], s_lo_rsh, s_acc_rsh);
   fa_R[4] = ConvMfdqStat(0, 1, pla_CovMatrix[COV_M13_I], s_lo_rsh, s_acc_rsh);

   // R(2,2) - always real (error term appears twice, hence 2*s_up_lsh)
   s_hi_rsh = MFDQ_EE_SHIFT - 2*s_up_lsh - 16;
   s_lo_rsh = MFDQ_EE_SHIFT - 2*s_up_lsh;
   fa_R[5] = ConvMfdqStat(pla_CovMatrix[COV_M22_R_H], s_hi_rsh, pla_CovMatrix[COV_M22_R_L], s_lo_rsh, s_acc_rsh);

   // R(2,3) - real & imaginary
   s_lo_rsh = MFDQ_ED_SHIFT - s_up_lsh;
   fa_R[6] = ConvMfdqStat(0, 1, pla_CovMatrix[COV_M23_R], s_lo_rsh, s_acc_rsh);
   fa_R[7] = ConvMfdqStat(0, 1, pla_CovMatrix[COV_M23_I], s_lo_rsh, s_acc_rsh);

   // R(3,3) - always real (not accumulated, fixed constant)
   fa_R[8] = int32toFloat32(2);

   //==========================================================================
   // convert correlation vector P (int32 to FloatG)
   //==========================================================================

   // P(1,1) - real & imaginary
   s_lo_rsh = MFDQ_YD_SHIFT;
   fa_P[0] = ConvMfdqStat(0, 1, pla_CorVector[COR_V11_R], s_lo_rsh, s_acc_rsh);
   fa_P[1] = ConvMfdqStat(0, 1, pla_CorVector[COR_V11_I], s_lo_rsh, s_acc_rsh);

   // P(2,1) - real & imaginary
   s_lo_rsh = MFDQ_ED_SHIFT - s_up_lsh;
   fa_P[2] = ConvMfdqStat(0, 1, pla_CorVector[COR_V21_R], s_lo_rsh, s_acc_rsh);
   fa_P[3] = ConvMfdqStat(0, 1, pla_CorVector[COR_V21_I], s_lo_rsh, s_acc_rsh);

   // P(3,1) - real & imaginary
   s_lo_rsh = MFDQ_DD_SHIFT;
   fa_P[4] = ConvMfdqStat(0, 1, pla_CorVector[COR_V31_R], s_lo_rsh, s_acc_rsh);
   fa_P[5] = ConvMfdqStat(0, 1, pla_CorVector[COR_V31_I], s_lo_rsh, s_acc_rsh);

   //==========================================================================
   // compute complex conjugate of MFDQ coefficients, W = inv(R)*P
   //==========================================================================

   CalcComplexLinearEq3x3(fa_R, fa_P, fa_W);

   //==========================================================================
   // scale MFDQ coefficients
   //==========================================================================

   // get original SFDQ coefficient for feedback tone
   s_ft_idx = gt_MfdqConfig.psa_FbckTones[s_tidx] - gt_MfdqConfig.s_ReadToneStart;
   sa_Ck[0] = gt_MfdqConfig.psa_SfdqCoef[2*s_ft_idx]; // Ck real (Q1.15)
   sa_Ck[1] = gt_MfdqConfig.psa_SfdqCoef[2*s_ft_idx+1]; // Ck imag (Q1.15)
   s_Ck_exp = (int16)gt_MfdqConfig.puca_SfdqExp[s_ft_idx];

   // get original SFDQ coefficient for MFDQ tone
   s_mt_idx = gt_MfdqConfig.psa_MfdqTones[s_tidx] - gt_MfdqConfig.s_ReadToneStart;
   sa_Ci[0] = gt_MfdqConfig.psa_SfdqCoef[2*s_mt_idx]; // Ci real (Q1.15)
   sa_Ci[1] = gt_MfdqConfig.psa_SfdqCoef[2*s_mt_idx+1]; // Ci imag (Q1.15)
   s_Ci_exp = (int16)gt_MfdqConfig.puca_SfdqExp[s_mt_idx];

   // mantissa * 2^exp / 2^15 as floating point number
   // the mantissa may be negative, so it is widened to int32 and multiplied by
   // 2^exp instead of being left-shifted (undefined for negative values)
   f_temp = int32toFloat32((int32)1 << 15);
   fa_Ck[0] = int32toFloat32((int32)sa_Ck[0] * ((int32)1 << s_Ck_exp));
   fa_Ck[1] = int32toFloat32((int32)sa_Ck[1] * ((int32)1 << s_Ck_exp));
   fa_Ck[0] = divf32(fa_Ck[0], f_temp);
   fa_Ck[1] = divf32(fa_Ck[1], f_temp);
   fa_Ci[0] = int32toFloat32((int32)sa_Ci[0] * ((int32)1 << s_Ci_exp));
   fa_Ci[1] = int32toFloat32((int32)sa_Ci[1] * ((int32)1 << s_Ci_exp));
   fa_Ci[0] = divf32(fa_Ci[0], f_temp);
   fa_Ci[1] = divf32(fa_Ci[1], f_temp);

   // scale feedback tap Bk = W3 - W2*(2^s_up_lsh)
   f_scale = int32toFloat32((int32)1 << s_up_lsh);
   fa_W[2] = mpyf32(fa_W[2], f_scale);
   fa_W[3] = mpyf32(fa_W[3], f_scale);
   fa_W[4] = subf32(fa_W[4], fa_W[2]);
   fa_W[5] = subf32(fa_W[5], fa_W[3]);

   // scale feedforward tap Ak = W2*(2^s_up_lsh)*conj(C_k)
   // (must follow the Bk update above, which needs the unrotated W2)
   ComplexMultFloat32(&fa_W[2], &fa_W[2], fa_Ck, 1);

   // scale center tap Si = W1*conj(C_1)
   ComplexMultFloat32(&fa_W[0], &fa_W[0], fa_Ci, 1);

   //==========================================================================
   // convert MFDQ coefficients to fixed point
   //==========================================================================

   // scale Si (Q1.15) and Ak (Q1.15) with exp = 0 to 15
   // scale Bk (Q1.15) with exp = -9 to 6 (- means right shift)
   for (i=0; i<3; i++)
   {
      j = i << 1;
      if (i == 2)
      {
         // feedback tap gets extra fractional bits before the exponent search
         s_rsh = MFDQ_MAX_FB_EXP_RSHFT;
      }
      else
      {
         s_rsh = 0;
      }
      l_scale = (int32)1 << (int16)(MFDQ_MANTISSA_FRAC_BITS + s_rsh);
      f_scale = int32toFloat32(l_scale);
      f_temp = mpyf32(fa_W[j], f_scale);
      l_real = f32toint32(f_temp, 0);
      f_temp = mpyf32(fa_W[j+1], f_scale);
      l_imag = f32toint32(f_temp, 0);

      // find the smallest shift count for which both rounded components fit
      // in a signed 16-bit mantissa
      s_exp = -1;
      do
      {
         s_exp++;
         l_tmp_real = round(l_real, s_exp);
         l_tmp_imag = round(l_imag, s_exp);
      }
      while ((l_tmp_real > (int32)0x00007FFFL) || (l_tmp_real < (int32)0xFFFF8000L) ||
             (l_tmp_imag > (int32)0x00007FFFL) || (l_tmp_imag < (int32)0xFFFF8000L));
      if (s_exp > MFDQ_MAX_EXP_SHFT)
      {
         // exponent is clamped but the mantissas are not recomputed:
         // the result is reported as FAIL
         s_exp = MFDQ_MAX_EXP_SHFT;
         gt_MfdqConfig.s_ErrorCode |= MFDQ_COEFF_OVFLOW;
         s_return = FAIL;
      }
      *psa_MfdqCoef++ = (int16)l_tmp_real;
      *psa_MfdqCoef++ = (int16)l_tmp_imag;
      *psa_MfdqCoef++ = s_exp;
   }

   return (s_return);
}

/*
*-------------------------------------------------------------------------------
*
*   Prototype: int16 GetUpScaleFactor(int32 la_Hi, int32 la_Lo, int16 s_acc_rsh)
*
*   This function computes an up-scale factor (# of left shift) for a 48-bit
*   accumulated power term given as a high/low word pair. CalcMfdqCoef() calls
*   it for the accumulated Ek*Ek' and Yi*Yi' terms.
*
*   The pair is first passed through norm_acc48(). When the normalization
*   count of the high word is non-zero the result is based on the high word,
*   otherwise on the low word.
*
*   Input Arguments:
*      la_Hi: high 16-bit of the accumulated term
*      la_Lo: low 16-bit of the accumulated term
*      s_acc_rsh: log2(# of accumulation)
*
*   Output Arguments:
*      none
*
*   Returns:
*      up scale factor = # of left shift
*         norm_l(la_Hi) + s_acc_rsh - 16   if norm_l(la_Hi) != 0
*         norm_l(la_Lo) + s_acc_rsh        otherwise
*
*   Global Variables:
*      none
*
*-------------------------------------------------------------------------------
*/

int16 GetUpScaleFactor(int32 la_Hi, int32 la_Lo, int16 s_acc_rsh)
{
   int16 s_NormHi, s_NormLo;

   // la_Hi/la_Lo are by-value copies, so normalizing them in place is local
   norm_acc48(&la_Hi, &la_Lo);

   s_NormHi = norm_l(la_Hi);
   s_NormLo = norm_l(la_Lo);

   // a zero count for the high word selects the low word
   // (the high word carries 16 more bits of weight, hence the -16)
   return (int16)((s_NormHi != 0) ? (s_NormHi + s_acc_rsh - 16)
                                  : (s_NormLo + s_acc_rsh));
}

/*
*-------------------------------------------------------------------------------
*
*   Prototype: FloatG ConvMfdqStat(int32 l_hi, int16 s_hi_rsh, int32 l_lo,
*      int16 s_lo_rsh, int16 s_acc_rsh)
*
*   This function averages accumulated MFDQ statistics and convert them into
*   floating point number.
*
*      f_result = (l_hi/(2^s_hi_rsh) + l_lo/(2^s_lo_rsh))/(2^s_acc_rsh)
*
*   Input Arguments:
*      l_hi: 32-bit number represents high 16-bit of 48-bit accumulation
*      s_hi_rsh: normalization factor applied to l_hi (# of right shift)
*      l_lo: 32-bit number represents low 16-bit of 48-bit accumulation
*      s_lo_rsh: normalization factor applied to l_lo (# of right shift)
*      s_acc_rsh: # of accumulation (# of right shift)
*
*   Output Arguments:
*      f_result: floating point result
*
*   Returns:
*
*   Global Variables:
*
*-------------------------------------------------------------------------------
*/

FloatG ConvMfdqStat(int32 l_hi, int16 s_hi_rsh, int32 l_lo, int16 s_lo_rsh, int16 s_acc_rsh)
{
   int32 l_temp;
   FloatG f_temp, f_hi, f_lo, f_result;

   // f_hi = l_hi/(2^s_hi_rsh)
   f_hi = int32toFloat32(l_hi);
   if (s_hi_rsh > 0)
   {
      l_temp = (int32)(1 << s_hi_rsh);
      f_temp = int32toFloat32(l_temp);
      f_hi = divf32(f_hi, f_temp);
   }
   else
   {
      l_temp = (int32)(1 << -s_hi_rsh);
      f_temp = int32toFloat32(l_temp);
      f_hi = mpyf32(f_hi, f_temp);
   }

   // f_lo = l_lo/(2^s_lo_rsh)
   f_lo = int32toFloat32(l_lo);
   if (s_lo_rsh > 0)
   {
      l_temp = (int32)(1 << s_lo_rsh);
      f_temp = int32toFloat32(l_temp);
      f_lo = divf32(f_lo, f_temp);
   }
   else
   {
      l_temp = (int32)(1 << -s_lo_rsh);
      f_temp = int32toFloat32(l_temp);
      f_lo = mpyf32(f_lo, f_temp);
   }

   // (f_hi + f_lo)/(2^s_acc_rsh)
   l_temp = (int32)(1 << s_acc_rsh);
   f_temp = int32toFloat32(l_temp);
   f_result = addf32(f_hi, f_lo);
   f_result = divf32(f_result, f_temp);

   return (f_result);
}

/*
*-------------------------------------------------------------------------------
*
*   Prototype: void ComplexMultFloat32(FloatG *pfa_Z, FloatG *pfa_X,
*      FloatG *pfa_Y, FlagT ft_conj_Y)
*
*   This function multiplies two complex numbers stored as (real, imag) pairs.
*      Zr+jZi = (Xr+jXi)*(Yr+jYi)      if ft_conj_Y == 0
*      Zr+jZi = (Xr+jXi)*(Yr-jYi)      otherwise
*
*   Both inputs are read before the output is written, so pfa_Z may point to
*   the same storage as pfa_X or pfa_Y.
*
*   Input Arguments:
*      pfa_X: pointer to complex input 1
*      pfa_Y: pointer to complex input 2
*      ft_conj_Y: apply conjugate on Y if non-zero
*
*   Output Arguments:
*      pfa_Z: pointer to complex output
*
*   Returns:
*      none
*
*   Global Variables:
*      none
*
*-------------------------------------------------------------------------------
*/

void ComplexMultFloat32(FloatG *pfa_Z, FloatG *pfa_X, FloatG *pfa_Y, FlagT ft_conj_Y)
{
   // take local copies first: the output may alias one of the inputs
   FloatG f_Xr = pfa_X[0];
   FloatG f_Xi = pfa_X[1];
   FloatG f_Yr = pfa_Y[0];
   FloatG f_Yi = pfa_Y[1];
   FloatG f_ProdA, f_ProdB;

   // conjugation only flips the sign of the imaginary part of Y
   if (ft_conj_Y)
   {
      f_Yi = subf32(0, f_Yi);
   }

   // real part: Xr*Yr - Xi*Yi
   f_ProdA = mpyf32(f_Xr, f_Yr);
   f_ProdB = mpyf32(f_Xi, f_Yi);
   pfa_Z[0] = subf32(f_ProdA, f_ProdB);

   // imaginary part: Xi*Yr + Xr*Yi
   f_ProdA = mpyf32(f_Xi, f_Yr);
   f_ProdB = mpyf32(f_Xr, f_Yi);
   pfa_Z[1] = addf32(f_ProdA, f_ProdB);
}

/*
*-------------------------------------------------------------------------------
*
*   Prototype: void CalcComplexLinearEq3x3(FloatG *pfa_R, FloatG *pfa_P, FloatG *pfa_W)
*
*   This function solves 3x3 complex linear equation, W = inv(R)*P.
*   where R is a 3x3 Hermitian matrix, and P is 3x1 complex vector
*
*   The complex system is expanded to a real one: every complex entry a+jb of
*   R becomes the 2x2 block [a -b; b a], giving a 6x6 real symmetric matrix,
*   and P becomes a 6x1 real vector of interleaved (real, imag) values. Only
*   the upper triangle is filled, and P is stored as a 7th column, so the
*   work matrix handed to Cholesky()/BackSubstitution() has dimension 7.
*
*   Input Arguments:
*      pfa_R: pointer to 3x3 complex matrix R
*         pfa_R[0]: R(1,1) - always real
*         pfa_R[1]: real part of R(1,2)
*         pfa_R[2]: imaginary part of R(1,2)
*         pfa_R[3]: real part of R(1,3)
*         pfa_R[4]: imaginary part of R(1,3)
*         pfa_R[5]: R(2,2) - always real
*         pfa_R[6]: real part of R(2,3)
*         pfa_R[7]: imaginary part of R(2,3)
*         pfa_R[8]: R(3,3) - always real
*      pfa_P: pointer to 3x1 complex vector P
*         pfa_P[0]: real part of P(1,1)
*         pfa_P[1]: imaginary part of P(1,1)
*         pfa_P[2]: real part of P(2,1)
*         pfa_P[3]: imaginary part of P(2,1)
*         pfa_P[4]: real part of P(3,1)
*         pfa_P[5]: imaginary part of P(3,1)
*
*   Output Arguments:
*      pfa_W: pointer to 3x1 complex vector W, written by BackSubstitution()
*         called with dimension 7 (the caller in this file provides a
*         7-element buffer and reads W as interleaved real/imag in [0..5])
*
*   Returns:
*      none
*
*   Global Variables:
*      none (a function-local static work matrix is used, so the function is
*      not reentrant)
*
*-------------------------------------------------------------------------------
*/


void CalcComplexLinearEq3x3(FloatG *pfa_R, FloatG *pfa_P, FloatG *pfa_W)
{
   static FloatG fa_R[28]; // M*(M+1)/2 where M=7

   // rows 0/1: R(1,1), R(1,2), R(1,3)
   LMat(fa_R, 0, 0, 7) = (FloatG)pfa_R[0];
   LMat(fa_R, 0, 1, 7) = (FloatG)0;
   LMat(fa_R, 0, 2, 7) = (FloatG)pfa_R[1];
   LMat(fa_R, 0, 3, 7) = (FloatG)subf32(0, pfa_R[2]);
   LMat(fa_R, 0, 4, 7) = (FloatG)pfa_R[3];
   LMat(fa_R, 0, 5, 7) = (FloatG)subf32(0, pfa_R[4]);
   LMat(fa_R, 1, 1, 7) = (FloatG)pfa_R[0];
   LMat(fa_R, 1, 2, 7) = (FloatG)pfa_R[2];
   LMat(fa_R, 1, 3, 7) = (FloatG)pfa_R[1];
   LMat(fa_R, 1, 4, 7) = (FloatG)pfa_R[4];
   LMat(fa_R, 1, 5, 7) = (FloatG)pfa_R[3];

   // rows 2/3: R(2,2), R(2,3)
   LMat(fa_R, 2, 2, 7) = (FloatG)pfa_R[5];
   LMat(fa_R, 2, 3, 7) = (FloatG)0;
   LMat(fa_R, 2, 4, 7) = (FloatG)pfa_R[6];
   LMat(fa_R, 2, 5, 7) = (FloatG)subf32(0, pfa_R[7]);
   LMat(fa_R, 3, 3, 7) = (FloatG)pfa_R[5];
   LMat(fa_R, 3, 4, 7) = (FloatG)pfa_R[7];
   LMat(fa_R, 3, 5, 7) = (FloatG)pfa_R[6];

   // rows 4/5: R(3,3)
   LMat(fa_R, 4, 4, 7) = (FloatG)pfa_R[8];
   LMat(fa_R, 4, 5, 7) = (FloatG)0;
   LMat(fa_R, 5, 5, 7) = (FloatG)pfa_R[8];

   // column 6: right-hand side P
   LMat(fa_R, 0, 6, 7) = (FloatG)pfa_P[0];
   LMat(fa_R, 1, 6, 7) = (FloatG)pfa_P[1];
   LMat(fa_R, 2, 6, 7) = (FloatG)pfa_P[2];
   LMat(fa_R, 3, 6, 7) = (FloatG)pfa_P[3];
   LMat(fa_R, 4, 6, 7) = (FloatG)pfa_P[4];
   LMat(fa_R, 5, 6, 7) = (FloatG)pfa_P[5];

   // the corner element has no counterpart in R or P; the work matrix is
   // static and factored in place, so reset it to its first-call value (0)
   // to keep every solve independent of the previous one
   LMat(fa_R, 6, 6, 7) = (FloatG)0;

   Cholesky(fa_R, 7);
   BackSubstitution(fa_R, pfa_W, 7);
}

