/* **COPYRIGHT******************************************************************
    INTEL CONFIDENTIAL
    Copyright (C) 2017 Intel Corporation
    Copyright (C), 1994-2002 Aware Inc. All Rights Reserved.
******************************************************************COPYRIGHT** */
/* **DISCLAIMER*****************************************************************
    The source code contained or described herein and all documents related
    to the source code ("Material") are owned by Intel Corporation or its
    suppliers or licensors. Title to the Material remains with Intel
    Corporation or its suppliers and licensors. The Material may contain
    trade secrets and proprietary and confidential information of Intel
    Corporation and its suppliers and licensors, and is protected by
    worldwide copyright and trade secret laws and treaty provisions. No part
    of the Material may be used, copied, reproduced, modified, published,
    uploaded, posted, transmitted, distributed, or disclosed in any way
    without Intel's prior express written permission.

    No license under any patent, copyright, trade secret or other
    intellectual property right is granted to or conferred upon you by
    disclosure or delivery of the Materials, either expressly, by
    implication, inducement, estoppel or otherwise. Any license under
    such intellectual property rights must be express and approved by
    Intel in writing.
*****************************************************************DISCLAIMER** */
/*****************************************************************************
;
;   Aware DMT Technology. Proprietary and Confidential.
;
;   40 Middlesex Turnpike, Bedford, MA 01730-1413 USA
;   Phone (781) 276 - 4000
;   Fax   (781) 276 - 4001
;
;   tdq_init.c
;
;  Routine to perform time-domain equalization (TDQ) initialization.
;  This function will be called from the C_R_REVERB1 state of the training
;  phase of initialization. This version implements the reduced matrix
;  computation.
;
;   New algorithm using the channel impulse response.
;
;****************************************************************************/

#include "common.h"
#include "dsp_op.h"
#include "rx_ops.h"
#include "dsp_op2.h"
#include "rt_tones.h"
#include "ifft_fix.h"
#include "matrix.h"
#include "gdata.h"
#include "gdata_bis.h"
#include "tx_ops.h"
#include "tx_ops2.h"
#include "fft_bg.h"
#include "platform.h"
#include "fpconv48.h"
#include "circor48.h"
#include "int48toFloat32.h"
#include "data_alloc.h"
#include "ieee_flt.h"
#include "cmv.h"

//#define PRN_TDQ       //if defined, received and reference signal for TDQ training are
                  //saved to file x.dat and y.dat respectively for test in Matlab

#ifdef PRN_TDQ
#include <stdio.h>
#endif


#include "fpc_ini.h"

//===============================================
//static variable declarations
//===============================================


#define MAX_COEFF_VALUE_LOG2     11

int16 *psa_SignalIn;

//Dont detele these lines, just to keep track of sizes */
//Float32 fa_Rxx[TDQ_LENGTH];                // vector that holds X'X
//Float32 fa_WFull[TDQ_LENGTH][TDQ_LENGTH];     // Fast TDQ.
//Float32 fa_A[TDQ_LENGTH*(TDQ_LENGTH+1)/2];    // holds A from (**)

DATA_MAP_deILV3_BIS  //pointers currently point to guca_HndshkBuf[] in Deilv1
Float32 *fa_Rxx;                 // vector that holds X'X
Float32 *fa_WFull;      // Fast TDQ.
Float32 *fa_A;    // holds A from (**)
DATA_MAP_END

void Populate_W_Matrix(void)
{
   int16 index1, index2, i, j, k;
   int32 l_Prod;
   Float32 f_Sum_p, f_Sum_n;
   int16 s_CPLen;

   int32 l_acc48L_n;
   int32 l_acc48H_n;
   int32 l_acc48L_p;
   int32 l_acc48H_p;

   s_CPLen = gs_RxSamplesPerFrame >> 4;

   /*
   * Compute Rxx, the autocorrelation of the estimated channel impulse response,
   * at TDQLEN-1 points.
   * This will contain periodic autocorrelations Rxx[0],Rxx[1], ...
   * ... Rxx[TDQlength - 1].
   */
   for(i=0;i<gs_TDQLen;i++)
      fa_Rxx[i] = CircularCorrelate(psa_SignalIn,psa_SignalIn,gs_RxSamplesPerFrame,gs_RxSamplesPerFrame,i);

   // Compute elements of symmetric W matrix.

   for (i=0; i<gs_TDQLen; i++) {
      for (j=0; j<=i; j++) {

         l_acc48L_n = 0;
         l_acc48H_n = 0;
         l_acc48L_p = 0;
         l_acc48H_p = 0;

         for (k=1; k<= s_CPLen ; k++)
         {
            index1 = k-i;
            if (index1 < 0)
               index1 += gs_RxSamplesPerFrame;
            index2 = k-j;
            if (index2 < 0)
               index2 += gs_RxSamplesPerFrame;
            l_Prod = (int32)psa_SignalIn[index1]*(int32)psa_SignalIn[index2];
            if (l_Prod < 0) {
               l_Prod = -l_Prod;
               l_acc48L_n += l_Prod & 0xFFFF;
               l_acc48H_n += l_Prod >> 16;
            }
            else {
               l_acc48L_p += l_Prod & 0xFFFF;
               l_acc48H_p += l_Prod >> 16;
            }
         }
         // Convert the 48-bit product to a 32-bit float value.
         f_Sum_p = int48toFloat32(l_acc48H_p,l_acc48L_p);
         f_Sum_n = int48toFloat32(l_acc48H_n,l_acc48L_n);
         f_Sum_p = subf32(f_Sum_p, f_Sum_n);

         fa_WFull[i*TDQ_LENGTH+j] = f_Sum_p;
      }
   }

   // Form sparse, symmetric A = X'X - W
   for (i=0; i<gs_TDQLen; i++) {
      for (j=0; j<=i; j++) {
         LMat(fa_A, j, i, gs_TDQLen) = subf32(fa_Rxx[i-j],fa_WFull[i*TDQ_LENGTH+j]);
      }
   }

}

/*****************************************************************************
;  Subroutine Name: TdqInit(void)
;
;  Description:
;     This routine calculates the coefficients of the following TDQ filter:
;
;        y[n] = sum_{i=1}^{CPlen}(b_i y[n-i]) + sum_{j=1}^{TDQlen}(a_j x[n])
;     where y[n] are outputs of the filter (desired response) and
;     x[n] are the inputs to the filter (output of channel)
;
;     Written in vector form:
;
;     [X|Y] * [a|b]' = z, where
;
;     z is a vector of y[l] values at times n, n-1, n-2,...
;     X is a matrix whos rows, of dimension TDQlen, contain x[l] values starting at
;        l = n, for row 1; n-1, for row 2, etc.
;     Y is a matrix whos rows, of dimension CPlen, contain y[l] values starting at
;        l = n-1, for row 1; n-2, for row 2, etc.
;     a and b are the vectors of filter coefficients, a_i and b_j
;     Since the input signal (reverb) is periodic, all of the above values can be
;     filled in using the contents of a single buffer.
;
;     Premultiplying the above by [X Y]', we get
;     [X'X X'Y  * [a = [X'z
;     Y'X Y'Y]    b]   X'z]
;
;     For the TDQ, we will only be obtaining the a values. To do this, we can premultiply
;     the above by:
;     [I -X'Y*inv(Y'Y)
;     0          I       ], where I is the identity matrix.  This results in an upper and
;     lower vector equation.  One to solve for a and one for b.  Taking the upper one for a, we get
;
;     [X'X - X'Y*inv(Y'Y)*Y'X] * [a] = [X'z - X'Y*inv(Y'Y)*Y'z].
;
;     If L is the Cholesky decomposition of inv(Y'Y), then inv(Y'Y) = LL' and the above becomes
;
;     [X'X - (X'YL)(L'Y'X)] * [a] = [X'z - (X'YL)(L'Y'z)] (**)
;     or Aa = c,
;     where c is defined in (**).
;
;     Equation (**) will be used to solve for the coefficients, a.  Since y[n] are desired outputs, we
;     will use the known replica vector to supply these values; hence, the following
;     terms can be stored in memory, if desired.
;        L, L'Y'z, YL, L'Y'
;     For this implementation, L is provided and all other quantities are computed.
;
;     Note also that the matrix X is created from a circular buffer.  As a result, the
;     autocorrelation matrix X'X is Toeplitz in nature.  Given this, only one row of X'X will be needed
;     to be computed and the rest can easily be filled out.  In addition, more computational savings
;     are gained by only computing the minimum needed unique cross-correlations Rxy[-CPLen]...Rxy[TDQLen-1],
;     which are used to efficiently compute L'Y'X and also contain the needed subset X'z.
;
;     Since the matrix A is symmetric, only the lower triangular portion plus
;     the diagonal is required for the Cholesky.  This will fill a special vector, R, only
;     containing the mandatory number of values.  In addition, to complete the decomposition
;     and backsubstitution, c' will be appended to the bottom of A and placed in R.
;
;
;  Prototype:
;     void TdqInit(void);
;
;  Input Arguments:
;     psa_SignalIn         - received input signal (x)
;
;  Output Arguments:
;     none
;
;  Return Value:
;     none
;
;  Global Variables Used:
;       gsa_pre_tdq_h[]          - (O) filter coeffcients
;     gfa_TdqLt[]          - (I) transpose of L matrix
;     guc_TdqTrainingState - (O) status of progress
;     gs_pre_tdq_h_exp        - (O) right shift amount for filter
;
;  Notes: This implementation uses floating point emulation with fixed
;     point operations and does not require any external floating point
;     libraries from the compiler.
;
;****************************************************************************/

// these buffers are mapped to handshake buffer
   // static Float32 fa_TempBuf[R_MAT_SIZE];             // temporary buffer to hold outputs of backsub
   //static Float32 fa_R[(R_MAT_SIZE*(R_MAT_SIZE+1))/2];
   // static Float32 fa_Xtz[TDQ_LENGTH];        // X'z
   // static Float32 fa_Temp[TDQ_LENGTH];

DATA_MAP_deILV3_BIS  //pointers currently point to guca_HndshkBuf[] in Deilv1
Float32 *fa_TempBuf;             // temporary buffer to hold outputs of backsub
   // sparse matrix used for Cholesky (holds A and c)
Float32 *fa_R;
Float32 *fa_Xtz;        // X'z
Float32 *fa_Temp;
DATA_MAP_END

int16 gs_TdqCtbk=0;
void TdqInit_new(void)
{
   int16 i, j;
   int16 *ps;
   int16 gs_MatrixSize;
   //int16 s_DLen;                           // difference between CPLen and TDQLen
   int16 s_Val, s_Val2;

   Float32 *fa_z = (Float32 *)(void *)gsa_TDQTrainingSignal;
   int16 *sa_TrainingSignal = gsa_TDQTrainingSignal;
   int16 *sa_FftInBuf = gsa_TDQTrainingSignal;
   int16 *sa_FftOutBuf = gsa_TDQTrainingSignal;
   // Reuse an existing buffer to store the power in the FFT bins.
   // Required size for this use is [RX_LAST_CHANNEL-RX_FIRST_CHANNEL+1] 32-bit doublewords.
   int32 *l_Power = (int32 *)(void *)gsa_TDQTrainingSignal;

   int16 s_CPLen;

DATA_MAP_deILV3_BIS
   static Float32 f_Sum;                           // intermediate sum
   static Float32 f_Val, f_Val2;


   // coefficient scaling
   int16   s_TDQScaleFactor = TDQ_SCALE_FACTOR;    // scale power of coeffs
   int16 s_MaxCoeffValue = MAX_COEFF_VALUE;        // make largest coeff. this value
   int16 s_MaxCoeffValueLog2 = MAX_COEFF_VALUE_LOG2;
    static Float32 f_MaxCoeff;                           // holds maximum coefficient
   static Float32 f_Rms;
   static Float32 f_ScaleFactor;
   static Float32 f_FScale;

DATA_MAP_END

#ifdef PRN_TDQ
   //Tmp print reverb and received signal for MATLAB test
   FILE *fpReverb, *fpReceive, *fpRxx, *fpRxy, *fpLtYtX, *fpA;
   FILE *fpR, *fpTemp, *fpTdqV, *fpXtz, *fpTempBuf;
#endif

   psa_SignalIn = gsa_RxRepFrameAlignBuf;

   /* if the OfflineTdq flag is TRUE, then gsa_pre_tdq_h[] and */
   /* gsa_pre_tdq_exp are already set to the desired values */

   if(gft_OfflineTdq == TRUE)
      return;

   gs_MatrixSize = gs_TDQLen + 1;
   s_CPLen = gs_RxSamplesPerFrame >> 4;

   Populate_W_Matrix();

   fa_Xtz[0] = int2f32(psa_SignalIn[0]);
   for (i=1; i<gs_TDQLen; i++)
      fa_Xtz[i] = int2f32(psa_SignalIn[gs_RxSamplesPerFrame - i]);

   // Now fill sparse R matrix with lower triangular and diagonal regions of A
   // as well as c'.
   for (i=0; i<gs_TDQLen; i++) {
      for (j=0; j<=i; j++) {
         LMat(fa_R, j, i, gs_MatrixSize) = LMat(fa_A, j, i, gs_TDQLen);
      }
   }
   for (i=0; i<gs_TDQLen; i++) {
      LMat(fa_R, i, gs_TDQLen, gs_MatrixSize) = fa_Xtz[i];
   }

#ifdef PRN_TDQ
   fpR = fopen("R_pre_chol.dat", "w");
   for(i=0; i<gs_TDQLen*(gs_TDQLen+1)/2; i++)
      fprintf(fpR, "%ld\n", fa_R[i]);
   fclose(fpR);
#endif

   // do Cholesky and backsubstitution to obtain floating-point coefficients
   Cholesky(fa_R, gs_MatrixSize);

#ifdef PRN_TDQ
   fpR = fopen("R_post_chol.dat", "w");
   for(i=0; i<gs_TDQLen*(gs_TDQLen+1)/2; i++)
      fprintf(fpR, "%ld\n", fa_R[i]);
   fclose(fpR);
#endif

      BackSubstitution(fa_R, fa_TempBuf, gs_MatrixSize);

#ifdef PRN_TDQ
   fpTempBuf = fopen("tdq32_c_code.dat", "w");
   for(i=0; i<gs_TDQLen; i++)
      fprintf(fpTempBuf, "%ld\n", fa_TempBuf[i]);
   fclose(fpTempBuf);
#endif

   // convert output coefficients to fixed point

   // first make largest coefficient s_MaxCoeffValue
   f_MaxCoeff = absf32(fa_TempBuf[0]);
   for (i=1; i<gs_TDQLen; i++) {
      f_Val = absf32(fa_TempBuf[i]);
      f_MaxCoeff = ( cmpgtf32(f_Val, f_MaxCoeff)) ? f_Val : f_MaxCoeff;
   }

   // calculate and scale (by s_MaxCoeffValue) a fixed point TDQ
   ps = &(gsa_pre_tdq_h[0]);
   for (i=0; i<gs_TDQLen; i++) {
      *ps++ = f32toint16(mpyf32(divf32(fa_TempBuf[i],f_MaxCoeff),int2f32(s_MaxCoeffValue)), 0);
   }

#ifdef PRN_TDQ
   fpTempBuf = fopen("tdq_1.dat", "w");
   fprintf(fpTempBuf, "%x\n", f_MaxCoeff);
   for(i=0; i<gs_TDQLen; i++)
      fprintf(fpTempBuf, "%d\n", gsa_pre_tdq_h[i]);
   fclose(fpTempBuf);
#endif

   // Scale TDQ taps.
   // convolve input signal with TDQ to get the calculated output as Float32
   // and compute rms power and account for some  scaling during convolution.

   /* perform any platform-specific initializations */
   /* Note: MemCpyTDQFilterForConvolve is a wrapper function for doing memcopy */
   MemCpyTDQFilterForConvolve(&gsa_RxRepFrameAlignBuf[0], gsa_pre_tdq_h, (int16)(sizeof(int16)*2*gs_TDQLen));

   f_Rms = FixedPointConvolve48(&(gsa_RxRepFrameAlignBuf[gs_RxSamplesPerFrame]), gsa_RxRepFrameAlignBuf, fa_z,
         f_MaxCoeff, s_CPLen, gs_TDQLen, s_MaxCoeffValueLog2);

   // get time-based scaling
   f_ScaleFactor = divf32(int2f32(TDQ_SCALE_FACTOR),f_Rms);

   // do FFT of sample output, first make sure numbers are in range
   f_Val = 0;
   for (i=0; i<gs_RxSamplesPerFrame; i++) {
      f_Val2 = absf32(fa_z[i]);
      if (cmpgtf32(f_Val2, f_Val)) {
         f_Val = f_Val2;
        }
   }

    f_FScale = divf32(int2f32(s_MaxCoeffValue), f_Val);
   for (i=0; i<gs_RxSamplesPerFrame; i++) {
      sa_FftInBuf[i] = f32toint16(mpyf32(fa_z[i], f_FScale),0);
   }

   BgFftReal(sa_FftInBuf, sa_FftOutBuf, (int16)(gs_RxSamplesPerFrame>>1), (int16)(gs_Log2RxSamplesPerFrame-1));

    // get maximum value out of FFT
   s_Val = 0;
   for (i=0; i<(gs_RxSamplesPerFrame); i++) {
      s_Val2 = abs_s(sa_FftOutBuf[i]);
      if (s_Val2 > s_Val) {s_Val = s_Val2;}
   }

   // get combined frequency based scaling to get to TDQ_SCALE_FACTOR
   f_FScale = mpyf32(f_FScale,divf32(int2f32(TDQ_SCALE_FACTOR),int2f32(s_Val)));

   // final scaling will be minimum of time-based and frequency-based scaling
    f_ScaleFactor = (cmpgtf32(f_FScale, f_ScaleFactor) ) ? f_ScaleFactor : f_FScale;

   if (gs_CurrentCoChipset == IFTN_CO_CHIPSET) gs_TdqCtbk = 3; //for geminax CO use 3dB tdq scaling cutback
   else if (gs_TdqCtbk==-1) gs_TdqCtbk = ((gs_PGA_margin + gs_PGA_margin_delta_AnxL_BTloops)>>8);  //if initialized to -1, use pga margin for cutback
   for (i = 0; i < gs_TdqCtbk; i++)
   {
      f_ScaleFactor = divf32(mpyf32(f_ScaleFactor, int2f32(14602)), int2f32(16384));
   }

#ifdef PRN_TDQ
   fpTempBuf = fopen("tdq_2.dat", "w");
   fprintf(fpTempBuf, "Freq: %ld\tTime: %ld\n", f_FScale, f_ScaleFactor);
   fclose(fpTempBuf);
#endif

   // scale floating-point coeffcients
   for (i=0; i<gs_TDQLen; i++) {
      fa_TempBuf[i] = mpyf32(fa_TempBuf[i],f_ScaleFactor);
   }

   // max coefficient has scaled too
   f_MaxCoeff = mpyf32(f_MaxCoeff,f_ScaleFactor);

    //============================================================
    // get new scaling to scale max coefficient such that
    //  2^14 < |f_MaxCoeff| <= 2^15
    // (keep most significant digits in fixed point coefficients)
    //============================================================
   f_ScaleFactor = divf32(TWO_TO_15TH, f_MaxCoeff);

   // now get exponent, f_ScaleFactor = (2^exp) * (frac), w/ 1 <= frac < 2
    // set gs_pre_tdq_h_exp = exp
   gs_pre_tdq_h_exp = expf32(f_ScaleFactor);

   // scale coefficients by 2^(gs_pre_tdq_h_exp)
   f_ScaleFactor = pow2f32(gs_pre_tdq_h_exp);

    ps = &(gsa_pre_tdq_h[0]);
   for (i=0; i<gs_TDQLen; i++) {
      *ps++ = f32toint16(mpyf32(fa_TempBuf[i],f_ScaleFactor),0);
   }

#ifdef PRN_TDQ
   fpTempBuf = fopen("tdq_3.dat", "w");
   for(i=0; i<gs_TDQLen; i++)
      fprintf(fpTempBuf, "%ld\n", gsa_pre_tdq_h[i]);
   fclose(fpTempBuf);
#endif

   // restore C-Reverb reference which may have been obliterated by floating
   // point signal
   if ((gs_RxNumTones == 128) && (gs_RxSamplesPerFrame == 512))
   {
      Gen_Reverb_Ref_fd(gsa_CReverbRefTones, GEN_REVERB_TYPE_CO, gs_RxNumTones,
         gs_RxFirstChannel, gs_RxLastChannel, (int16)(DEC_QPSK_GAIN));
   }

        if(gs_pre_tdq_h_exp < LOWERBOUND_TDQEXP)
           gs_pre_tdq_h_exp = LOWERBOUND_TDQEXP;

        if(gs_pre_tdq_h_exp > UPPERBOUND_TDQEXP)
        {
           j= gs_pre_tdq_h_exp -UPPERBOUND_TDQEXP;
           gs_pre_tdq_h_exp = UPPERBOUND_TDQEXP;

           for(i=0; i<gs_TDQLen; i++)
              gsa_pre_tdq_h[i] >>= j;
        }

#ifdef PRN_TDQ
   fpTempBuf = fopen("tdq_4.dat", "w");
   for(i=0; i<gs_TDQLen; i++)
      fprintf(fpTempBuf, "%ld\n", gsa_pre_tdq_h[i]);
   fclose(fpTempBuf);
#endif

   // reset flag to signal training is finished (for background tasking)
   guc_TdqTrainingState = TRAINING_DONE;

   return;
}

//undefine constants to clarify code
#ifndef FLOAT_IN_SW
    #ifndef ABS_VAL
    #undef ABS_VAL(x)
    #endif
#endif
