;; **COPYRIGHT******************************************************************
;;    INTEL CONFIDENTIAL
;;    Copyright (C) 2017 Intel Corporation
;;    Copyright (C), 1994-2002 Aware Inc. All Rights Reserved.
;; ******************************************************************COPYRIGHT**
;; **DISCLAIMER*****************************************************************
;;   The source code contained or described herein and all documents related
;;   to the source code ("Material") are owned by Intel Corporation or its
;;   suppliers or licensors. Title to the Material remains with Intel
;;   Corporation or its suppliers and licensors. The Material may contain
;;   trade secrets and proprietary and confidential information of Intel
;;   Corporation and its suppliers and licensors, and is protected by
;;   worldwide copyright and trade secret laws and treaty provisions. No part
;;   of the Material may be used, copied, reproduced, modified, published,
;;   uploaded, posted, transmitted, distributed, or disclosed in any way
;;   without Intels prior express written permission.
;;
;;   No license under any patent, copyright, trade secret or other
;;   intellectual property right is granted to or conferred upon you by
;;   disclosure or delivery of the Materials, either expressly, by
;;   implication, inducement, estoppel or otherwise. Any license under
;;   such intellectual property rights must be express and approved by
;;   Intel in writing.
;; *****************************************************************DISCLAIMER**
;*************************************************************************
; Aware DMT Technology. Proprietary and Confidential.
;
; ADDRESS:         40 Middlesex Turnpike, Bedford, MA 01730-1413 USA
; TELEPHONE:       781.276.4000
; FAX:             781.276.4001
; WEB:             http://www.aware.com
;
; FILENAME:        decxcorr.s
;
; DESCRIPTION:     This file contains the DECXcorr function used in DEC training.
;
;*************************************************************************
.include "asm.h"

.data
;-------------------------------------------------------------------------
; State written by SetupXYMemoryForDECXcorr and read back by DECXcorr.
;-------------------------------------------------------------------------

; Tx buffer address as stored by SetupXYMemoryForDECXcorr (psa_TxBuf after a
; one-bit arithmetic right shift); used by DECXcorr when s_TxOffset is even.
pla_DECTrainTxBufXY:
   .type pla_DECTrainTxBufXY, @object
   .size pla_DECTrainTxBufXY, 1*BYTES_PER_LONGWORD
   .word   0
   .align   BYTES_PER_LONGWORD

; Start of the odd-address Tx block (Tx base plus one block size); used by
; DECXcorr when s_TxOffset is odd.
; We assume that the Tx DEC signal length is even.
pla_DECTrainTxBufOddXY:
   .type pla_DECTrainTxBufOddXY, @object
   .size pla_DECTrainTxBufOddXY, 1*BYTES_PER_LONGWORD
   .word   0
   .align   BYTES_PER_LONGWORD

; Address of the reformatted Rx data, which SetupXYMemoryForDECXcorr places
; directly above the original Rx buffer.
pla_DECTrainRxBufXYAddr:
   .type pla_DECTrainRxBufXYAddr, @object
   .size pla_DECTrainRxBufXYAddr, 1*BYTES_PER_LONGWORD
   .word   0
   .align   BYTES_PER_LONGWORD

; Copy of s_TxBufSize; DECXcorr uses it to wrap s_TxOffset.
s_DECTxSignalLen:
   .type s_DECTxSignalLen, @object
   .size s_DECTxSignalLen, 1*BYTES_PER_WORD
   .half   0
   .align   BYTES_PER_WORD

; Spacing between consecutive Rx sub-blocks as stored by
; SetupXYMemoryForDECXcorr; DECXcorr multiplies s_RxOffset by this value.
s_DECRxBlockSize:
   .type s_DECRxBlockSize, @object
   .size s_DECRxBlockSize, 1*BYTES_PER_WORD
   .half   0
   .align   BYTES_PER_WORD

.text
;/*******************************************************************************
;*
;*   Prototype: void SetupXYMemoryForDECXcorr(int32 psa_TxBuf, int16 s_TxBufSize,
;*                            int32 psa_RxBuf, int16 s_RxBufSize, int16 s_pitch);
;*
;*
;*   Description:
;*       This function formats the existing Tx and Rx DEC buffers in XY memory and
;*       configures the XY window registers for use in the DECXcorr routine.
;*
;*       If s_RxBufSize < s_pitch the function returns right after saving the
;*       Tx variables: no Rx variable is written, no XY register is programmed
;*       and AlignOddAddressWordsY is not called.
;*
;*   Arguments:
;*       psa_TxBuf:   (I) XY address of the Tx DEC buffer (Y memory)
;*       s_TxBufSize: (I) size in 16-bit words of the Tx buffer; must be even
;*       psa_RxBuf:   (I) XY address of the Rx DEC buffer (X memory)
;*       s_RxBufSize: (I) size in 16-bit words of the Rx buffer; must be even
;*       s_pitch:     (I) ratio of sampling rate of received sequence to
;*                        the sampling rate of the decorrelating sequence;
;*                        thus s_RxBufSize should be a multiple of this number
;*
;*   Return Value:
;*       none
;*
;*   Global Variables:
;*       pla_DECTrainTxBufXY, pla_DECTrainTxBufOddXY, s_DECTxSignalLen,
;*       s_DECRxBlockSize, pla_DECTrainRxBufXYAddr (all written)
;*
;*******************************************************************************/
; void SetupXYMemoryForDECXcorr(int32 psa_TxBuf, int16 s_TxBufSize, int32 psa_RxBuf, int16 s_RxBufSize, int16 s_pitch);
.global SetupXYMemoryForDECXcorr
SetupXYMemoryForDECXcorr:
   ; Input Arguments
   ; %r0 = psa_TxBuf
   ; %r1 = s_TxBufSize
   ; %r2 = psa_RxBuf
   ; %r3 = s_RxBufSize
   ; %r4 = s_pitch

   .define psa_TxBuf, %r0
   .define s_TxBufSize,   %r1
   .define psa_RxBuf,     %r2
   .define s_RxBufSize,   %r3
   .define s_pitch,       %r4
   .define s_TxBufSize_sav, %r13

   ;;;;;;;;;;
   ; prolog ;
   ;;;;;;;;;;
   ; Frame layout (offsets from %sp after allocation):
   ;    BTSIZE_IN_BYTES                        : saved %r13
   ;    BTSIZE_IN_BYTES + BYTES_PER_LONGWORD   : spill slot for s_subBufSize
   .define FRAMESIZE_IN_BYTES, (BTSIZE_IN_BYTES + 2*BYTES_PER_LONGWORD)
   .define SUBBUFSIZE_SLOT, (BTSIZE_IN_BYTES + 1*BYTES_PER_LONGWORD)
   st  %blink, [%sp, BYTES_PER_LONGWORD]
   st  %fp, [%sp]
   mov %fp, %sp
   sub %sp, %sp, FRAMESIZE_IN_BYTES

   ; %r13 is saved here and restored in the epilog; it carries s_TxBufSize
   ; across the call to AlignOddAddressWordsY
   st  s_TxBufSize_sav, [%sp, BTSIZE_IN_BYTES]
   mov s_TxBufSize_sav, s_TxBufSize

   ;**********************************************
   ; compute and save variables used in DECXcorr *
   ;**********************************************
   .define scratch,      %r6
   .define pla_TxBuf32,  %r5
   .define pla_TxBufOdd, %r5
   .define s_leftShift,  %r6
   .define s_log2bufLen, %r6
   .define s_blockSize,  %r6
   ;;;;;;;;;;;;;;;;
   ; Tx variables ;
   ;;;;;;;;;;;;;;;;
   asr pla_TxBuf32, psa_TxBuf
   st  pla_TxBuf32, [pla_DECTrainTxBufXY]

   ; compute starting offset of the odd address block:
   ; s_blockSize = 1 << (30 - norm(s_TxBufSize - 1))
   sub  scratch, s_TxBufSize, 1
   norm s_leftShift, scratch
   sub  s_log2bufLen, 30, s_leftShift
   asl  s_blockSize, 1, s_log2bufLen
   add  pla_TxBufOdd, pla_TxBuf32, s_blockSize
   st   pla_TxBufOdd, [pla_DECTrainTxBufOddXY]

   stw  s_TxBufSize, [s_DECTxSignalLen]

   .undef scratch
   .undef pla_TxBuf32
   .undef pla_TxBufOdd
   .undef s_leftShift
   .undef s_log2bufLen
   .undef s_blockSize

   ;;;;;;;;;;;;;;;;
   ; Rx variables ;
   ;;;;;;;;;;;;;;;;
   ; to use the dual MAC for the cross-correlation routine,
   ; the data needs to be set up as follows:
   ;
   ;  Data                                           Address in 32-bit Addressing mode
   ;  |   s_pitch + offset |             offset |    n
   ;  | 3*s_pitch + offset | 2*s_pitch + offset |    n+1
   ;  | 5*s_pitch + offset | 4*s_pitch + offset |    n+2
   ;  etc.
   ;   where this data arrangement is repeated for 0 <= offset < s_pitch

   ; each data block will thus contain s_RxBufSize/s_pitch entries of 16-bit data
   ; s_RxBufSize/s_pitch is an integer because 1 Rx sample = s_pitch * 1 Tx sample
   ; compute s_subBufSize = s_RxBufSize/s_pitch using long division
   sub.f 0, s_RxBufSize, s_pitch
   blt   9f                      ; nothing to format if s_RxBufSize < s_pitch

   .define divisor,  %r5
   .define dividend, %r6
   .define scratch,  %r7
   .define scratch1, %r8
   .define s_leftShift, %r9
   .define s_subBufSize, %r12

   ; align the divisor with the dividend, then run a shift-and-subtract
   ; division that produces one quotient bit per loop iteration
   norm scratch, s_RxBufSize
   norm scratch1, s_pitch
   sub  s_leftShift, scratch1, scratch
   asl  divisor, s_pitch, s_leftShift
   add  %lp_count, s_leftShift, 1
   mov  dividend, s_RxBufSize
   mov  s_subBufSize, 0

   lp 0f
      asl      s_subBufSize, s_subBufSize
      sub.f    0, dividend, divisor
      sub.ge.f dividend, dividend, divisor
      add.ge   s_subBufSize, s_subBufSize, 1
      asr      divisor, divisor
   0:

   .undef dividend
   .undef divisor
   .undef scratch
   .undef scratch1
   .undef s_leftShift

   .define scratch, %r5
   .define s_leftShift, %r5
   .define s_log2bufLen, %r5
   .define s_blockSize32, %r11
   .define s_blockSize, %r11
   ; s_blockSize32 = 1 << (30 - norm(s_subBufSize - 1)) is saved for DECXcorr;
   ; s_blockSize (twice that value) is the spacing used by the copy loop below
   sub  scratch, s_subBufSize, 1
   norm s_leftShift, scratch
   sub  s_log2bufLen, 30, s_leftShift
   asl  s_blockSize32, 1, s_log2bufLen
   stw  s_blockSize32, [s_DECRxBlockSize]
   asl  s_blockSize, s_blockSize32

   .undef scratch
   .undef s_leftShift
   .undef s_log2bufLen
   .undef s_blockSize32

   ; place formatted data in memory above current buffer containing the Rx data
   .define psa_RxBufDec, %r10
   .define pla_RxBufDec, %r5
   add psa_RxBufDec, psa_RxBuf, s_RxBufSize
   asr pla_RxBufDec, psa_RxBufDec
   st  pla_RxBufDec, [pla_DECTrainRxBufXYAddr]

   .undef pla_RxBufDec

   ;;;;;;;;;;;;;;;;;;;;
   ; set up Rx memory ;
   ;;;;;;;;;;;;;;;;;;;;
   .define l_x0_set, %r5
   sr  psa_RxBuf, [%ax0]
   or  l_x0_set, 0x2000_0000, s_pitch
   sr  l_x0_set, [%mx0]             ; offset = s_pitch, postupdate, linear, 16-bit mode, AM_MODULO
   .undef l_x0_set

   sr  psa_RxBufDec, [%ax1]
   sr  0x2000_0001, [%mx1]             ; offset = 1, postupdate, linear, 16-bit mode, AM_MODULO

   ; set up the copy loop: a one-instruction zero-overhead loop whose body is
   ; the instruction at label 3
   .define scratch, %r5
   .define scratch1, %r6
   mov %lp_count, s_subBufSize
   mov   scratch, 3f >> LOG2_BYTES_PER_LONGWORD
   add   scratch1, scratch, 1
   sr    scratch, [%LP_START]
   sr    scratch1, [%LP_END]
   nop
   .undef scratch
   .undef scratch1

   .define s_TxRxOffset, %r5
   .define psa_RxSrcBuf, %r6
   mov s_TxRxOffset, 0

   2:                                  ; for (i=0; i < s_pitch; i++)
      3:                               ; for (j=0; j < s_subBufSize; j++) {
         mov   %x1_u, %x0_u

      ; re-arm the inner loop and step both pointers to the next offset:
      ; destination advances by one block, source restarts at psa_RxBuf + i
      mov   %lp_count, s_subBufSize
      add   psa_RxBufDec, psa_RxBufDec, s_blockSize
      sr    psa_RxBufDec, [%ax1]
      add   s_TxRxOffset, s_TxRxOffset, 1
      add   psa_RxSrcBuf, psa_RxBuf, s_TxRxOffset
      sr    psa_RxSrcBuf, [%ax0]
      sub.f 0, s_TxRxOffset, s_pitch
      blt 2b

   .undef s_TxRxOffset
   .undef psa_RxSrcBuf
   .undef psa_RxBufDec
   .undef s_blockSize

   ;;;;;;;;;;;;;;;;;;;;
   ; set up Tx memory ;
   ;;;;;;;;;;;;;;;;;;;;
   ; AlignOddAddressWordsY(psa_xyBuf, s_bufLen) -- arguments per the call below
   ; mov %r0, psa_TxBuf ; don't need this line because psa_TxBuf is already in %r0
   ; mov %r1, s_TxBufSize  ; don't need this line because s_TxBufSize is already in %r1

   ; s_subBufSize (%r12) is needed after the call, but this file only treats
   ; %r13 as preserved across calls, so keep the value in the frame instead of
   ; relying on the callee leaving %r12 untouched
   st  s_subBufSize, [%sp, SUBBUFSIZE_SLOT]
   bl  AlignOddAddressWordsY
   ld  s_subBufSize, [%sp, SUBBUFSIZE_SLOT]

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; set address modifier registers for the DECXcorr function ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   .define l_x0_set, %r5
   .define l_y0_set, %r5
   ; Tx modifier
   ; asr l_y0_set, s_TxBufSize_sav
   ; asl l_y0_set, l_y0_set, 16
   asl l_y0_set, s_TxBufSize_sav, 16-1 ; bits 16:28 = modulo length
   or  l_y0_set, l_y0_set, 0x1         ; bits 0:13 = offset = 1
                              ; all other bits = postupdate, linear, 32-bit mode, AM_MODULO
   sr  l_y0_set, [%my0]

   ; Rx modifier
   ;asr l_x0_set, s_subBufSize
   ;asl l_x0_set, l_x0_set, 16
   asl l_x0_set, s_subBufSize, 16   ; bits 16:28 = modulo length
   or  l_x0_set, l_x0_set, 0x1      ; bits 0:13 = offset = 1
                           ; all other bits = postupdate, linear, 32-bit mode, AM_MODULO
   sr  l_x0_set, [%mx0]       ; offset = 1, postupdate, linear, 32 bit mode, AM_MODULO

   .undef l_x0_set
   .undef l_y0_set

   ;;;;;;;;;;
   ; epilog ;
   ;;;;;;;;;;
   9:
   ld   s_TxBufSize_sav, [%sp, BTSIZE_IN_BYTES]
   ld   %blink, [%fp, 1*BYTES_PER_LONGWORD]
   j.d  [%blink]
   ld.a %fp, [%sp, FRAMESIZE_IN_BYTES]
   .undef SUBBUFSIZE_SLOT
   .undef FRAMESIZE_IN_BYTES

;/*******************************************************************************
;*
;*   Prototype: void DECXcorr(int16 s_TxOffset, int16 s_RxOffset,
;*       int32 *pl_Xcorr, int16 s_cnt, int16 s_pitch, int16 s_RS)
;*
;*   Description:
;*       This function is called by DECTrain2b to help compute the inner loop
;*       of the circular cross correlation between the received sequence
;*       (DEC_TRAINING_PERIOD*gs_RxSamplesPerFrame) and a "decorrelating
;*       sequence" (length = DEC_TRAINING_PERIOD*TxFftLength).
;*
;*       The multiply-accumulate result is passed through round64 together
;*       with s_RS and then added to *pl_Xcorr. If s_cnt <= 0 the function
;*       returns without touching *pl_Xcorr.
;*
;*   Arguments:
;*       s_TxOffset: (I) offset from the beginning of the decorrelating sequence
;*       s_RxOffset: (I) offset from the beginning of the received sequence
;*       pl_Xcorr:   (I/O) pointer to the cross-correlation
;*       s_cnt:      (I) number of times to execute the loop
;*       s_pitch:    (I) ratio of sampling rate of received sequence to
;*           the sampling rate of the decorrelating sequence
;*           (not read by this routine)
;*       s_RS:       (I) amount to scale down accumulator
;*
;*   Return Value:
;*       none
;*
;*   Global Variables:
;*       pla_DECTrainTxBufXY, pla_DECTrainTxBufOddXY, s_DECTxSignalLen,
;*       s_DECRxBlockSize, pla_DECTrainRxBufXYAddr (all read)
;*
;*******************************************************************************/
; void DECXcorr(int16 s_TxOffset, int16 s_RxOffset, int32 *pl_Xcorr, int16 s_cnt, int16 s_pitch, int16 s_RS)
.global DECXcorr
DECXcorr:
   ; Input Arguments
   ; %r0 = s_TxOffset
   ; %r1 = s_RxOffset
   ; %r2 = pl_Xcorr
   ; %r3 = s_cnt
   ; %r4 = s_pitch
   ; %r5 = s_RS
   .define s_TxOffset,     %r0
   .define s_RxOffset,     %r1
   .define pl_Xcorr,       %r2
   .define s_cnt,          %r3
   .define s_pitch,        %r4
   .define s_RS,           %r5

   .define pl_Xcorr_sav,   %r13

   ;;;;;;;;;;
   ; prolog ;
   ;;;;;;;;;;
   .define FRAMESIZE_IN_BYTES, (BTSIZE_IN_BYTES + 1*BYTES_PER_LONGWORD)
   st  %blink, [%sp, BYTES_PER_LONGWORD]    ; save return address register
   st  %fp, [%sp]             ; save caller's frame pointer
   mov %fp, %sp               ; set new frame pointer
   sub %sp, %sp, FRAMESIZE_IN_BYTES ; allocate frame
   st  pl_Xcorr_sav, [%sp, BTSIZE_IN_BYTES]  ; save non-volatile register
   mov pl_Xcorr_sav, pl_Xcorr ; keep pl_Xcorr available after the round64 call

   ;;;;;;;;;
   ; start ;
   ;;;;;;;;;
   sub.f 0, s_cnt, 0
   ble   9f             ; if (s_cnt <= 0) return

   .define pla_TxBuf, %r6
   .define s_TxMaxOffsetPlus1, %r7
   .define s_TxOffset32, %r0
   .define pla_TxAddr, %r0

   ; set up for MAC (multiply and accumulate)
   ; even s_TxOffset -> regular Tx block, odd s_TxOffset -> odd-address block
   and.f 0, 1, s_TxOffset
   ld    pla_TxBuf, [pla_DECTrainTxBufXY]
   bz    0f
   ld    pla_TxBuf, [pla_DECTrainTxBufOddXY]

   0:
   ; wrap the offset once if it is at or beyond the Tx signal length, then
   ; halve it to index the Tx block
   ldw    s_TxMaxOffsetPlus1, [s_DECTxSignalLen]
   sub.f  0, s_TxOffset, s_TxMaxOffsetPlus1
   sub.ge s_TxOffset, s_TxOffset, s_TxMaxOffsetPlus1
   asr    s_TxOffset32, s_TxOffset
   add    pla_TxAddr, pla_TxBuf, s_TxOffset32
   sr     pla_TxAddr, [%ay0]

   .undef pla_TxBuf
   .undef s_TxMaxOffsetPlus1
   .undef s_TxOffset32
   .undef pla_TxAddr
   .undef s_TxOffset

   .define s_RxBlockSize, %r0
   .define s_RxBlockOffset, %r0
   .define pla_RxBaseAddr, %r1
   .define pla_RxAddr, %r0
   ; Rx start address = Rx base + s_RxOffset * s_DECRxBlockSize; the product
   ; is formed with the MAC unit and read back from AUX_XMAC2
   ldw s_RxBlockSize, [s_DECRxBlockSize]
   mul 0, s_RxOffset, s_RxBlockSize
   lr  s_RxBlockOffset, [%AUX_XMAC2]
   ld  pla_RxBaseAddr, [pla_DECTrainRxBufXYAddr]
   add pla_RxAddr, s_RxBlockOffset, pla_RxBaseAddr
   sr  pla_RxAddr, [%ax0]
   .undef s_RxBlockSize
   .undef s_RxBlockOffset
   .undef pla_RxBaseAddr
   .undef pla_RxAddr
   .undef s_RxOffset

   ; clear the accumulator before either MAC path is entered: the mul used
   ; above for the block offset left its product there, and the s_cnt == 1
   ; case branches directly to the odd-sample code at label 3
   mul 0, 0, 0

   asr.f %lp_count, s_cnt        ; because of the dual MAC, we are doing two multiplies and adds at once
                           ; so we halve the number of loops

   bz    3f                ; if (s_cnt == 1) jump to code that handles odd number of multiplies

   .define scratch, %r0
   .define scratch1, %r1
   ; set up the single instruction loop
   mov   scratch, 2f >> LOG2_BYTES_PER_LONGWORD
   add   scratch1, scratch, 1
   sr    scratch, [%LP_START]
   sr    scratch1, [%LP_END]

   nop                        ; 2 instructions are required between a write to
   nop                        ; LP_END (LP_START) and its usage
   ; perform MAC
   2:                ; for (n=0; n<s_cnt; n++)
      mac 0, %x0_u, %y0_u        ; l_Acc += ((int32) *psa_RxInBuf) * (*psa_TxDataBuf)

   and.f 0, s_cnt, 0x1
   bz    4f

   3: ; if s_cnt is odd, handle the odd multiply
   and scratch, %x0_u, 0xFFFF    ; grab only the next word of Rx data
   and scratch1, %y0_u, 0xFFFF      ; grab only the next word of Tx data
   mac 0, scratch, scratch1
   .undef scratch
   .undef scratch1
   .undef s_cnt

   4:
   .define l_Acc0_lsp,  %r0
   .define l_Acc1_lsp, %r1
   .define l_Acc_ext,  %r3
   .define s_Acc1_msp, %r6
   .define s_Acc0_msp, %r3
   lr l_Acc0_lsp, [%AUX_XMAC1]
   lr l_Acc1_lsp, [%AUX_XMAC2]

   ; extract the extension bits: low half of AUX_XMAC0 (sign-extended) pairs
   ; with AUX_XMAC2, high half pairs with AUX_XMAC1
   lr     l_Acc_ext, [%AUX_XMAC0]
   and    s_Acc1_msp, l_Acc_ext, 0xFFFF
   sexw   s_Acc1_msp, s_Acc1_msp
   asr    s_Acc0_msp, l_Acc_ext, 16
   .undef l_Acc_ext

   ; compute the total
   add.f %r1, l_Acc0_lsp, l_Acc1_lsp      ; the 32 LSBs of the sum
   adc   %r0, s_Acc0_msp, s_Acc1_msp      ; the 8 MSBs of the sum
   .undef l_Acc0_lsp
   .undef l_Acc1_lsp
   .undef s_Acc0_msp
   .undef s_Acc1_msp

   ;int32 round64(int32 l_AccH, int32 l_AccL, int32 s_bit_position);
   ; %r0 = high part, %r1 = low part (set just above), %r2 = s_RS
   mov   %r2, s_RS
   bl    round64

   .define l_Xcorr, %r1
   ; add new MAC value to previously stored MAC value
   ld  l_Xcorr, [pl_Xcorr_sav]
   add l_Xcorr, l_Xcorr, %r0

   st  l_Xcorr, [pl_Xcorr_sav]
   .undef l_Xcorr

   ;;;;;;;;;;
   ; epilog ;
   ;;;;;;;;;;
   9:
   ld    pl_Xcorr_sav, [%sp, BTSIZE_IN_BYTES]   ; restore saved non-volatile register
   ld    %blink, [%fp, BYTES_PER_LONGWORD]  ; restore the return-address register
   j.d   [%blink]                ; return to the caller through BLINK
   ld.a  %fp, [%sp, FRAMESIZE_IN_BYTES]   ; restore the caller's frame-pointer register
                              ; and stack-pointer register
   .undef FRAMESIZE_IN_BYTES
