;; **COPYRIGHT******************************************************************
;;    INTEL CONFIDENTIAL
;;    Copyright (C) 2017 Intel Corporation
;;    Copyright (C), 1994-2002 Aware Inc. All Rights Reserved.
;; ******************************************************************COPYRIGHT**
;; **DISCLAIMER*****************************************************************
;;   The source code contained or described herein and all documents related
;;   to the source code ("Material") are owned by Intel Corporation or its
;;   suppliers or licensors. Title to the Material remains with Intel
;;   Corporation or its suppliers and licensors. The Material may contain
;;   trade secrets and proprietary and confidential information of Intel
;;   Corporation and its suppliers and licensors, and is protected by
;;   worldwide copyright and trade secret laws and treaty provisions. No part
;;   of the Material may be used, copied, reproduced, modified, published,
;;   uploaded, posted, transmitted, distributed, or disclosed in any way
;;   without Intels prior express written permission.
;;
;;   No license under any patent, copyright, trade secret or other
;;   intellectual property right is granted to or conferred upon you by
;;   disclosure or delivery of the Materials, either expressly, by
;;   implication, inducement, estoppel or otherwise. Any license under
;;   such intellectual property rights must be express and approved by
;;   Intel in writing.
;; *****************************************************************DISCLAIMER**
;*************************************************************************
; Aware DMT Technology. Proprietary and Confidential.
;
; ADDRESS:         40 Middlesex Turnpike, Bedford, MA 01730-1413 USA
; TELEPHONE:       781.276.4000
; FAX:             781.276.4001
; WEB:             http://www.aware.com
;
; FILENAME:        noiseacc.s
;
; DESCRIPTION:     This file contains the noiseacc function used in SNR calculation
;
;*************************************************************************
.data
; C equivalent: int16 *gpla_NoiseAccRecvSignalBuf = (int16 *) 0;
;
; Holds the destination address used for the received tones.  NoiseAcc loads
; this value, hands it to the burst unit (%burstxym) as the XY-side address,
; and derives the %ax0 read pointer from it.  It is zero-initialised here, so
; some other code must store a valid address before NoiseAcc is called.
; NOTE(review): .block 4 followed by .word 0 looks like it emits storage twice
; while .size declares 4 bytes -- confirm against the assembler manual.
.global gpla_NoiseAccRecvSignalBuf
gpla_NoiseAccRecvSignalBuf:
   .type gpla_NoiseAccRecvSignalBuf, @object
   .size gpla_NoiseAccRecvSignalBuf, 4
   .block  4
   .align  4
   .word   0

; C equivalent: int16 *gpla_NoiseAccRefSignalBuf = (int16 *) 0;
;
; Holds the destination address used for the reference tones.  NoiseAcc loads
; this value and uses it either as the burst unit's XY-side address
; (%burstxym) or as the initial %ay0 write pointer, then derives the %ay0
; read pointer from it.  It is zero-initialised here, so some other code must
; store a valid address before NoiseAcc is called.
; NOTE(review): .block 4 followed by .word 0 looks like it emits storage twice
; while .size declares 4 bytes -- confirm against the assembler manual.
.global gpla_NoiseAccRefSignalBuf
gpla_NoiseAccRefSignalBuf:
   .type gpla_NoiseAccRefSignalBuf, @object
   .size gpla_NoiseAccRefSignalBuf, 4
   .block  4
   .align  4
   .word   0

.text
;/********************************************************************************************
;  Subroutine Name: NoiseAcc
;
;  Description:
;     This routine performs the following operations:
;
;       For each tone i,
;     1) Compute the noise power
;        pow[i] = (0.5*(ref_tone[i*2]-recv_tone[i*2]))^2
;                    + (0.5*(ref_tone[i*2+1]-recv_tone[i*2+1]))^2
;
;     2) Add pow[i] to the accumulated noise power array.
;
;       The scaling by 0.5 before the squaring is needed to avoid overflowing
;       the 32-bit variable that holds the square. Thus the final
;       accumulated power is the real power divided by 4.
;
;       The code uses a 48 bits accumulator implemented as 2 overlapping
;       32-bits counters. Therefore, it takes 2 32-bits words to store the
;       accumulated noise value for each tone.
;
;     This routine is called both during REVERB and MEDLEY SNR calculations.
;     The offset parameter is used to control the PN sequence shift during
;     MEDLEY SNR calculations.
;
;
;  Prototype:
;     void NoiseAcc(int32 *pla_NoisePower, int16 s_doffset, int16 *psa_recv_tones, int16 s_offset,
;              int16 s_modulo, int16 s_fft_length, int16 *psa_ref_tones,
;              int16 s_first_chan, int16 s_last_chan);
;
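;  Input Arguments:
;     psa_recv_tones -- pointer to the received DMT tones
;     psa_ref_tones  -- pointer to the reference DMT tones
;     s_offset    -- offset used to generate Medley signal
;                 (GenerateMedley is only called when s_offset > 0)
;     s_modulo    -- passed unchanged to GenerateMedley when s_offset > 0
;     s_fft_length   -- passed unchanged to GenerateMedley when s_offset > 0
;     s_first_chan   -- first channel
;     s_last_chan    -- last channel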
;
;  Output Arguments:
;     pla_NoisePower -- pointer to accumulators for storing noise power.
;     s_doffset      -- offset for pla_Noisepower to point to the first noise value.
;                 Noise values will get written to pla_NoisePower[s_doffset] through
;                 pla_NoisePower[s_doffset+s_last_chan-s_first_chan], rather than
;                 pla_NoisePower[s_first_chan] through pla_NoisePower[s_last_chan]
;
;  Return Value:
;     none
;
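;  Global Variables:
;     gpla_NoiseAccRecvSignalBuf -- (read) address the received tones are copied to
;     gpla_NoiseAccRefSignalBuf  -- (read) address the reference tones are copied to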
;
;****************************************************************************/
; void NoiseAcc(int32 *pla_NoisePower, int16 s_doffset, int16 *psa_recv_tones, int16 s_offset,
;           int16 s_modulo, int16 s_fft_length, int16 *psa_ref_tones,int16 s_first_chan, int16 s_last_chan)

.global NoiseAcc
NoiseAcc:
   ; Accumulates per-tone noise power (formula in the subroutine header above).
   ; Outline of the code below:
   ;   1. prolog: fetch s_last_chan from the stack, build the frame, save %r13-%r15
   ;   2. point %ax1 at pla_NoisePower + s_doffset (the output accumulators)
   ;   3. burst-copy the received tones to the address held in
   ;      gpla_NoiseAccRecvSignalBuf and point %ax0 at tone s_first_chan
   ;   4. reference tones: if s_offset > 0, call GenerateMedley into a stack
   ;      buffer and copy that buffer out through %y0_u; otherwise burst-copy
   ;      psa_ref_tones directly.  Either way %ay0 ends up at tone s_first_chan
   ;   5. loop s_numChannels times: square the halved real and imaginary
   ;      differences, sum them, and add the result into two words via %x1_u
   ;   6. epilog: restore %r13-%r15, %blink, %fp and %sp, then return
   ; NOTE(review): there is no guard for s_last_chan < s_first_chan; in that case
   ; s_numChannels is <= 0 when it is moved into %lp_count -- confirm callers
   ; never do this.
   .equ MAX_NUM_TONES,  256
   .equ BYTES_PER_WORD, 2
   .equ LOG2_BYTES_PER_WORD, 1
   .equ BYTES_PER_LONGWORD, 4
   .equ LOG2_BYTES_PER_LONGWORD, 2

   ; Input Arguments
   ; %r0 = pla_NoisePower
   ; %r1 = s_doffset
   ; %r2 = psa_recv_tones
   ; %r3 = s_offset
   ; %r4 = s_modulo
   ; %r5 = s_fft_length
   ; %r6 = psa_ref_tones
   ; %r7 = s_first_chan
   ; s_last_chan is on the stack
   .define pla_NoisePower, %r0
   .define s_doffset, %r1
   .define psa_recv_tones, %r2
   .define s_offset, %r3
   .define s_modulo, %r4
   .define s_fft_length, %r5
   .define psa_ref_tones, %r6
   .define s_first_chan, %r7
   .define s_last_chan, %r8

   ; Working registers.  %r13-%r15 are saved in the prolog and restored in the
   ; epilog; their values are still needed after the GenerateMedley call.
   .define s_numChannels, %r13
   .define s_2first_chan, %r14
   .define scratch, %XMAC0
   .define scratch1, %XMAC1
   .define sa_snr_ref_tones, %r15
   .define pla_out, %r9
   .define psa_rec, %r10
   .define psa_ref, %r11

   ;;;;;;;;;;
   ; prolog ;
   ;;;;;;;;;;
   ld  s_last_chan, [%sp, 16]    ; s_last_chan is the 9th argument, the only one passed on the stack

   ; save stack back-trace data structure
   st %blink, [%sp, 4]
   st  %fp, [%sp]
   mov %fp, %sp

   ; allocate memory for the current frame (used entirely for the generate MEDLEY signal)
   ; we use the stack space only because there is no current method of writing the output of
   ; the function GenerateMedley to XY memory
   ;
   ; Frame layout after the sub (offsets from the new %sp):
   ;    0..15   not touched by this routine (purpose not shown here)
   ;   16..27   saved %r13, %r14, %r15
   ;   28..     MEDLEY buffer: 2*MAX_NUM_TONES*2 bytes, ending at the old %sp (= %fp)
   sub %sp, %sp, 2*MAX_NUM_TONES*2+3*4+16    ; the MEDLEY signal contains 2*MAX_NUM_TONES entries but only
                              ; 16-bit data.  We only need MAX_NUM_TONES 32-bit addresses
                              ; NOTE(review): the size expression reserves 2*MAX_NUM_TONES*2 bytes,
                              ; which matches the 2*MAX_NUM_TONES 16-bit values read back by the copy loop
   st  %r13, [%sp, 16]
   st  %r14, [%sp, 20]
   st  %r15, [%sp, 24]
   add sa_snr_ref_tones, %sp, 28 ; start of the MEDLEY buffer

   ; since we are calling another function
   ; finish using as many input arguments as we can
   sub s_numChannels, s_last_chan, s_first_chan
   add s_numChannels, s_numChannels, 1       ; s_numChannels = s_last_chan - s_first_chan + 1
   asl s_2first_chan, s_first_chan           ; 2*s_first_chan: two 16-bit values (re, im) per tone

   .undef s_first_chan ; %r7
   .undef s_last_chan   ; %r8
   ; free registers: %r9, %r10, %r11

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; set up the output address pointer ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; NOTE(review): s_doffset is added unscaled and the sum is used directly as the
   ; %ax1 address, so pla_NoisePower is presumably already in XY address units --
   ; confirm against the caller.
   add pla_out, pla_NoisePower, s_doffset    ; pla_NoisePower += s_doffset
   sr  pla_out, [%ax1]
   sr  0x1, [%mx1]                        ; offset = 1, postupdate, linear, 32 bit mode, AM_MODULO

   .undef pla_NoisePower   ; %r0
   .undef s_doffset     ; %r1
   ; free registers: %r0, %r1, %r10, %r11

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; load received data into X memory                 ;
   ; and set up address pointers to the received data ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   sr  psa_recv_tones, [%burstsys]  ; system-memory side of the burst
   ld  scratch, [gpla_NoiseAccRecvSignalBuf]
   sr  scratch, [%burstxym]         ; write to the XY location gpla_NoiseAccRecvSignalBuf
                              ; address should be for 32-bit data

   ; psa_rec = psa_recv_tones + 2*s_first_chan
   ; convert gpla_NoiseAccRecvSignalBuf into address for 16-bit data
   asl scratch, scratch
   add psa_rec, scratch, s_2first_chan
   sr  psa_rec, [%ax0]
   sr  0x2000_0001, [%mx0]       ; offset = 1, postupdate, linear, 16 bit mode, AM_MODULO

   ; just use maximum possible data transfer for now (we can shorten this later)
   ; bit 30 = 1 (write to XY data), bit 29 = 0 (X memory)
   ; max signal size is MAX_NUM_TONES complex values
   ; = 2*MAX_NUM_TONES words = 4*MAX_NUM_TONES bytes
   ; The size field is programmed as (byte count - 1).  The status poll follows
   ; immediately, so this write presumably starts the transfer -- TODO confirm.
   sr  (0x4000_0000 | (((2*MAX_NUM_TONES) * BYTES_PER_WORD)-1)), [%burstsz]

   0: ; wait for end of burst
      lr    scratch, [%xyconfig]
      and.f 0, scratch, 0x10     ; get status
      bne 0b

   .undef psa_recv_tones
   ; free registers: %r0, %r1, %r2, %r11

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; load reference data into Y memory ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   sub.f 0, s_offset, 0
   ble   1f                ; s_offset <= 0: skip MEDLEY generation, copy the reference as is

   ; if (s_offset > 0)
   ; generate the MEDLEY signal using the reference data
   ; GenerateMedley(psa_ref_tones, sa_snr_ref_tones, s_offset, s_modulo, s_fft_length);
   ; The moves are ordered so that each source register is read before it is
   ; overwritten (%r3 -> %r2, %r4 -> %r3, %r5 -> %r4).
   mov %r0, psa_ref_tones
   mov %r1, sa_snr_ref_tones
   mov %r2, s_offset
   mov %r3, s_modulo
   mov %r4, s_fft_length

   .undef s_offset         ; %r3
   .undef s_modulo         ; %r4
   .undef s_fft_length     ; %r5
   ; free registers: %r0, %r1, %r2, %r3, %r4, %r5, %r11

   ; pla_out (%r9) and psa_rec (%r10) were already written to %ax1 / %ax0 above
   ; and are not read again in the assembled code, so they need not survive the call.
   ; NOTE(review): this relies on GenerateMedley leaving %ax0/%mx0/%ax1/%mx1
   ; untouched -- confirm.
   sub %sp, %sp, 16        ; reserve 16 bytes below the frame for the duration of the call
   bl  GenerateMedley
   add %sp, %sp, 16

   mov %lp_count, MAX_NUM_TONES  ; one 32-bit store (two 16-bit values) per iteration
   ld  scratch, [gpla_NoiseAccRefSignalBuf]
   sr  scratch, [%ay0]     ; write pointer for the copy: start of the reference buffer
   asl scratch, scratch    ; XY address specified in 16-bit address locations
   add psa_ref, scratch, s_2first_chan ; read pointer used later by the power loop

   sr  0x1, [%my0]         ; offset = 1, postupdate, linear, 32 bit mode, AM_MODULO
   sub sa_snr_ref_tones, sa_snr_ref_tones, BYTES_PER_WORD   ; pre-bias: ldw.a below adds
                                                            ; BYTES_PER_WORD before each load

   ; copy MEDLEY signal into Y memory (can't use burst since it is part of the stack)
   lp 0f
      ldw.a scratch1, [sa_snr_ref_tones, BYTES_PER_WORD]    ; first 16-bit value -> low half
      ldw.a scratch, [sa_snr_ref_tones, BYTES_PER_WORD]     ; second 16-bit value -> high half
      asl scratch, scratch, 16
      or  scratch, scratch, scratch1
      mov %y0_u, scratch                                    ; store packed pair, pointer post-updates
   0:

   sr  psa_ref, [%ay0]     ; re-point %ay0 at tone s_first_chan for the power loop
   b   2f


   ; else copy the reference signal as is into Y memory
   1:
   sr  0x2000_0001, [%my0]       ; offset = 1, postupdate, linear, 16 bit mode, AM_MODULO
   sr  psa_ref_tones, [%burstsys]
   ld  scratch, [gpla_NoiseAccRefSignalBuf]
   sr  scratch, [%burstxym]
   asl scratch, scratch    ; XY address specified in 16-bit address locations
   add scratch, scratch, s_2first_chan
   sr  scratch, [%ay0]        ; specified in 16-bit address locations
                        ; psa_ref += 2*s_first_chan

   ; bit 30 = 1 (write to XY data), bit 29 = 1 (Y memory)
   ; max ref signal size is MAX_NUM_TONES complex values
   ; = 2*MAX_NUM_TONES words = 4*MAX_NUM_TONES bytes
   sr  (0x6000_0000 | (((2*MAX_NUM_TONES) * BYTES_PER_WORD)-1)), [%burstsz]

   0: ; wait for end of burst
      lr    scratch, [%xyconfig]
      and.f 0, scratch, 0x10     ; get status
      bne 0b

   .undef psa_ref_tones ; %r6
   .undef sa_snr_ref_tones ; %r15
   ; free registers: %r0, %r1, %r2, %r3, %r4, %r5, %r6, %r11, %r12, %r15

   2:

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; compute noise power and add it to the total power ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; Per iteration: %x0_u / %y0_u each supply two 16-bit values (real, then
   ; imaginary) of the received / reference tone, and %x1_u is read and written
   ; twice (two accumulator words per tone).
   mov %lp_count, s_numChannels
   sr  0x2000_0001, [%my0]       ; offset = 1, postupdate, linear, 16 bit mode, AM_MODULO
   nop                           ; the nops presumably cover the XY-register write latency
   nop                           ; before the first %y0_u read -- TODO confirm

   lp 1f
   .if 0
      sub scratch, %y0_u, %x0_u  ; l_Acc = (int32) (*psa_ref++) - (int32) (*psa_rec++)
      asr scratch, scratch    ; l_Acc >>= 1
   .else
      subs scratch, %y0_u, %x0_u ; real part: l_Acc = *psa_ref++ - *psa_rec++ (subs variant of the sub above)
      asr scratch, scratch    ; l_Acc >>= 1
   .endif

      mul  0, scratch, scratch   ; l_Acc = l_Acc*l_Acc

   .if 0
      sub scratch, %y0_u, %x0_u  ; l_Acc1 = (int32) (*psa_ref++) - (int32) (*psa_rec++)
      asr scratch, scratch    ; l_Acc1 >>= 1
   .else
      subs scratch, %y0_u, %x0_u ; imaginary part: l_Acc1 = *psa_ref++ - *psa_rec++
      asr scratch, scratch    ; l_Acc1 >>= 1
   .endif

      mac  0, scratch, scratch   ; l_Acc += l_Acc1*l_Acc1, 33-bit number

      lr   %r0, [%AUX_XMAC1]     ; read back the mul/mac result
      lr   %r1, [%AUX_XMAC0]     ; read out the extension part (the extension part for the unused part of
                           ; the accumulator should be zero)
      and  %r1, %r1, 0xFFFF_0000    ; we only want the extension part for the channel that we are working on
      add.f %x1_u, %r0, %x1_u       ; first accumulator word += result, carry out in the flags
      adc  %x1_u, %r1, %x1_u        ; second accumulator word += masked extension + carry
                                    ; NOTE(review): the extension is masked to its upper 16 bits but not
                                    ; shifted before being added -- confirm this matches the intended
                                    ; "overlapping" 48-bit accumulator layout

      ; do right shift of 2 at the end
   1:

; Everything from here to the matching .endif is disabled (not assembled).
.if 0

   lp 1f ; compute l_Acc = (int32) (*psa_ref++) - (int32) (*psa_rec++)
      subs scratch, %y0_u, %x0_u ; real part
      mul  0, scratch, scratch
      subs scratch1, %y0_u, %x0_u   ; imaginary part
      mac  0, scratch1, scratch1


   1:


   lp 1f ; compute l_Acc = (int32) (*psa_ref++) - (int32) (*psa_rec++)
      subs scratch, %y0_u, %x0_u ; real part
      subs scratch1, %y0_u, %x0_u   ; imaginary part
   1:

   ; set up the single instruction loop
   mov scratch, 2f >> 2
   add scratch1, scratch, 1
   sr  scratch, [%LP_START]
   sr  scratch1, [%LP_END]
   mov %lp_count, s_numChannels
   sr  psa_rec, [%ax0]
   sr  0x1, [%mx0]         ; offset = 1, postupdate, linear, 32 bit mode, AM_MODULO
   mul 0, 0, 0
   nop                  ; a write to an XY register takes 2 cycles to take effect

   lp 2f
   2: ; l_Acc = l_Acc * l_Acc
      mac   0, %x0_u, %x0_u
      mul 0, %x0_u, %x0_u

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; store the final result into the 48-bit accumulator ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; read out accumulators
   lr %r0, [%AUX_XMAC1]
   lr %r1, [%AUX_XMAC2]

   ; extract the extension bits
   lr   %r3, [%AUX_XMAC0]
   and  %r4, %r3, 0xFFFF
   sexw %r4, %r4
   asr %r3, %r3, 16

   ; compute the total
   add.f %r0, %r0, %r1           ; the 32 LSBs of the sum
   adc   %r1, %r3, %r4           ; the 8 MSBs of the sum

   st.a  %r0, [pla_out, 1*BYTES_PER_LONGWORD]   ; store LO part
   st.a  %r1, [pla_out, 1*BYTES_PER_LONGWORD]   ; store HI part
   2:

   ; alternate implementation to avoid inplace storage of the noise difference
   ; compute the noise power
;  lp 1f
;     subs %r0, %y0_u, %x0_u
;     subs %r1, %y0_u, %x0_u
;     asr  %r0, %r0, 16
;     or   %r0, %r0, %r1
;     mac  0, %r0, %r0
;  1:
.endif
   ;;;;;;;;;;
   ; epilog ;
   ;;;;;;;;;;
   ; Mirror of the prolog: reload %r13-%r15 from the frame, reload the return
   ; address saved at [%fp, 4], then pop the frame and restore the caller's %fp.
   9:
   ld   %r13, [%sp, 16]
   ld   %r14, [%sp, 20]
   ld   %r15, [%sp, 24]
   ld   %blink, [%fp, 4]
   ld.a %fp, [%sp, 2*MAX_NUM_TONES*2+3*4+16]   ; same constant as the prolog sub; presumably the .a
                                               ; write-back leaves %sp at its entry value while the
                                               ; saved %fp is loaded from there -- TODO confirm
   j    [%blink]



