;; **COPYRIGHT******************************************************************
;;    INTEL CONFIDENTIAL
;;    Copyright (C) 2017 Intel Corporation
;;    Copyright (C), 1994-2002 Aware Inc. All Rights Reserved.
;; ******************************************************************COPYRIGHT**
;; **DISCLAIMER*****************************************************************
;;   The source code contained or described herein and all documents related
;;   to the source code ("Material") are owned by Intel Corporation or its
;;   suppliers or licensors. Title to the Material remains with Intel
;;   Corporation or its suppliers and licensors. The Material may contain
;;   trade secrets and proprietary and confidential information of Intel
;;   Corporation and its suppliers and licensors, and is protected by
;;   worldwide copyright and trade secret laws and treaty provisions. No part
;;   of the Material may be used, copied, reproduced, modified, published,
;;   uploaded, posted, transmitted, distributed, or disclosed in any way
;;   without Intels prior express written permission.
;;
;;   No license under any patent, copyright, trade secret or other
;;   intellectual property right is granted to or conferred upon you by
;;   disclosure or delivery of the Materials, either expressly, by
;;   implication, inducement, estoppel or otherwise. Any license under
;;   such intellectual property rights must be express and approved by
;;   Intel in writing.
;; *****************************************************************DISCLAIMER**
;*************************************************************************
; Aware DMT Technology. Proprietary and Confidential.
;
; ADDRESS:         40 Middlesex Turnpike, Bedford, MA 01730-1413 USA
; TELEPHONE:       781.276.4000
; FAX:             781.276.4001
; WEB:             http://www.aware.com
;
; FILENAME:        nacc.s
;
; DESCRIPTION:     This file contains the loop function used to accumulate
;                  noise power in NoiseAcc.c
;
;*************************************************************************
.include "asm.h"
.include "xy_regs.h"

.text
;/********************************************************************************************
;  Subroutine Name: NoiseAccLoop
;
;  Description:
;     This routine performs the following operations:
;
;       For each tone i,
;     1) Compute the noise power
;        pow[i] = (0.5*(ref_tone[i*2]-recv_tone[i*2]))^2
;                    + (0.5*(ref_tone[i*2+1]-recv_tone[i*2+1]))^2
;
;     2) Add pow[i] to the accumulated noise power array.
;
;       The scaling by 0.5 before the squaring is needed to avoid overflowing
;       the 32-bit variable that holds the square. Thus the final
;       accumulated power is the real power divided by 4.
;
;       The code uses a 48-bit accumulator implemented as two 32-bit words,
;       one containing the 16 MSBs and the other the 32 LSBs.  Therefore,
;       it takes two 32-bit words to store the accumulated noise value for each tone.
;
;  Prototype:
;      void NoiseAccLoop(int32 pla_NoisePower, int32 psa_rec, int32 psa_ref,
;        int16 s_numChannels, int16 s_modulo);
;
;  Input Arguments:
;     pla_NoisePower -- XY address of the accum buffer (32-bit addressing mode) (in X Memory)
;     psa_rec        -- XY address of the first channel of the received DMT tones
;                         (16-bit addressing mode) (in X Memory)
;       psa_ref        -- XY address of the first channel of the reference DMT tones (in Y Memory)
;       s_numChannels  -- number of channels
;       s_modulo       -- length of the MEDLEY PN sequence
;
;  Return Value:
;     none
;
;  Global Variables:
;     none
;
;****************************************************************************/
; void NoiseAccLoop(int32 pla_NoisePower, int32 psa_rec, int32 psa_ref,
;           int16 s_numChannels, int16 s_modulo);


.global NoiseAccLoop
NoiseAccLoop:
   .equ MAX_NUM_TONES,  256
   ;--------------------------------------------------------------------------
   ; NoiseAccLoop
   ;
   ; For each of s_numChannels tones, computes
   ;     (0.5*(ref_re - rec_re))^2 + (0.5*(ref_im - rec_im))^2
   ; and adds it to that tone's two-word accumulator entry, which is read and
   ; written through the X1 window (aux_x1_u).
   ;
   ; Input Arguments
   ;   %r0 = pla_NoisePower  (accumulator buffer, accessed through %ax1)
   ;   %r1 = psa_rec         (received tones,     accessed through %ax0)
   ;   %r2 = psa_ref         (reference tones,    accessed through %ay0)
   ;   %r3 = s_numChannels   (loop trip count; 0 => nothing is accumulated)
   ;   %r4 = s_modulo        (masked to 13 bits and placed in aux_my0)
   ;
   ; Clobbers
   ;   %r0, %r1 (reused inside the loop to read the MAC result), %r4, %r5, %r8,
   ;   %lp_count, flags, AUX_XMAC0/AUX_XMAC1, and the %ax0/%ax1/%ay0 pointers
   ;   together with aux_mx0/aux_mx1/aux_my0.
   ;--------------------------------------------------------------------------
   .define pla_NoisePower, %r0
   .define psa_rec,        %r1
   .define psa_ref,        %r2
   .define s_numChannels,  %r3
   .define s_modulo,       %r4
   .define l_signMask,     %r5
   .define scratch,        %r8


   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; prolog: none since leaf function and no stack frame for auto variables ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; set up the window registers ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; received tones: X window 0
   sr  psa_rec, [%ax0]
   sr  0x2000_0001, [aux_mx0]       ; offset = 1, postupdate, 16-bit mode

   ; accumulated noise power: X window 1
   sr  pla_NoisePower, [%ax1]
   sr  0x1, [aux_mx1]               ; offset = 1, postupdate, 32-bit mode

   ; reference tones: Y window 0, with the modulo field taken from s_modulo
   sr  psa_ref, [%ay0]
   and s_modulo, s_modulo, ((1<<13) - 1)  ; keep the low 13 bits of s_modulo
   asl scratch, s_modulo, 16              ; move them into bits 16 and up
   or  scratch, scratch, 0x2000_0001      ; merge with the same mode bits as aux_mx0
   sr  scratch, [aux_my0]        ; offset = 1, postupdate, 16-bit mode, modulo = s_modulo

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; compute noise power and add it to the total power ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; mov.f (not plain mov): the lp.nz below tests the Z flag, so Z must reflect
   ; s_numChannels here instead of whatever flag state the caller left behind.
   ; Neither of the next two instructions writes flags.
   mov.f %lp_count, s_numChannels
   mov l_signMask, (1 << 31)
   nop                              ; NOTE(review): presumably spacing between the
                                    ; %lp_count write and the loop setup - keep

   lp.nz 0f                         ; skip the whole loop when s_numChannels == 0
      ; first component of the tone
      sub.f  scratch, aux_y0_u, aux_x0_u     ; l_Acc = (int32) (*psa_ref++) - (int32) (*psa_rec++)
      asr    scratch, scratch, 1       ; l_Acc >>= 1 (no .f, so V from sub.f is still valid)
      xor.v  scratch, scratch, l_signMask ; if (overflow) then, preserve sign by xor`ing signMask with
                                 ; the scratch register to restore sign.
      mul_op 0, scratch, scratch       ; l_Acc = l_Acc*l_Acc

      ; second component of the tone
      sub.f  scratch, aux_y0_u, aux_x0_u     ; l_Acc1 = (int32) (*psa_ref++) - (int32) (*psa_rec++)
      asr    scratch, scratch, 1       ; l_Acc1 >>= 1
      xor.v  scratch, scratch, l_signMask ; xor signMask to preserve sign flip due to overflow
      mac_op 0, scratch, scratch       ; l_Acc += l_Acc1*l_Acc1, 33-bit number

      ; %r0/%r1 no longer hold the arguments: both were stored to %ax1/%ax0 above
      lr   %r0, [%AUX_XMAC1]
      lr   %r1, [%AUX_XMAC0]           ; read out the extension part (the extension part for the unused part of
                                 ; the accumulator should be zero)
      asr   %r1, %r1, 16               ; we only want the extension part for the channel that we are working on
      add.f aux_x1_u, %r0, aux_x1_u    ; first accumulator word += %r0, carry out in C
      adc   aux_x1_u, %r1, aux_x1_u    ; second accumulator word += %r1 + carry
   0:

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; epilog: just return to the caller ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   9:
   j    [%blink]

;****************************************************************************/
; void NoiseAccLoopXY(int32 pla_NoisePower, int32 psa_rec, int32 psa_ref,
;           int16 s_numChannels, int16 s_modulo);


.global NoiseAccLoopXY
NoiseAccLoopXY:
;  .equ MAX_NUM_TONES,  256
   ;--------------------------------------------------------------------------
   ; NoiseAccLoopXY
   ;
   ; Same computation as NoiseAccLoop, except that the received tones are read
   ; through Y window 1 (%ay1 / aux_y1_u) instead of X window 0.
   ; For each of s_numChannels tones, computes
   ;     (0.5*(ref_re - rec_re))^2 + (0.5*(ref_im - rec_im))^2
   ; and adds it to that tone's two-word accumulator entry, which is read and
   ; written through the X1 window (aux_x1_u).
   ;
   ; Input Arguments
   ;   %r0 = pla_NoisePower  (accumulator buffer, accessed through %ax1)
   ;   %r1 = psa_rec         (received tones,     accessed through %ay1)
   ;   %r2 = psa_ref         (reference tones,    accessed through %ay0)
   ;   %r3 = s_numChannels   (loop trip count; 0 => nothing is accumulated)
   ;   %r4 = s_modulo        (masked to 13 bits and placed in aux_my0)
   ;
   ; Clobbers
   ;   %r0, %r1 (reused inside the loop to read the MAC result), %r4, %r5, %r8,
   ;   %lp_count, flags, AUX_XMAC0/AUX_XMAC1, and the %ax1/%ay0/%ay1 pointers
   ;   together with aux_mx1/aux_my0/aux_my1.
   ;--------------------------------------------------------------------------
   .define pla_NoisePower, %r0
   .define psa_rec,        %r1
   .define psa_ref,        %r2
   .define s_numChannels,  %r3
   .define s_modulo,       %r4
   .define l_signMask,     %r5
   .define scratch,        %r8


   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; prolog: none since leaf function and no stack frame for auto variables ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; set up the window registers ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; received tones: Y window 1
   sr  psa_rec, [%ay1]
   sr  0x2000_0001, [aux_my1]       ; offset = 1, postupdate, 16-bit mode

   ; accumulated noise power: X window 1
   sr  pla_NoisePower, [%ax1]
   sr  0x1, [aux_mx1]               ; offset = 1, postupdate, 32-bit mode

   ; reference tones: Y window 0, with the modulo field taken from s_modulo
   sr  psa_ref, [%ay0]
   and s_modulo, s_modulo, ((1<<13) - 1)  ; keep the low 13 bits of s_modulo
   asl scratch, s_modulo, 16              ; move them into bits 16 and up
   or  scratch, scratch, 0x2000_0001      ; merge with the same mode bits as aux_my1
   sr  scratch, [aux_my0]        ; offset = 1, postupdate, 16-bit mode, modulo = s_modulo

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; compute noise power and add it to the total power ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; mov.f (not plain mov): the lp.nz below tests the Z flag, so Z must reflect
   ; s_numChannels here instead of whatever flag state the caller left behind.
   ; Neither of the next two instructions writes flags.
   mov.f %lp_count, s_numChannels
   mov l_signMask, (1 << 31)
   nop                              ; NOTE(review): presumably spacing between the
                                    ; %lp_count write and the loop setup - keep

   lp.nz 0f                         ; skip the whole loop when s_numChannels == 0
      ; first component of the tone
      sub.f  scratch, aux_y0_u, aux_y1_u     ; l_Acc = (int32) (*psa_ref++) - (int32) (*psa_rec++)
      asr    scratch, scratch, 1       ; l_Acc >>= 1 (no .f, so V from sub.f is still valid)
      xor.v  scratch, scratch, l_signMask ; if (overflow) then, preserve sign by xor`ing signMask with
                                 ; the scratch register to restore sign.
      mul_op 0, scratch, scratch       ; l_Acc = l_Acc*l_Acc

      ; second component of the tone
      sub.f  scratch, aux_y0_u, aux_y1_u     ; l_Acc1 = (int32) (*psa_ref++) - (int32) (*psa_rec++)
      asr    scratch, scratch, 1       ; l_Acc1 >>= 1
      xor.v  scratch, scratch, l_signMask ; xor signMask to preserve sign flip due to overflow
      mac_op 0, scratch, scratch       ; l_Acc += l_Acc1*l_Acc1, 33-bit number

      ; %r0/%r1 no longer hold the arguments: both were stored to %ax1/%ay1 above
      lr   %r0, [%AUX_XMAC1]
      lr   %r1, [%AUX_XMAC0]           ; read out the extension part (the extension part for the unused part of
                                 ; the accumulator should be zero)
      asr   %r1, %r1, 16               ; we only want the extension part for the channel that we are working on
      add.f aux_x1_u, %r0, aux_x1_u    ; first accumulator word += %r0, carry out in C
      adc   aux_x1_u, %r1, aux_x1_u    ; second accumulator word += %r1 + carry
   0:

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; epilog: just return to the caller ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   9:
   j    [%blink]


   ; release the symbolic register names so later code cannot pick them up
   .undef pla_NoisePower
   .undef psa_rec
   .undef psa_ref
   .undef s_numChannels
   .undef s_modulo
   .undef scratch
   .undef l_signMask


