;; **COPYRIGHT******************************************************************
;;    INTEL CONFIDENTIAL
;;    Copyright (C) 2017 Intel Corporation
;;    Copyright (C), 1994-2002 Aware Inc. All Rights Reserved.
;; ******************************************************************COPYRIGHT**
;; **DISCLAIMER*****************************************************************
;;   The source code contained or described herein and all documents related
;;   to the source code ("Material") are owned by Intel Corporation or its
;;   suppliers or licensors. Title to the Material remains with Intel
;;   Corporation or its suppliers and licensors. The Material may contain
;;   trade secrets and proprietary and confidential information of Intel
;;   Corporation and its suppliers and licensors, and is protected by
;;   worldwide copyright and trade secret laws and treaty provisions. No part
;;   of the Material may be used, copied, reproduced, modified, published,
;;   uploaded, posted, transmitted, distributed, or disclosed in any way
;;   without Intels prior express written permission.
;;
;;   No license under any patent, copyright, trade secret or other
;;   intellectual property right is granted to or conferred upon you by
;;   disclosure or delivery of the Materials, either expressly, by
;;   implication, inducement, estoppel or otherwise. Any license under
;;   such intellectual property rights must be express and approved by
;;   Intel in writing.
;; *****************************************************************DISCLAIMER**
;*************************************************************************
; Aware DMT Technology. Proprietary and Confidential.
;
; ADDRESS:         40 Middlesex Turnpike, Bedford, MA 01730-1413 USA
; TELEPHONE:       781.276.4000
; FAX:             781.276.4001
; WEB:             http://www.aware.com
;
; FILENAME:        vecpwr.s
;
; DESCRIPTION:     Computation loop for the function VectorPower
;
;*************************************************************************
.include "asm.h"
.include "xy_regs.h"

; Registers that receive the 32 least-significant bits read back from the two
; accumulators at the end of VectorPowerLoop.
; NOTE(review): the names are crossed (l_acc2_lsp -> %ACC1, l_acc1_lsp -> %ACC2).
; In this file the two values are only added together, so the crossing does not
; change the result here -- confirm that it is intentional.
.define l_acc2_lsp,  %ACC1
.define l_acc1_lsp,  %ACC2


.text
;/*^^^
; *------------------------------------------------------------------------
; *
; *  Name : VectorPowerLoop
; *
; *  Description:  Computes the total power of a 16-bit signed vector X.
; *               If the vector X is represented as X[0], X[1], X[2], etc...
; *               the return value is:
; *
; *               ((Sum {i=0 to ndata-1} X[i]*X[i]) + ROUND_CNST) >> s_GuardBits
; *
; *               where the rounding constant ROUND_CNST = 1<<(s_GuardBits-1).
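; *  Prototype:
; *      int32 VectorPowerLoop(int32 psa_databuf, int16 s_ndata, int16 s_GuardBits, int16 s_XYmemory)
; *
; *  Input Arguments:
; *      int32 psa_databuf - XY address of the complex vector, stored in interleaved
; *                     format, i.e. alternating real and imag 16-bit signed values.
; *      int16 s_ndata     - number of 16-bit values to process; the routine
; *                     returns 0 when s_ndata <= 0.
; *      int16 s_GuardBits - bit position passed to round64 for the final rounding.
; *      int16 s_XYmemory  - 0 - X memory, 1 - Y memory.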
; *
; *  Output Arguments:
; *
; *  Return:
; *      int32 l_power    - total power
; *
; *
; *  Notes:
; *
; *------------------------------------------------------------------------
; *^^^
; */


; int32 VectorPowerLoop(int32 psa_databuf, int16 s_ndata, int16 s_GuardBits, int16 s_XYmemory)
;
; Feeds the buffer at psa_databuf through the mac_op macro (each word is used
; as both operands), adds the two accumulator read-outs into a 64-bit value
; held in %r0 (high part) : %r1 (low part) and hands that value, together with
; s_GuardBits in %r2, to round64. Whatever round64 leaves in the return
; register is returned to the caller unchanged.
;
; Returns 0 without touching the XY unit when s_ndata <= 0.
;
; This is NOT a leaf routine: it calls round64, so BLINK is pushed on entry
; and popped before returning.
;
; Registers written here: %r0, %r1, %r4, %r5, %r6, %lp_count, the registers
; behind l_acc1_lsp / l_acc2_lsp, the auxiliary registers lp_start, lp_end,
; ax0/aux_mx0 (X path) or ay0/aux_my0 (Y path), plus anything touched by the
; mul_op / mac_op macros and by round64 (both defined outside this file).
.global VectorPowerLoop

VectorPowerLoop:

   ; Input Arguments
   ; %r0 = psa_databuf
   ; %r1 = s_ndata
   ; %r2 = s_GuardBits
   ; %r3 = 0 - X memory, 1 - Y memory
   .define psa_databuf,  %r0
   .define s_ndata,      %r1
   .define s_GuardBits,  %r2
   .define s_XYmemory,  %r3

   ; Working registers.
   ; scratch1 and s_acc2_msp are two names for the same register (%r5);
   ; scratch1 is not referenced anywhere in this routine.
   .define scratch,      %r4
   .define scratch1,     %r5
   .define s_acc2_msp,     %r5
   .define s_acc1_msp,   %r6

   ;;;;;;;;;;
   ; prolog ;
   ;;;;;;;;;;
   ; save the return address: round64 is called with bl further down,
   ; which overwrites BLINK
    push_s %blink


   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; early exit: nothing to accumulate when s_ndata <= 0, return 0          ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; compare s_ndata against 0; only the flags are kept
   sub.f  0, s_ndata, 0

   ; mov.le has no .f suffix, so the flags from sub.f still drive the ble;
   ; label 9 is the epilog, which restores BLINK before returning
   mov.le r0, 0
      ble 9f


   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; set up XY unit for computation ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   mul_op 0, 0, 0                ; clear the accumulators

   ; select the memory bank: s_XYmemory != 0 takes the Y-memory path
   mov.f s_XYmemory, s_XYmemory
   bnz      isYMemory

   ;;;;;;;;;;;;;;;;;
   ; X-memory path ;
   ;;;;;;;;;;;;;;;;;
   ; point the ax0 address register at the buffer (r0 = psa_databuf)
   sr  r0, [%ax0]
   sr  0x1, [aux_mx0]               ; offset = 1, postupdate, linear, 32 bit mode, AM_MODULO

   asr.f  %lp_count, s_ndata     ; halve the number of iterations since we are doing 2 multiplies at once
   bz     macx_loop_end       ; if (s_ndata == 1) skip the computation loop

   ; program the hardware loop bounds by hand
    //ARC6 related
   sr macx_loop_end, [lp_end]
   sr macx_loop_start, [lp_start]

   ; NOTE(review): the two nops presumably give the lp_start/lp_end writes
   ; time to take effect before the loop body is reached -- confirm against
   ; the pipeline rules of the target core before removing them.
   nop
   nop

   ;;;;;;;;;;;;;;;;;;;;;;;;
   ; compute vector power ;
   ;;;;;;;;;;;;;;;;;;;;;;;;
   ;lp macx_loop_end ; for(i=0; i<s_ndata, i++)
   ; when ARC7 is defined the lp instruction is emitted in addition to the
   ; explicit lp_start/lp_end writes above
#ifdef ARC7
    lp macx_loop_end
#endif
   ; single-instruction loop body, executed lp_count (= s_ndata / 2) times
macx_loop_start:
      mac_op 0, aux_x0_u, aux_x0_u           ; the X window register increments only once
macx_loop_end:

   and.f  0, s_ndata, 1       ; if s_ndata is odd

; The Z flag produced by and.f above is consumed twice: by the bz just below
; and again by the bz at isAccumDone. Neither lsl nor b carries a .f suffix,
; so the flag is still valid when isAccumDone is reached.
   ; even count: no leftover sample to fetch
   bz 5f
   ; odd count: read one more word and shift it left by 16, which moves its
   ; low half-word into the upper half and zeroes the lower half
   lsl scratch, aux_x0_u, 16

   5:

   ; skip over the Y-memory path to the shared tail
   b  isAccumDone

   ;;;;;;;;;;;;;;;;;
   ; Y-memory path ;
   ;;;;;;;;;;;;;;;;;
   ; same sequence as the X path, using the ay0 / aux_my0 / aux_y0_u registers
isYMemory:
   ; point the ay0 address register at the buffer (r0 = psa_databuf)
   sr  r0, [%ay0]
   sr  0x1, [aux_my0]               ; offset = 1, postupdate, linear, 32 bit mode, AM_MODULO

   asr.f  %lp_count, s_ndata     ; halve the number of iterations since we are doing 2 multiplies at once
   bz     macy_loop_end       ; if (s_ndata == 1) skip the computation loop


   ; program the hardware loop bounds by hand
    //ARC6 related
   sr macy_loop_end, [lp_end]
   sr macy_loop_start, [lp_start]

   ; NOTE(review): same spacing nops as in the X path -- confirm before removing
   nop
   nop

   ;;;;;;;;;;;;;;;;;;;;;;;;
   ; compute vector power ;
   ;;;;;;;;;;;;;;;;;;;;;;;;
   ;lp macy_loop_end ; for(i=0; i<s_ndata, i++)
#ifdef ARC7
    lp macy_loop_end
#endif

   ; single-instruction loop body, executed lp_count (= s_ndata / 2) times
macy_loop_start:
      mac_op 0, aux_y0_u, aux_y0_u           ; the Y window register increments only once
macy_loop_end:

   and.f  0, s_ndata, 1       ; if s_ndata is odd
   ; even count: no leftover sample to fetch
   bz 6f
   lsl scratch, aux_y0_u, 16     ; fetch the leftover sample into the upper half of scratch
   6:

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; shared tail for both XY paths   ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
isAccumDone:


   ; Z still reflects (s_ndata & 1) from the and.f on either path:
   ; even count -> skip, odd count -> accumulate the leftover sample that
   ; was placed in the upper half of scratch
    bz 7f
    mac_op 0, scratch, scratch
   7:

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; read out the accumulators ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   lr  l_acc1_lsp, [%AUX_XMAC1]     ; the 32 LSBs of the power from accumulator 1
   lr l_acc2_lsp, [%AUX_XMAC2]      ; the 32 LSBs of the power from accumulator 2

   ; AUX_XMAC0 is split into two 16-bit fields below, each sign-extended to
   ; 32 bits.
   ; NOTE(review): earlier comments called these fields "the 8 MSBs"; the
   ; code extracts 16 bits per accumulator -- confirm the AUX_XMAC0 layout.
   lr  scratch, [%AUX_XMAC0]
   and s_acc2_msp, scratch, 0xFFFF     ; low 16 bits: the MSBs of the power from accumulator 2
   ; sign-extend the 16-bit field to 32 bits
   sexw s_acc2_msp, s_acc2_msp
   asr s_acc1_msp, scratch, 16         ; high 16 bits, sign-extended: the MSBs of the power from accumulator 1

   ;;;;;;;;;;;;;;;;;;;;;;;
   ; compute total power ;
   ;;;;;;;;;;;;;;;;;;;;;;;
   ; 64-bit add of the two accumulators. The result overwrites %r1 (s_ndata)
   ; and %r0 (psa_databuf), which are no longer needed, and lands in the
   ; registers round64 takes its first two arguments from.
   add.f %r1, l_acc2_lsp, l_acc1_lsp   ; the 32 LSBs of the total power
   adc   %r0, s_acc2_msp, s_acc1_msp   ; the MSBs of the total power
                              ; this computation requires the carry flag
                              ; set by add.f

   ;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; round the final result ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;
   ;int32 round64(int32 l_AccH, int32 l_AccL, int32 s_bit_position);
   ; s_GuardBits is itself defined as %r2, so this mov copies the register
   ; onto itself; it only documents that %r2 carries the third argument
   mov   %r2, s_GuardBits
   bl    round64

   ;;;;;;;;;;
   ; epilog ;
   ;;;;;;;;;;
   ; also the target of the early exit taken when s_ndata <= 0
   9:
   ; restore the return address saved in the prolog
    pop_s %blink
   j   [%blink]                  ; return to the caller through BLINK
