;; **COPYRIGHT******************************************************************
;;    INTEL CONFIDENTIAL
;;    Copyright (C) 2017 Intel Corporation
;;    Copyright (C), 1994-2002 Aware Inc. All Rights Reserved.
;; ******************************************************************COPYRIGHT**
;; **DISCLAIMER*****************************************************************
;;   The source code contained or described herein and all documents related
;;   to the source code ("Material") are owned by Intel Corporation or its
;;   suppliers or licensors. Title to the Material remains with Intel
;;   Corporation or its suppliers and licensors. The Material may contain
;;   trade secrets and proprietary and confidential information of Intel
;;   Corporation and its suppliers and licensors, and is protected by
;;   worldwide copyright and trade secret laws and treaty provisions. No part
;;   of the Material may be used, copied, reproduced, modified, published,
;;   uploaded, posted, transmitted, distributed, or disclosed in any way
;;   without Intels prior express written permission.
;;
;;   No license under any patent, copyright, trade secret or other
;;   intellectual property right is granted to or conferred upon you by
;;   disclosure or delivery of the Materials, either expressly, by
;;   implication, inducement, estoppel or otherwise. Any license under
;;   such intellectual property rights must be express and approved by
;;   Intel in writing.
;; *****************************************************************DISCLAIMER**
;*************************************************************************
; Aware DMT Technology. Proprietary and Confidential.
;
; ADDRESS:         40 Middlesex Turnpike, Bedford, MA 01730-1413 USA
; TELEPHONE:       781.276.4000
; FAX:             781.276.4001
; WEB:             http://www.aware.com
;
; FILENAME:        ccl.s
;
; DESCRIPTION:     This file contains the loop functions (CircAutoCorrLoop and
;                  CircCrossCorrLoop) used to compute the circular correlation
;                  for the function CircCorrLoop.
;
;*************************************************************************
.include "asm.h"

.data
; l_acc48H: upper word of the running correlation accumulator shared by the
; correlation loops in this file.  Each loop loads it, adds bits 16 and up of
; its partial sum to it, and stores it back.  Nothing in this file resets it;
; presumably the caller clears it before starting a new correlation (verify
; against the caller).
.global l_acc48H
l_acc48H:
   .size l_acc48H, 1*BYTES_PER_LONGWORD
   .type l_acc48H, @object
   .word   0                        ; initial value is zero
   .align   BYTES_PER_LONGWORD      ; alignment for whatever follows this object

; l_acc48L: lower word of the running correlation accumulator shared by the
; correlation loops in this file.  Each loop loads it, adds the low 16 bits of
; its partial sum to it, and stores it back.  No carry is propagated from this
; word into l_acc48H by the loops themselves; presumably the caller folds the
; two words together afterwards (verify against the caller).
.global l_acc48L
l_acc48L:
   .size l_acc48L, 1*BYTES_PER_LONGWORD
   .type l_acc48L, @object
   .word   0                        ; initial value is zero
   .align   BYTES_PER_LONGWORD      ; alignment for whatever follows this object

.text
;/*******************************************************************************
;*
;*   Prototype: void CircAutoCorrLoop(int32 psa_Signal1, int32 psa_Signal2,
;*                      int16 s_signal1len, int16 s_signal2len, int16 s_offset)
;*
;*   Description:
;*       This function is called by CircCorrLoop to compute the loop
;*       for the circular correlation of two input signals that both lie
;*       in X memory.  Signal 1 and Signal 2 may be the same signal.
;*       Signal 1 is read linearly; Signal 2 is read starting at s_offset
;*       with a modulo length of s_signal2len.  The products are summed in
;*       the XMAC accumulators and the sum is then added to the running
;*       total held in l_acc48L / l_acc48H.
;*
;*   Arguments:
;*      int32 psa_Signal1  - address of signal 1; it is shifted right by one
;*                           before being written to %ax0
;*                           NOTE(review): the original header called this a
;*                           32-bit addressing mode address while the inline
;*                           comment treats it as a 16-bit mode address.
;*      int32 psa_Signal2  - address of signal 2; s_offset is added to it and
;*                           the sum is written to %ax1 unshifted
;*      int16 s_signal1len - length of input signal1; if <= 0 the function
;*                           returns immediately without touching anything
;*      int16 s_signal2len - length of input signal2 (used as modulo length)
;*      int16 s_offset     - offset given; a negative offset has
;*                           s_signal2len added to it once
;*                           (assumes |s_offset| < s_signal2len)
;*
;*   Return Value:
;*      None.  The result is accumulated into the globals below.
;*
;*   Global Variables:
;*      l_acc48L, l_acc48H - read, updated and written back
;*
;*   Registers written:
;*      %r0-%r4, %XMAC0, %XMAC1, %lp_count, aux registers %ax0, %mx0, %ax1,
;*      %mx1, the MAC accumulator and the condition flags
;*
;*******************************************************************************/
; void CircAutoCorrLoop(int32 psa_Signal1, int32 psa_Signal2, int16 s_signal1len, int16 s_signal2len, int16 s_offset);
.global CircAutoCorrLoop
CircAutoCorrLoop:
   ; Input Arguments
   ; %r0 = psa_Signal1
   ; %r1 = psa_Signal2
   ; %r2 = s_signal1len
   ; %r3 = s_signal2len
   ; %r4 = s_offset
   .define psa_Signal1,    %r0
   .define psa_Signal2,    %r1
   .define s_signal1len,   %r2
   .define s_signal2len,   %r3
   .define s_offset,       %r4

   ; Working registers.  These alias the argument registers; each alias is
   ; first written only after the argument sharing its register was last read.
   .define scratch,        %r0
   .define l_acc48L_reg,   %XMAC0
   .define l_acc48H_reg,   %XMAC1
   .define l_Acc0_lsp,     %r1
   .define l_Acc1_lsp,     %r2
   .define l_Acc_ext,      %r3
   .define l_Acc0_msp,     %r4
   .define l_Acc1_msp,     %r3             ; deliberately shares %r3 with l_Acc_ext

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; prolog: none since leaf function and no stack frame for auto variables ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

   ; if (s_signal1len <= 0) exit
   sub.f  0, s_signal1len, 0
   ble    9f

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; set up address pointers ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;
   asr    psa_Signal1, psa_Signal1        ; convert 16-bit addressing mode address
                                 ; to a 32-bit addressing mode address
   sr     psa_Signal1, [%ax0]
   sr     0x1, [%mx0]                  ; offset = 1, postupdate, linear, 32-bit mode, AM_MODULO
                                 ; assume signal 1 does not wrap around

   ; a negative offset is brought into range by adding the signal 2 length once
   sub.f  0, s_offset, 0
   add.lt s_offset, s_offset, s_signal2len   ; assume that s_offset < s_signal2len

   add    psa_Signal2, psa_Signal2, s_offset
   sr     psa_Signal2, [%ax1]
   asl    scratch, s_signal2len, 16    ; bits 16:28 = modulo length
   or     scratch, scratch, 0x2000_0001   ; bits 0:13 = offset = 1
   sr     scratch, [%mx1]              ; all other bits = postupdate, linear, 32-bit mode, AM_MODULO
                                 ; NOTE(review): the loop comment below speaks of 16-bit
                                 ; addressing for signal 2; confirm which mode 0x2000_0000 selects

   asr.f  %lp_count, s_signal1len            ; halve the number of iterations since we are doing
                                    ; 2 multiplies at once
   mul    0, 0, 0                      ; clear the accumulator
   bz     0f                           ; tests the flags set by asr.f above: skip the
                                 ; loop when s_signal1len >> 1 == 0

   nop

   ;;;;;;;;;;;;;;;;;;;;;;;
   ; compute correlation ;
   ;;;;;;;;;;;;;;;;;;;;;;;
   ; each iteration reads %x1_u twice and %x0_u once
   lp 0f
      lsr scratch, %x1_u, 16           ; 1. can't use both %x0_u and %x1_u
      or  scratch, %x1_u, scratch         ;    as source registers in the same instruction
      mac 0, %x0_u, scratch            ; 2. 16-bit addressing allows us to handle odd offsets
   0:

   ; odd s_signal1len: one more multiply-accumulate for the last sample
   and.f  0, s_signal1len, 1
   lsl.nz scratch, %x0_u, 16
   mac.nz 0, scratch, %x1_u

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; add contents of accumulators to previous accumulations        ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ld    l_acc48L_reg, [l_acc48L]
   ld    l_acc48H_reg, [l_acc48H]
   lr    l_Acc0_lsp, [%AUX_XMAC2]
   lr    l_Acc1_lsp, [%AUX_XMAC1]
   lr    l_Acc_ext, [%AUX_XMAC0]

   ; split the extension word: low 16 bits (sign extended) go with
   ; accumulator 0, high 16 bits (arithmetic shift) go with accumulator 1,
   ; then add the two accumulators as one lsp/msp pair with carry
   and   l_Acc0_msp, l_Acc_ext, 0xFFFF
   sexw  l_Acc0_msp, l_Acc0_msp
   asr   l_Acc1_msp, l_Acc_ext, 16
   add.f l_Acc0_lsp, l_Acc0_lsp, l_Acc1_lsp
   adc   l_Acc0_msp, l_Acc0_msp, l_Acc1_msp

   ; bits 0:15 of the sum are added to l_acc48L
   and   scratch, l_Acc0_lsp, 0xFFFF
   add   l_acc48L_reg, l_acc48L_reg, scratch ; assume that l_acc48L_reg will not overflow to a negative number

   ; bits 16:31 of the sum and the msp (shifted up by 16) are added to l_acc48H
   lsr   scratch, l_Acc0_lsp, 16
   add   l_acc48H_reg, l_acc48H_reg, scratch
   asl   scratch, l_Acc0_msp, 16
   add   l_acc48H_reg, l_acc48H_reg, scratch ; assume that l_acc48H_reg will not saturate
   st    l_acc48L_reg, [l_acc48L]
   st    l_acc48H_reg, [l_acc48H]

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; epilog: just return to the caller ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   9:
      j    [%blink]

   ; release every name defined at the top of this function so that none
   ; of them stays defined for the code that follows
   .undef psa_Signal1
   .undef psa_Signal2
   .undef s_signal1len
   .undef s_signal2len
   .undef s_offset

   .undef scratch

   .undef l_acc48L_reg
   .undef l_acc48H_reg
   .undef l_Acc0_lsp
   .undef l_Acc1_lsp
   .undef l_Acc_ext
   .undef l_Acc0_msp
   .undef l_Acc1_msp

;/*******************************************************************************
;*
;*   Prototype: void CircCrossCorrLoop(int32 psa_Signal1, int32 psa_Signal2,
;*                      int16 s_signal1len, int16 s_signal2len, int16 s_offset)
;*
;*   Description:
;*       This function is called by CircCorrLoop to compute the loop
;*       for the circular correlation of two input signals.  Signal 1 must lie in
;*       X memory and Signal 2 must lie in Y memory.
;*       Signal 1 is read linearly through %x0_u; Signal 2 is read through
;*       %y0_u starting at a position derived from s_offset, with a modulo
;*       length derived from s_signal2len.  The products are summed in the
;*       XMAC accumulators and the sum is then added to the running total
;*       held in l_acc48L / l_acc48H.
;*
;*   Arguments:
;*      int32 psa_Signal1  - address of signal 1; it is shifted right by one
;*                           before being written to %ax0
;*      int32 psa_Signal2  - address of signal 2; the adjusted offset is added
;*                           and the sum is shifted right by one before being
;*                           written to %ay0
;*                           NOTE(review): the original header called both
;*                           addresses 32-bit addressing mode addresses while
;*                           the inline comments treat them as 16-bit mode
;*                           addresses.
;*      int16 s_signal1len - length of input signal1; if <= 0 the function
;*                           returns immediately without touching anything
;*      int16 s_signal2len - length of input signal2
;*      int16 s_offset     - offset given; a negative offset has
;*                           s_signal2len added to it once
;*                           (assumes |s_offset| < s_signal2len)
;*
;*   Return Value:
;*      None.  The result is accumulated into the globals below.
;*
;*   Global Variables:
;*      l_acc48L, l_acc48H - read, updated and written back
;*
;*   Registers written:
;*      %r0-%r4 and/or %r5 (see the note on scratch below), %XMAC0, %XMAC1,
;*      %lp_count, aux registers %ax0, %mx0, %ay0, %my0, %LP_START, %LP_END,
;*      the MAC accumulator and the condition flags
;*
;*******************************************************************************/
; void CircCrossCorrLoop(int32 psa_Signal1, int32 psa_Signal2, int16 s_signal1len, int16 s_signal2len, int16 s_offset);
.global CircCrossCorrLoop
CircCrossCorrLoop:
   ; Input Arguments
   ; %r0 = psa_Signal1
   ; %r1 = psa_Signal2
   ; %r2 = s_signal1len
   ; %r3 = s_signal2len
   ; %r4 = s_offset
   .define psa_Signal1,    %r0
   .define psa_Signal2,    %r1
   .define s_signal1len,   %r2
   .define s_signal2len,   %r3
   .define s_offset,       %r4

   .define scratch,        %r5
   .define scratch1,       %XMAC1

   ; the three values below are computed one after another, each from the
   ; previous one, so they can share a single register
   .define s_leftShift,    %XMAC0
   .define s_log2bufLen,   %XMAC0
   .define s_blockSize,    %XMAC0

   ; NOTE(review): scratch is defined a second time here with a different
   ; register.  Which definition the assembler uses is not visible from this
   ; file.  The code below is valid with either register: psa_Signal1 (%r0)
   ; is last read before scratch is first written, and %r5 is not otherwise
   ; used.  Both lines are kept so the generated code does not change.
   .define scratch,        %r0
   .define l_acc48L_reg,   %XMAC0
   .define l_acc48H_reg,   %XMAC1
   .define l_Acc0_lsp,     %r1
   .define l_Acc1_lsp,     %r2
   .define l_Acc_ext,      %r3
   .define l_Acc0_msp,     %r4
   .define l_Acc1_msp,     %r3             ; deliberately shares %r3 with l_Acc_ext

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; prolog: none since leaf function and no stack frame for auto variables ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

   ; if (s_signal1len <= 0) exit
   sub.f  0, s_signal1len, 0
   ble    9f

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; set up address pointers ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;
   asr    psa_Signal1, psa_Signal1        ; convert 16-bit addressing mode address
                                 ; to a 32-bit addressing mode address
   sr     psa_Signal1, [%ax0]
   sr     0x1, [%mx0]                  ; offset = 1, postupdate, linear, 32-bit mode, AM_MODULO
                                 ; assume signal 1 does not wrap around

   ; to determine the starting XY address for Signal 2 with offset, we must consider 4 cases:
   ; s_signal2len = even, s_offset = even - go to s_offset >> 1 in the even block (two blocks)
   ; s_signal2len = even, s_offset = odd  - go to s_offset >> 1 in the odd block (two blocks)
   ; s_signal2len = odd, s_offset = even  - go to s_offset >> 1 (one block contains both even and odd)
   ; s_signal2len = odd, s_offset = odd   - go to ((s_offset + s_signal2len) >> 1)  (one block contains both even and odd)
   and.f  0, s_signal2len, 1
   bnz    2f

   ; s_signal2len = even
   ; (the parity of s_offset is tested before the negative-offset fix-up at
   ; label 1; adding an even length does not change the parity)
   and.f  0, s_offset, 1
   bz     1f

   ; s_signal2len = even, s_offset = odd - go to s_offset >> 1 in the odd block (two blocks)
   ; block size = 1 << (31 - norm(s_signal2len - 1))
   sub    scratch, s_signal2len, 1
   norm   s_leftShift, scratch
   sub    s_log2bufLen, 31, s_leftShift
   asl    s_blockSize, 1, s_log2bufLen
   add    psa_Signal2, psa_Signal2, s_blockSize

   ; s_signal2len = even, s_offset = even - go to s_offset >> 1 in the even block (two blocks)
   ; (the odd-offset case above also falls through to here)
   1:
   sub.f  0, s_offset, 0
   add.lt s_offset, s_offset, s_signal2len   ; assume that s_offset < s_signal2len
   add    psa_Signal2, psa_Signal2, s_offset
   asr    psa_Signal2, psa_Signal2        ; convert 16-bit addressing mode address
                                 ; to a 32-bit addressing mode address
   sr     psa_Signal2, [%ay0]
   asl    scratch, s_signal2len, 16-1     ; bits 16:28 = modulo length
                                 ; left shift of 15 = (right shift of 1 to halve modulo length for 32-bit addressing
                                 ; and left shift of 15 to get modulo information to the correct bitfield)
   or     scratch, scratch, 0x1        ; bits 0:13 = offset = 1
   sr     scratch, [%my0]              ; all other bits = postupdate, linear, 32-bit mode, AM_MODULO
   b      7f

   ; s_signal2len = odd
   2:
   sub.f  0, s_offset, 0
   add.lt s_offset, s_offset, s_signal2len   ; assume that s_offset < s_signal2len

   ; an odd offset has s_signal2len added to it, which makes it even
   and.f  0, s_offset, 1
   add.nz s_offset, s_offset, s_signal2len

   add    psa_Signal2, psa_Signal2, s_offset
   asr    psa_Signal2, psa_Signal2        ; convert 16-bit addressing mode address
                                 ; to a 32-bit addressing mode address
   sr     psa_Signal2, [%ay0]
   asl    scratch, s_signal2len, 16    ; bits 16:28 = modulo length
                                 ; odd and even are packed so our buffer of 16-bit data is 2x as big
   or     scratch, scratch, 0x1        ; bits 0:13 = offset = 1
   sr     scratch, [%my0]              ; all other bits = postupdate, linear, 32-bit mode, AM_MODULO

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; set up the single instruction correlation loop ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   7:
   asr.f %lp_count, s_signal1len       ; halve the number of iterations since we are doing
                                 ; 2 multiplies at once
   mul 0, 0, 0                      ; clear the accumulator
   bz    6f                         ; tests the flags set by asr.f above: skip the
                                 ; loop when s_signal1len >> 1 == 0

   ; the lp instruction can only be used for loops of two or more instructions
   ; the single loop set up is described in the ARC documentation
   ; LP_START = longword address of label 8, LP_END = LP_START + 1
   mov scratch, 8f >> LOG2_BYTES_PER_LONGWORD
   add scratch1, scratch, 1
   sr  scratch, [%LP_START]
   sr  scratch1, [%LP_END]
   nop
   nop

   ;;;;;;;;;;;;;;;;;;;;;;;
   ; compute correlation ;
   ;;;;;;;;;;;;;;;;;;;;;;;
   8:
      mac 0, %x0_u, %y0_u

   ; odd s_signal1len: one more multiply-accumulate for the last sample
   6:
   and.f  0, s_signal1len, 1
   lsl.nz scratch, %x0_u, 16
   lsl.nz scratch1, %y0_u, 16
   mac.nz 0, scratch, scratch1

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; add contents of accumulators to previous accumulations        ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ld    l_acc48L_reg, [l_acc48L]
   ld    l_acc48H_reg, [l_acc48H]
   lr    l_Acc0_lsp, [%AUX_XMAC2]
   lr    l_Acc1_lsp, [%AUX_XMAC1]
   lr    l_Acc_ext, [%AUX_XMAC0]

   ; split the extension word: low 16 bits (sign extended) go with
   ; accumulator 0, high 16 bits (arithmetic shift) go with accumulator 1,
   ; then add the two accumulators as one lsp/msp pair with carry
   and   l_Acc0_msp, l_Acc_ext, 0xFFFF
   sexw  l_Acc0_msp, l_Acc0_msp
   asr   l_Acc1_msp, l_Acc_ext, 16
   add.f l_Acc0_lsp, l_Acc0_lsp, l_Acc1_lsp
   adc   l_Acc0_msp, l_Acc0_msp, l_Acc1_msp

   ; bits 0:15 of the sum are added to l_acc48L
   and   scratch, l_Acc0_lsp, 0xFFFF
   add   l_acc48L_reg, l_acc48L_reg, scratch ; assume that l_acc48L_reg will not overflow to a negative number

   ; bits 16:31 of the sum and the msp (shifted up by 16) are added to l_acc48H
   lsr   scratch, l_Acc0_lsp, 16
   add   l_acc48H_reg, l_acc48H_reg, scratch
   asl   scratch, l_Acc0_msp, 16
   add   l_acc48H_reg, l_acc48H_reg, scratch ; assume that l_acc48H_reg will not saturate
   st    l_acc48L_reg, [l_acc48L]
   st    l_acc48H_reg, [l_acc48H]

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; epilog: just return to the caller ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   9:
      j    [%blink]

   ; release every name defined at the top of this function so that none
   ; of them stays defined for the code that follows
   .undef psa_Signal1
   .undef psa_Signal2
   .undef s_signal1len
   .undef s_signal2len
   .undef s_offset

   .undef scratch
   .undef scratch1

   .undef s_leftShift
   .undef s_log2bufLen
   .undef s_blockSize

   .undef l_acc48L_reg
   .undef l_acc48H_reg
   .undef l_Acc0_lsp
   .undef l_Acc1_lsp
   .undef l_Acc_ext
   .undef l_Acc0_msp
   .undef l_Acc1_msp
