;; **COPYRIGHT******************************************************************
;;    INTEL CONFIDENTIAL
;;    Copyright (C) 2017 Intel Corporation
;;    Copyright (C), 1994-2002 Aware Inc. All Rights Reserved.
;; ******************************************************************COPYRIGHT**
;; **DISCLAIMER*****************************************************************
;;   The source code contained or described herein and all documents related
;;   to the source code ("Material") are owned by Intel Corporation or its
;;   suppliers or licensors. Title to the Material remains with Intel
;;   Corporation or its suppliers and licensors. The Material may contain
;;   trade secrets and proprietary and confidential information of Intel
;;   Corporation and its suppliers and licensors, and is protected by
;;   worldwide copyright and trade secret laws and treaty provisions. No part
;;   of the Material may be used, copied, reproduced, modified, published,
;;   uploaded, posted, transmitted, distributed, or disclosed in any way
;;   without Intels prior express written permission.
;;
;;   No license under any patent, copyright, trade secret or other
;;   intellectual property right is granted to or conferred upon you by
;;   disclosure or delivery of the Materials, either expressly, by
;;   implication, inducement, estoppel or otherwise. Any license under
;;   such intellectual property rights must be express and approved by
;;   Intel in writing.
;; *****************************************************************DISCLAIMER**
;*************************************************************************
; Aware DMT Technology. Proprietary and Confidential.
;
; ADDRESS:         40 Middlesex Turnpike, Bedford, MA 01730-1413 USA
; TELEPHONE:       781.276.4000
; FAX:             781.276.4001
; WEB:             http://www.aware.com
;
; FILENAME:        accum32.s
;
; DESCRIPTION:     This file contains the accumulation loop routines for accum16to32()
;
;*************************************************************************
.include "asm.h"
.include "xy_regs.h"
.text

;/*^^^
; *------------------------------------------------------------------------
; *
; *  Name : Accum16to32Loop
; *
; *  Description:
; *
; *      accumulates 16 bit data array into a 32 bit accumulator
; *      array.
; *
; *  Prototype:
; *      Accum16to32Loop(int32 *pla_accumbuf, int16 *psa_databuf, int16 s_length)
; *
; *
; *  Input Arguments:
; *      int32 *pla_accumbuf - Pointer to 32 bit accumulation array
; *      int16 *psa_databuf - Pointer to 16 bit data array
; *      int16 s_length - length of both arrays
; *
; *  Output Arguments:
; *
; *  Return:
; *
; *  Notes:
; *
; *------------------------------------------------------------------------
; *^^^
; */
; void Accum16to32Loop(int32 pla_accumbuf, int32 psa_databuf, int16 s_length)
.ifndef GENERAL_PURPOSE    ; XY implementation

;-------------------------------------------------------------------------
; Accum16to32LoopXX
;
;   Adds each element of a 16-bit data array into the matching element of
;   a 32-bit accumulator array, using the XY address/window registers.
;   The data pointer is programmed into %ax0 (X-bank address register).
;
;   In:    %r0 = pla_accumbuf  (32-bit accumulator array, X_MEM)
;          %r1 = psa_databuf   (16-bit data array, X_MEM)
;          %r2 = s_length      (element count; <= 0 returns without looping)
;   Out:   nothing in core registers
;   Clobb: %r0 (reused as scratch), %lp_count, flags,
;          ax0/aux_mx0, ax1/aux_mx1, ax2/aux_mx2
;          (the aux registers are written even when s_length <= 0)
;-------------------------------------------------------------------------
.global Accum16to32LoopXX
Accum16to32LoopXX:
   ; Input Arguments
   ; %r0 = pla_accumbuf
   ; %r1 = psa_databuf
   ; %r2 = s_length
   .define pla_accumbuf, %r0       ; X_MEM
   .define psa_databuf,  %r1       ; X_MEM
   .define s_length,     %r2
   .define scratch,         %r0    ; shares %r0 with pla_accumbuf; only used after ax1/ax2 are loaded

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; prolog: none since leaf function and no stack frame for auto variables ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; set up window registers ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; NOTE(review): the mode notes below say both "linear" and "AM_MODULO";
   ; confirm the intended addressing mode against xy_regs.h.
   sr   psa_databuf, [%ax0]         ; window 0 reads the 16-bit data
   sr   0x2000_0001, [aux_mx0]      ; offset = 1, postupdate, linear, 16-bit mode, AM_MODULO

   sr   pla_accumbuf, [%ax1]        ; window 1 reads the accumulators
   sr   0x1, [aux_mx1]              ; offset = 1, postupdate, linear, 32-bit mode, AM_MODULO

   sr   pla_accumbuf, [%ax2]        ; window 2 writes the accumulators (same base as window 1)
   sr   0x1, [aux_mx2]              ; offset = 1, postupdate, linear, 32-bit mode, AM_MODULO

   ;;;;;;;;
   ; main ;
   ;;;;;;;;
   ; sub.f is used instead of mov.f because the signed "le" test below reads
   ; the V flag: mov.f only refreshes Z and N, so the branch would depend on
   ; whatever V the caller left behind. Subtracting 0 writes the same value
   ; to lp_count and leaves V well defined.
   sub.f %lp_count, s_length, 0     ; lp_count = s_length, flags reflect s_length
   ble 9f            ; if (s_length <= 0) return
   nop               ; NOTE(review): padding kept from the original; presumably
   nop               ; required spacing before the loop - confirm before removing

   ;;;;;;;;;;;;;;
   ; accumulate ;
   ;;;;;;;;;;;;;;
   lp 0f ; for(i=0; i<s_length; i++)
      ; the shift by 16 sign-extends the sample; presumably the 16-bit window
      ; read delivers it in the upper halfword - confirm against the XY docs
      asr scratch, aux_x0_u, 16        ; scratch = (int32) *psa_databuf++
      add aux_x2_u, aux_x1_u, scratch     ; *pla_accumbuf++ += scratch (read via x1, write via x2)
   0:

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; epilog: just return to the caller ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   9:
      j [%blink]
   .undef pla_accumbuf
   .undef psa_databuf
   .undef s_length
   .undef scratch

;-------------------------------------------------------------------------
; Accum16to32LoopXY
;
;   Adds each element of a 16-bit data array into the matching element of
;   a 32-bit accumulator array, using the XY address/window registers.
;   The data pointer is programmed into %ay0 (Y-bank address register);
;   the accumulators are accessed through the X-bank registers ax1/ax2.
;
;   In:    %r0 = pla_accumbuf  (32-bit accumulator array, X_MEM)
;          %r1 = psa_databuf   (16-bit data array, Y_MEM)
;          %r2 = s_length      (element count; <= 0 returns without looping)
;   Out:   nothing in core registers
;   Clobb: %r0 (reused as scratch), %lp_count, flags,
;          ay0/aux_my0, ax1/aux_mx1, ax2/aux_mx2
;          (the aux registers are written even when s_length <= 0)
;-------------------------------------------------------------------------
.global Accum16to32LoopXY
Accum16to32LoopXY:
   ; Input Arguments
   ; %r0 = pla_accumbuf
   ; %r1 = psa_databuf
   ; %r2 = s_length
   .define pla_accumbuf, %r0       ; X_MEM
   .define psa_databuf,  %r1       ; Y_MEM
   .define s_length,     %r2
   .define scratch,         %r0    ; shares %r0 with pla_accumbuf; only used after ax1/ax2 are loaded

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; prolog: none since leaf function and no stack frame for auto variables ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; set up window registers ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; NOTE(review): the mode notes below say both "linear" and "AM_MODULO";
   ; confirm the intended addressing mode against xy_regs.h.
   sr   psa_databuf, [%ay0]         ; Y window 0 reads the 16-bit data
   sr   0x2000_0001, [aux_my0]      ; offset = 1, postupdate, linear, 16-bit mode, AM_MODULO

   sr   pla_accumbuf, [%ax1]        ; X window 1 reads the accumulators
   sr   0x1, [aux_mx1]              ; offset = 1, postupdate, linear, 32-bit mode, AM_MODULO

   sr   pla_accumbuf, [%ax2]        ; X window 2 writes the accumulators (same base as window 1)
   sr   0x1, [aux_mx2]              ; offset = 1, postupdate, linear, 32-bit mode, AM_MODULO

   ;;;;;;;;
   ; main ;
   ;;;;;;;;
   ; sub.f is used instead of mov.f because the signed "le" test below reads
   ; the V flag: mov.f only refreshes Z and N, so the branch would depend on
   ; whatever V the caller left behind. Subtracting 0 writes the same value
   ; to lp_count and leaves V well defined.
   sub.f %lp_count, s_length, 0     ; lp_count = s_length, flags reflect s_length
   ble 9f            ; if (s_length <= 0) return
   nop               ; NOTE(review): padding kept from the original; presumably
   nop               ; required spacing before the loop - confirm before removing

   ;;;;;;;;;;;;;;
   ; accumulate ;
   ;;;;;;;;;;;;;;
   lp 0f ; for(i=0; i<s_length; i++)
      ; the shift by 16 sign-extends the sample; presumably the 16-bit window
      ; read delivers it in the upper halfword - confirm against the XY docs
      asr scratch, aux_y0_u, 16        ; scratch = (int32) *psa_databuf++
      add aux_x2_u, aux_x1_u, scratch     ; *pla_accumbuf++ += scratch (read via x1, write via x2)
   0:

   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ; epilog: just return to the caller ;
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   9:
      j [%blink]
   .undef pla_accumbuf
   .undef psa_databuf
   .undef s_length
   .undef scratch
.else    ; if we are not writing from X memory to X memory; untested

;-------------------------------------------------------------------------
; Accum16to32Loop  (GENERAL_PURPOSE build: plain loads/stores, no XY windows)
;
;   for (i = 0; i < s_length; i++)
;       pla_accumbuf[s_offset1 + i] += (int32) psa_databuf[s_offset2 + i];
;
;   NOTE(review): this variant takes five register arguments, unlike the
;   three-argument prototype in the file header comment; the .else line
;   marks it as untested. Confirm the intended caller interface.
;
;   In:    %r0 = pla_accumbuf  (base of 32-bit accumulator array)
;          %r1 = s_offset1     (element index of the first accumulator)
;          %r2 = psa_databuf   (base of 16-bit data array)
;          %r3 = s_offset2     (element index of the first data sample)
;          %r4 = s_length      (element count; <= 0 returns without looping)
;   Out:   nothing in core registers
;   Clobb: %r0-%r3 (pointers advanced, offsets scaled), %r5, %r6,
;          %lp_count, flags
;-------------------------------------------------------------------------
.global Accum16to32Loop
Accum16to32Loop:
   ; Constants
   ; Input Arguments
   ; %r0 = pla_accumbuf
   ; %r1 = s_offset1
   ; %r2 = psa_databuf
   ; %r3 = s_offset2
   ; %r4 = s_length
   .define pla_accumbuf, %r0
   .define s_offset1,    %r1
   .define psa_databuf,  %r2
   .define s_offset2,    %r3
   .define s_length,     %r4
   .define scratch,      %r5
   .define scratch1,     %r6

   ; prolog: none since leaf function and no stack frame for auto variables

   ; sub.f is used instead of mov.f because the signed "le" test below reads
   ; the V flag: mov.f only refreshes Z and N, so the branch would depend on
   ; whatever V the caller left behind. Subtracting 0 writes the same value
   ; to lp_count and leaves V well defined.
   sub.f %lp_count, s_length, 0
   ble 9f            ; if (s_length <= 0) return

   ; Bias both pointers to one element BEFORE the first one to process,
   ; because the loop advances each pointer by one element before use.
   ; LOG2_BYTES_PER_* / BYTES_PER_* are expected to come from asm.h.
   sub s_offset1, s_offset1, 1
   asl s_offset1, s_offset1, LOG2_BYTES_PER_LONGWORD     ; element index -> byte offset
   add pla_accumbuf, pla_accumbuf, s_offset1  ; pla_accumbuf += (s_offset1 - 1)

   sub s_offset2, s_offset2, 1
   asl s_offset2, s_offset2, LOG2_BYTES_PER_WORD         ; element index -> byte offset
   add psa_databuf, psa_databuf, s_offset2  ; psa_databuf += (s_offset2 - 1)

   lp 0f          ; for(i=0; i<s_length; i++)
      ldw.x.a scratch, [psa_databuf, 1*BYTES_PER_WORD]   ; scratch = (int32) *++psa_databuf (sign-extended)
      ld    scratch1, [pla_accumbuf, 1*BYTES_PER_LONGWORD]  ; scratch1 = pla_accumbuf[1] (pointer not advanced yet)
      add   scratch, scratch, scratch1
      st.a  scratch, [pla_accumbuf, 1*BYTES_PER_LONGWORD]   ; *++pla_accumbuf = sum (writeback advances the pointer)
   0:

   ; epilog: just return to the caller
   9:
      j [%blink]

   .undef pla_accumbuf
   .undef s_offset1
   .undef psa_databuf
   .undef s_offset2
   .undef s_length
   .undef scratch
   .undef scratch1
.endif
