;; **COPYRIGHT******************************************************************
;;    INTEL CONFIDENTIAL
;;    Copyright (C) 2017 Intel Corporation
;;    Copyright (C), 1994-2002 Aware Inc. All Rights Reserved.
;; ******************************************************************COPYRIGHT**
;; **DISCLAIMER*****************************************************************
;;   The source code contained or described herein and all documents related
;;   to the source code ("Material") are owned by Intel Corporation or its
;;   suppliers or licensors. Title to the Material remains with Intel
;;   Corporation or its suppliers and licensors. The Material may contain
;;   trade secrets and proprietary and confidential information of Intel
;;   Corporation and its suppliers and licensors, and is protected by
;;   worldwide copyright and trade secret laws and treaty provisions. No part
;;   of the Material may be used, copied, reproduced, modified, published,
;;   uploaded, posted, transmitted, distributed, or disclosed in any way
;;   without Intels prior express written permission.
;;
;;   No license under any patent, copyright, trade secret or other
;;   intellectual property right is granted to or conferred upon you by
;;   disclosure or delivery of the Materials, either expressly, by
;;   implication, inducement, estoppel or otherwise. Any license under
;;   such intellectual property rights must be express and approved by
;;   Intel in writing.
;; *****************************************************************DISCLAIMER**
;*************************************************************************
; Aware DMT Technology. Proprietary and Confidential.
;
; ADDRESS:         40 Middlesex Turnpike, Bedford, MA 01730-1413 USA
; TELEPHONE:       781.276.4000
; FAX:             781.276.4001
; WEB:             http://www.aware.com
;
; FILENAME:        arctan.s
;
; DESCRIPTION:     Fast ArcTangent implementation.
;
;*************************************************************************
.text

;/*^^^
; *------------------------------------------------------------------------
; *
; *  Name: FastAtan
; *
; *  Description: Computes 4 quadrant arctan(y/x).
; *
; *  Return: scaled arctan(y/x) with 1 radian = 8192
; *
; *  Notes: The algorithm computes the arctan in the first octant and maps
; *         the answer to the remaining octants as needed. The first octant
; *         is divided into 7 sections and each section is approximated by
; *         1) a(y/x) + b  or
; *         2) (xy/(x^2 + ay^2)) + b
; *
; *  The relative approximation error is no more than -51 dB.
; *
; *  Here is the pseudo code:
; *
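; *  if (x >= 20*y) {
; *      p = y/x;
; *   } else if (x >= 8*y) {
; *      p = (x*y)/(x*x + y*y*0.125);
; *   } else if (x >= 2*y) {
; *      p = (x*y)/(x*x + y*y*0.3125) - 0.0001;
; *   } else if (2*x >= 3*y) {
; *      p = (x*y)/(x*x + y*y*0.3) - 0.001;
; *   } else if (3*x >= 4*y) {
; *      p = 0.66925389*(y/x) +  0.14194371;
; *   } else if (7*x >= 8*y) {
; *      p = 0.60255826*(y/x) +  0.19225833;
; *   } else {
; *      p = 0.53246136*(y/x) +  0.25353658;
; *   }
; *
; *  Here x and y are the magnitudes after mapping into the first octant
; *  (x >= y >= 0). Each case is selected with a "greater than or equal"
; *  test in the assembly (the case is skipped on "less than").
; *
; *  NOTE(review): for the (x >= 8*y) case the assembly evaluates
; *  ((x*y) >> 1) / (((x*x) >> 1) + ((y*y) >> 3)), which corresponds to a
; *  y*y coefficient of 0.25 rather than the 0.125 shown above. Confirm
; *  which value is intended.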
; *
; *------------------------------------------------------------------------
; *^^^
; */
;
; int16 FastAtan(int16 y_in, int16 x_in)
;
; In:     %r0 = y_in, %r1 = x_in
; Out:    %r0 = four-quadrant arctan(y_in/x_in), scaled so 1 radian = 8192
;         (pi/2 is returned as 12868, pi as 25736)
; Writes: %r0, %r2-%r11, %lp_count, the flags, and the MAC result that is
;         read back through %AUX_XMAC2
; Calls:  abs (defined outside this file), twice
; Stack:  %blink is stored at [%sp, 4] and %fp at [%sp] without moving
;         %sp; %sp is lowered by 16 only around each call to abs
.global FastAtan
FastAtan:

   ; Input Arguments
   ; %r0 = y_in
   ; %r1 = x_in

   ; Register Assignments
   ; (these will stay fixed throughout the duration of the function)
   ; %r2 = y_in
   ; %r3 = abs(x_in)
   ; %r4 = abs(y_in)
   ; %r1 is also kept as x_in: it is re-tested after the division to pick
   ; the octant.

   ; prolog: save return address and caller's frame pointer, then make
   ; %fp point at the save area (the epilog reloads %blink through %fp)
   st %blink, [%sp, 4]
   st  %fp, [%sp]
   mov %fp, %sp

   ; handle special cases
   ; x_in == 0 would make y/x undefined, so the result is chosen from the
   ; sign of y_in alone: +pi/2, -pi/2, or 0 when y_in is also 0.
   sub.f 0, %r1, 0      ; test x_in
   bnz   0f       ; skip special case section if (x_in != 0)

   ; set flags from y_in; both conditional branches below use these flags
   sub.f 0, %r0, 0
   mov   %r0, 12868  ; preload the +pi/2 result; placed here so that an
                  ; instruction separates the flag-setting sub.f
                  ; from the branch that reads the flags
   bgt   9f       ; if (y_in > 0) return pi/2

   ; The re-test below was left disabled by the original author; the blt
   ; relies on the flags from the sub.f above still being in effect after
   ; the mov instructions.
   ;sub.f 0, %r0, 0
   mov   %r0, -12868 ; preload the -pi/2 result (%r0 no longer holds
                  ; y_in at this point, only the flags describe
                  ; its sign)
   blt   9f          ; else if (y_in < 0) return -pi/2

   ; y_in == 0 and x_in == 0: the mov below sits in the delay slot of b.d
   b.d  9f
   mov   %r0, 0      ; return 0

   ; compute abs(x_in) and abs(y_in)
   ; abs takes its argument in %r0 and its result is read from %r0.
   ; NOTE(review): %r1, %r2 and %r4 are read after these calls, so abs
   ; must leave them untouched -- confirm against the abs implementation.
   ; NOTE(review): %sp is lowered by 16 around each call, presumably to
   ; keep the %fp/%blink slots stored at [%sp] and [%sp, 4] out of the
   ; callee's way -- confirm.
   0:
   mov %r2, %r0
   sub %sp, %sp, 16
   bl  abs           ; %r4 = abs(y_in)
   add %sp, %sp, 16
   mov %r4, %r0

   mov %r0, %r1      ; %r3 = abs(x_in)
   sub %sp, %sp, 16
   bl  abs
   add %sp, %sp, 16
   mov %r3, %r0

   ; map x and y into first octant
   ; l_x receives the larger magnitude and l_y the smaller, so that
   ; l_y/l_x <= 1. The swap is undone after the division by re-testing
   ; %r3 against %r4.
   sub.f 0, %r3, %r4
   blt   1f
                  ; if (abs(x_in) >= abs(y_in))
   mov %r5, %r3      ; l_x = abs((int32) x_in)
   mov %r6, %r4      ; l_y = abs((int32) y_in)
   b   2f            ; skip the else part (original author's open
                     ; question: placement with respect to branch
                     ; delay slots)

   1:             ; else
   mov %r5, %r4      ; l_x = abs((int32) y_in)
   mov %r6, %r3      ; l_y = abs((int32) x_in)

   ; approximate
   ; %r5 = l_x, %r6 = l_y
   ; %r7 = l_num, %r8 = l_den, %r9 = b
   ; %r10 = x products, %r11 = y products
   ; dual MAC: use channel 2 for all calculations
   ; Every "mul 0, a, b" below discards its register destination; the
   ; product is fetched with "lr ..., [%AUX_XMAC2]".
   ; Each section only selects l_num, l_den and b. The shared division at
   ; label 4 then produces q ~= 8192 * l_num / l_den, and b is added to q.
   ; Label 3 is reused for every section; each "blt 3f" falls through to
   ; the next section's test.
   2:

   ; if (l_x >= 20*l_y)
   ; section 1: p = l_y/l_x
   mul   0, 20, %r6
   lr    %r11, [%AUX_XMAC2]   ; %r11 = 20*l_y
   sub.f 0, %r5, %r11
   blt   3f

   mov   %r7, %r6          ; l_num = l_y
   mov   %r8, %r5          ; l_den = l_x
   mov   %r9, 0            ; b = 0

   b     4f

   ; else if (l_x >= 8*l_y)
   ; section 2: l_num/l_den = (l_x*l_y) / (l_x*l_x + 0.25*l_y*l_y)
   ; NOTE(review): the pseudo code in the function header gives 0.125 for
   ; this coefficient; the shifts below yield 0.25 -- confirm.
   3:
   asl   %r11, %r6, 3         ; %r11 = 8*l_y
   sub.f 0, %r5, %r11
   blt   3f

   mul   0, %r5, %r6       ; l_num = (l_x*l_y) >> 1
   lr    %r7, [%AUX_XMAC2]
   asr   %r7, %r7

   mul   0, %r5, %r5       ; l_den = ((l_x*l_x) >> 1) + ((l_y*l_y) >> 3)
   lr    %r8, [%AUX_XMAC2]
   asr   %r8, %r8          ; %r8 = (l_x*l_x) >> 1

   mul   0, %r6, %r6
   lr    %r9, [%AUX_XMAC2]
   asr   %r9, %r9, 3       ; %r9 = (l_y*l_y) >> 3

   add   %r8, %r8, %r9
   mov   %r9, 0            ; b = 0

   b     4f

   ; else if (l_x >= 2*l_y)
   ; section 3: l_num/l_den = (l_x*l_y) / (l_x*l_x + 0.3125*l_y*l_y)
   ; (5/4 of l_y*l_y, shifted right by 3, against terms scaled by 1/2)
   3:
   asl   %r11, %r6, 1         ; %r11 = 2*l_y
   sub.f 0, %r5, %r11
   blt   3f

   mul   0, %r5, %r6       ; l_num = (l_x*l_y) >> 1
   lr    %r7, [%AUX_XMAC2]
   asr   %r7, %r7

   mul   0, %r5, %r5       ; l_den = ((l_x*l_x) >> 1) + ((5*l_y >> 2)*l_y >> 3)
   lr    %r8, [%AUX_XMAC2]
   asr   %r8, %r8          ; %r8 = (l_x*l_x) >> 1

   mul   0, 5, %r6
   lr    %r9, [%AUX_XMAC2]
   asr   %r9, %r9, 2       ; %r9 = 5*l_y >> 2

   mul   0, %r9, %r6
   lr    %r9, [%AUX_XMAC2]
   asr   %r9, %r9, 3       ; %r9 =  ((5*l_y >> 2)*l_y >> 3)

   add   %r8, %r8, %r9
   mov   %r9, -1           ; b = -1, i.e. -0.0001 in 1/8192 units, rounded

   b     4f

   ; else if (2*l_x >= 3*l_y)
   ; section 4: l_num/l_den = (l_x*l_y) / (l_x*l_x + 0.3*l_y*l_y)
   ; (19661/16384 ~= 1.2, shifted right by 3, against terms scaled by 1/2)
   3:
   mul   0, 3, %r6
   asl   %r10, %r5, 1         ; %r10 = 2*l_x
   lr    %r11, [%AUX_XMAC2]   ; %r11 = 3*l_y
   sub.f 0, %r10, %r11
   blt   3f

   mul   0, %r5, %r6       ; l_num = (l_x*l_y) >> 1
   lr    %r7, [%AUX_XMAC2]
   asr   %r7, %r7

   mul   0, %r5, %r5       ; l_den = ((l_x*l_x) >> 1) + ((19661*l_y >> 14)*l_y >> 3)
   lr    %r8, [%AUX_XMAC2]
   asr   %r8, %r8          ; %r8 = (l_x*l_x) >> 1

   mul   0, 19661, %r6
   lr    %r9, [%AUX_XMAC2]
   asr   %r9, %r9, 14         ; %r9 = 19661*l_y >> 14

   mul   0, %r9, %r6
   lr    %r9, [%AUX_XMAC2]
   asr   %r9, %r9, 3       ; %r9 =  ((19661*l_y >> 14)*l_y >> 3)

   add   %r8, %r8, %r9
   mov   %r9, -8           ; b = -8, i.e. -0.001 in 1/8192 units, rounded

   b     4f

   ; else if (3*l_x >= 4*l_y)
   ; section 5: linear fit a*(l_y/l_x) + b with a = 10965/16384
   3:
   ; for all the remaining cases l_den is the same
   ; so compute it once and for all here
   ; (the slope a of each linear fit is carried as an integer scaled by
   ; 16384 in l_num, and l_den = l_x*16384 divides that scale back out)
   asl %r8, %r5, 14        ; l_den = l_x*16384

   mul   0, 3, %r5
   asl   %r11, %r6, 2         ; %r11 = 4*l_y
   lr    %r10, [%AUX_XMAC2]   ; %r10 = 3*l_x
   sub.f 0, %r10, %r11
   blt   3f

   ; assuming that the LSP is returned from the accumulators
   ;mul    %r7, %r6, 10965    ; l_num = l_y*10965
   ;mov    %r9, 1163       ; b = 1163
   ; otherwise use the following code
   mul   0, %r6, 10965
   mov   %r9, 1163         ; b = 1163 (0.14194371 in 1/8192 units)
   lr    %r7, [%AUX_XMAC2] ; l_num = l_y*10965

   b      4f

   ; else if (7*l_x >= 8*l_y)
   ; section 6: linear fit with a = 9872/16384
   3:
   mul   0, 7, %r5
   asl   %r11, %r6, 3         ; %r11 = 8*l_y
   lr    %r10, [%AUX_XMAC2]   ; %r10 = 7*l_x
   sub.f 0, %r10, %r11
   blt   3f

   ; assuming that the LSP is returned from the accumulators
   ;mul    %r7, %r6, 9872     ; l_num = l_y*9872
   ;mov    %r9, 1575       ; b = 1575
   ; otherwise use the following code
   mul   0, %r6, 9872
   mov   %r9, 1575         ; b = 1575 (0.19225833 in 1/8192 units)
   lr    %r7, [%AUX_XMAC2] ; l_num = l_y*9872

   b      4f
   ; else
   ; section 7: linear fit with a = 8724/16384; falls through to label 4
   3:
   ; assuming that the LSP is returned from the accumulators
   ;mul    %r7, %r6, 8724     ; l_num = l_y*8724
   ;mov    %r9, 2077       ; b = 2077
   ; otherwise use the following code
   mul   0, %r6, 8724
   mov   %r9, 2077         ; b = 2077 (0.25353658 in 1/8192 units)
   lr    %r7, [%AUX_XMAC2] ; l_num = l_y*8724

   ; long division
   ; %r0 = q (since q is the value returned by the function)
   ; Bit-serial divide of l_num by l_den, one quotient bit per pass. The
   ; first pass tests l_num >= l_den (the integer bit); each later pass
   ; tests the next binary fraction bit. q is shifted after every pass,
   ; including the last, so after 13 passes q ~= 8192 * l_num / l_den with
   ; bit 0 always clear.
   4:
   mov    %lp_count, 13
   mov    %r0, 0           ; q = 0
   nop                     ; there needs to be 2 cycles between
                        ; the writing and the reading of lp_count

   lp 5f                ; for(i=0; i<13; i++) {
                        ; (loop body runs up to label 5)
      ; The .f on sub.ge.f re-sets the flags only when the subtraction is
      ; performed; its result is then >= 0, so add.ge still sees "ge".
      sub.f     0, %r7, %r8   ; if (l_num >= l_den) {
      sub.ge.f  %r7, %r7, %r8 ; l_num -= l_den
      add.ge    %r0, %r0, 1   ; q += 1 }

      asl     %r0, %r0     ; q <<= 1
      asl     %r7, %r7     ; l_num <<=1
   5:                   ; }

   ; add constant
   add    %r0, %r0, %r9    ; q += b

   ; default answer is in 1st octant
   ; From here on q is the first-octant angle; the tests below undo the
   ; x/y swap and the abs() using x_in (%r1), abs(x_in) (%r3),
   ; abs(y_in) (%r4) and y_in (%r2).
   sub.f  0, %r1, 0        ; if (x_in >= 0) {
   blt    6f

   sub.f  0, %r3, %r4      ; if (abs(x_in) < abs(y_in)), 2nd octant
   sub.lt %r0, 12868, %r0  ; q = 12868 - q, answer = pi/2 - answer
   b      8f

   6:                ; } else {
   sub.f  0, %r3, %r4      ; if (abs(x_in) < abs(y_in)), 3rd octant
   add.lt %r0, 12868, %r0  ; q = 12868 + q, answer = pi/2 + answer;
                     ; this conditional add also separates the
                     ; flag-setting sub.f from the branch below
   blt   8f          ; if (abs(x_in) < abs(y_in)) the answer is complete

   ; x_in < 0 and abs(x_in) >= abs(y_in)
   sub   %r0, 25736, %r0   ; q = 25736 - q, answer = pi - answer

   ; mirror the answer to the 5th, 6th, 7th, and 8th octants
   ; (negative y_in: the angle is the negative of the one computed above)
   8:
   sub.f  0, %r2, 0           ; if (y_in >= 0) return q
   ; assuming that the LSP of the accumulator is returned
   ; mul.lt %r0, 0x0000_FFFF, %r0   ; else return -q
   ; otherwise use the following code
   bge    9f
   sub    %r0, 0, %r0         ; else return -q

   9:
   ; epilog
   ; Reload the return address through %fp, then return; the caller's %fp
   ; is reloaded from [%sp] in the delay slot of j.d. %sp is back at its
   ; entry value here because every "sub %sp" above is paired with an
   ; "add %sp".
   ld   %blink, [%fp, 4]
   j.d  [%blink]
   ld.a %fp, [%sp]
