;; **COPYRIGHT******************************************************************
;;    INTEL CONFIDENTIAL
;;    Copyright (C) 2017 Intel Corporation
;; ******************************************************************COPYRIGHT**
;; **DISCLAIMER*****************************************************************
;;   The source code contained or described herein and all documents related
;;   to the source code ("Material") are owned by Intel Corporation or its
;;   suppliers or licensors. Title to the Material remains with Intel
;;   Corporation or its suppliers and licensors. The Material may contain
;;   trade secrets and proprietary and confidential information of Intel
;;   Corporation and its suppliers and licensors, and is protected by
;;   worldwide copyright and trade secret laws and treaty provisions. No part
;;   of the Material may be used, copied, reproduced, modified, published,
;;   uploaded, posted, transmitted, distributed, or disclosed in any way
;;   without Intels prior express written permission.
;;
;;   No license under any patent, copyright, trade secret or other
;;   intellectual property right is granted to or conferred upon you by
;;   disclosure or delivery of the Materials, either expressly, by
;;   implication, inducement, estoppel or otherwise. Any license under
;;   such intellectual property rights must be express and approved by
;;   Intel in writing.
;; *****************************************************************DISCLAIMER**
.include "asm.h"

;    void *memcpy(uint8 *s,uint8 *ct,int32 n)
;    {
;        int32 l_XorAddrBits, l_AndAddrBits;
;        int32 l_StartBytes = 0, l_StartShorts = 0, l_StartLongs = 0;
;        int32 i;
;
;        uint8 *s_return = s;
;
;        printf("Src adr = 0x%08x, Dest Adr = 0x%08x\n", ct, s);
;
;        l_XorAddrBits = ((int32)s ^ (int32)ct) & 0x3;
;
;        if ((n <= 1) || l_XorAddrBits & 0x1)
;        {
;            // only one side is byte aligned.
;            // have to copy per bytes.
;            printf("1: copy %d bytes\n", n);
;            for (i = 0; i < n; i++)
;            {
;                *s++ = *ct++;
;            }
;            n = 0;
;            goto _cpy_Exit;
;        }
;
;        l_AndAddrBits = ((int32)s & (int32)ct) & 0x3;
;
;        if (l_AndAddrBits & 0x1)
;        {
;            printf("2: copy 1 bytes\n");
;            // both addresses are byte aligned.
;            *s++ = *ct++;
;            n--;
;        }
;
;        if (n < 4) goto _cpy_EndBytes;
;
;        if (l_XorAddrBits & 0x2)
;        {
;            // only one side is short aligned
;            for (i = 0; i < (n >> 1); i++)
;            {
;                *((uint16*)s)++ = *((uint16*)ct)++;
;            }
;            printf("2: copy %d shorts, %d bytes left\n", n >> 1, n & 0x1);
;            n = n & 0x1;
;            goto _cpy_EndBytes;
;        }
;
;
;        if ((l_AndAddrBits == 1) || (l_AndAddrBits == 2))
;        {
;            // both sides are short aligned
;            // after one short, both sides will be long aligned
;            *((uint16*)s)++ = *((uint16*)ct)++;
;            n = n - 2;
;            printf("3: copy 1 shorts, %d bytes left\n",  n);
;        }
;
;        if (n >= 4)
;        {
;            // both sides are long aligned
;            for (i = 0; i < (n >> 2); i++)
;            {
;                *((uint32*)s)++ = *((uint32*)ct)++;
;            }
;            printf("4: copy %d longs, %d bytes left\n",  n >> 2, n & 0x3);
;            n = n & 0x3;
;        }
;
;    _cpy_EndBytes:
;        if (n & 0x2)
;        {
;            *((uint16*)s)++ = *((uint16*)ct)++;
;            printf("5: left-over 1 short\n");
;        }
;
;        if (n & 0x1)
;        {
;            *s++ = *ct++;
;            printf("5: left-over 1 bytes\n");
;        }
;
;    _cpy_Exit:
;        return (void *)s_return;
;
;    }

.define   l_XorAddrBits_r4, r4
.define   l_AndAddrBits_r5, r5

.text


.global memcpy
.global memmove

;------------------------------------------------------------------------------
; void *memmove(uint8 *s, uint8 *ct, int32 n)
; void *memcpy (uint8 *s, uint8 *ct, int32 n)
;
; In:    r0 = s  (destination address)
;        r1 = ct (source address)
;        r2 = n  (number of bytes; n <= 0 copies nothing)
; Out:   r0 = s  (r0 is never written, so it is returned unchanged)
; Uses:  r1, r2, r3, r4, r5, r6, lp_count, flags (asr.f); the lp/lpnz
;        instructions are used for the copy loops.
;
; memcpy always copies forward (lowest address first), using the widest
; access (8/16/32 bit) that the relative alignment of s and ct permits.
;
; memmove additionally supports overlapping regions: when the destination
; starts inside the source region (s > ct and s - ct < n) a forward copy would
; overwrite source bytes before they are read, so that case is copied backward
; one byte at a time. Every other case is handed to the forward memcpy path.
;------------------------------------------------------------------------------

memmove:
    brlt  r2, 1, _memcpy_forward;        n <= 0: let memcpy return immediately
    sub   r4, r0, r1;                    r4 = s - ct, used as an unsigned distance
    brhs  r4, r2, _memcpy_forward;       (unsigned) s - ct >= n: forward copy is safe
                                ;        (s < ct wraps to a large unsigned value)

    ; destination overlaps the tail of the source: copy from the last byte down
    mov   lp_count, r2;                  one loop iteration per byte
    add   r3, r0, r2;                    r3 = s  + n (one past the last dest byte)
    add   r1, r1, r2;                    r1 = ct + n (one past the last src byte)

.align 4
    lp  _memmove_backward_end;

    _memmove_backward_start:
        ldb.aw r6, [r1, -1];         r1 = r1 - 1; pre-decrementing src addr
        stb.aw r6, [r3, -1];         r3 = r3 - 1; pre-decrementing dest addr
    _memmove_backward_end:

    j   [blink];                         r0 still holds s


memcpy:
_memcpy_forward:

    mov_s r3, r0;               r0 is saved for return; r3 is dest addr

    brlt r2, 1, _memcpy_exit;           n <= 0: nothing to copy (matches the C reference)
    brle r2, 1, _memcpy_last_byte;      n == 1: single byte, handled by the tail code

    xor r4, r3, r1;

    and r4, r4, 0x3;           r4 = l_XorAddrBits_r4 (address bits that differ)

    mov_s lp_count, r2;         set up lp_count ahead of time (used by the bytes-only loop)

    and r5, r3, r1;

    and r5, r5, 0x3;           r5 = l_AndAddrBits_r5 (address bits set in both)

    ; if (l_XorAddrBits & 0x1) copy bytes only:
    ; exactly one of the two addresses is odd, so they can never both be
    ; 16-bit aligned at the same time
    bbit1 l_XorAddrBits_r4, 0, _memcpy_bytes_only;

_memcpy_move_1_byte:
    bbit0 l_AndAddrBits_r5, 0, _memcpy_not_byte_aligned;  if (l_AndAddrBits & 1) move 1 byte
    ; both addresses are odd: one byte makes both of them even
    ldb.ab r6, [r1, 1];          r1 = r1 + 1; post-incrementing src addr
    stb.ab r6, [r3, 1];          r3 = r3 + 1; post-incrementing dest addr
    sub_s r2, r2, 1;            n--;

_memcpy_not_byte_aligned:
    ; from here on both addresses are even
    brlt r2, 4, _memcpy_ending_bytes;   if (n < 4) go copying ending bytes

    bbit0 l_XorAddrBits_r4, 1, _memcpy_both_sides_align_16

    ; only one side is 32-bit aligned; perform "most" of the transfer using 16-bit access
    asr.f lp_count, r2, 1;    lp_count = n / 2 shorts; sets Z for lpnz
    and r2, r2, 1;            potential ending byte (does not touch the flags)

.align 4
    lpnz _memcpy_shorts_end;
    _memcpy_shorts_start:
        ldw.ab r6, [r1, 2];          r1 = r1 + 2; post-incrementing src addr
        stw.ab r6, [r3, 2];          r3 = r3 + 2; post-incrementing dest addr
    _memcpy_shorts_end:

    b_s _memcpy_last_byte;

    ; both addresses share the same low two bits here.
    ; l_AndAddrBits was taken from the ORIGINAL addresses:
    ;   0 -> already 32-bit aligned
    ;   1 -> the byte moved above left both at 2 mod 4: one short is needed
    ;   2 -> both at 2 mod 4: one short is needed
    ;   3 -> the byte moved above left both 32-bit aligned
_memcpy_both_sides_align_16:
    breq l_AndAddrBits_r5, 1, _memcpy_move_1_short
    brne l_AndAddrBits_r5, 2, _memcpy_longs

_memcpy_move_1_short:
    ldw.ab r6, [r1, 2];          r1 = r1 + 2; post-incrementing src addr
    stw.ab r6, [r3, 2];          r3 = r3 + 2; post-incrementing dest addr
    sub    r2, r2, 2

_memcpy_longs:
    asr.f lp_count, r2, 2;    lp_count = n / 4 longs; sets Z for lpnz (n may be 2 or 3 here)
    and r2, r2, 0x3;          potential ending bytes (does not touch the flags)

.align 4
    lpnz  _memcpy_longs_end;

    _memcpy_longs_start:
        ld.ab r6, [r1, 4];          r1 = r1 + 4; post-incrementing src addr
        st.ab r6, [r3, 4];          r3 = r3 + 4; post-incrementing dest addr
    _memcpy_longs_end:

    b_s _memcpy_ending_bytes;


; perform the entire transfer using byte access
; (lp_count was loaded with n above and n >= 2 on this path)
_memcpy_bytes_only:
.align 4
    lp  _memcpy_bytes_only_end;

    _memcpy_bytes_only_start:
        ldb.ab r6, [r1, 1];       r1 = r1 + 1; post-incrementing src addr
        stb.ab r6, [r3, 1];       r3 = r3 + 1; post-incrementing dest addr
    _memcpy_bytes_only_end:

    b_s _memcpy_exit;


; copy the remaining 0..3 bytes: bit 1 of n selects one short, bit 0 one byte
_memcpy_ending_bytes:
    bbit0 r2, 1, _memcpy_last_byte;
    ldw.ab r6, [r1, 2];          r1 = r1 + 2; post-incrementing src addr
    stw.ab r6, [r3, 2];          r3 = r3 + 2; post-incrementing dest addr

_memcpy_last_byte:
    bbit0 r2, 0, _memcpy_exit;
    ldb.ab r6, [r1, 1];          r1 = r1 + 1; post-incrementing src addr
    stb.ab r6, [r3, 1];          r3 = r3 + 1; post-incrementing dest addr

_memcpy_exit:
    j   [blink]


.global memset
.global _memclr

;------------------------------------------------------------------------------
; void *memset(uint8 *s, uint8 c, int32 n)
;
; In:    r0 = s (destination address)
;        r1 = c (fill value; only the low 8 bits are used)
;        r2 = n (number of bytes; n <= 0 writes nothing)
; Out:   r0 = s (r0 is never written, so it is returned unchanged)
; Uses:  r1, r2, r3, r5, lp_count, and the lp_start / lp_end auxiliary
;        registers, which are written directly with sr
;
; _memclr(s, n): r0 = s, r1 = n. Moves n into r2, zeroes the fill value and
; joins the memset body at memset_memclr, so it also returns with r0 = s.
;
; Strategy: store single bytes / one short until the destination is 32-bit
; aligned, fill the bulk with 32-bit stores, then finish the last 0..3 bytes.
;------------------------------------------------------------------------------

.align 4
_memclr:
    mov r2, r1;                 move n to 3rd argument of memset
    sub r1, r1, r1;             c is set to 0 (already a valid 32-bit fill pattern)
    b_s memset_memclr;

.align 4
memset:
    ; form the pattern in a long word: replicate the low byte of c into all 4 bytes
    and r1, r1, 0xFF
    asl r5, r1, 8
    or  r1, r1, r5
    asl r5, r1, 16
    or  r1, r1, r5
    ; execution falls through into memset_memclr

.align 4
memset_memclr:
    mov_s r3, r0;               r0 is saved for return; r3 is dest addr
    brlt r2, 1, _memset_exit;          n <= 0: nothing to set
    brle r2, 1, _memset_last_byte;     n == 1: single byte, handled by the tail code

    ; if the starting address is odd, set one byte to make it even
    bbit0 r3, 0, _memset_not_byte_align;
    stb.ab r1, [r3, 1];          r3 = r3 + 1; post-incrementing dest addr
    sub_s r2, r2, 1;            n--;


_memset_not_byte_align:
    ; the address is even from here on

    brlt r2, 4, _memset_ending_bytes;   if (n < 4) goto setting ending bytes

    ; set one short word if the current address is 2 mod 4
    bbit0 r3, 1, _memset_not_short_aligned;
    stw.ab r1, [r3, 2];          r3 = r3 + 2; post-incrementing dest addr
    sub_s r2, r2, 2;            n = n - 2

_memset_not_short_aligned:
    ; must be 32-bit aligned
    brlt r2, 4, _memset_ending_bytes;   if (n < 4) goto setting ending bytes

    ; set up the loop by hand: lp_count = n / 4 (>= 1 here), and the loop
    ; bounds are written straight into the lp_start / lp_end aux registers
    asr lp_count, r2, 2
    sr _memset_longs_start, [lp_start]
    sr _memset_longs_end, [lp_end]
    and r2, r2, 0x3;            potential ending bytes
    nop

.align 4
    _memset_longs_start:
        st.ab r1, [r3, 4];          r3 = r3 + 4; post-incrementing dest addr
    _memset_longs_end:


; set the remaining 0..3 bytes: bit 1 of n selects one short, bit 0 one byte
; NOTE(review): this label shares its address with _memset_longs_end (the
; value written to lp_end) and is also reached directly by the two brlt
; instructions above - confirm against the core manual that a taken branch to
; the lp_end address does not re-trigger the loop mechanism.
_memset_ending_bytes:
    bbit0 r2, 1, _memset_last_byte;
    stw.ab r1, [r3, 2];          r3 = r3 + 2; post-incrementing dest addr

_memset_last_byte:
    bbit0 r2, 0, _memset_exit;
    stb.ab r1, [r3, 1];          r3 = r3 + 1; post-incrementing dest addr

_memset_exit:
    j   [blink]


