/*maan_idct.cpp:  The assembly version of the iDCT routine.
********************************************************************
*
*                    Copyright (c) 1998 Intel Corporation
*    
*    THIS SOURCE CODE IS PROVIDED "AS IS" WITH NO WARRANTIES WHATSOEVER, 
*    INCLUDING ANY WARRANTY OF MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR ANY 
*    PARTICULAR PURPOSE, OR ANY WARRANTY OTHERWISE ARISING OUT OF ANY PROPOSAL, 
*    SPECIFICATION OR SAMPLE. Intel disclaims all liability, including liability 
*    for infringement of any proprietary rights, relating to use of information 
*    in this specification. No license, express or implied, by estoppel or 
*    otherwise, to any intellectual property rights is granted herein, other 
*    than a royalty-free copyright license to use, copy, modify, distribute, and 
*    otherwise dispose of this source code and resulting object code in any 
*    format, for execution on Intel Architecture processors.
*
*    Microprocessors may contain design defects or errors known as errata which
*    may cause the product to deviate from published specifications. Current 
*    characterized errata for Intel microprocessors are available on request.
*
********************************************************************
*/

#include "aan_idct.h"


// Byte distance between the starts of two consecutive rows of the 8x8
// "output block" (8 elements of DATASIZE bytes each). Used below as the row
// multiplier in the final store addresses: [eax][(DCTWIDTH*row)+(DATASIZE*col)].
#define	DCTWIDTH   16
// Size in bytes of one element of the 8x8 "output block"; used below as the
// column multiplier in the final store addresses.
#define	DATASIZE   2


#pragma optimize("",off)	
/*F*
////////////////////////////////////////////////////////////////////////////
// Name:		MMX_iDCT8x8AAN
//
// Purpose:		Performs an inverse DCT on an 8x8 block.
//				Optimized assembly code using MMX(TM) technology.
//
// Context:		The AAN (Arai, Agui, and Nakajima) algorithm from
//				Trans. IEICE, vol. E 71(11), 1095-1097, Nov. 1988 is used.
//				This implementation using MMX(TM) technology was developed
//				by Intel Corporation.
//
// Returns:		None.
//
// Parameters:
//				coef_block	Input:  A set of 1 DC and 63 AC coefficients.
//							Output: An 8x8 raster of image values.
//
// Notes:
//	- The transform is done in place: the 64 16-bit words at coef_block are
//	  both the input and the output (128 bytes are read and written).
//	- Structure: a 1D pass over the block (left half, then right half),
//	  an in-place 8x8 transpose done as four 4x4 quadrants, the same 1D
//	  pass again, and a final transpose that is fused with the output
//	  conversion.
//	- Output conversion (second pass only): every value is shifted right
//	  arithmetically by 5, const_128 is added, the result is saturated to
//	  0..255 by packuswb and then widened back to a 16-bit word by
//	  punpcklbw with zero. Each output element is therefore a 16-bit word
//	  holding a value in 0..255, stored at
//	  (DCTWIDTH * row) + (DATASIZE * column) bytes from coef_block.
//	- The function is naked: the argument is read from [esp+4], no frame is
//	  built, and it returns with a plain "ret", so the caller removes the
//	  argument from the stack.
//	- eax and mm0..mm7 are overwritten. No EMMS is executed; the MMX state
//	  is still active on return.
//	- xm1, xm2, xm3, xm4 and const_128 are 64-bit memory operands that are
//	  not defined in this file -- presumably they come from aan_idct.h
//	  (TODO confirm).
//	- No per-coefficient scaling is applied here; presumably the caller
//	  supplies coefficients that are already dequantized and pre-scaled as
//	  the AAN algorithm requires (TODO confirm against the caller).
//
////////////////////////////////////////////////////////////////////////////
*F*/
__declspec(naked) void MMX_iDCT8x8AAN(short *coef_block)
{
	__asm {
		
		// Conventions used throughout this block:
		//   eax        = coef_block. [eax][8*k] addresses quadword k, i.e.
		//                four consecutive 16-bit words.
		//   line r     = quadwords 2r (words 0-3) and 2r+1 (words 4-7);
		//                "Vk" / "tmk" in the comments name quadword k.
		//   mm0..mm7   = scratch; each instruction works on four words at once.
		// Constant multiplies are done as psllw (by 2 or 3) followed by pmulhw
		// with one of xm1..xm4. pmulhw keeps the high 16 bits of each signed
		// 16x16 product, so the effective factor is constant*4/65536 (or
		// constant*8/65536 after a shift by 3).

		mov          eax, [esp+4] /*coef_block*/ ; source coeff
		
		//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
		//;;;;;;;;   first 1D pass          ;;;;;;;;;;;
		//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

		// column 0 (even quadwords): even part
		// use V4, V12, V0, V8 to produce V22..V25

		movq         mm0, [eax][8*12]          ; V12
		
		movq         mm1, [eax][8*4]           ; V4
		
		movq         mm3, [eax][8*0]           ; V0
		movq         mm2, mm1                  ; duplicate V4

		movq         mm5, [eax][8*8]           ; V8
		psubw        mm1, mm0                  ; V16 (s1)


		//new three below:
		movq         mm7, xm1					; 23170 ->V18 (s3)
		psllw		 mm1, 2
		
		pmulhw		 mm1, mm7
		paddw        mm2, mm0                  ; V17

		movq         mm0, mm2                  ; duplicate V17
		movq         mm4, mm3                  ; duplicate V0

		paddw        mm3, mm5                  ; V19
		psubw        mm1, mm0                  ; V21, mm0 free
		
		psubw        mm4, mm5                  ; V20, mm5 free
		movq         mm6, mm3                  ; duplicate t74=t81

		paddw        mm3, mm2                  ; V22

		movq         mm5, mm1                  ; duplicate V21
		paddw        mm1, mm4                  ; V23

		// V22, V23 and V24 are parked in quadwords 4, 12 and 0 (their inputs
		// have been consumed); V25 stays in mm6.
		movq         [eax][8*4], mm3           ; V22
		psubw        mm4, mm5                  ; V24, mm5 free

		movq         [eax][8*12], mm1          ; V23
		psubw        mm6, mm2                  ; V25, mm2 free

		movq         [eax][8*0], mm4           ; V24

		// keep mm6 alive all along the next block
		// column 0: odd part
		// use V2, V6, V10, V14 to produce V31, V39, V40, V41

		movq         mm7, [eax][8*10]          ; V10

		movq         mm0, [eax][8*6]           ; V6
		movq         mm3, mm7                  ; duplicate V10

		movq         mm5, [eax][8*2]           ; V2
		psubw        mm7, mm0                  ; V26 (s1/7)

		movq         mm4, [eax][8*14]          ; V14
		paddw        mm3, mm0                  ; V29, free mm0


		//new three:
		movq		mm1, xm2
		movq		mm2, mm7
		
		psllw		mm7, 3
		movq        mm0, mm5                  ; duplicate V2

		pmulhw		mm7, mm1
		paddw		mm5, mm4                  ; V27
		
	
		//next three:
		movq		mm1, xm3
		psubw 		mm0, mm4                  ; (s1) for next, V28, free mm4
		
		movq		mm4, mm5                  ; duplicate t90=t93
		paddw       mm2, mm0

		psllw		mm0, 2
		paddw       mm5, mm3                  ; V31
		
		pmulhw		mm0, mm1
		psllw		mm2, 2
		
		//next three:
		movq		mm1, xm4
		psubw       mm4, mm3                  ; V30 ; free mm3

		psllw		mm4, 2
		pmulhw		mm1, mm2

		//next 3:
		movq		mm3, xm1
		
		movq		mm2,[eax][8*12]				; V23
		pmulhw		mm4, mm3


		psubw       mm0, mm1                  ; V38
		paddw       mm1, mm7                  ; V37, free mm7

		movq        mm7, [eax][8*4]           ; V22
		movq        mm3, mm6                  ; duplicate V25

		psubw       mm1, mm5                  ; V39 (mm5 still needed for next block)

		//**********************************************************

		psubw		mm4, mm1					; V40

		paddw		mm0, mm4					; V41; free mm0

		// column 0: output butterfly
		// tm0/tm14 = V22 +/- V31, tm2/tm12 = V23 +/- V39,
		// tm4/tm10 = V24 +/- V40, tm8/tm6  = V25 +/- V41.
		// Each tmk is written back to quadword k. The first loads for
		// column 1 (V5, V13) are interleaved with the last stores.

		psubw		mm6, mm0					; tm6
		paddw		mm3, mm0					; tm8; free mm1

		movq		mm0, mm1					; line added by Kumar
		movq		mm1, mm7					; duplicate V22

		movq		[eax][8*8], mm3				; tm8; free mm3
		paddw		mm7, mm5					; tm0

		movq		[eax][8*6], mm6				; tm6; free mm6
		psubw		mm1, mm5					; tm14; free mm5

		movq		mm6,  [eax][8*0]			; V24
		movq		mm3, mm2					; duplicate t117=t125

		movq		[eax][8*0], mm7				; tm0; free mm7
		paddw		mm2, mm0					; tm2

		movq		[eax][8*14], mm1			; tm14; free mm1
		psubw		mm3, mm0					; tm12; free mm0

		movq		[eax][8*2], mm2				; tm2; free mm2
		movq		mm0, mm6					; duplicate t119=t123

		movq		[eax][8*12], mm3			; tm12; free mm3
		paddw		mm6, mm4					; tm4

		movq		mm1,  [eax][8*5]			; V5
		psubw		mm0, mm4					; tm10; free mm4

		movq         [eax][8*4], mm6			; tm4; free mm6
		movq         mm2, mm1					; duplicate t128=t130

		movq         mm7, [eax][8*13]			; V13

		movq         [eax][8*10], mm0			; tm10, free mm0
		psubw        mm1, mm7					; V50

		// column 1 (odd quadwords): even part
		// use V5, V13, V1, V9 to produce V56..V59

		movq         mm3, [eax][8*1]			; V1
		paddw        mm2, mm7					; V51

		movq         mm5, [eax][8*9]			; V9

		//next 3:
		movq		mm7, xm1
		psllw		mm1, 2
		pmulhw		mm1, mm7

		movq         mm6, mm2					; duplicate V51


		movq         mm0, [eax][8*11]          ; V11
		movq         mm4, mm3                  ; duplicate V1

		//**********************************************************

		paddw		mm3, mm5					; V53
		psubw		mm4, mm5					; V54 ;mm5 free

		movq		mm7, mm3					; duplicate V53
		psubw		mm1, mm6					; V55 ; mm6 free

		movq		mm6, [eax][8*7]			; V7
		paddw		mm3, mm2					; V56

		movq		mm5, mm4					; duplicate t140=t142
		paddw		mm4, mm1					; V57

		// V56, V57 and V58 are parked in quadwords 5, 13 and 9; V59 stays
		// in mm7.
		movq		[eax][8*5], mm3				; V56
		psubw		mm5, mm1					; V58; mm1 free

		psubw		mm7, mm2					; V59; mm2 free

		movq		[eax][8*13], mm4			; V57
		movq		mm3, mm0					; duplicate V11

		// keep mm7 alive all along the next block
		// column 1: odd part
		// use V3, V7, V11, V15 to produce V65, V73, V74, V75
		movq		[eax][8*9], mm5				; V58
		paddw		mm0, mm6					; V63

		movq		mm4, [eax][8*15]			; V15
		psubw		mm3, mm6					; V60 ; free mm6

		// note that V15 computation has a correction step:
		// this is a 'magic' constant that rebiases the results to be closer to the expected result
		// this magic constant can be refined to reduce the error even more
		// by doing the correction step in a later stage when the number is actually multiplied by 16
		// NOTE(review): no correction constant is applied to V15 in the
		// instructions that follow (V15 is only added to / subtracted from
		// V3); this note looks stale -- confirm.

		movq		mm5, [eax][8*3]	; V3
		movq		mm1, mm3					; duplicate V60

		movq		mm2, xm2
		movq		mm6, mm5					; duplicate V3

		//next 3:
		psllw		mm1, 3
		paddw		mm5, mm4					; V61

		psubw		mm6, mm4					; V62 ; free mm4
		pmulhw		mm1, mm2		

		//next 3:
		movq		mm2, xm1
		movq		mm4, mm5					; duplicate V61
		
		paddw		mm5, mm0					; V65 -> result
		psubw		mm4, mm0					; V64 ; free mm0
		
		movq		mm0, xm4
		psllw		mm4, 2
		
		//next 3:
		pmulhw		mm4, mm2
		paddw		mm3, mm6					; V66

		//next 3:	
		movq		mm2, xm3
		psllw		mm3, 2
		
		psllw		mm6, 2
		pmulhw		mm3, mm0
		
		//**********************************************************

		movq		mm0, [eax][8*5]				; V56
		pmulhw		mm6, mm2

		psubw		mm6, mm3					; V72
		paddw		mm3, mm1					; V71

		psubw		mm3, mm5					; V73
		movq		mm1, mm0					; duplicate t177=t188

		psubw		mm4, mm3					; V74
		paddw		mm0, mm5					; tm1

		paddw		mm6, mm4					; V75

		//location (state at this point)
		//  5 - V56
		// 13 - V57
		//  9 - V58
		//  X - V59, mm7
		//  X - V65, mm5
		//  X - V73, mm3
		//  X - V74, mm4
		//  X - V75, mm6
		// mm0 = tm1, mm1 = copy of V56 (becomes tm15 below), mm2 free

		// column 1: output butterfly
		// tm1/tm15 = V56 +/- V65, tm3/tm13 = V57 +/- V73,
		// tm5/tm11 = V58 +/- V74, tm9/tm7  = V59 +/- V75.
		// tm1, tm3, tm5, tm7 are stored; tm9, tm11, tm13, tm15 stay in
		// mm5, mm6, mm3, mm1 and feed the transpose directly.

		movq		mm2,  [eax][8*13]			; V57
		psubw		mm1, mm5					; tm15; free mm5

		//save the store as used directly in the transpose
		movq		[eax][8*1], mm0				; tm1; free mm0
		movq		mm5, mm7                    ; duplicate t182=t184

		movq		mm0,  [eax][8*9]			; V58
		psubw		mm7, mm6					; tm7

		paddw		mm5, mm6					; tm9; free mm6
		movq		mm6, mm3

		movq		[eax][8*7], mm7				; tm7; free mm7
		movq		mm3, mm2					; duplicate V57

		psubw		mm3, mm6					; tm13
		paddw		mm2, mm6					; tm3 ; free mm6

		movq		mm6, mm0					; duplicate V58

		movq		[eax][8*3], mm2				; tm3; free mm2
		paddw		mm0, mm4					; tm5

		psubw		mm6, mm4					; tm11; free mm4

		movq		[eax][8*5], mm0				; tm5; free mm0

		movq		mm0, mm5					; copy w4---0,1,3,5,6

		// In-place transpose of the whole 8x8 block, one 4x4 quadrant at a
		// time (punpck?wd interleaves words, punpck?dq interleaves dwords).
		// transpose the bottom right quadrant(4X4) of the matrix
		//  ---------       ---------
		// | M1 | M2 |     | M1'| M3'|
		//  ---------  -->  ---------
		// | M3 | M4 |     | M2'| M4'|
		//  ---------       ---------
		// Inputs are tm9, tm11, tm13, tm15 in mm5, mm6, mm3, mm1; the result
		// is stored to quadwords 9, 11, 13, 15.

		punpcklwd	mm5, mm6					;

		punpckhwd	mm0, mm6					;---0,1,3,5,6 

		movq		mm6,  [eax][8*0]			;get w0 of top left quadrant
		movq		mm2, mm3					;---0,1,2,3,5,6

		movq		mm7,  [eax][8*2]			;get w1 of top left quadrant
		punpcklwd	mm3, mm1					;

		punpckhwd	mm2, mm1					;---0,2,3,5,6,7
		movq		mm4, mm5					;---0,2,3,4,5,6,7

		punpckldq	mm5, mm3					; transposed w4

		punpckhdq	mm4, mm3					; transposed w5---0,2,4,6,7

		movq		[eax][8*9], mm5				; store w4
		movq		mm3, mm0					;---0,2,3,4,6,7

		movq		[eax][8*11], mm4			; store w5
		punpckldq	mm0, mm2					; transposed w6

		punpckhdq	mm3, mm2					; transposed w7---0,3,6,7
		movq		mm5, mm6					; copy w0

		movq		[eax][8*13], mm0			; store w6---3,5,6,7	
		punpcklwd	mm6, mm7

		movq		[eax][8*15], mm3			; store w7---5,6,7
		punpckhwd	mm5, mm7					;---5,6,7

		// transpose the top left quadrant(4X4) of the matrix
		// (quadwords 0, 2, 4, 6; written back to the same quadwords)

		movq		mm7,  [eax][8*4]			; get w2 of TL quadrant

		movq		mm4,  [eax][8*6]			; get w3 of TL quadrant
		movq		mm3, mm7					; copy w2---3,4,5,6,7
	
		movq		mm2, mm6
		punpcklwd	mm7, mm4					;---2,3,4,5,6,7

		punpckhwd	mm3, mm4					;---2,3,4,5,6,7
		movq		mm4, mm5					;	

		punpckldq	mm6, mm7					;---1,2,3,4,5,6,7

		movq		mm1, mm5
		punpckhdq	mm2, mm7					;---1,2,3,4,5,6,7

		movq		[eax][8*0], mm6				; store w0 of TL quadrant
		punpckldq	mm5, mm3					;---1,2,3,4,5,6,7

		movq		[eax][8*2], mm2				; store w1 of TL quadrant
		punpckhdq	mm1, mm3					;---1,2,3,4,5,6,7

		movq		[eax][8*4], mm5				; store w2 of TL quadrant

		movq		[eax][8*6], mm1				; store w3 of TL quadrant

		// transpose the top right quadrant(4X4) of the matrix
		// (quadwords 1, 3, 5, 7; the result goes to quadwords 8, 10, 12, 14,
		// each of which is loaded into a register before it is overwritten)

		movq		mm0,  [eax][8*1]			;---0

		movq		mm1,  [eax][8*3]			;---0,1,2
		movq		mm2, mm0

		movq		mm3,  [eax][8*5]
		punpcklwd	mm0, mm1					;---0,1,2,3

		punpckhwd	mm2, mm1

		movq		mm1,  [eax][8*7]			;---0,1,2,3
		movq		mm4, mm3

		punpcklwd	mm3, mm1					;---0,1,2,3,4

		punpckhwd	mm4, mm1					;---0,1,2,3,4
		movq		mm1, mm0

		movq		mm5, mm2
		punpckldq	mm0, mm3					;---0,1,2,3,4,5

		punpckhdq	mm1, mm3					;---0,1,2,3,4,5

		movq		mm3,  [eax][8*8]
		punpckldq	mm2, mm4					;---1,2,3,4,5

		movq		 [eax][8*8], mm0
		punpckhdq	mm5, mm4					;---1,2,3,4,5

		movq		mm4,  [eax][8*10]

		// transpose the bottom left quadrant(4X4) of the matrix
		// Also store w1,w2,w3 of top right quadrant into
		// w5,w6,w7 of bottom left quadrant. Storing w0 of TR in w4
		// of BL is already done.
		// The transposed bottom left quadrant is stored to quadwords
		// 1, 3, 5, 7.

		movq		[eax][8*10], mm1
		movq		mm1, mm3					;---1,2,3,4,5

		movq		mm0,  [eax][8*12]
		punpcklwd	mm3, mm4					;---0,1,2,3,4,5

		punpckhwd	mm1, mm4					;---0,1,2,3,4,5

		movq		mm4,  [eax][8*14]

		movq		[eax][8*12], mm2
		movq		mm2, mm0

		movq		[eax][8*14], mm5
		punpcklwd	mm0, mm4					;---0,1,2,3,4

		punpckhwd	mm2, mm4					;---0,1,2,3,4
		movq		mm4, mm3

		movq		mm5, mm1
		punpckldq	mm3, mm0					;---0,1,2,3,4,5

		punpckhdq	mm4, mm0					;---1,2,4,5

		movq		[eax][8*1], mm3
		punpckldq	mm1, mm2					;---1,2,5

		movq		[eax][8*3], mm4
		punpckhdq	mm5, mm2					;---5

		movq		[eax][8*5], mm1

		movq		[eax][8*7], mm5

		//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
		//;;;;;;;;   second 1D pass         ;;;;;;;;;;;
		//;;;;;;;;   (on the transposed data) ;;;;;;;;;
		//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

		// Same arithmetic as the first pass, with a slightly different
		// instruction schedule.

		// column 0: even part
		// use V4, V12, V0, V8 to produce V22..V25

		movq		mm0,[eax][8*12]				; V12

		movq		mm1,[eax][8*4]				; V4

		movq		mm3,[eax][8*0]				; V0
		movq		mm2, mm1					; duplicate V4

		movq		mm5,[eax][8*8]				; V8
		psubw		mm1, mm0					; V16

		//next 3:
		movq		mm6, xm1
		paddw		mm2, mm0					; V17

		psllw		mm1, 2
		movq		mm0, mm2					; duplicate V17
		
		pmulhw		mm1, mm6
		movq		mm4, mm3					; duplicate V0

		//**********************************************************

		paddw		mm3, mm5					; V19
		psubw		mm4, mm5					; V20 ;mm5 free

		movq		mm6, mm3					; duplicate t74=t81
		psubw		mm1, mm0					; V21 ; mm0 free

		paddw		mm3, mm2					; V22
		movq		mm5, mm1					; duplicate V21

		paddw		mm1, mm4					; V23

		movq		[eax][8*4], mm3				; V22
		psubw		mm4, mm5					; V24; mm5 free

		movq		[eax][8*12], mm1			; V23
		psubw		mm6, mm2					; V25; mm2 free

		movq		[eax][8*0], mm4				; V24

		// keep mm6 alive all along the next block
		// column 0: odd part
		// use V2, V6, V10, V14 to produce V31, V39, V40, V41

		movq		mm7,  [eax][8*10]			; V10

		movq		mm0,  [eax][8*6]			; V6
		movq		mm3, mm7					; duplicate V10

		movq		mm5,  [eax][8*2]			; V2
		psubw		mm7, mm0					; V26

		movq		mm4,  [eax][8*14]			; V14
		paddw		mm3, mm0					; V29 ; free mm0


		movq		mm2, xm2
		movq		mm1, mm7					; duplicate V26
		//next 3:
		
		movq		mm0, mm5						; duplicate V2
		psllw		mm7, 3

		pmulhw		mm7, mm2
		paddw		mm5, mm4						; V27
		
		
		//**********************************************************
		//next 3:
		movq		mm2,  xm3
		psubw		mm0, mm4						; V28 ; free mm4

		paddw		mm1, mm0						; V32 ; free mm2
		psllw		mm0, 2
		
		psllw		mm1, 2
		pmulhw		mm0, mm2

		//next 3:
		movq		mm2, xm4
		movq		mm4, mm5						; duplicate t90=t93

		pmulhw		mm1, mm2
		paddw		mm5, mm3						; V31

		//**********************************************************

		
		//next 3:
		movq		mm2, xm1
		psubw		mm4, mm3						; V30 ; free mm3

		//**********************************************************
		psllw		mm4, 2
		psubw		mm0, mm1						; V38

		paddw		mm1, mm7						; V37 ; free mm7
		pmulhw		mm4, mm2

		movq		mm2,  [eax][8*12]				; V23
		psubw		mm1, mm5						; V39 (mm5 still needed for next block)

		movq		mm7,  [eax][8*4]				; V22
		movq		mm3, mm6                       ; duplicate V25

		//**********************************************************						  

		psubw		mm4, mm1						; V40

		paddw		mm0, mm4						; V41; free mm0

		// column 0: output butterfly

		psubw		mm6, mm0						; tm6
		paddw		mm3, mm0						; tm8; free mm1

		movq		mm0, mm1						; line added by Kumar
		movq		mm1, mm7						; duplicate V22

		movq		[eax][8*8], mm3					; tm8; free mm3
		paddw		mm7, mm5						; tm0

		movq		[eax][8*6], mm6					; tm6; free mm6
		psubw		mm1, mm5						; tm14; free mm5

		movq		mm6,  [eax][8*0]				; V24
		movq		mm3, mm2						; duplicate t117=t125

		movq		[eax][8*0], mm7					; tm0; free mm7
		paddw		mm2, mm0						; tm2

		movq		[eax][8*14], mm1				; tm14; free mm1
		psubw		mm3, mm0						; tm12; free mm0

		movq		[eax][8*2], mm2					; tm2; free mm2
		movq		mm0, mm6						; duplicate t119=t123

		movq		[eax][8*12], mm3				; tm12; free mm3
		paddw		mm6, mm4						; tm4

		movq		mm1,[eax][8*5]					; V5
		psubw		mm0, mm4						; tm10; free mm4

		movq		[eax][8*4], mm6					; tm4; free mm6
		movq		mm2, mm1						; duplicate t128=t130

		movq		[eax][8*10], mm0				; tm10; free mm0

		// column 1: even part
		// use V5, V13, V1, V9 to produce V56..V59

		movq		mm7,  [eax][8*13]				; V13

		movq		mm3,  [eax][8*1]				; V1
		psubw		mm1, mm7						; V50

		movq		mm5,  [eax][8*9]				; V9
		paddw		mm2, mm7						; V51

		//next 3:
		movq		mm0, xm1
		psllw		mm1, 2
		
		movq		mm4, mm3						; duplicate V1
		pmulhw		mm1, mm0

		paddw		mm3, mm5						; V53
		psubw		mm4, mm5						; V54 ;mm5 free

		movq		mm7, mm3						; duplicate V53
		paddw		mm3, mm2						; V56

		movq		mm6, mm2						; duplicate V51
		psubw		mm1, mm2						; V55 ; mm6 free
		
		//**********************************************************

		movq		mm5, mm4						; duplicate t140=t142
		paddw		mm4, mm1						; V57

		movq		[eax][8*5], mm3					; V56
		psubw		mm5, mm1						; V58; mm1 free

		movq		[eax][8*13], mm4				; V57
		psubw		mm7, mm2						; V59; mm2 free

		movq		[eax][8*9], mm5					; V58

		// keep mm7 alive all along the next block
		// column 1: odd part
		// use V3, V7, V11, V15 to produce V65, V73, V74, V75

		movq		mm0,[eax][8*11]					; V11

		movq		mm6,[eax][8*7]					; V7
		movq		mm3, mm0						; duplicate V11

		movq		mm4,[eax][8*15]					; V15
		paddw		mm0, mm6						; V63

		movq		mm5,[eax][8*3]					; V3
		psubw		mm3, mm6						; V60 ; free mm6

		// note that V15 computation has a correction step:
		// this is a 'magic' constant that rebiases the results to be closer to the expected result
		// this magic constant can be refined to reduce the error even more
		// by doing the correction step in a later stage when the number is actually multiplied by 16
		// NOTE(review): no correction constant is applied to V15 in the
		// instructions that follow (V15 is only added to / subtracted from
		// V3); this note looks stale -- confirm.

		movq		mm1, mm3						; duplicate V60

//		//next 3:
		movq		mm6, xm2
		psllw		mm1, 3
		
		pmulhw		mm1, mm6
		movq		mm6, mm5						; duplicate V3

		paddw		mm5, mm4						; V61
		psubw		mm6, mm4						; V62 ; free mm4

		movq		mm4, mm5						; duplicate V61
		paddw		mm5, mm0						; V65 -> result

		movq		mm2, xm1
		psubw		mm4, mm0						; V64 ; free mm0

		//**********************************************************

		//next 3:
		psllw		mm4, 2
		paddw		mm3, mm6						; V66

		pmulhw		mm4, mm2
		psllw		mm3, 2
		
		//next 3:
		movq		mm2,  xm4
		psllw		mm6, 2
		
		movq		mm0,  [eax][8*5]				; V56
		pmulhw		mm3, mm2

		//**********************************************************

		//next 3:
		movq		mm2, xm3
		
		pmulhw		mm6, mm2

		//**********************************************************
			
		psubw		mm6, mm3						; V72
		paddw		mm3, mm1						; V71 ; free mm1

		psubw		mm3, mm5						; V73 ; free mm2
		movq		mm1, mm0						; duplicate t177=t188

		psubw		mm4, mm3						; V74
		paddw		mm0, mm5						; tm1

		movq		mm2,  [eax][8*13]				; V57
		paddw		mm6, mm4						; V75

		//location (state at this point)
		//  5 - V56
		// 13 - V57 (also already loaded into mm2)
		//  9 - V58
		//  X - V59, mm7
		//  X - V65, mm5
		//  X - V73, mm3
		//  X - V74, mm4
		//  X - V75, mm6
		// mm0 = tm1, mm1 = copy of V56 (becomes tm15 below)

		// column 1: output butterfly

		movq		[eax][8*1], mm0					; tm1; free mm0
		psubw		mm1, mm5						; tm15; free mm5

		//save the store as used directly in the transpose
		movq		mm5, mm7						; duplicate t182=t184
		psubw		mm7, mm6						; tm7

		paddw		mm5, mm6						; tm9; free mm3
		movq		mm6, mm3

		movq		mm0,  [eax][8*9]				; V58
		movq		mm3, mm2						; duplicate V57

		movq		[eax][8*7], mm7					; tm7; free mm7
		psubw		mm3, mm6						; tm13

		paddw		mm2, mm6						; tm3 ; free mm6
		movq		mm6, mm0						; duplicate V58

		paddw		mm0, mm4						; tm5
		psubw		mm6, mm4						; tm11; free mm4

		movq		[eax][8*3], mm2					; tm3; free mm2

		movq		[eax][8*5], mm0					; tm5; free mm0
		movq		mm0, mm5						; copy w4---0,1,3,5,6

		// Final results to be stored after the transpose
		// transpose the bottom right quadrant(4X4) of the matrix
		//  ---------       ---------
		// | M1 | M2 |     | M1'| M3'|
		//  ---------  -->  ---------
		// | M3 | M4 |     | M2'| M4'|
		//  ---------       ---------
		//
		// In this final transpose every transposed quadword is converted
		// before it is stored:
		//   psraw 5            arithmetic shift right by 5
		//   paddw const_128    add the bias constant (kept in mm1, later mm7)
		//   packuswb           saturate each word to an unsigned byte (0..255)
		//   punpcklbw zero     widen the four bytes back to four 16-bit words
		// and written through eax at (DCTWIDTH*row)+(DATASIZE*column).
		// NOTE(review): earlier comments here mentioned a pointer to an array
		// "range" and a separately calculated destination address; neither
		// exists in this code -- clamping is done by packuswb and the
		// destination is the input block itself.

		punpcklwd	mm5, mm6

		punpckhwd	mm0, mm6						;---0,1,3,5,6
		movq		mm2, mm3						;---0,1,2,3,5,6
 
		movq		mm6,  [eax][8*0]				;get w0 of top left quadrant
		punpcklwd	mm3, mm1						

		movq		mm7,  [eax][8*2]				;get w1 of top left quadrant
		punpckhwd	mm2, mm1						;---0,2,3,5,6,7

		movq		mm4, mm5						;---0,2,3,4,5,6,7
		punpckldq	mm5, mm3						; transposed w4

		movq		mm1, const_128
		punpckhdq	mm4, mm3						; transposed w5---0,2,4,6,7

		psraw		mm5, 5
		pxor		mm3, mm3

		paddw		mm5, mm1
		
		packuswb	mm5, mm5
		
		punpcklbw	mm5, mm3
		
		movq		mm3, mm0						;---0,2,3,4,6,7

		punpckldq	mm0, mm2						; transposed w6

		movq		[eax][(DCTWIDTH*4)+(DATASIZE*4)], mm5
		movq		mm5, mm6						; copy w0

		psraw		mm4, 5

		paddw		mm4, mm1
		punpckhdq	mm3, mm2						; transposed w7---0,3,6,7
		
		pxor		mm2, mm2
		psraw		mm0, 5

		packuswb	mm4, mm4
		punpcklbw	mm4, mm2


		movq		[eax][(DCTWIDTH*5)+(DATASIZE*4)], mm4
		pxor		mm4, mm4

		paddw		mm0, mm1

		psraw		mm3, 5


		packuswb	mm0, mm0
		punpcklbw	mm0, mm4


		movq		[eax][(DCTWIDTH*6)+(DATASIZE*4)], mm0

		paddw		mm3, mm1
		punpcklwd	mm6, mm7

		movq		mm2, mm6
		
		punpckhwd	mm5, mm7						;---5,6,7

		packuswb	mm3, mm3
		punpcklbw	mm3, mm4


		movq		[eax][(DCTWIDTH*7)+(DATASIZE*4)], mm3

		// transpose the top left quadrant(4X4) of the matrix
		// (rows 0-3, columns 0-3 of the output)

		movq		mm4,  [eax][8*6]				; get w3 of TL quadrant

		movq		mm7,  [eax][8*4]				; get w2 of TL quadrant

		movq		mm3, mm7						; copy w2---3,4,5,6,7
		punpcklwd	mm7, mm4						;---2,3,4,5,6,7

		punpckhwd	mm3, mm4						;---2,3,4,5,6,7
		movq		mm4, mm5						;	

		punpckldq	mm6, mm7						;---1,2,3,4,5,6,7

		psraw		mm6, 5

		paddw		mm6, mm1
		punpckhdq	mm2, mm7						;---1,2,3,4,5,6,7
		
		pxor	mm7, mm7
		packuswb	mm6, mm6
		punpcklbw	mm6, mm7
		

		// From here on the const_128 value lives in mm7 (mm1 is reused as
		// data) and mm6 is kept at zero for the punpcklbw widening.
		movq		mm7, mm1
		movq		mm1, mm5


		movq		[eax][(DCTWIDTH*0)+(DATASIZE*0)], mm6

		pxor		mm6, mm6
		
		psraw		mm2, 5

		paddw		mm2, mm7
		punpckldq	mm5, mm3				;---1,2,3,4,5,6,7

		
		packuswb	mm2, mm2
		punpcklbw	mm2, mm6


		movq		[eax][(DCTWIDTH*1)+(DATASIZE*0)], mm2
		
		psraw		mm5, 5

		paddw		mm5, mm7
		punpckhdq	mm1, mm3					;---1,2,3,4,5,6,7

		packuswb	mm5, mm5
		punpcklbw	mm5, mm6

		movq		[eax][(DCTWIDTH*2)+(DATASIZE*0)], mm5
		
		psraw		mm1, 5

		paddw		mm1, mm7

		movq		mm0, [eax][8*1]				;---0
		movq		mm2, mm0

		packuswb	mm1, mm1
		punpcklbw	mm1, mm6

		movq		[eax][(DCTWIDTH*3)+(DATASIZE*0)], mm1

		// transpose the top right quadrant(4X4) of the matrix
		// (quadwords 1, 3, 5, 7); the converted result is stored to the
		// bottom left quadrant of the output (rows 4-7, columns 0-3), whose
		// old contents are loaded into registers before being overwritten.

		movq		mm1, [eax][8*3]				;---0,1,2

		movq		mm3, [eax][8*5]
		punpcklwd	mm0, mm1					;---0,1,2,3

		punpckhwd	mm2, mm1
		movq		mm4, mm3

		movq		mm1,  [eax][8*7]				;---0,1,2,3
		movq		mm5, mm2

		punpcklwd	mm3, mm1						;---0,1,2,3,4

		punpckhwd	mm4, mm1						;---0,1,2,3,4
		movq		mm1, mm0

		punpckldq	mm0, mm3						;---0,1,2,3,4,5

		punpckhdq	mm1, mm3						;---0,1,2,3,4,5

		movq		mm3,  [eax][8*8]

		psraw		mm0, 5


		paddw		mm0, mm7
		punpckldq	mm2, mm4						;---1,2,3,4,5

		punpckhdq	mm5, mm4						;---1,2,3,4,5

		packuswb	mm0, mm0
		punpcklbw	mm0, mm6


		movq		[eax][(DCTWIDTH*4)+(DATASIZE*0)], mm0
		psraw		mm2, 5

		movq		mm4,  [eax][8*10]
		paddw		mm2, mm7

		// transpose the bottom left quadrant(4X4) of the matrix
		// Also store w1,w2,w3 of top right quadrant into
		// w5,w6,w7 of bottom left quadrant. Storing w0 of TR in w4
		// of BL is already done.

		movq		mm0,  [eax][8*12]
		psraw		mm1, 5

		paddw		mm1, mm7
		packuswb	mm2, mm2
		
		packuswb	mm1, mm1
		punpcklbw	mm1, mm6


		movq		[eax][(DCTWIDTH*5)+(DATASIZE*0)], mm1
		movq		mm1, mm3						;---1,2,3,4,5

		punpcklwd	mm3, mm4						;---0,1,2,3,4,5

		punpckhwd	mm1, mm4						;---0,1,2,3,4,5

		movq		mm4,  [eax][8*14]
		punpcklbw	mm2, mm6

		
		movq		[eax][(DCTWIDTH*6)+(DATASIZE*0)], mm2
		movq		mm2, mm0

		psraw		mm5, 5

		paddw		mm5, mm7
		punpcklwd	mm0, mm4						;---0,1,2,3,4

		punpckhwd	mm2, mm4						;---0,1,2,3,4
		movq		mm4, mm3

		packuswb	mm5, mm5
		punpcklbw	mm5, mm6

		punpckldq	mm3, mm0						;---0,1,2,3,4,5


		movq		[eax][(DCTWIDTH*7)+(DATASIZE*0)], mm5
		movq		mm5, mm1

		// The transposed bottom left quadrant is converted and stored to the
		// top right quadrant of the output (rows 0-3, columns 4-7).

		psraw		mm3, 5

		paddw		mm3, mm7
		punpckhdq	mm4, mm0						;---1,2,4,5

		packuswb	mm3, mm3
		
		punpcklbw	mm3, mm6

		movq		[eax][(DCTWIDTH*0)+(DATASIZE*4)], mm3
		psraw		mm4, 5
		
		paddw		mm4, mm7
		punpckldq	mm1, mm2						;---1,2,5

		packuswb	mm4, mm4
		punpcklbw	mm4, mm6

		movq		[eax][(DCTWIDTH*1)+(DATASIZE*4)], mm4
		
		psraw		mm1, 5

		paddw		mm1, mm7
		punpckhdq	mm5, mm2						;---5
		
		packuswb	mm1, mm1
		punpcklbw	mm1, mm6

		movq		[eax][(DCTWIDTH*2)+(DATASIZE*4)], mm1
		psraw		mm5, 5

		paddw		mm5, mm7

		packuswb	mm5, mm5
		punpcklbw	mm5, mm6

		movq		[eax][(DCTWIDTH*3)+(DATASIZE*4)], mm5

		// Naked function: return directly. No EMMS is issued here, so the
		// MMX state is still active when control returns to the caller.
		ret

	} // end of __asm

} // end of MMX_iDCT8x8AAN

#pragma optimize("",on)	
