                TITLE   faa2x64a_p3
                .686
                .mmx
                .xmm
                .MODEL  FLAT
;
_ACODE          SEGMENT	PARA USE32 PUBLIC 'CODE'
                ASSUME  CS:_ACODE
                PUBLIC  _faa2x64a_p3_       			; Entry point name

;-----------------------------------------------------------------------
; _faa2x64a_p3_  (Pentium III version: uses MMX plus pshufw)
;
; Adds a byte array to its own byte-reversed image, 16 bytes per loop
; pass, with decimal carry propagation between bytes.
; NOTE(review): the arithmetic assumes every input byte holds one
; decimal digit 0..9, least significant digit first - confirm with the
; caller.
;
; Stack arguments (all read-only here; plain "ret", caller cleans up):
;   [ebp+8]   dt1  address of source array
;   [ebp+12]  dt2  address of destination array
;   [ebp+16]  len  address of a dword holding the length in bytes
;
; Both arrays are accessed starting 32 bytes past the address passed in.
; The loop is a do-while handling 16 bytes per pass, so the right-hand
; pointer can run below dt1+32 on the final pass.
; NOTE(review): the 32-byte offset presumably provides that padding -
; confirm with the caller.
;
; Register roles inside the loop:
;   edx  right-hand source pointer, walks downward 4 bytes at a time
;   edi  left-hand source pointer, walks upward 8 bytes at a time
;   esi  destination pointer, walks upward 8 bytes at a time
;   ecx  loop bound (dt1 + 16), invariant
;   mm7  carry (0 or 1) out of the previous 8-byte group
;   eax, ebx, mm0, mm3  scratch
;
; All general registers used are saved and restored; MMX state is
; released with emms before returning.
;-----------------------------------------------------------------------

FAA_START_OFF   EQU     32              ; byte offset applied to dt1 and dt2
FAA_LOCALS      EQU     32              ; four qword constants kept on our own stack

; Per-dword constant patterns (each is replicated into both halves of a qword)
FAA_BIAS        EQU     076F6F6F6H      ; pushes a byte sum of 10 or more over into the next byte;
                                        ; the top byte uses 76h so the dword carry lands in bit 31
FAA_SIX_MASK    EQU     060606060H      ; selects bits that stay set only in bytes that did not carry
FAA_NIB_MASK    EQU     00F0F0F0FH      ; keeps the low nibble (the result digit) of every byte

; Offsets of the qword constants below ebp
FAA_OFF_BIAS    EQU     32
FAA_OFF_SIX     EQU     24
FAA_OFF_NIB     EQU     16
FAA_OFF_CARRY   EQU     8

;-----------------------------------------------------------------------
; FAA_ADD8 acc
; Processes one 8-byte group using the MMX register given as argument.
; In:    edx, edi, esi, mm7 as described in the procedure header
; Out:   8 result bytes stored at [esi]; edx lowered by 8; edi and esi
;        raised by 8; mm7 holds the carry out of this group
; Clobb: eax, ebx, mm0, mm3, flags
;-----------------------------------------------------------------------
FAA_ADD8        MACRO   acc
    mov    eax,    [edx]                        ; four bytes from the right-hand end
    bswap  eax                                  ; reverse their order
    sub    edx,    4
    mov    ebx,    [edx]                        ; the four bytes before them
    bswap  ebx                                  ; reverse their order
    sub    edx,    4
    movd   mm0,    eax                          ; first four reversed bytes become the low dword
    movd   acc,    ebx
    psllq  acc,    32                           ; next four reversed bytes become the high dword
    por    acc,    mm0                          ; eight reversed bytes in one qword

    paddd  acc,    [edi]                        ; add eight in-order bytes from the left-hand end
    paddd  acc,    mm7                          ; add the carry from the previous group
    add    edi,    8
    paddd  acc,    qword ptr [ebp-FAA_OFF_BIAS] ; bias every byte so decimal carries ripple upward
    pshufw mm3,    acc, 0E4H                    ; 0E4h is the identity shuffle, i.e. a plain copy
    psllq  mm3,    1                            ; bit 31 (carry out of the low dword) moves to bit 32
    pand   mm3,    qword ptr [ebp-FAA_OFF_CARRY]; keep only bit 32
    paddd  acc,    mm3                          ; feed that carry into the high dword
    pshufw mm7,    acc, 0E4H                    ; copy the biased sum
    pand   mm7,    qword ptr [ebp-FAA_OFF_SIX]  ; 60h remains in each byte that did not carry
    psrlq  mm7,    4                            ; 60h becomes 06h inside the same byte
    psubd  acc,    mm7                          ; take the surplus 6 back out of those bytes
    pshufw mm7,    acc, 0E4H                    ; copy again while bit 63 is still intact
    pand   acc,    qword ptr [ebp-FAA_OFF_NIB]  ; strip the high nibbles, leaving the result digits
    psrlq  mm7,    63                           ; bit 63 is the carry into the next group
    movq   [esi],  acc                          ; store eight result bytes
    nop
    add    esi,    8
ENDM

_faa2x64a_p3_    proc    near
    ; Prologue: standard frame, private scratch area, then save every
    ; general register this routine touches.
    push   ebp
    mov    ebp,    esp                          ; ebp = frame pointer, arguments start at [ebp+8]
    sub    esp,    FAA_LOCALS                   ; locals live at [ebp-32]..[ebp-1], not in the caller frame
    push   ecx
    push   ebx
    push   esi
    push   edi
    push   edx
    push   eax

    ; Pointer setup, done in registers so the argument slots stay untouched.
    mov    edi,    dword ptr [ebp+8]
    add    edi,    FAA_START_OFF                ; edi = dt1 + 32, left-hand source pointer
    mov    esi,    dword ptr [ebp+12]
    add    esi,    FAA_START_OFF                ; esi = dt2 + 32, destination pointer
    mov    ebx,    dword ptr [ebp+16]           ; ebx = address of the length
    mov    edx,    edi
    add    edx,    [ebx]                        ; edx = one past the last source byte
    sub    edx,    4                            ; edx = last dword of the source
    lea    ecx,    [edi-16]                     ; loop bound, hoisted: it never changes
    pxor   mm7,    mm7                          ; no carry into the first group

    ; Build the four qword constants in the local scratch area.
    mov    dword ptr [ebp-FAA_OFF_BIAS],    FAA_BIAS
    mov    dword ptr [ebp-FAA_OFF_BIAS+4],  FAA_BIAS
    mov    dword ptr [ebp-FAA_OFF_SIX],     FAA_SIX_MASK
    mov    dword ptr [ebp-FAA_OFF_SIX+4],   FAA_SIX_MASK
    mov    dword ptr [ebp-FAA_OFF_NIB],     FAA_NIB_MASK
    mov    dword ptr [ebp-FAA_OFF_NIB+4],   FAA_NIB_MASK
    mov    dword ptr [ebp-FAA_OFF_CARRY],   0   ; low dword clear
    mov    dword ptr [ebp-FAA_OFF_CARRY+4], 1   ; only bit 32 of the qword set

    ; Main loop: two 8-byte groups per pass, using separate accumulators.
f2a:
    FAA_ADD8 mm1
    FAA_ADD8 mm2

    mov    eax,    [edx-8]                      ; touch the data the next pass will read (right side)
    mov    eax,    [edi+8]                      ; touch the data the next pass will read (left side)
    cmp    edx,    ecx
    jae    f2a                                  ; unsigned: these are addresses; loop while edx is not below dt1+16

    ; Epilogue: release MMX state, restore registers in reverse push order.
    emms
    pop    eax
    pop    edx
    pop    edi
    pop    esi
    pop    ebx
    pop    ecx
    mov    esp,    ebp                          ; discard the local scratch area
    pop    ebp
    ret
_faa2x64a_p3_    endp
    
                     
    _ACODE          ENDS                    	; End of segment
                    END