;-----------------------------------------------------------------------------
;
; This file contains the general span parsing code combined with loop code.
;
;
; WARNING WARNING WARNING
; This asm file generated from mas file.
; EDIT THE MAS FILE.
; I warned you.
; WARNING WARNING WARNING
;
;-----------------------------------------------------------------------------

INCLUDE iammx.inc
INCLUDE offs_acp.inc

include(`m4hdr.mh')dnl
include(`cvars.mh')dnl
include(`texaddra.mh')dnl

EXTERN g_uDitherValue:MMWORD

.586
.model flat


; Big separating lines separate code into span code
; and loop code.  If span and loop are not going to
; end up being combined then it will be easy to
; separate the code.


.data


;-----------------------------------------------------------------------------
; Span Variables
; These are file statics, so the span routine is not reentrant.
StackPos    dd  ?                   ; esp saved right after the entry push of ebp, reloaded on exit
uSpans      dd  ?                   ; spans still to be walked in the current primitive
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; Loop Variables

;// Table is needed to get starting value for dither, but can use xor trick afterwards to generate consecutive values.
;// Need to compare table based method with Xor method and compare timing/memory usage.  It is good to keep xor method
;// around since it can be used more efficiently when there are more free registers (i.e. a monolithic routine Probably
;// only enough registers to do it in a gouraud or gouraud/specular case).
;static UINT64 uMMXDitherTable[16] =
;{
;    0x0000000000000000 >> 6,   0x0000800080008000 >> 6,   0x0000200020002000 >> 6,   0x0000a000a000a000 >> 6,
;    0x0000c000c000c000 >> 6,   0x0000400040004000 >> 6,   0x0000e000e000e000 >> 6,   0x0000600060006000 >> 6,
;    0x0000300030003000 >> 6,   0x0000b000b000b000 >> 6,   0x0000100010001000 >> 6,   0x0000900090009000 >> 6,
;    0x0000f000f000f000 >> 6,   0x0000700070007000 >> 6,   0x0000d000d000d000 >> 6,   0x0000500050005000 >> 6
;};

; Sixteen 64 bit entries, each holding one value repeated in its low three
; words with the top word zero.  Unlike the C table above, the entries here
; are stored unshifted; the span setup code shifts each word right by 6
; after it loads an entry.
uMMXDitherTable dq  000000000000000h , 000800080008000h , 000200020002000h , 000a000a000a000h
                dq  000c000c000c000h , 000400040004000h , 000e000e000e000h , 000600060006000h
                dq  000300030003000h , 000b000b000b000h , 000100010001000h , 000900090009000h
                dq  000f000f000f000h , 000700070007000h , 000d000d000d000h , 000500050005000h

; Per word multipliers applied with pmullw to the dither value and masks.
; Low word to high word: 2, 1, 2, 0 for 565 and 2, 2, 2, 0 for 555.
u565MultShifter dq 00000000200010002h
u555MultShifter dq 00000000200020002h
; 4 in each of the low three words; added to negative fog deltas during fog setup.
uFogDXAdd       dq 00000000400040004h

; Signed per pixel steps for the surface and Z pointers; negated for spans
; that are walked with X decreasing.
iSurfaceStep    dd  ?
iZStep          dd  ?

; Working dither masks.  Both are rebuilt from the InitVal constants below
; at the start of every span.
uDitherXorXorMask   dq  0
uDitherXorMask      dq  0

; Unshifted starting values for the two masks above.
uDitherXorXorMaskInitVal    dq  0000200020002000h
uDitherXorMaskInitVal       dq  0000800080008000h

; Pixels left in the current span; loaded from the span and counted down
; once per pixel.
uPix            dd  ?

;-----------------------------------------------------------------------------

.code

;HRESULT MMX_RenderSpansAny(PD3DI_RASTCTX pCtx)
;{
; Walks every primitive hanging off pCtx->pPrim and every span stored after
; each primitive.  Register roles for the whole routine:
;   ebx = pCtx, ecx = pP (current primitive), ebp = pS (current span)
; Returns S_OK (zero) in eax.  Uses the static StackPos to rebuild esp on
; exit, so the routine is not reentrant.
    PUBLIC _MMX_RenderSpansAny
_MMX_RenderSpansAny:

    push    ebp
    mov     StackPos, esp   ; remembered so the epilogue can drop the frame in one move
    mov     eax, [esp+8]    ; pCtx: first argument, above saved ebp and the return address
    sub     esp, 0Ch        ; This will need to change if stack frame size changes.
    push    ebx
    push    esi
    push    edi
    mov     ebx, eax        ; ebx holds pCtx from here on

    ; ATTENTION?? Should these be set by validation?  I dont know
    ; why they would need to be since every span routine knows
    ; where the code needs to return.  Also, How is pfnAlphaTestFailEnd
    ; different than pfnPixelEnd?
    ; Both end-of-pixel pointers are aimed at the loop tail in this file.
    mov     eax, _MMX_LoopAnyEndPixel

    ;PD3DI_RASTPRIM pP = pCtx->pPrim;
    mov     ecx, XpCtx(pPrim)

    mov     XpCtx(pfnAlphaTestFailEnd), eax
    mov     XpCtx(pfnPixelEnd), eax

    ;while (pP)
    ;{
PrimLoop:
    test    ecx, ecx
    jz      ExitPrimLoop

    ;UINT16 uSpans = pP->uSpans;
    movzx   eax, word ptr XpP(uSpans)
    mov     uSpans, eax

    ;PD3DI_RASTSPAN pS = (PD3DI_RASTSPAN)(pP + 1);
    lea     ebp, [ecx + SIZEOF_RASTPRIM]


    ;while (uSpans-- > 0)
    ;{
SpanLoop:
    mov     edx, uSpans         ; edx = count before the decrement
    lea     eax, [edx - 1]
    mov     uSpans, eax         ; the decremented count is stored even when the loop exits
    test    edx, edx
    jle     ExitSpanLoop

    ;pCtx->pfnBegin(pCtx, pP, pS);

;-----------------------------------------------------------------------------
;  LoopAny code inserted here.  This is to get rid of an extra
;  jump.
;-----------------------------------------------------------------------------

; Setup Code begins

    ; get values to iterate

    ;uPix = pS->uPix;
    movzx   eax, word ptr XpS(uPix)
    mov     uPix, eax

    ; TODO Copy uFog and iDFog from pS to pCtx.SI
    ; so fog increment can be done faster in MMX.


    ; Fog setup.  Skipped entirely when the fog render state is zero.
    ; When it runs it writes 8 bytes of fog terms starting at SI.uFogB,
    ; 8 bytes of per pixel fog deltas starting at SI.iFogBDX, and copies
    ; uFog and iDFog from the span into the span iterator.
    ; Uses eax and mm0 through mm5.
    ;if (pCtx->pdwRenderState[D3DRENDERSTATE_FOGENABLE]) {
    cmp dword ptr XpCtx(pdwRenderState+RS_FOGENABLE), 0
    je  NoFogSetup
        ;D3DCOLOR FogColor = pCtx->pdwRenderState[D3DRENDERSTATE_FOGCOLOR];
        ;UINT16 FR = (UINT16)RGBA_GETRED(FogColor);
        ;UINT16 FG = (UINT16)RGBA_GETGREEN(FogColor);
        ;UINT16 FB = (UINT16)RGBA_GETBLUE(FogColor);
        pxor        mm0, mm0                    ; zero, used to widen bytes to words
        movd        mm1, XpCtx(pdwRenderState+RS_FOGCOLOR)
        ;UINT16 uMFog = 0xff - (pS->uFog>>8);
        pcmpeqd     mm2, mm2                    ; all ones, so every byte is 0xff
        movzx       eax, word ptr XpS(uFog)
        shr         eax, 8                      ; high byte of uFog
        movd        mm3, eax
        psubb       mm2, mm3                    ; low byte becomes 0xff - (uFog>>8)

        punpcklbw   mm2, mm0                    ; low bytes widened to words; low word is uMFog
        punpcklwd   mm2, mm2    ; Replicate uMFog
        punpckldq   mm2, mm2                    ; uMFog now sits in all four words

        ;pCtx->SI.uFogR = uMFog * FR;    // 0.8 * 0.8 = 8.8
        ;pCtx->SI.uFogG = uMFog * FG;
        ;pCtx->SI.uFogB = uMFog * FB;
        punpcklbw   mm1, mm0                    ; four fog color bytes widened to four words
        pmullw      mm2, mm1                    ; uMFog times each color word
        ; Single 8 byte store.  Presumably uFogG and uFogR follow uFogB in
        ; the span iterator - confirm against the structure layout.
        movq        XpCtxSI(uFogB), mm2
        ;INT32 iMDFog = -pS->iDFog;
        movsx       eax, word ptr XpS(iDFog)
        neg         eax
        movd        mm3, eax
        punpcklwd   mm3, mm3
        punpckldq   mm3, mm3                    ; low 16 bits of -iDFog in all four words
        ;// 1.7.8 * 8.0 >> 8 = 1.7.8 (ATTENTION this could overflow, but it is naturally aligned for
        ;// doing the walking.  Can fix by changing precision of uFogR values, or by clamping
        ;// range of iDFog.
        ;pCtx->SI.iFogRDX = (INT16)((iMDFog * FR) >> 8);
        ;pCtx->SI.iFogGDX = (INT16)((iMDFog * FG) >> 8);
        ;pCtx->SI.iFogBDX = (INT16)((iMDFog * FB) >> 8);
        psllw       mm1, 7      ; Have to lose a bit on fog or add some extra code
        pmulhw      mm3, mm1                    ; high half of the signed 16 x 16 products
        psllw       mm3, 1                      ; net effect is product >> 8 with the low bit cleared
        ;// if iFog*DX is positive, iFog*DX will always be too small, hence no overflow
        ;// but if iFog*DX is negative, add some to make sure overflow does not
        ;// occur
        ;if (pCtx->SI.iFogRDX < 0)
        ;{
        ;    pCtx->SI.iFogRDX = min(pCtx->SI.iFogRDX+4, 0);
        ;}
        pxor        mm4, mm4                    ; make zero for compare
        pcmpgtw     mm4, mm3                    ; ffff mask of all negative deltas
        movq        mm5, mm4                    ; save copy of mask
        pand        mm4, MMWORD PTR uFogDXAdd   ; 4 for negative deltas
        paddw       mm3, mm4                    ; 4 added to negative deltas
        movq        mm2, mm3                    ; copy of deltas after add
        pxor        mm4, mm4                    ; make zero for compare
        pcmpgtw     mm2, mm4                    ; ffff mask for all positive values
        pand        mm2, mm5                    ; ffff mask for all created positive values
        pandn       mm2, mm3                    ; all created positive values anded out to zero
        movq        XpCtxSI(iFogBDX), mm2       ; save deltas

        ; Copy these values to Span Iterator so that they can be done at the same time
        ; as other increments.
        xor         eax, eax                    ; upper half cleared; only ax is stored below
        mov         ax, XpS(uFog)
        mov         XpCtxSI(uFog), ax
        mov         ax, XpS(iDFog)
        mov         XpCtxSI(iDFog), ax
    ;}
NoFogSetup:

    ; Texture coordinate setup for the span.  Takes the perspective path
    ; when the texture perspective render state is nonzero and the affine
    ; path otherwise.  Both paths clear SI.iDW and set SI.iSpecialW.
    ; The perspective path uses esi, eax and mm5 directly; the texture
    ; macros are defined outside this file and may use more.
    ;if (pCtx->pdwRenderState[D3DRENDERSTATE_TEXTUREPERSPECTIVE])
    ;{
    cmp dword ptr XpCtx(pdwRenderState+RS_TEXTUREPERSPECTIVE), 0
    je  SetupNonPerspective
        ;//pCtx->SI.iU1 = (pS->iW*(pS->iUoW1>>4))>>16;    // 8.16 * 1.11.16 == 1.15.32 >> 16 == 1.15.16
        ;//pCtx->SI.iV1 = (pS->iW*(pS->iVoW1>>4))>>16;
        ;//pCtx->SI.iU2 = (pS->iW*(pS->iUoW2>>4))>>16;
        ;//pCtx->SI.iV2 = (pS->iW*(pS->iVoW2>>4))>>16;
        ;pCtx->SI.iDW = 0x0;
        mov dword ptr XpCtxSI(iDW), 0

        ; First texture: iW in esi, the 8 bytes at iUoW1 in mm5.
        mov     esi, XpS(iW)
        movq    mm5, MMWORD PTR XpS(iUoW1)

        d_UoWVoWTimesW(1)

        ; Second texture: esi is loaded again before the macro is reused.
        mov     esi, XpS(iW)
        movq    mm5, MMWORD PTR XpS(iUoW2)

        d_UoWVoWTimesW(2)


        ;if (pP->iDOoWDX > 0)
        ;{
        ; Take the else arm when the condition above is false.  This used to
        ; be jg, which sent positive iDOoWDX to the else arm - the opposite
        ; of the C reference quoted here.
        cmp dword ptr XpP(iDOoWDX), 0
        jle SpecialWLast3
            ;// iSpecialW should be negative for the first 3 pixels of span
            ;pCtx->SI.iSpecialW = -3;
            mov word ptr XpCtxSI(iSpecialW), -3
            jmp DoneSpecialWif
        ;}
        ;else
        ;{
SpecialWLast3:
            ;// iSpecialW should be negative for the last 3 pixels of span
            ;pCtx->SI.iSpecialW = 0x7fff - uPix;
            mov     eax, 07fffh
            sub     eax, uPix
            ;pCtx->SI.iSpecialW += 5;        // this may wrap, but it should
            add     eax, 5
            ; 16 bit store, matching the other two writes to iSpecialW.
            ; Storing all of eax would also overwrite the two bytes that
            ; follow the field.
            mov     word ptr XpCtxSI(iSpecialW), ax
        ;}
DoneSpecialWif:

        jmp DonePerspectiveif

    ;}
    ;else
    ;{
SetupNonPerspective:
        ; Affine path: the same 8 byte loads, handed to the affine macro.
        ;pCtx->SI.iU1 = pS->iUoW1>>TEX_TO_FINAL_SHIFT;   // 1.11.20 >> 4 == 1.15.16
        ;pCtx->SI.iV1 = pS->iVoW1>>TEX_TO_FINAL_SHIFT;

        movq    mm5, MMWORD PTR XpS(iUoW1)

        d_UpdateNonPersp(1)

        ;pCtx->SI.iU2 = pS->iUoW2>>TEX_TO_FINAL_SHIFT;
        ;pCtx->SI.iV2 = pS->iVoW2>>TEX_TO_FINAL_SHIFT;

        movq    mm5, MMWORD PTR XpS(iUoW2)

        d_UpdateNonPersp(2)

        ;pCtx->SI.iDW = 0x0;
        mov dword ptr XpCtxSI(iDW), 0

        ;pCtx->SI.iSpecialW = 0;
        mov word ptr XpCtxSI(iSpecialW), 0
    ;}
DonePerspectiveif:
    ; Per span statics filled in here: iSurfaceStep, iZStep and the two
    ; dither xor masks.
    ;static INT iSurfaceStep;
    ;static INT iZStep;
    ; Note: Dither code needs to be setup if either color dithering or alpha dithering are on.
    ;
    ;// Dither code depends on rendering direction.
    ;// Shift everything down by 6 then use multiply to shift up one to have an end result of either 565 or 555.
    ;static UINT64 uDitherXorMask;                               // will be either 1010b or 1000b (even or odd)
    ;static UINT64 uDitherXorXorMask;

    ;uDitherXorXorMask = 0x0000200020002000 >> 6;
    movq    mm0, MMWORD PTR uDitherXorXorMaskInitVal
    psrlw   mm0, 6
    movq    MMWORD PTR uDitherXorXorMask, mm0

    ;uDitherXorMask = 0x0000800080008000 >> 6;
    movq    mm0, MMWORD PTR uDitherXorMaskInitVal
    psrlw   mm0, 6
    movq    MMWORD PTR uDitherXorMask, mm0

    ; Fetch both steps up front; they are negated below only for spans
    ; walked with X decreasing.
    mov     eax, XpCtx(iZStep)
    mov     edx, XpCtx(iSurfaceStep)

    ;if (pP->uFlags & D3DI_RASTPRIM_X_DEC)
    ;{
    test    dword ptr XpP(uFlags), D3DI_RASTPRIM_X_DEC
    jz      StoreSpanSteps
        ;iZStep = -pCtx->iZStep;
        neg     eax
        ;iSurfaceStep = -pCtx->iSurfaceStep;
        neg     edx
        ;pCtx->SI.iXStep = -1;   // for dithering.
        ; This shouldnt be needed for dithering
        ; since I do it differently. TODO check this
        ; Dither xor mask starting value changes: mm0 still holds the mask
        ; just stored, so or in the XorXor bits from there.
        movq    mm1, mm0
        por     mm1, MMWORD PTR uDitherXorXorMask
        movq    MMWORD PTR uDitherXorMask, mm1
    ;}
    ;else
    ;{
        ;iZStep = pCtx->iZStep;
        ;iSurfaceStep = pCtx->iSurfaceStep;
        ;pCtx->SI.iXStep = 1;
        ; iXStep shouldnt be needed. TODO check this.
    ;}
StoreSpanSteps:
    mov     iZStep, eax
    mov     iSurfaceStep, edx
DoneSpanDirif:

;// ----------------------------------------------------------------------------------------------------------------
;// Dither start values for this span.  This runs even if dither is not turned on.
;// This code is not very clean.  TODO clean it up after it works.
    ;//if(pS->uX & 1) uDitherXorValue |= uDitherXorXorValue;
    ; uX is read once here; edx keeps it for the table lookup further down.
    movzx       edx, word ptr XpS(uX)
    mov         eax, edx

    ;// Create Zero or uDitherXorXorValue based on low bit of uX
    and         eax, 1
    shl         eax, (13 - 6)

    movd        mm1, eax
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1                ; same word in all four lanes

    ; TODO Do I need to and here so that I dont disrupt Alpha channel???

    ; The bits are toggled with xor rather than or-ed in.
    pxor        mm1, MMWORD PTR uDitherXorMask
    movq        MMWORD PTR uDitherXorMask, mm1

    ;// Keep dither pattern up to date directly, so keeping SI.uX up
    ;// to date is not necessary, except for debug
    ;//pCtx->SI.uDitherOffset = (pS->uY & 3) | ((pS->uX & 3)<<2);

    ;// I move along the dithertable completely orthogonal to the way the C code does.  This should not make a difference.
    ;g_uDitherValue = uMMXDitherTable[( ((pS->uY & 3)<<2) | (pS->uX & 3))]; //  >> 6;  the asm does the >> 6 after the load.
    movzx   eax, word ptr XpS(uY)
    and     edx, 3
    and     eax, 3
    lea     eax, [edx + eax*4]          ; (uY & 3) * 4 + (uX & 3)
    shl     eax, 3                      ; eight bytes per table entry

    movq    mm1, MMWORD PTR uMMXDitherTable[eax]
    psrlw   mm1, 6
    movq    MMWORD PTR g_uDitherValue, mm1

    ;//if colormode is 565 then shift all green values down by one more.
    ;// TODO Add RR_STYPE_B5G5R5A1 when code is done for that format.
    ;// Are these multiplies noticeable or should I use two tables?

    ; Pick the multiplier once.  The 555 multiplier is used for every
    ; surface type other than 565; there is deliberately no separate test
    ; for 555 because dither needs to be on for alpha dithering, which is
    ; independent of what type of color output we want.
    ;switch(pCtx->iSurfaceType)
    mov     eax, OFFSET u555MultShifter
    cmp     dword ptr XpCtx(iSurfaceType), RR_STYPE_B5G6R5
    jne     ScaleDitherValues
    mov     eax, OFFSET u565MultShifter

ScaleDitherValues:
    ; Scale both masks and the start value by the chosen multiplier.
    movq    mm1, MMWORD PTR uDitherXorMask
    pmullw  mm1, MMWORD PTR [eax]
    movq    MMWORD PTR uDitherXorMask, mm1

    movq    mm1, MMWORD PTR uDitherXorXorMask
    pmullw  mm1, MMWORD PTR [eax]
    movq    MMWORD PTR uDitherXorXorMask, mm1

    movq    mm1, MMWORD PTR g_uDitherValue
    pmullw  mm1, MMWORD PTR [eax]
    movq    MMWORD PTR g_uDitherValue, mm1
DoneModDitherValues:

; Setup Code Ends
; ----------------------------------------------------------------------------------------------------------------
; Loop Code Begins

    ; Top of the per pixel loop.  Control leaves through an indirect jump,
    ; not a call, so nothing is pushed for the target to return to.  The
    ; entry code of this routine stores the address of _MMX_LoopAnyEndPixel
    ; in pfnPixelEnd and pfnAlphaTestFailEnd; presumably the target jumps
    ; back through one of those - confirm against the target code.
    ;//while (1)
    ;//{
PixelLoop:
            ; uncomment to look at a span in a particular range
;            movzx eax, word ptr XpS(uX)
;            cmp eax, 340
;            jl  NotSpecial
;            cmp eax, 363
;            jg  NotSpecial
;            cmp word ptr XpS(uY), 330
;            jne NotSpecial
;
;            ; Special
;            xor eax, eax
;
;NotSpecial:
            ; Probably dont need to move this into a register first.
            mov   eax, XpCtx(pfnLoopEnd)

            ;pCtx->pfnLoopEnd(pCtx, pP, pS);
            ; eax holds the target address at the jump.
            jmp  eax


; Just put EndBead here for now.  After Kent and Drew decide on beads, code can be moved around.
; Tail of the per pixel loop.  Counts the pixel off; when pixels remain it
; steps the Z and surface pointers stored in the span, advances the dither
; state and goes round again.  Uses eax, edx, mm3 and mm4.
    PUBLIC  _MMX_LoopAnyEndPixel
_MMX_LoopAnyEndPixel:

        ;//if (--uPix <= 0)
        ;//    break;
        dec   dword ptr uPix    ;// BUG BUG?? uPix should never start as zero should it?
                                ;// if so, this is a bug.
        jle   ExitPixelLoop

        ;//pS->pZ += iZStep;
        ;//pS->pSurface += iSurfaceStep;
        ; The steps already carry the sign for the span direction.
        mov   eax, iZStep
        mov   edx, iSurfaceStep

        add   dword ptr XpS(pZ), eax
        add   dword ptr XpS(pSurface), edx

        ;// dont update this in dithered write functions because of alpha test
        ;// ATTENTION could specialize loop routines based on things like dither and Z buffer
        ;//pCtx->SI.uDitherOffset = (pCtx->SI.uDitherOffset + (pCtx->SI.iXStep<<2)) & 0xf;
        ;// May Not need DitherOffset, but I might have to update xor masks.
        movq    mm4, MMWORD PTR uDitherXorMask     ; will be either 1010b or 1000b (even or odd)
        movq    mm3, MMWORD PTR g_uDitherValue     ; four bit value from table
        pxor    mm3, mm4                           ; next dither value
        pxor    mm4, MMWORD PTR uDitherXorXorMask  ; always 0010b; flips the mask for the pixel after
        movq    MMWORD PTR g_uDitherValue, mm3     ; save new dither value.
        movq    MMWORD PTR uDitherXorMask, mm4     ; save new xor mask

;#ifdef DBG
        ;// handy for debug to see where we are
        ;//pS->uX += (INT16)pCtx->SI.iXStep;
;#endif
    ;// } // while
    jmp   PixelLoop


ExitPixelLoop:
; Loop code ends

;-----------------------------------------------------------------------------
;  LoopAny code ends here
;-----------------------------------------------------------------------------

    ; Span finished: step to the next span record and retest the count.
    ;pS++;
    lea     ebp, [ebp + SIZEOF_RASTSPAN]

    ;}
    jmp     SpanLoop

ExitSpanLoop:
    ; All spans of this primitive are done: follow the list link.
    ;pP = pP->pNext;
    mov     ecx, XpP(pNext)
    ;}
    jmp     PrimLoop

ExitPrimLoop:
    ; Restore the saved registers, then drop the frame by reloading the
    ; esp value captured at entry.
    pop     edi
    pop     esi
    pop     ebx
    mov     esp, StackPos
    pop     ebp

    ; Leave the MMX state clean for the caller.
    emms

    ;return S_OK;
    xor     eax, eax
;}
    ret

; ATTENTION  Just putting this here, because selection code needs a function pointer
; The loop body itself is inlined in _MMX_RenderSpansAny; this label only
; provides an address.
    PUBLIC _MMX_LoopAny
_MMX_LoopAny:
    ; This Should never be called by anything.
    ; If it is called anyway it returns at once without touching any state.
    ret

END