;-----------------------------------------------------------------------------
;
; Monolith 23.  Non-perspective NO Z buffered X888
;
;   Exactly the same as monolith 2 except color input is 32 bits and
;   output is 32 bits and no Z buffer code
;
;-----------------------------------------------------------------------------

INCLUDE iammx.inc
INCLUDE offs_acp.inc

; Names are read LSB to MSB, so B5G6R5 means five bits of blue starting
; at the LSB, then six bits of green, then five bits of red.

;TBD check to see if this value is correct.
COLOR_SHIFT equ 8

.586
.model flat


; Big separating lines separate the code into span code
; and loop code.  If span and loop are not going to
; end up being combined then it will be easy to
; separate the code.


.data

; Need externs for all of the variables that are needed for various beads

  EXTERN IncHighandLow16:MMWORD
  EXTERN UFracVFracMask:MMWORD
  EXTERN UV32to15Mask:MMWORD
  EXTERN Makelow16one:MMWORD
  EXTERN MaskKeepUValues:MMWORD
  EXTERN MaskKeepVValues:MMWORD
  EXTERN UFrac:MMWORD
  EXTERN VFrac:MMWORD
  EXTERN Zero:MMWORD
  EXTERN memD3DTFG_POINT:MMWORD
  EXTERN GiveUp:MMWORD
  EXTERN LastW:MMWORD
  EXTERN Val0x000a000a:MMWORD
  EXTERN Val0xffff:MMWORD
  EXTERN Val0x0000002000000020:MMWORD
  EXTERN Val0x0000ffff0000ffff:MMWORD


EXTERN MaskRed565to888:MMWORD
EXTERN MaskGreen565to888:MMWORD
EXTERN MaskBlue565to888:MMWORD

EXTERN MaskRed555to888:MMWORD
EXTERN MaskGreen555to888:MMWORD
EXTERN MaskBlue555to888:MMWORD

EXTERN MaskAlpha1555to8888:MMWORD
EXTERN MaskRed1555to8888:MMWORD
EXTERN MaskGreen1555to8888:MMWORD
EXTERN MaskBlue1555to8888:MMWORD

; TBD. I think that I want to use 0xffff instead of 0xff.  This will
; have to be checked.  There is a value very similar to this in
; buf write.
EXTERN SetAlphato0xffff:MMWORD
EXTERN SetAlphato0xff:MMWORD

; TODO: These equates are identical to the ones in texread.mas.  Maybe they
; should be in a common .inc file.
; NOTE(review): none of the equates below is referenced anywhere else in this
; file.  Judging by the names they are per-channel shift counts for expanding
; 565 / 555 / 1555 texels to 888 / 8888 -- confirm against texread.mas before
; relying on that.
RedShift565to888     equ 8
GreenShift565to888   equ 5
BlueShift565to888    equ 3

RedShift555to888     equ 9
GreenShift555to888   equ 6
BlueShift555to888    equ 3

AlphaShift1555to8888 equ 16
RedShift1555to8888   equ 9
GreenShift1555to8888 equ 6
BlueShift1555to8888  equ 3

EXTERN Zero:MMWORD


EXTERN DW_One_One:MMWORD


EXTERN MaskOffAlpha:MMWORD
EXTERN ShiftTA:MMWORD
EXTERN Val0x00ff00ff00ff00ff:MMWORD
EXTERN Val0x000000ff00ff00ff:MMWORD
EXTERN Val0X0000000001000000:MMWORD
EXTERN AlphaVal128:MMWORD
EXTERN RGBVal128:MMWORD


    EXTERN  g_uDitherValue:MMWORD
    EXTERN  SetAlphato0xff:MMWORD
    EXTERN  u888to565RedBlueMask:MMWORD
    EXTERN  u888to565GreenMask:MMWORD
    EXTERN  u888to565Multiplier:MMWORD
    EXTERN  uVal0x000007ff03ff07ff:MMWORD
    EXTERN  uVal0x0000078003c00780:MMWORD
    EXTERN  u888to555RedBlueMask:MMWORD
    EXTERN  u888to555GreenMask:MMWORD
    EXTERN  u888to555Multiplier:MMWORD
    EXTERN  uVal0x000007ff07ff07ff:MMWORD
    EXTERN  uVal0x0000078007800780:MMWORD



;-----------------------------------------------------------------------------
; Span Variables
;
; Module-level scratch storage for _MMXMLRast_23.  Because these live in .data
; rather than on the stack, the routine is not reentrant.

; Texture coordinate mask.  Filled once per span with the dword found at
; SPANTEX_uMaskU (zero-extended to 64 bits) and ANDed with the packed U/V
; words of every pixel.
uMaskU          dq      ?
; Value of esp immediately after the prologue's "push ebp"; the epilogue
; reloads esp from here before popping ebp.
StackPos    dd  ?
; Number of spans still to be processed in the current primitive
; (zero-extended from the 16-bit RASTPRIM_uSpans).
uSpans      dd  ?
; Per-span copy of the dword at SPANTEX_iShiftU.  The code treats the low
; word as the U shift and the high word as the V shift.
iShiftU     dd  ?
; Per-span copy of SPANTEX_iShiftPitch, zero-extended from 16 bits.
iShiftPitch     dd  ?
; Per-span copy of SPANTEX_pBits: base address added to the computed texel
; byte offset.
pBits           dd  ?
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; Loop Variables

; Signed amount added to the destination pointer after each pixel:
; RASTCTX_iSurfaceStep, negated when D3DI_RASTPRIM_X_DEC is set in the
; primitive's flags.
iSurfaceStep    dd  ?
; Pixels remaining in the current span (zero-extended from the 16-bit
; RASTSPAN_uPix); decremented after each pixel is written.
uPix            dd  ?

;-----------------------------------------------------------------------------

.code

;-----------------------------------------------------------------------------
; _MMXMLRast_23
;
; Monolithic span rasterizer: for every span of every primitive hanging off
; pCtx->pPrim, point-samples a 32-bit texture (non-perspective U/V stepping)
; and stores the texel, with its top byte cleared, to a 32-bit surface.
; No Z buffer is read or written.
;
; C-level shape, taken from the inlined reference comments:
;     HRESULT MMXMLRast_23(PD3DI_RASTCTX pCtx);
;
; In:     [esp+4] on entry = pCtx.  The routine ends with a plain "ret", so
;         the argument is left for the caller to remove.
; Out:    eax = 0 (S_OK)
; Saved:  ebx, esi, edi, ebp (esp is restored from StackPos)
; Clobb:  ecx, edx, flags, mm0-mm7 (emms is executed before returning)
;
; Side effects visible to the caller:
;   - each span's RASTSPAN_iUoW1 qword is advanced in place while stepping
;   - pCtx->SI.iU1 (8 bytes), SI.iDW and SI.iSpecialW are overwritten
;   - the module variables StackPos, uSpans, uPix, iShiftU, iShiftPitch,
;     uMaskU, pBits and iSurfaceStep are overwritten (not reentrant)
;
; The pixel loop is bottom-tested: a span always writes at least one pixel,
; even if its uPix is zero.
;
; Register roles inside the loops:
;   ebx = pCtx            ecx = pP (current RASTPRIM)
;   ebp = pS (RASTSPAN)   edi = address of the destination pixel
;   esi = pCtx->pTexture (span setup only)
;   eax, edx = scratch
;   mm6 = U shift count, mm7 = V shift count   (constant per span)
;   mm3 = texel-index multiplier for pmaddwd   (constant per span)
;   mm0, mm1, mm2, mm4, mm5 = scratch
;-----------------------------------------------------------------------------
PUBLIC _MMXMLRast_23
_MMXMLRast_23:
    push    ebp
    mov     StackPos, esp           ; epilogue reloads esp from here
    mov     eax, esp
    sub     esp, 0Ch                ; reserved area kept from the original frame layout;
                                    ; nothing in this routine addresses it
    push    ebx
    push    esi
    push    edi

    ; ebx = pCtx.  eax+0 is the saved ebp, eax+4 the return address and
    ; eax+8 the first argument.
    mov     ebx, [eax+8]

    ;PD3DI_RASTPRIM pP = pCtx->pPrim;
    mov     ecx, [ebx+RASTCTX_pPrim]

    ;while (pP)
    ;{
PrimLoop:
    test    ecx, ecx
    jz      ExitPrimLoop

    ;UINT16 uSpans = pP->uSpans;
    movzx   eax, word ptr [ecx+RASTPRIM_uSpans]
    mov     uSpans, eax

    ;PD3DI_RASTSPAN pS = (PD3DI_RASTSPAN)(pP + 1);
    mov     ebp, ecx
    add     ebp, SIZEOF_RASTPRIM

    ;while (uSpans-- > 0)
    ;{
SpanLoop:
    ; uSpans starts in 0..65535, so the decrement result is negative exactly
    ; when the count was already zero.  As before, the variable is left at -1
    ; when the loop exits.
    dec     uSpans
    js      ExitSpanLoop

    ;pCtx->pfnBegin(pCtx, pP, pS);

;-----------------------------------------------------------------------------
;  LoopAny code inserted here.  This is to get rid of an extra
;  jump.
;-----------------------------------------------------------------------------

; Setup Code begins - get values to iterate

    movzx   eax, word ptr [ebp+RASTSPAN_uPix]
    mov     uPix, eax

    ; Non perspective correct: iUoW/iVoW are used directly as iU/iV once
    ; shifted down by TEX_TO_FINAL_SHIFT.  The qword store covers 8 bytes
    ; starting at SI.iU1; the pixel loop reads the second dword back through
    ; SPANITER_iV1 (assumes iV1 immediately follows iU1 -- TODO confirm
    ; against the structure layout).
    movq    mm5, [ebp+RASTSPAN_iUoW1]
    psrad   mm5, TEX_TO_FINAL_SHIFT
    movq    [ebx+RASTCTX_SI+SPANITER_iU1], mm5
    mov     dword ptr [ebx+RASTCTX_SI+SPANITER_iDW], 0
    mov     word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW], 0

    ; iSurfaceStep = (pP->uFlags & D3DI_RASTPRIM_X_DEC) ? -step : step
    mov     edx, [ebx+RASTCTX_iSurfaceStep]
    mov     eax, [ecx+RASTPRIM_uFlags]
    test    eax, D3DI_RASTPRIM_X_DEC
    jz      SurfaceStepReady
    neg     edx
SurfaceStepReady:
    mov     iSurfaceStep, edx

;******************************************
; Everything below depends only on the texture, so it is computed once per
; span and kept in mm3/mm6/mm7 instead of being rebuilt for every pixel.

    mov     esi, [ebx+RASTCTX_pTexture]

    ; Shift counts.  The coordinates are shifted right not by
    ; (TEX_FINAL_SHIFT - iShiftU) but by (TEX_FINAL_SHIFT - iShiftU - 6),
    ; followed by a fixed ">> 6" in the pixel loop.  With textures of at most
    ; 1024 texels iShiftU is at most 10, so the variable part never goes
    ; negative (see the original COMMENT1 rationale).
    ; The dword at SPANTEX_iShiftU is treated as two words: low = U shift,
    ; high = V shift.
    mov     edx, [esi+SPANTEX_iShiftU]
    mov     iShiftU, edx
    movq    mm6, MMWORD PTR Val0x000a000a   ; TEX_FINAL_SHIFT - 6 = 10 in each word
    movd    mm4, edx
    psubw   mm6, mm4                        ; word0 = 10 - iShiftU, word1 = 10 - iShiftV
    movq    mm7, mm6
    pand    mm6, MMWORD PTR Val0xffff       ; mm6 = U shift count
    psrld   mm7, 16                         ; mm7 = V shift count

    ; Texel-index multiplier.  Texture pitch cannot be calculated so it is
    ; derived from iShiftPitch:
    ;     mm3 = Makelow16one | (Makelow16one << (iShiftPitch + 16))
    ; Presumably Makelow16one holds 1 in the low word of each dword (external
    ; constant -- confirm), which makes the word pair (1, 1 << iShiftPitch)
    ; so that pmaddwd yields U + (V << iShiftPitch).
    movzx   edx, word ptr [esi+SPANTEX_iShiftPitch]
    mov     iShiftPitch, edx
    add     edx, 16
    movd    mm2, edx
    movq    mm3, MMWORD PTR Makelow16one
    pslld   mm3, mm2
    por     mm3, MMWORD PTR Makelow16one

    movd    mm0, dword ptr [esi+SPANTEX_uMaskU]     ; Load U and V mask
    movq    MMWORD PTR uMaskU, mm0
    mov     edx, [esi+SPANTEX_pBits]
    mov     pBits, edx
    mov     edi, [ebp+RASTSPAN_pSurface]
;******************************************

PixelLoop:
    ; Doing UV calculation a little more accurate, exactly like the C code.
    ; mm1 = (iU1 >> mm6, iV1 >> mm7) as two dwords.
    movd        mm1, [ebx+RASTCTX_SI+SPANITER_iU1]
    psrad       mm1, mm6
    movd        mm2, [ebx+RASTCTX_SI+SPANITER_iV1]
    psrad       mm2, mm7
    punpckldq   mm1, mm2

    ;iU00 >>= 6;
    ;iV00 >>= 6;
    psrad       mm1, 6
    packssdw    mm1, mm1    ; Value needs to be packed since the mask and
                            ; address math assume U,V in the low 32 bits.

    ;UINT16 uMaskU0 = pTex->uMaskU >> iLOD0;  UINT16 uMaskV0 = pTex->uMaskV >> iLOD0;
    pand        mm1, MMWORD PTR uMaskU

    ; Low dword of mm1 = U*word0(mm3) + V*word1(mm3) = texel index.
    pmaddwd     mm1, mm3
    movd        eax, mm1
    shl         eax, 2      ; texels are fetched as 4-byte dwords
    add         eax, pBits

    ;pCtx->SI.TexCol[0] = pCtx->pfnTexRead(iU00, iV00, pTex->iShiftU,
    ;    pTex->pBits[iLOD0], &pCtx->Texture[0]);
    ; -------------------- In Monolithic version calls are inlined.
    mov     edx, dword ptr [eax]
    and     edx, 000ffffffh ; clear the top byte before storing
    mov     [edi], edx

    dec     uPix
    jle     ExitPixelLoop

    ; Doing update code after span length test so that an extra update is
    ; not done.
    movq    mm5, [ebp+RASTSPAN_iUoW1]
    paddd   mm5, [ecx+RASTPRIM_iDUoW1DX]
    movq    [ebp+RASTSPAN_iUoW1], mm5

    ; mm5 still contains iUoW and iVoW which are the iU and iV values for
    ; non perspective correct.
    psrad   mm5, TEX_TO_FINAL_SHIFT
    movq    [ebx+RASTCTX_SI+SPANITER_iU1], mm5
    add     edi, iSurfaceStep
    jmp     PixelLoop

ExitPixelLoop:
; Loop code ends

;-----------------------------------------------------------------------------
;  LoopAny code ends here
;-----------------------------------------------------------------------------

    ;pS++;
    add     ebp, SIZEOF_RASTSPAN

    ;}
    jmp     SpanLoop
ExitSpanLoop:
    ;pP = pP->pNext;
    mov     ecx, [ecx+RASTPRIM_pNext]
    ;}
    jmp     PrimLoop

ExitPrimLoop:
    ; Leave MMX state so the caller can use the FPU again.
    emms

    ;return S_OK;
    xor     eax, eax
;}
    pop     edi
    pop     esi
    pop     ebx
    mov     esp, StackPos           ; drops the 0Ch reserved bytes
    pop     ebp
    ret

END