2025-04-27 07:49:33 -04:00

677 lines
28 KiB
NASM

;---------------------------Module-Header------------------------------;
; Module Name: str.asm
;
; Contains the x86 'Asm' versions of some inner-loop routines for the
; partially hardware accelerated StretchBlt.
;
; Copyright (c) 1994-1995 Microsoft Corporation
;-----------------------------------------------------------------------;
.386
.model small,c
assume cs:FLAT,ds:FLAT,es:FLAT,ss:FLAT
assume fs:nothing,gs:nothing
.xlist
include stdcall.inc ;calling convention cmacros
include i386\strucs.inc
include i386\hw.inc
.list
.data
;
; stack based params and local variables
;
STACK_STRUC struc
; Layout of the stack frame as addressed through esp by both stretch
; routines, lowest address first. The fields above sp_ebp are locals
; allocated by the routine's 'sub esp'; the six dwords from sp_ebp down
; are already on the stack (four register pushes, the return address and
; the single parameter) and are therefore excluded via PROC_MEM_SIZE.
; Feel free to add any local variables here:
sp_TempXFrac dd ? ; copy of str_ulXDstToSrcFracCeil (stored during setup; not read back in the routines visible here)
sp_YCarry dd ? ; not referenced by the routines visible here
sp_LeftCase dd ? ; count of leading pixels written one at a time before the dword loop
sp_RightCase dd ? ; count of trailing pixels written one at a time after the dword loop
sp_pjSrcScan dd ? ; pointer to the first source pixel of the current source scan
sp_SrcIntStep dd ? ; str_ulYDstToSrcIntCeil * str_lDeltaSrc: integer source step per destination scan
sp_XCount dd ? ; number of whole dwords written per scan by the dword loop
sp_yDst dd ? ; current destination y (str_YDstStart + pdev_yOffset, advanced per scan)
sp_cxMemory dd ? ; local copy of pdev_cxMemory (stride)
sp_pjDst dd ? ; destination write pointer used at the start of every scan
sp_ulDst dd ? ; destination offset of the current scan; written to HST_DSTPAGE
sp_pjBase dd ? ; local copy of pdev_pjBase, base for the hardware register accesses
sp_ulYDstOrg dd ? ; local copy of pdev_ulYDstOrg (stored during setup; not read back in the routines visible here)
sp_XWidthLessOne dd ? ; (str_XDstEnd - str_XDstStart) - 1
; Don't add any fields below here without modifying PROC_MEM_SIZE!
sp_ebp dd ? ; caller's ebp (pushed last, so lowest of the saved registers)
sp_esi dd ? ; caller's esi
sp_edi dd ? ; caller's edi
sp_ebx dd ? ; caller's ebx (pushed first)
sp_RetAddr dd ? ; return address pushed by the call
sp_pSTR_BLT dd ? ; If adding parameters, adjust 'ret' value!
STACK_STRUC ends
; Bytes of STACK_STRUC that are NOT allocated by 'sub esp': the six
; dwords sp_ebp .. sp_pSTR_BLT.
PROC_MEM_SIZE equ 6 * 4
;
; Make sure this STR_BLT matches that declared in driver.h!
;
STR_BLT struc
; Stretch-blt parameter block passed by pointer to the routines below.
; Field descriptions reflect how the routines in this file use them.
str_ppdev dd ? ; pointer to the PDEV (pdev_xOffset, pdev_yOffset, pdev_ulYDstOrg, pdev_cxMemory, pdev_pjBase are read through it)
str_pjSrcScan dd ? ; pointer to the first source scan (source DIB pointer)
str_lDeltaSrc dd ? ; delta added to the source pointer to step one source scan
str_XSrcStart dd ? ; starting source pixel within the scan
str_pjDstScan dd ? ; Unused by MGA
str_lDeltaDst dd ? ; Unused by MGA
str_XDstStart dd ? ; first destination x
str_XDstEnd dd ? ; destination x end; width is computed as XDstEnd - XDstStart
str_YDstStart dd ? ; first destination y (before pdev_yOffset is added)
str_YDstCount dd ? ; destination scans remaining; decremented in place by the routines
str_ulXDstToSrcIntCeil dd ? ; integer source step per destination pixel
str_ulXDstToSrcFracCeil dd ? ; fractional source step per destination pixel
str_ulYDstToSrcIntCeil dd ? ; integer source-scan step per destination scan
str_ulYDstToSrcFracCeil dd ? ; fractional source-scan step per destination scan
str_ulXFracAccumulator dd ? ; x fraction accumulator value loaded at the start of every scan
str_ulYFracAccumulator dd ? ; y fraction accumulator; updated in place by the routines
STR_BLT ends
.code
;---------------------------Public-Routine------------------------------;
; VOID vMgaDirectStretch8(pStrBlt)
;
; Stretch blt 8 -> 8
;
; Entry:    [esp+4] = pStrBlt, pointer to a STR_BLT
; Exit:     no return value; the parameter is removed by 'ret 4'
; Saves:    ebx, edi, esi, ebp (pushed on entry, restored at Done)
; Uses:     eax, ecx, edx, flags
; Updates:  str_YDstCount and str_ulYFracAccumulator in *pStrBlt
;
; For every destination scan the routine writes the scan's destination
; offset to HST_DSTPAGE and then copies source pixels to pjDst, stepping
; the source pointer with an integer + fractional (add/adc) DDA. The
; scan is written as up to 3 leading bytes, a run of whole dwords, and
; up to 3 trailing bytes. When consecutive destination scans map to the
; same source scan, they are produced by a single hardware blt that
; duplicates the previously written destination scan.
;
; NOTE: This routine doesn't handle cases where the blt stretch starts
; and ends in the same destination dword! vDirectStretchNarrow
; is expected to have been called for that case.
;
; A blt that touches two adjacent dwords without covering a whole dword
; (for example XDstStart = 1, XDstEnd = 7: 3 leading + 3 trailing bytes,
; zero dwords) IS handled: the dword loop is skipped when sp_XCount is 0.
;-----------------------------------------------------------------------;
        public  vMgaDirectStretch8@4
vMgaDirectStretch8@4 proc near
;
; use ebp as general register, use esp for parameter and local access
; save ebp,ebx,esi,edi
;
        push    ebx
        push    edi
        push    esi
        push    ebp
        sub     esp,(size STACK_STRUC) - PROC_MEM_SIZE ; make room for local variables
        mov     ebp,[esp].sp_pSTR_BLT           ; load pSTR_BLT into ebp
;
; load up some stack-based parameters to be used by our scan
; duplicator when doing vertical stretches
;
        mov     eax,[ebp].str_ppdev
        mov     ecx,[ebp].str_YDstStart         ; get start y coordinate
        add     ecx,[eax].pdev_yOffset          ; convert to abs coordinate
        mov     [esp].sp_yDst,ecx               ; save current y coordinate
        mov     ebx,[eax].pdev_ulYDstOrg
        mov     edx,[eax].pdev_cxMemory
        mov     edi,[eax].pdev_pjBase
        mov     [esp].sp_ulYDstOrg,ebx          ; local copy of ulYDstOrg
        mov     [esp].sp_cxMemory,edx           ; local copy of stride
        mov     [esp].sp_pjBase,edi             ; local copy of pjBase
        imul    ecx,edx                         ; yDst * cxMemory
        add     ecx,[ebp].str_XDstStart
        add     ecx,[eax].pdev_xOffset
        add     ecx,[eax].pdev_ulYDstOrg
        mov     [esp].sp_ulDst,ecx              ; ulDst = ulYDstOrg
                                                ;       + (yDst * cxMemory)
                                                ;       + (XDstStart + xOffset)
        and     ecx,31
        add     ecx,SRCWND
        add     ecx,edi
        mov     [esp].sp_pjDst,ecx              ; pjDst = (ulDst & 31)
                                                ;       + pjBase + SRCWND
;
; calc starting addressing parameters
;
        mov     esi,[ebp].str_pjSrcScan         ; load src DIB pointer
        add     esi,[ebp].str_XSrcStart         ; add starting Src Pixel
        mov     [esp].sp_pjSrcScan,esi          ; save scan line start pointer
        mov     eax,[ebp].str_ulYDstToSrcIntCeil ; number of src scan lines to step
        mul     [ebp].str_lDeltaSrc             ; eax = int lines * lDeltaSrc (edx is clobbered,
                                                ; it is reloaded just below)
        mov     [esp].sp_SrcIntStep,eax         ; save int portion of Y src step
        mov     edx,4                           ; calc left bytes = (4 - XDstStart) & 0x03
        sub     edx,[ebp].str_XDstStart
        and     edx,3                           ; left edge bytes
        mov     [esp].sp_LeftCase,edx           ; save left edge byte count
        mov     ecx,[ebp].str_XDstEnd           ; load x end
        mov     eax,ecx                         ; ending dst addr
        and     eax,3                           ; calc right edge case
        mov     [esp].sp_RightCase,eax          ; save right edge byte count
        sub     ecx,[ebp].str_XDstStart         ; calc x count
        lea     ebx,[ecx - 1]                   ; calc width less one
        mov     [esp].sp_XWidthLessOne,ebx      ; save it
        sub     ecx,eax                         ; sub right edge from XCount
        sub     ecx,edx                         ; sub left edge from XCount
        shr     ecx,2                           ; convert from byte to DWORD count
        mov     [esp].sp_XCount,ecx             ; save DWORD count (may be zero)
        mov     ebx,[ebp].str_ulXDstToSrcFracCeil ; get x frac
        mov     [esp].sp_TempXFrac,ebx          ; save x frac to a esp based location
                                                ; (not read again by this routine)
@@:     cmp     byte ptr [edi+HST_FIFOSTATUS],FIFOSIZE
        jb      @b                              ; CHECK_FIFO_SPACE(32)
@@:     test    byte ptr [edi+HST_STATUS+2],(dwgengsts_MASK shr 16)
        jnz     @b                              ; WAIT_NOT_BUSY()
        mov     edx,[esp].sp_ulDst
NextScan:
; edx = current destination offset
; esi = pointer to source pixel
; edi = pjBase
; ebp = pSTR_BLT
        mov     [edi+HST_DSTPAGE],edx           ; CP_WRITE(HST_DSTPAGE, ulDst)
        mov     eax,[ebp].str_ulXDstToSrcIntCeil ; get src integer step for step in dst
        mov     ebx,[ebp].str_ulXDstToSrcFracCeil ; get src frac step for step in dst
        mov     edi,[ebp].str_ulXFracAccumulator ; edi = x fraction accumulator for this scan
        mov     ebp,[esp].sp_pjDst              ; get dst pointer to ebp
;
; Can't directly access pSTR_BLT variables through ebp
;
        mov     ecx,[esp].sp_LeftCase
; eax = integer step in source
; ebx = fractional step in source
; ecx = left edge case
; edx = free for pixel data
; esi = pointer to source pixel
; edi = fractional accumulator
; ebp = pointer to dest pixel
;
; first do the left side to align dwords
;
        test    ecx,ecx
        jz      DwordAligned
@@:
        mov     dl,[esi]                        ; fetch pixel
        mov     [ebp],dl                        ; write it out
        add     edi,ebx                         ; step fraction
        adc     esi,eax                         ; add in integer and possible carry
        inc     ebp                             ; step 1 in dest
        dec     ecx                             ; dec left count
        jne     @B                              ; repeat until done
DwordAligned:
        mov     ecx,[esp].sp_XCount             ; get run length
        test    ecx,ecx
        jz      TrailingBytes                   ; watch for zero dword case: without
                                                ; this the 'dec ecx' below would wrap
                                                ; and the loop would never terminate
;
; Build one dword from four source pixels. Each 'adc esi,eax' consumes
; the carry produced by the 'add edi,ebx' that precedes it; only 'mov'
; is placed between such a pair, so the carry is preserved.
;
@@:
        mov     dl,[esi]                        ; get a source pixel edx = ???0
        add     edi,ebx                         ; step fraction
        adc     esi,eax                         ; add integer and carry
        add     edi,ebx                         ; step fraction
        mov     dh,[esi]                        ; get source pixel edx = ??10
        adc     esi,eax                         ; add integer and carry
        shl     edx,16                          ; edx = 10??
        add     edi,ebx                         ; step fraction
        mov     dl,[esi]                        ; get a source pixel edx = 10?2
        adc     esi,eax                         ; add integer and carry
        add     edi,ebx                         ; step fraction
        mov     dh,[esi]                        ; get source pixel edx = 1032
        adc     esi,eax                         ; add integer and carry
        ror     edx,16                          ; edx = 3210
        mov     [ebp],edx                       ; write everything to dest
        add     ebp,4                           ; increment dest pointer by 1 dword
        dec     ecx                             ; decrement count
        jnz     @b                              ; do more pixels
TrailingBytes:
;
; now do the right side trailing bytes
;
        mov     ecx,[esp].sp_RightCase
        test    ecx,ecx
        jz      EndScanLine
@@:
        mov     dl,[esi]                        ; fetch pixel
        mov     [ebp],dl                        ; write it out
        add     edi,ebx                         ; step fraction
        adc     esi,eax                         ; add in integer and possible carry
        inc     ebp                             ; step 1 in dest
        dec     ecx                             ; dec right count
        jnz     @b                              ; repeat until done
EndScanLine:
;
; Advance the source scan pointer by the Y step (integer part plus one
; extra scan whenever the fraction accumulator carries).
;
        mov     ebp,[esp].sp_pSTR_BLT           ; load pSTR_BLT into ebp
        mov     esi,[esp].sp_pjSrcScan          ; load src scan start addr
        mov     ebx,esi                         ; save a copy
        mov     eax,[ebp].str_ulYFracAccumulator ; get .32 part of Y pointer
        add     eax,[ebp].str_ulYDstToSrcFracCeil ; add in fractional step
        jnc     @f
        add     esi,[ebp].str_lDeltaSrc         ; step one extra in src
@@:     mov     [ebp].str_ulYFracAccumulator,eax ; save Y accumulator
        add     esi,[esp].sp_SrcIntStep         ; step int part
        mov     [esp].sp_pjSrcScan,esi          ; save starting scan addr
        dec     [ebp].str_YDstCount             ; decrement scan count
        jz      Done                            ; no more scans
        inc     [esp].sp_yDst                   ; one scan further down in dst
        mov     edi,[esp].sp_pjBase
        mov     edx,[esp].sp_ulDst
        add     edx,[esp].sp_cxMemory
        mov     [esp].sp_ulDst,edx              ; ulDst += cxMemory
        cmp     esi,ebx                         ; is src scan same as before?
        jne     NextScan                        ; if so, fall through to dupe scan
;--------------------------------------------------------------------
; The source scan is the same one used for the previous destination
; scan, so we can simply use the hardware to copy the previous
; destination scan.
;
; Since on the MGA we can set up a 'rolling blt' to copy one scan
; line to several scans in a single command, we will count up how
; many times this scan should be duplicated. If your hardware
; cannot do a rolling blt, simply issue a new blt command for
; every time the scan should be duplicated.
;
; eax = ulYFracAccumulator
; ebx = original pjSrcScan
; edx = ulDst, current offset from start of frame buffer to next scan
; esi = current pjSrcScan
; edi = pjBase
; ebp = pSTR_BLT
;
        xor     ecx,ecx                         ; number of times scan is to be
                                                ; duplicated
AnotherDuplicate:
        inc     ecx                             ; one scan further down
        dec     [ebp].str_YDstCount             ; decrement scan count
        jz      OutputDuplicates                ; no more scans
        add     eax,[ebp].str_ulYDstToSrcFracCeil ; add in fractional step
        jnc     @f
        add     esi,[ebp].str_lDeltaSrc         ; step one extra in src
@@:     add     esi,[esp].sp_SrcIntStep         ; step int part
        cmp     esi,ebx                         ; is src scan same as before?
        je      AnotherDuplicate
OutputDuplicates:
; ecx = number of destination scans to produce by duplication
        mov     [esp].sp_pjSrcScan,esi          ; save starting scan address
        mov     [ebp].str_ulYFracAccumulator,eax ; save Y accumulator
        mov     eax,[esp].sp_cxMemory
        imul    eax,ecx
        add     eax,edx
        mov     [esp].sp_ulDst,eax              ; ulDst += cxMemory * cyDuplicate
        mov     eax,[esp].sp_yDst
        mov     [edi+DWG_YDST],eax              ; first destination y of the duplicates
        mov     [edi+DWG_LEN],ecx               ; number of scans to duplicate
        add     ecx,eax
        mov     [esp].sp_yDst,ecx               ; add duplicate count to y
        sub     edx,[esp].sp_cxMemory           ; edx = offset of the scan just drawn
        mov     [edi+DWG_AR3],edx
        add     edx,[esp].sp_XWidthLessOne      ; edx = offset of that scan's last pixel
        mov     [edi+DWG_AR0+StartDwgReg],edx
; Unfortunately, if we try to write to the frame buffer at the
; same time that the MGA is doing the screen-to-screen blt, we
; will get garbage on the screen. Consequently, we always
; wait for idle before writing on the frame buffer:
@@:     cmp     byte ptr [edi+HST_FIFOSTATUS],FIFOSIZE
        jb      @b                              ; CHECK_FIFO_SPACE(32)
@@:     test    byte ptr [edi+HST_STATUS+2],(dwgengsts_MASK shr 16)
        jnz     @b                              ; WAIT_NOT_BUSY()
        mov     edx,[esp].sp_ulDst
; edx = current destination offset
; esi = pointer to source pixel
        cmp     [ebp].str_YDstCount,0           ; we might be all done
        jne     NextScan
Done:
        add     esp,(size STACK_STRUC) - PROC_MEM_SIZE
        pop     ebp
        pop     esi
        pop     edi
        pop     ebx
        ret     4
vMgaDirectStretch8@4 endp
;---------------------------Public-Routine------------------------------;
; VOID vMgaDirectStretch16(pStrBlt)
;
; Stretch blt 16 -> 16
;
; Entry:    [esp+4] = pStrBlt, pointer to a STR_BLT
; Exit:     no return value; the parameter is removed by 'ret 4'
; Saves:    ebx, edi, esi, ebp (pushed on entry, restored at Done)
; Uses:     eax, ecx, edx, flags
; Updates:  str_YDstCount and str_ulYFracAccumulator in *pStrBlt
;
; Same structure as the 8bpp routine, with these 16bpp differences:
; ulDst is kept in pixels and doubled before it is written to
; HST_DSTPAGE; each scan is written as at most one leading pixel, a run
; of dwords (two pixels each, skipped when the count is zero) and at
; most one trailing pixel; the source pointer is halved so that the
; add/adc stepping advances it in pixel units.
;-----------------------------------------------------------------------;
public vMgaDirectStretch16@4
vMgaDirectStretch16@4 proc near
;
; use ebp as general register, use esp for parameter and local access
; save ebp,ebx,esi,edi
;
push ebx
push edi
push esi
push ebp
sub esp,(size STACK_STRUC) - PROC_MEM_SIZE ; make room for local variables
mov ebp,[esp].sp_pSTR_BLT ; load pSTR_BLT into ebp
;
; load up some stack-based parameters to be used by our scan
; duplicator when doing vertical stretches
;
mov eax,[ebp].str_ppdev
mov ecx,[ebp].str_YDstStart ; get start y coordinate
add ecx,[eax].pdev_yOffset ; convert to abs coordinate
mov [esp].sp_yDst,ecx ; save current y coordinate
mov ebx,[eax].pdev_ulYDstOrg
mov edx,[eax].pdev_cxMemory
mov edi,[eax].pdev_pjBase
mov [esp].sp_ulYDstOrg,ebx ; local copy of ulYDstOrg
mov [esp].sp_cxMemory,edx ; local copy of stride
mov [esp].sp_pjBase,edi ; local copy of pjBase
imul ecx,edx ; yDst * cxMemory
add ecx,[ebp].str_XDstStart
add ecx,[eax].pdev_xOffset
add ecx,[eax].pdev_ulYDstOrg
mov [esp].sp_ulDst,ecx ; ulDst = ulYDstOrg
; + (yDst * cxMemory)
; + (XDstStart + xOffset)
; (kept in pixels; doubled at NextScan)
add ecx,ecx ; pixels -> bytes for the pointer computation
and ecx,31
add ecx,SRCWND
add ecx,edi
mov [esp].sp_pjDst,ecx ; pjDst = ((2 * ulDst) & 31)
; + pjBase + SRCWND
;
; calc starting addressing parameters
;
mov esi,[ebp].str_pjSrcScan ; load src DIB pointer
add esi,[ebp].str_XSrcStart
add esi,[ebp].str_XSrcStart ; add 2 * starting Src Pixel
mov [esp].sp_pjSrcScan,esi ; save scan line start pointer
mov eax,[ebp].str_ulYDstToSrcIntCeil ; number of src scan lines to step
mul [ebp].str_lDeltaSrc ; calc scan line int lines to step (clobbers edx; reloaded below)
mov [esp].sp_SrcIntStep,eax ; save int portion of Y src step
mov edx,[ebp].str_XDstStart
and edx,1
mov [esp].sp_LeftCase,edx ; left edge case: 1 if XDstStart is odd, else 0
mov ecx,[ebp].str_XDstEnd ; load x end
mov eax,ecx
and eax,1
mov [esp].sp_RightCase,eax ; right edge case: 1 if XDstEnd is odd, else 0
sub ecx,[ebp].str_XDstStart ; calc x count
lea ebx,[ecx - 1] ; calc width less one
mov [esp].sp_XWidthLessOne,ebx ; save it
sub ecx,eax ; sub right edge from XCount
sub ecx,edx ; sub left edge from XCount
shr ecx,1 ; convert from pixels to DWORD count
mov [esp].sp_XCount,ecx ; save DWORD count (may be zero)
mov ebx,[ebp].str_ulXDstToSrcFracCeil ; get x frac
mov [esp].sp_TempXFrac,ebx ; save x frac to a esp based location (not read again by this routine)
@@: cmp byte ptr [edi+HST_FIFOSTATUS],FIFOSIZE
jb @b ; CHECK_FIFO_SPACE(32)
@@: test byte ptr [edi+HST_STATUS+2],(dwgengsts_MASK shr 16)
jnz @b ; WAIT_NOT_BUSY()
mov edx,[esp].sp_ulDst
NextScan:
; edx = current destination offset (in pixels)
; esi = pointer to source pixel
; edi = pjBase
; ebp = pSTR_BLT
add edx,edx ; convert pixels to bytes
mov [edi+HST_DSTPAGE],edx ; CP_WRITE(HST_DSTPAGE, 2 * ulDst)
mov eax,[ebp].str_ulXDstToSrcIntCeil ; get src integer step for step in dst
mov ebx,[ebp].str_ulXDstToSrcFracCeil ; get src frac step for step in dst
mov edi,[ebp].str_ulXFracAccumulator ; edi = x fraction accumulator for this scan
mov ebp,[esp].sp_pjDst ; get dst pointer to ebp
;
; Can't directly access pSTR_BLT variables through ebp
;
mov ecx,[esp].sp_LeftCase
; eax = integer step in source
; ebx = fractional step in source
; ecx = left edge case
; edx = free for pixel data
; esi = pointer to source pixel
; edi = fractional accumulator
; ebp = pointer to dest pixel
;
; divide 'esi' by 2 so that we can always dereference it by
; [2*esi] -- this allows us to still use an 'add with carry'
; to jump to the next pixel
;
shr esi,1
;
; first do the left side to align dwords
; (LeftCase is 0 or 1, so at most one pixel is written here)
;
test ecx,ecx
jz DwordAligned
mov dx,[2*esi] ; fetch pixel
mov [ebp],dx ; write it out
add edi,ebx ; step fraction
adc esi,eax ; add in integer and possible carry
add ebp,2 ; step 1 pixel (2 bytes) in dest
DwordAligned:
mov ecx,[esp].sp_XCount ; get run length
test ecx,ecx
jz TrailingBytes ; watch for zero dword case
;
; Build one dword from two source pixels. Each 'adc esi,eax' consumes
; the carry from the 'add edi,ebx' before it; only 'mov' sits between
; the second add/adc pair, so the carry is preserved.
;
@@:
mov dx,[2*esi] ; get a source pixel
add edi,ebx ; step fraction
adc esi,eax ; add integer and carry
shl edx,16 ; first pixel to the high word
add edi,ebx ; step fraction
mov dx,[2*esi] ; get source pixel
adc esi,eax ; add integer and carry
ror edx,16 ; swap halves: first pixel low, second pixel high
mov [ebp],edx ; write everything to dest
add ebp,4 ; increment dest pointer by 1 dword
dec ecx ; decrement count
jnz @b ; do more pixels
TrailingBytes:
;
; now do the right side trailing bytes
; (RightCase is 0 or 1, so at most one pixel is written here)
;
mov ecx,[esp].sp_RightCase
test ecx,ecx
jz EndScanLine
mov dx,[2*esi] ; fetch pixel
mov [ebp],dx ; write it out
add edi,ebx ; step fraction
adc esi,eax ; add in integer and possible carry
add ebp,2 ; step 1 pixel (2 bytes) in dest
EndScanLine:
;
; Advance the source scan pointer by the Y step (integer part plus one
; extra scan whenever the fraction accumulator carries).
;
mov edi,ebp ; copy of the dst pointer; NOTE(review): edi is reloaded from
; sp_pjBase (or popped at Done) before it is read again, so
; this value is not consumed in this routine
mov ebp,[esp].sp_pSTR_BLT ; load pSTR_BLT into ebp
mov esi,[esp].sp_pjSrcScan ; load src scan start addr
mov ebx,esi ; save a copy
mov eax,[ebp].str_ulYFracAccumulator ; get .32 part of Y pointer
add eax,[ebp].str_ulYDstToSrcFracCeil ; add in fractional step
jnc @f
add esi,[ebp].str_lDeltaSrc ; step one extra in src
@@: mov [ebp].str_ulYFracAccumulator,eax ; save Y accumulator
add esi,[esp].sp_SrcIntStep ; step int part
mov [esp].sp_pjSrcScan,esi ; save starting scan addr
dec [ebp].str_YDstCount ; decrement scan count
jz Done ; no more scans
inc [esp].sp_yDst ; one scan further down in dst
mov edi,[esp].sp_pjBase
mov edx,[esp].sp_ulDst
add edx,[esp].sp_cxMemory
mov [esp].sp_ulDst,edx ; ulDst += cxMemory
cmp esi,ebx ; is src scan same as before?
jne NextScan ; if so, fall through to dupe scan
;--------------------------------------------------------------------
; The source scan is the same one used for the previous destination
; scan, so we can simply use the hardware to copy the previous
; destination scan.
;
; Since on the MGA we can set up a 'rolling blt' to copy one scan
; line to several scans in a single command, we will count up how
; many times this scan should be duplicated. If your hardware
; cannot do a rolling blt, simply issue a new blt command for
; every time the scan should be duplicated.
;
; eax = ulYFracAccumulator
; ebx = original pjSrcScan
; edx = ulDst, current offset from start of frame buffer to next scan
; esi = current pjSrcScan
; edi = pjBase
; ebp = pSTR_BLT
;
xor ecx,ecx ; number of times scan is to be
; duplicated
AnotherDuplicate:
inc ecx ; one scan further down
dec [ebp].str_YDstCount ; decrement scan count
jz OutputDuplicates ; no more scans
add eax,[ebp].str_ulYDstToSrcFracCeil ; add in fractional step
jnc @f
add esi,[ebp].str_lDeltaSrc ; step one extra in src
@@: add esi,[esp].sp_SrcIntStep ; step int part
cmp esi,ebx ; is src scan same as before?
je AnotherDuplicate
OutputDuplicates:
; ecx = number of destination scans to produce by duplication
mov [esp].sp_pjSrcScan,esi ; save starting scan address
mov [ebp].str_ulYFracAccumulator,eax ; save Y accumulator
mov eax,[esp].sp_cxMemory
imul eax,ecx
add eax,edx
mov [esp].sp_ulDst,eax ; ulDst += cxMemory * cyDuplicate
mov eax,[esp].sp_yDst
mov [edi+DWG_YDST],eax ; first destination y of the duplicates
mov [edi+DWG_LEN],ecx ; number of scans to duplicate
add ecx,eax
mov [esp].sp_yDst,ecx ; add duplicate count to y
sub edx,[esp].sp_cxMemory ; edx = offset of the scan just drawn
mov [edi+DWG_AR3],edx
add edx,[esp].sp_XWidthLessOne ; edx = offset of that scan's last pixel
mov [edi+DWG_AR0+StartDwgReg],edx
; Unfortunately, if we try to write to the frame buffer at the
; same time that the MGA is doing the screen-to-screen blt, we
; will get garbage on the screen. Consequently, we always
; wait for idle before writing on the frame buffer:
@@: cmp byte ptr [edi+HST_FIFOSTATUS],FIFOSIZE
jb @b ; CHECK_FIFO_SPACE(32)
@@: test byte ptr [edi+HST_STATUS+2],(dwgengsts_MASK shr 16)
jnz @b ; WAIT_NOT_BUSY()
mov edx,[esp].sp_ulDst
; edx = current destination offset
; esi = pointer to source pixel
cmp [ebp].str_YDstCount,0 ; we might be all done
jne NextScan
Done:
add esp,(size STACK_STRUC) - PROC_MEM_SIZE
pop ebp
pop esi
pop edi
pop ebx
ret 4
vMgaDirectStretch16@4 endp
end