admin
base
com
developer
drivers
ds
enduser
downlevelclients
mapistub
miscdocuments
msasn1
netmeeting
as
av
callcont
codecs
dec
intel
g723
h261
h263
i386
adjpels.asm
ccinst.inc
clocals.inc
cmemmod.inc
cx1209.asm
cx512161.asm
cx512162.asm
cx512241.asm
cx512242.asm
cx51224z.asm
cx512321.asm
cx512322.asm
cx51281.asm
cx51282.asm
cx5128a1.asm
cx5128a2.asm
cx512y2.asm
cx512yuv.asm
cxm12161.asm
cxm12162.asm
cxm12241.asm
cxm12242.asm
cxm12321.asm
cxm12322.asm
cxm1281.asm
cxm1282.asm
cxm12y2.asm
d35bimot.asm
d35obmc.asm
d35xpand.asm
d3madvpr.asm
d3mbimot.asm
d3mbkadd.asm
d3mbvriq.asm
d3mmc.asm
decconst.inc
dx5aspec.asm
dx5cnvy.asm
dx5frmcp.asm
dxmidct.asm
e35bme.asm
e35vlc.asm
e3dctc.inc
e3inst.inc
e3mbad.inc
e3mbme.asm
e3msig.asm
ex5fdct.asm
ex5me.asm
ex5qrle.asm
exedtq.inc
exmfdct.asm
exmme.asm
exmqrle.asm
iammx.inc
locals.inc
memmodel.inc
yuv12enc.asm
c3_32.def
c3rtp.cpp
c3rtp.h
c_h26x.rc
ccodecid.h
ccpuvsn.cpp
ccpuvsn.h
ccustmsg.h
cdialogs.cpp
cdialogs.h
cdrvbase.cpp
cdrvcom.h
cdrvdefs.h
cdrvproc.cpp
cldebug.cpp
cldebug.h
counters.cpp
counters.h
cproto.h
cresourc.h
ctiming.h
ctypedef.h
cversion.h
d3bef.cpp
d3bvriq.cpp
d3bvriq.h
d3color.cpp
d3coltbl.cpp
d3coltbl.h
d3const.h
d3dec.cpp
d3dec.h
d3gob.cpp
d3gob.h
d3halfmc.cpp
d3halfmc.h
d3idct.cpp
d3idct.h
d3mblk.cpp
d3mblk.h
d3mvdec.cpp
d3mvdec.h
d3pict.cpp
d3pict.h
d3rtp.cpp
d3rtp.h
d3tables.cpp
d3tables.h
dxap.cpp
dxap.h
dxbase.cpp
dxblkadd.cpp
dxblkcpy.cpp
dxcolori.h
dxctrls.cpp
dxfm.h
dxgetbit.cpp
dxgetbit.h
dxidctab.cpp
dxpal.cpp
e3enc.cpp
e3enc.h
e3mbenc.cpp
e3pcolor.cpp
e3pcolor.h
e3rgb16.cpp
e3rgb24.cpp
e3rgb32.cpp
e3rgb4.cpp
e3rgb8.cpp
e3rtp.cpp
e3rtp.h
e3stat.cpp
e3stat.h
e3vlc.h
e3ycrcb.cpp
exbase.cpp
exbitsio.cpp
exbrc.cpp
exbrc.h
excolcnv.cpp
flipuyvy.cpp
makefile
memmon.cpp
memmon.h
precomp.h
sources
dirs
lh
dirs
dcap
filter
h
h323
intelcc
nac
nmcap
qos
rrcm
dirs
core
dev
evtmsg
extern
h
nmmkcert
nmrk
nmutil
setup
srvc
t120
ui
ulsldap
dirs
published
speech
stuff
testlockout
troubleshoot
vc_asms
vccomsupport
windows.com
zone_internetgames
dirs
project.mk
inetcore
inetsrv
loc
mergedcomponents
multimedia
net
printscan
public
published
sdktools
shell
termsrv
tools
windows
dirs
makefil0
3968 lines
147 KiB
NASM
3968 lines
147 KiB
NASM
;/* *************************************************************************
|
|
;** INTEL Corporation Proprietary Information
|
|
;**
|
|
;** This listing is supplied under the terms of a license
|
|
;** agreement with INTEL Corporation and may not be copied
|
|
;** nor disclosed except in accordance with the terms of
|
|
;** that agreement.
|
|
;**
|
|
;** Copyright (c) 1995 Intel Corporation.
|
|
;** All Rights Reserved.
|
|
;**
|
|
;** *************************************************************************
|
|
;*/
|
|
|
|
;////////////////////////////////////////////////////////////////////////////
|
|
;//
|
|
;// $Header: R:\h26x\h26x\src\enc\ex5me.asv 1.17 24 Sep 1996 11:27:00 BNICKERS $
|
|
;//
|
|
;// $Log: R:\h26x\h26x\src\enc\ex5me.asv $
|
|
;//
|
|
;// Rev 1.17 24 Sep 1996 11:27:00 BNICKERS
|
|
;//
|
|
;// Fix register colision.
|
|
;//
|
|
;// Rev 1.16 24 Sep 1996 10:40:32 BNICKERS
|
|
;// For H261, zero out motion vectors when classifying MB as Intra.
|
|
;//
|
|
;// Rev 1.13 19 Aug 1996 13:48:26 BNICKERS
|
|
;// Provide threshold and differential variables for spatial filtering.
|
|
;//
|
|
;// Rev 1.12 17 Jun 1996 15:19:34 BNICKERS
|
|
;// Fix recording of block and MB SWDs for Spatial Loop Filtering case in H261.
|
|
;//
|
|
;// Rev 1.11 30 May 1996 16:40:14 BNICKERS
|
|
;// Fix order of arguments.
|
|
;//
|
|
;// Rev 1.10 30 May 1996 15:08:36 BNICKERS
|
|
;// Fixed minor error in recent IA ME speed improvements.
|
|
;//
|
|
;// Rev 1.9 29 May 1996 15:37:58 BNICKERS
|
|
;// Acceleration of IA version of ME.
|
|
;//
|
|
;// Rev 1.8 15 Apr 1996 10:48:48 AKASAI
|
|
;// Fixed bug in Spatial loop filter code. Code had been unrolled and
|
|
;// the second case had not been updated in the fix put in place of
|
|
;// (for) the first case. Basically an ebx instead of bl that cased
|
|
;// and overflow from 7F to 3F.
|
|
;//
|
|
;// Rev 1.7 15 Feb 1996 15:39:26 BNICKERS
|
|
;// No change.
|
|
;//
|
|
;// Rev 1.6 15 Feb 1996 14:39:00 BNICKERS
|
|
;// Fix bug wherein access to area outside stack frame was occurring.
|
|
;//
|
|
;// Rev 1.5 15 Jan 1996 14:31:40 BNICKERS
|
|
;// Fix decrement of ref area addr when half pel upward is best in block ME.
|
|
;// Broadcast macroblock level MV when block gets classified as Intra.
|
|
;//
|
|
;// Rev 1.4 12 Jan 1996 13:16:08 BNICKERS
|
|
;// Fix SLF so that 3 7F pels doesn't overflow, and result in 3F instead of 7F.
|
|
;//
|
|
;// Rev 1.3 27 Dec 1995 15:32:46 RMCKENZX
|
|
;// Added copyright notice
|
|
;//
|
|
;// Rev 1.2 19 Dec 1995 17:11:16 RMCKENZX
|
|
;// fixed 2 bugs:
|
|
;// 1. do +-15 pel search if central and NOT 4 mv / macroblock
|
|
;// (was doing when central AND 4 mv / macroblock)
|
|
;// 2. correctly compute motion vectors when doing 4 motion
|
|
;// vectors per block.
|
|
;//
|
|
;// Rev 1.1 28 Nov 1995 15:25:48 AKASAI
|
|
;// Added white space so that will complie with the long lines.
|
|
;//
|
|
;// Rev 1.0 28 Nov 1995 14:37:00 BECHOLS
|
|
;// Initial revision.
|
|
;//
|
|
;//
|
|
;// Rev 1.13 22 Nov 1995 15:32:42 DBRUCKS
|
|
;// Brian made this change on my system.
|
|
;// Increased a value to simplify debugging
|
|
;//
|
|
;//
|
|
;//
|
|
;// Rev 1.12 17 Nov 1995 10:43:58 BNICKERS
|
|
;// Fix problems with B-Frame ME.
|
|
;//
|
|
;//
|
|
;//
|
|
;// Rev 1.11 31 Oct 1995 11:44:26 BNICKERS
|
|
;// Save/restore ebx.
|
|
;//
|
|
;////////////////////////////////////////////////////////////////////////////
|
|
;
|
|
; MotionEstimation -- This function performs motion estimation for the macroblocks identified
|
|
; in the input list.
|
|
; Conditional assembly selects either the H263 or H261 version.
|
|
;
|
|
; Input Arguments:
|
|
;
|
|
; MBlockActionStream
|
|
;
|
|
; The list of macroblocks for which we need to perform motion estimation.
|
|
;
|
|
; Upon input, the following fields must be defined:
|
|
;
|
|
; CodedBlocks -- Bit 6 must be set for the last macroblock to be processed.
|
|
;
|
|
; FirstMEState -- must be 0 for macroblocks that are forced to be Intracoded. An
|
|
; IntraSWD will be calculated.
|
|
; Other macroblocks must have the following values:
|
|
; 1: upper left, without advanced prediction. (Advanced prediction
|
|
; only applies to H263.)
|
|
; 2: upper edge, without advanced prediction.
|
|
; 3: upper right, without advanced prediction.
|
|
; 4: left edge, without advanced prediction.
|
|
; 5: central block, or any block if advanced prediction is being done.
|
|
; 6: right edge, without advanced prediction.
|
|
; 7: lower left, without advanced prediction.
|
|
; 8: lower edge, without advanced prediction.
|
|
; 9: lower right, without advanced prediction.
|
|
; If vertical motion is NOT allowed:
|
|
; 10: left edge, without advanced prediction.
|
|
; 11: central block, or any block if advanced prediction is being done.
|
|
; 12: right edge, without advanced prediction.
|
|
; *** Note that with advanced prediction, only initial states 0, 4, or
|
|
; 11 can be specified. Doing block level motion vectors mandates
|
|
; advanced prediction, but in that case, only initial
|
|
; states 0 and 4 are allowed.
|
|
;
|
|
; BlkOffset -- must be defined for each of the blocks in the macroblocks.
|
|
;
|
|
; TargetFrameBaseAddress -- Address of upper left viewable pel in the target Y plane.
|
|
;
|
|
; PreviousFrameBaseAddress -- Address of upper left viewable pel in the previous Y plane. Whether this is the
|
|
; reconstructed previous frame, or the original, is up to the caller to decide.
|
|
;
|
|
; FilteredFrameBaseAddress -- Address of upper left viewable pel in the scratch area that this function can record
|
|
; the spatially filtered prediction for each block, so that frame differencing can
|
|
; utilize it rather than have to recompute it. (H261 only)
|
|
;
|
|
; DoRadius15Search -- TRUE if central macroblocks should search a distance of 15 from center. Else searches 7 out.
|
|
;
|
|
; DoHalfPelEstimation -- TRUE if we should do ME to half pel resolution. This is only applicable for H263 and must
|
|
; be FALSE for H261. (Note: TRUE must be 1; FALSE must be 0).
|
|
;
|
|
; DoBlockLevelVectors -- TRUE if we should do ME at block level. This is only applicable for H263 and must be FALSE
|
|
; for H261. (Note: TRUE must be 1; FALSE must be 0).
|
|
; DoSpatialFiltering -- TRUE if we should determine if spatially filtering the prediction reduces the SWD. Only
|
|
; applicable for H261 and must be FALSE for H263. (Note: TRUE must be 1; FALSE must be 0).
|
|
;
|
|
; ZeroVectorThreshold -- If the SWD for a macroblock is less than this threshold, we do not bother searching for a
|
|
; better motion vector. Compute as follows, where D is the average tolerable pel difference
|
|
; to satisfy this threshold. (Initial recommendation: D=2 ==> ZVT=384)
|
|
; ZVT = (128 * ((int)((D**1.6)+.5)))
|
|
;
|
|
; NonZeroDifferential -- After searching for the best motion vector (or individual block motion vectors, if enabled),
|
|
; if the macroblock's SWD is not better than it was for the zero vector -- not better by at
|
|
; least this amount -- then we revert to the zero vector. We are comparing two macroblock
|
|
; SWDs, both calculated as follows: (Initial recommendation: NZD=128)
|
|
; For each of 128 match points, where D is its Abs Diff, accumulate ((int)(M**1.6)+.5)))
|
|
;
|
|
; BlockMVDifferential -- The amount by which the sum of four block level SWDs must be better than a single macroblock
|
|
; level SWD to cause us to choose block level motion vectors. See NonZeroDifferential for
|
|
; how the SWDs are calculated. Only applicable for H261. (Initial recommendation: BMVD=128)
|
|
;
|
|
; EmptyThreshold -- If the SWD for a block is less than this, the block is forced empty. Compute as follows, where D
|
|
; is the average tolerable pel diff to satisfy threshold. (Initial recommendation: D=3 ==> ET=96)
|
|
; ET = (32 * ((int)((D**1.6)+.5)))
|
|
;
|
|
; InterCodingThreshold -- If any of the blocks are forced empty, we can simply skip calculating the INTRASWD for the
|
|
; macroblock. If none of the blocks are forced empty, we will compare the macroblock's SWD
|
|
; against this threshold. If below the threshold, we will likewise skip calculating the
|
|
; INTRASWD. Otherwise, we will calculate the INTRASWD, and if it is less than the [Inter]SWD,
|
|
; we will classify the block as INTRA-coded. Compute as follows, where D is the average
|
|
; tolerable pel difference to satisfy threshold. (Initial recommendation: D=4 ==> ICT=1152)
|
|
; ICT = (128 * ((int)((D**1.6)+.5)))
|
|
;
|
|
; IntraCodingDifferential -- For INTRA coding to occur, the INTRASWD must be better than the INTERSWD by at least
|
|
; this amount.
|
|
;
|
|
; Output Arguments
|
|
;
|
|
; MBlockActionStream
|
|
;
|
|
; These fields are defined as follows upon return:
|
|
;
|
|
; BlockType -- Set to INTRA, INTER1MV, or (H263 only) INTER4MV.
|
|
;
|
|
; PHMV and PVMV -- The horizontal and vertical motion vectors, in units of a half pel.
|
|
;
|
|
; BHMV and BVMV -- These fields get clobbered.
|
|
;
|
|
; PastRef -- If BlockType != INTRA, set to the address of the reference block.
|
|
;
|
|
; If Horizontal MV indicates a half pel position, the prediction for the upper left pel of the block
|
|
; is the average of the pel at PastRef and the one at PastRef+1.
|
|
;
|
|
; If Vertical MV indicates a half pel position, the prediction for the upper left pel of the block
|
|
; is the average of the pel at PastRef and the one at PastRef+PITCH.
|
|
;
|
|
; If both MVs indicate half pel positions, the prediction for the upper left pel of the block is the
|
|
; average of the pels at PastRef, PastRef+1, PastRef+PITCH, and PastRef+PITCH+1.
|
|
;
|
|
; Indications of a half pel position can only happen for H263.
|
|
;
|
|
; In H261, when spatial filtering is done, the address will be in the SpatiallyFilteredFrame, where
|
|
; this function stashes the spatially filtered prediction for subsequent reuse by frame differencing.
|
|
;
|
|
; CodedBlocks -- Bits 4 and 5 are turned on, indicating that the U and V blocks should be processed. (If the
|
|
; FDCT function finds them to quantize to empty, it will mark them as empty.)
|
|
;
|
|
; Bits 0 thru 3 are cleared for each of blocks 1 thru 4 that MotionEstimation forces empty;
|
|
; they are set otherwise.
|
|
;
|
|
; Bits 6 and 7 are left unchanged.
|
|
;
|
|
; SWD -- Set to the sum of the SWDs for the four luma blocks in the macroblock. The SWD for any block that is
|
|
; forced empty, is NOT included in the sum.
|
|
;
|
|
;
|
|
;
|
|
; IntraSWDTotal -- The sum of the block SWDs for all Intracoded macroblocks.
|
|
;
|
|
; IntraSWDBlocks -- The number of blocks that make up the IntraSWDTotal.
|
|
;
|
|
; InterSWDTotal -- The sum of the block SWDs for all Intercoded macroblocks.
|
|
; None of the blocks forced empty are included in this.
|
|
;
|
|
; InterSWDBlocks -- The number of blocks that make up the InterSWDTotal.
|
|
;
|
|
;
|
|
; Other assumptions:
|
|
;
|
|
; For performance reasons, it is assumed that the layout of current and previous frames (and spatially filtered
|
|
; frame for H261) rigourously conforms to the following guide.
|
|
;
|
|
; The spatially filtered frame (only present and applicable for H261) is an output frame into which MotionEstimation
|
|
; places spatially filtered macroblocks as it determines if filtering is good for a macroblock. If it determines
|
|
; such, frame differencing will be able to re-use the spatially filtered macroblock, rather than recomputing it.
|
|
;
|
|
; Cache
|
|
; Alignment
|
|
; Points: v v v v v v v v v v v v v
|
|
; 16 | 352 (narrower pictures are left justified) | 16
|
|
; +---+---------------------------------------------------------------------------------------+---+
|
|
; | D | Current Frame Y Plane | D |
|
|
; | u | | u |
|
|
; Frame | m | | m |
|
|
; Height | m | | m |
|
|
; Lines | y | | y |
|
|
; | | | |
|
|
; +---+---------------------------------------------------------------------------------------+---+
|
|
; | |
|
|
; | |
|
|
; | |
|
|
; 24 lines | Dummy Space (24 lines plus 8 bytes. Can be reduced to 8 bytes if unrestricted motion |
|
|
; | vectors is NOT selected.) |
|
|
; | |
|
|
; | 8 176 16 176 |8
|
|
; | +-+-------------------------------------------------------------------------------------------+-+
|
|
; +-+D| Current Frame U Plane | D | Current Frame V Plane |D|
|
|
; Frame |u| | u | |u|
|
|
; Height |m| | m | |m|
|
|
; Div By 2 |m| | m | |m|
|
|
; Lines |y| | y | |y|
|
|
; +-+-------------------------------------------+---+-------------------------------------------+-+
|
|
; 72 dummy bytes. I.e. enough dummy space to assure that MOD ((Previous_Frame - Current_Frame), 128) == 80
|
|
; +-----------------------------------------------------------------------------------------------+
|
|
; | |
|
|
; 16 lines | If Unrestricted Motion Vectors selected, 16 lines must appear above and below previous frame, |
|
|
; | and these lines plus the 16 columns to the left and 16 columns to the right of the previous |
|
|
; | frame must be initialized to the values at the edges and corners, propagated outward. If |
|
|
; | Unrestricted Motion Vectors is off, these lines don't have to be allocated. |
|
|
; | |
|
|
; | +---------------------------------------------------------------------------------------+ +
|
|
; Frame | | Previous Frame Y Plane | |
|
|
; Height | | | |
|
|
; Lines | | | |
|
|
; | | | |
|
|
; | | | |
|
|
; | +---------------------------------------------------------------------------------------+ +
|
|
; | |
|
|
; 16 lines | See comment above Previous Y Plane |
|
|
; | |
|
|
; |+--- 8 bytes of dummy space. Must be there, whether unrestricted MV or not. |
|
|
; || |
|
|
; |v+-----------------------------------------------+---------------------------------------------+-+
|
|
; +-+ | |
|
|
; | See comment above Previous Y Plane. | See comment above Previous Y Plane. |
|
|
; 8 lines | Same idea here, but 8 lines are needed above | Same idea here, but 8 lines are needed |
|
|
; | and below U plane, and 8 columns on each side.| and below V plane, and 8 columns on each side.|
|
|
; | | |
|
|
; |8 176 8|8 176 8|
|
|
; | +-------------------------------------------+ | +-------------------------------------------+ |
|
|
; | | Previous Frame U Plane | | | Previous Frame V Plane | |
|
|
; Frame | | | | | | |
|
|
; Height | | | | | | |
|
|
; Div By 2 | | | | | | |
|
|
; Lines | | | | | | |
|
|
; | +-------------------------------------------+ | +-------------------------------------------+ |
|
|
; | | |
|
|
; 8 lines | See comment above Previous U Plane | See comment above Previous V Plane |
|
|
; | | |
|
|
; | | |
|
|
; | | |
|
|
; +-----------------------------------------------+---------------------------------------------+-+
|
|
; Enough dummy space to assure that MOD ((Spatial_Frame - Previous_Frame), 4096) == 2032
|
|
; +---+---------------------------------------------------------------------------------------+---+
|
|
; | D | Spatially Filtered Y Plane (present only for H261) | D |
|
|
; | u | | u |
|
|
; Frame | m | | m |
|
|
; Height | m | | m |
|
|
; Lines | y | | y |
|
|
; | | | |
|
|
; +---+---------------------------------------------------------------------------------------+---+
|
|
; | |
|
|
; | |
|
|
; | |
|
|
; 24 lines | Dummy Space (24 lines plus 8 bytes. Can be reduced to 8 bytes if unrestricted motion |
|
|
; | vectors is NOT selected, which is certainly the case for H261.) |
|
|
; | |
|
|
; | 8 176 16 176 |8
|
|
; | +-+-------------------------------------------------------------------------------------------+-+
|
|
; +-+D| Spatially Filtered U plane (H261 only) | D | Spatially Filtered V plane (H261 only) |D|
|
|
; Frame |u| | u | |u|
|
|
; Height |m| | m | |m|
|
|
; Div By 2 |m| | m | |m|
|
|
; Lines |y| | y | |y|
|
|
; +-+-------------------------------------------+---+-------------------------------------------+-+
|
|
;
|
|
; Cache layout of the target block and the full range for the reference area (as restricted to +/- 7 in vertical,
|
|
; and +/- 7 (expandable to +/- 15) in horizontal, is as shown here. Each box represents a cache line (32 bytes),
|
|
; increasing incrementally from left to right, and then to the next row (like reading a book). The 128 boxes taken
|
|
; as a whole represent 4Kbytes. The boxes are populated as follows:
|
|
;
|
|
; R -- Data from the reference area. Each box contains 23 of the pels belonging to a line of the reference area.
|
|
; The remaining 7 pels of the line is either in the box to the left (for reference areas used to provide
|
|
; predictions for target macroblocks that begin at an address 0-mod-32), or to the right (for target MBs that
|
|
; begin at an address 16-mod-32). There are 30 R's corresponding to the 30-line limit on the vertical distance
|
|
; we might search.
|
|
;
|
|
; T -- Data from the target macroblock. Each box contains a full line (16 pels) for each of two adjacent
|
|
; macroblocks. There are 16 T's corresponding to the 16 lines of the macroblocks.
|
|
;
|
|
; S -- Space for the spatially filtered macroblock (H261 only).
|
|
;
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | R | | T | | R | | S | | R | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | R | | T | | R | | S | | R | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | R | | T | | R | | S | | R | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | R | | T | | R | | S | | R | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | R | | T | | R | | S | | R | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | R | | S | | R | | S | | R | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | R | | S | | R | | S | | R | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | R | | S | | R | | S | | R | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | R | | S | | R | | S | | R | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | R | | S | | R | | S | | R | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | R | | S | | R | |
|
|
; +---+---+---+---+---+---+---+---+
|
|
;
|
|
; Thus, in a logical sense, the above data fits into one of the 4K data cache pages, leaving the other for all other
|
|
; data. Care has been taken to assure that the tables and the stack space needed by this function fit nicely into
|
|
; the other data cache page. Only the MBlockActionStream remains to conflict with the above data structures. That
|
|
; is both unavoidable, and of minimal consequence.
|
|
; An algorithm has been selected that calculates fewer SWDs (Sum of Weighted Differences) than the typical log search.
|
|
; In the typical log search, a three level search is done, in which the SWDs are compared for the center point and a
|
|
; point at each 45 degrees, initially 4 pels away, then 2, then 1. This requires a total of 25 SWDs for each
|
|
; macroblock (except those near edges or corners).
|
|
;
|
|
; In this algorithm, six levels are performed, with each odd level being a horizontal search, and each even level being
|
|
; a vertical search. Each search compares the SWD for the center point with that of a point in each direction on the
|
|
; applicable axis. This requires 13 SWDs, and a lot simpler control structure. Here is an example picture of a
|
|
; search, in which "0" represents the initial center point (the 0,0 motion vector), "A", and "a" represent the first
|
|
; search points, etc. In this example, the "winner" of each level of the search proceeds as follows: a, B, C, C, E, F,
|
|
; arriving at a motion vector of -1 horizontal, 5 vertical.
|
|
;
|
|
; ...............
|
|
; ...............
|
|
; ...............
|
|
; ...b...........
|
|
; ...............
|
|
; ...............
|
|
; ...............
|
|
; ...a...0...A...
|
|
; ...............
|
|
; .....d.........
|
|
; ......f........
|
|
; .c.BeCE........
|
|
; ......F........
|
|
; .....D.........
|
|
; ...............
|
|
;
|
|
;
|
|
; A word about data cache performance. Conceptually, the tables and local variables used by this function are placed
|
|
; in memory such that they will fit in one 4K page of the on-chip data cache. For the Pentium (tm) microprocessor,
|
|
; this leaves the other 4K page for other purposes. The other data structures consist of:
|
|
;
|
|
; The current frame, from which we need to access the lines of the 16*16 macroblock. Since cache lines are 32 bytes
|
|
; wide, the cache fill operations that fetch one target macroblock will serve to fetch the macroblock to the right,
|
|
; so an average of 8 cache lines are fetched for each macroblock.
|
|
;
|
|
; The previous frame, from which we need to access a reference area of 30*30 pels. For each macroblock for which we
|
|
; need to search for a motion vector, we will typically need to access no more than about 25 of these, but in general
|
|
; these lines span the 30 lines of the search area. Since cache lines are 32 bytes wide, the cache fill operations
|
|
; that fetch reference data for one macroblock, will tend to fetch data that is useful as reference data for the
|
|
; macroblock to the right, so an average of about 15 (rounded up to be safe) cache lines are fetched for each
|
|
; macroblock.
|
|
;
|
|
; The MBlockActionStream, which controls the searching (since we don't need to motion estimate blocks that are
|
|
; legislated to be intra) will disrupt cache behaviour of the other data structures, but not to a significant degree.
|
|
;
|
|
; By setting the pitch to a constant of 384, and by allocating the frames as described above, the one available 4K page
|
|
; of data cache will be able to contain the 30 lines of the reference area, the 16 lines of the target area, and the
|
|
; 16 lines of the spatially filtered area (H261 only) without any collisions.
|
|
;
|
|
;
|
|
; Here is a flowchart of the major sections of this function:
|
|
;
|
|
; +-- Execute once for Y part of each macroblock that is NOT Intra By Decree --+
|
|
; | |
|
|
; | +---------------------------------------------------------------+ |
|
|
; | | 1) Compute average value for target match points. | |
|
|
; | | 2) Prepare match points in target MB for easier matching. | |
|
|
; | | 3) Compute the SWD for (0,0) motion vector. | |
|
|
; | +---------------------------------------------------------------+ |
|
|
; | | |
|
|
; | v |
|
|
; | /---------------------------------\ Yes |
|
|
; | < 4) Is 0-motion SWD good enough? >-------------------------+ |
|
|
; | \---------------------------------/ | |
|
|
; | | | |
|
|
; | |No | |
|
|
; | v | |
|
|
; | +--- 5) While state engine has more motion vectors to check ---+ | |
|
|
; | | | | |
|
|
; | | | | |
|
|
; | | +---------------------------------------------------+ | | |
|
|
; | | | 5) Compute SWDs for 2 ref MBs and pick best of 3. |----->| | |
|
|
; | | +---------------------------------------------------+ | | |
|
|
; | | | | |
|
|
; | +--------------------------------------------------------------+ | |
|
|
; | | | |
|
|
; | v | |
|
|
; | /-----------------------------------------\ | |
|
|
; | < 6) Is best motion vector the 0-vector? > | |
|
|
; | \-----------------------------------------/ | |
|
|
; | | | | |
|
|
; | |No |Yes | |
|
|
; | v v | |
|
|
; | +-----------------+ +-------------------------------------------+ | |
|
|
; | | Mark all blocks | | 6) Identify as empty block any in which: |<-+ |
|
|
; | +--| non-empty. | | --> 0-motion SWD < EmptyThresh, and | |
|
|
; | | +-----------------+ +-------------------------------------------+ |
|
|
; | | | |
|
|
; | | v |
|
|
; | | /--------------------------------\ Yes +--------------------------+ |
|
|
; | | < 6) Are all blocks marked empty? >--->| 6) Classify FORCEDEMPTY |-->|
|
|
; | | \--------------------------------/ +--------------------------+ |
|
|
; | | | |
|
|
; | | |No |
|
|
; | | v |
|
|
; | | /--------------------------------------------\ |
|
|
; | | < 7) Are any non-phantom blocks marked empty? > |
|
|
; | | \--------------------------------------------/ |
|
|
; | | | | |
|
|
; | | |No |Yes |
|
|
; | v v v |
|
|
; | +---------------------+ +--------------------------------+ |
|
|
; | | 8) Compute IntraSWD | | Set IntraSWD artificially high | |
|
|
; | +---------------------+ +--------------------------------+ |
|
|
; | | | |
|
|
; | v v |
|
|
; | +-------------------------------+ |
|
|
; | | 10) Classify block as one of: | |
|
|
; | | INTRA |--------------------------------->|
|
|
; | | INTER | |
|
|
; | +-------------------------------+ |
|
|
; | |
|
|
; +----------------------------------------------------------------------------+
|
|
;
|
|
;
|
|
|
|
OPTION PROLOGUE:None
|
|
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
|
|
OPTION M510
|
|
|
|
include e3inst.inc
|
|
include e3mbad.inc
|
|
|
|
.xlist
|
|
include memmodel.inc
|
|
.list
|
|
.DATA
|
|
|
|
; Storage for tables and temps used by Motion Estimation function. Fit into
|
|
; 4Kbytes contiguous memory so that it uses one cache page, leaving other
|
|
; for reference area of previous frame and target macroblock of current frame.
|
|
|
|
PickPoint DB 0,4,?,4,0,?,2,2 ; Map CF accum to new central pt selector.
|
|
PickPoint_BLS DB 6,4,?,4,6,?,2,2 ; Same, for when doing block level search.
|
|
|
|
OffsetToRef LABEL DWORD ; Linearized adjustments to affect horz/vert motion.
|
|
DD ? ; This index used when zero-valued motion vector is good enough.
|
|
DD 0 ; Best fit of 3 SWDs is previous center.
|
|
DD 1 ; Best fit of 3 SWDs is the ref block 1 pel to the right.
|
|
DD -1 ; Best fit of 3 SWDs is the ref block 1 pel to the left.
|
|
DD 1*PITCH ; Best fit of 3 SWDs is the ref block 1 pel above.
|
|
DD -1*PITCH ; Best fit of 3 SWDs is the ref block 1 pel below.
|
|
DD 2 ; Best fit of 3 SWDs is the ref block 2 pels to the right.
|
|
DD -2 ; Best fit of 3 SWDs is the ref block 2 pels to the left.
|
|
DD 2*PITCH ; Best fit of 3 SWDs is the ref block 2 pel above.
|
|
DD -2*PITCH ; Best fit of 3 SWDs is the ref block 2 pel below.
|
|
DD 4 ; Best fit of 3 SWDs is the ref block 4 pels to the right.
|
|
DD -4 ; Best fit of 3 SWDs is the ref block 4 pels to the left.
|
|
DD 4*PITCH ; Best fit of 3 SWDs is the ref block 4 pel above.
|
|
DD -4*PITCH ; Best fit of 3 SWDs is the ref block 4 pel below.
|
|
DD 7 ; Best fit of 3 SWDs is the ref block 7 pels to the right.
|
|
DD -7 ; Best fit of 3 SWDs is the ref block 7 pels to the left.
|
|
DD 7*PITCH ; Best fit of 3 SWDs is the ref block 7 pel above.
|
|
DD -7*PITCH ; Best fit of 3 SWDs is the ref block 7 pel below.
|
|
|
|
M0 = 4 ; Define symbolic indices into OffsetToRef lookup table.
|
|
MHP1 = 8
|
|
MHN1 = 12
|
|
MVP1 = 16
|
|
MVN1 = 20
|
|
MHP2 = 24
|
|
MHN2 = 28
|
|
MVP2 = 32
|
|
MVN2 = 36
|
|
MHP4 = 40
|
|
MHN4 = 44
|
|
MVP4 = 48
|
|
MVN4 = 52
|
|
MHP7 = 56
|
|
MHN7 = 60
|
|
MVP7 = 64
|
|
MVN7 = 68
|
|
|
|
; Map linearized motion vector to vertical part.
|
|
; (Mask bottom byte of linearized MV to zero, then use result
|
|
; as index into this array to get vertical MV.)
|
|
IF PITCH-384
|
|
*** error: The magic of this table assumes a pitch of 384.
|
|
ENDIF
|
|
DB -32, -32
|
|
DB -30
|
|
DB -28, -28
|
|
DB -26
|
|
DB -24, -24
|
|
DB -22
|
|
DB -20, -20
|
|
DB -18
|
|
DB -16, -16
|
|
DB -14
|
|
DB -12, -12
|
|
DB -10
|
|
DB -8, -8
|
|
DB -6
|
|
DB -4, -4
|
|
DB -2
|
|
DB 0
|
|
UnlinearizedVertMV DB 0
|
|
DB 2
|
|
DB 4, 4
|
|
DB 6
|
|
DB 8, 8
|
|
DB 10
|
|
DB 12, 12
|
|
DB 14
|
|
DB 16, 16
|
|
DB 18
|
|
DB 20, 20
|
|
DB 22
|
|
DB 24, 24
|
|
DB 26
|
|
DB 28, 28
|
|
DB 30
|
|
|
|
; Map initial states to initializers for half pel search. Where search would
|
|
; illegally take us off edge of picture, set initializer artificially high.
|
|
|
|
InitHalfPelSearchHorz LABEL DWORD
|
|
DD 040000000H, 000000000H, 000004000H
|
|
DD 040000000H, 000000000H, 000004000H
|
|
DD 040000000H, 000000000H, 000004000H
|
|
DD 040000000H, 000000000H, 000004000H
|
|
|
|
InitHalfPelSearchVert LABEL DWORD
|
|
DD 040000000H, 040000000H, 040000000H
|
|
DD 000000000H, 000000000H, 000000000H
|
|
DD 000004000H, 000004000H, 000004000H
|
|
DD 040004000H, 040004000H, 040004000H
|
|
|
|
|
|
SWDState LABEL BYTE ; Rules that govern state engine of motion estimator.
|
|
|
|
DB 8 DUP (?) ; 0: not used.
|
|
|
|
; 1: Upper Left Corner. Explore 4 right and 4 down.
|
|
DB 21, M0 ; (0,0)
|
|
DB 22, MHP4 ; (0,4)
|
|
DB 23, MVP4, ?, ? ; (4,0)
|
|
|
|
; 2: Upper Edge. Explore 4 left and 4 right.
|
|
DB 22, M0 ; (0, 0)
|
|
DB 22, MHN4 ; (0,-4)
|
|
DB 22, MHP4, ?, ? ; (0, 4)
|
|
|
|
; 3: Upper Right Corner. Explore 4 right and 4 down.
|
|
DB 31, M0 ; (0, 0)
|
|
DB 22, MHN4 ; (0,-4)
|
|
DB 32, MVP4, ?, ? ; (4, 0)
|
|
|
|
; 4: Left Edge. Explore 4 up and 4 down.
|
|
DB 23, M0 ; ( 0,0)
|
|
DB 23, MVN4 ; (-4,0)
|
|
DB 23, MVP4, ?, ? ; ( 4,0)
|
|
|
|
; 5: Interior Macroblock. Explore 4 up and 4 down.
|
|
DB 37, M0 ; ( 0,0)
|
|
DB 37, MVN4 ; (-4,0)
|
|
DB 37, MVP4, ?, ? ; ( 4,0)
|
|
|
|
; 6: Right Edge. Explore 4 up and 4 down.
|
|
DB 32, M0 ; ( 0,0)
|
|
DB 32, MVN4 ; (-4,0)
|
|
DB 32, MVP4, ?, ? ; ( 4,0)
|
|
|
|
; 7: Lower Left Corner. Explore 4 up and 4 right.
|
|
DB 38, M0 ; ( 0,0)
|
|
DB 39, MHP4 ; ( 0,4)
|
|
DB 23, MVN4, ?, ? ; (-4,0)
|
|
|
|
; 8: Lower Edge. Explore 4 left and 4 right.
|
|
DB 39, M0 ; (0, 0)
|
|
DB 39, MHN4 ; (0,-4)
|
|
DB 39, MHP4, ?, ? ; (0, 4)
|
|
|
|
; 9: Lower Right Corner. Explore 4 up and 4 left.
|
|
DB 44, M0 ; ( 0, 0)
|
|
DB 39, MHN4 ; ( 0,-4)
|
|
DB 32, MVN4, ?, ? ; (-4, 0)
|
|
|
|
; 10: Left Edge, No Vertical Motion Allowed.
|
|
DB 46, M0 ; (0,0)
|
|
DB 48, MHP2 ; (0,2)
|
|
DB 47, MHP4, ?, ? ; (0,4)
|
|
|
|
; 11: Interior Macroblock, No Vertical Motion Allowed.
|
|
DB 47, M0 ; (0, 0)
|
|
DB 47, MHN4 ; (0,-4)
|
|
DB 47, MHP4, ?, ? ; (0, 4)
|
|
|
|
; 12: Right Edge, No Vertical Motion Allowed.
|
|
DB 49, M0 ; (0, 0)
|
|
DB 48, MHN2 ; (0,-2)
|
|
DB 47, MHN4, ?, ? ; (0,-4)
|
|
|
|
; 13: Horz by 2, Vert by 2, Horz by 1, Vert by 1.
|
|
DB 14, M0
|
|
DB 14, MHP2
|
|
DB 14, MHN2, ?, ?
|
|
|
|
; 14: Vert by 2, Horz by 1, Vert by 1.
|
|
DB 15, M0
|
|
DB 15, MVP2
|
|
DB 15, MVN2, ?, ?
|
|
|
|
; 15: Horz by 1, Vert by 1.
|
|
DB 16, M0
|
|
DB 16, MHP1
|
|
DB 16, MHN1, ?, ?
|
|
|
|
; 16: Vert by 1.
|
|
DB 0, M0
|
|
DB 0, MVP1
|
|
DB 0, MVN1, ?, ?
|
|
|
|
; 17: Vert by 2, Horz by 2, Vert by 1, Horz by 1.
|
|
DB 18, M0
|
|
DB 18, MVP2
|
|
DB 18, MVN2, ?, ?
|
|
|
|
; 18: Horz by 2, Vert by 1, Horz by 1.
|
|
DB 19, M0
|
|
DB 19, MHP2
|
|
DB 19, MHN2, ?, ?
|
|
|
|
; 19: Vert by 1, Horz by 1.
|
|
DB 20, M0
|
|
DB 20, MVP1
|
|
DB 20, MVN1, ?, ?
|
|
|
|
; 20: Horz by 1.
|
|
DB 0, M0
|
|
DB 0, MHP1
|
|
DB 0, MHN1, ?, ?
|
|
|
|
; 21: From 1A. Upper Left. Try 2 right and 2 down.
|
|
DB 24, M0 ; (0, 0)
|
|
DB 25, MHP2 ; (0, 2)
|
|
DB 26, MVP2, ?, ? ; (2, 0)
|
|
|
|
; 22: From 1B.
|
|
; From 2 center point would be (0,-4/0/4).
|
|
; From 3B center point would be (0,-4).
|
|
DB 27, M0 ; (0, 4)
|
|
DB 18, MVP2 ; (2, 4) Next: Horz 2, Vert 1, Horz 1. (1:3,1:7)
|
|
DB 13, MVP4, ?, ? ; (4, 4) Next: Horz 2, Vert 2, Horz 1, Vert 1. (1:7,1:7)
|
|
|
|
; 23: From 1C.
|
|
; From 4 center point would be (-4/0/4,0).
|
|
; From 7C center point would be (-4,0).
|
|
DB 29, M0 ; (4, 0)
|
|
DB 14, MHP2 ; (4, 2) Next: Vert 2, Horz 1, Vert 1. (1:7,1:3)
|
|
DB 17, MHP4, ?, ? ; (4, 4) Next: Vert 2, Horz 2, Vert 1, Horz 1. (1:7,1:7)
|
|
|
|
; 24: From 21A. Upper Left. Try 1 right and 1 down.
|
|
DB 0, M0 ; (0, 0)
|
|
DB 0, MHP1 ; (1, 0)
|
|
DB 0, MVP1, ?, ? ; (0, 1)
|
|
|
|
; 25: From 21B.
|
|
; From 31B center point would be (0,-2).
|
|
DB 20, M0 ; (0, 2) Next: Horz 1 (0,1:3)
|
|
DB 20, MVP1 ; (1, 2) Next: Horz 1 (1,1:3)
|
|
DB 15, MVP2, ?, ? ; (2, 2) Next: Horz 1, Vert 1 (1:3,1:3)
|
|
|
|
; 26: From 21C.
|
|
; From 38C center point would be (-2,0).
|
|
DB 16, M0 ; (2, 0) Next: Vert 1 (1:3,0)
|
|
DB 16, MHP1 ; (2, 1) Next: Vert 1 (1:3,1)
|
|
DB 19, MHP2, ?, ? ; (2, 2) Next: Vert 1, Horz 1 (1:3,1:3)
|
|
|
|
; 27: From 22A.
|
|
DB 28, M0 ; (0, 4)
|
|
DB 28, MHN2 ; (0, 2)
|
|
DB 28, MHP2, ?, ? ; (0, 6)
|
|
|
|
; 28: From 27.
|
|
DB 20, M0 ; (0, 2/4/6) Next: Horz 1. (0,1:7)
|
|
DB 20, MVP1 ; (1, 2/4/6) Next: Horz 1. (1,1:7)
|
|
DB 20, MVP2, ?, ? ; (2, 2/4/6) Next: Horz 1. (2,1:7)
|
|
|
|
; 29: From 23A.
|
|
DB 30, M0 ; (4, 0)
|
|
DB 30, MVN2 ; (2, 0)
|
|
DB 30, MVP2, ?, ? ; (6, 0)
|
|
|
|
; 30: From 29.
|
|
DB 16, M0 ; (2/4/6, 0) Next: Vert 1. (1:7,0)
|
|
DB 16, MHP1 ; (2/4/6, 1) Next: Vert 1. (1:7,1)
|
|
DB 16, MHP2, ?, ? ; (2/4/6, 2) Next: Vert 1. (1:7,2)
|
|
|
|
; 31: From 3A. Upper Right. Try 2 left and 2 down.
|
|
DB 33, M0 ; (0, 0)
|
|
DB 25, MHN2 ; (0,-2)
|
|
DB 34, MVP2, ?, ? ; (2, 0)
|
|
|
|
; 32: From 3C.
|
|
; From 6 center point would be (-4/0/4, 0)
|
|
; From 9C center point would be (-4, 0)
|
|
DB 35, M0 ; (4, 0)
|
|
DB 14, MHN2 ; (4,-2) Next: Vert2,Horz1,Vert1. (1:7,-1:-3)
|
|
DB 17, MHN4, ?, ? ; (4,-4) Next: Vert2,Horz2,Vert1,Horz1. (1:7,-1:-7)
|
|
|
|
; 33: From 31A. Upper Right. Try 1 left and 1 down.
|
|
DB 0, M0 ; (0, 0)
|
|
DB 0, MHN1 ; (0,-1)
|
|
DB 0, MVP1, ?, ? ; (1, 0)
|
|
|
|
; 34: From 31C.
|
|
; From 44C center point would be (-2, 0)
|
|
DB 16, M0 ; (2, 0) Next: Vert 1 (1:3, 0)
|
|
DB 16, MHN1 ; (2,-1) Next: Vert 1 (1:3,-1)
|
|
DB 19, MHN2, ?, ? ; (2,-2) Next: Vert 1, Horz 1 (1:3,-1:-3)
|
|
|
|
; 35: From 32A.
|
|
DB 36, M0 ; (4, 0)
|
|
DB 36, MVN2 ; (2, 0)
|
|
DB 36, MVP2, ?, ? ; (6, 0)
|
|
|
|
; 36: From 35.
|
|
DB 16, M0 ; (2/4/6, 0) Next: Vert 1. (1:7, 0)
|
|
DB 16, MHN1 ; (2/4/6,-1) Next: Vert 1. (1:7,-1)
|
|
DB 16, MHN2, ?, ? ; (2/4/6,-2) Next: Vert 1. (1:7,-2)
|
|
|
|
; 37: From 5.
|
|
DB 17, M0 ; (-4/0/4, 0) Next: Vert2,Horz2,Vert1,Horz1 (-7:7,-3: 3)
|
|
DB 17, MHP4 ; (-4/0/4,-4) Next: Vert2,Horz2,Vert1,Horz1 (-7:7, 1: 7)
|
|
DB 17, MHN4, ?, ? ; (-4/0/4, 4) Next: Vert2,Horz2,Vert1,Horz1 (-7:7,-7:-1)
|
|
|
|
; 38: From 7A. Lower Left. Try 2 right and 2 up.
|
|
DB 42, M0 ; ( 0,0)
|
|
DB 43, MHP2 ; ( 0,2)
|
|
DB 26, MVN2, ?, ? ; (-2,0)
|
|
|
|
; 39: From 13B.
|
|
; From 14 center point would be (0,-4/0/4)
|
|
; From 16B center point would be (0,-4)
|
|
DB 40, M0 ; ( 0,4)
|
|
DB 18, MVN2 ; (-2,4) Next: Horz2,Vert1,Horz1. (-3:-1,1:7)
|
|
DB 13, MVN4, ?, ? ; (-4,4) Next: Horz2,Vert2,Horz1,Vert1. (-7:-1,1:7)
|
|
|
|
; 40: From 39A.
|
|
DB 41, M0 ; (0, 4)
|
|
DB 41, MHN2 ; (0, 2)
|
|
DB 41, MHP2, ?, ? ; (0, 6)
|
|
|
|
; 41: From 40.
|
|
DB 20, M0 ; ( 0,2/4/6) Next: Horz 1. ( 0,1:7)
|
|
DB 20, MVN1 ; (-1,2/4/6) Next: Horz 1. (-1,1:7)
|
|
DB 20, MVN2, ?, ? ; (-2,2/4/6) Next: Horz 1. (-2,1:7)
|
|
|
|
; 42: From 38A. Lower Left. Try 1 right and 1 up.
|
|
DB 0, M0 ; ( 0,0)
|
|
DB 0, MHP1 ; ( 0,1)
|
|
DB 0, MVN1, ?, ? ; (-1,0)
|
|
|
|
; 43: From 38B.
|
|
; From 44B center point would be (0,-2)
|
|
DB 20, M0 ; ( 0,2) Next: Horz 1 ( 0,1:3)
|
|
DB 20, MVN1 ; (-1,2) Next: Horz 1 (-1,1:3)
|
|
DB 15, MVN2, ?, ? ; (-2,2) Next: Horz 1, Vert 1 (-1:-3,1:3)
|
|
|
|
; 44: From 9A. Lower Right. Try 2 left and 2 up.
|
|
DB 45, M0 ; ( 0, 0)
|
|
DB 43, MHN2 ; ( 0,-2)
|
|
DB 34, MVN2, ?, ? ; (-2, 0)
|
|
|
|
; 45: From 44A. Lower Right. Try 1 left and 1 up.
|
|
DB 0, M0 ; ( 0, 0)
|
|
DB 0, MHN1 ; ( 0,-1)
|
|
DB 0, MVN1, ?, ? ; (-1, 0)
|
|
|
|
; 46: From 17A.
|
|
DB 0, M0 ; (0,0)
|
|
DB 0, MHP1 ; (0,1)
|
|
DB 0, MHP1, ?, ? ; (0,1)
|
|
|
|
; 47: From 10C.
|
|
; From 11 center point would be (0,4/0/-4)
|
|
; From 12C center point would be (0,-4)
|
|
DB 48, M0 ; (0,4)
|
|
DB 48, MHN2 ; (0,2)
|
|
DB 48, MHP2, ?, ? ; (0,6)
|
|
|
|
; 48 From 10B.
|
|
; From 47 center point would be (0,2/4/6)
|
|
; From 12B center point would be (0,-2)
|
|
DB 0, M0 ; (0,2)
|
|
DB 0, MHN1 ; (0,1)
|
|
DB 0, MHP1, ?, ? ; (0,3)
|
|
|
|
; 49 From 12A.
|
|
DB 0, M0 ; (0, 0)
|
|
DB 0, MHN1 ; (0,-1)
|
|
DB 0, MHN1, ?, ? ; (0,-1)
|
|
|
|
; 50: Interior Macroblock. Explore 7 up and 7 down.
|
|
DB 51, M0 ; ( 0,0)
|
|
DB 51, MVN7 ; (-7,0)
|
|
DB 51, MVP7, ?, ? ; ( 7,0)
|
|
|
|
; 51: Explore 7 left and 7 right.
|
|
DB 5, M0 ; (-7|0|7, 0)
|
|
DB 5, MHN7 ; (-7|0|7,-7)
|
|
DB 5, MHP7, ?, ? ; (-7|0|7, 7)
|
|
|
|
MulByNeg8 LABEL DWORD
|
|
|
|
CNT = 0
|
|
REPEAT 128
|
|
DD WeightedDiff+CNT
|
|
CNT = CNT - 8
|
|
ENDM
|
|
|
|
|
|
; The following treachery puts the numbers into byte 2 of each aligned DWORD.
|
|
DB 0, 0
|
|
DD 193 DUP (255)
|
|
DD 250,243,237,231,225,219,213,207,201,195,189,184,178,172,167,162,156
|
|
DD 151,146,141,135,130,126,121,116,111,107,102, 97, 93, 89, 84, 80, 76
|
|
DD 72, 68, 64, 61, 57, 53, 50, 46, 43, 40, 37, 34, 31, 28, 25, 22, 20
|
|
DD 18, 15, 13, 11, 9, 7, 6, 4, 3, 2, 1
|
|
DB 0, 0
|
|
WeightedDiff LABEL DWORD
|
|
DB 0, 0
|
|
DD 0, 0, 1, 2, 3, 4, 6, 7, 9, 11, 13, 15, 18
|
|
DD 20, 22, 25, 28, 31, 34, 37, 40, 43, 46, 50, 53, 57, 61, 64, 68, 72
|
|
DD 76, 80, 84, 89, 93, 97,102,107,111,116,121,126,130,135,141,146,151
|
|
DD 156,162,167,172,178,184,189,195,201,207,213,219,225,231,237,243,250
|
|
DD 191 DUP (255)
|
|
DB 255, 0
|
|
|
|
|
|
MotionOffsets DD 1*PITCH,0,?,?
|
|
|
|
RemnantOfCacheLine DB 8 DUP (?)
|
|
|
|
|
|
LocalStorage LABEL DWORD ; Local storage goes on the stack at addresses
|
|
; whose lower 12 bits match this address.
|
|
|
|
.CODE
|
|
|
|
ASSUME cs : FLAT
|
|
ASSUME ds : FLAT
|
|
ASSUME es : FLAT
|
|
ASSUME fs : FLAT
|
|
ASSUME gs : FLAT
|
|
ASSUME ss : FLAT
|
|
|
|
MOTIONESTIMATION proc C AMBAS: DWORD,
|
|
ATargFrmBase: DWORD,
|
|
APrevFrmBase: DWORD,
|
|
AFiltFrmBase: DWORD,
|
|
ADo15Search: DWORD,
|
|
ADoHalfPelEst: DWORD,
|
|
ADoBlkLvlVec: DWORD,
|
|
ADoSpatialFilt: DWORD,
|
|
AZeroVectorThresh: DWORD,
|
|
ANonZeroMVDiff: DWORD,
|
|
ABlockMVDiff: DWORD,
|
|
AEmptyThresh: DWORD,
|
|
AInterCodThresh: DWORD,
|
|
AIntraCodDiff: DWORD,
|
|
ASpatialFiltThresh: DWORD,
|
|
ASpatialFiltDiff: DWORD,
|
|
AIntraSWDTot: DWORD,
|
|
AIntraSWDBlks: DWORD,
|
|
AInterSWDTot: DWORD,
|
|
AInterSWDBlks: DWORD
|
|
|
|
LocalFrameSize = 128 + 168*4 + 32 ; 128 for locals; 168*4 for blocks; 32 for dummy block.
|
|
RegStoSize = 16
|
|
|
|
; Arguments:
|
|
|
|
MBlockActionStream_arg = RegStoSize + 4
|
|
TargetFrameBaseAddress_arg = RegStoSize + 8
|
|
PreviousFrameBaseAddress_arg = RegStoSize + 12
|
|
FilteredFrameBaseAddress_arg = RegStoSize + 16
|
|
DoRadius15Search_arg = RegStoSize + 20
|
|
DoHalfPelEstimation_arg = RegStoSize + 24
|
|
DoBlockLevelVectors_arg = RegStoSize + 28
|
|
DoSpatialFiltering_arg = RegStoSize + 32
|
|
ZeroVectorThreshold_arg = RegStoSize + 36
|
|
NonZeroMVDifferential_arg = RegStoSize + 40
|
|
BlockMVDifferential_arg = RegStoSize + 44
|
|
EmptyThreshold_arg = RegStoSize + 48
|
|
InterCodingThreshold_arg = RegStoSize + 52
|
|
IntraCodingDifferential_arg = RegStoSize + 56
|
|
SpatialFiltThreshold_arg = RegStoSize + 60
|
|
SpatialFiltDifferential_arg = RegStoSize + 64
|
|
IntraSWDTotal_arg = RegStoSize + 68
|
|
IntraSWDBlocks_arg = RegStoSize + 72
|
|
InterSWDTotal_arg = RegStoSize + 76
|
|
InterSWDBlocks_arg = RegStoSize + 80
|
|
EndOfArgList = RegStoSize + 84
|
|
|
|
; Locals (on local stack frame)
|
|
|
|
MBlockActionStream EQU [esp+ 0]
|
|
CurrSWDState EQU [esp+ 4]
|
|
MotionOffsetsCursor EQU CurrSWDState
|
|
HalfPelHorzSavings EQU CurrSWDState
|
|
VertFilterDoneAddr EQU CurrSWDState
|
|
IntraSWDTotal EQU [esp+ 8]
|
|
IntraSWDBlocks EQU [esp+ 12]
|
|
InterSWDTotal EQU [esp+ 16]
|
|
InterSWDBlocks EQU [esp+ 20]
|
|
|
|
MBCentralInterSWD EQU [esp+ 24]
|
|
MBRef1InterSWD EQU [esp+ 28]
|
|
MBRef2InterSWD EQU [esp+ 32]
|
|
MBCentralInterSWD_BLS EQU [esp+ 36]
|
|
MB0MVInterSWD EQU [esp+ 40]
|
|
MBAddrCentralPoint EQU [esp+ 44]
|
|
MBMotionVectors EQU [esp+ 48]
|
|
|
|
DoHalfPelEstimation EQU [esp+ 52]
|
|
DoBlockLevelVectors EQU [esp+ 56]
|
|
DoSpatialFiltering EQU [esp+ 60]
|
|
ZeroVectorThreshold EQU [esp+ 64]
|
|
NonZeroMVDifferential EQU [esp+ 68]
|
|
BlockMVDifferential EQU [esp+ 72]
|
|
EmptyThreshold EQU [esp+ 76]
|
|
InterCodingThreshold EQU [esp+ 80]
|
|
IntraCodingDifferential EQU [esp+ 84]
|
|
SpatialFiltThreshold EQU [esp+ 88]
|
|
SpatialFiltDifferential EQU [esp+ 92]
|
|
TargetMBAddr EQU [esp+ 96]
|
|
TargetFrameBaseAddress EQU [esp+ 100]
|
|
PreviousFrameBaseAddress EQU [esp+ 104]
|
|
TargToRef EQU [esp+ 108]
|
|
TargToSLF EQU [esp+ 112]
|
|
DoRadius15Search EQU [esp+ 116]
|
|
|
|
StashESP EQU [esp+ 120]
|
|
|
|
BlockLen EQU 168
|
|
Block1 EQU [esp+ 128+40] ; "128" is for locals. "40" is so offsets range from -40 to 124.
|
|
Block2 EQU Block1 + BlockLen
|
|
Block3 EQU Block2 + BlockLen
|
|
Block4 EQU Block3 + BlockLen
|
|
BlockN EQU Block4 + BlockLen
|
|
BlockNM1 EQU Block4
|
|
BlockNM2 EQU Block3
|
|
BlockNP1 EQU Block4 + BlockLen + BlockLen
|
|
DummyBlock EQU Block4 + BlockLen
|
|
|
|
|
|
Ref1Addr EQU -40
|
|
Ref2Addr EQU -36
|
|
AddrCentralPoint EQU -32
|
|
CentralInterSWD EQU -28
|
|
Ref1InterSWD EQU -24
|
|
Ref2InterSWD EQU -20
|
|
CentralInterSWD_BLS EQU -16 ; CentralInterSWD, when doing blk level search.
|
|
CentralInterSWD_SLF EQU -16 ; CentralInterSWD, when doing spatial filter.
|
|
HalfPelSavings EQU Ref2Addr
|
|
ZeroMVInterSWD EQU -12
|
|
BlkHMV EQU -8
|
|
BlkVMV EQU -7
|
|
BlkMVs EQU -8
|
|
AccumTargetPels EQU -4
|
|
|
|
; Offsets for Negated Quadrupled Target Pels:
|
|
N8T00 EQU 0
|
|
N8T04 EQU 4
|
|
N8T02 EQU 8
|
|
N8T06 EQU 12
|
|
N8T20 EQU 16
|
|
N8T24 EQU 20
|
|
N8T22 EQU 24
|
|
N8T26 EQU 28
|
|
N8T40 EQU 32
|
|
N8T44 EQU 36
|
|
N8T42 EQU 40
|
|
N8T46 EQU 44
|
|
N8T60 EQU 48
|
|
N8T64 EQU 52
|
|
N8T62 EQU 56
|
|
N8T66 EQU 60
|
|
N8T11 EQU 64
|
|
N8T15 EQU 68
|
|
N8T13 EQU 72
|
|
N8T17 EQU 76
|
|
N8T31 EQU 80
|
|
N8T35 EQU 84
|
|
N8T33 EQU 88
|
|
N8T37 EQU 92
|
|
N8T51 EQU 96
|
|
N8T55 EQU 100
|
|
N8T53 EQU 104
|
|
N8T57 EQU 108
|
|
N8T71 EQU 112
|
|
N8T75 EQU 116
|
|
N8T73 EQU 120
|
|
N8T77 EQU 124
|
|
|
|
push esi
|
|
push edi
|
|
push ebp
|
|
push ebx
|
|
|
|
; Adjust stack ptr so that local frame fits nicely in cache w.r.t. other data.
|
|
|
|
mov esi,esp
|
|
sub esp,000001000H
|
|
mov eax,[esp] ; Cause system to commit page.
|
|
sub esp,000001000H
|
|
and esp,0FFFFF000H
|
|
mov ebx,OFFSET LocalStorage+31
|
|
and ebx,000000FE0H
|
|
mov edx,PD [esi+MBlockActionStream_arg]
|
|
or esp,ebx
|
|
mov eax,PD [esi+TargetFrameBaseAddress_arg]
|
|
mov TargetFrameBaseAddress,eax
|
|
mov ebx,PD [esi+PreviousFrameBaseAddress_arg]
|
|
mov PreviousFrameBaseAddress,ebx
|
|
sub ebx,eax
|
|
mov ecx,PD [esi+FilteredFrameBaseAddress_arg]
|
|
sub ecx,eax
|
|
mov TargToRef,ebx
|
|
mov TargToSLF,ecx
|
|
mov eax,PD [esi+EmptyThreshold_arg]
|
|
mov EmptyThreshold,eax
|
|
mov eax,PD [esi+DoHalfPelEstimation_arg]
|
|
mov DoHalfPelEstimation,eax
|
|
mov eax,PD [esi+DoBlockLevelVectors_arg]
|
|
mov DoBlockLevelVectors,eax
|
|
mov eax,PD [esi+DoRadius15Search_arg]
|
|
mov DoRadius15Search,eax
|
|
mov eax,PD [esi+DoSpatialFiltering_arg]
|
|
mov DoSpatialFiltering,eax
|
|
mov eax,PD [esi+ZeroVectorThreshold_arg]
|
|
mov ZeroVectorThreshold,eax
|
|
mov eax,PD [esi+NonZeroMVDifferential_arg]
|
|
mov NonZeroMVDifferential,eax
|
|
mov eax,PD [esi+BlockMVDifferential_arg]
|
|
mov BlockMVDifferential,eax
|
|
mov eax,PD [esi+InterCodingThreshold_arg]
|
|
mov InterCodingThreshold,eax
|
|
mov eax,PD [esi+IntraCodingDifferential_arg]
|
|
mov IntraCodingDifferential,eax
|
|
mov eax,PD [esi+SpatialFiltThreshold_arg]
|
|
mov SpatialFiltThreshold,eax
|
|
mov eax,PD [esi+SpatialFiltDifferential_arg]
|
|
mov SpatialFiltDifferential,eax
|
|
xor ebx,ebx
|
|
mov IntraSWDBlocks,ebx
|
|
mov InterSWDBlocks,ebx
|
|
mov IntraSWDTotal,ebx
|
|
mov InterSWDTotal,ebx
|
|
mov Block1.BlkMVs,ebx
|
|
mov Block2.BlkMVs,ebx
|
|
mov Block3.BlkMVs,ebx
|
|
mov Block4.BlkMVs,ebx
|
|
mov DummyBlock.Ref1Addr,esp
|
|
mov DummyBlock.Ref2Addr,esp
|
|
mov StashESP,esi
|
|
jmp FirstMacroBlock
|
|
|
|
; Activity Details for this section of code (refer to flow diagram above):
|
|
;
|
|
; 1) To calculate an average value for the target match points of each
|
|
; block, we sum the 32 match points. The totals for each of the 4
|
|
; blocks is output seperately.
|
|
;
|
|
; 2) Define each prepared match point in the target macroblock as the
|
|
; real match point times negative 8, with the base address of the
|
|
; WeightedDiff lookup table added. I.e.
|
|
;
|
|
; for (i = 0; i < 16; i += 2)
|
|
; for (j = 0; j < 16; j += 2)
|
|
; N8T[i][j] = ( -8 * Target[i][j]) + ((U32) WeightedDiff);
|
|
;
|
|
; Both the multiply and the add of the WeightedDiff array base are
|
|
; effected by a table lookup into the array MulByNeg8.
|
|
;
|
|
; Then the SWD of a reference macroblock can be calculated as follows:
|
|
;
|
|
; SWD = 0;
|
|
; for each match point (i,j)
|
|
; SWD += *((U32 *) (N8T[i][j] + 8 * Ref[i][j]));
|
|
;
|
|
; In assembly, the fetch of WeightedDiff array element amounts to this:
|
|
;
|
|
; mov edi,DWORD PTR N8T[i][j] ; Fetch N8T[i][j]
|
|
; mov dl,BYTE PTR Ref[i][j] ; Fetch Ref[i][j]
|
|
; mov edi,DWORD PTR[edi+edx*8] ; Fetch WeithtedDiff of target & ref.
|
|
;
|
|
; 3) We calculate the 0-motion SWD, as described just above. We use 32
|
|
; match points per block, and write the result seperately for each
|
|
; block. The result is accumulated into the high half of ebp.
|
|
;
|
|
; 4) If the SWD for the 0-motion vector is below a threshold, we don't
|
|
; bother searching for other possibly better motion vectors. Presently,
|
|
; this threshold is set such that an average difference of less than
|
|
; three per match point causes the 0-motion vector to be accepted.
|
|
;
|
|
; Register usage for this section:
|
|
;
|
|
; Input of this section:
|
|
;
|
|
; edx -- MBlockActionStream
|
|
;
|
|
; Predominate usage for body of this section:
|
|
;
|
|
; esi -- Target block address.
|
|
; edi -- 0-motion reference block address.
|
|
; ebp[ 0:12] -- Accumulator for target pels.
|
|
; ebp[13:15] -- Loop control
|
|
; ebp[16:31] -- Accumulator for weighted diff between target and 0-MV ref.
|
|
; edx -- Address at which to store -8 times pels.
|
|
; ecx -- A reference pel.
|
|
; ebx -- A target pel.
|
|
; eax -- A target pel times -8; and a weighted difference.
|
|
;
|
|
; Expected Pentium (tm) microprocessor performance for section:
|
|
;
|
|
; Executed once per macroblock.
|
|
;
|
|
; 520 clocks for instruction execution
|
|
; 8 clocks for bank conflicts (64 dual mem ops with 1/8 chance of conflict)
|
|
; 80 clocks generously estimated for an average of 8 cache line fills for
|
|
; the target macroblock and 8 cache line fills for the reference area.
|
|
; ----
|
|
; 608 clocks total time for this section.
|
|
;
|
|
|
|
NextMacroBlock:
|
|
|
|
mov bl,[edx].CodedBlocks
|
|
add edx,SIZEOF T_MacroBlockActionDescr
|
|
and ebx,000000040H ; Check for end-of-stream
|
|
jne Done
|
|
|
|
FirstMacroBlock:
|
|
|
|
mov cl,[edx].CodedBlocks ; Init CBP for macroblock.
|
|
mov ebp,TargetFrameBaseAddress
|
|
mov bl,[edx].FirstMEState ; First State
|
|
mov eax,DoRadius15Search ; Searching 15 full pels out, or just 7?
|
|
neg al ; doing blk lvl => al=0, not => al=-1
|
|
or cl,03FH ; Indicate all 6 blocks are coded.
|
|
and al,bl
|
|
mov esi,[edx].BlkY1.BlkOffset ; Get address of next macroblock to do.
|
|
cmp al,5
|
|
jne @f
|
|
mov bl,50 ; Cause us to search +/- 15 if central
|
|
; ; block and willing to go that far.
|
|
@@:
|
|
mov edi,TargToRef
|
|
add esi,ebp
|
|
mov CurrSWDState,ebx ; Stash First State Number as current.
|
|
add edi,esi
|
|
xor ebp,ebp
|
|
mov TargetMBAddr,esi ; Stash address of target macroblock.
|
|
mov MBlockActionStream,edx ; Stash list ptr.
|
|
mov [edx].CodedBlocks,cl
|
|
mov ecx,INTER1MV ; Speculate INTER-coding, 1 motion vector.
|
|
mov [edx].BlockType,cl
|
|
lea edx,Block1
|
|
|
|
PrepMatchPointsNextBlock:
|
|
|
|
mov bl,PB [esi+6] ; 06A -- Target Pel 00.
|
|
add ebp,ebx ; 06B -- Accumulate target pels.
|
|
mov cl,PB [edi+6] ; 06C -- Reference Pel 00.
|
|
mov eax,MulByNeg8[ebx*4] ; 06D -- Target Pel 00 * -8.
|
|
mov bl,PB [esi+4] ; 04A
|
|
mov [edx].N8T06,eax ; 06E -- Store negated quadrupled Pel 00.
|
|
add ebp,ebx ; 04B
|
|
mov eax,PD [eax+ecx*8] ; 06F -- Weighted difference for Pel 00.
|
|
mov cl,PB [edi+4] ; 04C
|
|
add ebp,eax ; 06G -- Accumulate weighted difference.
|
|
mov eax,MulByNeg8[ebx*4] ; 04D
|
|
mov bl,PB [esi+2] ; 02A
|
|
mov [edx].N8T04,eax ; 04E
|
|
add ebp,ebx ; 02B
|
|
mov eax,PD [eax+ecx*8] ; 04F
|
|
mov cl,PB [edi+2] ; 02C
|
|
add ebp,eax ; 04G
|
|
mov eax,MulByNeg8[ebx*4] ; 02D
|
|
mov bl,PB [esi] ; 00A
|
|
mov [edx].N8T02,eax ; 02E
|
|
add ebp,ebx ; 00B
|
|
mov eax,PD [eax+ecx*8] ; 02F
|
|
add esi,PITCH+1
|
|
mov cl,PB [edi] ; 00C
|
|
add edi,PITCH+1
|
|
lea ebp,[ebp+eax+000004000H] ; 02G (plus loop control)
|
|
mov eax,MulByNeg8[ebx*4] ; 00D
|
|
mov bl,PB [esi+6] ; 17A
|
|
mov [edx].N8T00,eax ; 00E
|
|
add ebp,ebx ; 17B
|
|
mov eax,PD [eax+ecx*8] ; 00F
|
|
mov cl,PB [edi+6] ; 17C
|
|
add ebp,eax ; 00G
|
|
mov eax,MulByNeg8[ebx*4] ; 17D
|
|
mov bl,PB [esi+4] ; 15A
|
|
mov [edx].N8T17,eax ; 17E
|
|
add ebp,ebx ; 15B
|
|
mov eax,PD [eax+ecx*8] ; 17F
|
|
mov cl,PB [edi+4] ; 15C
|
|
add ebp,eax ; 17G
|
|
mov eax,MulByNeg8[ebx*4] ; 15D
|
|
mov bl,PB [esi+2] ; 13A
|
|
mov [edx].N8T15,eax ; 15E
|
|
add ebp,ebx ; 13B
|
|
mov eax,PD [eax+ecx*8] ; 15F
|
|
mov cl,PB [edi+2] ; 13C
|
|
add ebp,eax ; 15G
|
|
mov eax,MulByNeg8[ebx*4] ; 13D
|
|
mov bl,PB [esi] ; 11A
|
|
mov [edx].N8T13,eax ; 13E
|
|
add ebp,ebx ; 11B
|
|
mov eax,PD [eax+ecx*8] ; 13F
|
|
add esi,PITCH-1
|
|
mov cl,PB [edi] ; 11C
|
|
add edi,PITCH-1
|
|
add ebp,eax ; 13G
|
|
mov eax,MulByNeg8[ebx*4] ; 11D
|
|
mov bl,PB [esi+6] ; 26A
|
|
mov [edx].N8T11,eax ; 11E
|
|
add ebp,ebx ; 26B
|
|
mov eax,PD [eax+ecx*8] ; 11F
|
|
mov cl,PB [edi+6] ; 26C
|
|
add ebp,eax ; 11G
|
|
mov eax,MulByNeg8[ebx*4] ; 26D
|
|
mov bl,PB [esi+4] ; 24A
|
|
mov [edx].N8T26,eax ; 26E
|
|
add ebp,ebx ; 24B
|
|
mov eax,PD [eax+ecx*8] ; 26F
|
|
mov cl,PB [edi+4] ; 24C
|
|
add ebp,eax ; 26G
|
|
mov eax,MulByNeg8[ebx*4] ; 24D
|
|
mov bl,PB [esi+2] ; 22A
|
|
mov [edx].N8T24,eax ; 24E
|
|
add ebp,ebx ; 22B
|
|
mov eax,PD [eax+ecx*8] ; 24F
|
|
mov cl,PB [edi+2] ; 22C
|
|
add ebp,eax ; 24G
|
|
mov eax,MulByNeg8[ebx*4] ; 22D
|
|
mov bl,PB [esi] ; 20A
|
|
mov [edx].N8T22,eax ; 22E
|
|
add ebp,ebx ; 20B
|
|
mov eax,PD [eax+ecx*8] ; 22F
|
|
add esi,PITCH+1
|
|
mov cl,PB [edi] ; 20C
|
|
add edi,PITCH+1
|
|
add ebp,eax ; 22G
|
|
mov eax,MulByNeg8[ebx*4] ; 20D
|
|
mov bl,PB [esi+6] ; 37A
|
|
mov [edx].N8T20,eax ; 20E
|
|
add ebp,ebx ; 37B
|
|
mov eax,PD [eax+ecx*8] ; 20F
|
|
mov cl,PB [edi+6] ; 37C
|
|
add ebp,eax ; 20G
|
|
mov eax,MulByNeg8[ebx*4] ; 37D
|
|
mov bl,PB [esi+4] ; 35A
|
|
mov [edx].N8T37,eax ; 37E
|
|
add ebp,ebx ; 35B
|
|
mov eax,PD [eax+ecx*8] ; 37F
|
|
mov cl,PB [edi+4] ; 35C
|
|
add ebp,eax ; 37G
|
|
mov eax,MulByNeg8[ebx*4] ; 35D
|
|
mov bl,PB [esi+2] ; 33A
|
|
mov [edx].N8T35,eax ; 35E
|
|
add ebp,ebx ; 33B
|
|
mov eax,PD [eax+ecx*8] ; 35F
|
|
mov cl,PB [edi+2] ; 33C
|
|
add ebp,eax ; 35G
|
|
mov eax,MulByNeg8[ebx*4] ; 33D
|
|
mov bl,PB [esi] ; 31A
|
|
mov [edx].N8T33,eax ; 33E
|
|
add ebp,ebx ; 31B
|
|
mov eax,PD [eax+ecx*8] ; 33F
|
|
add esi,PITCH-1
|
|
mov cl,PB [edi] ; 31C
|
|
add edi,PITCH-1
|
|
add ebp,eax ; 33G
|
|
mov eax,MulByNeg8[ebx*4] ; 31D
|
|
mov bl,PB [esi+6] ; 46A
|
|
mov [edx].N8T31,eax ; 31E
|
|
add ebp,ebx ; 46B
|
|
mov eax,PD [eax+ecx*8] ; 31F
|
|
mov cl,PB [edi+6] ; 46C
|
|
add ebp,eax ; 31G
|
|
mov eax,MulByNeg8[ebx*4] ; 46D
|
|
mov bl,PB [esi+4] ; 44A
|
|
mov [edx].N8T46,eax ; 46E
|
|
add ebp,ebx ; 44B
|
|
mov eax,PD [eax+ecx*8] ; 46F
|
|
mov cl,PB [edi+4] ; 44C
|
|
add ebp,eax ; 46G
|
|
mov eax,MulByNeg8[ebx*4] ; 44D
|
|
mov bl,PB [esi+2] ; 42A
|
|
mov [edx].N8T44,eax ; 44E
|
|
add ebp,ebx ; 42B
|
|
mov eax,PD [eax+ecx*8] ; 44F
|
|
mov cl,PB [edi+2] ; 42C
|
|
add ebp,eax ; 44G
|
|
mov eax,MulByNeg8[ebx*4] ; 42D
|
|
mov bl,PB [esi] ; 40A
|
|
mov [edx].N8T42,eax ; 42E
|
|
add ebp,ebx ; 40B
|
|
mov eax,PD [eax+ecx*8] ; 42F
|
|
add esi,PITCH+1
|
|
mov cl,PB [edi] ; 40C
|
|
add edi,PITCH+1
|
|
add ebp,eax ; 42G
|
|
mov eax,MulByNeg8[ebx*4] ; 40D
|
|
mov bl,PB [esi+6] ; 57A
|
|
mov [edx].N8T40,eax ; 40E
|
|
add ebp,ebx ; 57B
|
|
mov eax,PD [eax+ecx*8] ; 40F
|
|
mov cl,PB [edi+6] ; 57C
|
|
add ebp,eax ; 40G
|
|
mov eax,MulByNeg8[ebx*4] ; 57D
|
|
mov bl,PB [esi+4] ; 55A
|
|
mov [edx].N8T57,eax ; 57E
|
|
add ebp,ebx ; 55B
|
|
mov eax,PD [eax+ecx*8] ; 57F
|
|
mov cl,PB [edi+4] ; 55C
|
|
add ebp,eax ; 57G
|
|
mov eax,MulByNeg8[ebx*4] ; 55D
|
|
mov bl,PB [esi+2] ; 53A
|
|
mov [edx].N8T55,eax ; 55E
|
|
add ebp,ebx ; 53B
|
|
mov eax,PD [eax+ecx*8] ; 55F
|
|
mov cl,PB [edi+2] ; 53C
|
|
add ebp,eax ; 55G
|
|
mov eax,MulByNeg8[ebx*4] ; 53D
|
|
mov bl,PB [esi] ; 51A
|
|
mov [edx].N8T53,eax ; 53E
|
|
add ebp,ebx ; 51B
|
|
mov eax,PD [eax+ecx*8] ; 53F
|
|
add esi,PITCH-1
|
|
mov cl,PB [edi] ; 51C
|
|
add edi,PITCH-1
|
|
add ebp,eax ; 53G
|
|
mov eax,MulByNeg8[ebx*4] ; 51D
|
|
mov bl,PB [esi+6] ; 66A
|
|
mov [edx].N8T51,eax ; 51E
|
|
add ebp,ebx ; 66B
|
|
mov eax,PD [eax+ecx*8] ; 51F
|
|
mov cl,PB [edi+6] ; 66C
|
|
add ebp,eax ; 51G
|
|
mov eax,MulByNeg8[ebx*4] ; 66D
|
|
mov bl,PB [esi+4] ; 64A
|
|
mov [edx].N8T66,eax ; 66E
|
|
add ebp,ebx ; 64B
|
|
mov eax,PD [eax+ecx*8] ; 66F
|
|
mov cl,PB [edi+4] ; 64C
|
|
add ebp,eax ; 66G
|
|
mov eax,MulByNeg8[ebx*4] ; 64D
|
|
mov bl,PB [esi+2] ; 62A
|
|
mov [edx].N8T64,eax ; 64E
|
|
add ebp,ebx ; 62B
|
|
mov eax,PD [eax+ecx*8] ; 64F
|
|
mov cl,PB [edi+2] ; 62C
|
|
add ebp,eax ; 64G
|
|
mov eax,MulByNeg8[ebx*4] ; 62D
|
|
mov bl,PB [esi] ; 60A
|
|
mov [edx].N8T62,eax ; 62E
|
|
add ebp,ebx ; 60B
|
|
mov eax,PD [eax+ecx*8] ; 62F
|
|
add esi,PITCH+1
|
|
mov cl,PB [edi] ; 60C
|
|
add edi,PITCH+1
|
|
add ebp,eax ; 62G
|
|
mov eax,MulByNeg8[ebx*4] ; 60D
|
|
mov bl,PB [esi+6] ; 77A
|
|
mov [edx].N8T60,eax ; 60E
|
|
add ebp,ebx ; 77B
|
|
mov eax,PD [eax+ecx*8] ; 60F
|
|
mov cl,PB [edi+6] ; 77C
|
|
add ebp,eax ; 60G
|
|
mov eax,MulByNeg8[ebx*4] ; 77D
|
|
mov bl,PB [esi+4] ; 75A
|
|
mov [edx].N8T77,eax ; 77E
|
|
add ebp,ebx ; 75B
|
|
mov eax,PD [eax+ecx*8] ; 77F
|
|
mov cl,PB [edi+4] ; 75C
|
|
add ebp,eax ; 77G
|
|
mov eax,MulByNeg8[ebx*4] ; 75D
|
|
mov bl,PB [esi+2] ; 73A
|
|
mov [edx].N8T75,eax ; 75E
|
|
add ebp,ebx ; 73B
|
|
mov eax,PD [eax+ecx*8] ; 75F
|
|
mov cl,PB [edi+2] ; 73C
|
|
add ebp,eax ; 75G
|
|
mov eax,MulByNeg8[ebx*4] ; 73D
|
|
mov bl,PB [esi] ; 71A
|
|
mov [edx].N8T73,eax ; 73E
|
|
add ebp,ebx ; 71B
|
|
mov eax,PD [eax+ecx*8] ; 73F
|
|
mov cl,PB [edi] ; 71C
|
|
add esi,PITCH-1-PITCH*8+8
|
|
add edi,PITCH-1-PITCH*8+8
|
|
add ebp,eax ; 73G
|
|
mov eax,MulByNeg8[ebx*4] ; 71D
|
|
mov ebx,ebp
|
|
mov [edx].N8T71,eax ; 71E
|
|
and ebx,000001FFFH ; Extract sum of target pels.
|
|
add edx,BlockLen ; Move to next output block
|
|
mov eax,PD [eax+ecx*8] ; 71F
|
|
mov [edx-BlockLen].AccumTargetPels,ebx ; Store acc of target pels for block.
|
|
add eax,ebp ; 71G
|
|
and ebp,000006000H ; Extract loop control
|
|
shr eax,16 ; Extract SWD; CF == 1 every second iter.
|
|
mov ebx,ecx
|
|
mov [edx-BlockLen].CentralInterSWD,eax ; Store SWD for 0-motion vector.
|
|
jnc PrepMatchPointsNextBlock
|
|
|
|
add esi,PITCH*8-16 ; Advance to block 3, or off end.
|
|
add edi,PITCH*8-16 ; Advance to block 3, or off end.
|
|
xor ebp,000002000H
|
|
jne PrepMatchPointsNextBlock ; Jump if advancing to block 3.
|
|
|
|
mov ebx,CurrSWDState ; Fetch First State Number for engine.
|
|
mov edi,Block1.CentralInterSWD
|
|
test bl,bl ; Test for INTRA-BY-DECREE.
|
|
je IntraByDecree
|
|
|
|
add eax,Block2.CentralInterSWD
|
|
add edi,Block3.CentralInterSWD
|
|
add eax,edi
|
|
mov edx,ZeroVectorThreshold
|
|
cmp eax,edx ; Compare 0-MV against ZeroVectorThresh
|
|
jle BelowZeroThresh ; Jump if 0-MV is good enough.
|
|
|
|
mov cl,PB SWDState[ebx*8+3] ; cl == Index of inc to apply to central
|
|
; ; point to get to ref1.
|
|
mov bl,PB SWDState[ebx*8+5] ; bl == Same as cl, but for ref2.
|
|
mov edx,TargToRef
|
|
mov MB0MVInterSWD,eax ; Stash SWD for zero motion vector.
|
|
mov edi,PD OffsetToRef[ebx] ; Get inc to apply to ctr to get to ref2.
|
|
mov ebp,PD OffsetToRef[ecx] ; Get inc to apply to ctr to get to ref1.
|
|
lea esi,[esi+edx-PITCH*16] ; Calculate address of 0-MV ref block.
|
|
;
|
|
mov MBAddrCentralPoint,esi ; Set central point to 0-MV.
|
|
mov MBCentralInterSWD,eax
|
|
mov eax,Block1.CentralInterSWD ; Stash Zero MV SWD, in case we decide
|
|
mov edx,Block2.CentralInterSWD ; the best non-zero MV isn't enough
|
|
mov Block1.ZeroMVInterSWD,eax ; better than the zero MV.
|
|
mov Block2.ZeroMVInterSWD,edx
|
|
mov eax,Block3.CentralInterSWD
|
|
mov edx,Block4.CentralInterSWD
|
|
mov Block3.ZeroMVInterSWD,eax
|
|
mov Block4.ZeroMVInterSWD,edx
|
|
|
|
; Activity Details for this section of code (refer to flow diagram above):
|
|
;
|
|
; 5) The SWD for two different reference macroblocks is calculated; ref1
|
|
; into the high order 16 bits of ebp, and ref2 into the low 16 bits.
|
|
; This is performed for each iteration of the state engine. A normal,
|
|
; internal macroblock will perform 6 iterations, searching +/- 4
|
|
; horizontally, then +/- 4 vertically, then +/- 2 horizontally, then
|
|
; +/- 2 vertically, then +/- 1 horizontally, then +/- 1 vertically.
|
|
;
|
|
; Register usage for this section:
|
|
;
|
|
; Input:
|
|
;
|
|
; esi -- Addr of 0-motion macroblock in ref frame.
|
|
; ebp -- Increment to apply to get to first ref1 macroblock.
|
|
; edi -- Increment to apply to get to first ref2 macroblock.
|
|
; ebx, ecx -- High order 24 bits are zero.
|
|
;
|
|
; Output:
|
|
;
|
|
; ebp -- SWD for the best-fit reference macroblock.
|
|
; ebx -- Index of increment to apply to get to best-fit reference MB.
|
|
; MBAddrCentralPoint -- the best-fit of the previous iteration; it is the
|
|
; value to which OffsetToRef[ebx] must be added.
|
|
;
|
|
;
|
|
; Expected performance for SWDLoop code:
|
|
;
|
|
; Execution frequency: Six times per block for which motion analysis is done
|
|
; beyond the 0-motion vector.
|
|
;
|
|
; Pentium (tm) microprocessor times per six iterations:
|
|
; 180 clocks for instruction execution setup to DoSWDLoop
|
|
; 2520 clocks for DoSWDLoop procedure, instruction execution.
|
|
; 192 clocks for bank conflicts in DoSWDLoop
|
|
; 30 clocks generously estimated for an average of 6 cache line fills for
|
|
; the reference area.
|
|
; ----
|
|
; 2922 clocks total time for this section.
|
|
|
|
MBFullPelMotionSearchLoop:
|
|
|
|
lea edi,[esi+edi+PITCH*8+8]
|
|
lea esi,[esi+ebp+PITCH*8+8]
|
|
mov Block4.Ref1Addr,esi
|
|
mov Block4.Ref2Addr,edi
|
|
sub esi,8
|
|
sub edi,8
|
|
mov Block3.Ref1Addr,esi
|
|
mov Block3.Ref2Addr,edi
|
|
sub esi,PITCH*8-8
|
|
sub edi,PITCH*8-8
|
|
mov Block2.Ref1Addr,esi
|
|
mov Block2.Ref2Addr,edi
|
|
sub esi,8
|
|
sub edi,8
|
|
mov Block1.Ref1Addr,esi
|
|
mov Block1.Ref2Addr,edi
|
|
|
|
; esi -- Points to ref1
|
|
; edi -- Points to ref2
|
|
; ecx -- Upper 24 bits zero
|
|
; ebx -- Upper 24 bits zero
|
|
|
|
call DoSWDLoop
|
|
|
|
; ebp -- Ref1 SWD for block 4
|
|
; edx -- Ref2 SWD for block 4
|
|
; ecx -- Upper 24 bits zero
|
|
; ebx -- Upper 24 bits zero
|
|
|
|
mov esi,MBCentralInterSWD ; Get SWD for central point of these 3 refs
|
|
xor eax,eax
|
|
add ebp,Block1.Ref1InterSWD
|
|
add edx,Block1.Ref2InterSWD
|
|
add ebp,Block2.Ref1InterSWD
|
|
add edx,Block2.Ref2InterSWD
|
|
add ebp,Block3.Ref1InterSWD
|
|
add edx,Block3.Ref2InterSWD
|
|
|
|
cmp ebp,edx ; Carry flag == 1 iff ref1 SWD < ref2 SWD.
|
|
mov edi,CurrSWDState ; Restore current state number.
|
|
adc eax,eax ; eax == 1 iff ref1 SWD < ref2 SWD.
|
|
cmp ebp,esi ; Carry flag == 1 iff ref1 SWD < central SWD.
|
|
adc eax,eax ;
|
|
cmp edx,esi ; Carry flag == 1 iff ref2 SWD < central SWD.
|
|
adc eax,eax ; 0 --> Pick central point.
|
|
; ; 1 --> Pick ref2.
|
|
; ; 2 --> Not possible.
|
|
; ; 3 --> Pick ref2.
|
|
; ; 4 --> Pick central point.
|
|
; ; 5 --> Not possible.
|
|
; ; 6 --> Pick ref1.
|
|
; ; 7 --> Pick ref1.
|
|
mov MBRef2InterSWD,edx
|
|
mov MBRef1InterSWD,ebp
|
|
xor edx,edx
|
|
mov dl,PB PickPoint[eax] ; dl == 0: central pt; 2: ref1; 4: ref2
|
|
mov esi,MBAddrCentralPoint ; Reload address of central ref block.
|
|
;
|
|
;
|
|
mov ebp,Block1.CentralInterSWD[edx*2] ; Get SWD for each block, picked pt.
|
|
mov al,PB SWDState[edx+edi*8+1] ; al == Index of inc to apply to old central
|
|
; ; point to get new central point.
|
|
mov Block1.CentralInterSWD,ebp ; Stash SWD for new central point.
|
|
mov ebp,Block2.CentralInterSWD[edx*2]
|
|
mov Block2.CentralInterSWD,ebp
|
|
mov ebp,Block3.CentralInterSWD[edx*2]
|
|
mov Block3.CentralInterSWD,ebp
|
|
mov ebp,Block4.CentralInterSWD[edx*2]
|
|
mov Block4.CentralInterSWD,ebp
|
|
mov ebp,MBCentralInterSWD[edx*2]; Get the SWD for the point we picked.
|
|
mov dl,PB SWDState[edx+edi*8] ; dl == New state number.
|
|
mov MBCentralInterSWD,ebp ; Stash SWD for new central point.
|
|
mov edi,PD OffsetToRef[eax] ; Get inc to apply to get to new central pt.
|
|
mov CurrSWDState,edx ; Stash current state number.
|
|
mov bl,PB SWDState[edx*8+3] ; bl == Index of inc to apply to central
|
|
; ; point to get to next ref1.
|
|
mov cl,PB SWDState[edx*8+5] ; cl == Same as bl, but for ref2.
|
|
add esi,edi ; Move to new central point.
|
|
test dl,dl
|
|
mov ebp,PD OffsetToRef[ebx] ; Get inc to apply to ctr to get to ref1.
|
|
mov edi,PD OffsetToRef[ecx] ; Get inc to apply to ctr to get to ref2.
|
|
mov MBAddrCentralPoint,esi ; Stash address of new central ref block.
|
|
jne MBFullPelMotionSearchLoop ; Jump if not done searching.
|
|
|
|
;Done searching for integer motion vector for full macroblock
|
|
|
|
IF PITCH-384
|
|
*** Error: The magic leaks out of the following code if PITCH isn't 384.
|
|
ENDIF
|
|
mov ecx,TargToRef ; To Linearize MV for winning ref blk.
|
|
mov eax,esi ; Copy of ref macroblock addr.
|
|
sub eax,ecx ; To Linearize MV for winning ref blk.
|
|
mov ecx,TargetMBAddr
|
|
sub eax,ecx
|
|
mov edx,MBlockActionStream ; Fetch list ptr.
|
|
mov ebx,eax
|
|
mov ebp,DoHalfPelEstimation ; Are we doing half pel motion estimation?
|
|
shl eax,25 ; Extract horz motion component.
|
|
mov [edx].BlkY1.PastRef,esi ; Save address of reference MB selected.
|
|
sar ebx,8 ; Hi 24 bits of linearized MV lookup vert MV.
|
|
mov ecx,MBCentralInterSWD
|
|
sar eax,24 ; Finish extract horz motion component.
|
|
test ebp,ebp
|
|
mov bl,PB UnlinearizedVertMV[ebx] ; Look up proper vert motion vector.
|
|
mov [edx].BlkY1.PHMV,al ; Save winning horz motion vector.
|
|
mov [edx].BlkY1.PVMV,bl ; Save winning vert motion vector.
|
|
|
|
IFDEF H261
|
|
ELSE
|
|
je SkipHalfPelSearch_1MV
|
|
|
|
;Search for half pel motion vector for full macroblock.
|
|
|
|
mov Block1.AddrCentralPoint,esi
|
|
lea ebp,[esi+8]
|
|
mov Block2.AddrCentralPoint,ebp
|
|
add ebp,PITCH*8-8
|
|
mov Block3.AddrCentralPoint,ebp
|
|
xor ecx,ecx
|
|
mov cl,[edx].FirstMEState
|
|
add ebp,8
|
|
mov edi,esi
|
|
mov Block4.AddrCentralPoint,ebp
|
|
mov ebp,InitHalfPelSearchHorz[ecx*4-4]
|
|
|
|
; ebp -- Initialized to 0, except when can't search off left or right edge.
|
|
; edi -- Ref addr for block 1. Ref1 is .5 pel to left. Ref2 is .5 to right.
|
|
|
|
call DoSWDHalfPelHorzLoop
|
|
|
|
; ebp, ebx -- Zero
|
|
; ecx -- Ref1 SWD for block 4
|
|
; edx -- Ref2 SWD for block 4
|
|
|
|
mov esi,MBlockActionStream
|
|
xor eax,eax ; Keep pairing happy
|
|
add ecx,Block1.Ref1InterSWD
|
|
add edx,Block1.Ref2InterSWD
|
|
add ecx,Block2.Ref1InterSWD
|
|
add edx,Block2.Ref2InterSWD
|
|
add ecx,Block3.Ref1InterSWD
|
|
add edx,Block3.Ref2InterSWD
|
|
mov bl,[esi].FirstMEState
|
|
mov edi,Block1.AddrCentralPoint
|
|
cmp ecx,edx
|
|
jl MBHorz_Ref1LTRef2
|
|
|
|
mov ebp,MBCentralInterSWD
|
|
mov esi,MBlockActionStream
|
|
sub ebp,edx
|
|
jle MBHorz_CenterBest
|
|
|
|
mov al,[esi].BlkY1.PHMV ; Half pel to the right is best.
|
|
mov ecx,Block1.Ref2InterSWD
|
|
mov Block1.CentralInterSWD_BLS,ecx
|
|
mov ecx,Block3.Ref2InterSWD
|
|
mov Block3.CentralInterSWD_BLS,ecx
|
|
mov ecx,Block2.Ref2InterSWD
|
|
mov Block2.CentralInterSWD_BLS,ecx
|
|
mov ecx,Block4.Ref2InterSWD
|
|
mov Block4.CentralInterSWD_BLS,ecx
|
|
inc al
|
|
mov [esi].BlkY1.PHMV,al
|
|
jmp MBHorz_Done
|
|
|
|
MBHorz_CenterBest:
|
|
|
|
mov ecx,Block1.CentralInterSWD
|
|
xor ebp,ebp
|
|
mov Block1.CentralInterSWD_BLS,ecx
|
|
mov ecx,Block2.CentralInterSWD
|
|
mov Block2.CentralInterSWD_BLS,ecx
|
|
mov ecx,Block3.CentralInterSWD
|
|
mov Block3.CentralInterSWD_BLS,ecx
|
|
mov ecx,Block4.CentralInterSWD
|
|
mov Block4.CentralInterSWD_BLS,ecx
|
|
jmp MBHorz_Done
|
|
|
|
MBHorz_Ref1LTRef2:
|
|
|
|
mov ebp,MBCentralInterSWD
|
|
mov esi,MBlockActionStream
|
|
sub ebp,ecx
|
|
jle MBHorz_CenterBest
|
|
|
|
mov al,[esi].BlkY1.PHMV ; Half pel to the left is best.
|
|
mov edx,[esi].BlkY1.PastRef
|
|
dec al
|
|
mov ecx,Block1.Ref1InterSWD
|
|
mov Block1.CentralInterSWD_BLS,ecx
|
|
mov ecx,Block3.Ref1InterSWD
|
|
mov Block3.CentralInterSWD_BLS,ecx
|
|
mov ecx,Block2.Ref1InterSWD
|
|
mov Block2.CentralInterSWD_BLS,ecx
|
|
mov ecx,Block4.Ref1InterSWD
|
|
mov Block4.CentralInterSWD_BLS,ecx
|
|
dec edx
|
|
mov [esi].BlkY1.PHMV,al
|
|
mov [esi].BlkY1.PastRef,edx
|
|
|
|
MBHorz_Done:
|
|
|
|
mov HalfPelHorzSavings,ebp
|
|
mov ebp,InitHalfPelSearchVert[ebx*4-4]
|
|
|
|
; ebp -- Initialized to 0, except when can't search off left or right edge.
|
|
; edi -- Ref addr for block 1. Ref1 is .5 pel above. Ref2 is .5 below.
|
|
|
|
call DoSWDHalfPelVertLoop
|
|
|
|
; ebp, ebx -- Zero
|
|
; ecx -- Ref1 SWD for block 4
|
|
; edx -- Ref2 SWD for block 4
|
|
|
|
add ecx,Block1.Ref1InterSWD
|
|
add edx,Block1.Ref2InterSWD
|
|
add ecx,Block2.Ref1InterSWD
|
|
add edx,Block2.Ref2InterSWD
|
|
add ecx,Block3.Ref1InterSWD
|
|
add edx,Block3.Ref2InterSWD
|
|
cmp ecx,edx
|
|
jl MBVert_Ref1LTRef2
|
|
|
|
mov ebp,MBCentralInterSWD
|
|
mov esi,MBlockActionStream
|
|
sub ebp,edx
|
|
jle MBVert_CenterBest
|
|
|
|
mov ecx,Block1.CentralInterSWD
|
|
mov edx,Block1.Ref2InterSWD
|
|
sub ecx,edx
|
|
mov edx,Block1.CentralInterSWD_BLS
|
|
sub edx,ecx
|
|
mov al,[esi].BlkY1.PVMV ; Half pel below is best.
|
|
mov Block1.CentralInterSWD,edx
|
|
inc al
|
|
mov ecx,Block3.CentralInterSWD
|
|
mov edx,Block3.Ref2InterSWD
|
|
sub ecx,edx
|
|
mov edx,Block3.CentralInterSWD_BLS
|
|
sub edx,ecx
|
|
mov ecx,Block2.CentralInterSWD
|
|
mov Block3.CentralInterSWD,edx
|
|
mov edx,Block2.Ref2InterSWD
|
|
sub ecx,edx
|
|
mov edx,Block2.CentralInterSWD_BLS
|
|
sub edx,ecx
|
|
mov ecx,Block4.CentralInterSWD
|
|
mov Block2.CentralInterSWD,edx
|
|
mov edx,Block4.Ref2InterSWD
|
|
sub ecx,edx
|
|
mov edx,Block4.CentralInterSWD_BLS
|
|
sub edx,ecx
|
|
mov [esi].BlkY1.PVMV,al
|
|
mov Block4.CentralInterSWD,edx
|
|
jmp MBVert_Done
|
|
|
|
MBVert_CenterBest:
|
|
|
|
mov ecx,Block1.CentralInterSWD_BLS
|
|
xor ebp,ebp
|
|
mov Block1.CentralInterSWD,ecx
|
|
mov ecx,Block2.CentralInterSWD_BLS
|
|
mov Block2.CentralInterSWD,ecx
|
|
mov ecx,Block3.CentralInterSWD_BLS
|
|
mov Block3.CentralInterSWD,ecx
|
|
mov ecx,Block4.CentralInterSWD_BLS
|
|
mov Block4.CentralInterSWD,ecx
|
|
jmp MBVert_Done
|
|
|
|
MBVert_Ref1LTRef2:
|
|
|
|
mov ebp,MBCentralInterSWD
|
|
mov esi,MBlockActionStream
|
|
sub ebp,ecx
|
|
jle MBVert_CenterBest
|
|
|
|
mov ecx,Block1.CentralInterSWD
|
|
mov edx,Block1.Ref1InterSWD
|
|
sub ecx,edx
|
|
mov edx,Block1.CentralInterSWD_BLS
|
|
sub edx,ecx
|
|
mov al,[esi].BlkY1.PVMV ; Half pel above is best.
|
|
mov Block1.CentralInterSWD,edx
|
|
dec al
|
|
mov ecx,Block3.CentralInterSWD
|
|
mov edx,Block3.Ref1InterSWD
|
|
sub ecx,edx
|
|
mov edx,Block3.CentralInterSWD_BLS
|
|
sub edx,ecx
|
|
mov ecx,Block2.CentralInterSWD
|
|
mov Block3.CentralInterSWD,edx
|
|
mov edx,Block2.Ref1InterSWD
|
|
sub ecx,edx
|
|
mov edx,Block2.CentralInterSWD_BLS
|
|
sub edx,ecx
|
|
mov ecx,Block4.CentralInterSWD
|
|
mov Block2.CentralInterSWD,edx
|
|
mov edx,Block4.Ref1InterSWD
|
|
sub ecx,edx
|
|
mov edx,Block4.CentralInterSWD_BLS
|
|
sub edx,ecx
|
|
mov ecx,[esi].BlkY1.PastRef
|
|
mov Block4.CentralInterSWD,edx
|
|
sub ecx,PITCH
|
|
mov [esi].BlkY1.PVMV,al
|
|
mov [esi].BlkY1.PastRef,ecx
|
|
|
|
MBVert_Done:
|
|
|
|
mov ecx,HalfPelHorzSavings
|
|
mov edx,esi
|
|
add ebp,ecx ; Savings for horz and vert half pel motion.
|
|
mov ecx,MBCentralInterSWD ; Reload SWD for new central point.
|
|
sub ecx,ebp ; Approx SWD for prescribed half pel motion.
|
|
mov esi,[edx].BlkY1.PastRef ; Reload address of reference MB selected.
|
|
mov MBCentralInterSWD,ecx
|
|
|
|
SkipHalfPelSearch_1MV:
|
|
|
|
ENDIF ; H263
|
|
|
|
mov ebp,[edx].BlkY1.MVs ; Load Motion Vectors
|
|
add esi,8
|
|
mov [edx].BlkY2.PastRef,esi
|
|
mov [edx].BlkY2.MVs,ebp
|
|
lea edi,[esi+PITCH*8]
|
|
add esi,PITCH*8-8
|
|
mov [edx].BlkY3.PastRef,esi
|
|
mov [edx].BlkY3.MVs,ebp
|
|
mov [edx].BlkY4.PastRef,edi
|
|
mov [edx].BlkY4.MVs,ebp
|
|
IFDEF H261
|
|
ELSE ; H263
|
|
mov MBMotionVectors,ebp ; Stash macroblock level motion vectors.
|
|
mov ebp,640 ; ??? BlockMVDifferential
|
|
cmp ecx,ebp
|
|
jl NoBlockMotionVectors
|
|
|
|
mov ecx,DoBlockLevelVectors
|
|
test ecx,ecx ; Are we doing block level motion vectors?
|
|
je NoBlockMotionVectors
|
|
|
|
; Activity Details for this section of code (refer to flow diagram above):
|
|
;
|
|
; The following search is done similarly to the searches done above, except
|
|
; these are block searches, instead of macroblock searches.
|
|
;
|
|
; Expected performance:
|
|
;
|
|
; Execution frequency: Six times per block for which motion analysis is done
|
|
; beyond the 0-motion vector.
|
|
;
|
|
; Pentium (tm) microprocessor times per six iterations:
|
|
; 180 clocks for instruction execution setup to DoSWDLoop
|
|
; 2520 clocks for DoSWDLoop procedure, instruction execution.
|
|
; 192 clocks for bank conflicts in DoSWDLoop
|
|
; 30 clocks generously estimated for an average of 6 cache line fills for
|
|
; the reference area.
|
|
; ----
|
|
; 2922 clocks total time for this section.
|
|
|
|
;
|
|
; Set up for the "BlkFullPelSWDLoop_4blks" loop to follow.
|
|
; - Store the SWD values for blocks 4, 3, 2, 1.
|
|
; - Compute and store the address of the central reference
|
|
; point for blocks 1, 2, 3, 4.
|
|
; - Compute and store the first address for ref 1 (minus 4
|
|
; pels horizontally) and ref 2 (plus 4 pels horizontally)
|
|
; for blocks 4, 3, 2, 1 (in that order).
|
|
; - Initialize MotionOffsetsCursor
|
|
; - On exit:
|
|
; esi = ref 1 address for block 1
|
|
; edi = ref 2 address for block 1
|
|
;
|
|
mov esi,Block4.CentralInterSWD
|
|
mov edi,Block3.CentralInterSWD
|
|
mov Block4.CentralInterSWD_BLS,esi
|
|
mov Block3.CentralInterSWD_BLS,edi
|
|
mov esi,Block2.CentralInterSWD
|
|
mov edi,Block1.CentralInterSWD
|
|
mov Block2.CentralInterSWD_BLS,esi
|
|
mov eax,MBAddrCentralPoint ; Reload addr of central, integer pel ref MB.
|
|
mov Block1.CentralInterSWD_BLS,edi
|
|
mov Block1.AddrCentralPoint,eax
|
|
lea edi,[eax+PITCH*8+8+1]
|
|
lea esi,[eax+PITCH*8+8-1]
|
|
mov Block4.Ref1Addr,esi
|
|
mov Block4.Ref2Addr,edi
|
|
sub esi,8
|
|
add eax,8
|
|
mov Block2.AddrCentralPoint,eax
|
|
add eax,PITCH*8-8
|
|
mov Block3.AddrCentralPoint,eax
|
|
add eax,8
|
|
mov Block4.AddrCentralPoint,eax
|
|
sub edi,8
|
|
mov Block3.Ref1Addr,esi
|
|
mov Block3.Ref2Addr,edi
|
|
sub esi,PITCH*8-8
|
|
sub edi,PITCH*8-8
|
|
mov Block2.Ref1Addr,esi
|
|
mov Block2.Ref2Addr,edi
|
|
sub esi,8
|
|
mov eax,OFFSET MotionOffsets
|
|
mov MotionOffsetsCursor,eax
|
|
sub edi,8
|
|
mov Block1.Ref1Addr,esi
|
|
mov Block1.Ref2Addr,edi
|
|
|
|
;
|
|
; This loop will execute 6 times:
|
|
; +- 4 pels horizontally
|
|
; +- 4 pels vertically
|
|
; +- 2 pels horizontally
|
|
; +- 2 pels vertically
|
|
; +- 1 pel horizontally
|
|
; +- 1 pel vertically
|
|
; It terminates when ref1 = ref2. This simple termination
|
|
; condition is what forces unrestricted motion vectors (UMV)
|
|
; to be ON when advanced prediction (4MV) is ON. Otherwise
|
|
; we would need a state engine as above to distinguish edge
|
|
; pels.
|
|
;
|
|
BlkFullPelSWDLoop_4blks:
|
|
|
|
; esi -- Points to ref1
|
|
; edi -- Points to ref2
|
|
; ecx -- Upper 24 bits zero
|
|
; ebx -- Upper 24 bits zero
|
|
|
|
call DoSWDLoop
|
|
|
|
; ebp -- Ref1 SWD for block 4
|
|
; edx -- Ref2 SWD for block 4
|
|
; ecx -- Upper 24 bits zero
|
|
; ebx -- Upper 24 bits zero
|
|
|
|
mov eax,MotionOffsetsCursor
|
|
|
|
BlkFullPelSWDLoop_1blk:
|
|
|
|
xor esi,esi
|
|
cmp ebp,edx ; CF == 1 iff ref1 SWD < ref2 SWD.
|
|
mov edi,BlockNM1.CentralInterSWD_BLS; Get SWD for central pt of these 3 refs
|
|
adc esi,esi ; esi == 1 iff ref1 SWD < ref2 SWD.
|
|
cmp ebp,edi ; CF == 1 iff ref1 SWD < central SWD.
|
|
mov ebp,BlockNM2.Ref1InterSWD ; Fetch next block's Ref1 SWD.
|
|
adc esi,esi
|
|
cmp edx,edi ; CF == 1 iff ref2 SWD < central SWD.
|
|
adc esi,esi ; 0 --> Pick central point.
|
|
; ; 1 --> Pick ref2.
|
|
; ; 2 --> Not possible.
|
|
; ; 3 --> Pick ref2.
|
|
; ; 4 --> Pick central point.
|
|
; ; 5 --> Not possible.
|
|
; ; 6 --> Pick ref1.
|
|
; ; 7 --> Pick ref1.
|
|
mov edx,BlockNM2.Ref2InterSWD ; Fetch next block's Ref2 SWD.
|
|
sub esp,BlockLen ; Move ahead to next block.
|
|
mov edi,[eax] ; Next ref2 motion vector offset.
|
|
mov cl,PickPoint_BLS[esi] ; cl == 6: central pt; 2: ref1; 4: ref2
|
|
mov ebx,esp ; For testing completion.
|
|
;
|
|
;
|
|
mov esi,BlockN.AddrCentralPoint[ecx*2-12] ; Get the addr for pt we picked.
|
|
mov ecx,BlockN.CentralInterSWD[ecx*2] ; Get the SWD for point we picked.
|
|
mov BlockN.AddrCentralPoint,esi ; Stash addr for new central point.
|
|
sub esi,edi ; Compute next ref1 addr.
|
|
mov BlockN.Ref1Addr,esi ; Stash next ref1 addr.
|
|
mov BlockN.CentralInterSWD_BLS,ecx ; Stash the SWD for central point.
|
|
lea edi,[esi+edi*2] ; Compute next ref2 addr.
|
|
xor ecx,ecx
|
|
mov BlockN.Ref2Addr,edi ; Stash next ref2 addr.
|
|
and ebx,00000001FH ; Done when esp at 32-byte bound.
|
|
jne BlkFullPelSWDLoop_1blk
|
|
|
|
add esp,BlockLen*4
|
|
add eax,4 ; Advance MotionOffsets pointer.
|
|
mov MotionOffsetsCursor,eax
|
|
cmp esi,edi
|
|
jne BlkFullPelSWDLoop_4blks
|
|
|
|
IF PITCH-384
|
|
*** Error: The magic leaks out of the following code if PITCH isn't 384.
|
|
ENDIF
|
|
|
|
;
|
|
; The following code has been modified to correctly decode the motion vectors
|
|
; The previous code was simply subtracting the target frame base address
|
|
; from the chosen (central) reference block address.
|
|
; What is now done is the begining reference macroblock address computed
|
|
; in ebp, then subtracted from the chosen (central) reference block address.
|
|
; Then, for blocks 2, 3, and 4, the distance from block 1 to that block
|
|
; is subtracted. Care was taken to preserve the original pairing.
|
|
;
|
|
mov esi,Block1.AddrCentralPoint ; B1a Reload address of central ref block.
|
|
mov ebp,TargetMBAddr ; **** CHANGE **** addr. of target MB
|
|
|
|
mov edi,Block2.AddrCentralPoint ; B2a
|
|
add ebp,TargToRef ; **** CHANGE **** add Reference - Target
|
|
|
|
; mov ebp,PreviousFrameBaseAddress **** CHANGE **** DELETED
|
|
|
|
mov Block1.Ref1Addr,esi ; B1b Stash addr central ref block.
|
|
sub esi,ebp ; B1c Addr of ref blk, but in target frame.
|
|
|
|
mov Block2.Ref1Addr,edi ; B2b
|
|
sub edi,ebp ; B2c
|
|
|
|
sub edi,8 ; **** CHANGE **** Correct for block 2
|
|
mov eax,esi ; B1e Copy linearized MV.
|
|
|
|
sar esi,8 ; B1f High 24 bits of lin MV lookup vert MV.
|
|
mov ebx,edi ; B2e
|
|
|
|
sar edi,8 ; B2f
|
|
add eax,eax ; B1g Sign extend HMV; *2 (# of half pels).
|
|
|
|
mov Block1.BlkHMV,al ; B1h Save winning horz motion vector.
|
|
add ebx,ebx ; B2g
|
|
|
|
mov Block2.BlkHMV,bl ; B2h
|
|
mov al,UnlinearizedVertMV[esi] ; B1i Look up proper vert motion vector.
|
|
|
|
mov Block1.BlkVMV,al ; B1j Save winning vert motion vector.
|
|
mov al,UnlinearizedVertMV[edi] ; B2i
|
|
|
|
mov esi,Block3.AddrCentralPoint ; B3a
|
|
mov edi,Block4.AddrCentralPoint ; B4a
|
|
|
|
mov Block3.Ref1Addr,esi ; B3b
|
|
mov Block4.Ref1Addr,edi ; B4b
|
|
|
|
mov Block2.BlkVMV,al ; B2j
|
|
sub esi,ebp ; B3c
|
|
|
|
sub esi,8*PITCH ; **** CHANGE **** Correct for block 3
|
|
sub edi,ebp ; B4c
|
|
|
|
sub edi,8*PITCH+8 ; **** CHANGE **** Correct for block 4
|
|
mov eax,esi ; B3e
|
|
|
|
sar esi,8 ; B3f
|
|
mov ebx,edi ; B4e
|
|
|
|
sar edi,8 ; B4f
|
|
add eax,eax ; B3g
|
|
|
|
mov Block3.BlkHMV,al ; B3h
|
|
add ebx,ebx ; B4g
|
|
|
|
mov Block4.BlkHMV,bl ; B4h
|
|
mov al,UnlinearizedVertMV[esi] ; B3i
|
|
|
|
mov Block3.BlkVMV,al ; B3j
|
|
mov al,UnlinearizedVertMV[edi] ; B4i
|
|
|
|
mov ebp,Block1.CentralInterSWD_BLS
|
|
mov ebx,Block2.CentralInterSWD_BLS
|
|
|
|
add ebp,Block3.CentralInterSWD_BLS
|
|
add ebx,Block4.CentralInterSWD_BLS
|
|
|
|
add ebx,ebp
|
|
mov Block4.BlkVMV,al ; B4j
|
|
|
|
mov ecx,DoHalfPelEstimation
|
|
mov MBCentralInterSWD_BLS,ebx
|
|
|
|
test ecx,ecx
|
|
je NoHalfPelBlockLevelMVs
|
|
|
|
HalfPelBlockLevelMotionSearch:
|
|
|
|
mov edi,Block1.AddrCentralPoint
|
|
xor ebp,ebp
|
|
|
|
; ebp -- Initialized to 0, implying can search both left and right.
|
|
; edi -- Ref addr for block 1. Ref1 is .5 pel to left. Ref2 is .5 to right.
|
|
|
|
call DoSWDHalfPelHorzLoop
|
|
|
|
; ebp, ebx -- Zero
|
|
; ecx -- Ref1 SWD for block 4
|
|
; edx -- Ref2 SWD for block 4
|
|
|
|
NextBlkHorz:
|
|
|
|
mov ebx,BlockNM1.CentralInterSWD_BLS
|
|
cmp ecx,edx
|
|
mov BlockNM1.HalfPelSavings,ebp
|
|
jl BlkHorz_Ref1LTRef2
|
|
|
|
mov al,BlockNM1.BlkHMV
|
|
sub esp,BlockLen
|
|
sub ebx,edx
|
|
jle BlkHorz_CenterBest
|
|
|
|
inc al
|
|
mov BlockN.HalfPelSavings,ebx
|
|
mov BlockN.BlkHMV,al
|
|
jmp BlkHorz_Done
|
|
|
|
BlkHorz_Ref1LTRef2:
|
|
|
|
mov al,BlockNM1.BlkHMV
|
|
sub esp,BlockLen
|
|
sub ebx,ecx
|
|
jle BlkHorz_CenterBest
|
|
|
|
mov ecx,BlockN.Ref1Addr
|
|
dec al
|
|
mov BlockN.HalfPelSavings,ebx
|
|
dec ecx
|
|
mov BlockN.BlkHMV,al
|
|
mov BlockN.Ref1Addr,ecx
|
|
|
|
BlkHorz_CenterBest:
|
|
BlkHorz_Done:
|
|
|
|
mov ecx,BlockNM1.Ref1InterSWD
|
|
mov edx,BlockNM1.Ref2InterSWD
|
|
test esp,000000018H
|
|
jne NextBlkHorz
|
|
|
|
mov edi,BlockN.AddrCentralPoint
|
|
add esp,BlockLen*4
|
|
|
|
; ebp -- Initialized to 0, implying search both up and down is okay.
|
|
; edi -- Ref addr for block 1. Ref1 is .5 pel above. Ref2 is .5 below.
|
|
|
|
call DoSWDHalfPelVertLoop
|
|
|
|
; ebp, ebx -- Zero
|
|
; ecx -- Ref1 SWD for block 4
|
|
; edx -- Ref2 SWD for block 4
|
|
|
|
NextBlkVert:
|
|
|
|
mov ebx,BlockNM1.CentralInterSWD_BLS
|
|
cmp ecx,edx
|
|
mov edi,BlockNM1.HalfPelSavings
|
|
jl BlkVert_Ref1LTRef2
|
|
|
|
mov al,BlockNM1.BlkVMV
|
|
sub esp,BlockLen
|
|
sub edx,ebx
|
|
jge BlkVert_CenterBest
|
|
|
|
inc al
|
|
sub edi,edx
|
|
mov BlockN.BlkVMV,al
|
|
jmp BlkVert_Done
|
|
|
|
BlkVert_Ref1LTRef2:
|
|
|
|
mov al,BlockNM1.BlkVMV
|
|
sub esp,BlockLen
|
|
sub ecx,ebx
|
|
jge BlkVert_CenterBest
|
|
|
|
sub edi,ecx
|
|
mov ecx,BlockN.Ref1Addr
|
|
dec al
|
|
sub ecx,PITCH
|
|
mov BlockN.BlkVMV,al
|
|
mov BlockN.Ref1Addr,ecx
|
|
|
|
BlkVert_CenterBest:
|
|
BlkVert_Done:
|
|
|
|
mov ecx,BlockNM1.Ref1InterSWD
|
|
sub ebx,edi
|
|
mov BlockN.CentralInterSWD_BLS,ebx
|
|
mov edx,BlockNM1.Ref2InterSWD
|
|
test esp,000000018H
|
|
lea ebp,[ebp+edi]
|
|
jne NextBlkVert
|
|
|
|
mov ebx,MBCentralInterSWD_BLS+BlockLen*4
|
|
add esp,BlockLen*4
|
|
sub ebx,ebp
|
|
xor eax,eax ; ??? Keep pairing happy
|
|
|
|
NoHalfPelBlockLevelMVs:
|
|
|
|
mov eax,MBCentralInterSWD
|
|
mov ecx,BlockMVDifferential
|
|
sub eax,ebx
|
|
mov edi,MB0MVInterSWD
|
|
cmp eax,ecx
|
|
jle BlockMVNotBigEnoughGain
|
|
|
|
sub edi,ebx
|
|
mov ecx,NonZeroMVDifferential
|
|
cmp edi,ecx
|
|
jle NonZeroMVNotBigEnoughGain
|
|
|
|
; Block motion vectors are best.
|
|
|
|
mov MBCentralInterSWD,ebx ; Set MBlock's SWD to sum of 4 blocks.
|
|
mov edx,MBlockActionStream
|
|
mov eax,Block1.CentralInterSWD_BLS ; Set each block's SWD.
|
|
mov ebx,Block2.CentralInterSWD_BLS
|
|
mov Block1.CentralInterSWD,eax
|
|
mov Block2.CentralInterSWD,ebx
|
|
mov eax,Block3.CentralInterSWD_BLS
|
|
mov ebx,Block4.CentralInterSWD_BLS
|
|
mov Block3.CentralInterSWD,eax
|
|
mov Block4.CentralInterSWD,ebx
|
|
mov eax,Block1.BlkMVs ; Set each block's motion vector.
|
|
mov ebx,Block2.BlkMVs
|
|
mov [edx].BlkY1.MVs,eax
|
|
mov [edx].BlkY2.MVs,ebx
|
|
mov eax,Block3.BlkMVs
|
|
mov ebx,Block4.BlkMVs
|
|
mov [edx].BlkY3.MVs,eax
|
|
mov [edx].BlkY4.MVs,ebx
|
|
mov eax,Block1.Ref1Addr ; Set each block's reference blk addr.
|
|
mov ebx,Block2.Ref1Addr
|
|
mov [edx].BlkY1.PastRef,eax
|
|
mov [edx].BlkY2.PastRef,ebx
|
|
mov eax,Block3.Ref1Addr
|
|
mov ebx,Block4.Ref1Addr
|
|
mov [edx].BlkY3.PastRef,eax
|
|
mov eax,INTER4MV ; Set type for MB to INTER-coded, 4 MVs.
|
|
mov [edx].BlkY4.PastRef,ebx
|
|
mov [edx].BlockType,al
|
|
jmp MotionVectorSettled
|
|
|
|
NoBlockMotionVectors:
|
|
|
|
ENDIF ; H263
|
|
|
|
mov edi,MB0MVInterSWD
|
|
|
|
BlockMVNotBigEnoughGain: ; Try MB-level motion vector.
|
|
|
|
mov eax,MBCentralInterSWD
|
|
mov ecx,NonZeroMVDifferential
|
|
sub edi,eax
|
|
mov edx,MBlockActionStream
|
|
cmp edi,ecx
|
|
jg MotionVectorSettled
|
|
|
|
NonZeroMVNotBigEnoughGain: ; Settle on zero MV.
|
|
|
|
mov eax,Block1.ZeroMVInterSWD ; Restore Zero MV SWD.
|
|
mov edx,Block2.ZeroMVInterSWD
|
|
mov Block1.CentralInterSWD,eax
|
|
mov Block2.CentralInterSWD,edx
|
|
mov eax,Block3.ZeroMVInterSWD
|
|
mov edx,Block4.ZeroMVInterSWD
|
|
mov Block3.CentralInterSWD,eax
|
|
mov Block4.CentralInterSWD,edx
|
|
mov eax,MB0MVInterSWD ; Restore SWD for zero motion vector.
|
|
|
|
BelowZeroThresh:
|
|
|
|
mov edx,MBlockActionStream
|
|
mov ebx,TargetMBAddr ; Get address of this target macroblock.
|
|
mov MBCentralInterSWD,eax ; Save SWD.
|
|
xor ebp,ebp
|
|
add ebx,TargToRef
|
|
mov [edx].BlkY1.MVs,ebp ; Set horz and vert MVs to 0 in all blks.
|
|
mov [edx].BlkY1.PastRef,ebx ; Save address of ref block, all blks.
|
|
add ebx,8
|
|
mov [edx].BlkY2.PastRef,ebx
|
|
mov [edx].BlkY2.MVs,ebp
|
|
lea ecx,[ebx+PITCH*8]
|
|
add ebx,PITCH*8-8
|
|
mov [edx].BlkY3.PastRef,ebx
|
|
mov [edx].BlkY3.MVs,ebp
|
|
mov [edx].BlkY4.PastRef,ecx
|
|
mov [edx].BlkY4.MVs,ebp
|
|
|
|
; Activity Details for this section of code (refer to flow diagram above):
|
|
;
|
|
; 6) We've settled on the motion vector that will be used if we do indeed
|
|
; code the macroblock with inter-coding. We need to determine if some
|
|
; or all of the blocks can be forced as empty (copy).
|
|
; blocks. If all the blocks can be forced empty, we force the whole
|
|
; macroblock to be empty.
|
|
;
|
|
; Expected Pentium (tm) microprocessor performance for this section:
|
|
;
|
|
; Execution frequency: Once per macroblock.
|
|
;
|
|
; 23 clocks.
|
|
;
|
|
|
|
MotionVectorSettled:
|
|
|
|
IFDEF H261
|
|
mov edi,MBCentralInterSWD
|
|
mov eax,DoSpatialFiltering ; Are we doing spatial filtering?
|
|
mov edi,TargetMBAddr
|
|
test eax,eax
|
|
je SkipSpatialFiltering
|
|
|
|
mov ebx,MBCentralInterSWD
|
|
mov esi,SpatialFiltThreshold
|
|
cmp ebx,esi
|
|
jle SkipSpatialFiltering
|
|
|
|
add edi,TargToSLF ; Compute addr at which to put SLF prediction.
|
|
xor ebx,ebx
|
|
mov esi,[edx].BlkY1.PastRef
|
|
xor edx,edx
|
|
mov ebp,16
|
|
xor ecx,ecx
|
|
|
|
SpatialFilterHorzLoop:
|
|
|
|
mov dl,[edi] ; Pre-load cache line for output.
|
|
mov bl,[esi+6] ; p6
|
|
mov al,[esi+7] ; p7
|
|
inc bl ; p6+1
|
|
mov cl,[esi+5] ; p5
|
|
mov [edi+7],al ; p7' = p7
|
|
add al,bl ; p7 + p6 + 1
|
|
add bl,cl ; p6 + p5 + 1
|
|
mov dl,[esi+4] ; p4
|
|
add eax,ebx ; p7 + 2p6 + p5 + 2
|
|
shr eax,2 ; p6' = (p7 + 2p6 + p5 + 2) / 4
|
|
inc dl ; p4 + 1
|
|
add cl,dl ; p5 + p4 + 1
|
|
mov [edi+6],al ; p6'
|
|
mov al,[esi+3] ; p3
|
|
add ebx,ecx ; p6 + 2p5 + p4 + 2
|
|
shr ebx,2 ; p5' = (p6 + 2p5 + p4 + 2) / 4
|
|
add dl,al ; p4 + p3 + 1
|
|
mov [edi+5],bl ; p5'
|
|
mov bl,[esi+2] ; p2
|
|
add ecx,edx ; p5 + 2p4 + p3 + 2
|
|
inc bl ; p2 + 1
|
|
shr ecx,2 ; p4' = (p5 + 2p4 + p3 + 2) / 4
|
|
add al,bl ; p3 + p2 + 1
|
|
mov [edi+4],cl ; p4'
|
|
add edx,eax ; p4 + 2p3 + p2 + 2
|
|
shr edx,2 ; p3' = (p4 + 2p3 + p2 + 2) / 4
|
|
mov cl,[esi+1] ; p1
|
|
add bl,cl ; p2 + p1 + 1
|
|
mov [edi+3],dl ; p3'
|
|
add eax,ebx ; p3 + 2p2 + p1 + 2
|
|
mov dl,[esi] ; p0
|
|
shr eax,2 ; p2' = (p3 + 2p2 + p1 + 2) / 4
|
|
inc ebx ; p2 + p1 + 2
|
|
mov [edi+2],al ; p2'
|
|
add ebx,ecx ; p2 + 2p1 + 2
|
|
mov [edi],dl ; p0' = p0
|
|
add ebx,edx ; p2 + 2p1 + p0 + 2
|
|
shr ebx,2 ; p1' = (p2 + 2p1 + p0 + 2) / 4
|
|
mov al,[esi+7+8]
|
|
mov [edi+1],bl ; p1'
|
|
mov bl,[esi+6+8]
|
|
inc bl
|
|
mov cl,[esi+5+8]
|
|
mov [edi+7+8],al
|
|
add al,bl
|
|
add bl,cl
|
|
mov dl,[esi+4+8]
|
|
add eax,ebx
|
|
;
|
|
shr eax,2
|
|
inc dl
|
|
add cl,dl
|
|
mov [edi+6+8],al
|
|
mov al,[esi+3+8]
|
|
add ebx,ecx
|
|
shr ebx,2
|
|
add dl,al
|
|
mov [edi+5+8],bl
|
|
mov bl,[esi+2+8]
|
|
add ecx,edx
|
|
inc bl
|
|
shr ecx,2
|
|
add al,bl
|
|
mov [edi+4+8],cl
|
|
add edx,eax
|
|
shr edx,2
|
|
mov cl,[esi+1+8]
|
|
add bl,cl
|
|
mov [edi+3+8],dl
|
|
add eax,ebx
|
|
mov dl,[esi+8]
|
|
shr eax,2
|
|
inc ebx
|
|
mov [edi+2+8],al
|
|
add ebx,ecx
|
|
mov [edi+8],dl
|
|
add ebx,edx
|
|
shr ebx,2
|
|
add esi,PITCH
|
|
mov [edi+1+8],bl
|
|
add edi,PITCH
|
|
dec ebp ; Done?
|
|
jne SpatialFilterHorzLoop
|
|
|
|
mov VertFilterDoneAddr,edi
|
|
sub edi,PITCH*16
|
|
|
|
SpatialFilterVertLoop:
|
|
|
|
mov eax,[edi] ; p0
|
|
; ; Bank conflict for sure.
|
|
;
|
|
mov ebx,[edi+PITCH] ; p1
|
|
add eax,ebx ; p0+p1
|
|
mov ecx,[edi+PITCH*2] ; p2
|
|
add ebx,ecx ; p1+p2
|
|
mov edx,[edi+PITCH*3] ; p3
|
|
shr eax,1 ; (p0+p1)/2 dirty
|
|
mov esi,[edi+PITCH*4] ; p4
|
|
add ecx,edx ; p2+p3
|
|
mov ebp,[edi+PITCH*5] ; p5
|
|
shr ebx,1 ; (p1+p2)/2 dirty
|
|
add edx,esi ; p3+p4
|
|
and eax,07F7F7F7FH ; (p0+p1)/2 clean
|
|
and ebx,07F7F7F7FH ; (p1+p2)/2 clean
|
|
and ecx,0FEFEFEFEH ; p2+p3 pre-cleaned
|
|
and edx,0FEFEFEFEH ; p3+p4 pre-cleaned
|
|
shr ecx,1 ; (p2+p3)/2 clean
|
|
add esi,ebp ; p4+p5
|
|
shr edx,1 ; (p3+p4)/2 clean
|
|
lea eax,[eax+ebx+001010101H] ; (p0+p1)/2+(p1+p2)/2+1
|
|
shr esi,1 ; (p4+p5)/2 dirty
|
|
;
|
|
and esi,07F7F7F7FH ; (p4+p5)/2 clean
|
|
lea ebx,[ebx+ecx+001010101H] ; (p1+p2)/2+(p2+p3)/2+1
|
|
shr eax,1 ; p1' = ((p0+p1)/2+(p1+p2)/2+1)/2 dirty
|
|
lea ecx,[ecx+edx+001010101H] ; (p2+p3)/2+(p3+p4)/2+1
|
|
shr ebx,1 ; p2' = ((p1+p2)/2+(p2+p3)/2+1)/2 dirty
|
|
lea edx,[edx+esi+001010101H] ; (p3+p4)/2+(p4+p5)/2+1
|
|
and eax,07F7F7F7FH ; p1' clean
|
|
and ebx,07F7F7F7FH ; p2' clean
|
|
shr ecx,1 ; p3' = ((p2+p3)/2+(p3+p4)/2+1)/2 dirty
|
|
mov [edi+PITCH],eax ; p1'
|
|
shr edx,1 ; p4' = ((p3+p4)/2+(p4+p5)/2+1)/2 dirty
|
|
mov eax,[edi+PITCH*6] ; p6
|
|
and ecx,07F7F7F7FH ; p3' clean
|
|
and edx,07F7F7F7FH ; p4' clean
|
|
mov [edi+PITCH*2],ebx ; p2'
|
|
add ebp,eax ; p5+p6
|
|
shr ebp,1 ; (p5+p6)/2 dirty
|
|
mov ebx,[edi+PITCH*7] ; p7
|
|
add eax,ebx ; p6+p7
|
|
and ebp,07F7F7F7FH ; (p5+p6)/2 clean
|
|
mov [edi+PITCH*3],ecx ; p3'
|
|
and eax,0FEFEFEFEH ; (p6+p7)/2 pre-cleaned
|
|
shr eax,1 ; (p6+p7)/2 clean
|
|
lea esi,[esi+ebp+001010101H] ; (p4+p5)/2+(p5+p6)/2+1
|
|
shr esi,1 ; p5' = ((p4+p5)/2+(p5+p6)/2+1)/2 dirty
|
|
mov [edi+PITCH*4],edx ; p4'
|
|
lea ebp,[ebp+eax+001010101H] ; (p5+p6)/2+(p6+p7)/2+1
|
|
and esi,07F7F7F7FH ; p5' clean
|
|
shr ebp,1 ; p6' = ((p5+p6)/2+(p6+p7)/2+1)/2 dirty
|
|
mov [edi+PITCH*5],esi ; p5'
|
|
and ebp,07F7F7F7FH ; p6' clean
|
|
add edi,4
|
|
test edi,00000000FH
|
|
mov [edi+PITCH*6-4],ebp ; p6'
|
|
jne SpatialFilterVertLoop
|
|
|
|
add edi,PITCH*8-16
|
|
mov eax,VertFilterDoneAddr
|
|
cmp eax,edi
|
|
jne SpatialFilterVertLoop
|
|
|
|
|
|
; Activity Details for this section of code (refer to flow diagram above):
|
|
;
|
|
; 9) The SAD for the spatially filtered reference macroblock is calculated
|
|
; with half the pel differences accumulating into the low order half
|
|
; of ebp, and the other half into the high order half.
|
|
;
|
|
; Register usage for this section:
|
|
;
|
|
; Input of this section:
|
|
;
|
|
; edi -- Address of pel 0,0 of spatially filtered reference macroblock.
|
|
;
|
|
; Predominate usage for body of this section:
|
|
;
|
|
; edi -- Address of pel 0,0 of spatially filtered reference macroblock.
|
|
; esi, eax -- -8 times pel values from target macroblock.
|
|
; ebp[ 0:15] -- SAD Accumulator for half of the match points.
|
|
; ebp[16:31] -- SAD Accumulator for other half of the match points.
|
|
; edx[ 0: 7] -- Weighted difference for one pel.
|
|
; edx[ 8:15] -- Zero.
|
|
; edx[16:23] -- Weighted difference for another pel.
|
|
; edx[24:31] -- Zero.
|
|
; bl, cl -- Pel values from the spatially filtered reference macroblock.
|
|
;
|
|
; Expected Pentium (tm) microprocessor performance for this section:
|
|
;
|
|
; Execution frequency: Once per block for which motion analysis is done
|
|
; beyond the 0-motion vector.
|
|
;
|
|
; 146 clocks instruction execution (typically).
|
|
; 6 clocks for bank conflicts (1/8 chance with 48 dual mem ops).
|
|
; 0 clocks for new cache line fills.
|
|
; ----
|
|
; 152 clocks total time for this section.
|
|
;
|
|
|
|
SpatialFilterDone:
|
|
|
|
sub edi,PITCH*8-8 ; Get to block 4.
|
|
xor ebp,ebp
|
|
xor ebx,ebx
|
|
xor ecx,ecx
|
|
|
|
SLFSWDLoop:
|
|
|
|
mov eax,BlockNM1.N8T00 ; Get -8 times target Pel00.
|
|
mov bl,[edi] ; Get Pel00 in spatially filtered reference.
|
|
mov esi,BlockNM1.N8T04
|
|
mov cl,[edi+4]
|
|
mov edx,[eax+ebx*8] ; Get abs diff for spatial filtered ref pel00.
|
|
mov eax,BlockNM1.N8T02
|
|
mov dl,[esi+ecx*8+2] ; Get abs diff for spatial filtered ref pel04.
|
|
mov bl,[edi+2]
|
|
mov esi,BlockNM1.N8T06
|
|
mov cl,[edi+6]
|
|
mov ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
mov eax,BlockNM1.N8T11
|
|
mov dl,[esi+ecx*8+2]
|
|
mov bl,[edi+PITCH*1+1]
|
|
mov cl,[edi+PITCH*1+5]
|
|
mov esi,BlockNM1.N8T15
|
|
add ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
mov eax,BlockNM1.N8T13
|
|
mov dl,[esi+ecx*8+2]
|
|
mov bl,[edi+PITCH*1+3]
|
|
mov cl,[edi+PITCH*1+7]
|
|
mov esi,BlockNM1.N8T17
|
|
add ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
mov eax,BlockNM1.N8T20
|
|
mov dl,[esi+ecx*8+2]
|
|
mov bl,[edi+PITCH*2+0]
|
|
mov cl,[edi+PITCH*2+4]
|
|
mov esi,BlockNM1.N8T24
|
|
add ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
mov eax,BlockNM1.N8T22
|
|
mov dl,[esi+ecx*8+2]
|
|
mov bl,[edi+PITCH*2+2]
|
|
mov cl,[edi+PITCH*2+6]
|
|
mov esi,BlockNM1.N8T26
|
|
add ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
mov eax,BlockNM1.N8T31
|
|
mov dl,[esi+ecx*8+2]
|
|
mov bl,[edi+PITCH*3+1]
|
|
mov cl,[edi+PITCH*3+5]
|
|
mov esi,BlockNM1.N8T35
|
|
add ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
mov eax,BlockNM1.N8T33
|
|
mov dl,[esi+ecx*8+2]
|
|
mov bl,[edi+PITCH*3+3]
|
|
mov cl,[edi+PITCH*3+7]
|
|
mov esi,BlockNM1.N8T37
|
|
add ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
mov eax,BlockNM1.N8T40
|
|
mov dl,[esi+ecx*8+2]
|
|
mov bl,[edi+PITCH*4+0]
|
|
mov cl,[edi+PITCH*4+4]
|
|
mov esi,BlockNM1.N8T44
|
|
add ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
mov eax,BlockNM1.N8T42
|
|
mov dl,[esi+ecx*8+2]
|
|
mov bl,[edi+PITCH*4+2]
|
|
mov cl,[edi+PITCH*4+6]
|
|
mov esi,BlockNM1.N8T46
|
|
add ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
mov eax,BlockNM1.N8T51
|
|
mov dl,[esi+ecx*8+2]
|
|
mov bl,[edi+PITCH*5+1]
|
|
mov cl,[edi+PITCH*5+5]
|
|
mov esi,BlockNM1.N8T55
|
|
add ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
mov eax,BlockNM1.N8T53
|
|
mov dl,[esi+ecx*8+2]
|
|
mov bl,[edi+PITCH*5+3]
|
|
mov cl,[edi+PITCH*5+7]
|
|
mov esi,BlockNM1.N8T57
|
|
add ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
mov eax,BlockNM1.N8T60
|
|
mov dl,[esi+ecx*8+2]
|
|
mov bl,[edi+PITCH*6+0]
|
|
mov cl,[edi+PITCH*6+4]
|
|
mov esi,BlockNM1.N8T64
|
|
add ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
mov eax,BlockNM1.N8T62
|
|
mov dl,[esi+ecx*8+2]
|
|
mov bl,[edi+PITCH*6+2]
|
|
mov cl,[edi+PITCH*6+6]
|
|
mov esi,BlockNM1.N8T66
|
|
add ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
mov eax,BlockNM1.N8T71
|
|
mov dl,[esi+ecx*8+2]
|
|
mov bl,[edi+PITCH*7+1]
|
|
mov cl,[edi+PITCH*7+5]
|
|
mov esi,BlockNM1.N8T75
|
|
add ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
mov eax,BlockNM1.N8T73
|
|
mov dl,[esi+ecx*8+2]
|
|
mov bl,[edi+PITCH*7+3]
|
|
mov cl,[edi+PITCH*7+7]
|
|
mov esi,BlockNM1.N8T77
|
|
add ebp,edx
|
|
mov edx,[eax+ebx*8]
|
|
add edx,ebp
|
|
mov cl,[esi+ecx*8+2]
|
|
shr edx,16
|
|
add ebp,ecx
|
|
and ebp,0FFFFH
|
|
sub esp,BlockLen
|
|
add ebp,edx
|
|
sub edi,8
|
|
test esp,000000008H
|
|
mov BlockN.CentralInterSWD_SLF,ebp
|
|
jne SLFSWDLoop
|
|
|
|
test esp,000000010H
|
|
lea edi,[edi-PITCH*8+16]
|
|
jne SLFSWDLoop
|
|
|
|
mov eax,Block2.CentralInterSWD_SLF+BlockLen*4
|
|
mov ebx,Block3.CentralInterSWD_SLF+BlockLen*4
|
|
mov ecx,Block4.CentralInterSWD_SLF+BlockLen*4
|
|
add esp,BlockLen*4
|
|
add ebp,ecx
|
|
lea edx,[eax+ebx]
|
|
add ebp,edx
|
|
mov edx,SpatialFiltDifferential
|
|
lea esi,[edi+PITCH*8-8]
|
|
mov edi,MBCentralInterSWD
|
|
sub edi,edx
|
|
mov edx,MBlockActionStream
|
|
cmp ebp,edi
|
|
jge SpatialFilterNotAsGood
|
|
|
|
mov MBCentralInterSWD,ebp ; Spatial filter was better. Stash
|
|
mov ebp,Block1.CentralInterSWD_SLF ; pertinent calculations.
|
|
mov Block2.CentralInterSWD,eax
|
|
mov Block3.CentralInterSWD,ebx
|
|
mov Block4.CentralInterSWD,ecx
|
|
mov Block1.CentralInterSWD,ebp
|
|
mov [edx].BlkY1.PastRef,esi
|
|
mov al,INTERSLF
|
|
mov [edx].BlockType,al
|
|
|
|
SkipSpatialFiltering:
|
|
SpatialFilterNotAsGood:
|
|
ENDIF ; H261
|
|
|
|
mov al,[edx].CodedBlocks ; Fetch coded block pattern.
|
|
mov edi,EmptyThreshold ; Get threshold for forcing block empty?
|
|
mov ebp,MBCentralInterSWD
|
|
mov esi,InterSWDBlocks
|
|
mov ebx,Block4.CentralInterSWD ; Is SWD > threshold?
|
|
cmp ebx,edi
|
|
jg @f
|
|
|
|
and al,0F7H ; If not, indicate block 4 is NOT coded.
|
|
dec esi
|
|
sub ebp,ebx
|
|
|
|
@@:
|
|
|
|
mov ebx,Block3.CentralInterSWD
|
|
cmp ebx,edi
|
|
jg @f
|
|
|
|
and al,0FBH
|
|
dec esi
|
|
sub ebp,ebx
|
|
|
|
@@:
|
|
|
|
mov ebx,Block2.CentralInterSWD
|
|
cmp ebx,edi
|
|
jg @f
|
|
|
|
and al,0FDH
|
|
dec esi
|
|
sub ebp,ebx
|
|
|
|
@@:
|
|
|
|
mov ebx,Block1.CentralInterSWD
|
|
cmp ebx,edi
|
|
jg @f
|
|
|
|
and al,0FEH
|
|
dec esi
|
|
sub ebp,ebx
|
|
|
|
@@:
|
|
|
|
mov [edx].CodedBlocks,al ; Store coded block pattern.
|
|
add esi,4
|
|
mov InterSWDBlocks,esi
|
|
xor ebx,ebx
|
|
and eax,00FH
|
|
mov MBCentralInterSWD,ebp
|
|
cmp al,00FH ; Are any blocks marked empty?
|
|
jne InterBest ; If some blocks are empty, can't code as Intra
|
|
|
|
cmp ebp,InterCodingThreshold ; Is InterSWD below inter-coding threshhold.
|
|
lea esi,Block1+128
|
|
mov ebp,0
|
|
jae CalculateIntraSWD
|
|
|
|
InterBest:
|
|
|
|
mov ecx,InterSWDTotal
|
|
mov ebp,MBCentralInterSWD
|
|
add ecx,ebp ; Add to total for this macroblock class.
|
|
mov PD [edx].SWD,ebp
|
|
mov InterSWDTotal,ecx
|
|
jmp NextMacroBlock
|
|
|
|
|
|
; Activity Details for this section of code (refer to flow diagram above):
|
|
;
|
|
; 11) The IntraSWD is calculated as two partial sums, one in the low order
|
|
; 16 bits of ebp and one in the high order 16 bits. An average pel
|
|
; value for each block will be calculated to the nearest half.
|
|
;
|
|
; Register usage for this section:
|
|
;
|
|
; Input of this section:
|
|
;
|
|
; None
|
|
;
|
|
; Predominate usage for body of this section:
|
|
;
|
|
; esi -- Address of target block 1 (3), plus 128.
|
|
; ebp[ 0:15] -- IntraSWD Accumulator for block 1 (3).
|
|
; ebp[16:31] -- IntraSWD Accumulator for block 2 (4).
|
|
; edi -- Block 2 (4) target pel, times -8, and with WeightedDiff added.
|
|
; edx -- Block 1 (3) target pel, times -8, and with WeightedDiff added.
|
|
; ecx[ 0: 7] -- Weighted difference for one pel in block 2 (4).
|
|
; ecx[ 8:15] -- Zero.
|
|
; ecx[16:23] -- Weighted difference for one pel in block 1 (3).
|
|
; ecx[24:31] -- Zero.
|
|
; ebx -- Average block 2 (4) target pel to nearest .5.
|
|
; eax -- Average block 1 (3) target pel to nearest .5.
|
|
;
|
|
; Output of this section:
|
|
;
|
|
; edi -- Scratch.
|
|
; ebp[ 0:15] -- IntraSWD. (Also written to MBlockActionStream.)
|
|
; ebp[16:31] -- garbage.
|
|
; ebx -- Zero.
|
|
; eax -- MBlockActionStream.
|
|
;
|
|
; Expected Pentium (tm) microprocessor performance for this section:
|
|
;
|
|
; Executed once per macroblock, (except for those for which one of more blocks
|
|
; are marked empty, or where the InterSWD is less than a threshold).
|
|
;
|
|
; 183 clocks for instruction execution
|
|
; 12 clocks for bank conflicts (94 dual mem ops with 1/8 chance of conflict)
|
|
; ----
|
|
; 195 clocks total time for this section.
|
|
|
|
IntraByDecree:
|
|
|
|
mov eax,InterSWDBlocks ; Inc by 4, because we will undo it below.
|
|
xor ebp,ebp
|
|
mov MBMotionVectors,ebp ; Stash zero for MB level motion vectors.
|
|
mov ebp,040000000H ; Set Inter SWD artificially high.
|
|
lea esi,Block1+128
|
|
add eax,4
|
|
mov MBCentralInterSWD,ebp
|
|
mov InterSWDBlocks,eax
|
|
|
|
CalculateIntraSWD:
|
|
CalculateIntraSWDLoop:
|
|
|
|
mov eax,[esi-128].AccumTargetPels ; Fetch acc of target pels for 1st block.
|
|
mov edx,[esi-128].N8T00
|
|
add eax,8
|
|
mov ebx,[esi-128+BlockLen].AccumTargetPels
|
|
shr eax,4 ; Average block 1 target pel rounded to nearest .5.
|
|
add ebx,8
|
|
shr ebx,4
|
|
mov edi,[esi-128+BlockLen].N8T00
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T02
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T02
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T04
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T04
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T06
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T06
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T11
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T11
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T13
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T13
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T15
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T15
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T17
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T17
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T20
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T20
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T22
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T22
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T24
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T24
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T26
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T26
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T31
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T31
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T33
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T33
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T35
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T35
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T37
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T37
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T40
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T40
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T42
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T42
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T44
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T44
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T46
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T46
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T51
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T51
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T53
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T53
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T55
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T55
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T57
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T57
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T60
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T60
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T62
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T62
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T64
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T64
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T66
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T66
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T71
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T71
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T73
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T73
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T75
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T75
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov edx,[esi-128].N8T77
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov edi,[esi-128+BlockLen].N8T77
|
|
add ebp,ecx
|
|
mov ecx,PD [edx+eax*4]
|
|
mov cl,PB [edi+ebx*4+2]
|
|
mov eax,000007FFFH
|
|
add ebp,ecx
|
|
add esi,BlockLen*2
|
|
and eax,ebp
|
|
mov ecx,MBCentralInterSWD
|
|
shr ebp,16
|
|
sub ecx,IntraCodingDifferential
|
|
add ebp,eax
|
|
mov edx,MBlockActionStream ; Reload list ptr.
|
|
cmp ecx,ebp ; Is IntraSWD > InterSWD - differential?
|
|
jl InterBest
|
|
|
|
lea ecx,Block1+128+BlockLen*2
|
|
cmp ecx,esi
|
|
je CalculateIntraSWDLoop
|
|
|
|
|
|
; ebp -- IntraSWD
|
|
; edx -- MBlockActionStream
|
|
|
|
DoneCalcIntraSWD:
|
|
|
|
IntraBest:
|
|
|
|
mov ecx,IntraSWDTotal
|
|
mov edi,IntraSWDBlocks
|
|
add ecx,ebp ; Add to total for this macroblock class.
|
|
add edi,4 ; Accumulate # of blocks for this type.
|
|
mov IntraSWDBlocks,edi
|
|
mov edi,InterSWDBlocks
|
|
sub edi,4
|
|
mov IntraSWDTotal,ecx
|
|
mov InterSWDBlocks,edi
|
|
mov bl,INTRA
|
|
mov PB [edx].BlockType,bl ; Indicate macroblock handling decision.
|
|
IFDEF H261
|
|
xor ebx,ebx
|
|
ELSE ; H263
|
|
mov ebx,MBMotionVectors ; Set MVs to best MB level motion vectors.
|
|
ENDIF
|
|
mov PD [edx].BlkY1.MVs,ebx
|
|
mov PD [edx].BlkY2.MVs,ebx
|
|
mov PD [edx].BlkY3.MVs,ebx
|
|
mov PD [edx].BlkY4.MVs,ebx
|
|
xor ebx,ebx
|
|
mov PD [edx].SWD,ebp
|
|
jmp NextMacroBlock
|
|
|
|
;==============================================================================
|
|
; Internal functions
|
|
;==============================================================================
|
|
|
|
DoSWDLoop:
|
|
|
|
; Upon entry:
|
|
; esi -- Points to ref1
|
|
; edi -- Points to ref2
|
|
; ecx -- Upper 24 bits zero
|
|
; ebx -- Upper 24 bits zero
|
|
|
|
mov bl,PB [esi] ; 00A -- Get Pel 00 in reference ref1.
|
|
mov eax,Block1.N8T00+4 ; 00B -- Get -8 times target pel 00.
|
|
mov cl,PB [edi] ; 00C -- Get Pel 00 in reference ref2.
|
|
sub esp,BlockLen*4+28
|
|
|
|
SWDLoop:
|
|
|
|
mov edx,PD [eax+ebx*8] ; 00D -- Get weighted diff for ref1 pel 00.
|
|
mov bl,PB [esi+2] ; 02A
|
|
mov dl,PB [eax+ecx*8+2] ; 00E -- Get weighted diff for ref2 pel 00.
|
|
mov eax,BlockN.N8T02+32 ; 02B
|
|
mov ebp,edx ; 00F -- Accum weighted diffs for pel 00.
|
|
mov cl,PB [edi+2] ; 02C
|
|
mov edx,PD [eax+ebx*8] ; 02D
|
|
mov bl,PB [esi+4] ; 04A
|
|
mov dl,PB [eax+ecx*8+2] ; 02E
|
|
mov eax,BlockN.N8T04+32 ; 04B
|
|
mov cl,PB [edi+4] ; 04C
|
|
add ebp,edx ; 02F
|
|
mov edx,PD [eax+ebx*8] ; 04D
|
|
mov bl,PB [esi+6]
|
|
mov dl,PB [eax+ecx*8+2] ; 04E
|
|
mov eax,BlockN.N8T06+32
|
|
mov cl,PB [edi+6]
|
|
add ebp,edx ; 04F
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*1+1]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T11+32
|
|
mov cl,PB [edi+PITCH*1+1]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*1+3]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T13+32
|
|
mov cl,PB [edi+PITCH*1+3]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*1+5]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T15+32
|
|
mov cl,PB [edi+PITCH*1+5]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*1+7]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T17+32
|
|
mov cl,PB [edi+PITCH*1+7]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*2+0]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T20+32
|
|
mov cl,PB [edi+PITCH*2+0]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*2+2]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T22+32
|
|
mov cl,PB [edi+PITCH*2+2]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*2+4]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T24+32
|
|
mov cl,PB [edi+PITCH*2+4]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*2+6]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T26+32
|
|
mov cl,PB [edi+PITCH*2+6]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*3+1]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T31+32
|
|
mov cl,PB [edi+PITCH*3+1]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*3+3]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T33+32
|
|
mov cl,PB [edi+PITCH*3+3]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*3+5]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T35+32
|
|
mov cl,PB [edi+PITCH*3+5]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*3+7]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T37+32
|
|
mov cl,PB [edi+PITCH*3+7]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*4+0]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T40+32
|
|
mov cl,PB [edi+PITCH*4+0]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*4+2]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T42+32
|
|
mov cl,PB [edi+PITCH*4+2]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*4+4]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T44+32
|
|
mov cl,PB [edi+PITCH*4+4]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*4+6]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T46+32
|
|
mov cl,PB [edi+PITCH*4+6]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*5+1]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T51+32
|
|
mov cl,PB [edi+PITCH*5+1]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*5+3]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T53+32
|
|
mov cl,PB [edi+PITCH*5+3]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*5+5]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T55+32
|
|
mov cl,PB [edi+PITCH*5+5]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*5+7]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T57+32
|
|
mov cl,PB [edi+PITCH*5+7]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*6+0]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T60+32
|
|
mov cl,PB [edi+PITCH*6+0]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*6+2]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T62+32
|
|
mov cl,PB [edi+PITCH*6+2]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*6+4]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T64+32
|
|
mov cl,PB [edi+PITCH*6+4]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*6+6]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T66+32
|
|
mov cl,PB [edi+PITCH*6+6]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*7+1]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T71+32
|
|
mov cl,PB [edi+PITCH*7+1]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*7+3]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T73+32
|
|
mov cl,PB [edi+PITCH*7+3]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*7+5]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T75+32
|
|
mov cl,PB [edi+PITCH*7+5]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
mov bl,PB [esi+PITCH*7+7]
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,BlockN.N8T77+32
|
|
mov cl,PB [edi+PITCH*7+7]
|
|
add ebp,edx
|
|
mov edx,PD [eax+ebx*8]
|
|
add esp,BlockLen
|
|
mov dl,PB [eax+ecx*8+2]
|
|
mov eax,ebp
|
|
add ebp,edx
|
|
add edx,eax
|
|
shr ebp,16 ; Extract SWD for ref1.
|
|
and edx,00000FFFFH ; Extract SWD for ref2.
|
|
mov esi,BlockN.Ref1Addr+32 ; Get address of next ref1 block.
|
|
mov edi,BlockN.Ref2Addr+32 ; Get address of next ref2 block.
|
|
mov BlockNM1.Ref1InterSWD+32,ebp ; Store SWD for ref1.
|
|
mov BlockNM1.Ref2InterSWD+32,edx ; Store SWD for ref2.
|
|
mov bl,PB [esi] ; 00A -- Get Pel 02 in reference ref1.
|
|
mov eax,BlockN.N8T00+32 ; 00B -- Get -8 times target pel 00.
|
|
test esp,000000018H ; Done when esp is 32-byte aligned.
|
|
mov cl,PB [edi] ; 00C -- Get Pel 02 in reference ref2.
|
|
jne SWDLoop
|
|
|
|
; Output:
|
|
; ebp -- Ref1 SWD for block 4
|
|
; edx -- Ref2 SWD for block 4
|
|
; ecx -- Upper 24 bits zero
|
|
; ebx -- Upper 24 bits zero
|
|
|
|
add esp,28
|
|
ret
|
|
|
|
IFDEF H261
|
|
ELSE ; H263
|
|
|
|
DoSWDHalfPelHorzLoop:
|
|
|
|
; ebp -- Initialized to 0, except when can't search off left or right edge.
|
|
; edi -- Ref addr for block 1. Ref1 is .5 pel to left. Ref2 is .5 to right.
|
|
|
|
xor ecx,ecx
|
|
sub esp,BlockLen*4+28
|
|
xor eax,eax
|
|
xor ebx,ebx
|
|
|
|
SWDHalfPelHorzLoop:
|
|
|
|
mov al,[edi] ; 00A -- Fetch center ref pel 00.
|
|
mov esi,BlockN.N8T00+32; 00B -- Target pel 00 (times -8).
|
|
mov bl,[edi+2] ; 02A -- Fetch center ref pel 02.
|
|
mov edx,BlockN.N8T02+32; 02B -- Target pel 02 (times -8).
|
|
lea esi,[esi+eax*4] ; 00C -- Combine target pel 00 and center ref pel 00.
|
|
mov al,[edi-1] ; 00D -- Get pel to left for match against pel 00.
|
|
lea edx,[edx+ebx*4] ; 02C -- Combine target pel 02 and center ref pel 02.
|
|
mov bl,[edi+1] ; 00E -- Get pel to right for match against pel 00,
|
|
; ; 02D -- and pel to left for match against pel 02.
|
|
mov ecx,[esi+eax*4] ; 00F -- [16:23] weighted diff for left ref pel 00.
|
|
mov al,[edi+3] ; 02E -- Get pel to right for match against pel 02.
|
|
add ebp,ecx ; 00G -- Accumulate left ref pel 00.
|
|
mov ecx,[edx+ebx*4] ; 02F -- [16:23] weighted diff for left ref pel 02.
|
|
mov cl,[edx+eax*4+2] ; 02H -- [0:7] is weighted diff for right ref pel 02.
|
|
mov al,[edi+4] ; 04A
|
|
add ebp,ecx ; 02I -- Accumulate right ref pel 02,
|
|
; ; 02G -- Accumulate left ref pel 02.
|
|
mov bl,[esi+ebx*4+2] ; 00H -- [0:7] is weighted diff for right ref pel 00.
|
|
add ebp,ebx ; 00I -- Accumulate right ref pel 00.
|
|
mov esi,BlockN.N8T04+32; 04B
|
|
mov bl,[edi+6] ; 06A
|
|
mov edx,BlockN.N8T06+32; 06B
|
|
lea esi,[esi+eax*4] ; 04C
|
|
mov al,[edi+3] ; 04D
|
|
lea edx,[edx+ebx*4] ; 06C
|
|
mov bl,[edi+5] ; 04E & 06D
|
|
mov ecx,[esi+eax*4] ; 04F
|
|
mov al,[edi+7] ; 06E
|
|
add ebp,ecx ; 04G
|
|
mov ecx,[edx+ebx*4] ; 06F
|
|
mov cl,[edx+eax*4+2] ; 06H
|
|
mov al,[edi+PITCH*1+1] ; 11A
|
|
add ebp,ecx ; 04I & 06G
|
|
mov bl,[esi+ebx*4+2] ; 04H
|
|
add ebp,ebx ; 04I
|
|
mov esi,BlockN.N8T11+32; 11B
|
|
mov bl,[edi+PITCH*1+3] ; 13A
|
|
mov edx,BlockN.N8T13+32; 13B
|
|
lea esi,[esi+eax*4] ; 11C
|
|
mov al,[edi+PITCH*1+0] ; 11D
|
|
lea edx,[edx+ebx*4] ; 13C
|
|
mov bl,[edi+PITCH*1+2] ; 11E & 13D
|
|
mov ecx,[esi+eax*4] ; 11F
|
|
mov al,[edi+PITCH*1+4] ; 13E
|
|
add ebp,ecx ; 11G
|
|
mov ecx,[edx+ebx*4] ; 13F
|
|
mov cl,[edx+eax*4+2] ; 13H
|
|
mov al,[edi+PITCH*1+5] ; 15A
|
|
add ebp,ecx ; 11I & 13G
|
|
mov bl,[esi+ebx*4+2] ; 11H
|
|
add ebp,ebx ; 11I
|
|
mov esi,BlockN.N8T15+32; 15B
|
|
mov bl,[edi+PITCH*1+7] ; 17A
|
|
mov edx,BlockN.N8T17+32; 17B
|
|
lea esi,[esi+eax*4] ; 15C
|
|
mov al,[edi+PITCH*1+4] ; 15D
|
|
lea edx,[edx+ebx*4] ; 17C
|
|
mov bl,[edi+PITCH*1+6] ; 15E & 17D
|
|
mov ecx,[esi+eax*4] ; 15F
|
|
mov al,[edi+PITCH*1+8] ; 17E
|
|
add ebp,ecx ; 15G
|
|
mov ecx,[edx+ebx*4] ; 17F
|
|
mov cl,[edx+eax*4+2] ; 17H
|
|
mov al,[edi+PITCH*2+0] ; 20A
|
|
add ebp,ecx ; 15I & 17G
|
|
mov bl,[esi+ebx*4+2] ; 15H
|
|
add ebp,ebx ; 15I
|
|
mov esi,BlockN.N8T20+32; 20B
|
|
mov bl,[edi+PITCH*2+2] ; 22A
|
|
mov edx,BlockN.N8T22+32; 22B
|
|
lea esi,[esi+eax*4] ; 20C
|
|
mov al,[edi+PITCH*2-1] ; 20D
|
|
lea edx,[edx+ebx*4] ; 22C
|
|
mov bl,[edi+PITCH*2+1] ; 20E & 22D
|
|
mov ecx,[esi+eax*4] ; 20F
|
|
mov al,[edi+PITCH*2+3] ; 22E
|
|
add ebp,ecx ; 20G
|
|
mov ecx,[edx+ebx*4] ; 22F
|
|
mov cl,[edx+eax*4+2] ; 22H
|
|
mov al,[edi+PITCH*2+4] ; 24A
|
|
add ebp,ecx ; 20I & 22G
|
|
mov bl,[esi+ebx*4+2] ; 20H
|
|
add ebp,ebx ; 20I
|
|
mov esi,BlockN.N8T24+32; 24B
|
|
mov bl,[edi+PITCH*2+6] ; 26A
|
|
mov edx,BlockN.N8T26+32; 26B
|
|
lea esi,[esi+eax*4] ; 24C
|
|
mov al,[edi+PITCH*2+3] ; 24D
|
|
lea edx,[edx+ebx*4] ; 26C
|
|
mov bl,[edi+PITCH*2+5] ; 24E & 26D
|
|
mov ecx,[esi+eax*4] ; 24F
|
|
mov al,[edi+PITCH*2+7] ; 26E
|
|
add ebp,ecx ; 24G
|
|
mov ecx,[edx+ebx*4] ; 26F
|
|
mov cl,[edx+eax*4+2] ; 26H
|
|
mov al,[edi+PITCH*3+1] ; 31A
|
|
add ebp,ecx ; 24I & 26G
|
|
mov bl,[esi+ebx*4+2] ; 24H
|
|
add ebp,ebx ; 24I
|
|
mov esi,BlockN.N8T31+32; 31B
|
|
mov bl,[edi+PITCH*3+3] ; 33A
|
|
mov edx,BlockN.N8T33+32; 33B
|
|
lea esi,[esi+eax*4] ; 31C
|
|
mov al,[edi+PITCH*3+0] ; 31D
|
|
lea edx,[edx+ebx*4] ; 33C
|
|
mov bl,[edi+PITCH*3+2] ; 31E & 33D
|
|
mov ecx,[esi+eax*4] ; 31F
|
|
mov al,[edi+PITCH*3+4] ; 33E
|
|
add ebp,ecx ; 31G
|
|
mov ecx,[edx+ebx*4] ; 33F
|
|
mov cl,[edx+eax*4+2] ; 33H
|
|
mov al,[edi+PITCH*3+5] ; 35A
|
|
add ebp,ecx ; 31I & 33G
|
|
mov bl,[esi+ebx*4+2] ; 31H
|
|
add ebp,ebx ; 31I
|
|
mov esi,BlockN.N8T35+32; 35B
|
|
mov bl,[edi+PITCH*3+7] ; 37A
|
|
mov edx,BlockN.N8T37+32; 37B
|
|
lea esi,[esi+eax*4] ; 35C
|
|
mov al,[edi+PITCH*3+4] ; 35D
|
|
lea edx,[edx+ebx*4] ; 37C
|
|
mov bl,[edi+PITCH*3+6] ; 35E & 37D
|
|
mov ecx,[esi+eax*4] ; 35F
|
|
mov al,[edi+PITCH*3+8] ; 37E
|
|
add ebp,ecx ; 35G
|
|
mov ecx,[edx+ebx*4] ; 37F
|
|
mov cl,[edx+eax*4+2] ; 37H
|
|
mov al,[edi+PITCH*4+0] ; 40A
|
|
add ebp,ecx ; 35I & 37G
|
|
mov bl,[esi+ebx*4+2] ; 35H
|
|
add ebp,ebx ; 35I
|
|
mov esi,BlockN.N8T40+32; 40B
|
|
mov bl,[edi+PITCH*4+2] ; 42A
|
|
mov edx,BlockN.N8T42+32; 42B
|
|
lea esi,[esi+eax*4] ; 40C
|
|
mov al,[edi+PITCH*4-1] ; 40D
|
|
lea edx,[edx+ebx*4] ; 42C
|
|
mov bl,[edi+PITCH*4+1] ; 40E & 42D
|
|
mov ecx,[esi+eax*4] ; 40F
|
|
mov al,[edi+PITCH*4+3] ; 42E
|
|
add ebp,ecx ; 40G
|
|
mov ecx,[edx+ebx*4] ; 42F
|
|
mov cl,[edx+eax*4+2] ; 42H
|
|
mov al,[edi+PITCH*4+4] ; 44A
|
|
add ebp,ecx ; 40I & 42G
|
|
mov bl,[esi+ebx*4+2] ; 40H
|
|
add ebp,ebx ; 40I
|
|
mov esi,BlockN.N8T44+32; 44B
|
|
mov bl,[edi+PITCH*4+6] ; 46A
|
|
mov edx,BlockN.N8T46+32; 46B
|
|
lea esi,[esi+eax*4] ; 44C
|
|
mov al,[edi+PITCH*4+3] ; 44D
|
|
lea edx,[edx+ebx*4] ; 46C
|
|
mov bl,[edi+PITCH*4+5] ; 44E & 46D
|
|
mov ecx,[esi+eax*4] ; 44F
|
|
mov al,[edi+PITCH*4+7] ; 46E
|
|
add ebp,ecx ; 44G
|
|
mov ecx,[edx+ebx*4] ; 46F
|
|
mov cl,[edx+eax*4+2] ; 46H
|
|
mov al,[edi+PITCH*5+1] ; 51A
|
|
add ebp,ecx ; 44I & 46G
|
|
mov bl,[esi+ebx*4+2] ; 44H
|
|
add ebp,ebx ; 44I
|
|
mov esi,BlockN.N8T51+32; 51B
|
|
mov bl,[edi+PITCH*5+3] ; 53A
|
|
mov edx,BlockN.N8T53+32; 53B
|
|
lea esi,[esi+eax*4] ; 51C
|
|
mov al,[edi+PITCH*5+0] ; 51D
|
|
lea edx,[edx+ebx*4] ; 53C
|
|
mov bl,[edi+PITCH*5+2] ; 51E & 53D
|
|
mov ecx,[esi+eax*4] ; 51F
|
|
mov al,[edi+PITCH*5+4] ; 53E
|
|
add ebp,ecx ; 51G
|
|
mov ecx,[edx+ebx*4] ; 53F
|
|
mov cl,[edx+eax*4+2] ; 53H
|
|
mov al,[edi+PITCH*5+5] ; 55A
|
|
add ebp,ecx ; 51I & 53G
|
|
mov bl,[esi+ebx*4+2] ; 51H
|
|
add ebp,ebx ; 51I
|
|
mov esi,BlockN.N8T55+32; 55B
|
|
mov bl,[edi+PITCH*5+7] ; 57A
|
|
mov edx,BlockN.N8T57+32; 57B
|
|
lea esi,[esi+eax*4] ; 55C
|
|
mov al,[edi+PITCH*5+4] ; 55D
|
|
lea edx,[edx+ebx*4] ; 57C
|
|
mov bl,[edi+PITCH*5+6] ; 55E & 57D
|
|
mov ecx,[esi+eax*4] ; 55F
|
|
mov al,[edi+PITCH*5+8] ; 57E
|
|
add ebp,ecx ; 55G
|
|
mov ecx,[edx+ebx*4] ; 57F
|
|
mov cl,[edx+eax*4+2] ; 57H
|
|
mov al,[edi+PITCH*6+0] ; 60A
|
|
add ebp,ecx ; 55I & 57G
|
|
mov bl,[esi+ebx*4+2] ; 55H
|
|
add ebp,ebx ; 55I
|
|
mov esi,BlockN.N8T60+32; 60B
|
|
mov bl,[edi+PITCH*6+2] ; 62A
|
|
mov edx,BlockN.N8T62+32; 62B
|
|
lea esi,[esi+eax*4] ; 60C
|
|
mov al,[edi+PITCH*6-1] ; 60D
|
|
lea edx,[edx+ebx*4] ; 62C
|
|
mov bl,[edi+PITCH*6+1] ; 60E & 62D
|
|
mov ecx,[esi+eax*4] ; 60F
|
|
mov al,[edi+PITCH*6+3] ; 62E
|
|
add ebp,ecx ; 60G
|
|
mov ecx,[edx+ebx*4] ; 62F
|
|
mov cl,[edx+eax*4+2] ; 62H
|
|
mov al,[edi+PITCH*6+4] ; 64A
|
|
add ebp,ecx ; 60I & 62G
|
|
mov bl,[esi+ebx*4+2] ; 60H
|
|
add ebp,ebx ; 60I
|
|
mov esi,BlockN.N8T64+32; 64B
|
|
mov bl,[edi+PITCH*6+6] ; 66A
|
|
mov edx,BlockN.N8T66+32; 66B
|
|
lea esi,[esi+eax*4] ; 64C
|
|
mov al,[edi+PITCH*6+3] ; 64D
|
|
lea edx,[edx+ebx*4] ; 66C
|
|
mov bl,[edi+PITCH*6+5] ; 64E & 66D
|
|
mov ecx,[esi+eax*4] ; 64F
|
|
mov al,[edi+PITCH*6+7] ; 66E
|
|
add ebp,ecx ; 64G
|
|
mov ecx,[edx+ebx*4] ; 66F
|
|
mov cl,[edx+eax*4+2] ; 66H
|
|
mov al,[edi+PITCH*7+1] ; 71A
|
|
add ebp,ecx ; 64I & 66G
|
|
mov bl,[esi+ebx*4+2] ; 64H
|
|
add ebp,ebx ; 64I
|
|
mov esi,BlockN.N8T71+32; 71B
|
|
mov bl,[edi+PITCH*7+3] ; 73A
|
|
mov edx,BlockN.N8T73+32; 73B
|
|
lea esi,[esi+eax*4] ; 71C
|
|
mov al,[edi+PITCH*7+0] ; 71D
|
|
lea edx,[edx+ebx*4] ; 73C
|
|
mov bl,[edi+PITCH*7+2] ; 71E & 73D
|
|
mov ecx,[esi+eax*4] ; 71F
|
|
mov al,[edi+PITCH*7+4] ; 73E
|
|
add ebp,ecx ; 71G
|
|
mov ecx,[edx+ebx*4] ; 73F
|
|
mov cl,[edx+eax*4+2] ; 73H
|
|
mov al,[edi+PITCH*7+5] ; 75A
|
|
add ebp,ecx ; 71I & 73G
|
|
mov bl,[esi+ebx*4+2] ; 71H
|
|
add ebp,ebx ; 71I
|
|
mov esi,BlockN.N8T75+32; 75B
|
|
mov bl,[edi+PITCH*7+7] ; 77A
|
|
mov edx,BlockN.N8T77+32; 77B
|
|
lea esi,[esi+eax*4] ; 75C
|
|
mov al,[edi+PITCH*7+4] ; 75D
|
|
lea edx,[edx+ebx*4] ; 77C
|
|
mov bl,[edi+PITCH*7+6] ; 75E & 77D
|
|
mov ecx,[esi+eax*4] ; 75F
|
|
mov al,[edi+PITCH*7+8] ; 77E
|
|
add ebp,ecx ; 75G
|
|
mov ecx,[edx+ebx*4] ; 77F
|
|
mov cl,[edx+eax*4+2] ; 77H
|
|
add esp,BlockLen
|
|
add ecx,ebp ; 75I & 77G
|
|
mov bl,[esi+ebx*4+2] ; 75H
|
|
add ebx,ecx ; 75I
|
|
mov edi,BlockN.AddrCentralPoint+32 ; Get address of next ref1 block.
|
|
shr ecx,16 ; Extract SWD for ref1.
|
|
and ebx,00000FFFFH ; Extract SWD for ref2.
|
|
mov BlockNM1.Ref1InterSWD+32,ecx ; Store SWD for ref1.
|
|
mov BlockNM1.Ref2InterSWD+32,ebx ; Store SWD for ref2.
|
|
xor ebp,ebp
|
|
mov edx,ebx
|
|
test esp,000000018H
|
|
mov ebx,ebp
|
|
jne SWDHalfPelHorzLoop
|
|
|
|
; Output:
|
|
; ebp, ebx -- Zero
|
|
; ecx -- Ref1 SWD for block 4
|
|
; edx -- Ref2 SWD for block 4
|
|
|
|
add esp,28
|
|
ret
|
|
|
|
|
|
DoSWDHalfPelVertLoop:
|
|
|
|
; ebp -- Initialized to 0, except when can't search off left or right edge.
|
|
; edi -- Ref addr for block 1. Ref1 is .5 pel up. Ref2 is .5 down.
|
|
|
|
xor ecx,ecx
|
|
sub esp,BlockLen*4+28
|
|
xor eax,eax
|
|
xor ebx,ebx
|
|
|
|
SWDHalfPelVertLoop:
|
|
|
|
mov al,[edi]
|
|
mov esi,BlockN.N8T00+32
|
|
mov bl,[edi+2*PITCH]
|
|
mov edx,BlockN.N8T20+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi-1*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+1*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+3*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+4*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T40+32
|
|
mov bl,[edi+6*PITCH]
|
|
mov edx,BlockN.N8T60+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+3*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+5*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+7*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+1+1*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T11+32
|
|
mov bl,[edi+1+3*PITCH]
|
|
mov edx,BlockN.N8T31+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+1+0*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+1+2*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+1+4*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+1+5*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T51+32
|
|
mov bl,[edi+1+7*PITCH]
|
|
mov edx,BlockN.N8T71+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+1+4*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+1+6*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+1+8*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+2+0*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T02+32
|
|
mov bl,[edi+2+2*PITCH]
|
|
mov edx,BlockN.N8T22+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+2-1*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+2+1*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+2+3*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+2+4*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T42+32
|
|
mov bl,[edi+2+6*PITCH]
|
|
mov edx,BlockN.N8T62+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+2+3*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+2+5*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+2+7*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+3+1*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T13+32
|
|
mov bl,[edi+3+3*PITCH]
|
|
mov edx,BlockN.N8T33+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+3+0*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+3+2*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+3+4*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+3+5*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T53+32
|
|
mov bl,[edi+3+7*PITCH]
|
|
mov edx,BlockN.N8T73+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+3+4*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+3+6*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+3+8*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+4+0*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T04+32
|
|
mov bl,[edi+4+2*PITCH]
|
|
mov edx,BlockN.N8T24+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+4-1*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+4+1*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+4+3*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+4+4*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T44+32
|
|
mov bl,[edi+4+6*PITCH]
|
|
mov edx,BlockN.N8T64+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+4+3*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+4+5*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+4+7*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+5+1*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T15+32
|
|
mov bl,[edi+5+3*PITCH]
|
|
mov edx,BlockN.N8T35+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+5+0*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+5+2*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+5+4*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+5+5*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T55+32
|
|
mov bl,[edi+5+7*PITCH]
|
|
mov edx,BlockN.N8T75+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+5+4*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+5+6*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+5+8*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+6+0*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T06+32
|
|
mov bl,[edi+6+2*PITCH]
|
|
mov edx,BlockN.N8T26+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+6-1*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+6+1*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+6+3*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+6+4*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T46+32
|
|
mov bl,[edi+6+6*PITCH]
|
|
mov edx,BlockN.N8T66+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+6+3*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+6+5*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+6+7*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+7+1*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T17+32
|
|
mov bl,[edi+7+3*PITCH]
|
|
mov edx,BlockN.N8T37+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+7+0*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+7+2*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+7+4*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
mov al,[edi+7+5*PITCH]
|
|
add ebp,ecx
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebp,ebx
|
|
mov esi,BlockN.N8T57+32
|
|
mov bl,[edi+7+7*PITCH]
|
|
mov edx,BlockN.N8T77+32
|
|
lea esi,[esi+eax*4]
|
|
mov al,[edi+7+4*PITCH]
|
|
lea edx,[edx+ebx*4]
|
|
mov bl,[edi+7+6*PITCH]
|
|
mov ecx,[esi+eax*4]
|
|
mov al,[edi+7+8*PITCH]
|
|
add ebp,ecx
|
|
mov ecx,[edx+ebx*4]
|
|
mov cl,[edx+eax*4+2]
|
|
add esp,BlockLen
|
|
add ecx,ebp
|
|
mov bl,[esi+ebx*4+2]
|
|
add ebx,ecx
|
|
mov edi,BlockN.AddrCentralPoint+32
|
|
shr ecx,16
|
|
and ebx,00000FFFFH
|
|
mov BlockNM1.Ref1InterSWD+32,ecx
|
|
mov BlockNM1.Ref2InterSWD+32,ebx
|
|
xor ebp,ebp
|
|
mov edx,ebx
|
|
test esp,000000018H
|
|
mov ebx,ebp
|
|
jne SWDHalfPelVertLoop
|
|
|
|
; Output:
|
|
; ebp, ebx -- Zero
|
|
; ecx -- Ref1 SWD for block 4
|
|
; edx -- Ref2 SWD for block 4
|
|
|
|
add esp,28
|
|
ret
|
|
|
|
ENDIF ; H263
|
|
|
|
|
|
; Performance for common macroblocks:
|
|
; 298 clocks: prepare target pels, compute avg target pel, compute 0-MV SWD.
|
|
; 90 clocks: compute IntraSWD.
|
|
; 1412 clocks: 6-level search for best SWD.
|
|
; 16 clocks: record best fit.
|
|
; 945 clocks: calculate spatial loop filtered prediction.
|
|
; 152 clocks: calculate SWD for spatially filtered prediction and classify.
|
|
; ----
|
|
; 2913 clocks total
|
|
;
|
|
; Performance for macroblocks in which 0-motion vector is "good enough":
|
|
; 298 clocks: prepare target pels, compute avg target pel, compute 0-MV SWD.
|
|
; 90 clocks: compute IntraSWD.
|
|
; 16 clocks: record best fit.
|
|
; 58 clocks: extra cache fill burden on adjacent MB if SWD-search not done.
|
|
; 945 clocks: calculate spatial loop filtered prediction.
|
|
; 152 clocks: calculate SWD for spatially filtered prediction and classify.
|
|
; ----
|
|
; 1559 clocks total
|
|
;
|
|
; Performance for macroblocks marked as intrablock by decree of caller:
|
|
; 298 clocks: prepare target pels, compute avg target pel, compute 0-MV SWD.
|
|
; 90 clocks: compute IntraSWD.
|
|
; 58 clocks: extra cache fill burden on adjacent MB if SWD-search not done.
|
|
; 20 clocks: classify (just weight the SWD for # of match points).
|
|
; ----
|
|
; 476 clocks total
|
|
;
|
|
; 160*120 performance, generously estimated (assuming lots of motion):
|
|
;
|
|
; 2913 * 80 = 233000 clocks for luma.
|
|
; 2913 * 12 = 35000 clocks for chroma.
|
|
; 268000 clocks per frame * 15 = 4,020,000 clocks/sec.
|
|
;
|
|
; 160*120 performance, assuming typical motion:
|
|
;
|
|
; 2913 * 40 + 1559 * 40 = 179000 clocks for luma.
|
|
; 2913 * 8 + 1559 * 4 = 30000 clocks for chroma.
|
|
; 209000 clocks per frame * 15 = 3,135,000 clocks/sec.
|
|
;
|
|
; Add 10-20% to allow for initial cache-filling, and unfortunate cases where
|
|
; cache-filling policy preempts areas of the tables that are not locally "hot",
|
|
; instead of preempting macroblocks upon which the processing was just finished.
|
|
|
|
|
|
Done:
|
|
|
|
mov eax,IntraSWDTotal
|
|
mov ebx,IntraSWDBlocks
|
|
mov ecx,InterSWDTotal
|
|
mov edx,InterSWDBlocks
|
|
mov esp,StashESP
|
|
mov edi,[esp+IntraSWDTotal_arg]
|
|
mov [edi],eax
|
|
mov edi,[esp+IntraSWDBlocks_arg]
|
|
mov [edi],ebx
|
|
mov edi,[esp+InterSWDTotal_arg]
|
|
mov [edi],ecx
|
|
mov edi,[esp+InterSWDBlocks_arg]
|
|
mov [edi],edx
|
|
pop ebx
|
|
pop ebp
|
|
pop edi
|
|
pop esi
|
|
rturn
|
|
|
|
|
|
MOTIONESTIMATION endp
|
|
|
|
END
|