/******************************************************************************\
*
* $Workfile:   bltio.c  $
*
* Contains the low-level IO blt functions.
*
* Hopefully, if you're basing your display driver on this code, to
* support all of DrvBitBlt and DrvCopyBits, you'll only have to implement
* the following routines.  You shouldn't have to modify much in
* 'bitblt.c'.  I've tried to make these routines as few, modular, simple,
* and efficient as I could, while still accelerating as many calls as
* possible that would be cost-effective in terms of performance wins
* versus size and effort.
*
* Note: In the following, 'relative' coordinates refers to coordinates
*       that haven't yet had the offscreen bitmap (DFB) offset applied.
*       'Absolute' coordinates have had the offset applied.  For example,
*       we may be told to blt to (1, 1) of the bitmap, but the bitmap may
*       be sitting in offscreen memory starting at coordinate (0, 768) --
*       (1, 1) would be the 'relative' start coordinate, and (1, 769)
*       would be the 'absolute' start coordinate'.
*
* Copyright (c) 1992-1995 Microsoft Corporation
* Copyright (c) 1996 Cirrus Logic, Inc.
*
* $Log:   S:/projects/drivers/ntsrc/display/bltio.c_v  $
 * 
 *    Rev 1.2   Nov 07 1996 16:47:52   unknown
 * Clean up CAPS flags
 * 
 *    Rev 1.1   Oct 10 1996 15:36:10   unknown
 *  
* 
*    Rev 1.1   12 Aug 1996 16:49:42   frido
* Removed unaccessed local parameters.
*
* jl01  10-08-96  Do Transparent BLT w/o Solid Fill.  Refer to PDRs#5511/6817.
\******************************************************************************/

#include "precomp.h"

/**************************************************************************
* VOID vIoFastPatRealize
*
* Realizes a pattern into offscreen memory.
*
**************************************************************************/

VOID vIoFastPatRealize(
PDEV*   ppdev,
RBRUSH* prb)                    // Points to brush realization structure
{
    BRUSHENTRY* pbe;
    LONG        iBrushCache;
    BYTE*       pjPattern;
    LONG        cjPattern;
    BYTE*       pjPorts = ppdev->pjPorts;
    LONG        lDelta = ppdev->lDelta;
    LONG        lDeltaPat;
    LONG        xCnt;
    LONG        yCnt;
    ULONG       ulDst;

    DISPDBG((10,"vFastPatRealize called"));

    pbe = prb->pbe;
    if ((pbe == NULL) || (pbe->prbVerify != prb))
    {
        // We have to allocate a new offscreen cache brush entry for
        // the brush:

        iBrushCache = ppdev->iBrushCache;
        pbe         = &ppdev->abe[iBrushCache];

        iBrushCache++;
        if (iBrushCache >= ppdev->cBrushCache)
            iBrushCache = 0;

        ppdev->iBrushCache = iBrushCache;

        // Update our links:

        pbe->prbVerify           = prb;
        prb->pbe = pbe;
    }

    //
    // Download brush into cache
    //

    pjPattern = (PBYTE) &prb->aulPattern[0];        // Copy from brush buffer
    cjPattern = PELS_TO_BYTES(TOTAL_BRUSH_SIZE);

    lDeltaPat = PELS_TO_BYTES(8);
    xCnt = PELS_TO_BYTES(8);
    yCnt = 8;

    ulDst = (pbe->y * ppdev->lDelta) + PELS_TO_BYTES(pbe->x);

    ppdev->pfnBankMap(ppdev, ppdev->lXferBank);

    CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

    CP_IO_DST_Y_OFFSET(ppdev, pjPorts, (lDeltaPat * 2));
    CP_IO_XCNT(ppdev, pjPorts, (xCnt - 1));
    CP_IO_YCNT(ppdev, pjPorts, (yCnt - 1));
    CP_IO_BLT_MODE(ppdev, pjPorts, SRC_CPU_DATA);
    CP_IO_ROP(ppdev, pjPorts, CL_SRC_COPY);
    CP_IO_DST_ADDR_ABS(ppdev, pjPorts, ulDst);

    CP_IO_START_BLT(ppdev, pjPorts);

    vImageTransfer(ppdev, pjPattern, lDeltaPat, xCnt, yCnt);

    //
    // Duplicate brush horizontally
    //

    CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

    CP_IO_XCNT(ppdev, pjPorts, (xCnt - 1));
    CP_IO_YCNT(ppdev, pjPorts, (yCnt - 1));
    CP_IO_BLT_MODE(ppdev, pjPorts, 0);
    CP_IO_SRC_Y_OFFSET(ppdev, pjPorts, (lDeltaPat * 2));
    CP_IO_SRC_ADDR(ppdev, pjPorts, ulDst);
    CP_IO_DST_ADDR_ABS(ppdev, pjPorts, (ulDst + lDeltaPat));

    CP_IO_START_BLT(ppdev, pjPorts);

    //
    // Duplicate brush vertically
    //

    CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

    CP_IO_SRC_Y_OFFSET(ppdev, pjPorts, (xCnt * 2));
    CP_IO_DST_Y_OFFSET(ppdev, pjPorts, (xCnt * 2));
    CP_IO_BLT_MODE(ppdev, pjPorts, 0);
    CP_IO_XCNT(ppdev, pjPorts, ((xCnt * 2) - 1));
    CP_IO_YCNT(ppdev, pjPorts, (yCnt - 1));
    CP_IO_SRC_ADDR(ppdev, pjPorts, ulDst);
    CP_IO_DST_ADDR_ABS(ppdev, pjPorts, (ulDst + PELS_TO_BYTES(128)));
    CP_IO_START_BLT(ppdev, pjPorts);

    #if 0
    {
        ////////////////////////////////////////////////////////////////
        // DEBUG TILED PATTERNS
        //
        // The following code helps to debug patterns if you break the
        // realization code.  It copies the 2x2 tiled copy of the brush
        // to the visible screen.
        //

        POINTL ptl;
        RECTL rcl;

        ptl.x = pbe->x;
        ptl.y = pbe->y;

        rcl.left = 10;
        rcl.right = 10 + 16;
        rcl.top = ppdev->cyScreen - 10 - 16;
        rcl.bottom = ppdev->cyScreen - 10;

        {
            LONG        lDelta = ppdev->lDelta;
            BYTE        jHwRop;
            BYTE        jMode;

            //
            // Make sure we can write to the video registers.
            //

            CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

            CP_IO_ROP(ppdev, pjPorts, CL_SRC_COPY);
            CP_IO_SRC_Y_OFFSET(ppdev, pjPorts, PELS_TO_BYTES(16));
            CP_IO_DST_Y_OFFSET(ppdev, pjPorts, lDelta);

            {
                //
                // Top to Bottom - Left to Right
                //

                jMode |= DIR_TBLR;
                CP_IO_BLT_MODE(ppdev, pjPorts, ppdev->jModeColor);

                {

                    CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

                    CP_IO_XCNT(ppdev, pjPorts, (PELS_TO_BYTES(rcl.right - rcl.left) - 1));
                    CP_IO_YCNT(ppdev, pjPorts, (rcl.bottom - rcl.top - 1));

                    CP_IO_SRC_ADDR(ppdev, pjPorts, (0 + ((ptl.y) * lDelta) + PELS_TO_BYTES(ptl.x)));
                    CP_IO_DST_ADDR_ABS(ppdev, pjPorts, ((rcl.top * lDelta) + PELS_TO_BYTES(rcl.left)));
                    CP_IO_START_BLT(ppdev, pjPorts);
                }
            }
        }
    }
    #endif
}

/**************************************************************************
* VOID vIoFillPat
*
* This routine uses the pattern hardware to draw a patterned list of
* rectangles.
*
**************************************************************************/

VOID vIoFillPat(
PDEV*           ppdev,
LONG            c,          // Can't be zero
RECTL*          prcl,       // Array of relative coordinate destination rects
ROP4            rop4,       // Obvious?
RBRUSH_COLOR    rbc,        // Drawing color is rbc.iSolidColor
POINTL*         pptlBrush)  //
{
    BYTE*       pjPorts = ppdev->pjPorts;
    LONG        lDelta = ppdev->lDelta;
    ULONG       ulAlignedPatternOffset = ppdev->ulAlignedPatternOffset;
    ULONG       ulPatternAddrBase;
    BYTE        jHwRop;
    BYTE        jMode;
    BRUSHENTRY* pbe;        // Pointer to brush entry data, which is used
                            //   for keeping track of the location and status
                            //   of the pattern bits cached in off-screen
                            //   memory

    DISPDBG((10,"vFillPat called"));

    ASSERTDD(c > 0, "Can't handle zero rectangles");
    ASSERTDD(ppdev->cBpp < 3, "vFillPat only works at 8bpp and 16bpp");

    if ((rbc.prb->pbe == NULL) ||
        (rbc.prb->pbe->prbVerify != rbc.prb))
    {
        vIoFastPatRealize(ppdev, rbc.prb);
        DISPDBG((5, " -- Brush cache miss, put it at (%d,%d)", rbc.prb->pbe->x, rbc.prb->pbe->y));
    }
    else
    {
        DISPDBG((5, " -- Brush cache hit on brush at (%d,%d)", rbc.prb->pbe->x, rbc.prb->pbe->y));
    }

    pbe = rbc.prb->pbe;

    //
    // Fill the list of rectangles
    //

    ulPatternAddrBase = pbe->xy;
    jHwRop = gajHwMixFromRop2[(rop4 >> 2) & 0xf];
    jMode = ppdev->jModeColor | ENABLE_8x8_PATTERN_COPY;

    do {
        ULONG offset = 0;

        offset = PELS_TO_BYTES(
            (((prcl->top-pptlBrush->y)&7) << 4)
            +((prcl->left-pptlBrush->x)&7)
        );

        // align the pattern to a new location

        CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

        CP_IO_BLT_MODE(ppdev, pjPorts, 0);
        CP_IO_ROP(ppdev, pjPorts, CL_SRC_COPY);
        CP_IO_SRC_Y_OFFSET(ppdev, pjPorts, PELS_TO_BYTES(16));
        CP_IO_DST_Y_OFFSET(ppdev, pjPorts, PELS_TO_BYTES(8));
        CP_IO_SRC_ADDR(ppdev, pjPorts, (ulPatternAddrBase + offset));
        CP_IO_XCNT(ppdev, pjPorts, (PELS_TO_BYTES(8) - 1));
        CP_IO_YCNT(ppdev, pjPorts, (8 - 1));
        CP_IO_DST_ADDR_ABS(ppdev, pjPorts, ulAlignedPatternOffset);
        CP_IO_START_BLT(ppdev, pjPorts);

        // fill using aligned pattern

        CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

        CP_IO_BLT_MODE(ppdev, pjPorts, jMode);
        CP_IO_ROP(ppdev, pjPorts, jHwRop);
        CP_IO_DST_Y_OFFSET(ppdev, pjPorts, lDelta);
        CP_IO_SRC_ADDR(ppdev, pjPorts, ulAlignedPatternOffset);
        CP_IO_XCNT(ppdev, pjPorts, (PELS_TO_BYTES(prcl->right - prcl->left) - 1));
        CP_IO_YCNT(ppdev, pjPorts, (prcl->bottom - prcl->top - 1));
        CP_IO_DST_ADDR(ppdev, pjPorts, ((prcl->top * lDelta) + PELS_TO_BYTES(prcl->left)));
        CP_IO_START_BLT(ppdev, pjPorts);

        prcl++;

    } while (--c != 0);
}


/**************************************************************************
* VOID vIoFillSolid
*
* Does a solid fill to a list of rectangles.
*
**************************************************************************/

VOID vIoFillSolid(
PDEV*           ppdev,
LONG            c,          // Can't be zero
RECTL*          prcl,       // Array of relative coordinate destination rects
ROP4            rop4,       // Obvious?
RBRUSH_COLOR    rbc,        // Drawing color is rbc.iSolidColor
POINTL*         pptlBrush)  // Not used
{
    BYTE*       pjPorts = ppdev->pjPorts;
    LONG        lDelta = ppdev->lDelta;
    LONG        cBpp = ppdev->cBpp;
    ULONG       ulSolidColor;
    BYTE        jHwRop;

    DISPDBG((10,"vFillSolid called"));

    ASSERTDD(c > 0, "Can't handle zero rectangles");

    ulSolidColor = rbc.iSolidColor;

    if (cBpp == 1)
    {
        ulSolidColor |= ulSolidColor << 8;
        ulSolidColor |= ulSolidColor << 16;
    }
    else if (cBpp == 2)
    {
        ulSolidColor |= ulSolidColor << 16;
    }

    jHwRop = gajHwMixFromRop2[(rop4 >> 2) & 0xf];

    //
    // Make sure we can write to the video registers.
    //

    CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

    CP_IO_ROP(ppdev, pjPorts, jHwRop);
    CP_IO_SRC_ADDR(ppdev, pjPorts, ppdev->ulSolidColorOffset);
    CP_IO_DST_Y_OFFSET(ppdev, pjPorts, lDelta);
    CP_IO_BLT_MODE(ppdev, pjPorts, ENABLE_COLOR_EXPAND |
                                ENABLE_8x8_PATTERN_COPY |
                                ppdev->jModeColor);
    CP_IO_FG_COLOR(ppdev, pjPorts, ulSolidColor);

    //
    // Fill the list of rectangles
    //

    while (TRUE)
    {
        CP_IO_XCNT(ppdev, pjPorts, (PELS_TO_BYTES(prcl->right - prcl->left) - 1));
        CP_IO_YCNT(ppdev, pjPorts, (prcl->bottom - prcl->top - 1));
        CP_IO_DST_ADDR(ppdev, pjPorts, ((prcl->top * lDelta) + PELS_TO_BYTES(prcl->left)));
        CP_IO_START_BLT(ppdev, pjPorts);

        if (--c == 0)
            return;

        prcl++;
        CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);
    }
}


/**************************************************************************
* VOID vIoCopyBlt
*
* Does a screen-to-screen blt of a list of rectangles.
*
**************************************************************************/

VOID vIoCopyBlt(
PDEV*   ppdev,
LONG    c,          // Can't be zero
RECTL*  prcl,       // Array of relative coordinates destination rectangles
ROP4    rop4,       // Obvious?
POINTL* pptlSrc,    // Original unclipped source point
RECTL*  prclDst)    // Original unclipped destination rectangle
{
    LONG        dx;
    LONG        dy;     // Add delta to destination to get source

    LONG        xyOffset = ppdev->xyOffset;
    BYTE*       pjPorts = ppdev->pjPorts;
    LONG        lDelta = ppdev->lDelta;
    BYTE        jHwRop;

    DISPDBG((10,"vCopyBlt called"));

    ASSERTDD(c > 0, "Can't handle zero rectangles");

    //
    // The src-dst delta will be the same for all rectangles
    //

    dx = pptlSrc->x - prclDst->left;
    dy = pptlSrc->y - prclDst->top;

    //
    // Make sure we can write to the video registers.
    //

    CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

    jHwRop = gajHwMixFromRop2[rop4 & 0xf];
    CP_IO_ROP(ppdev, pjPorts, jHwRop);

    CP_IO_SRC_Y_OFFSET(ppdev, pjPorts, lDelta);
    CP_IO_DST_Y_OFFSET(ppdev, pjPorts, lDelta);

    //
    // The accelerator may not be as fast at doing right-to-left copies, so
    // only do them when the rectangles truly overlap:
    //

    if (!OVERLAP(prclDst, pptlSrc) ||
        (prclDst->top < pptlSrc->y) ||
        ((prclDst->top == pptlSrc->y) && (prclDst->left <= pptlSrc->x))
        )
    {
        //
        // Top to Bottom - Left to Right
        //

        DISPDBG((12,"Top to Bottom - Left to Right"));

        CP_IO_BLT_MODE(ppdev, pjPorts, DIR_TBLR);

        while (TRUE)
        {
            CP_IO_XCNT(ppdev, pjPorts, (PELS_TO_BYTES(prcl->right - prcl->left) - 1));
            CP_IO_YCNT(ppdev, pjPorts, (prcl->bottom - prcl->top - 1));

            CP_IO_SRC_ADDR(ppdev, pjPorts, (xyOffset + ((prcl->top + dy) * lDelta) + PELS_TO_BYTES(prcl->left + dx)));
            CP_IO_DST_ADDR(ppdev, pjPorts, ((prcl->top * lDelta) + PELS_TO_BYTES(prcl->left)));
            CP_IO_START_BLT(ppdev, pjPorts);

            if (--c == 0)
                return;

            prcl++;
            CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);
        }
    }
    else
    {
        //
        // Bottom to Top - Right to Left
        //

        DISPDBG((12,"Bottom to Top - Right to Left"));

        CP_IO_BLT_MODE(ppdev, pjPorts, DIR_BTRL);

        while (TRUE)
        {
            CP_IO_XCNT(ppdev, pjPorts, (PELS_TO_BYTES(prcl->right - prcl->left) - 1));
            CP_IO_YCNT(ppdev, pjPorts, (prcl->bottom - prcl->top - 1));

            CP_IO_SRC_ADDR(ppdev, pjPorts, (xyOffset + ((prcl->bottom - 1 + dy) * lDelta) + PELS_TO_BYTES(prcl->right + dx) - 1));
            CP_IO_DST_ADDR(ppdev, pjPorts, (((prcl->bottom - 1) * lDelta) + PELS_TO_BYTES(prcl->right) - 1));
            CP_IO_START_BLT(ppdev, pjPorts);

            if (--c == 0)
                return;

            prcl++;
            CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);
        }
    }
}

/******************************Public*Routine******************************\
* VOID vIoXfer1bpp
*
* Low-level routine used to transfer monochrome data to the screen using
* DWORD writes to the blt engine.
*
* This can handle opaque or transparent expansions.  It does opaque
* expansions by drawing the opaque rectangle first and then transparently
* expands the foreground bits.
*
\**************************************************************************/
VOID vIoXfer1bpp(
PDEV*       ppdev,
LONG        c,          // Count of rectangles, can't be zero
RECTL*      prcl,       // List of destination rectangles, in relative
                        //   coordinates
ROP4        rop4,       // Actually had better be a rop3
SURFOBJ*    psoSrc,     // Source surface
POINTL*     pptlSrc,    // Original unclipped source point
RECTL*      prclDst,    // Original unclipped destination rectangle
XLATEOBJ*   pxlo)       // Translate that provides color-expansion information
{
    ULONG* pulXfer;
    ULONG* pul;
    LONG   ix;
    LONG   iy;
    LONG   cxWidthInBytes;
    BYTE*  pjBits;
    POINTL ptlDst;
    POINTL ptlSrc;
    SIZEL  sizlDst;
    LONG   cxLeftMask;
    LONG   cxRightMask;
    ULONG  ulDstAddr;
    INT    nDwords;
    ULONG  ulLeftMask;
    ULONG  ulRightMask;
    LONG   dx;
    LONG   dy;

    BYTE* pjPorts   = ppdev->pjPorts;
    LONG  lDelta    = ppdev->lDelta;
    LONG  lDeltaSrc = psoSrc->lDelta;
    LONG  cBpp      = ppdev->cBpp;
    ULONG ulFgColor = pxlo->pulXlate[1];
    ULONG ulBgColor = pxlo->pulXlate[0];

    // Since the hardware clipping on some of the Cirrus chips is broken, we
    // do the clipping by rounding out the edges to dword boundaries and then
    // doing the blt transparently.  In the event that we want the expansion
    // to be opaque, we do the opaquing blt in advance.  One side effect of
    // this is that the destination bits are no longer valid for processing
    // the rop.  This could probably be optimized by doing the edges seperately
    // and then doing the middle section in one pass.  However, this is
    // complicated by a 5434 bug that breaks blts less than 10 pixels wide.

    ASSERTDD(c > 0, "Can't handle zero rectangles");
    ASSERTDD(((rop4 & 0xff00) == 0xcc00), "Expected foreground rop of 0xcc");

    //
    // The src-dst delta will be the same for all rectangles
    //

    dx = pptlSrc->x - prclDst->left;
    dy = pptlSrc->y - prclDst->top;

    if (cBpp == 1)
    {
        ulFgColor = (ulFgColor << 8) | (ulFgColor & 0xff);
        ulBgColor = (ulBgColor << 8) | (ulBgColor & 0xff);
        ulFgColor = (ulFgColor << 16) | (ulFgColor & 0xffff);
        ulBgColor = (ulBgColor << 16) | (ulBgColor & 0xffff);
    }
    else if (cBpp == 2)
    {
        ulFgColor = (ulFgColor << 16) | (ulFgColor & 0xffff);
        ulBgColor = (ulBgColor << 16) | (ulBgColor & 0xffff);
    }

    pulXfer = ppdev->pulXfer;
    ppdev->pfnBankMap(ppdev, ppdev->lXferBank);

    CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);
    CP_IO_DST_Y_OFFSET(ppdev, pjPorts, lDelta);

    if (rop4 != 0xCCAA)
    {
        LONG    lCnt = c;
        RECTL*  prclTmp = prcl;
        BYTE    jHwBgRop = gajHwMixFromRop2[rop4 & 0xf];

        CP_IO_ROP(ppdev, pjPorts, jHwBgRop);
        CP_IO_FG_COLOR(ppdev, pjPorts, ulBgColor);
        CP_IO_SRC_ADDR(ppdev, pjPorts, ppdev->ulSolidColorOffset);
        CP_IO_BLT_MODE(ppdev, pjPorts, ppdev->jModeColor |
                                    ENABLE_COLOR_EXPAND |
                                    ENABLE_8x8_PATTERN_COPY);
		
        do
        {
            // calculate the size of the blt

            ptlDst.x = prclTmp->left;
            ptlDst.y = prclTmp->top;
            sizlDst.cx = prclTmp->right - ptlDst.x;
            sizlDst.cy = prclTmp->bottom - ptlDst.y;

            //
            // Fill the background rectangle with the background color
            //

            // Set the dest addresses

            ulDstAddr = (ptlDst.y * lDelta) + PELS_TO_BYTES(ptlDst.x);

            CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

			//
			// Tell the hardware how many bytes we'd like to write:
			// sizlDst.cx * sizelDst.cy
			//
            CP_IO_XCNT(ppdev, pjPorts, PELS_TO_BYTES(sizlDst.cx) - 1);
            CP_IO_YCNT(ppdev, pjPorts, sizlDst.cy - 1);
            CP_IO_DST_ADDR(ppdev, pjPorts, ulDstAddr);

            // Start the blt operation

            CP_IO_START_BLT(ppdev, pjPorts);
            prclTmp++;
        } while (--lCnt != 0);

        CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);
    }

    CP_IO_FG_COLOR(ppdev, pjPorts, ulFgColor);

    CP_IO_BG_COLOR(ppdev, pjPorts, ~ulFgColor);
    CP_IO_XPAR_COLOR(ppdev, pjPorts, ~ulFgColor);
    CP_IO_ROP(ppdev, pjPorts, CL_SRC_COPY);
    CP_IO_BLT_MODE(ppdev, pjPorts, ppdev->jModeColor |
                                ENABLE_COLOR_EXPAND |
                                ENABLE_TRANSPARENCY_COMPARE |
                                SRC_CPU_DATA);
    CP_IO_BLT_EXT_MODE(ppdev, pjPorts, 0);            // jl01

    do
    {
        // calculate the size of the blt

        ptlDst.x = prcl->left;
        ptlDst.y = prcl->top;
        sizlDst.cx = prcl->right - ptlDst.x;
        sizlDst.cy = prcl->bottom - ptlDst.y;

        // calculate the number of dwords per scan line

        ptlSrc.x = prcl->left + dx;
        ptlSrc.y = prcl->top + dy;

        // Floor the source.
        // Extend the width by the amount required to floor to a dword boundary.
        // Set the size of the left mask.
        // Floor the dest, so it aligns with the floored source.

        if ((cxLeftMask = (ptlSrc.x & 31)))
        {
            sizlDst.cx += cxLeftMask;
            ptlSrc.x &= ~31;
            ptlDst.x -= cxLeftMask;
        }

        ulLeftMask = gaulLeftClipMask[cxLeftMask];

        // Ceil the cx to a dword boundary.

        if (cxRightMask = (sizlDst.cx & 31))
        {
            cxRightMask = 32 - cxRightMask;
            sizlDst.cx = (sizlDst.cx + 31) & ~31;
        }

        ulRightMask = gaulRightClipMask[cxRightMask];

        if (sizlDst.cx == 32)
        {
            ulLeftMask &= ulRightMask;
            ulRightMask = 0;
        }

        // Note: At this point sizlDst.cx is the width of the blt in pixels,
        //       floored to a dword boundary, and ceiled to a dword boundary.

        // Calculate the width in Bytes

        cxWidthInBytes  = sizlDst.cx >> 3;

        // Calculate the number of Dwords and any remaining bytes

        nDwords = cxWidthInBytes >> 2;

        ASSERTDD(((cxWidthInBytes & 0x03) == 0),
                 "cxWidthInBytes is not a DWORD multiple");

        // Calculate the address of the source bitmap
        // This is to a byte boundary.

        pjBits  = (PBYTE) psoSrc->pvScan0;
        pjBits += ptlSrc.y * lDeltaSrc;
        pjBits += ptlSrc.x >> 3;

        ASSERTDD((((ULONG_PTR)pjBits & 0x03) == 0),
                 "pjBits not DWORD aligned like it should be");

        //
        // Blt the 1 bpp bitmap
        //

        ulDstAddr = (ptlDst.y * lDelta) + PELS_TO_BYTES(ptlDst.x);

        CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

        CP_IO_XCNT(ppdev, pjPorts, PELS_TO_BYTES(sizlDst.cx) - 1);
        CP_IO_YCNT(ppdev, pjPorts, sizlDst.cy - 1);

        //
        // The 542x chips require a write to the Src Address Register when
        // doing a host transfer with color expansion.  The value is
        // irrelevant, but the write is crucial.  This is documented in
        // the manual, not the errata.  Go figure.
        //

        CP_IO_SRC_ADDR(ppdev, pjPorts, 0);
        CP_IO_DST_ADDR(ppdev, pjPorts, ulDstAddr);

        CP_IO_START_BLT(ppdev, pjPorts);

        //
        // Transfer the host bitmap.
        //
        if (ulRightMask)
        {
            //
            // Blt is > 1 DWORD wide (nDwords > 1)
            //

            for (iy = 0; iy < sizlDst.cy; iy++)
            {
                pul = (ULONG*) pjBits;

                //*pulXfer++ = *(((ULONG*)pul)++) & ulLeftMask;
                WRITE_REGISTER_ULONG(pulXfer, (*((ULONG*)pul) & ulLeftMask));
                pul++;

                for (ix = 0; ix < (nDwords-2); ix++)
                {
                    //*pulXfer++ = *(((ULONG*)pul)++);
                    WRITE_REGISTER_ULONG(pulXfer, (*((ULONG*)pul)));
                    pul++;
                }
                //*pulXfer++ = *(((ULONG*)pul)++) & ulRightMask;
                WRITE_REGISTER_ULONG(pulXfer, (*((ULONG*)pul) & ulRightMask));
                pul++;

                pjBits += lDeltaSrc;
                //pulXfer = ppdev->pulXfer;
                CP_MEMORY_BARRIER();     // Flush memory cache when we reset the address

            }
        }
        else
        {
            //
            // Blt is 1 DWORD wide (nDwords == 1)
            //

            for (iy = 0; iy < sizlDst.cy; iy++)
            {
                //*pulXfer = *((ULONG*)pjBits) & ulLeftMask;
                WRITE_REGISTER_ULONG(pulXfer, (*((ULONG*)pjBits) & ulLeftMask));
                pjBits += lDeltaSrc;
                CP_MEMORY_BARRIER();     // Flush memory cache
            }
        }

        prcl++;
    } while (--c != 0);
}

/******************************Public*Routine******************************\
* VOID vIoXfer4bpp
*
* Does a 4bpp transfer from a bitmap to the screen.
*
* NOTE: The screen must be 8bpp for this function to be called!
*
* The reason we implement this is that a lot of resources are kept as 4bpp,
* and used to initialize DFBs, some of which we of course keep off-screen.
*
\**************************************************************************/

// XLATE_BUFFER_SIZE defines the size of the stack-based buffer we use
// for doing the translate.  Note that in general stack buffers should
// be kept as small as possible.  The OS guarantees us only 8k for stack
// from GDI down to the display driver in low memory situations; if we
// ask for more, we'll access violate.  Note also that at any time the
// stack buffer cannot be larger than a page (4k) -- otherwise we may
// miss touching the 'guard page' and access violate then too.

#define XLATE_BUFFER_SIZE 256

VOID vIoXfer4bpp(
PDEV*       ppdev,
LONG        c,          // Count of rectangles, can't be zero
RECTL*      prcl,       // List of destination rectangles, in relative
                        //   coordinates
ULONG       rop4,       // rop4
SURFOBJ*    psoSrc,     // Source surface
POINTL*     pptlSrc,    // Original unclipped source point
RECTL*      prclDst,    // Original unclipped destination rectangle
XLATEOBJ*   pxlo)       // Translate that provides colour-expansion information
{
    ULONG*          pulXfer = ppdev->pulXfer;
    BYTE*           pjPorts = ppdev->pjPorts;
    LONG            lDelta  = ppdev->lDelta;
    ULONG           ulDstAddr;
    LONG            dx;
    LONG            dy;
    LONG            cx;
    LONG            cy;
    LONG            lSrcDelta;
    BYTE*           pjSrcScan0;
    BYTE*           pjScan;
    BYTE*           pjSrc;
    BYTE*           pjDst;
    LONG            cxThis;
    LONG            cxToGo;
    LONG            xSrc;
    LONG            iLoop;
    BYTE            jSrc;
    ULONG*          pulXlate;
    LONG            cdwThis;
    BYTE*           pjBuf;
    BYTE            ajBuf[XLATE_BUFFER_SIZE];

    ASSERTDD(ppdev->iBitmapFormat == BMF_8BPP, "Screen must be 8bpp");
    ASSERTDD(psoSrc->iBitmapFormat == BMF_4BPP, "Source must be 4bpp");
    ASSERTDD(c > 0, "Can't handle zero rectangles");
    ASSERTDD(((rop4 & 0xff00) >> 8) == (rop4 & 0xff),
             "Expect only a rop2");

    DISPDBG((5, "vXfer4bpp: entry"));

    dx = pptlSrc->x - prclDst->left;
    dy = pptlSrc->y - prclDst->top;     // Add to destination to get source

    lSrcDelta  = psoSrc->lDelta;
    pjSrcScan0 = psoSrc->pvScan0;

    ppdev->pfnBankMap(ppdev, ppdev->lXferBank);

    CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

    CP_IO_DST_Y_OFFSET(ppdev, pjPorts, lDelta);
    CP_IO_ROP(ppdev, pjPorts, gajHwMixFromRop2[rop4 & 0xf]);
    CP_IO_BLT_MODE(ppdev, pjPorts, SRC_CPU_DATA);

    while(TRUE)
    {
        ulDstAddr = (prcl->top * lDelta) + PELS_TO_BYTES(prcl->left);
        cx = prcl->right  - prcl->left;
        cy = prcl->bottom - prcl->top;

        CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

        CP_IO_XCNT(ppdev, pjPorts, PELS_TO_BYTES(cx) - 1);
        CP_IO_YCNT(ppdev, pjPorts, cy - 1);
        CP_IO_DST_ADDR(ppdev, pjPorts, ulDstAddr);

        pulXlate  =  pxlo->pulXlate;
        xSrc      =  prcl->left + dx;
        pjScan    =  pjSrcScan0 + (prcl->top + dy) * lSrcDelta + (xSrc >> 1);

        CP_IO_START_BLT(ppdev, pjPorts);

        do {
            pjSrc  = pjScan;
            cxToGo = cx;            // # of pels per scan in 4bpp source
            do {
                cxThis  = XLATE_BUFFER_SIZE;
                                    // We can handle XLATE_BUFFER_SIZE number
                                    //   of pels in this xlate batch
                cxToGo -= cxThis;   // cxThis will be the actual number of
                                    //   pels we'll do in this xlate batch
                if (cxToGo < 0)
                    cxThis += cxToGo;

                pjDst = ajBuf;      // Points to our temporary batch buffer

                // We handle alignment ourselves because it's easy to
                // do, rather than pay the cost of setting/resetting
                // the scissors register:

                if (xSrc & 1)
                {
                    // When unaligned, we have to be careful not to read
                    // past the end of the 4bpp bitmap (that could
                    // potentially cause us to access violate):

                    iLoop = cxThis >> 1;        // Each loop handles 2 pels;
                                                //   we'll handle odd pel
                                                //   separately
                    jSrc  = *pjSrc;
                    while (iLoop-- != 0)
                    {
                        *pjDst++ = (BYTE) pulXlate[jSrc & 0xf];
                        jSrc = *(++pjSrc);
                        *pjDst++ = (BYTE) pulXlate[jSrc >> 4];
                    }

                    if (cxThis & 1)
                        *pjDst = (BYTE) pulXlate[jSrc & 0xf];
                }
                else
                {
                    iLoop = (cxThis + 1) >> 1;  // Each loop handles 2 pels
                    do {
                        jSrc = *pjSrc++;

                        *pjDst++ = (BYTE) pulXlate[jSrc >> 4];
                        *pjDst++ = (BYTE) pulXlate[jSrc & 0xf];

                    } while (--iLoop != 0);
                }

                // The number of bytes we'll transfer is equal to the number
                // of pels we've processed in the batch.  Since we're
                // transferring words, we have to round up to get the word
                // count:

                cdwThis = (cxThis + 3) >> 2;
                pjBuf  = ajBuf;

                TRANSFER_DWORD_ALIGNED(ppdev, pulXfer, pjBuf, cdwThis);

            } while (cxToGo > 0);

            pjScan += lSrcDelta;        // Advance to next source scan.  Note
                                        //   that we could have computed the
                                        //   value to advance 'pjSrc' directly,
                                        //   but this method is less
                                        //   error-prone.

        } while (--cy != 0);

        if (--c == 0)
            return;

        prcl++;
    }
}

/******************************Public*Routine******************************\
* VOID vIoXferNative
*
* Transfers a bitmap that is the same color depth as the display to
* the screen via the data transfer register, with no translation.
*
\**************************************************************************/

VOID vIoXferNative(
PDEV*       ppdev,
LONG        c,          // Count of rectangles, can't be zero
RECTL*      prcl,       // Array of relative coordinates destination rectangles
ULONG       rop4,       // rop4
SURFOBJ*    psoSrc,     // Source surface
POINTL*     pptlSrc,    // Original unclipped source point
RECTL*      prclDst,    // Original unclipped destination rectangle
XLATEOBJ*   pxlo)       // Not used
{
    ULONG*          pulXfer = ppdev->pulXfer;
    BYTE*           pjPorts = ppdev->pjPorts;
    LONG            lDelta  = ppdev->lDelta;
    ULONG           ulDstAddr;
    LONG            dx;
    LONG            dy;
    LONG            cx;
    LONG            cy;
    LONG            lSrcDelta;
    BYTE*           pjSrcScan0;
    BYTE*           pjSrc;
    LONG            cjSrc;

    ASSERTDD((pxlo == NULL) || (pxlo->flXlate & XO_TRIVIAL),
            "Can handle trivial xlate only");
    ASSERTDD(psoSrc->iBitmapFormat == ppdev->iBitmapFormat,
            "Source must be same color depth as screen");
    ASSERTDD(c > 0, "Can't handle zero rectangles");
    ASSERTDD(((rop4 & 0xff00) >> 8) == (rop4 & 0xff),
             "Expect only a rop2");

    dx = pptlSrc->x - prclDst->left;
    dy = pptlSrc->y - prclDst->top;     // Add to destination to get source

    lSrcDelta  = psoSrc->lDelta;
    pjSrcScan0 = psoSrc->pvScan0;

    ppdev->pfnBankMap(ppdev, ppdev->lXferBank);

    CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

    CP_IO_DST_Y_OFFSET(ppdev, pjPorts, lDelta);
    CP_IO_ROP(ppdev, pjPorts, gajHwMixFromRop2[rop4 & 0xf]);
    CP_IO_BLT_MODE(ppdev, pjPorts, SRC_CPU_DATA);

    while(TRUE)
    {
        ulDstAddr = (prcl->top * lDelta) + PELS_TO_BYTES(prcl->left);
        cx = prcl->right  - prcl->left;
        cy = prcl->bottom - prcl->top;

        CP_IO_WAIT_FOR_BLT_COMPLETE(ppdev, pjPorts);

        CP_IO_XCNT(ppdev, pjPorts, PELS_TO_BYTES(cx) - 1);
        CP_IO_YCNT(ppdev, pjPorts, cy - 1);
        CP_IO_DST_ADDR(ppdev, pjPorts, ulDstAddr);

        cjSrc = PELS_TO_BYTES(cx);
        pjSrc = pjSrcScan0 + (prcl->top  + dy) * lSrcDelta
                           + (PELS_TO_BYTES(prcl->left + dx));

        CP_IO_START_BLT(ppdev, pjPorts);
        vImageTransfer(ppdev, pjSrc, lSrcDelta, cjSrc, cy);

        if (--c == 0)
            return;

        prcl++;
    }
}