#include "stdafx.h"
#pragma  hdrstop

/*
 * jccolor.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains input colorspace conversion routines.
 */
#pragma warning( disable : 4799 )
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"

#ifdef NIFTY

#include <math.h>

#define SCALE_PREC      5
#define SCALE_RND       (1 << (SCALE_PREC - 1))
#define SCALE           (1 << SCALE_PREC)
#define unscale(x)      (((long)(x) + SCALE_RND) >> SCALE_PREC)
#define clip(x)         (((long)(x) & ~0xff) ? (((long)(x) < 0) ? 0 : 255) : (long)(x))

#endif


/* Private subobject */

typedef struct {
  struct jpeg_color_converter pub; /* public fields */

  /* Private state for RGB->YCC conversion */
  INT32 * rgb_ycc_tab;		/* => table for RGB to YCbCr conversion */
} my_color_converter;

typedef my_color_converter * my_cconvert_ptr;

extern void MRGB2YCbCr(
int rows,
int cols,
unsigned char *inRGB,
unsigned char *outY,
unsigned char *outU,
unsigned char *outV);

extern void MRGBA2YCbCrA(
int rows,
int cols,
unsigned char *inRGB,
unsigned char *outY,
unsigned char *outU,
unsigned char *outV,
unsigned char *outA);

extern void MRGBA2YCbCrALegacy(
int rows,
int cols,
unsigned char *inRGB,
unsigned char *outY,
unsigned char *outU,
unsigned char *outV,
unsigned char *outA);

// ******************************************************************
// Macros and Constants
#define FCONVERSION_BITS 15
#define ICONVERSION_BITS  8

const __int64  const_0			= 0x0000000000000000;
const __int64  const_1			= 0x0001000100010001;
const __int64  const_128		= 0x0080008000800080;
// These constants correspond to CCIR 601-1
// Y  = [ (9798*R + 19235*G +  3736*B) / 32768]
// Cb = [(-5529*R - 10855*G + 16384*B) / 32768] + 128
// Cr = [(16384*R - 13720*G -  2664*B) / 32768] + 128
//Conventional floating point equations:
// Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
// Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 0.5
// Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 0.5
//Yr = 2646 Yg = 4b23 Yb = 0e98
//Ur = ea67 Ug = d599 Ub = 4000
//Vr = 4000 Vg = ca68 Vb = f598
// constants for RGB->YCrCb
const __int64  const_YR0GR		= 0x264600004B232646;
const __int64  const_YBG0B		= 0x0E984B2300000E98;
const __int64  const_UR0GR		= 0xEA670000D599EA67;
const __int64  const_UBG0B		= 0x4000D59900004000;
const __int64  const_VR0GR		= 0x40000000CA684000;
const __int64  const_VBG0B		= 0xF598CA680000F598;

// constants for RGBA->YCrCbA
const __int64  const2_YGRGR		= 0x4B2326464B232646;
const __int64  const2_Y0B0B		= 0x00000E9800000E98;
const __int64  const2_UGRGR		= 0xD599EA67D599EA67;
const __int64  const2_U0B0B		= 0x0000400000004000;
const __int64  const2_VGRGR		= 0xCA684000CA684000;
const __int64  const2_V0B0B		= 0x0000F5980000F598;
const __int64  const2_A			= 0x0001000000010000;
const __int64  const2_Legacy	= 0x00FFFFFF00FFFFFF;




// These constants correspond to the original FPX SDK
// ... using 2^15
//Y  = [ (9869*R + 19738*G +  3290*B) / 32768]
//Cb = [(-4935*R -  9869*G + 14739*B) / 32768] + 128
//Cr = [(14312*R - 12336*G -  2056*B) / 32768] + 128
//Conventional floating point equations:
// Y  =  0.30118*R + 0.60235*G + 0.10039*B
// Cb = -0.15059*R - 0.30118*G + 0.44981*B + 0.5
// Cr =  0.43676*R - 0.37647*G - 0.06274*G + 0.5
//Yr = 268d Yg = 4d1a Yb = 0cda
//Ur = ecb9 Ug = d973 Ub = 3993
//Vr = 37e8 Vg = cfd0 Vb = f7f8
// constants for RGB->YCrCb
//const __int64  const_YR0GR		= 0x268D00004D1A268D;
//const __int64  const_YBG0B		= 0x0CDA4D1A00000CDA;
//const __int64  const_UR0GR		= 0xECB90000D973ECB9;
//const __int64  const_UBG0B		= 0x3993D97300003993;
//const __int64  const_VR0GR		= 0x37E80000CFD037E8;
//const __int64  const_VBG0B		= 0xF7F8CFD00000F7F8;

// constants for RGBA->YCrCbA
//const __int64  const2_YGRGR		= 0x4D1A268D4D1A268D;
//const __int64  const2_Y0B0B		= 0x00000CDA00000CDA;
//const __int64  const2_UGRGR		= 0xD973ECB9D973ECB9;
//const __int64  const2_U0B0B		= 0x0000399300003993;
//const __int64  const2_VGRGR		= 0xCFD037E8CFD037E8;
//const __int64  const2_V0B0B		= 0x0000F7F80000F7F8;
//const __int64  const2_A			= 0x0001000000010000;
//const __int64  const2_Legacy	= 0x00FFFFFF00FFFFFF;

// ... using 2^8
//const __int64  const_X0YY0		= 0x0000010001000000;
//const __int64  const_RVUVU		= 0x019A0000019A0000;
//const __int64  const_GVUVU		= 0xFF33FFABFF33FFAB;
//const __int64  const_BVUVU		= 0x0000020000000200;
__int64  temp0, tempY, tempU, tempV, tempA;


/**************** RGB -> YCbCr conversion: most common case **************/

/*
 * YCbCr is defined per CCIR 601-1, except that Cb and Cr are
 * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5.
 * The conversion equations to be implemented are therefore
 *	Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
 *	Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + CENTERJSAMPLE
 *	Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + CENTERJSAMPLE
 * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.)
 * Note: older versions of the IJG code used a zero offset of MAXJSAMPLE/2,
 * rather than CENTERJSAMPLE, for Cb and Cr.  This gave equal positive and
 * negative swings for Cb/Cr, but meant that grayscale values (Cb=Cr=0)
 * were not represented exactly.  Now we sacrifice exact representation of
 * maximum red and maximum blue in order to get exact grayscales.
 *
 * To avoid floating-point arithmetic, we represent the fractional constants
 * as integers scaled up by 2^16 (about 4 digits precision); we have to divide
 * the products by 2^16, with appropriate rounding, to get the correct answer.
 *
 * For even more speed, we avoid doing any multiplications in the inner loop
 * by precalculating the constants times R,G,B for all possible values.
 * For 8-bit JSAMPLEs this is very reasonable (only 256 entries per table);
 * for 12-bit samples it is still acceptable.  It's not very reasonable for
 * 16-bit samples, but if you want lossless storage you shouldn't be changing
 * colorspace anyway.
 * The CENTERJSAMPLE offsets and the rounding fudge-factor of 0.5 are included
 * in the tables to save adding them separately in the inner loop.
 */

#define SCALEBITS	16	/* speediest right-shift on some machines */
#define CBCR_OFFSET	((INT32) CENTERJSAMPLE << SCALEBITS)
#define ONE_HALF	((INT32) 1 << (SCALEBITS-1))
#define FIX(x)		((INT32) ((x) * (1L<<SCALEBITS) + 0.5))

/* We allocate one big table and divide it up into eight parts, instead of
 * doing eight alloc_small requests.  This lets us use a single table base
 * address, which can be held in a register in the inner loops on many
 * machines (more than can hold all eight addresses, anyway).
 */

#define R_Y_OFF		0			/* offset to R => Y section */
#define G_Y_OFF		(1*(MAXJSAMPLE+1))	/* offset to G => Y section */
#define B_Y_OFF		(2*(MAXJSAMPLE+1))	/* etc. */
#define R_CB_OFF	(3*(MAXJSAMPLE+1))
#define G_CB_OFF	(4*(MAXJSAMPLE+1))
#define B_CB_OFF	(5*(MAXJSAMPLE+1))
#define R_CR_OFF	B_CB_OFF		/* B=>Cb, R=>Cr are the same */
#define G_CR_OFF	(6*(MAXJSAMPLE+1))
#define B_CR_OFF	(7*(MAXJSAMPLE+1))
#define TABLE_SIZE	(8*(MAXJSAMPLE+1))

#ifdef NIFTY
/*
 * Initialize for RGB->PhotoYCC colorspace conversion.
 */
METHODDEF (void)
rgb_pycc_start (j_compress_ptr cinfo)
{
 
}

/*
 * RGB->PhotoYCC colorspace convertion.
 */
METHODDEF (void)
rgb_pycc_convert (j_compress_ptr cinfo,
                 JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                 JDIMENSION output_row, int num_rows)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
  register JSAMPROW inptr;
  register JSAMPROW outptr0, outptr1, outptr2;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->image_width;
  unsigned char r, g, b;
 
  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    output_row++;
    for (col = 0; col < num_cols; col++) {
      r = GETJSAMPLE(inptr[RGB_RED]);
      g = GETJSAMPLE(inptr[RGB_GREEN]);
      b = GETJSAMPLE(inptr[RGB_BLUE]);
      inptr+=RGB_PIXELSIZE;
 
      /* Y */
      outptr0[col] = (JSAMPLE)((float)((float)r * 0.2200179046) + (float)((float)g * 0.4322754970) + (float)((float)b * 0.0838667868));
      /* C1 */
      outptr1[col] = (JSAMPLE)((float)((float)r * -0.1347546425) - (float)((float)g * 0.2647563169) + (float)((float)b * 0.3995109594) + 156);
      /* C2 */
      outptr2[col] = (JSAMPLE)((float)((float)r * 0.3849177482) - (float)((float)g * 0.3223733380) + (float)((float)b * 0.0625444102) + 137);
    }
  }
}

#endif

/*
 * Initialize for RGB->YCC colorspace conversion.
 */

METHODDEF(void)
rgb_ycc_start (j_compress_ptr cinfo)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  INT32 * rgb_ycc_tab;
  INT32 i;

  /* Allocate and fill in the conversion tables. */
  cconvert->rgb_ycc_tab = rgb_ycc_tab = (INT32 *)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
				(TABLE_SIZE * SIZEOF(INT32)));

  for (i = 0; i <= MAXJSAMPLE; i++) {
    rgb_ycc_tab[i+R_Y_OFF] = FIX(0.29900) * i;
    rgb_ycc_tab[i+G_Y_OFF] = FIX(0.58700) * i;
    rgb_ycc_tab[i+B_Y_OFF] = FIX(0.11400) * i     + ONE_HALF;
    rgb_ycc_tab[i+R_CB_OFF] = (-FIX(0.16874)) * i;
    rgb_ycc_tab[i+G_CB_OFF] = (-FIX(0.33126)) * i;
    /* We use a rounding fudge-factor of 0.5-epsilon for Cb and Cr.
     * This ensures that the maximum output will round to MAXJSAMPLE
     * not MAXJSAMPLE+1, and thus that we don't have to range-limit.
     */
    rgb_ycc_tab[i+B_CB_OFF] = FIX(0.50000) * i    + CBCR_OFFSET + ONE_HALF-1;
/*  B=>Cb and R=>Cr tables are the same
    rgb_ycc_tab[i+R_CR_OFF] = FIX(0.50000) * i    + CBCR_OFFSET + ONE_HALF-1;
*/
    rgb_ycc_tab[i+G_CR_OFF] = (-FIX(0.41869)) * i;
    rgb_ycc_tab[i+B_CR_OFF] = (-FIX(0.08131)) * i;
  }
}


/*
 * Convert some rows of samples to the JPEG colorspace.
 *
 * Note that we change from the application's interleaved-pixel format
 * to our internal noninterleaved, one-plane-per-component format.
 * The input buffer is therefore three times as wide as the output buffer.
 *
 * A starting row offset is provided only for the output buffer.  The caller
 * can easily adjust the passed input_buf value to accommodate any row
 * offset required on that side.
 */


METHODDEF(void)
rgb_ycc_convert (j_compress_ptr cinfo,
		 JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
		 JDIMENSION output_row, int num_rows)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register int r, g, b;
  register INT32 * ctab = cconvert->rgb_ycc_tab;
  register JSAMPROW inptr;
  register JSAMPROW outptr0, outptr1, outptr2;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->image_width;
  JDIMENSION tail_cols = num_cols&7;
  JDIMENSION mmx_cols=num_cols&~7;

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    output_row++;

//
// Need to add #ifdef for Alpha port
//
#if defined (_X86_)
	if (vfMMXMachine)
	{

		MRGB2YCbCr(	(int)(1), mmx_cols, inptr, outptr0, outptr1, outptr2);

		inptr += 3*mmx_cols;
		for (col = mmx_cols; col < num_cols; col++) {
		  r = GETJSAMPLE(inptr[RGB_RED]);
		  g = GETJSAMPLE(inptr[RGB_GREEN]);
		  b = GETJSAMPLE(inptr[RGB_BLUE]);
	      inptr += RGB_PIXELSIZE;
	      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
	       * must be too; we do not need an explicit range-limiting operation.
	       * Hence the value being shifted is never negative, and we don't
	       * need the general RIGHT_SHIFT macro.
	       */
	      /* Y */
	      outptr0[col] = (JSAMPLE)
			((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
			 >> SCALEBITS);
	      /* Cb */
	      outptr1[col] = (JSAMPLE)
			((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
			 >> SCALEBITS);
	      /* Cr */
	      outptr2[col] = (JSAMPLE)
			((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
			 >> SCALEBITS);
		}
	}
	else
#endif
	{
	    for (col = 0; col < num_cols; col++) {
	      r = GETJSAMPLE(inptr[RGB_RED]);
	      g = GETJSAMPLE(inptr[RGB_GREEN]);
	      b = GETJSAMPLE(inptr[RGB_BLUE]);
	      inptr += RGB_PIXELSIZE;
	      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
	       * must be too; we do not need an explicit range-limiting operation.
	       * Hence the value being shifted is never negative, and we don't
	       * need the general RIGHT_SHIFT macro.
	       */
	      /* Y */
	      outptr0[col] = (JSAMPLE)
			((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
			 >> SCALEBITS);
	      /* Cb */
	      outptr1[col] = (JSAMPLE)
			((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
			 >> SCALEBITS);
	      /* Cr */
	      outptr2[col] = (JSAMPLE)
			((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
			 >> SCALEBITS);
	    }
	}
  }
}




/**************** Cases other than RGB -> YCbCr **************/


/*
 * Convert some rows of samples to the JPEG colorspace.
 * This version handles RGB->grayscale conversion, which is the same
 * as the RGB->Y portion of RGB->YCbCr.
 * We assume rgb_ycc_start has been called (we only use the Y tables).
 */

METHODDEF(void)
rgb_gray_convert (j_compress_ptr cinfo,
		  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
		  JDIMENSION output_row, int num_rows)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register int r, g, b;
  register INT32 * ctab = cconvert->rgb_ycc_tab;
  register JSAMPROW inptr;
  register JSAMPROW outptr;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->image_width;

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr = output_buf[0][output_row];
    output_row++;
    for (col = 0; col < num_cols; col++) {
      r = GETJSAMPLE(inptr[RGB_RED]);
      g = GETJSAMPLE(inptr[RGB_GREEN]);
      b = GETJSAMPLE(inptr[RGB_BLUE]);
      inptr += RGB_PIXELSIZE;
      /* Y */
      outptr[col] = (JSAMPLE)
		((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
		 >> SCALEBITS);
    }
  }
}

#ifdef NIFTY

METHODDEF (void)
rgba_ycbcra_convert (j_compress_ptr cinfo,
		   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
		   JDIMENSION output_row, int num_rows)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register int r, g, b;
  register INT32 * ctab = cconvert->rgb_ycc_tab;
  register JSAMPROW inptr;
  register JSAMPROW outptr0, outptr1, outptr2, outptr3;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->image_width;
  JDIMENSION tail_cols = num_cols&7;
  JDIMENSION mmx_cols=num_cols&~7;
 
  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
	outptr2 = output_buf[2][output_row];
    outptr3 = output_buf[3][output_row];
    output_row++;

//
// Need to add #ifdef for Alpha port
//
#if defined (_X86_)
        if (vfMMXMachine)
	{

		MRGBA2YCbCrA(	(int)(1), mmx_cols, inptr, outptr0, outptr1, outptr2, outptr3);

		inptr += 4*mmx_cols;
	    for (col = mmx_cols; col < num_cols; col++) {
	      r = GETJSAMPLE(inptr[0]);
	      g = GETJSAMPLE(inptr[1]);
	      b = GETJSAMPLE(inptr[2]);
	      /* Alpha passes through as-is */
	      outptr3[col] = inptr[3];  /* don't need GETJSAMPLE here */
	      inptr += 4;
	      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
	       * must be too; we do not need an explicit range-limiting operation.
	       * Hence the value being shifted is never negative, and we don't
	       * need the general RIGHT_SHIFT macro.
	       */
	      /* Y */
	      outptr0[col] = (JSAMPLE)
	                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
		             >> SCALEBITS);
	      /* Cb */
	      outptr1[col] = (JSAMPLE)
	                ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
	                 >> SCALEBITS);
	      /* Cr */
	      outptr2[col] = (JSAMPLE)
	                ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
	                 >> SCALEBITS);
	    }
	}
	else
 #endif // defined (_X86_)
        {

	    for (col = 0; col < num_cols; col++) {
	      r = GETJSAMPLE(inptr[0]);
	      g = GETJSAMPLE(inptr[1]);
	      b = GETJSAMPLE(inptr[2]);
	      /* Alpha passes through as-is */
	      outptr3[col] = inptr[3];  /* don't need GETJSAMPLE here */
	      inptr += 4;
	      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
	       * must be too; we do not need an explicit range-limiting operation.
	       * Hence the value being shifted is never negative, and we don't
	       * need the general RIGHT_SHIFT macro.
	       */
	      /* Y */
	      outptr0[col] = (JSAMPLE)
	                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
		             >> SCALEBITS);
	      /* Cb */
	      outptr1[col] = (JSAMPLE)
	                ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
	                 >> SCALEBITS);
	      /* Cr */
	      outptr2[col] = (JSAMPLE)
	                ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
	                 >> SCALEBITS);
	    }
	}

  }
}


METHODDEF (void)
rgba_ycbcralegacy_convert (j_compress_ptr cinfo,
		   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
		   JDIMENSION output_row, int num_rows)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register int r, g, b;
  register INT32 * ctab = cconvert->rgb_ycc_tab;
  register JSAMPROW inptr;
  register JSAMPROW outptr0, outptr1, outptr2, outptr3;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->image_width;
  JDIMENSION tail_cols = num_cols&7;
  JDIMENSION mmx_cols=num_cols&~7;
 
  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    outptr3 = output_buf[3][output_row];
    output_row++;

//
// Need to add #ifdef for Alpha port
//
#if defined (_X86_)
	if (vfMMXMachine)
	{

		MRGBA2YCbCrALegacy(	(int)(1), mmx_cols, inptr, outptr0, outptr1, outptr2, outptr3);

		inptr += 4*mmx_cols;
	    for (col = mmx_cols; col < num_cols; col++) {
	      r = MAXJSAMPLE - GETJSAMPLE(inptr[0]);
	      g = MAXJSAMPLE - GETJSAMPLE(inptr[1]);
	      b = MAXJSAMPLE - GETJSAMPLE(inptr[2]);
	      /* Alpha passes through as-is */
	      outptr3[col] = inptr[3];  /* don't need GETJSAMPLE here */
	      inptr += 4;
	      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
	       * must be too; we do not need an explicit range-limiting operation.
	       * Hence the value being shifted is never negative, and we don't
	       * need the general RIGHT_SHIFT macro.
	       */
	      /* Y */
	      outptr0[col] = (JSAMPLE)
	                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
	                 >> SCALEBITS);
	      /* Cb */
	      outptr1[col] = (JSAMPLE)
	                ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
	                 >> SCALEBITS);
	      /* Cr */
	      outptr2[col] = (JSAMPLE)
	                ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
	                 >> SCALEBITS);
	    }
	}
	else
#endif // defined (_X86_)
        {
	    for (col = 0; col < num_cols; col++) {
	      r = MAXJSAMPLE - GETJSAMPLE(inptr[0]);
	      g = MAXJSAMPLE - GETJSAMPLE(inptr[1]);
	      b = MAXJSAMPLE - GETJSAMPLE(inptr[2]);
	      /* Alpha passes through as-is */
	      outptr3[col] = inptr[3];  /* don't need GETJSAMPLE here */
	      inptr += 4;
	      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
	       * must be too; we do not need an explicit range-limiting operation.
	       * Hence the value being shifted is never negative, and we don't
	       * need the general RIGHT_SHIFT macro.
	       */
	      /* Y */
	      outptr0[col] = (JSAMPLE)
	                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
	                 >> SCALEBITS);
	      /* Cb */
	      outptr1[col] = (JSAMPLE)
	                ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
	                 >> SCALEBITS);
	      /* Cr */
	      outptr2[col] = (JSAMPLE)
	                ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
	                 >> SCALEBITS);
	    }
	}
  }
}



#endif

/*
 * Convert some rows of samples to the JPEG colorspace.
 * This version handles Adobe-style CMYK->YCCK conversion,
 * where we convert R=1-C, G=1-M, and B=1-Y to YCbCr using the same
 * conversion as above, while passing K (black) unchanged.
 * We assume rgb_ycc_start has been called.
 */

METHODDEF(void)
cmyk_ycck_convert (j_compress_ptr cinfo,
		   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
		   JDIMENSION output_row, int num_rows)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register int r, g, b;
  register INT32 * ctab = cconvert->rgb_ycc_tab;
  register JSAMPROW inptr;
  register JSAMPROW outptr0, outptr1, outptr2, outptr3;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->image_width;

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    outptr3 = output_buf[3][output_row];
    output_row++;
    for (col = 0; col < num_cols; col++) {
      r = MAXJSAMPLE - GETJSAMPLE(inptr[0]);
      g = MAXJSAMPLE - GETJSAMPLE(inptr[1]);
      b = MAXJSAMPLE - GETJSAMPLE(inptr[2]);
      /* K passes through as-is */
      outptr3[col] = inptr[3];	/* don't need GETJSAMPLE here */
      inptr += 4;
      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
       * must be too; we do not need an explicit range-limiting operation.
       * Hence the value being shifted is never negative, and we don't
       * need the general RIGHT_SHIFT macro.
       */
      /* Y */
      outptr0[col] = (JSAMPLE)
		((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
		 >> SCALEBITS);
      /* Cb */
      outptr1[col] = (JSAMPLE)
		((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
		 >> SCALEBITS);
      /* Cr */
      outptr2[col] = (JSAMPLE)
		((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
		 >> SCALEBITS);
    }
  }
}


/*
 * Convert some rows of samples to the JPEG colorspace.
 * This version handles grayscale output with no conversion.
 * The source can be either plain grayscale or YCbCr (since Y == gray).
 */

METHODDEF(void)
grayscale_convert (j_compress_ptr cinfo,
		   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
		   JDIMENSION output_row, int num_rows)
{
  register JSAMPROW inptr;
  register JSAMPROW outptr;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->image_width;
  int instride = cinfo->input_components;

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr = output_buf[0][output_row];
    output_row++;
    for (col = 0; col < num_cols; col++) {
      outptr[col] = inptr[0];	/* don't need GETJSAMPLE() here */
      inptr += instride;
    }
  }
}


/*
 * Convert some rows of samples to the JPEG colorspace.
 * This version handles multi-component colorspaces without conversion.
 * We assume input_components == num_components.
 */

METHODDEF(void)
null_convert (j_compress_ptr cinfo,
	      JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
	      JDIMENSION output_row, int num_rows)
{
  register JSAMPROW inptr;
  register JSAMPROW outptr;
  register JDIMENSION col;
  register int ci;
  int nc = cinfo->num_components;
  JDIMENSION num_cols = cinfo->image_width;

  while (--num_rows >= 0) {
    /* It seems fastest to make a separate pass for each component. */
    for (ci = 0; ci < nc; ci++) {
      inptr = *input_buf;
      outptr = output_buf[ci][output_row];
      for (col = 0; col < num_cols; col++) {
	outptr[col] = inptr[ci]; /* don't need GETJSAMPLE() here */
	inptr += nc;
      }
    }
    input_buf++;
    output_row++;
  }
}


/*
 * Empty method for start_pass.
 */

METHODDEF(void)
null_method (j_compress_ptr cinfo)
{
  /* no work needed */
}


/*
 * Module initialization routine for input colorspace conversion.
 */

GLOBAL(void)
jinit_color_converter (j_compress_ptr cinfo)
{
  my_cconvert_ptr cconvert;

  cconvert = (my_cconvert_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
				SIZEOF(my_color_converter));
  cinfo->cconvert = (struct jpeg_color_converter *) cconvert;
  /* set start_pass to null method until we find out differently */
  cconvert->pub.start_pass = null_method;

  /* Make sure input_components agrees with in_color_space */
  switch (cinfo->in_color_space) {
  case JCS_GRAYSCALE:
    if (cinfo->input_components != 1)
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    break;

#ifdef NIFTY
    case JCS_YCC:
    if (cinfo->input_components != 3)
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    break;

    case JCS_RGBA:
    if (cinfo->input_components != 4)
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    break;

    case JCS_YCbCrA:
    if (cinfo->input_components != 4)
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    break;
    
	case JCS_YCbCrALegacy:
    if (cinfo->input_components != 4)
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    break;

    case JCS_YCCA:
    if (cinfo->input_components != 4)
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    break;
#endif

	case JCS_RGB:

#if RGB_PIXELSIZE != 3
    if (cinfo->input_components != RGB_PIXELSIZE)
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    break;
#endif /* else share code with YCbCr */

  case JCS_YCbCr:
    if (cinfo->input_components != 3)
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    break;

  case JCS_CMYK:
  case JCS_YCCK:
    if (cinfo->input_components != 4)
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    break;

  default:			/* JCS_UNKNOWN can be anything */
    if (cinfo->input_components < 1)
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    break;
  }

  /* Check num_components, set conversion method based on requested space */
  switch (cinfo->jpeg_color_space) {
  case JCS_GRAYSCALE:
    if (cinfo->num_components != 1)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    if (cinfo->in_color_space == JCS_GRAYSCALE)
      cconvert->pub.color_convert = grayscale_convert;
    else if (cinfo->in_color_space == JCS_RGB) {
      cconvert->pub.start_pass = rgb_ycc_start;
      cconvert->pub.color_convert = rgb_gray_convert;
    } else if (cinfo->in_color_space == JCS_YCbCr)
      cconvert->pub.color_convert = grayscale_convert;
    else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;
#ifdef NIFTY
  /* Store and compress data as PhotoYCC */
  /* Only current option is to start with PhotoYCC
   * although I do include the function RGB->PhotoYCC
   * in the compressor, I don't think it's a good idea
   * to rotate to PhotoYCC from RGB in this context.
   * If subsampling is required, then just use YCrCb.
   */
  case JCS_YCC:
    if (cinfo->num_components != 3)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    if (cinfo->in_color_space == JCS_YCC)
      cconvert->pub.color_convert = null_convert;
    else
      if (cinfo->in_color_space == JCS_RGB) {
	/* this is where the RGB->PhotoYCC could be called */
        ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);	
      } else {
        ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
      }
    break;

  case JCS_YCCA:
    if (cinfo->num_components != 4)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    if (cinfo->in_color_space == JCS_YCCA)
      cconvert->pub.color_convert = null_convert;
    else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;

  case JCS_RGBA:
    if (cinfo->num_components != 4)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    if (cinfo->in_color_space == JCS_RGBA) {
      cconvert->pub.color_convert = null_convert;
    } else {
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    }
    break;

  case JCS_YCbCrA:
    if (cinfo->num_components != 4)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    if (cinfo->in_color_space == JCS_YCbCrA)
      cconvert->pub.color_convert = null_convert;
    else if (cinfo->in_color_space == JCS_RGBA) {
      cconvert->pub.start_pass = rgb_ycc_start;
      cconvert->pub.color_convert = rgba_ycbcra_convert;
    } else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;

  case JCS_YCbCrALegacy:
    if (cinfo->num_components != 4)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    if (cinfo->in_color_space == JCS_YCbCrALegacy)
      cconvert->pub.color_convert = null_convert;
    else if (cinfo->in_color_space == JCS_RGBA) {
      cconvert->pub.start_pass = rgb_ycc_start;
      cconvert->pub.color_convert = rgba_ycbcralegacy_convert;
    } else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;


#endif
  case JCS_RGB:
    if (cinfo->num_components != 3)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    if (cinfo->in_color_space == JCS_RGB && RGB_PIXELSIZE == 3)
      cconvert->pub.color_convert = null_convert;
    else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;

  case JCS_YCbCr:
    if (cinfo->num_components != 3)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    if (cinfo->in_color_space == JCS_RGB) {
      cconvert->pub.start_pass = rgb_ycc_start;
      cconvert->pub.color_convert = rgb_ycc_convert;
    } else if (cinfo->in_color_space == JCS_YCbCr)
      cconvert->pub.color_convert = null_convert;
    else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;

  case JCS_CMYK:
    if (cinfo->num_components != 4)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    if (cinfo->in_color_space == JCS_CMYK)
      cconvert->pub.color_convert = null_convert;
    else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;

  case JCS_YCCK:
    if (cinfo->num_components != 4)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    if (cinfo->in_color_space == JCS_CMYK) {
      cconvert->pub.start_pass = rgb_ycc_start;
      cconvert->pub.color_convert = cmyk_ycck_convert;
    } else if (cinfo->in_color_space == JCS_YCCK)
      cconvert->pub.color_convert = null_convert;
    else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;

  default:			/* allow null conversion of JCS_UNKNOWN */
    if (cinfo->jpeg_color_space != cinfo->in_color_space ||
	cinfo->num_components != cinfo->input_components)
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    cconvert->pub.color_convert = null_convert;
    break;
  }
}


//
// Need to add #ifdef for Alpha port
//
#if defined (_X86_)

void MRGB2YCbCr(
	int rows,
	int cols,
	unsigned char *inRGB,
	unsigned char *outY,
	unsigned char *outU,
	unsigned char *outV)
{
// make global to ensure proper stack alignment
//	__int64  temp0, tempY, tempU, tempV;

	__asm {

		// initializations
//DS - IJG will always call with rows=1, so don't multiply
//		mov			eax, rows
//		mov			ebx, cols
//		mul			ebx							;number pixels
// reorder to take advantage of v-pipe
		mov			esi, cols
		mov			eax, inRGB

		shr			esi, 3						;number of loops = (rows*cols)/8
		mov			edx, outV

		mov			edi, esi					;loop counter in edi
		mov			ecx, outU

		mov			ebx, outY

		// top of loop

	RGBtoYUV:

		movq		mm1, [eax]					;load #1 G2R2B1G1R1B0G0R0 -> mm1
		pxor		mm6, mm6					;0 -> mm6

		movq		mm0, mm1					;G2R2B1G1R1B0G0R0 -> mm0
		psrlq		mm1, 16						;00G2R2B1G1R1B0 -> mm1

		punpcklbw	mm0, const_0				;R1B0G0R0 -> mm0
		movq		mm7, mm1					;00G2R2B1G1R1B0	-> mm7

		punpcklbw	mm1, const_0				;B1G1R1B0 -> mm1
		movq		mm2, mm0					;R1B0G0R0 -> mm2

		pmaddwd		mm0, const_YR0GR			;yrR1,ygG0+yrR0 -> mm0
		movq		mm3, mm1					;B1G1R1B0 -> mm3

		pmaddwd		mm1, const_YBG0B			;ybB1+ygG1,ybB0 -> mm1
		movq		mm4, mm2					;R1B0G0R0 -> mm4

		pmaddwd		mm2, const_UR0GR			;urR1,ugG0+urR0 -> mm2
		movq		mm5, mm3					;B1G1R1B0 -> mm5

		pmaddwd		mm3, const_UBG0B			;ubB1+ugG1,ubB0 -> mm3
		punpckhbw	mm7, mm6					;00G2R2 -> mm7

		pmaddwd		mm4, const_VR0GR			;vrR1,vgG0+vrR0 -> mm4
		paddd		mm0, mm1					;Y1Y0 -> mm0

		pmaddwd		mm5, const_VBG0B			;vbB1+vgG1,vbB0 -> mm5
//		nop

		movq		mm1, [eax][8]				;load #2 R5B4G4R4B3G3R3B2 -> mm1
		paddd		mm2, mm3					;U1U0 -> mm2

		movq		mm6, mm1					;R5B4G4R4B3G3R3B2 -> mm6
//		nop

		punpcklbw	mm1, const_0				;B3G3R3B2 -> mm1
		paddd		mm4, mm5					;V1V0 -> mm4

		movq		mm5, mm1					;B3G3R3B2 -> mm5
 		psllq		mm1, 32						;R3B200 -> mm1

		paddd		mm1, mm7					;R3B200 + 00G2R2 = R3B2G2R2 -> mm1
//		nop

		punpckhbw	mm6, const_0				;R5B4G4R4 -> mm6
		movq		mm3, mm1					;R3B2G2R2 -> mm3

		pmaddwd		mm1, const_YR0GR			;yrR3,ygG2+yrR2 -> mm1
		movq		mm7, mm5					;B3G3R3B2 -> mm7

		pmaddwd		mm5, const_YBG0B			;ybB3+ygG3,ybB2 -> mm5
		psrad		mm0, FCONVERSION_BITS		;32-bit scaled Y1Y0 -> mm0

		movq		temp0, mm6					;R5B4G4R4 -> temp0
		movq		mm6, mm3					;R3B2G2R2 -> mm6

		pmaddwd		mm6, const_UR0GR			;urR3,ugG2+urR2 -> mm6
		psrad		mm2, FCONVERSION_BITS		;32-bit scaled U1U0 -> mm2

		paddd		mm1, mm5					;Y3Y2 -> mm1
		movq		mm5, mm7					;B3G3R3B2 -> mm5

		pmaddwd		mm7, const_UBG0B			;ubB3+ugG3,ubB2
		psrad		mm1, FCONVERSION_BITS		;32-bit scaled Y3Y2 -> mm1

		pmaddwd		mm3, const_VR0GR			;vrR3,vgG2+vgR2
		packssdw	mm0, mm1					;Y3Y2Y1Y0 -> mm0

		pmaddwd		mm5, const_VBG0B			;vbB3+vgG3,vbB2 -> mm5
		psrad		mm4, FCONVERSION_BITS		;32-bit scaled V1V0 -> mm4	

		movq		mm1, [eax][16]				;load #3 B7G7R7B6G6R6B5G5 -> mm7
		paddd		mm6, mm7					;U3U2 -> mm6

		movq		mm7, mm1					;B7G7R7B6G6R6B5G5 -> mm1
		psrad		mm6, FCONVERSION_BITS		;32-bit scaled U3U2 -> mm6

		paddd		mm3, mm5					;V3V2 -> mm3
		psllq		mm7, 16						;R7B6G6R6B5G500 -> mm7

		movq		mm5, mm7					;R7B6G6R6B5G500 -> mm5
		psrad		mm3, FCONVERSION_BITS		;32-bit scaled V3V2 -> mm3

		movq		tempY, mm0					;32-bit scaled Y3Y2Y1Y0 -> tempY
		packssdw	mm2, mm6					;32-bit scaled U3U2U1U0 -> mm2

		movq		mm0, temp0					;R5B4G4R4 -> mm0
//		nop 

		punpcklbw	mm7, const_0				;B5G500 -> mm7
		movq		mm6, mm0					;R5B4G4R4 -> mm6

		movq		tempU, mm2					;32-bit scaled U3U2U1U0	-> tempU
		psrlq		mm0, 32						;00R5B4 -> mm0

		paddw		mm7, mm0					;B5G5R5B4 -> mm7
		movq		mm2, mm6					;B5B4G4R4 -> mm2

		pmaddwd		mm2, const_YR0GR			;yrR5,ygG4+yrR4 -> mm2
		movq		mm0, mm7					;B5G5R5B4 -> mm0

		pmaddwd		mm7, const_YBG0B			;ybB5+ygG5,ybB4 -> mm7
		packssdw	mm4, mm3					;32-bit scaled V3V2V1V0 -> mm4

		add			eax, 24						;increment RGB count
//		nop										;//JS

		movq		tempV, mm4					;32-bit scaled V3V2V1V0 -> tempV
		movq		mm4, mm6					;B5B4G4R4 -> mm4

		pmaddwd		mm6, const_UR0GR			;urR5,ugG4+urR4
		movq		mm3, mm0					;B5G5R5B4 -> mm0

		pmaddwd		mm0, const_UBG0B			;ubB5+ugG5,ubB4
		paddd		mm2, mm7					;Y5Y4 -> mm2

		pmaddwd		mm4, const_VR0GR			;vrR5,vgG4+vrR4 -> mm4
		pxor		mm7, mm7					;0 -> mm7

		pmaddwd		mm3, const_VBG0B			;vbB5+vgG5,vbB4 -> mm3
		punpckhbw	mm1, mm7					;B7G7R7B6 -> mm1

		paddd		mm0, mm6					;U5U4 -> mm0
		movq		mm6, mm1					;B7G7R7B6 -> mm6

		pmaddwd		mm6, const_YBG0B			;ybB7+ygG7,ybB6 -> mm6
		punpckhbw	mm5, mm7					;R7B6G6R6 -> mm5

		movq		mm7, mm5					;R7B6G6R6 -> mm7
		paddd		mm3, mm4					;V5V4 -> mm3

		pmaddwd		mm5, const_YR0GR			;yrR7,ygG6+yrR6 -> mm5
		movq		mm4, mm1					;B7G7R7B6 -> mm4

		pmaddwd		mm4,const_UBG0B				;ubB7+ugG7,ubB6 -> mm4
		psrad		mm0, FCONVERSION_BITS		;32-bit scaled U5U4 -> mm0

		psrad		mm2, FCONVERSION_BITS		;32-bit scaled Y5Y4 -> mm2	
		nop										;//JS

		paddd		mm6, mm5					;Y7Y6 -> mm6
		movq		mm5, mm7					;R7B6G6R6 -> mm5

		pmaddwd		mm7, const_UR0GR			;urR7,ugG6+ugR6 -> mm7
		psrad		mm3, FCONVERSION_BITS		;32-bit scaled V5V4 -> mm3

		pmaddwd		mm1, const_VBG0B			;vbB7+vgG7,vbB6 -> mm1
		psrad		mm6, FCONVERSION_BITS		;32-bit scaled Y7Y6 -> mm6

		packssdw	mm2, mm6					;Y7Y6Y5Y4 -> mm2
//		nop										;//JS

		pmaddwd		mm5, const_VR0GR			;vrR7,vgG6+vrR6 -> mm5
		paddd		mm7, mm4					;U7U6 -> mm7

		psrad		mm7, FCONVERSION_BITS		;32-bit scaled U7U6 -> mm7
//		nop

		movq		mm6, tempY					;32-bit scaled Y3Y2Y1Y0 -> mm6
		packssdw	mm0, mm7					;32-bit scaled U7U6U5U4 -> mm0

		movq		mm4, tempU					;32-bit scaled U3U2U1U0 -> mm4
		packuswb	mm6, mm2					;all 8 Y values -> mm6

		movq		mm7, const_128				;128,128,128,128 -> mm7
		paddd		mm1, mm5					;V7V6  -> mm1

		paddw		mm0, mm7					;add offset to U7U6U5U4
//		nop

		paddw		mm4, mm7					;add offset to U3U2U1U0
		psrad		mm1, FCONVERSION_BITS		;32-bit scaled V7V6 -> mm1

		movq		[ebx], mm6					;store Y
		packuswb	mm4, mm0					;all 8 U values -> mm4

		movq		mm5, tempV					;32-bit scaled V3V2V1V0 -> mm5
		packssdw	mm3, mm1					;V7V6V5V4 -> mm3

		paddw		mm5, mm7					;add offset to 	V3V2V1V0
		paddw		mm3, mm7					;add offset to 	V7V6V5V4

		movq		[ecx], mm4					;store U
		packuswb	mm5, mm3					;all 8 V values -> mm5

		add			ebx, 8						;increment Y count
		add			ecx, 8						;increment U count

		movq		[edx], mm5					;store V
//		nop

		add			edx, 8						;increment V count
//		nop

		dec			edi							;decrement loop counter
		jnz			RGBtoYUV					;do 24 more bytes if not 0

//JS  The following emms instruction is purposely commented out.
	//emms       // commented out since it is done after the DCT

	} // end of __asm

} // end of MRGB2YCbCr

void MRGBA2YCbCrA(
	int rows,
	int cols,
	unsigned char *inRGBA,
	unsigned char *outY,
	unsigned char *outU,
	unsigned char *outV,
	unsigned char *outA)
{
//	make global to align on stack properly
//	__int64  tempY, tempU, tempV, tempA;


	
// written by Dave Shade - Intel Corp.
// Feb '97
//
// This color space conversion routine converts 
// true color pixels from RGBA to YCbCrA 
// one pass through the loop processes 4 pixels
// there is no provision for cols not an even multiple of 4

	__asm {

		// initializations
//DS - IJG will always call with rows=1, so don't multiply
//		mov			eax, rows
//		mov			ebx, cols
//		mul			ebx					;number pixels
// reorder to take advantage of Pentium v-pipe
		mov			edi, cols
		mov			eax, inRGBA

		shr			edi, 2				;number of loops = (rows*cols)/4
		mov			edx, outV

		mov			ecx, outU
		mov			esi, outA

		mov			ebx, outY

		// top of loop

	RGBAtoYUVA:

		movq		mm3, [eax+8]		;load #1 A1B1G1R1A0B0G0R0 -> mm3
		pxor		mm6, mm6			;0 -> mm6

		movq		mm4, mm3			;A1B1G1R1A0B0G0R0 -> mm4
		psrlq		mm3, 32				;00000000A1B1G1R1 -> mm3

		punpcklwd	mm4, mm3			;A1B1A0B0G1R1G0R0 -> mm4
		add			esi, 4

		movq		mm0, mm4			;A1B1A0B0G1R1G0R0 -> mm0
		punpckhbw	mm4, mm6			;A1B1A0B0 -> mm4

		movq		mm3, mm4			;A1B1A0B0 -> mm3
		punpcklbw	mm0, mm6			;G1R1G0R0 -> mm0

		pmaddwd		mm3, const2_Y0B0B	;ybB1,ybB0 -> mm3
		movq		mm1, mm0			;G1R1G0R0 -> mm1

		pmaddwd		mm0, const2_YGRGR	;yrG1+ygR1,ygG0+yrR0 -> mm0
		movq		mm5, mm4			;A1B1A0B0 -> mm5

		pmaddwd		mm4, const2_U0B0B	;ubB1,ubB0 -> mm4
		movq		mm2, mm1			;G1R1G0R0 -> mm2

		pmaddwd		mm1, const2_UGRGR	;urG1+ugR1,ugG0+urR0 -> mm1
		movq		mm7, mm5			;A1B1A0B0 -> mm7

		pmaddwd		mm5, const2_V0B0B	;vbB1,vbB0 -> mm5
		paddd		mm0, mm3			;Y1Y0 -> mm0

		pmaddwd		mm2, const2_VGRGR	;vgG1+vrR1,vgG0+vrR0 -> mm2
		psrad		mm0, FCONVERSION_BITS	;32 bit scaled Y1Y0

		movq		mm3, [eax]		;*load #2 A3B3G3R3A2B2G2R2 -> mm3
		paddd		mm1, mm4			;U1U0 -> mm2

		pmaddwd		mm7, const2_A		;1*A1,1*A0
		psrad		mm1, FCONVERSION_BITS	;32 bit scaled U1U0

		movq		tempY, mm0			;write out Y1Y0 in 32 bit format
		paddd		mm2, mm5			;V1V0 -> mm2
		
		movq		mm4, mm3			;*A3B3G3R3A2B2G2R2 -> mm4
		psrad		mm2, FCONVERSION_BITS	;32bit scaled V1V0

		movq		tempU, mm1			;write out U1U0 in 32 bit format
		psrlq		mm3, 32				;*00000000A3B3G3R3 -> mm3

		movq		tempV, mm2			;write out V1V0 in 32 bit format
		punpcklwd	mm4, mm3			;*A3B3A2B2G3R3G2R2 -> mm4

		movq		tempA, mm7
		movq		mm0, mm4			;*A3B3A2B2G3R3G2R2 -> mm0

		punpckhbw	mm4, mm6			;*A3B3A2B2 -> mm4
		add			eax, 16

		movq		mm3, mm4			;*A3B3A2B2 -> mm3
		punpcklbw	mm0, mm6			;*G3R3G2R2 -> mm0

		pmaddwd		mm3, const2_Y0B0B	;*ybB3,ybB2 -> mm3
		movq		mm1, mm0			;*G3R3G2R2 -> mm1

		pmaddwd		mm0, const2_YGRGR	;*yrG3+ygR3,ygG2+yrR2 -> mm0
		movq		mm5, mm4			;*A3B3A2B2 -> mm5

		pmaddwd		mm4, const2_U0B0B	;*ubB3,ubB2 -> mm4
		movq		mm2, mm1			;*G3R3G2R2 -> mm2

		pmaddwd		mm1, const2_UGRGR	;*urG3+ugR3,ugG2+urR2 -> mm1
		movq		mm7, mm5			;*A3B3A2B2 -> mm7

		pmaddwd		mm5, const2_V0B0B	;*vbB3,vbB2 -> mm5
		paddd		mm0, mm3			;*Y3Y2 -> mm0

		pmaddwd		mm2, const2_VGRGR	;*vgG3+vrR3,vgG2+vrR2 -> mm2
		psrad		mm0, FCONVERSION_BITS

		pmaddwd		mm7, const2_A		;* 1*A3,1*A2
		paddd		mm1, mm4			;*U3U2 -> mm2

		movq		mm6, const_128
		psrad		mm1, FCONVERSION_BITS

		packssdw	mm0, tempY			;*pack Y3Y2,Y1Y0 -> mm0
		paddd		mm2, mm5			;*V3V2 -> mm2

		psrad		mm2, FCONVERSION_BITS
		add			ebx, 4

		packssdw	mm1, tempU			;*pack U3U2,U1U0 -> mm1

		
		packssdw	mm2, tempV			;*pack V3V2,V1V0 -> mm2
		paddw		mm1, mm6			;add 128

		packssdw	mm7, tempA			;*pack A3A2,A1A0 -> mm7
		paddw		mm2, mm6			;add 128

		packuswb	mm0, mm0
		add			ecx, 4

		packuswb	mm1, mm1
		add			edx, 4

		movd		[ebx-4], mm0
		packuswb	mm2, mm2

		movd		[ecx-4], mm1
		packuswb	mm7, mm7

		movd		[edx-4], mm2

		movd		[esi-4], mm7
		
		dec			edi
		jnz			RGBAtoYUVA


//JS  The following emms instruction is purposely commented out.
	//emms       // commented out since it is done after the DCT


	} // end of __asm

} // end of MRGBA2YCbCrA

void MRGBA2YCbCrALegacy(
	int rows,
	int cols,
	unsigned char *inRGBA,
	unsigned char *outY,
	unsigned char *outU,
	unsigned char *outV,
	unsigned char *outA)
{
// ensure proper stack alignment by making global
//	__int64  tempY, tempU, tempV, tempA;

// written by Dave Shade - Intel Corp.
// Feb '97
//
// This color space conversion routine converts 
// true color pixels from RGBA to YCbCrA 
// This routine subtracts the RGB components from 255 before converting them
// one pass through the loop processes 4 pixels
// there is no provision for cols not an even multiple of 4

	__asm {

		// initializations
//DS - IJG will always call with rows=1, so don't multiply
//		mov			eax, rows
//		mov			ebx, cols
//		mul			ebx					;number pixels
// reorder to take advantage of Pentium v-pipe
		mov			edi, cols
		mov			eax, inRGBA

		shr			edi, 2				;number of loops = (rows*cols)/4
		mov			edx, outV

		mov			ecx, outU
		mov			esi, outA

		mov			ebx, outY

		// top of loop

	RGBAtoYUVALegacy:

		movq		mm3, [eax+8]		;load #1 A1B1G1R1A0B0G0R0 -> mm3
		pxor		mm6, mm6			;0 -> mm6
		
		pxor		mm3, const2_Legacy	; subtract MaxJSample FlashPix rev. 1 "thing"

		movq		mm4, mm3			;A1B1G1R1A0B0G0R0 -> mm4
		psrlq		mm3, 32				;00000000A1B1G1R1 -> mm3

		punpcklwd	mm4, mm3			;A1B1A0B0G1R1G0R0 -> mm4
		add			esi, 4				;opportunistically increment pointer

		movq		mm0, mm4			;A1B1A0B0G1R1G0R0 -> mm0
		punpckhbw	mm4, mm6			;A1B1A0B0 -> mm4

		movq		mm3, mm4			;A1B1A0B0 -> mm3
		punpcklbw	mm0, mm6			;G1R1G0R0 -> mm0

		pmaddwd		mm3, const2_Y0B0B	;ybB1,ybB0 -> mm3
		movq		mm1, mm0			;G1R1G0R0 -> mm1

		pmaddwd		mm0, const2_YGRGR	;yrG1+ygR1,ygG0+yrR0 -> mm0
		movq		mm5, mm4			;A1B1A0B0 -> mm5

		pmaddwd		mm4, const2_U0B0B	;ubB1,ubB0 -> mm4
		movq		mm2, mm1			;G1R1G0R0 -> mm2

		pmaddwd		mm1, const2_UGRGR	;urG1+ugR1,ugG0+urR0 -> mm1
		movq		mm7, mm5			;A1B1A0B0 -> mm7

		pmaddwd		mm5, const2_V0B0B	;vbB1,vbB0 -> mm5
		paddd		mm0, mm3			;Y1Y0 -> mm0

		pmaddwd		mm2, const2_VGRGR	;vgG1+vrR1,vgG0+vrR0 -> mm2
		psrad		mm0, FCONVERSION_BITS	;32 bit scaled Y1Y0

		psrld		mm7, 16				;shift A1A0 down 
		
		movq		mm3, [eax]			;*load #2 A3B3G3R3A2B2G2R2 -> mm3
		paddd		mm1, mm4			;U1U0 -> mm2

		pxor		mm3, const2_Legacy
		psrad		mm1, FCONVERSION_BITS	;32 bit scaled U1U0

		movq		tempY, mm0			;write out Y1Y0 in 32 bit format
		paddd		mm2, mm5			;V1V0 -> mm2
		
		movq		mm4, mm3			;*A3B3G3R3A2B2G2R2 -> mm4
		psrad		mm2, FCONVERSION_BITS	;32bit scaled V1V0

		movq		tempU, mm1			;write out U1U0 in 32 bit format
		psrlq		mm3, 32				;*00000000A3B3G3R3 -> mm3

		movq		tempV, mm2			;write out V1V0 in 32 bit format
		punpcklwd	mm4, mm3			;*A3B3A2B2G3R3G2R2 -> mm4

		movq		tempA, mm7
		movq		mm0, mm4			;*A3B3A2B2G3R3G2R2 -> mm0

		punpckhbw	mm4, mm6			;*A3B3A2B2 -> mm4
		add			eax, 16				;opportunistically increment pointer

		movq		mm3, mm4			;*A3B3A2B2 -> mm3
		punpcklbw	mm0, mm6			;*G3R3G2R2 -> mm0

		pmaddwd		mm3, const2_Y0B0B	;*ybB3,ybB2 -> mm3
		movq		mm1, mm0			;*G3R3G2R2 -> mm1

		pmaddwd		mm0, const2_YGRGR	;*yrG3+ygR3,ygG2+yrR2 -> mm0
		movq		mm5, mm4			;*A3B3A2B2 -> mm5

		pmaddwd		mm4, const2_U0B0B	;*ubB3,ubB2 -> mm4
		movq		mm2, mm1			;*G3R3G2R2 -> mm2

		pmaddwd		mm1, const2_UGRGR	;*urG3+ugR3,ugG2+urR2 -> mm1
		movq		mm7, mm5			;*A3B3A2B2 -> mm7

		pmaddwd		mm5, const2_V0B0B	;*vbB3,vbB2 -> mm5
		paddd		mm0, mm3			;*Y3Y2 -> mm0

		pmaddwd		mm2, const2_VGRGR	;*vgG3+vrR3,vgG2+vrR2 -> mm2
		psrad		mm0, FCONVERSION_BITS	;shift Y3Y2 by 15 bits

		psrld		mm7, 16				;shift the alpha values down
		paddd		mm1, mm4			;*U3U2 -> mm2

		movq		mm6, const_128		; load mm6 with 128
		psrad		mm1, FCONVERSION_BITS	;shift U3U2 by 15 bits

		packssdw	mm0, tempY			;*pack Y3Y2,Y1Y0 -> mm0
		paddd		mm2, mm5			;*V3V2 -> mm2

		packssdw	mm1, tempU			;*pack U3U2,U1U0 -> mm1

		psrad		mm2, FCONVERSION_BITS	;shift V3V2 by 15 bits
		add			ebx, 4				;opportunistically increment pointer

		packssdw	mm2, tempV			;pack V3V2,V1V0 -> mm2
		paddw		mm1, mm6			;add 128

		packssdw	mm7, tempA			;pack A3A2,A1A0 -> mm7
		paddw		mm2, mm6			;add 128

		packuswb	mm0, mm0			;pack Y3Y2Y1Y0 from 16 bit to 8 bit
		add			ecx, 4				;opportunistically increment pointer

		packuswb	mm1, mm1			;pack U3U2U1U0 from 16 bit to 8 bit
		add			edx, 4				;opportunistically increment pointer

		movd		[ebx-4], mm0		;write out Y3Y2Y1Y0
		packuswb	mm2, mm2			;pack V3V2V1V0 from 16 bit to 8 bit

		movd		[ecx-4], mm1		;write out U3U2U1U0
		packuswb	mm7, mm7			;pack A3A2A1A0 from 16 bit to 8 bits

		movd		[edx-4], mm2		;write out V3V2V1V0

		movd		[esi-4], mm7		;write out A3A2A1A0
		
		dec			edi					;subtract 4 from number of pixels
		jnz			RGBAtoYUVALegacy


//JS  The following emms instruction is purposely commented out.
	//emms       // commented out since it is done after the DCT


	} // end of __asm

} // end of MRGBA2YCbCrALegacy

#endif // defined (_X86_)