/*
 * Automatic language and codepage detector
 * 
 * Bob Powell, 2/97
 * Copyright (C) 1996, 1997, Microsoft Corp.  All rights reserved.
 */

#ifdef  __cplusplus

#include <wtypes.h>
#include <limits.h>

#include "lcdetect.h"
#include "lccommon.h"

#include <qsort.h>


// Turn this on in SOURCES to enable debug output
#ifdef DEBUG_LCDETECT
#include <stdio.h>
extern int g_fDebug;
#define debug(x) { if (g_fDebug) { x; }}
#define unmapch(x) ((x) >= 2 ? (x)+'a'-2 : ' ')
#else
#define debug(x)
#endif

class LCDetect;
typedef LCDetect *PLCDetect;

class Language;
class Language7Bit;
class Language8Bit;
class LanguageUnicode;
typedef Language *PLanguage;
typedef Language7Bit *PLanguage7Bit;
typedef Language8Bit *PLanguage8Bit;
typedef LanguageUnicode *PLanguageUnicode;

class CScore;
class CScores;

/****************************************************************/

#define MAXSCORES 50			// Max possible simultaneous # of scores

#define MINRAWSCORE 100			// Score threshhold (weight * char count) 
								// for further processing

/****************************************************************/

// Histograms

// A histogram stores an array of n-gram occurrence counts.  
// HElt stores the count, at present this is an unsigned char.

// The in-memory structure is similar to the file.
// The histogram array pointers m_panElts point into the mapped file image.

class Histogram {

public:
	Histogram (const PFileHistogramSection pHS, const PHIdx pMap);
	Histogram (const Histogram &H, const PHIdx pMap);
	virtual ~Histogram (void);

	DWORD Validate (DWORD nBytes) const;

	UCHAR Dimensionality (void) { return m_nDimensionality; }
	UCHAR EdgeSize (void) { return m_nEdgeSize; }
	USHORT CodePage (void) { return m_nCodePage; }
	USHORT GetRangeID (void) { return m_nRangeID; }
	USHORT NElts (void) { return m_nElts; }
	PHIdx GetMap (void) { return m_pMap; }

	HElt Ref (USHORT i1) const { return m_panElts[i1]; }
	HElt Ref (UCHAR i1, UCHAR i2) const {
		return m_panElts[(i1 * m_nEdgeSize) + i2]; }
	HElt Ref (UCHAR i1, UCHAR i2, UCHAR i3) const {
		return m_panElts[((i1 * m_nEdgeSize) + i2) * m_nEdgeSize + i3]; }

	HElt *Array (void) { return m_panElts; }

protected:
	UCHAR m_nDimensionality;		// 1=unigram, 2=digram etc.
	UCHAR m_nEdgeSize;				// edge size (is a function of char map)
	union {
		USHORT m_nCodePage;			// For 7 and 8-bit, is code page
		USHORT m_nRangeID;			// For Unicode, is sub-language range ID
	};
	USHORT m_nElts;					// (edge size ^ dimensionality)
	PHIdx m_pMap;					// char/WCHAR to histogram idx mapping

	HElt *m_panElts;				// array of elements / counts
};
typedef Histogram *PHistogram;

/****************************************************************/

// A Language object stores all the detection state for a given language,
// i.e. primary language ID.

class Language {
public:
	// nCodePages is same as nSubLangs
	Language (PLCDetect pL, int nLangID, int nCodePages, int nRangeID = 0);
	virtual ~Language (void) { }

	virtual DWORD AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx) = 0;

	// Score the code pages for this language
	virtual void ScoreCodePage (LPCSTR, int nCh, CScore &S, int &idx) const;

	int LanguageID (void) const { return m_nLangID; }
	int NCodePages (void) const { return m_nCodePages; }
	int NSubLangs (void) const { return m_nSubLangs; }
	int RangeID (void) const { return m_nRangeID; }
	int GetScoreIdx (void) const { return m_nScoreIdx; }
	void SetScoreIdx (int nScoreIdx) { m_nScoreIdx = nScoreIdx; }

	virtual int GetCodePage (int n) const { return 0; }
	virtual int GetSublangRangeID (int n) const { return 0; }
	virtual int GetSublangID (int n) const { return 0; }

	virtual DetectionType Type (void) = 0;
	virtual Language7Bit const * GetLanguage7Bit (void) const { return NULL; }
	virtual Language8Bit const * GetLanguage8Bit (void) const { return NULL; }
	virtual LanguageUnicode const * GetLanguageUnicode (void) const { return NULL; }

protected:
	PLCDetect m_pLC;

	int m_nLangID;		// Win32 primary language ID
	int m_nRangeID;		// Unicode range ID, for Unicode langs
	union {
		int m_nCodePages;	// # of code pages trained for this language
		int m_nSubLangs;
	};
	int m_nScoreIdx;	// Used to create a unique index into the score arrays
						// for each lang + cp combination, to eliminate the
						// need to search the arrays to merge scores.  Add
						// the code page index to this to get the array index.
};

////////////////////////////////////////////////////////////////

class Language7Bit : public Language {
public:
	Language7Bit (PLCDetect pL, int nLangID, int nCodePages);
	~Language7Bit (void);

	DWORD AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx);

	void ScoreCodePage (LPCSTR, int nCh, CScore &S, int &idx) const;

	int GetCodePage (int n) const { return m_ppCodePageHistogram[n]->CodePage();}
	virtual DetectionType Type (void) { return DETECT_7BIT; }

	PHistogram GetLangHistogram (void) const { return m_pLangHistogram; }
	PHistogram GetCodePageHistogram (int i) const { 
		return m_ppCodePageHistogram[i]; }

	virtual Language7Bit const * GetLanguage7Bit (void) const { return this; }

	const PHElt * GetPHEltArray (void) const { return m_paHElt; }

private:
	PHistogram m_pLangHistogram;
	PHistogram m_ppCodePageHistogram[MAXSUBLANG];

	PHElt m_paHElt[MAXSUBLANG];
};

////////////////////////////////////////////////////////////////

class Language8Bit : public Language {
public:
	Language8Bit (PLCDetect pL, int nLangID, int nCodePages);
	~Language8Bit (void);

	DWORD AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx);

	int GetCodePage (int n) const { return m_ppHistogram[n]->CodePage(); }

	virtual DetectionType Type (void) { return DETECT_8BIT; }

	PHistogram GetHistogram (int i) const { return m_ppHistogram[i]; }

	virtual Language8Bit const * GetLanguage8Bit (void) const { return this; }

private:
	PHistogram m_ppHistogram[MAXSUBLANG];
};

////////////////////////////////////////////////////////////////

class LanguageUnicode : public Language {
public:
	LanguageUnicode (PLCDetect pL, int nLangID, int nRecordCount, int nRangeID);
	~LanguageUnicode (void);
	
	DWORD AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx);
	
	void ScoreSublanguages (LPCWSTR wcs, int nch, CScores &S) const;

	int GetSublangRangeID (int i) const{return GetHistogram(i)->GetRangeID();}
	PLanguageUnicode GetSublanguage (int n) const;

	virtual DetectionType Type (void) { return DETECT_UNICODE; }

	PHistogram GetHistogram (int i) const { return m_ppSubLangHistogram[i]; }

	virtual LanguageUnicode const * GetLanguageUnicode (void) const { 
		return this; 
	}

	const PHElt * GetPHEltArray (void) const { return m_paHElt; }

private:
	PHistogram m_ppSubLangHistogram[MAXSUBLANG];

	PHElt m_paHElt[MAXSUBLANG];
};

/****************************************************************/

class Charmap {

public:
	Charmap (PFileMapSection pMS) :	m_nID(pMS->m_dwID),	m_nSize(pMS->m_dwSize),
		m_nUnique(pMS->m_dwNUnique), m_pElts( (PHIdx) (&pMS[1]) ) { }

//	int ID (void) const { return m_nID; }
	int Size (void) const { return m_nSize; }
	int NUnique (void) const { return m_nUnique; }
	PHIdx Map (void) const { return m_pElts; }
	HIdx Map (WCHAR x) const { return m_pElts[x]; }

private:
	int m_nID;			// ID by which hardwired code finds the table
	int	m_nSize;		// size of table (256 or 65536)
	int m_nUnique;		// # of unique output values
	
	PHIdx m_pElts;
};
typedef Charmap *PCharmap;

/****************************************************************/

// class CScore -- score for one lang and/or code page, variously used for
// individual chunks and also for an entire document.

class CScore {
public:
	// Only these two slots need to be initialized
	CScore (void) : m_nScore(0), m_nChars(0) {}
	~CScore (void) { }
	
	const PLanguage GetLang (void) const { return m_pLang; }
	int GetScore (void) const { return m_nScore; }
	unsigned short GetCodePage (void) const { return m_nCodePage; }
	unsigned short GetCharCount (void) const { return m_nChars; }

	void SetLang (PLanguage p) { m_pLang = p; }
	void SetScore (int x) { m_nScore = x; }
	void SetCharCount (unsigned x) { m_nChars = (unsigned short)x; }
	void SetCodePage (unsigned x) { m_nCodePage = (unsigned short)x; }

	void Add (CScore &S) { 
		SetLang(S.GetLang());
		SetCodePage(S.GetCodePage());
		SetScore(GetScore() + S.GetScore());
		SetCharCount(GetCharCount() + S.GetCharCount());
	}
	CScore & operator += (CScore &S) { Add (S); return *this; }

	int operator <= (CScore &S) {
		// Special:  always put 8-bit langs first since the code page
		// matters more for them.
		if (GetLang()->Type() != S.GetLang()->Type())
			return GetLang()->Type() == DETECT_8BIT ? -1 : 1;
		return GetScore() <= S.GetScore();
	}

#ifdef DEBUG_LCDETECT
	void Print(void) {
		printf("Lang=%d CodePage=%d Score=%d NChars=%d\n",
			GetLang() ? GetLang()->LanguageID() : -1, 
			GetCodePage(), GetScore(), GetCharCount());
	}
#endif

private:
	PLanguage m_pLang;
	int m_nScore;
	unsigned short m_nCodePage;
	unsigned short m_nChars;
};
typedef CScore *PScore;

////////////////////////////////////////////////////////////////

// class CScores
//
// For SBCS detection, the index e.g. Ref(i) is the language+codepage index,
// one of a contiguous set of values which identifies each unique supported
// language and codepage combination.
//
// For DBCS detection, the index is just the Unicode language group.

class CScores {
public:
	CScores (int nAlloc, PScore p) : m_nAlloc(nAlloc), m_nUsed(0), m_p(p) { }
	virtual ~CScores (void) { }

	void Reset (void) {
		memset ((void *)m_p, 0, sizeof(CScore) * m_nUsed);
		m_nUsed = 0;
	}

	unsigned int &NElts (void) { return m_nUsed; }
	CScore &Ref (unsigned int n) {
		if (m_nUsed <= n)
			m_nUsed = n + 1; 
		return m_p[n]; 
	}

	void SelectCodePages (void);

	void RemoveZeroScores (void) {
		for (unsigned int i = 0, j = 0; i < m_nUsed; i++)
		{
			if (m_p[i].GetScore() > MINRAWSCORE)
				m_p[j++] = m_p[i];
		}
		m_nUsed = j;
	}

	// Sort by decreasing score.
	// Instantiates template qsort using CScore::operator <=

	void SortByScore (void) {
		RemoveZeroScores ();
		if (m_nUsed)
			QSort (m_p, m_nUsed, FALSE);
	}

	CScore & FindHighScore (void) {
		int highscore = 0;
		for (unsigned int i = 0, highidx = 0; i < m_nUsed; i++) {
			if (m_p[i].GetScore() > highscore)
			{
				highscore = m_p[i].GetScore();
				highidx = i;
			}
		}
		return m_p[highidx];
	}

protected:
	unsigned int m_nAlloc;
	unsigned int m_nUsed;	// high water mark to optimize NElts(), Reset()
	PScore m_p;				// score array, typically per TScores<NNN>
};

template<ULONG Size>class TScores : public CScores {

public:
	TScores (void) : CScores (Size, m_S) { }
	virtual ~TScores (void) { }

private:
	CScore m_S[Size];
};

////////////////////////////////////////////////////////////////

class LCDetect {

public:
	LCDetect (HMODULE hM);
	~LCDetect (void);

	unsigned int GetNCharmaps() const { return m_nCharmaps; }
	unsigned int GetN7BitLanguages() const { return m_n7BitLanguages; }
	unsigned int GetN8BitLanguages() const { return m_n8BitLanguages; }
	unsigned int GetNUnicodeLanguages() const { return m_nUnicodeLanguages; }

	PLanguage7Bit Get7BitLanguage (int i) const { return m_pp7BitLanguages[i]; }
	PLanguage8Bit Get8BitLanguage (int i) const { return m_pp8BitLanguages[i]; }
	PLanguageUnicode GetUnicodeLanguage (int i) const { return m_ppUnicodeLanguages[i]; }

	PHIdx GetMap (int i) const { return m_ppCharmaps[i]->Map(); }

	const LCDConfigure &GetConfig () const { return m_LCDConfigureDefault; }

	DWORD LoadState (void);

	DWORD DetectA (LPCSTR pStr, int nChars, PLCDScore paScores, 
							int *pnScores, PCLCDConfigure pLCDC) const;

	DWORD DetectW (LPCWSTR wcs, int nInputChars, PLCDScore paScores, 
							int *pnScores, PCLCDConfigure pLCDC) const;

private:
	DWORD Initialize7BitLanguage (PFileLanguageSection pLS, PLanguage *ppL);
	DWORD Initialize8BitLanguage (PFileLanguageSection pLS, Language **ppL);
	DWORD InitializeUnicodeLanguage (PFileLanguageSection pLS,Language **ppL);
	DWORD LoadLanguageSection (void *pv, int nSectionSize, PLanguage *ppL);
	DWORD LoadHistogramSection (void *pv, int nSectionSize, Language *pL);
	DWORD LoadMapSection (void *pv, int nSectionSize);
	DWORD BuildState (DWORD nFileSize);

	void Score7Bit (LPCSTR pcszText, int nChars, CScores &S) const;
	void Score8Bit (LPCSTR pcszText, int nChars, CScores &S) const;
	int ScoreCodePage (LPCSTR pStr, int nChars, CScore &S) const;
	int ChooseDetectionType (LPCSTR pcszText, int nChars) const;
	void ScoreLanguageA (LPCSTR pStr, int nChars, CScores &S) const;
	void ScoreLanguageW (LPCWSTR wcs, int nChars, CScores &S, PCLCDConfigure) const;
	void ScoreLanguageAsSBCS (LPCWSTR wcs, int nch, CScores &S) const;
	void ScoreUnicodeSublanguages (PLanguageUnicode pL, LPCWSTR wcs, 
			int nch, CScores &S) const;

private:
	// Language training info virtual-mapped in training file

	unsigned int m_nCharmaps;
	unsigned int m_n7BitLanguages;
	unsigned int m_n8BitLanguages;
	unsigned int m_nUnicodeLanguages;

	PCharmap *m_ppCharmaps;
	PLanguage7Bit *m_pp7BitLanguages;
	PLanguage8Bit *m_pp8BitLanguages;
	PLanguageUnicode *m_ppUnicodeLanguages;

	// Cached information for the optimized scoring inner-loops.

	PHElt m_paHElt7Bit[MAX7BITLANG];
	PHElt m_paHElt8Bit[MAXSCORES];
	int m_nHElt8Bit;

	// Special 7-bit lang histogram for ScoreLanguageAsSBCS()

	PHistogram m_pHU27Bit;

	// Initialization state variables

	unsigned int m_n7BitLangsRead;
	unsigned int m_n8BitLangsRead;
	unsigned int m_nUnicodeLangsRead;
	unsigned int m_nMapsRead;
	int m_nHistogramsRead;
	int m_nScoreIdx;

	// Default configuration to use when NULL parameter passed to detect

	LCDConfigure m_LCDConfigureDefault;

	// File mapping information for the training data file

	HANDLE m_hf;
	HANDLE m_hmap;
	void *m_pv;

	HMODULE m_hModule;
};

////////////////////////////////////////////////////////////////

inline PLanguageUnicode 
LanguageUnicode::GetSublanguage (int n) const 
{ 
	return m_pLC->GetUnicodeLanguage(GetSublangRangeID(n));
}

#endif  // __cplusplus