1383 lines
32 KiB
C++
1383 lines
32 KiB
C++
// Analyze.cpp
|
||
//
|
||
// main CHART PARSING routines
|
||
//
|
||
// Copyright 2000 Microsoft Corp.
|
||
//
|
||
// Modification History:
|
||
// 31 MAR 2000 bhshin created
|
||
|
||
#include "StdAfx.h"
|
||
#include "KorWbrk.h"
|
||
#include "Record.h"
|
||
#include "Analyze.h"
|
||
#include "Lookup.h"
|
||
#include "Morpho.h"
|
||
#include "unikor.h"
|
||
#include "GuessIndex.h"
|
||
#include "WbData.h"
|
||
#include "Token.h"
|
||
|
||
//////////////////////////////////////////////////////////////////////////////
|
||
// Definitions
|
||
|
||
// threshold for making index terms
|
||
const int THRESHOLD_MAKE_INDEX = 3;
|
||
const int LENGTH_MAKE_INDEX = 4;
|
||
|
||
//////////////////////////////////////////////////////////////////////////////
|
||
// Function Declarations
|
||
|
||
BOOL PreFiltering(const WCHAR *pwzToken, int cchInput, WCHAR wchLast, CIndexInfo *pIndexInfo);
|
||
BOOL PreProcessingLeafNode(PARSE_INFO *pPI, CLeafChartPool *pLeafChartPool);
|
||
|
||
BOOL MakeCombinedRecord(PARSE_INFO *pPI, int nLeftRec, int nRightRec, float fWeight);
|
||
|
||
BOOL MakeIndexTerms(PARSE_INFO *pPI, CEndChartPool *pEndChartPool,
|
||
CIndexInfo *pIndexInfo, BOOL *pfNeedGuessing);
|
||
|
||
BOOL MakeQueryTerms(PARSE_INFO *pPI, CEndChartPool *pEndChartPool,
|
||
CIndexInfo *pIndexInfo, BOOL *pfNeedGuessing);
|
||
|
||
BOOL TraverseIndexString(PARSE_INFO *pPI, BOOL fOnlySuffix, WORD_REC *pWordRec, CIndexInfo *pIndexInfo);
|
||
|
||
BOOL TraverseQueryString(PARSE_INFO *pPI, WORD_REC *pWordRec, WCHAR *pwzSeqTerm, int cchSeqTerm);
|
||
|
||
|
||
//////////////////////////////////////////////////////////////////////////////
|
||
// Function Implementation
|
||
|
||
// AnalyzeString
|
||
//
|
||
// lookup & process CHART PARSING (index time)
|
||
//
|
||
// Parameters:
|
||
// pPI -> (PARSE_INFO*) ptr to parse-info struct
|
||
// fQuery -> (BOOL) query flag
|
||
// pwzInput -> (const WCHAR*) input string to analyze (NOT decomposed)
|
||
// cchInput -> (int) length of input string to analyze
|
||
// cwcSrcPos -> (int) original source start position
|
||
// pIndexList -> (CIndexList *) output index list
|
||
// wchLast -> (WCHAR) last character of previous token
|
||
//
|
||
// Result:
|
||
// (BOOL) TRUE if succeed, otherwise return FALSE
|
||
//
|
||
// 12APR00 bhshin added PreFiltering
|
||
// 30MAR00 bhshin began
|
||
BOOL AnalyzeString(PARSE_INFO *pPI,
|
||
BOOL fQuery,
|
||
const WCHAR *pwzInput,
|
||
int cchInput,
|
||
int cwcSrcPos,
|
||
CIndexInfo *pIndexInfo,
|
||
WCHAR wchLast)
|
||
{
|
||
CLeafChartPool LeafChartPool;
|
||
CEndChartPool EndChartPool;
|
||
BOOL fNeedGuessing;
|
||
WCHAR wchStart, wchEnd;
|
||
|
||
if (cchInput > MAX_INPUT_TOKEN)
|
||
return TRUE;
|
||
|
||
InitAnalyze(pPI);
|
||
|
||
// copy input string to process
|
||
pPI->pwzInputString = new WCHAR[cchInput+1];
|
||
if (pPI->pwzInputString == NULL)
|
||
goto ErrorReturn;
|
||
|
||
wcsncpy(pPI->pwzInputString, pwzInput, cchInput);
|
||
pPI->pwzInputString[cchInput] = L'\0';
|
||
|
||
// check string inside group
|
||
if (cwcSrcPos > 0)
|
||
{
|
||
wchStart = *(pwzInput - 1);
|
||
wchEnd = *(pwzInput + cchInput);
|
||
|
||
// check inside group string
|
||
if (fIsGroupStart(wchStart) && fIsGroupEnd(wchEnd))
|
||
{
|
||
// add index and keep going
|
||
pIndexInfo->AddIndex(pPI->pwzInputString, cchInput, WEIGHT_HARD_MATCH, 0, cchInput-1);
|
||
WB_LOG_ADD_INDEX(pPI->pwzInputString, cchInput, INDEX_INSIDE_GROUP);
|
||
}
|
||
}
|
||
|
||
// check pre-filtering
|
||
if (PreFiltering(pPI->pwzInputString, cchInput, wchLast, pIndexInfo))
|
||
{
|
||
// stop processing
|
||
UninitAnalyze(pPI);
|
||
return TRUE;
|
||
}
|
||
|
||
// normalize string
|
||
pPI->pwzSourceString = new WCHAR[cchInput*3+1];
|
||
if (pPI->pwzSourceString == NULL)
|
||
goto ErrorReturn;
|
||
|
||
pPI->rgCharInfo = new CHAR_INFO_REC[cchInput*3+1];
|
||
if (pPI->rgCharInfo == NULL)
|
||
goto ErrorReturn;
|
||
|
||
decompose_jamo(pPI->pwzSourceString, pPI->pwzInputString, pPI->rgCharInfo, cchInput*3+1);
|
||
|
||
pPI->nLen = wcslen(pPI->pwzSourceString);
|
||
pPI->nMaxLT = pPI->nLen-1;
|
||
|
||
// person's name guessing
|
||
GuessPersonName(pPI, pIndexInfo);
|
||
|
||
// index time lookup (lookup all pos)
|
||
if (!DictionaryLookup(pPI, pwzInput, cchInput, FALSE))
|
||
goto ErrorReturn;
|
||
|
||
if (!IntializeLeafChartPool(pPI, &LeafChartPool))
|
||
goto ErrorReturn;
|
||
|
||
if (!PreProcessingLeafNode(pPI, &LeafChartPool))
|
||
goto ErrorReturn;
|
||
|
||
if (!ChartParsing(pPI, &LeafChartPool, &EndChartPool))
|
||
goto ErrorReturn;
|
||
|
||
if (fQuery)
|
||
{
|
||
if (!MakeQueryTerms(pPI, &EndChartPool, pIndexInfo, &fNeedGuessing))
|
||
goto ErrorReturn;
|
||
}
|
||
else
|
||
{
|
||
if (!MakeIndexTerms(pPI, &EndChartPool, pIndexInfo, &fNeedGuessing))
|
||
goto ErrorReturn;
|
||
}
|
||
|
||
// if no all cover record, then guess index term
|
||
if (fNeedGuessing)
|
||
{
|
||
GuessIndexTerms(pPI, &LeafChartPool, pIndexInfo);
|
||
}
|
||
else
|
||
{
|
||
// all cover but no index term (verb/adj/Ix) -> add itself
|
||
if (pIndexInfo->IsEmpty())
|
||
{
|
||
WB_LOG_ROOT_INDEX(L"", TRUE);
|
||
|
||
pIndexInfo->AddIndex(pwzInput, cchInput, WEIGHT_HARD_MATCH, 0, cchInput-1);
|
||
WB_LOG_ADD_INDEX(pwzInput, cchInput, INDEX_PARSE);
|
||
}
|
||
}
|
||
|
||
UninitAnalyze(pPI);
|
||
|
||
return TRUE;
|
||
|
||
ErrorReturn:
|
||
UninitAnalyze(pPI);
|
||
|
||
return FALSE;
|
||
}
|
||
|
||
// InitAnalyze
|
||
//
|
||
// init the parse state struct required for parsing
|
||
//
|
||
// Parameters:
|
||
// pPI -> (PARSE_INFO*) ptr to parse-info struct
|
||
// <- (PARSE_INFO*) initialized parse-info struct
|
||
//
|
||
// Result:
|
||
// (void)
|
||
//
|
||
// 20MAR00 bhshin began
|
||
void InitAnalyze(PARSE_INFO *pPI)
|
||
{
|
||
pPI->pwzInputString = NULL;
|
||
pPI->pwzSourceString = NULL;
|
||
|
||
pPI->rgCharInfo = NULL;
|
||
|
||
pPI->nMaxLT = 0;
|
||
|
||
InitRecords(pPI);
|
||
}
|
||
|
||
// UninitAnalyze
|
||
//
|
||
// clean up the parse state struct
|
||
//
|
||
// Parameters:
|
||
// pPI -> (PARSE_INFO*) ptr to parse-info struct
|
||
//
|
||
// Result:
|
||
// (void)
|
||
//
|
||
// 20MAR00 bhshin began
|
||
void UninitAnalyze(PARSE_INFO *pPI)
|
||
{
|
||
UninitRecords(pPI);
|
||
|
||
if (pPI->pwzInputString != NULL)
|
||
{
|
||
delete [] pPI->pwzInputString;
|
||
}
|
||
|
||
if (pPI->pwzSourceString != NULL)
|
||
{
|
||
delete [] pPI->pwzSourceString;
|
||
}
|
||
|
||
if (pPI->rgCharInfo != NULL)
|
||
{
|
||
delete [] pPI->rgCharInfo;
|
||
}
|
||
}
|
||
|
||
// PreFiltering
|
||
//
|
||
// check filtered token with automata
|
||
//
|
||
// Parameters:
|
||
// pwzToken -> (const WCHAR*) current token string (NULL terminated)
|
||
// cchInput -> (int) length of input string to analyze
|
||
// wchLast -> (WCHAR) last character of previous token
|
||
// pIndexInfo -> (CIndexInfo *) output index list
|
||
//
|
||
// Result:
|
||
// (BOOL) TRUE if it's filtered, otherwise return FALSE
|
||
//
|
||
// 20APR00 bhshin added single length processing
|
||
// 14APR00 bhshin began
|
||
BOOL PreFiltering(const WCHAR *pwzToken, int cchInput, WCHAR wchLast, CIndexInfo *pIndexInfo)
|
||
{
|
||
WCHAR wzInput[MAX_INDEX_STRING+2];
|
||
WCHAR *pwzInput;
|
||
WCHAR wchPrev, wchCurr;
|
||
BOOL fStop, fResult;
|
||
|
||
// single length processing
|
||
if (cchInput == 1)
|
||
{
|
||
pIndexInfo->AddIndex(pwzToken, cchInput, WEIGHT_HARD_MATCH, 0, cchInput-1);
|
||
WB_LOG_ADD_INDEX(pwzToken, cchInput, INDEX_PREFILTER);
|
||
|
||
return TRUE;
|
||
}
|
||
|
||
if (wchLast == L'\0')
|
||
return FALSE;
|
||
|
||
// make string to check automata
|
||
wzInput[0] = wchLast;
|
||
wcscpy(wzInput+1, pwzToken);
|
||
|
||
// automata
|
||
pwzInput = wzInput;
|
||
|
||
fResult = FALSE;
|
||
fStop = FALSE;
|
||
wchPrev = L'\0';
|
||
|
||
// <...<2E><>(<28><>)> <<3C><><EFBFBD><EFBFBD>, <20><><EFBFBD><EFBFBD>, <20><><EFBFBD>ؼ<EFBFBD>, <20><><EFBFBD>Ͽ<EFBFBD>>
|
||
// <...<2E><>> <<3C><><EFBFBD><EFBFBD>, <20><><EFBFBD><EFBFBD>, <20><><EFBFBD>ؼ<EFBFBD>, <20><><EFBFBD>Ͽ<EFBFBD>>
|
||
// <...<2E><>> <<3C><><EFBFBD><EFBFBD>, <20><><EFBFBD><EFBFBD>, <20><><EFBFBD>ؼ<EFBFBD>, <20><><EFBFBD>Ͽ<EFBFBD>>
|
||
// <...<2E><>> <<3C><>, <20><><EFBFBD><EFBFBD>>
|
||
while (*pwzInput != L'\0')
|
||
{
|
||
wchCurr = *pwzInput;
|
||
|
||
switch (wchPrev)
|
||
{
|
||
case 0x0000: // NULL
|
||
// wchCurr != (<28><> <20><> <20><> <20><>)
|
||
if (wchCurr != 0xC744 && wchCurr != 0xB97C && wchCurr != 0xC5D0 && wchCurr != 0xB85C)
|
||
{
|
||
WCHAR wzLast[2];
|
||
WCHAR wzDecomp[4];
|
||
int cchDecomp;
|
||
CHAR_INFO_REC rgCharInfo[4];
|
||
|
||
wzLast[0] = wchCurr;
|
||
wzLast[1] = L'\0';
|
||
|
||
decompose_jamo(wzDecomp, wzLast, rgCharInfo, 4);
|
||
cchDecomp = wcslen(wzDecomp);
|
||
|
||
if (cchDecomp == 0)
|
||
break;
|
||
|
||
wchCurr = wzDecomp[cchDecomp-1];
|
||
|
||
// check jong seong <20><>
|
||
if (wchCurr != 0x11AF)
|
||
fStop = TRUE;
|
||
}
|
||
break;
|
||
case 0xC744: // <20><>
|
||
case 0xB97C: // <20><>
|
||
if (wchCurr != 0xC704) // <20><>
|
||
fStop = TRUE;
|
||
break;
|
||
case 0xC5D0: // <20><>
|
||
if (wchCurr != 0xB300) // <20><>
|
||
fStop = TRUE;
|
||
break;
|
||
case 0xB85C: // <20><>
|
||
if (wchCurr != 0xC778) // <20><>
|
||
fStop = TRUE;
|
||
break;
|
||
case 0xC704: // <20><>
|
||
case 0xB300: // <20><>
|
||
case 0xC778: // <20><>
|
||
if (wchCurr == 0xD55C || wchCurr == 0xD574) // <20><> <20><>
|
||
fResult = TRUE;
|
||
else if (wchCurr != 0xD558) // <20><>
|
||
fStop = TRUE;
|
||
break;
|
||
case 0xD574: // <20><>
|
||
if (wchCurr != 0xC11C) // <20><>
|
||
fStop = TRUE;
|
||
break;
|
||
case 0xD558: // <20><>
|
||
if (wchCurr == 0xC5EC) // <20><>
|
||
fResult = TRUE;
|
||
else
|
||
fStop = TRUE;
|
||
break;
|
||
case 0x11AF: // jong seong <20><>
|
||
if (wchCurr == 0xC218) // <20><>
|
||
fResult = TRUE;
|
||
else
|
||
fStop = TRUE;
|
||
break;
|
||
case 0xC218:
|
||
if (wchCurr != 0xB97C) // <20><>
|
||
fStop = TRUE;
|
||
break;
|
||
default:
|
||
fStop = TRUE;
|
||
break;
|
||
}
|
||
|
||
if (fStop)
|
||
return FALSE; // not filtered
|
||
|
||
wchPrev = wchCurr;
|
||
|
||
pwzInput++;
|
||
}
|
||
|
||
ATLTRACE("BLOCK: PreFiltering\n");
|
||
|
||
return fResult; // filter string
|
||
}
|
||
|
||
// IntializeLeafChartPool
|
||
//
|
||
// init Leaf Chart Pool & copy records of PI into LeafChart
|
||
//
|
||
// Parameters:
|
||
// pPI -> (PARSE_INFO*) ptr to parse-info struct
|
||
// pLeafChartPool <- (CLeafChartPool*) ptr to Leaf Chart Pool
|
||
//
|
||
// Result:
|
||
// (BOOL) TRUE if succeed, otherwise return FALSE
|
||
//
|
||
// 31MAR00 bhshin began
|
||
BOOL IntializeLeafChartPool(PARSE_INFO *pPI, CLeafChartPool *pLeafChartPool)
|
||
{
|
||
int curr;
|
||
|
||
if (pPI == NULL || pLeafChartPool == NULL)
|
||
return FALSE;
|
||
|
||
if (!pLeafChartPool->Initialize(pPI))
|
||
return FALSE;
|
||
|
||
// copy all the Record ID into CLeafChartPool
|
||
for (curr = MIN_RECORD; curr < pPI->nCurrRec; curr++)
|
||
{
|
||
if (pLeafChartPool->AddRecord(curr) < MIN_RECORD)
|
||
return FALSE;
|
||
}
|
||
|
||
return TRUE;
|
||
}
|
||
|
||
// PreProcessingLeafNode
|
||
//
|
||
// pre processing leaf chart pool
|
||
//
|
||
// Parameters:
|
||
// pPI -> (PARSE_INFO*) ptr to parse-info struct
|
||
// pLeafChartPool <- (CLeafChartPool*) ptr to Leaf Chart Pool
|
||
//
|
||
// Result:
|
||
// (BOOL) TRUE if succeed, otherwise return FALSE
|
||
//
|
||
// 31MAR00 bhshin began
|
||
BOOL PreProcessingLeafNode(PARSE_INFO *pPI, CLeafChartPool *pLeafChartPool)
|
||
{
|
||
int i;
|
||
int curr, next;
|
||
int currSub, nextSub;
|
||
WORD_REC *pWordRec, *pRecSub;
|
||
BYTE bPOS;
|
||
int nFT, nLT;
|
||
int nMaxEnding, iMaxEnding;
|
||
int nMaxParticle, iMaxParticle;
|
||
int cchFuncWord;
|
||
|
||
if (pPI == NULL || pLeafChartPool == NULL)
|
||
return FALSE;
|
||
|
||
// traverse all the record of LeafChartPool
|
||
for (i = 0; i < pPI->nLen; i++)
|
||
{
|
||
curr = pLeafChartPool->GetFTHead(i);
|
||
|
||
while (curr != 0)
|
||
{
|
||
next = pLeafChartPool->GetFTNext(curr);
|
||
|
||
pWordRec = pLeafChartPool->GetWordRec(curr);
|
||
if (pWordRec == NULL)
|
||
return FALSE;
|
||
|
||
bPOS = HIBYTE(pWordRec->nRightCat); // currently, RightCat == LeftCat
|
||
nFT = pWordRec->nFT;
|
||
nLT = pWordRec->nLT;
|
||
|
||
// delete NOUN/IJ records which have unmatched character boundary
|
||
if (bPOS == POS_NF || bPOS == POS_NC || bPOS == POS_NO || bPOS == POS_NN ||
|
||
bPOS == POS_IJ || bPOS == POS_IX)
|
||
{
|
||
if (!pPI->rgCharInfo[nFT].fValidStart || !pPI->rgCharInfo[nLT].fValidEnd)
|
||
pLeafChartPool->DeleteRecord(curr);
|
||
}
|
||
// delete single length particle which is inside words
|
||
else if (bPOS == POS_POSP)
|
||
{
|
||
if (compose_length(pWordRec->wzIndex) == 1 &&
|
||
nLT != pPI->nLen-1)
|
||
pLeafChartPool->DeleteRecord(curr);
|
||
}
|
||
|
||
// delete POS_NO record inside POS_NF record
|
||
if (bPOS == POS_NF)
|
||
{
|
||
for (int j = nFT; j < nLT; j++)
|
||
{
|
||
currSub = pLeafChartPool->GetFTHead(j);
|
||
|
||
while (currSub)
|
||
{
|
||
nextSub = pLeafChartPool->GetFTNext(currSub);
|
||
|
||
pRecSub = pLeafChartPool->GetWordRec(currSub);
|
||
if (pRecSub == NULL)
|
||
return FALSE;
|
||
|
||
// currently, RightCat == LeftCat
|
||
if (pRecSub->nLT < nLT && HIBYTE(pRecSub->nRightCat) == POS_NO)
|
||
pLeafChartPool->DeleteRecord(currSub);
|
||
|
||
currSub = nextSub;
|
||
}
|
||
}
|
||
}
|
||
|
||
curr = next;
|
||
}
|
||
}
|
||
|
||
// find the longest ENDING/PARTICLE from the end of word
|
||
nMaxEnding = 0;
|
||
iMaxEnding = 0;
|
||
nMaxParticle = 0;
|
||
iMaxParticle = 0;
|
||
|
||
for (i = pPI->nLen-1; i >= 0; i--)
|
||
{
|
||
curr = pLeafChartPool->GetLTHead(i);
|
||
|
||
while (curr != 0)
|
||
{
|
||
next = pLeafChartPool->GetLTNext(curr);
|
||
|
||
pWordRec = pLeafChartPool->GetWordRec(curr);
|
||
if (pWordRec == NULL)
|
||
return FALSE;
|
||
|
||
bPOS = HIBYTE(pWordRec->nRightCat); // currently, RightCat == LeftCat
|
||
nFT = pWordRec->nFT;
|
||
nLT = pWordRec->nLT;
|
||
|
||
cchFuncWord = nLT - nFT + 1;
|
||
|
||
if (bPOS == POS_FUNCW)
|
||
{
|
||
if (cchFuncWord > nMaxEnding)
|
||
{
|
||
nMaxEnding = cchFuncWord;
|
||
iMaxEnding = curr;
|
||
}
|
||
}
|
||
else if (bPOS == POS_POSP)
|
||
{
|
||
if (cchFuncWord > nMaxParticle)
|
||
{
|
||
nMaxParticle = cchFuncWord;
|
||
iMaxParticle = curr;
|
||
}
|
||
}
|
||
|
||
curr = next;
|
||
}
|
||
}
|
||
|
||
// remove ENDING with same FT of longest functional record
|
||
if (iMaxEnding != 0)
|
||
{
|
||
pWordRec = pLeafChartPool->GetWordRec(iMaxEnding);
|
||
if (pWordRec == NULL)
|
||
return FALSE;
|
||
|
||
nFT = pWordRec->nFT;
|
||
nLT = pWordRec->nLT;
|
||
|
||
curr = pLeafChartPool->GetFTHead(nFT);
|
||
|
||
while (curr != 0)
|
||
{
|
||
next = pLeafChartPool->GetFTNext(curr);
|
||
|
||
if (curr == iMaxEnding)
|
||
{
|
||
curr = next;
|
||
continue;
|
||
}
|
||
|
||
pWordRec = pLeafChartPool->GetWordRec(curr);
|
||
if (pWordRec == NULL)
|
||
return FALSE;
|
||
|
||
bPOS = HIBYTE(pWordRec->nRightCat); // currently, RightCat == LeftCat
|
||
|
||
// skip same length record
|
||
if (nLT != pWordRec->nLT && bPOS == POS_FUNCW)
|
||
{
|
||
pLeafChartPool->DeleteRecord(curr);
|
||
}
|
||
|
||
curr = next;
|
||
}
|
||
}
|
||
|
||
// remove PARTICLE with same FT of longest functional record
|
||
if (iMaxParticle != 0)
|
||
{
|
||
pWordRec = pLeafChartPool->GetWordRec(iMaxParticle);
|
||
if (pWordRec == NULL)
|
||
return FALSE;
|
||
|
||
nFT = pWordRec->nFT;
|
||
nLT = pWordRec->nLT;
|
||
|
||
curr = pLeafChartPool->GetFTHead(nFT);
|
||
|
||
while (curr != 0)
|
||
{
|
||
next = pLeafChartPool->GetFTNext(curr);
|
||
|
||
if (curr == iMaxParticle)
|
||
{
|
||
curr = next;
|
||
continue;
|
||
}
|
||
|
||
pWordRec = pLeafChartPool->GetWordRec(curr);
|
||
if (pWordRec == NULL)
|
||
return FALSE;
|
||
|
||
bPOS = HIBYTE(pWordRec->nRightCat); // currently, RightCat == LeftCat
|
||
|
||
// skip same length record
|
||
if (nLT != pWordRec->nLT && bPOS == POS_POSP)
|
||
{
|
||
pLeafChartPool->DeleteRecord(curr);
|
||
}
|
||
|
||
curr = next;
|
||
}
|
||
}
|
||
|
||
return TRUE;
|
||
}
|
||
|
||
// ChartParsing
|
||
//
|
||
// implement chart parsing algorithm
|
||
//
|
||
// Parameters:
|
||
// pPI -> (PARSE_INFO*) ptr to parse-info struct
|
||
// pLeafChartPool -> (CLeafChartPool*) ptr to Leaf Chart Pool
|
||
// pEndChartPool -> (CEndChartPool*) analyzed End Chart Pool
|
||
// fQuery -> (BOOL) query time flag
|
||
//
|
||
// Result:
|
||
// (BOOL) TRUE if succeed, otherwise return FALSE
|
||
//
|
||
// 10APR00 bhshin began
|
||
BOOL ChartParsing(PARSE_INFO *pPI, CLeafChartPool *pLeafChartPool,
|
||
CEndChartPool *pEndChartPool, BOOL fQuery /*=FALSE*/)
|
||
{
|
||
int nRightRec, nLeftRec, nRecordID;
|
||
float fWeight;
|
||
WORD_REC *pRightRec;
|
||
int nFT;
|
||
int i, curr;
|
||
|
||
if (pPI == NULL || pLeafChartPool == NULL || pEndChartPool == NULL)
|
||
return FALSE;
|
||
|
||
if (!pEndChartPool->Initialize(pPI))
|
||
return FALSE;
|
||
|
||
for (i = 1; i <= pPI->nLen; i++)
|
||
{
|
||
CActiveChartPool ActiveChartPool;
|
||
|
||
if (!InitializeActiveChartPool(pPI, pLeafChartPool, i,
|
||
&ActiveChartPool, pEndChartPool))
|
||
{
|
||
return FALSE;
|
||
}
|
||
|
||
while (!ActiveChartPool.IsEmpty())
|
||
{
|
||
nRightRec = ActiveChartPool.Pop();
|
||
pRightRec = &pPI->rgWordRec[nRightRec];
|
||
|
||
nFT = pRightRec->nFT;
|
||
|
||
// FT is zero, then combine's meaningless.
|
||
if (nFT == 0)
|
||
continue;
|
||
|
||
if (!CheckValidFinal(pPI, pRightRec))
|
||
continue;
|
||
|
||
// LT of combined record is (FT-1)
|
||
curr = pEndChartPool->GetLTHead(nFT-1);
|
||
|
||
while (curr != 0)
|
||
{
|
||
nLeftRec = pEndChartPool->GetRecordID(curr);
|
||
|
||
fWeight = CheckMorphotactics(pPI, nLeftRec, nRightRec, fQuery);
|
||
if (fWeight != WEIGHT_NOT_MATCH)
|
||
{
|
||
nRecordID = MakeCombinedRecord(pPI, nLeftRec, nRightRec, fWeight);
|
||
if (nRecordID >= MIN_RECORD)
|
||
{
|
||
ActiveChartPool.Push(nRecordID);
|
||
pEndChartPool->AddRecord(nRecordID);
|
||
}
|
||
}
|
||
|
||
curr = pEndChartPool->GetLTNext(curr);
|
||
}
|
||
}
|
||
}
|
||
|
||
return TRUE;
|
||
}
|
||
|
||
// InitializeActiveChartPool
|
||
//
|
||
// copy LT records of LeafChart into ActiveChart/EndChart
|
||
//
|
||
// Parameters:
|
||
// pPI -> (PARSE_INFO*) ptr to parse-info struct
|
||
// pLeafChartPool -> (CLeafChartPool*) ptr to Leaf Chart Pool
|
||
// pActiveChartPool -> (CActiveChartPool*) ptr to Active Chart Pool
|
||
// pEndChartPool -> (CEndChartPool*) ptr to End Chart Pool
|
||
//
|
||
// Result:
|
||
// (BOOL) TRUE if succeed, otherwise return FALSE
|
||
//
|
||
// 31MAR00 bhshin began
|
||
BOOL InitializeActiveChartPool(PARSE_INFO *pPI,
|
||
CLeafChartPool *pLeafChartPool,
|
||
int nLT,
|
||
CActiveChartPool *pActiveChartPool,
|
||
CEndChartPool *pEndChartPool)
|
||
{
|
||
int curr;
|
||
int nRecordID;
|
||
|
||
if (pPI == NULL || pLeafChartPool == NULL ||
|
||
pActiveChartPool == NULL || pEndChartPool == NULL)
|
||
return FALSE;
|
||
|
||
// intialize Active Chart Pool
|
||
if (!pActiveChartPool->Initialize())
|
||
return FALSE;
|
||
|
||
// get the LT records of LeafChart
|
||
curr = pLeafChartPool->GetLTHead(nLT);
|
||
while (curr != 0)
|
||
{
|
||
nRecordID = pLeafChartPool->GetRecordID(curr);
|
||
|
||
// add it to Active/End Chart Pool
|
||
if (pActiveChartPool->Push(nRecordID) < MIN_RECORD)
|
||
return FALSE;
|
||
|
||
if (pEndChartPool->AddRecord(nRecordID) < MIN_RECORD)
|
||
return FALSE;
|
||
|
||
curr = pLeafChartPool->GetLTNext(curr);
|
||
}
|
||
|
||
return TRUE;
|
||
}
|
||
|
||
// MakeCombinedRecord
|
||
//
|
||
// check morphotactics & return corresponding weight value
|
||
//
|
||
// Parameters:
|
||
// pPI -> (PARSE_INFO*) ptr to parse-info struct
|
||
// nLeftRec -> (int) left side record ID
|
||
// nRightRec -> (int) right side record ID
|
||
// fWeight -> (float) new weight value
|
||
//
|
||
// Result:
|
||
// (int) record ID of record pool, if faild, return 0
|
||
//
|
||
// 31MAR00 bhshin began
|
||
int MakeCombinedRecord(PARSE_INFO *pPI, int nLeftRec, int nRightRec, float fWeight)
|
||
{
|
||
WORD_REC *pLeftRec = NULL;
|
||
WORD_REC *pRightRec = NULL;
|
||
RECORD_INFO rec;
|
||
BYTE bLeftPOS, bRightPOS;
|
||
WCHAR wzIndex[MAX_INDEX_STRING];
|
||
WCHAR *pwzIndex;
|
||
|
||
if (pPI == NULL)
|
||
return 0;
|
||
|
||
if (nLeftRec < MIN_RECORD || nLeftRec >= pPI->nCurrRec)
|
||
return 0;
|
||
|
||
if (nRightRec < MIN_RECORD || nRightRec >= pPI->nCurrRec)
|
||
return 0;
|
||
|
||
pLeftRec = &pPI->rgWordRec[nLeftRec];
|
||
pRightRec = &pPI->rgWordRec[nRightRec];
|
||
|
||
rec.fWeight = fWeight;
|
||
rec.nFT = pLeftRec->nFT;
|
||
rec.nLT = pRightRec->nLT;
|
||
rec.nDict = DICT_ADDED;
|
||
rec.nLeftCat = pLeftRec->nLeftCat;
|
||
rec.nRightCat = pRightRec->nRightCat;
|
||
|
||
bLeftPOS = HIBYTE(pLeftRec->nLeftCat);
|
||
bRightPOS = HIBYTE(pRightRec->nLeftCat);
|
||
|
||
rec.nLeftChild = (unsigned short)nLeftRec;
|
||
rec.nRightChild = (unsigned short)nRightRec;
|
||
|
||
// add noun childs records number
|
||
rec.cNounRec = pLeftRec->cNounRec + pRightRec->cNounRec;
|
||
|
||
// check # of NO record
|
||
rec.cNoRec = pLeftRec->cNoRec + pRightRec->cNoRec;
|
||
|
||
// if it has more than 2 No record, then return
|
||
if (rec.cNoRec > 2)
|
||
return 0;
|
||
|
||
// WB combine only successive No case.
|
||
if (pLeftRec->cNoRec == 1 && pRightRec->cNoRec == 1)
|
||
{
|
||
if (HIBYTE(pLeftRec->nRightCat) != POS_NO ||
|
||
HIBYTE(pRightRec->nLeftCat) != POS_NO)
|
||
return 0;
|
||
}
|
||
|
||
// make combined index string
|
||
// <index> = <left><.><right>
|
||
int i = 0;
|
||
|
||
pwzIndex = pLeftRec->wzIndex;
|
||
|
||
// recordB is VA && recordA is FUNCW(ending) &&
|
||
// Lemma(recordA) starts with "<22><> <20><> <20><>"
|
||
// string = Lemma(recordB) + "<22><> <20><> <20><>"
|
||
if (bLeftPOS == POS_VA && bRightPOS == POS_FUNCW && pLeftRec->nFT == 0)
|
||
{
|
||
// copy left index term
|
||
while (*pwzIndex != L'\0')
|
||
{
|
||
if (*pwzIndex != L'.')
|
||
wzIndex[i++] = *pwzIndex;
|
||
|
||
pwzIndex++;
|
||
}
|
||
|
||
// <20><> case
|
||
if (pRightRec->wzIndex[0] == 0x11B7)
|
||
{
|
||
wzIndex[i++] = 0x11B7;
|
||
goto Exit;
|
||
}
|
||
// <20><> case
|
||
else if (pRightRec->wzIndex[0] == 0x110B &&
|
||
pRightRec->wzIndex[1] == 0x1173 &&
|
||
pRightRec->wzIndex[2] == 0x11B7)
|
||
{
|
||
wzIndex[i++] = 0x110B;
|
||
wzIndex[i++] = 0x1173;
|
||
wzIndex[i++] = 0x11B7;
|
||
goto Exit;
|
||
}
|
||
// <20><> case
|
||
else if (pRightRec->wzIndex[0] == 0x1100 &&
|
||
pRightRec->wzIndex[1] == 0x1175 &&
|
||
!fIsJongSeong(pRightRec->wzIndex[2]))
|
||
{
|
||
wzIndex[i++] = 0x1100;
|
||
wzIndex[i++] = 0x1175;
|
||
goto Exit;
|
||
}
|
||
else
|
||
{
|
||
i = 0; // undo forwarding copy
|
||
}
|
||
}
|
||
|
||
if (i == 0)
|
||
{
|
||
if (bLeftPOS == POS_FUNCW || bLeftPOS == POS_POSP ||
|
||
bLeftPOS == POS_VA || bLeftPOS == POS_IX)
|
||
{
|
||
wzIndex[i++] = L'X';
|
||
}
|
||
else
|
||
{
|
||
// remove <.> from left index string
|
||
while (*pwzIndex != L'\0')
|
||
{
|
||
if (*pwzIndex != L'.')
|
||
wzIndex[i++] = *pwzIndex;
|
||
|
||
pwzIndex++;
|
||
}
|
||
}
|
||
}
|
||
|
||
wzIndex[i++] = L'.';
|
||
|
||
pwzIndex = pRightRec->wzIndex;
|
||
|
||
if (bRightPOS == POS_FUNCW || bRightPOS == POS_POSP ||
|
||
bRightPOS == POS_VA || bRightPOS == POS_IX)
|
||
{
|
||
wzIndex[i++] = L'X';
|
||
}
|
||
else
|
||
{
|
||
// remove <.> from right index string
|
||
while (*pwzIndex != L'\0')
|
||
{
|
||
if (*pwzIndex != L'.')
|
||
wzIndex[i++] = *pwzIndex;
|
||
|
||
pwzIndex++;
|
||
}
|
||
}
|
||
|
||
Exit:
|
||
|
||
wzIndex[i] = L'\0';
|
||
|
||
rec.pwzIndex = wzIndex;
|
||
|
||
return AddRecord(pPI, &rec);
|
||
}
|
||
|
||
// MakeIndexTerms
|
||
//
|
||
// make index term (index time)
|
||
//
|
||
// Parameters:
|
||
// pPI -> (PARSE_INFO*) ptr to parse-info struct
|
||
// pEndChartPool -> (CEndChartPool*) analyzed End Chart Pool
|
||
// pIndexInfo -> (CIndexInfo *) output index list
|
||
// pfNeedGuessing -> (BOOL*) output need to guess flag
|
||
//
|
||
// Result:
|
||
// (BOOL) TRUE if succeed, otherwise return FALSE
|
||
//
|
||
// 06APR00 bhshin began
|
||
BOOL MakeIndexTerms(PARSE_INFO *pPI, CEndChartPool *pEndChartPool,
|
||
CIndexInfo *pIndexInfo, BOOL *pfNeedGuessing)
|
||
{
|
||
int nLTMaxLen;
|
||
int curr;
|
||
WORD_REC *pWordRec;
|
||
int cchRecord;
|
||
float fBestWeight = 0;
|
||
int cMinNoRec;
|
||
BOOL fOnlySuffix = FALSE;
|
||
|
||
// intialize guessing flag
|
||
*pfNeedGuessing = TRUE;
|
||
|
||
if (pPI == NULL || pEndChartPool == NULL)
|
||
return FALSE;
|
||
|
||
// if all cover record exist, then make index term
|
||
nLTMaxLen = pEndChartPool->GetLTMaxLen(pPI->nMaxLT);
|
||
|
||
// make index terms for all cover records
|
||
if (nLTMaxLen < pPI->nLen)
|
||
return TRUE;
|
||
|
||
// LT of EndChartPool increasing length order
|
||
curr = pEndChartPool->GetLTHead(pPI->nMaxLT);
|
||
while (curr != 0)
|
||
{
|
||
pWordRec = pEndChartPool->GetWordRec(curr);
|
||
if (pWordRec == NULL)
|
||
break;
|
||
|
||
if (!CheckValidFinal(pPI, pWordRec))
|
||
{
|
||
curr = pEndChartPool->GetLTNext(curr);
|
||
continue;
|
||
}
|
||
|
||
cchRecord = pWordRec->nLT - pWordRec->nFT + 1;
|
||
|
||
// get index string from tree traverse
|
||
if (cchRecord == nLTMaxLen && pWordRec->fWeight > THRESHOLD_MAKE_INDEX)
|
||
{
|
||
// Now, we find index terms. DO NOT guessing
|
||
*pfNeedGuessing = FALSE;
|
||
|
||
float fWeight = pWordRec->fWeight;
|
||
int cNoRec = pWordRec->cNoRec;
|
||
|
||
if (fBestWeight == 0)
|
||
{
|
||
fBestWeight = fWeight;
|
||
cMinNoRec = cNoRec;
|
||
}
|
||
|
||
// we just traverse best weight list
|
||
if (fWeight == fBestWeight && cMinNoRec == cNoRec)
|
||
{
|
||
WB_LOG_ROOT_INDEX(pWordRec->wzIndex, TRUE); // root
|
||
TraverseIndexString(pPI, fOnlySuffix, pWordRec, pIndexInfo);
|
||
|
||
// on index time, just pick up suffix on processing other than best
|
||
if (pIndexInfo->IsEmpty() == FALSE)
|
||
{
|
||
fOnlySuffix = TRUE;
|
||
}
|
||
}
|
||
}
|
||
|
||
curr = pEndChartPool->GetLTNext(curr);
|
||
}
|
||
|
||
return TRUE;
|
||
}
|
||
|
||
// TraverseIndexString
|
||
//
|
||
// get the index string from tree traversing
|
||
//
|
||
// Parameters:
|
||
// pPI -> (PARSE_INFO*) ptr to parse-info struct
|
||
// fOnlySuffix -> (BOOL) process only suffix (nFT == 0)
|
||
// pWordRec -> (WORD_REC*) parent WORD RECORD
|
||
// pIndexInfo -> (CIndexInfo *) output index list
|
||
//
|
||
// Result:
|
||
// (BOOL) TRUE if succeed, otherwise return FALSE
|
||
//
|
||
// 07APR00 bhshin began
|
||
BOOL TraverseIndexString(PARSE_INFO *pPI, BOOL fOnlySuffix, WORD_REC *pWordRec, CIndexInfo *pIndexInfo)
|
||
{
|
||
WCHAR *pwzIndex;
|
||
BYTE bPOS;
|
||
WCHAR wzDecomp[MAX_INDEX_STRING*3+1];
|
||
WCHAR wzIndex[MAX_INDEX_STRING+1];
|
||
int cchIndex, cchRecord;
|
||
int nLeft, nRight;
|
||
WORD_REC *pWordLeft, *pWordRight;
|
||
int nPrevX, nMiddleX, nLastX, idx;
|
||
int nFT, nLT;
|
||
|
||
if (pPI == NULL || pWordRec == NULL)
|
||
return FALSE;
|
||
|
||
if (pPI->rgCharInfo == NULL)
|
||
{
|
||
ATLTRACE("Character Info is NULL\n");
|
||
return FALSE;
|
||
}
|
||
|
||
if (fOnlySuffix)
|
||
{
|
||
if (pWordRec->nFT > 0)
|
||
return TRUE;
|
||
}
|
||
|
||
nLeft = pWordRec->nLeftChild;
|
||
nRight = pWordRec->nRightChild;
|
||
|
||
// if it has child node, then don't add index term
|
||
if (nLeft != 0 || nRight != 0)
|
||
{
|
||
// go to child traversing
|
||
// recursively traverse Left/Right child
|
||
if (nLeft != 0)
|
||
{
|
||
pWordLeft = &pPI->rgWordRec[nLeft];
|
||
|
||
WB_LOG_ROOT_INDEX(pWordLeft->wzIndex, FALSE); // child
|
||
TraverseIndexString(pPI, fOnlySuffix, pWordLeft, pIndexInfo);
|
||
}
|
||
|
||
if (nRight != 0)
|
||
{
|
||
pWordRight = &pPI->rgWordRec[nRight];
|
||
|
||
WB_LOG_ROOT_INDEX(pWordRight->wzIndex, FALSE); // child
|
||
TraverseIndexString(pPI, fOnlySuffix, pWordRight, pIndexInfo);
|
||
}
|
||
|
||
return TRUE;
|
||
}
|
||
|
||
bPOS = HIBYTE(pWordRec->nLeftCat);
|
||
|
||
// copy index string
|
||
pwzIndex = pWordRec->wzIndex;
|
||
|
||
// remove connection character(.) and functional character(X)
|
||
nPrevX = 0;
|
||
nMiddleX = 0;
|
||
nLastX = 0;
|
||
idx = 0;
|
||
while (*pwzIndex != L'\0')
|
||
{
|
||
// check the existence of X
|
||
if (*pwzIndex == L'X')
|
||
{
|
||
if (idx == 0)
|
||
nPrevX++;
|
||
else
|
||
nLastX++;
|
||
}
|
||
else if (*pwzIndex != L'.')
|
||
{
|
||
// valid hangul jamo
|
||
wzDecomp[idx++] = *pwzIndex;
|
||
|
||
// check middle X
|
||
nMiddleX = nLastX;
|
||
nLastX = 0;
|
||
}
|
||
|
||
pwzIndex++;
|
||
}
|
||
wzDecomp[idx] = L'\0';
|
||
|
||
compose_jamo(wzIndex, wzDecomp, MAX_INDEX_STRING);
|
||
|
||
cchIndex = wcslen(wzIndex);
|
||
cchRecord = pWordRec->nLT - pWordRec->nFT + 1;
|
||
|
||
// lengh one index term
|
||
if (cchIndex == 1)
|
||
{
|
||
// it should not have leading X or position of last X should be 1
|
||
if (nPrevX > 0 || nLastX > 1)
|
||
return TRUE;
|
||
}
|
||
|
||
// 1. it should not have middle X
|
||
// 2. zero index string is not allowed
|
||
if (nMiddleX == 0 && cchIndex > 0)
|
||
{
|
||
if (bPOS == POS_NF || bPOS == POS_NC || bPOS == POS_NO || bPOS == POS_NN || bPOS == POS_IJ ||
|
||
(bPOS == POS_VA && pWordRec->nLeftChild > 0 && pWordRec->nRightChild > 0))
|
||
{
|
||
nFT = pPI->rgCharInfo[pWordRec->nFT].nToken;
|
||
nLT = pPI->rgCharInfo[pWordRec->nLT].nToken;
|
||
|
||
pIndexInfo->AddIndex(wzIndex, cchIndex, pWordRec->fWeight, nFT, nLT);
|
||
WB_LOG_ADD_INDEX(wzIndex, cchIndex, INDEX_PARSE);
|
||
}
|
||
}
|
||
|
||
return TRUE;
|
||
}
|
||
|
||
// MakeQueryTerms
|
||
//
|
||
// make index term (query time)
|
||
//
|
||
// Parameters:
|
||
// pPI -> (PARSE_INFO*) ptr to parse-info struct
|
||
// pEndChartPool -> (CEndChartPool*) analyzed End Chart Pool
|
||
// pIndexInfo -> (CIndexInfo *) output index list
|
||
// pfNeedGuessing -> (BOOL*) output need to guess flag
|
||
//
|
||
// Result:
|
||
// (BOOL) TRUE if succeed, otherwise return FALSE
|
||
//
|
||
// 04DEC00 bhshin began
|
||
BOOL MakeQueryTerms(PARSE_INFO *pPI, CEndChartPool *pEndChartPool,
|
||
CIndexInfo *pIndexInfo, BOOL *pfNeedGuessing)
|
||
{
|
||
int nLTMaxLen;
|
||
int curr;
|
||
WORD_REC *pWordRec;
|
||
int cchRecord;
|
||
float fBestWeight = 0;
|
||
int cMinNoRec;
|
||
BOOL fOnlySuffix = FALSE;
|
||
WCHAR wzIndex[MAX_INDEX_STRING*2];
|
||
int cchIndex, nFT, nLT;
|
||
|
||
// intialize guessing flag
|
||
*pfNeedGuessing = TRUE;
|
||
|
||
if (pPI == NULL || pEndChartPool == NULL)
|
||
return FALSE;
|
||
|
||
// if all cover record exist, then make index term
|
||
nLTMaxLen = pEndChartPool->GetLTMaxLen(pPI->nMaxLT);
|
||
|
||
// make index terms for all cover records
|
||
if (nLTMaxLen < pPI->nLen)
|
||
return TRUE;
|
||
|
||
// LT of EndChartPool increasing length order
|
||
curr = pEndChartPool->GetLTHead(pPI->nMaxLT);
|
||
while (curr != 0)
|
||
{
|
||
pWordRec = pEndChartPool->GetWordRec(curr);
|
||
if (pWordRec == NULL)
|
||
break;
|
||
|
||
if (!CheckValidFinal(pPI, pWordRec))
|
||
{
|
||
curr = pEndChartPool->GetLTNext(curr);
|
||
continue;
|
||
}
|
||
|
||
cchRecord = pWordRec->nLT - pWordRec->nFT + 1;
|
||
|
||
// get index string from tree traverse
|
||
if (cchRecord == nLTMaxLen && pWordRec->fWeight > THRESHOLD_MAKE_INDEX)
|
||
{
|
||
// Now, we find index terms. DO NOT guessing
|
||
*pfNeedGuessing = FALSE;
|
||
|
||
float fWeight = pWordRec->fWeight;
|
||
int cNoRec = pWordRec->cNoRec;
|
||
|
||
if (fBestWeight == 0)
|
||
{
|
||
fBestWeight = fWeight;
|
||
cMinNoRec = cNoRec;
|
||
}
|
||
|
||
// we just traverse best weight list
|
||
if (fWeight == fBestWeight && cMinNoRec == cNoRec)
|
||
{
|
||
wzIndex[0] = L'\0';
|
||
|
||
TraverseQueryString(pPI, pWordRec, wzIndex, MAX_INDEX_STRING*2);
|
||
|
||
cchIndex = wcslen(wzIndex);
|
||
if (cchIndex > 0)
|
||
{
|
||
nFT = pPI->rgCharInfo[pWordRec->nFT].nToken;
|
||
nLT = pPI->rgCharInfo[pWordRec->nLT].nToken;
|
||
|
||
pIndexInfo->AddIndex(wzIndex, cchIndex, pWordRec->fWeight, nFT, nLT);
|
||
WB_LOG_ADD_INDEX(wzIndex, cchIndex, INDEX_PARSE);
|
||
}
|
||
}
|
||
}
|
||
|
||
curr = pEndChartPool->GetLTNext(curr);
|
||
}
|
||
|
||
return TRUE;
|
||
}
|
||
|
||
|
||
// TraverseQueryString
|
||
//
|
||
// get the query string from tree traversing
|
||
//
|
||
// Parameters:
|
||
// pPI -> (PARSE_INFO*) ptr to parse-info struct
|
||
// pWordRec -> (WORD_REC*) parent WORD RECORD
|
||
// pwzSeqTerm -> (WCHAR *) output sequence index term buffer
|
||
// cchSeqTerm -> (int) output buffer size
|
||
//
|
||
// Result:
|
||
// (BOOL) TRUE if succeed, otherwise return FALSE
|
||
//
|
||
// 04DEC00 bhshin began
|
||
BOOL TraverseQueryString(PARSE_INFO *pPI, WORD_REC *pWordRec, WCHAR *pwzSeqTerm, int cchSeqTerm)
|
||
{
|
||
WCHAR *pwzIndex;
|
||
BYTE bPOS;
|
||
WCHAR wzDecomp[MAX_INDEX_STRING*3+1];
|
||
WCHAR wzIndex[MAX_INDEX_STRING+1];
|
||
int cchIndex, cchRecord;
|
||
int nLeft, nRight;
|
||
WORD_REC *pWordLeft, *pWordRight;
|
||
int nPrevX, nMiddleX, nLastX, idx;
|
||
int cchPrevSeqTerm;
|
||
int nFT;
|
||
WCHAR wchIndex;
|
||
|
||
if (pPI == NULL || pWordRec == NULL)
|
||
return FALSE;
|
||
|
||
if (pPI->rgCharInfo == NULL)
|
||
{
|
||
ATLTRACE("Character Info is NULL\n");
|
||
return FALSE;
|
||
}
|
||
|
||
nLeft = pWordRec->nLeftChild;
|
||
nRight = pWordRec->nRightChild;
|
||
|
||
// if it has child node, then don't add index term
|
||
if (nLeft != 0 || nRight != 0)
|
||
{
|
||
// go to child traversing
|
||
// recursively traverse Left/Right child
|
||
if (nLeft != 0)
|
||
{
|
||
pWordLeft = &pPI->rgWordRec[nLeft];
|
||
|
||
WB_LOG_ROOT_INDEX(pWordLeft->wzIndex, FALSE); // child
|
||
TraverseQueryString(pPI, pWordLeft, pwzSeqTerm, cchSeqTerm);
|
||
}
|
||
|
||
if (nRight != 0)
|
||
{
|
||
pWordRight = &pPI->rgWordRec[nRight];
|
||
|
||
WB_LOG_ROOT_INDEX(pWordRight->wzIndex, FALSE); // child
|
||
TraverseQueryString(pPI, pWordRight, pwzSeqTerm, cchSeqTerm);
|
||
}
|
||
|
||
return TRUE;
|
||
}
|
||
|
||
bPOS = HIBYTE(pWordRec->nLeftCat);
|
||
|
||
// copy index string
|
||
pwzIndex = pWordRec->wzIndex;
|
||
|
||
// remove connection character(.) and functional character(X)
|
||
nPrevX = 0;
|
||
nMiddleX = 0;
|
||
nLastX = 0;
|
||
idx = 0;
|
||
while (*pwzIndex != L'\0')
|
||
{
|
||
// check the existence of X
|
||
if (*pwzIndex == L'X')
|
||
{
|
||
if (idx == 0)
|
||
nPrevX++;
|
||
else
|
||
nLastX++;
|
||
}
|
||
else if (*pwzIndex != L'.')
|
||
{
|
||
// valid hangul jamo
|
||
wzDecomp[idx++] = *pwzIndex;
|
||
|
||
// check middle X
|
||
nMiddleX = nLastX;
|
||
nLastX = 0;
|
||
}
|
||
|
||
pwzIndex++;
|
||
}
|
||
wzDecomp[idx] = L'\0';
|
||
|
||
compose_jamo(wzIndex, wzDecomp, MAX_INDEX_STRING);
|
||
|
||
cchIndex = wcslen(wzIndex);
|
||
cchRecord = pWordRec->nLT - pWordRec->nFT + 1;
|
||
|
||
// lengh one index term
|
||
if (cchIndex == 1)
|
||
{
|
||
// it should not have leading X or position of last X should be 1
|
||
if (nPrevX > 0 || nLastX > 1)
|
||
return TRUE;
|
||
}
|
||
|
||
// 1. it should not have middle X
|
||
// 2. zero index string is not allowed
|
||
if (nMiddleX == 0 && cchIndex > 0)
|
||
{
|
||
if (bPOS == POS_NF || bPOS == POS_NC || bPOS == POS_NO || bPOS == POS_NN || bPOS == POS_IJ ||
|
||
(bPOS == POS_VA && pWordRec->nLeftChild > 0 && pWordRec->nRightChild > 0))
|
||
{
|
||
// check buffer size
|
||
cchPrevSeqTerm = wcslen(pwzSeqTerm);
|
||
|
||
if (cchSeqTerm <= cchPrevSeqTerm + cchIndex)
|
||
return FALSE; // output buffer too small
|
||
|
||
// add conjoining symbol TAB
|
||
if (cchPrevSeqTerm > 1 && cchIndex > 1)
|
||
wcscat(pwzSeqTerm, L"\t");
|
||
|
||
if (cchIndex == 1)
|
||
{
|
||
nFT = pWordRec->nFT;
|
||
wchIndex = wzIndex[0];
|
||
|
||
// check [<5B><>,<2C><>] suffix case, then just remove it
|
||
if (nFT > 0 && (wchIndex == 0xB4E4 || wchIndex == 0xBFD0))
|
||
return TRUE;
|
||
}
|
||
|
||
// concat index term
|
||
wcscat(pwzSeqTerm, wzIndex);
|
||
}
|
||
}
|
||
|
||
return TRUE;
|
||
}
|
||
|