/*++ Copyright (c) 1995-1997 Microsoft Corporation Module Name : spechash.cxx Abstract: This module calculates the hash value for SpecWeb96 style file-names Author: Murali R. Krishnan ( MuraliK ) 06-Nov-1997 Environment: Win32 - User Mode Project: Internet Server Test DLL Functions Exported: Revision History: --*/ /************************************************************ * Include Headers ************************************************************/ # include # include # include # include /************************************************************ * Functions ************************************************************/ // specifies how many characters we will use for hashing string names DWORD g_dwFastHashLength = (16); // // Following is the original hash-calculator used by Tsunami cache in // IIS 4.0 // DWORD CalculateHashAndLengthOfPathName( LPCSTR pszPathName, PULONG pcbLength ) { DWORD hash = 0; CHAR ch; DWORD start; DWORD index; *pcbLength = strlen(pszPathName); // // hash the last g_dwFastHashLength characters // if ( *pcbLength < g_dwFastHashLength ) { start = 0; } else { start = *pcbLength - g_dwFastHashLength; } for ( index = start; pszPathName[index] != '\0'; index++ ) { // // This is an extremely slimey way of getting upper case. // Kids, don't try this at home // -johnson // ch = pszPathName[index] & (CHAR)~0x20; hash <<= 1; hash ^= ch; hash <<= 1; hash += ch; } // // Multiply by length to give additional randomness // return( hash * *pcbLength); } // CalculateHashAndLengthOfPathName() // // Following is the version of Perfect hash function used inside // NTFS for hashing file names - used in NT5 // DWORD NTPerfCalcHashAndLengthOfPathName( LPCSTR pszPathName, PULONG pcbLength ) { /* default value for "scrambling constant" */ #define FILENAME_STRING_CONVERT_CONSTANT 314159269 /* prime number, also used for scrambling */ #define FILENAME_STRING_PRIME 1000000007 // // Compute a hash value for the string which is invairant to case. The // distribution of hash strings should be nearly perfect using the following // function. Unless file names become extremely long, say average over 512 // characters, processing all characters is the best way to get perfect // distribution. // // For really long file names a better algorithm could include taking the // first n plus the last n plus psuedo-random number of characters in the // middle using the last n has the psuedo-random seed. Just the cost of // calculating a new end point (say last 10 characters) will decrease the // speed of this function enough to cancel the savings of not using the first // n-1 characters, so any changes here should be tested for performance. // // Leave out the upcase code for now (see sample here). // todo: Leave in _ch until case issue solved //_ch = RtlUpcaseUnicodeChar( *_p++ ); and put slash here DWORD hash = 0; CHAR ch; LPCSTR pszScan; LPCSTR pszEnd; *pcbLength = strlen(pszPathName); pszScan = pszPathName; pszEnd = pszPathName + *pcbLength; while ( pszScan < pszEnd) { hash = 37 * hash + (unsigned int)((*pszScan++) & (CHAR)~0x20); } // while return ( abs(FILENAME_STRING_CONVERT_CONSTANT * hash) % FILENAME_STRING_PRIME); } // NTPerfCalcHashAndLengthOfPathName() DWORD CalculateVariance( IN LPDWORD prgBinDistribs, IN DWORD nBins, IN DWORD dwAverage) { // // Variance = Sigma( (X-Avg)^2)/n // DWORD dwVariance = 0; LONG delta; DWORD binIndex; for ( binIndex = 0; binIndex < nBins; binIndex++) { delta = (prgBinDistribs[binIndex] - dwAverage); dwVariance += (delta * delta); } // for binIndex return ( dwVariance/nBins); } // CalculateVariance() void PrintBinDistributions( IN LPDWORD prgBinDistribs, IN DWORD nBins) { printf( "\n-------------------------------------------------------\n" "\n Hash Value Distributions Calculated Bins = %d \n" "[Index] " , nBins ); DWORD binIndex; DWORD dwAverage = 0; // print the header for (binIndex = 0; binIndex < 10; binIndex++) { printf( "%4d,", binIndex); } for ( binIndex = 0; binIndex < nBins; binIndex++) { if ( binIndex %10 == 0) { printf( "\n[%5d] ", binIndex); } dwAverage += prgBinDistribs[binIndex]; printf( "%4d,", prgBinDistribs[binIndex]); } // for binIndex dwAverage /= nBins; DWORD dwVariance = CalculateVariance( prgBinDistribs, nBins, dwAverage); printf( "\n--------------------------------------------------------\n" "\n Average = %d; Variance = %d; Standard Deviation +/- %8.3g\n" "\n--------------------------------------------------------\n" , dwAverage, dwVariance, sqrt( (double ) dwVariance) ); return; } // PrintBinDistributions() /* ------------------------------------------------------------ * SpecWeb96 file names have following syntax * /spec/dirX/classY_Z * where, * X = 0 ... 200 * Y = 0, 1, 2, 3 * Z = 0 ... 8 * ------------------------------------------------------------ */ # define MAX_DIR_INDEX (200) # define MAX_Y_INDEX (3) # define MAX_Z_INDEX (8) DWORD g_rgBinDistrib113[114]; DWORD g_rgBinDistrib223[224]; int __cdecl main(int argc, char * argv[]) { DWORD dirIndex; DWORD YIndex; DWORD ZIndex; DWORD index = 0; if ( argc > 1 ) { g_dwFastHashLength = atoi( argv[1]); printf( "Using fast hash-lenght with match for %d suffix chars\n", g_dwFastHashLength); } ZeroMemory( g_rgBinDistrib113, sizeof(g_rgBinDistrib113)); ZeroMemory( g_rgBinDistrib223, sizeof(g_rgBinDistrib223)); printf( "\n------------------------------------------------------------\n" "\n Hash Values Calculated \n" "%6s, %30s, %6s, %8s, %6s, %6s\n", "Index", "File Name", "Len", "Hash", "Bin113", "Bin223" ); for ( dirIndex = 0; dirIndex <= MAX_DIR_INDEX; dirIndex++) { CHAR rgchFileName[MAX_PATH]; DWORD cchDirName; cchDirName = wsprintf( rgchFileName, "/spec/dir%d/class", dirIndex); for ( YIndex = 0; YIndex <= MAX_Y_INDEX; YIndex++) { for (ZIndex = 0; ZIndex <= MAX_Z_INDEX; ZIndex++) { DWORD cchFileName; DWORD hashValue; DWORD cchPathLen = 0; cchFileName = (cchDirName + wsprintf( rgchFileName + cchDirName, "%d_%d", YIndex, ZIndex) ); # ifdef IIS_K2_HASH hashValue = CalculateHashAndLengthOfPathName( rgchFileName, &cchPathLen ); # else hashValue = NTPerfCalcHashAndLengthOfPathName( rgchFileName, &cchPathLen); # endif // IIS_K2_HA printf( "[%6d], %30s, %6d, 0x%8X, %6d, %6d\n", index++, rgchFileName, cchPathLen, hashValue, hashValue % 113, // bin for 113 bins hashValue % 223 // bin for 223 bins ); g_rgBinDistrib113[hashValue%113] += 1; g_rgBinDistrib223[hashValue%223] += 1; } // for ZIndex } // for YIndex } // for dirIndex() PrintBinDistributions( g_rgBinDistrib113, 113); PrintBinDistributions( g_rgBinDistrib223, 223); return (0); } // main() /************************ End of File ***********************/