2025-04-27 07:49:33 -04:00

335 lines
8.7 KiB
C++

/*++
Copyright (c) 1995-1997 Microsoft Corporation
Module Name :
spechash.cxx
Abstract:
This module calculates the hash value for SpecWeb96 style file-names
Author:
Murali R. Krishnan ( MuraliK ) 06-Nov-1997
Environment:
Win32 - User Mode
Project:
Internet Server Test DLL
Functions Exported:
Revision History:
--*/
/************************************************************
* Include Headers
************************************************************/
# include <windows.h>
# include <stdio.h>
# include <stdlib.h>
# include <math.h>
/************************************************************
* Functions
************************************************************/
// specifies how many characters we will use for hashing string names
DWORD g_dwFastHashLength = (16);
//
// Following is the original hash-calculator used by Tsunami cache in
// IIS 4.0
//
DWORD
CalculateHashAndLengthOfPathName(
LPCSTR pszPathName,
PULONG pcbLength
)
{
DWORD hash = 0;
CHAR ch;
DWORD start;
DWORD index;
*pcbLength = strlen(pszPathName);
//
// hash the last g_dwFastHashLength characters
//
if ( *pcbLength < g_dwFastHashLength ) {
start = 0;
} else {
start = *pcbLength - g_dwFastHashLength;
}
for ( index = start; pszPathName[index] != '\0'; index++ ) {
//
// This is an extremely slimey way of getting upper case.
// Kids, don't try this at home
// -johnson
//
ch = pszPathName[index] & (CHAR)~0x20;
hash <<= 1;
hash ^= ch;
hash <<= 1;
hash += ch;
}
//
// Multiply by length to give additional randomness
//
return( hash * *pcbLength);
} // CalculateHashAndLengthOfPathName()
//
// Following is the version of Perfect hash function used inside
// NTFS for hashing file names - used in NT5
//
DWORD
NTPerfCalcHashAndLengthOfPathName(
LPCSTR pszPathName,
PULONG pcbLength
)
{
/* default value for "scrambling constant" */
#define FILENAME_STRING_CONVERT_CONSTANT 314159269
/* prime number, also used for scrambling */
#define FILENAME_STRING_PRIME 1000000007
//
// Compute a hash value for the string which is invairant to case. The
// distribution of hash strings should be nearly perfect using the following
// function. Unless file names become extremely long, say average over 512
// characters, processing all characters is the best way to get perfect
// distribution.
//
// For really long file names a better algorithm could include taking the
// first n plus the last n plus psuedo-random number of characters in the
// middle using the last n has the psuedo-random seed. Just the cost of
// calculating a new end point (say last 10 characters) will decrease the
// speed of this function enough to cancel the savings of not using the first
// n-1 characters, so any changes here should be tested for performance.
//
// Leave out the upcase code for now (see sample here).
// todo: Leave in _ch until case issue solved
//_ch = RtlUpcaseUnicodeChar( *_p++ ); and put slash here
DWORD hash = 0;
CHAR ch;
LPCSTR pszScan;
LPCSTR pszEnd;
*pcbLength = strlen(pszPathName);
pszScan = pszPathName;
pszEnd = pszPathName + *pcbLength;
while ( pszScan < pszEnd) {
hash = 37 * hash + (unsigned int)((*pszScan++) & (CHAR)~0x20);
} // while
return ( abs(FILENAME_STRING_CONVERT_CONSTANT * hash) %
FILENAME_STRING_PRIME);
} // NTPerfCalcHashAndLengthOfPathName()
DWORD
CalculateVariance( IN LPDWORD prgBinDistribs, IN DWORD nBins,
IN DWORD dwAverage)
{
//
// Variance = Sigma( (X-Avg)^2)/n
//
DWORD dwVariance = 0;
LONG delta;
DWORD binIndex;
for ( binIndex = 0; binIndex < nBins; binIndex++) {
delta = (prgBinDistribs[binIndex] - dwAverage);
dwVariance += (delta * delta);
} // for binIndex
return ( dwVariance/nBins);
} // CalculateVariance()
void
PrintBinDistributions( IN LPDWORD prgBinDistribs, IN DWORD nBins)
{
printf( "\n-------------------------------------------------------\n"
"\n Hash Value Distributions Calculated Bins = %d \n"
"[Index] "
,
nBins
);
DWORD binIndex;
DWORD dwAverage = 0;
// print the header
for (binIndex = 0; binIndex < 10; binIndex++) {
printf( "%4d,", binIndex);
}
for ( binIndex = 0; binIndex < nBins; binIndex++) {
if ( binIndex %10 == 0) {
printf( "\n[%5d] ", binIndex);
}
dwAverage += prgBinDistribs[binIndex];
printf( "%4d,", prgBinDistribs[binIndex]);
} // for binIndex
dwAverage /= nBins;
DWORD dwVariance = CalculateVariance( prgBinDistribs, nBins, dwAverage);
printf( "\n--------------------------------------------------------\n"
"\n Average = %d; Variance = %d; Standard Deviation +/- %8.3g\n"
"\n--------------------------------------------------------\n"
,
dwAverage,
dwVariance,
sqrt( (double ) dwVariance)
);
return;
} // PrintBinDistributions()
/* ------------------------------------------------------------
* SpecWeb96 file names have following syntax
* /spec/dirX/classY_Z
* where,
* X = 0 ... 200
* Y = 0, 1, 2, 3
* Z = 0 ... 8
* ------------------------------------------------------------
*/
# define MAX_DIR_INDEX (200)
# define MAX_Y_INDEX (3)
# define MAX_Z_INDEX (8)
DWORD g_rgBinDistrib113[114];
DWORD g_rgBinDistrib223[224];
int __cdecl
main(int argc, char * argv[])
{
DWORD dirIndex;
DWORD YIndex;
DWORD ZIndex;
DWORD index = 0;
if ( argc > 1 ) {
g_dwFastHashLength = atoi( argv[1]);
printf( "Using fast hash-lenght with match for %d suffix chars\n",
g_dwFastHashLength);
}
ZeroMemory( g_rgBinDistrib113, sizeof(g_rgBinDistrib113));
ZeroMemory( g_rgBinDistrib223, sizeof(g_rgBinDistrib223));
printf( "\n------------------------------------------------------------\n"
"\n Hash Values Calculated \n"
"%6s, %30s, %6s, %8s, %6s, %6s\n",
"Index",
"File Name",
"Len",
"Hash",
"Bin113",
"Bin223"
);
for ( dirIndex = 0; dirIndex <= MAX_DIR_INDEX; dirIndex++) {
CHAR rgchFileName[MAX_PATH];
DWORD cchDirName;
cchDirName = wsprintf( rgchFileName, "/spec/dir%d/class", dirIndex);
for ( YIndex = 0; YIndex <= MAX_Y_INDEX; YIndex++) {
for (ZIndex = 0; ZIndex <= MAX_Z_INDEX; ZIndex++) {
DWORD cchFileName;
DWORD hashValue;
DWORD cchPathLen = 0;
cchFileName = (cchDirName +
wsprintf( rgchFileName + cchDirName,
"%d_%d",
YIndex, ZIndex)
);
# ifdef IIS_K2_HASH
hashValue = CalculateHashAndLengthOfPathName( rgchFileName,
&cchPathLen
);
# else
hashValue = NTPerfCalcHashAndLengthOfPathName( rgchFileName,
&cchPathLen);
# endif // IIS_K2_HA
printf( "[%6d], %30s, %6d, 0x%8X, %6d, %6d\n",
index++,
rgchFileName,
cchPathLen,
hashValue,
hashValue % 113, // bin for 113 bins
hashValue % 223 // bin for 223 bins
);
g_rgBinDistrib113[hashValue%113] += 1;
g_rgBinDistrib223[hashValue%223] += 1;
} // for ZIndex
} // for YIndex
} // for dirIndex()
PrintBinDistributions( g_rgBinDistrib113, 113);
PrintBinDistributions( g_rgBinDistrib223, 223);
return (0);
} // main()
/************************ End of File ***********************/