# This script models a proposed dictionary structure that
# divides the search space across N sub-lists (selected
# with a hash function), with each list acting as a MRU
# cache.  Specifically, when an item is found during a
# search, it is moved to the front of the list.  Newly
# added words are also added to the front of the list.
#
# Notes:
#
# This model assumes a case insensitive dictionary.


use strict;


    # configuration

my %dict;                   # sublists indexed by hash function value
my $sublists = 16;          # number of sublists we'll maintain
my $enable_mru = 1;         # for testing without MRU option
my $found_within = 8;       # depth of histogram


    # statistics

my $finds = 0;              # total calls to find function
my $probes = 0;             # total number of probes
my $found = 0;              # number of times a symbol was found in the dictionary
my $mru_moves = 0;          # number of times a symbol moved to start of list
my $dict_adds = 0;          # number of new symbols added to dictionary
my $numbers = 0;            # number of symbols assumed to be numbers
my @within;                 # histogram of how deep the found symbols were


    # One of the stupider hash functions, chosen for speed
    # of producing a value rather than an ideal distribution.
    # This is just the sum of the string's length and all
    # the characters of the string, modulo the number of
    # sublists.

sub hash {
    my $symbol = lc shift;
    my $bits = length $symbol;
    $bits += ord foreach (split //, $symbol);
    return  $bits % $sublists;
}



sub find {
    my $symbol = lc shift;
    my $sublist = \@{$dict{hash $symbol}};                  # get reference to this symbol's sublist
    $finds++;                                               # how many calls to find made

    for (my $depth = 0; $depth <= $#{$sublist}; $depth++) {
        $probes++;                                          # keep track of probes
        if ($symbol eq ${$sublist}[$depth]) {               # if symbol found
            $within[$depth]++ if $depth < $found_within;    # maintain histogram of depths where symbol found
            if ($enable_mru && $depth > 0) {                # if symbol wasn't at start of list
                splice @{$sublist}, $depth, 1;              # remove item
                unshift @{$sublist}, $symbol;               # add to start of list
                $mru_moves++;                               # keep statistics on moves
            }
            return;
        }
    }

    if ($symbol =~ /^-?[0-9.]+$/) {                           # if symbol is a number
        $numbers++;                                         # don't add to dictionary, just count it
    } else {
        unshift @{$sublist}, $symbol;                       # add new symbol to start of list
        $dict_adds++;                                       # keep statistics on adds
    }
}



# The dictionary is seeded with the list of words taken from
# the ANS Forth standard.  The following words are added in
# ASCII order to each sub-list.  This probably doesn't
# matter much, as the MRU cache behavior will reorder each
# sub-list once we start searching.

$dict{$_} = [] foreach (0 .. $sublists);

push @{$dict{hash $_}}, lc $_ foreach (map {split} (<DATA>));


# Read in Forth source, grab words, try to find them.

while (<>) {
    s/\\.*$/\\ /;       # get rid of line comments
    s/\( .*?\)/( /;     # get rid of paren comments
    s/" .*?"/" /;       # get rid of double-quoted strings

    find($_) foreach (split);
}


# Write MRU histogram and other information.

printf(
    "%2d: %4d (%5.2f)\n", 
    $_, $within[$_], 100*($within[$_] / $finds)
) foreach (0 .. $found_within-1);

my $sum;
$sum += $within[$_] foreach (0 .. $found_within-1);
printf("that's %d total within the first %d or %5.2f\n", $sum, $found_within, 100*($sum/$finds));


showDict();
printf("finds=%4d probes=%4d moves=%4d adds=%4d numbers=%4d\n", $finds, $probes, $mru_moves, $dict_adds, $numbers);


sub showDict {
    printf(
        "%2d (%5d): %s ...\n", 
        $_, scalar @{$dict{$_}}, join(' ', @{$dict{$_}}[0 .. $found_within-1])
    ) foreach (0 .. $sublists-1);
    print "\n";
}


__DATA__
< <> <# = > - , ; : ! ? / . ." .( ' ( [ ['] ] @ * */ \ # #> + +!
0< 0<> 0= 0> 1- 1+ 2! 2/ 2@ 2* 2CONSTANT 2DROP 2DUP 2LITERAL 2OVER 2>R
2R> 2R@ 2ROT 2SWAP 2VARIABLE ABORT ABORT" ABS ACCEPT AGAIN AHEAD ALIGN
ALIGNED ALLOCATE ALLOT ALSO AND ASSEMBLER AT-XY BASE BEGIN BIN BL BLANK
BLK BLOCK >BODY BUFFER BYE C, C! C" C@ CASE CATCH CELL+ CELLS CHAR [CHAR]
CHAR+ CHARS CLOSE-FILE CMOVE CMOVE> CODE ;CODE COMPARE [COMPILE] COMPILE,
CONSTANT CONVERT COUNT CR CREATE CREATE-FILE CS-PICK CS-ROLL D< D= D- D.
D+ D0< D0= D2/ D2* DABS DECIMAL DEFINITIONS DELETE-FILE DEPTH D>F DF!
DF@ DFALIGN DFALIGNED DFLOAT+ DFLOATS DMAX DMIN DNEGATE DO ?DO DOES>
D.R DROP D>S DU< DUMP DUP ?DUP EDITOR EKEY EKEY? EKEY>CHAR ELSE [ELSE]
EMIT EMIT? EMPTY-BUFFERS ENDCASE ENDOF ENVIRONMENT? ERASE EVALUATE
EXECUTE EXIT EXPECT F~ F< F- F! F/ F. F@ F* F** F+ F0< F0= FABS
FACOS FACOSH FALIGN FALIGNED FALOG FALSE FASIN FASINH FATAN FATAN2
FATANH FCONSTANT FCOS FCOSH F>D FDEPTH FDROP FDUP FE. FEXP FEXPM1
FILE-POSITION FILE-SIZE FILE-STATUS FILL FIND FLITERAL FLN FLNP1 >FLOAT
FLOAT+ FLOATS FLOG FLOOR FLUSH FLUSH-FILE FMAX FMIN FM/MOD FNEGATE FORGET
FORTH FORTH-WORDLIST FOVER FREE FROT FROUND FS. FSIN FSINCOS FSINH FSQRT
FSWAP FTAN FTANH FVARIABLE GET-CURRENT GET-ORDER HERE HEX HOLD I IF [IF]
IMMEDIATE >IN INCLUDED INCLUDE-FILE INVERT J KEY KEY? LEAVE LIST LITERAL
LOAD (LOCAL) LOCALS| LOOP +LOOP LSHIFT M* M*/ M+ MARKER MAX MIN MOD /MOD
*/MOD MOVE MS NEGATE NIP :NONAME >NUMBER OF ONLY OPEN-FILE OR ORDER OVER
PAD PAGE PARSE PICK POSTPONE PRECISION PREVIOUS QUERY QUIT >R .R R> R@
READ-FILE READ-LINE RECURSE REFILL RENAME-FILE REPEAT REPOSITION-FILE
REPRESENT RESIZE RESIZE-FILE RESTORE-INPUT R/O ROLL ROT RSHIFT R/W .S #S
S" SAVE-BUFFERS SAVE-INPUT SCR S>D SEARCH SEARCH-WORDLIST SEE SET-CURRENT
SET-ORDER SET-PRECISION SF! SF@ SFALIGN SFALIGNED SFLOAT+ SFLOATS SIGN
SLITERAL SM/REM SOURCE SOURCE-ID SPACE SPACES SPAN STATE /STRING SWAP THEN
[THEN] THROW THRU TIB #TIB TIME&DATE TO -TRAILING TRUE TUCK TYPE U< U> U.
UM* UM/MOD UNLOOP UNTIL UNUSED UPDATE U.R VALUE VARIABLE WHILE WITHIN
W/O WORD WORDLIST WORDS WRITE-FILE WRITE-LINE XOR
