// crm114_config.h - Controllable Regex Mutilator base config, version X0.1
// Copyright 2001 William S. Yerazunis, all rights reserved.
//
// This software is licensed to the public under the Free Software
// Foundation's GNU GPL, version 1.0.  You may obtain a copy of the
// GPL by visiting the Free Software Foundations web site at
// www.fsf.org .  Other licenses may be negotiated; contact the
// author for details.
//
// Compile-time tunables: every value in this file is a plain
// preprocessor constant, one directive per line.
//

// default size of the variables hashtable (a.k.a. the VHT)
#define DEFAULT_VHT_SIZE 4095

// default limit on the control stack (for catching infinite loops,
// not a preallocated variable)
#define DEFAULT_CSTK_LIMIT 1024

// how many levels (pending operations) will we allow in
// math evaluations.  We _could_ have it be unlimited, but
// this serves as an error catcher in runaway programs.
#define DEFAULT_MATHSTK_LIMIT 1024

// default maximum number of lines in any program file
#define DEFAULT_MAX_PGMLINES 10000

// define maximum number of inserts before we think we're in an
// infinite loop...
#define DEFAULT_MAX_INSERTS 1024

// default size of the data window: 8 megabytes (8 * 1024 * 1024).
// The commented-out alternatives are 16 megabytes and 1 megabyte.
#define DEFAULT_DATA_WINDOW 8388608
//#define DEFAULT_DATA_WINDOW 16777216
//#define DEFAULT_DATA_WINDOW 1048576

//
// do we use Sparse Binary Polynomial Hashing (sensitive to both
// sequence and spacing of individual words), Token Grab Bag, or
// Token Sequence Sensitive?  Testing against the SpamAssassin
// "hard" database shows that SBPH, TGB, and TGB2, are somewhat
// more accurate than TSS, and about 50% more accurate than First
// Order Only.  However, that's for English, and other natural
// languages may show a different statistical distribution.
//
// Choose ONE of the following:
//     SBPH, TGB2, TGB, TSS, or ARBITRARY_WINDOW_LENGTH:
//
// *** DANGER, WILL ROBINSON ***  You MUST rebuild your .css files from
// samples of text if you change this.
//
// Feature-generation scheme.  Exactly one of the scheme macros below
// should be left uncommented.
//
//     Sparse Binary Polynomial Hashing
#define SBPH
//
//     Token Grab Bag, noaliasing
//#define TGB2
//
//     Token Grab Bag, aliasing
//#define TGB
//
//     Token Sequence Sensitive
//#define TSS
//
//     First Order Only (i.e. single words, like SpamBayes)
//     Note- if you use FOO, you must turn off weights!!
//#define FOO
//
//     Generalized format for the window length.
//
//  DO NOT SET THIS TO MORE THAN 10 WITHOUT LENGTHENING hctable
//  in the classifier modules !!!!!!  "hctable" contains the pipeline
//  hashing coefficients and needs to be extended to 2 * WINDOW_LEN
//
//     Generic window length code
//#define ARBITRARY_WINDOW_LENGTH
//
#define MARKOVIAN_WINDOW_LEN 5
//
#define OSB_BAYES_WINDOW_LEN 5
//
//
//     Winnow algorithm parameters here...
//
#define OSB_WINNOW_WINDOW_LEN 5
#define OSB_WINNOW_PROMOTION 1.23
#define OSB_WINNOW_DEMOTION 0.83
//
//     Now, choose whether we want to use the "old" or the "new" local
//     probability calculation.  The "old" one works slightly better
//     for SBPH and much better for TSS, the "new" one works slightly
//     better for TGB and TGB2, and _much_ better for FOO
//
//     The current default (not necessarily optimal)
//     is Markovian SBPH, STATIC_LOCAL_PROBABILITIES,
//     LOCAL_PROB_DENOM = 16, and SUPER_MARKOV
//
//#define LOCAL_PROB_DENOM 2.0
#define LOCAL_PROB_DENOM 16.0
//#define LOCAL_PROB_DENOM 256.0
#define STATIC_LOCAL_PROBABILITIES
//#define LENGTHBASED_LOCAL_PROBABILITIES
//
//     Feature weighting scheme; one is left uncommented.
//#define ENTROPIC_WEIGHTS
//#define MARKOV_WEIGHTS
#define SUPER_MARKOV_WEIGHTS
//#define BREYER_CHHABRA_SIEFKES_WEIGHTS
//#define BREYER_CHHABRA_SIEFKES_BASE7_WEIGHTS
//#define BCS_MWS_WEIGHTS
//#define BCS_EXP_WEIGHTS
//
//
//
//     Do we take only the maximum probability feature?
//
//#define USE_PEAK
//
//     define the default max chain length in a .css file that triggers
//     autogrooming, the rescale factor when we rescale, and how often
//     we rescale, and what chance (mask and key) for any particular
//     slot to get rescaled when a rescale is triggered for that slot chain.
//#define MICROGROOM_CHAIN_LENGTH 1024
#define MICROGROOM_CHAIN_LENGTH 256
#define MICROGROOM_RESCALE_FACTOR .75
#define MICROGROOM_STOCHASTIC_MASK 0x0000000F
#define MICROGROOM_STOCHASTIC_KEY 0x00000001
#define MICROGROOM_STOP_AFTER 32   // maximum number of buckets groom-zeroed

#define FEATURE_HIT_INCREMENT_SIZE 7

//     define the "block ratio" of how much of a memory data window we're
//     willing to suck in from a minion process before we block on
//     input.  Normally a factor of 2 (1/4th of the size of a full memory
//     window, or 2 megabytes in the default configuration) is sufficient.
//     NOTE(review): "factor of 2" giving 1/4th suggests the value is not
//     a plain divisor - confirm against the code that uses it.
#define SYSCALL_WINDOW_RATIO 2

//     define default internal debug level
#define DEFAULT_INTERNAL_TRACE_LEVEL 0

//     define default user debug level
#define DEFAULT_USER_TRACE_LEVEL 0

//     define maximum number of parenthesized sub regexes we'll accept
#define MAX_SUBREGEX 256

//     define maximum bracket depth nesting we'll allow....
#define MAX_BRACKETDEPTH 256 // define maximum number of iterations allowed for EVAL expansion //#define MAX_EVAL_ITERATIONS 16384 //#define MAX_EVAL_ITERATIONS 1024 #define MAX_EVAL_ITERATIONS 4096 // define maximum size of a pattern in bytes #define MAX_PATTERN 16384 // and how long can a variable name be #define MAX_VARNAME 2048 // define the default number of bytes in a learning file hash table // (note that this should be a prime number, or at least one with a // lot of big factors) // // this value (1048577) is one more than a meg, for a .css of 12 megs #define DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH 1048577 // this value (2097153) is one more than 2 megs, for a .css of 24 megs //#define DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH 2097153 // // define the maximum length of a filename #define MAX_FILE_NAME_LEN 255 // define how many microseconds to sleep waiting for a minion process // to complete: #define MINION_SLEEP_USEC 1000 // How many microseconds to sleep if we're looping on input WINDOW stmt. // try .1 second for now #define INPUT_WINDOW_SLEEP_USEC 1000 // Maximum number of different .CSS files in a CLASSIFY #define MAX_CLASSIFIERS 128 // Maximum number of nonfatal errors we'll allow before tossing our // cookies on a fatal error #define MAX_NONFATAL_ERRORS 100 // How big is a feature bucket? Is it a byte, a short, a long, // a float, whatever. :) #define FEATUREBUCKET_TYPE FEATUREBUCKET_STRUCT #define FEATUREBUCKET_VALUE_MAX 32767 //#define FEATUREBUCKET_VALUE_MAX 1000000 //#define FEATUREBUCKET_TYPE unsigned short // End of configurable parameters. #include "config.h"