// crm_osb_bayes.c - Controllable Regex Mutilator, version v1.0 // Copyright 2001-2004 William S. Yerazunis, all rights reserved. // // This software is licensed to the public under the Free Software // Foundation's GNU GPL, version 2. You may obtain a copy of the // GPL by visiting the Free Software Foundations web site at // www.fsf.org, and a copy is included in this distribution. // // Other licenses may be negotiated; contact the // author for details. // // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the command line argc, argv extern int prog_argc; extern char **prog_argv; // the auxilliary input buffer (for WINDOW input) extern char *newinputbuf; // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *inbuf; extern char *outbuf; extern char *tempbuf; //////////////////////////////////////////////////////////////////// // // the hash coefficient table (hctable) should be full of relatively // prime numbers, and preferably superincreasing, though both of those // are not strict requirements. // static long hctable[] = { 1, 7, 3, 13, 5, 29, 11, 51, 23, 101, 47, 203, 97, 407, 197, 817, 397, 1637, 797, 3277 }; // Where does the nominative data start? static long spectra_start; // How to learn Osb_Bayes style - in this case, we'll include the single // word terms that may not strictly be necessary. // int crm_expr_osb_bayes_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb) { // learn the osb_bayes transform spectrum of this input window as // belonging to a particular type. // learn (classname) /word/ // long i, j, k; long h; // h is our counter in the hashpipe; char ptext[MAX_PATTERN]; // the regex pattern long plen; char ltext[MAX_PATTERN]; // the variable to learn long llen; char htext[MAX_PATTERN]; // the hash name long hlen; long cflags, eflags; struct stat statbuf; // for statting the hash file int hfd; // hashfile fd long hfsize; // size of the hash file FEATUREBUCKET_TYPE *hashes; // the text of the hash file unsigned long hashpipe[OSB_BAYES_WINDOW_LEN+1]; // regex_t regcb; regmatch_t match[5]; // we only care about the outermost match long textoffset; long textmaxoffset; long sense; long vhtindex; long microgroom; long logboost; long fev; long made_new_file; if (internal_trace) fprintf (stderr, "executing a LEARN\n"); // Keep the gcc compiler from complaining about unused variables // i = hctable[0]; // extract the hash file name crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len); hlen = apb->p1len; hlen = crm_nexpandvar (htext, hlen, MAX_PATTERN); // // extract the variable name (if present) crm_get_pgm_arg (ltext, MAX_PATTERN, apb->b1start, apb->b1len); llen = apb->b1len; llen = crm_nexpandvar (ltext, llen, MAX_PATTERN); // get the "this is a word" regex crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len); plen = apb->s1len; plen = crm_nexpandvar (ptext, plen, MAX_PATTERN); // set our cflags, if needed. The defaults are // "case" and "affirm", (both zero valued). // and "microgroom" disabled. cflags = REG_EXTENDED; eflags = 0; sense = +1; if (apb->sflags & CRM_NOCASE) { cflags = cflags || REG_ICASE; eflags = 1; if (user_trace) fprintf (stderr, "turning oncase-insensitive match\n"); }; if (apb->sflags & CRM_REFUTE) { sense = -sense; if (user_trace) fprintf (stderr, " refuting learning\n"); }; microgroom = 0; if (apb->sflags & CRM_MICROGROOM) { microgroom = 1; if (user_trace) fprintf (stderr, " enabling microgrooming.\n"); }; logboost = 0; if (apb->sflags & CRM_LOGBOOST) { logboost = 1; if (user_trace) fprintf (stderr, " enabling LOGBOOST learning.\n"); }; // // grab the filename, and stat the file // note that neither "stat", "fopen", nor "open" are // fully 8-bit or wchar clean... i = 0; while (htext[i] < 0x021) i++; j = i; while (htext[j] >= 0x021) j++; // filename starts at i, ends at j. null terminate it. htext[j] = '\000'; // and stat it to get it's length k = stat (&htext[i], &statbuf); made_new_file = 0; // quick check- does the file even exist? if (k != 0) { // file didn't exist... create it FILE *f; if (user_trace) fprintf (stderr, "\nHad to create new CSS file %s\n", &htext[i]); f = fopen (&htext[i], "w"); if (!f) { fprintf (stderr, "\n Couldn't open your new CSS file %s for writing; errno=%d .\n", &htext[i], errno); exit (EXIT_FAILURE); }; // put in sparse_spectrum_file_length entries of NULL for (j = 0; j < sparse_spectrum_file_length * sizeof ( FEATUREBUCKET_TYPE); j++) fputc ('\000', f); made_new_file = 1; // fclose (f); // and reset the statbuf to be correct k = stat (&htext[i], &statbuf); }; // hfsize = statbuf.st_size; if (user_trace) fprintf (stderr, "Sparse spectra file %s has length %ld bins\n", &htext[i], hfsize / sizeof (FEATUREBUCKET_TYPE)); // // open the hash file into memory so we can bitwhack it // hfd = open (&(htext[i]), O_RDWR); if (hfd < 0) { fev = fatalerror ("Couldn't open the .css file named: ", &htext[i]); return (fev); }; // // map the .css file into memory // hashes = (FEATUREBUCKET_TYPE *) mmap (NULL, hfsize, PROT_READ + PROT_WRITE, MAP_SHARED, hfd, 0); if (hashes == MAP_FAILED) { fev = fatalerror ("Couldn't memory-map the .css file named: ", &htext[i]); return (fev); }; // if this is a new file, set the proper version number. if (made_new_file) { hashes[0].hash = 0; hashes[0].key = 0; hashes[0].value = 1; }; // check the version of the file // if (hashes[0].hash != 0 || hashes[0].key != 0 ) { fprintf (stderr, "Hash was: %ld, key was %ld\n", hashes[0].hash, hashes[0].key); fev =fatalerror ("The .css file is the wrong type! We're expecting " "a Osb_Bayes-spectrum file. The filename is: ", &htext[i]); return (fev); }; // // In this format, bucket 0.value contains the start of the spectra. // hashes[0].value = 1; spectra_start = hashes[0].value; // // now set the hfsize to the number of entries, not the number // of bytes total hfsize = hfsize / sizeof ( FEATUREBUCKET_TYPE ); // LOGBOOST IS DISABLED IN THIS RELEASE. DO NOT ENABLE IT, // AS IT GIVES _BAD_ RESULTS!!!! // // if logboost is engaged, get the log of the featurecount // if ( 0 ) // disabled logboost) { for (i = 1; i < hfsize; i++) { logboost = logboost + hashes[i].value; }; logboost = log10 (log10 ((logboost * 1.0))) * FEATURE_HIT_INCREMENT_SIZE; if (logboost < 1) logboost = 1; logboost = 1; }; // and if logboost is engaged, mangle "sense" as well. // if (logboost) sense = sense * logboost ; // compile the word regex // if ( internal_trace) fprintf (stderr, "\nWordmatch pattern is %s", ptext); i = crm_regcomp (®cb, ptext, plen, cflags); if ( i > 0) { crm_regerror ( i, ®cb, tempbuf, data_window_size); nonfatalerror ("Regular Expression Compilation Problem:", tempbuf); goto regcomp_failed; }; // Start by priming the pipe... we will shift to the left next. // sliding, hashing, xoring, moduloing, and incrmenting the // hashes till there are no more. k = 0; j = 0; i = 0; if (llen > 0) { vhtindex = crm_vht_lookup (vht, ltext, llen); } else { vhtindex = crm_vht_lookup (vht, ":_dw:", 5); }; if (vht[vhtindex] == NULL) { long q; q = fatalerror (" Attempt to LEARN from a nonexistent variable ", ltext); return (q); }; mdw = NULL; if (tdw->filetext == vht[vhtindex]->valtxt) mdw = tdw; if (cdw->filetext == vht[vhtindex]->valtxt) mdw = cdw; if (mdw == NULL) { long q; q = fatalerror (" Bogus text block containing variable ", ltext); return (q); } textoffset = vht[vhtindex]->vstart; textmaxoffset = textoffset + vht[vhtindex]->vlen; // init the hashpipe with 0xDEADBEEF for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++) { hashpipe[h] = 0xDEADBEEF; }; // and the big loop... i = 0; while (k == 0 && textoffset <= textmaxoffset) { long wlen; long slen; // do the regex // slen = endpoint (= start + len) // - startpoint (= curr textoffset) slen = vht[vhtindex]->vstart + vht[vhtindex]->vlen - textoffset ; k = crm_regexec ( ®cb, &(mdw->filetext[textoffset]), slen, 5, match, 0, NULL); if (k != 0 || textoffset > textmaxoffset) goto learn_end_regex_loop; { wlen = match[0].rm_eo - match[0].rm_so; memmove (tempbuf, &(mdw->filetext[textoffset + match[0].rm_so]), wlen); tempbuf[wlen] = '\000'; if (internal_trace) { fprintf (stderr, " Learn #%ld t.o. %ld strt %ld end %ld len %ld is -%s-\n", i, textoffset, (long) match[0].rm_so, (long) match[0].rm_eo, wlen, tempbuf); }; if (match[0].rm_eo == 0) { nonfatalerror ( "The LEARN pattern matched zero length! ", "\n Forcing an increment to avoid an infinite loop."); match[0].rm_eo = 1; }; // Shift the hash pipe down one // for (h = OSB_BAYES_WINDOW_LEN-1; h > 0; h--) { hashpipe [h] = hashpipe [h-1]; }; // and put new hash into pipeline hashpipe[0] = strnhash (tempbuf, wlen); if (internal_trace) { fprintf (stderr, " Hashpipe contents: "); for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++) fprintf (stderr, " %ld", hashpipe[h]); fprintf (stderr, "\n"); }; // and account for the text used up. textoffset = textoffset + match[0].rm_eo; i++; // is the pipe full enough to do the hashing? if (1) // we always run the hashpipe now, even if it's // just full of 0xDEADBEEF. (was i >=5) { unsigned long hindex; unsigned long h1, h2; long th = 0; // a counter used for TSS tokenizing unsigned long incrs; long j; // // old Hash polynomial: h0 + 3h1 + 5h2 +11h3 +23h4 // (coefficients chosen by requiring superincreasing, // as well as prime) // th = 0; // for (j = 1; j < OSB_BAYES_WINDOW_LEN; j++) { h1 = hashpipe[0]*hctable[0] + hashpipe[j] * hctable[j<<1]; if (h1 < spectra_start) h1 = spectra_start; h2 = hashpipe[0]*hctable[1] + hashpipe[j] * hctable[(j<<1)-1]; if (h2 == 0) h2 = 0xdeadbeef; hindex = h1 % hfsize; if (hindex < spectra_start ) hindex = spectra_start; if (internal_trace) fprintf (stderr, "Polynomial %ld has h1:%ld h2: %ld\n", j, h1, h2); // // we now look at both the primary (h1) and // crosscut (h2) indexes to see if we've got // the right bucket or if we need to look further // incrs = 0; while ( hashes[hindex].key != 0 && ( hashes[hindex].hash != h1 || hashes[hindex].key != h2 )) { // incrs++; // // If microgrooming is enabled, and we've found a // chain that's too long, we groom it down. // if (microgroom && (incrs > MICROGROOM_CHAIN_LENGTH)) { // set the random number generator up... // note that this is repeatable for a // particular test set, yet dynamic. That // way, we don't always autogroom away the // same feature; we depend on the previous // feature's key. srand ( (unsigned int) h2); // // and do the groom. crm_microgroom (hashes, hfsize, hindex); // since things may have moved after a // microgroom, restart our search hindex = h1 % hfsize; if (hindex < spectra_start ) hindex = spectra_start; incrs = 0; }; // check to see if we've incremented ourself all the // way around the .css file. If so, we're full, and // can hold no more features (this is unrecoverable) if (incrs > hfsize - 3) { nonfatalerror ("Your program is stuffing too many " "features into this size .css file. " "Adding any more features is " "impossible in this file.", "You are advised to build a larger " ".css file and merge your data into " "it."); goto learn_end_regex_loop; }; hindex++; if (hindex >= hfsize) hindex = spectra_start; }; if (internal_trace) { if (hashes[hindex].value == 0) { fprintf (stderr,"New feature at %ld\n", hindex); } else { fprintf (stderr, "Old feature at %ld\n", hindex); }; }; // always rewrite hash and key, as they may be incorrect // (on a reused bucket) or zero (on a fresh one) // hashes[hindex].hash = h1; hashes[hindex].key = h2; // watch out - sense may be both + or -, so check before // adding it... // if (sense > 0 && hashes[hindex].value + sense >= FEATUREBUCKET_VALUE_MAX-1) hashes[hindex].value = FEATUREBUCKET_VALUE_MAX - 1; else if ( sense < 0 && hashes[hindex].value <= -sense ) hashes[hindex].value = 0; else hashes[hindex].value += sense; }; }; }; }; // end the while k==0 learn_end_regex_loop: regcomp_failed: // and remember to let go of the mmap and the pattern bufffer msync (hashes, hfsize * sizeof (FEATUREBUCKET_TYPE), MS_SYNC); munmap (hashes, hfsize * sizeof (FEATUREBUCKET_TYPE)); // Because mmap/munmap doesn't set atime, nor set the "modified" // flag, some network filesystems will fail to mark the file as // modified and so their cacheing will make a mistake. // // The fix is to do a trivial read/write on the .css ile, to force // the filesystem to repropagate it's caches. // { FEATURE_HEADER_STRUCT foo; read (hfd, &foo, sizeof(foo)); lseek (hfd, 0, SEEK_SET); write (hfd, &foo, sizeof(foo)); } close (hfd); crm_regfree (®cb); return (0); } // How to do a Osb_Bayes CLASSIFY some text. // int crm_expr_osb_bayes_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb) { // classify the sparse spectrum of this input window // as belonging to a particular type. // // This code should look very familiar- it's cribbed from // the code for LEARN // long i, j, k; long h; // we use h for our hashpipe counter, as needed. char ptext[MAX_PATTERN]; // the regex pattern long plen; char ltext[MAX_PATTERN]; // the variable to classify long llen; // the hash file names char htext[MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN]; long htext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN; long hlen; // the match statistics variable inbuf char stext[MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100)]; long stext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100); long slen; char svrbl[MAX_PATTERN]; // the match statistics text buffer long svlen; long fnameoffset; char fname[MAX_FILE_NAME_LEN]; long eflags; long cflags; long vhtindex; long not_microgroom = 1; struct stat statbuf; // for statting the hash file unsigned long hashpipe[OSB_BAYES_WINDOW_LEN+1]; regex_t regcb; regmatch_t match[5]; // we only care about the outermost match unsigned long fcounts[MAX_CLASSIFIERS]; // total counts for feature normalize unsigned long totalcount = 0; double cpcorr[MAX_CLASSIFIERS]; // corpus correction factors double hits[MAX_CLASSIFIERS]; // actual hits per feature per classifier long totalhits[MAX_CLASSIFIERS]; // actual total hits per classifier long totalfeatures; // total features double htf; // hits this feature got. double tprob; // total probability in the "success" domain. double textlen; // text length - rougly corresponds to // information content of the text to classify double ptc[MAX_CLASSIFIERS]; // current running probability of this class double renorm = 0.0; double pltc[MAX_CLASSIFIERS]; // current local probability of this class double plltc[MAX_CLASSIFIERS]; // current local probability of this class int hfds[MAX_CLASSIFIERS]; FEATUREBUCKET_TYPE *hashes[MAX_CLASSIFIERS]; long hashlens[MAX_CLASSIFIERS]; char *hashname[MAX_CLASSIFIERS]; long succhash; long vbar_seen; // did we see '|' in classify's args? long maxhash; long fnstart, fnlen; long fn_start_here; long textoffset; long textmaxoffset; long bestseen; long thistotal; long feature_weight = 1; double top10scores[10]; long top10polys[10]; char top10texts[10][MAX_PATTERN]; if (internal_trace) fprintf (stderr, "executing a CLASSIFY\n"); // extract the variable name (if present) // crm_get_pgm_arg (ltext, MAX_PATTERN, apb->b1start, apb->b1len); llen = apb->b1len; llen = crm_nexpandvar (ltext, llen, MAX_PATTERN); // extract the hash file names crm_get_pgm_arg (htext, htext_maxlen, apb->p1start, apb->p1len); hlen = apb->p1len; hlen = crm_nexpandvar (htext, hlen, htext_maxlen); // extract the "this is a word" regex // crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len); plen = apb->s1len; plen = crm_nexpandvar (ptext, plen, MAX_PATTERN); // extract the optional "match statistics" variable // crm_get_pgm_arg (svrbl, MAX_PATTERN, apb->p2start, apb->p2len); svlen = apb->p2len; svlen = crm_nexpandvar (svrbl, svlen, MAX_PATTERN); { long vstart, vlen; crm_nextword (svrbl, svlen, 0, &vstart, &vlen); memmove (svrbl, &svrbl[vstart], vlen); svlen = vlen; svrbl[vlen] = '\000'; }; // status variable's text (used for output stats) // stext[0] = '\000'; slen = 0; // set our flags, if needed. The defaults are // "case" cflags = REG_EXTENDED; eflags = 0; if (apb->sflags & CRM_NOCASE) { cflags += REG_ICASE; eflags = 1; }; not_microgroom = 1; if (apb->sflags & CRM_MICROGROOM) { not_microgroom = 0; if (user_trace) fprintf (stderr, " disabling fast-skip optimization.\n"); }; // compile the word regex if ( internal_trace) fprintf (stderr, "\nWordmatch pattern is %s", ptext); i = crm_regcomp (®cb, ptext, plen, cflags); if ( i > 0) { crm_regerror ( i, ®cb, tempbuf, data_window_size); nonfatalerror ("Regular Expression Compilation Problem:", tempbuf); goto regcomp_failed; }; // Now, the loop to open the files. bestseen = 0; thistotal = 0; // goodcount = evilcount = 1; // prevents a divide-by-zero error. //cpgood = cpevil = 0.0; //ghits = ehits = 0.0 ; //psucc = 0.5; //pfail = (1.0 - psucc); //pic = 0.5; //pnic = 0.5; // initialize our arrays for N .css files for (i = 0; i < MAX_CLASSIFIERS; i++) { fcounts[i] = 0; // check later to prevent a divide-by-zero // error on empty .css file cpcorr[i] = 0.0; // corpus correction factors hits[i] = 0.0; // absolute hit counts totalhits[i] = 0.0; // absolute hit counts ptc[i] = 0.5; // priori probability pltc[i] = 0.5; // local probability }; for (i = 0; i < 10; i++) { top10scores[i] = 0; top10polys[i] = 0; strcpy (top10texts[i], ""); }; // -- probabilistic evaluator --- // S = success; A = a testable attribute of success // ns = not success, na = not attribute // the chain rule we use is: // // P(A|S) P(S) // P (S|A) = ------------------------- // P(A|S) P(S) + P(A|NS) P(NS) // // and we apply it repeatedly to evaluate the final prob. For // the initial a-priori probability, we use 0.5. The output // value (here, P(S|A) ) becomes the new a-priori for the next // iteration. // // Extension - we generalize the above to I classes as and feature // F as follows: // // P(F|Ci) P(Ci) // P(Ci|F) = ---------------------------------------- // Sum over all classes Ci of P(F|Ci) P(Ci) // // We also correct for the unequal corpus sizes by multiplying // the probabilities by a renormalization factor. if Tg is the // total number of good features, and Te is the total number of // evil features, and Rg and Re are the raw relative scores, // then the corrected relative scores Cg aqnd Ce are // // Cg = (Rg / Tg) // Ce = (Re / Te) // // or Ci = (Ri / Ti) // // Cg and Ce can now be used as "corrected" relative counts // to calculate the naive Bayesian probabilities. // // Lastly, the issue of "over-certainty" rears it's ugly head. // This is what happens when there's a zero raw count of either // good or evil features at a particular place in the file; the // strict but naive mathematical interpretation of this is that // "feature A never/always occurs when in good/evil, hence this // is conclusive evidence of good/evil and the probabilities go // to 1.0 or 0.0, and are stuck there forevermore. We use the // somewhat ad-hoc interpretation that it is unreasonable to // assume that any finite number of samples can appropriately // represent an infinite continuum of spewage, so we can bound // the certainty of any meausre to be in the range: // // limit: [ 1/featurecount+2 , 1 - 1/featurecount+2]. // // The prior bound is strictly made-up-on-the-spot and has NO // strong theoretical basis. It does have the nice behavior // that for feature counts of 0 the probability is clipped to // [0.5, 0.5], for feature counts of 1 to [0.333, 0.666] // for feature counts of 2 to [0.25, 0.75], for 3 to // [0.2, 0.8], for 4 to [0.166, 0.833] and so on. // vbar_seen = 0; maxhash = 0; succhash = 0; fnameoffset = 0; // now, get the file names and mmap each file // get the file name (grody and non-8-bit-safe, but doesn't matter // because the result is used for open() and nothing else. // GROT GROT GROT this isn't NULL-clean on filenames. But then // again, stdio.h itself isn't NULL-clean on filenames. if (user_trace) fprintf (stderr, "Classify list: -%s- \n", htext); fn_start_here = 0; fnlen = 1; while ( fnlen > 0 && ((maxhash < MAX_CLASSIFIERS-1))) { crm_nextword (htext, hlen, fn_start_here, &fnstart, &fnlen); if (fnlen > 0) { strncpy (fname, &htext[fnstart], fnlen); fn_start_here = fnstart + fnlen + 1; fname[fnlen] = '\000'; if (user_trace) fprintf (stderr, "Classifying with file -%s- "\ "succhash=%ld, maxhash=%ld\n", fname, succhash, maxhash); if ( fname[0] == '|' && fname[1] == '\000') { if (vbar_seen) { nonfatalerror ("Only one ' | ' allowed in a CLASSIFY. \n" , "We'll ignore it for now."); } else { succhash = maxhash; }; vbar_seen ++; } else { // be sure the file exists // stat the file to get it's length k = stat (fname, &statbuf); // quick check- does the file even exist? if (k != 0) { nonfatalerror ("Nonexistent Classify table named: ", fname); } else { // file exists - do the open/process/close // hashlens[maxhash] = statbuf.st_size; // mmap the hash file into memory so we can bitwhack it hfds[maxhash] = open (fname, O_RDONLY); if (hfds[maxhash] < 0) { nonfatalerror ("Couldn't open the table file", fname); } else { hashes[maxhash] = (FEATUREBUCKET_TYPE *) mmap (NULL, hashlens[maxhash], PROT_READ, MAP_SHARED, hfds[maxhash], 0); if (hashes[maxhash] == MAP_FAILED ) { nonfatalerror ("Couldn't memory-map the table file", fname); } else { // // Check to see if this file is the right version // long fev; if (hashes[maxhash][0].hash != 0 || hashes[maxhash][0].key != 0) { fev =fatalerror ("The .css file is the wrong version! Filename is: ", fname); return (fev); }; // grab the start of the actual spectrum data. // spectra_start = hashes[maxhash][0].value; // set this hashlens to the length in features instead // of the length in bytes. hashlens[maxhash] = hashlens[maxhash] / sizeof (FEATUREBUCKET_TYPE); hashname[maxhash] = (char *) malloc (fnlen+10); if (!hashname[maxhash]) untrappableerror( "Couldn't malloc hashname[maxhash]\n","We need that part later, so we're stuck. Sorry."); strncpy(hashname[maxhash],fname,fnlen); hashname[maxhash][fnlen]='\000'; maxhash++; }; }; }; }; if (maxhash > MAX_CLASSIFIERS-1) nonfatalerror ("Too many classifier files.", "Some may have been disregarded"); }; }; // // If there is no '|', then all files are "success" files. if (succhash == 0) succhash = maxhash; // a CLASSIFY with no arguments is always a "success". if (maxhash == 0) return (0); // now, set up the normalization factor fcount[] if (user_trace) fprintf (stderr, "Running with %ld files for success out of %ld files\n", succhash, maxhash ); // sanity checks... Uncomment for super-strict CLASSIFY. // // do we have at least 1 valid .css files? if (maxhash == 0) { fatalerror ("Couldn't open at least 2 .css files for classify().", ""); }; // do we have at least 1 valid .css file at both sides of '|'? //if (!vbar_seen || succhash < 0 || (maxhash < succhash + 2)) // { // nonfatalerror ( // "Couldn't open at least 1 .css file per SUCC | FAIL classes " // " for classify().\n","Hope you know what are you doing."); // }; { long ifile; long k; // count up the total first for (ifile = 0; ifile < maxhash; ifile++) { fcounts[ifile] = 0; for (k = 1; k < hashlens[ifile]; k++) fcounts [ifile] = fcounts[ifile] + hashes[ifile][k].value; if (fcounts[ifile] == 0) fcounts[ifile] = 1; totalcount = totalcount + fcounts[ifile]; }; // // calculate cpcorr (count compensation correction) // for (ifile = 0; ifile < maxhash; ifile++) { // cpcorr [ifile] = ( totalcount / (fcounts[ifile] * (maxhash-1))); // // disable cpcorr for now... unclear that it's useful. cpcorr[ifile] = 1.0; }; }; // // now all of the files are mmapped into memory, // and we can do the polynomials and add up points. i = 0; j = 0; k = 0; thistotal = 0; if (llen > 0) { vhtindex = crm_vht_lookup (vht, ltext, llen ); } else { vhtindex = crm_vht_lookup (vht, ":_dw:", 5); } if (vht[vhtindex] == NULL) { return (fatalerror (" Attempt to CLASSIFY from a nonexistent variable ", ltext)); }; mdw = NULL; if (tdw->filetext == vht[vhtindex]->valtxt) mdw = tdw; if (cdw->filetext == vht[vhtindex]->valtxt) mdw = cdw; if (mdw == NULL) return ( fatalerror (" Bogus text block containing variable ", ltext)); textoffset = vht[vhtindex]->vstart; textmaxoffset = textoffset + vht[vhtindex]->vlen; textlen = (vht[vhtindex]->vlen); if (textlen < 1.0) textlen = 1.0; // init the hashpipe with 0xDEADBEEF for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++) { hashpipe[h] = 0xDEADBEEF; }; totalfeatures = 0; // stop when we no longer get any regex matches // possible edge effect here- last character must be matchable, yet // it's also the "end of buffer". while (k == 0 && textoffset <= textmaxoffset) { long wlen; long slen; // do the regex slen = vht[vhtindex]->vstart + vht[vhtindex]->vlen - textoffset ; k = crm_regexec (®cb, &(mdw->filetext[textoffset]), slen, 1, match, 0, NULL); if (k != 0 || textoffset > textmaxoffset) goto classify_end_regex_loop; wlen = match[0].rm_eo - match[0].rm_so; memmove (tempbuf, &(mdw->filetext[textoffset + match[0].rm_so]), wlen); tempbuf[wlen] = '\000'; if (match[0].rm_eo == 0) { nonfatalerror("The CLASSIFY pattern matched zero length! ", "\n Forcing an increment to avoid an infinite loop."); match[0].rm_eo = 1; }; if (internal_trace) { fprintf (stderr, " Classify #%ld t.o. %ld strt %ld end %ld len %ld is -%s-\n", i, textoffset, (long) match[0].rm_so, (long) match[0].rm_eo, wlen, tempbuf); }; if (match[0].rm_eo == 0) { nonfatalerror ( "The CLASSIFYpattern matched zero length! ", "\n Forcing an increment to avoid an infinite loop."); match[0].rm_eo = 1; }; // slide previous hashes up 1 for (h = OSB_BAYES_WINDOW_LEN-1; h > 0; h--) { hashpipe [h] = hashpipe [h-1]; }; // and put new hash into pipeline hashpipe[0] = strnhash ( tempbuf, wlen); if (0) { fprintf (stderr, " Hashpipe contents: "); for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++) fprintf (stderr, " %ld", hashpipe[h]); fprintf (stderr, "\n"); }; // account for the text we used up... textoffset = textoffset + match[0].rm_eo; i++; // is the pipe full enough to do the hashing? if (1) // we init with 0xDEADBEEF, so the pipe is always full (i >=5) { int j, k, l; unsigned th=0; // a counter used only in TSS hashing unsigned long hindex; unsigned long h1, h2; //unsigned long good, evil; // th = 0; // for (j = 1; j < OSB_BAYES_WINDOW_LEN; j++) { h1 = hashpipe[0]*hctable[0] + hashpipe[j] * hctable[j<<1]; if (h1 < spectra_start) h1 = spectra_start; h2 = hashpipe[0]*hctable[1] + hashpipe[j] * hctable[(j<<1)-1]; if (h2 == 0) h2 = 0xdeadbeef; hindex = h1; if (internal_trace) fprintf (stderr, "Polynomial %d has h1:%ld h2: %ld\n", j, h1, h2); // // Note - a strict interpretation of Bayesian // chain probabilities should use 0 as the initial // state. However, because we rapidly run out of // significant digits, we use a much less strong // initial state. Note also that any nonzero // positive value prevents divide-by-zero // // Fidelis Assis suggests that 0, 24, 14, 7, 4, 0... // may be optimal. We keep the 0th term only // because it allows Arne's Optimization, NOT because // it improves accuracy. { int fw[10] = { 0, 24, 14, 7, 4, 2, 1, 0}; feature_weight = fw[j]; }; // feature_weight = 1; // // // Zero out "Hits This Feature" htf = 0; totalfeatures++; // // calculate the precursors to the local probabilities; // these are the hits[k] array, and the htf total. // for (k = 0; k < maxhash; k++) { long lh, lh0; lh = hindex % (hashlens[k]); if (lh < spectra_start ) lh = spectra_start; lh0 = lh; hits[k] = 0; while ( hashes[k][lh].key != 0 && ( hashes[k][lh].hash != h1 || hashes[k][lh].key != h2 )) { lh++; if (lh >= hashlens[k]) lh = spectra_start; if (lh == lh0) break; // wraparound }; if (hashes[k][lh].hash == h1 && hashes[k][lh].key == h2) { // l = hashes[k][lh].value * feature_weight; totalhits[k] = totalhits[k] + l; // remember totalhits hits[k] = l * cpcorr [k]; // remember corr. hits htf = htf + hits[k]; // and hits-this-feature }; }; // now update the probabilities. // // NOTA BENE: there are a bunch of different ways to // calculate local probabilities. The text below // refers to an experiment that may or may not make it // as the "best way". // // The hard part is this - what is the local in-class // versus local out-of-class probability given the finding // of a particular feature? // // I'm guessing this- the validity is the differntial // seen on the feature (that is, fgood - fevil ) // times the entropy of that feature as seen in the // corpus (that is, // // Pfeature*log2(Pfeature) // // = // totalcount_this_feature // --------------- * log2 (totalcount_this_feature) // totalcount_all_features // // (note, yes, the last term seems like it should be // relative to totalcount_all_features, but a bit of algebra // will show that if you view fgood and fevil as two different // signals, then you end up with + and - totalcount inside // the logarithm parenthesis, and they cancel out. // (the 0.30102 converts "log10" to "log2" - it's not // a magic number, it's just that log2 isn't in glibc) // // HACK ALERT- this code here is still under development // and should be viewed with the greatest possible // suspicion. :=) for (k = 0; k < maxhash; k++) { pltc[k] = 0.5 + (( hits[k] - (htf - hits[k])) / (LOCAL_PROB_DENOM * (htf + 1.0))); }; // Now, some funky magic. Our formula above is // mathematically correct (if features were // independent- something we conveniently ignore.), // but because of the limited word length in a real // computer, we can quickly run out of dynamic range // early in a CLASSIFY (P(S) indistinguishable from // 1.00) and then there is no way to recover. To // remedy this, we use two alternate forms of the // formula (in Psucc and Pfail) and use whichever // form that yields the smaller probability to // recalculate the value of the larger. // // The net result of this subterfuge is a nonuniform // representation of the probability, with a huge dynamic // range in two places - near 0.0, and near 1.0 , which // are the two places where we actually care. // // update the global probabilities // ptemp = ( pic*plic) / ((pic * plic) + (pnic * plnic)); // pnic = (pnic*plnic) / ((pic * plic) + (pnic * plnic)); // pic = ptemp; // calculate renormalizer (the Bayesian formula's denomenator) renorm = 0.0; // now calculate the per-ptc numerators plltc[0] = 0; // keep gcc from complaining about unused vars. for (k = 0; k < maxhash; k++) renorm = renorm + (ptc[k]*pltc[k]); for (k = 0; k < maxhash; k++) ptc[k] = (ptc[k] * pltc[k]) / renorm; // if we have underflow (any probability == 0.0 ) then // bump the probability back up to 10^-308, or // whatever a small multiple of the minimum double // precision value is on the current platform. // for (k = 0; k < maxhash; k++) if (ptc[k] < 10*DBL_MIN) ptc[k] = 10 * DBL_MIN; // // part 2) renormalize to sum probabilities to 1.0 // renorm = 0.0; for (k = 0; k < maxhash; k++) renorm = renorm + ptc[k]; for (k = 0; k < maxhash; k++) ptc[k] = ptc[k] / renorm; for (k = 0; k < maxhash; k++) if (ptc[k] < 10*DBL_MIN) ptc[k] = 10 * DBL_MIN; //if (pnic < pic) // { pic = 1.0 - pnic;} else { pnic = 1.0 - pic; }; if (internal_trace) { for (k = 0; k < maxhash; k++) { // fprintf (stderr, "ZZZ\n"); fprintf (stderr, " poly: %d filenum: %d, HTF: %7.0f, hits: %7.0f, Pl: %6.4e, Pc: %6.4e\n", j, k, htf, hits[k], pltc[k], ptc[k]); }; }; // // avoid the fencepost error for window=1 if ( OSB_BAYES_WINDOW_LEN == 1) { j = 99999; }; }; }; }; // end of repeat-the-regex loop classify_end_regex_loop: // cleanup time! // remember to let go of the fd's and mmaps for (k = 0; k < maxhash; k++) { close (hfds [k]); munmap (hashes[k], hashlens[k] * sizeof (FEATUREBUCKET_TYPE)); }; // and let go of the regex buffery crm_regfree (®cb); // and one last chance to force probabilities into the non-stuck zone // // if (pic == 0.0 ) pic = DBL_MIN; //if (pnic == 0.0 ) pnic = DBL_MIN; for (k = 0; k < maxhash; k++) if (ptc[k] < 10*DBL_MIN) ptc[k] = 10 * DBL_MIN; if (user_trace) { for (k = 0; k < maxhash; k++) fprintf (stderr, "Probability of match for file %ld: %f\n", k, ptc[k]); }; // tprob = 0.0; for (k = 0; k < succhash; k++) tprob = tprob + ptc[k]; if (svlen > 0) { char buf[1024]; double accumulator; double remainder; double overall_pR; long m; buf [0] = '\000'; accumulator = 10 * DBL_MIN; for (m = 0; m < succhash; m++) { accumulator = accumulator + ptc[m]; }; remainder = 10 * DBL_MIN; for (m = succhash; m < maxhash; m++) if (bestseen != m) { remainder = remainder + ptc[m]; }; overall_pR = log10 (accumulator) - log10 (remainder); // note also that strcat _accumulates_ in stext. // There would be a possible buffer overflow except that _we_ control // what gets written here. So it's no biggie. if (tprob >= 0.5000) { sprintf (buf, "CLASSIFY succeeds; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR ); } else { sprintf (buf, "CLASSIFY fails; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR ); }; if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); bestseen = 0; for (k = 0; k < maxhash; k++) if (ptc[k] > ptc[bestseen] ) bestseen = k; remainder = 10 * DBL_MIN; for (m = 0; m < maxhash; m++) if (bestseen != m) { remainder = remainder + ptc[m]; }; sprintf (buf, "Best match to file #%ld (%s) "\ "prob: %6.4f pR: %6.4f \n", bestseen, hashname[bestseen], ptc[bestseen], (log10(ptc[bestseen]) - log10(remainder))); if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); sprintf (buf, "Total features in input file: %ld\n", totalfeatures); if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); for (k = 0; k < maxhash; k++) { long m; remainder = 10 * DBL_MIN; for (m = 0; m < maxhash; m++) if (k != m) { remainder = remainder + ptc[m]; }; sprintf (buf, "#%ld (%s):"\ " features: %ld, hits: %ld, prob: %3.2e, pR: %6.2f \n", k, hashname[k], fcounts[k], totalhits[k], ptc[k], (log10 (ptc[k]) - log10 (remainder) ) ); // strcat (stext, buf); if (strlen(stext)+strlen(buf) <= stext_maxlen) strcat (stext, buf); }; // check here if we got enough room in stext to stuff everything // perhaps we'd better rise a nonfatalerror, instead of just // whining on stderr if (strcmp(&(stext[strlen(stext)-strlen(buf)]), buf) != 0) { nonfatalerror( "WARNING: not enough room in the buffer to create " "the statistics text. Perhaps you could try bigger " "values for MAX_CLASSIFIERS or MAX_FILE_NAME_LEN?", " "); }; crm_destructive_alter_nvariable (svrbl, svlen, stext, strlen (stext)); }; // // Free the hashnames, to avoid a memory leak. // for (i = 0; i < maxhash; i++) free (hashname[i]); if (tprob < 0.5000) { if (user_trace) fprintf (stderr, "CLASSIFY was a FAIL, skipping forward.\n"); // and do what we do for a FAIL here csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1; csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1; return (0); }; // // all done... if we got here, we should just continue execution if (user_trace) fprintf (stderr, "CLASSIFY was a SUCCESS, continuing execution.\n"); regcomp_failed: return (0); };