//	crm_markovian.c  - Controllable Regex Mutilator, version v1.0
//	Copyright 2001-2004  William S. Yerazunis, all rights reserved.
//
//	This software is licensed to the public under the Free Software
//	Foundation's GNU GPL, version 2.  You may obtain a copy of the
//	GPL by visiting the Free Software Foundation's web site at
//	www.fsf.org, and a copy is included in this distribution.
//
//	Other licenses may be negotiated; contact the author for details.
//
//  include some standard files
#include "crm114_sysincludes.h"

//  include any local crm114 configuration file
#include "crm114_config.h"

//  include the crm114 data structures file
#include "crm114_structs.h"

//  and include the routine declarations file
#include "crm114.h"

//  the command line argc, argv
extern int prog_argc;
extern char **prog_argv;

//  the auxiliary input buffer (for WINDOW input)
extern char *newinputbuf;

//  the globals used when we need a big buffer - allocated once, used
//  wherever needed.  These are sized to the same size as the data window.
extern char *inbuf;
extern char *outbuf;
extern char *tempbuf;

////////////////////////////////////////////////////////////////////
//
//    The hash coefficient table (hctable) should be full of relatively
//    prime numbers, and preferably superincreasing, though neither of
//    those is a strict requirement.  Even-indexed entries feed the
//    primary hash h1; odd-indexed entries feed the crosscut hash h2.
//
static long hctable[] =
  { 1, 7,
    3, 13,
    5, 29,
    11, 51,
    23, 101,
    47, 203,
    97, 407,
    197, 817,
    397, 1637,
    797, 3277 };

//    How to learn Markovian style.
//
int crm_expr_markov_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb)
{
  //     learn the sparse spectrum of this input window as
  //     belonging to a particular type.
  //     learn (classname) /word/
  //
  long i, j, k;
  long h;                      //  h is our counter in the hashpipe
  char ptext[MAX_PATTERN];     //  the regex pattern
  long plen;
  char ltext[MAX_PATTERN];     //  the variable to learn
  long llen;
  char htext[MAX_PATTERN];     //  the hash name
  long hlen;
  long cflags, eflags;
  struct stat statbuf;         //  for statting the hash file
  int hfd;                     //  hashfile fd
  long hfsize;                 //  size of the hash file
  FEATUREBUCKET_TYPE *hashes;  //  the text of the hash file
  CSL_CELL *mdw;               //  the data window that contains the variable
  unsigned long hashpipe[MARKOVIAN_WINDOW_LEN+1];
  regex_t regcb;
  regmatch_t match[5];         //  we only care about the outermost match
  long textoffset;
  long textmaxoffset;
  long sense;
  long vhtindex;
  long microgroom;
  long logboost;
  long fev;

  if (internal_trace)
    fprintf (stderr, "executing a LEARN\n");

  //   Keep the gcc compiler from complaining about unused variables
  i = hctable[0];

  //           extract the hash file name
  crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len);
  hlen = apb->p1len;
  hlen = crm_nexpandvar (htext, hlen, MAX_PATTERN);

  //           extract the variable name (if present)
  crm_get_pgm_arg (ltext, MAX_PATTERN, apb->b1start, apb->b1len);
  llen = apb->b1len;
  llen = crm_nexpandvar (ltext, llen, MAX_PATTERN);

  //           get the "this is a word" regex
  crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len);
  plen = apb->s1len;
  plen = crm_nexpandvar (ptext, plen, MAX_PATTERN);

  //            set our cflags, if needed.  The defaults are
  //            "case" and "affirm" (both zero-valued), with
  //            "microgroom" disabled.
  cflags = REG_EXTENDED;
  eflags = 0;
  sense = +1;
  if (apb->sflags & CRM_NOCASE)
    {
      cflags = cflags | REG_ICASE;
      eflags = 1;
      if (user_trace)
	fprintf (stderr, "turning on case-insensitive match\n");
    };
  if (apb->sflags & CRM_REFUTE)
    {
      sense = -sense;
      if (user_trace)
	fprintf (stderr, " refuting learning\n");
    };
  microgroom = 0;
  if (apb->sflags & CRM_MICROGROOM)
    {
      microgroom = 1;
      if (user_trace)
	fprintf (stderr, " enabling microgrooming.\n");
    };
  logboost = 0;
  if (apb->sflags & CRM_LOGBOOST)
    {
      logboost = 1;
      if (user_trace)
	fprintf (stderr, " enabling LOGBOOST learning.\n");
    };

  //
  //             grab the filename, and stat the file
  //      note that neither "stat", "fopen", nor "open" are
  //      fully 8-bit or wchar clean...
  i = 0;
  while (htext[i] < 0x021) i++;
  j = i;
  while (htext[j] >= 0x021) j++;

  //             filename starts at i, ends at j.  null terminate it.
  htext[j] = '\000';

  //             and stat it to get its length
  k = stat (&htext[i], &statbuf);

  //             quick check- does the file even exist?
  if (k != 0)
    {
      //      file didn't exist... create it
      FILE *f;
      if (user_trace)
	fprintf (stderr, "\nHad to create new CSS file %s\n", &htext[i]);
      f = fopen (&htext[i], "w");
      if (!f)
	{
	  fprintf (stderr,
		   "\n Couldn't open your new CSS file %s for writing; errno=%d .\n",
		   &htext[i], errno);
	  exit (EXIT_FAILURE);
	};
      //       put in sparse_spectrum_file_length entries of NULL
      for (j = 0;
	   j < sparse_spectrum_file_length * sizeof (FEATUREBUCKET_TYPE);
	   j++)
	fputc ('\000', f);
      //  fprintf (f,"%c", '\000');
      fclose (f);
      //    and reset the statbuf to be correct
      k = stat (&htext[i], &statbuf);
    };
  //
  hfsize = statbuf.st_size;
  if (user_trace)
    fprintf (stderr, "Sparse spectra file %s has length %ld bins\n",
	     &htext[i], hfsize / sizeof (FEATUREBUCKET_TYPE));

  //
  //      open the hash file into memory so we can bitwhack it
  //
  hfd = open (&(htext[i]), O_RDWR);
  if (hfd < 0)
    {
      fev = fatalerror ("Couldn't open the .css file named: ", &htext[i]);
      return (fev);
    };

  //
  //     map the .css file into memory
  //
  hashes = (FEATUREBUCKET_TYPE *) mmap (NULL, hfsize,
					PROT_READ | PROT_WRITE,
					MAP_SHARED, hfd, 0);
  if (hashes == MAP_FAILED)
    {
      fev = fatalerror ("Couldn't memory-map the .css file named: ",
			&htext[i]);
      return (fev);
    };

  //
  //        check the version of the file
  //
  if (hashes[0].hash != 0
      || hashes[0].key != 0
      || hashes[0].value != 0)
    {
      fev = fatalerror ("The .css file is the wrong version!  Filename is: ",
			&htext[i]);
      return (fev);
    };

  //
  //    now set the hfsize to the number of entries, not the number
  //    of bytes total
  hfsize = hfsize / sizeof (FEATUREBUCKET_TYPE);

  //   LOGBOOST IS DISABLED IN THIS RELEASE.  DO NOT ENABLE IT,
  //   AS IT GIVES _BAD_ RESULTS!!!!
  //
  //   if logboost is engaged, get the log of the featurecount
  //
  if ( 0 )     //  disabled; the condition was:  logboost
    {
      for (i = 1; i < hfsize; i++)
	{
	  logboost = logboost + hashes[i].value;
	};
      logboost = log10 (log10 ((logboost * 1.0))) * FEATURE_HIT_INCREMENT_SIZE;
      if (logboost < 1) logboost = 1;
      logboost = 1;
    };

  //   and if logboost is engaged, mangle "sense" as well.
  //
  if (logboost)
    sense = sense * logboost;

  //   compile the word regex
  //
  if (internal_trace)
    fprintf (stderr, "\nWordmatch pattern is %s", ptext);
  i = crm_regcomp (&regcb, ptext, plen, cflags);
  if (i > 0)
    {
      crm_regerror (i, &regcb, tempbuf, data_window_size);
      nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
      goto regcomp_failed;
    };

  //   Start by priming the pipe... we will shift to the left next.
  //   Sliding, hashing, xoring, moduloing, and incrementing the
  //   hashes till there are no more.
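  //
  //   An illustrative walk-through of the pipe mechanics (expository
  //   note, not from the original sources; the token hashes are
  //   made-up values and a window length of 5 is assumed):
  //
  //      start:        { DEADBEEF, DEADBEEF, DEADBEEF, DEADBEEF, DEADBEEF }
  //      after "now":  { H(now),   DEADBEEF, DEADBEEF, DEADBEEF, DEADBEEF }
  //      after "is":   { H(is),    H(now),   DEADBEEF, DEADBEEF, DEADBEEF }
  //      after "the":  { H(the),   H(is),    H(now),   DEADBEEF, DEADBEEF }
  //
  //   so slot 0 always holds the newest token, and higher slots hold
  //   progressively older ones.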
  k = 0;
  j = 0;
  i = 0;

  if (llen > 0)
    {
      vhtindex = crm_vht_lookup (vht, ltext, llen);
    }
  else
    {
      vhtindex = crm_vht_lookup (vht, ":_dw:", 5);
    };

  if (vht[vhtindex] == NULL)
    {
      long q;
      q = fatalerror (" Attempt to LEARN from a nonexistent variable ",
		      ltext);
      return (q);
    };
  mdw = NULL;
  if (tdw->filetext == vht[vhtindex]->valtxt)
    mdw = tdw;
  if (cdw->filetext == vht[vhtindex]->valtxt)
    mdw = cdw;
  if (mdw == NULL)
    {
      long q;
      q = fatalerror (" Bogus text block containing variable ", ltext);
      return (q);
    };

  textoffset = vht[vhtindex]->vstart;
  textmaxoffset = textoffset + vht[vhtindex]->vlen;

  //   init the hashpipe with 0xDEADBEEF
  for (h = 0; h < MARKOVIAN_WINDOW_LEN; h++)
    {
      hashpipe[h] = 0xDEADBEEF;
    };

  //   and the big loop...
  i = 0;
  while (k == 0 && textoffset <= textmaxoffset)
    {
      long wlen;
      long slen;

      //  do the regex
      //  slen = endpoint (= start + len)
      //         - startpoint (= curr textoffset)
      slen = vht[vhtindex]->vstart + vht[vhtindex]->vlen - textoffset;
      k = crm_regexec (&regcb, &(mdw->filetext[textoffset]),
		       slen, 5, match, 0, NULL);
      if (k != 0 || textoffset > textmaxoffset)
	goto learn_end_regex_loop;
      {
	wlen = match[0].rm_eo - match[0].rm_so;
	memmove (tempbuf, &(mdw->filetext[textoffset + match[0].rm_so]),
		 wlen);
	tempbuf[wlen] = '\000';
	if (internal_trace)
	  {
	    fprintf (stderr,
		     "  Learn #%ld t.o. %ld strt %ld end %ld len %ld is -%s-\n",
		     i, textoffset,
		     (long) match[0].rm_so, (long) match[0].rm_eo,
		     wlen, tempbuf);
	  };
	if (match[0].rm_eo == 0)
	  {
	    nonfatalerror ("The LEARN pattern matched zero length! ",
			   "\n Forcing an increment to avoid an infinite loop.");
	    match[0].rm_eo = 1;
	  };

	//  Shift the hash pipe down one
	//
	for (h = MARKOVIAN_WINDOW_LEN-1; h > 0; h--)
	  {
	    hashpipe[h] = hashpipe[h-1];
	  };

	//  and put new hash into pipeline
	hashpipe[0] = strnhash (tempbuf, wlen);

	if (internal_trace)
	  {
	    fprintf (stderr, "  Hashpipe contents: ");
	    for (h = 0; h < MARKOVIAN_WINDOW_LEN; h++)
	      fprintf (stderr, " %lu", hashpipe[h]);
	    fprintf (stderr, "\n");
	  };

	//  and account for the text used up.
	textoffset = textoffset + match[0].rm_eo;
	i++;

	//   is the pipe full enough to do the hashing?
	if (1)   //  we always run the hashpipe now, even if it's
		 //  just full of 0xDEADBEEF.  (was:  i >= 5)
	  {
	    unsigned long hindex;
	    unsigned long h1, h2;
	    long th = 0;        //  a counter used for TSS tokenizing
	    unsigned long incrs;
	    long j;
	    //
	    //     old Hash polynomial: h0 + 3h1 + 5h2 + 11h3 + 23h4
	    //     (coefficients chosen by requiring superincreasing,
	    //     as well as prime)
	    //
	    th = 0;
	    //
	    // for ( j = 0; j <= 15 ; j++)
	    for (j = 0; j < (1 << (MARKOVIAN_WINDOW_LEN - 1)); j++)
	      {
#ifdef FOO
		//
		//  First Order Only - 1 token per feature.
		//
		hindex = hashpipe[0];
		if (hindex == 0) hindex = 1;
		h1 = hindex;
		hindex = hindex % hfsize;
		if (hindex == 0) hindex = 1;
		//   this is the secondary (crosscut) hash, used for
		//   confirmation of the key value.
		h2 = hashpipe[0];
		if (h2 == 0) h2 = 0xdeadbeef;
		j = 16;
#endif
#ifdef TGB
		//
		//  Token Grab Bag - ignore sequence and distance;
		//  allow aliasing.
		//
		hindex = hashpipe[0]
		  + ( hashpipe[1] * ((j>>0) & 0x0001))
		  + ( hashpipe[2] * ((j>>1) & 0x0001))
		  + ( hashpipe[3] * ((j>>2) & 0x0001))
		  + ( hashpipe[4] * ((j>>3) & 0x0001));
		if (hindex == 0) hindex = 1;
		h1 = hindex;
		hindex = hindex % hfsize;
		if (hindex == 0) hindex = 1;
		//   this is the secondary (crosscut) hash, used for
		//   confirmation of the key value.
		h2 = hashpipe[0]
		  + ( hashpipe[1] * ((j>>0) & 0x0001))
		  + ( hashpipe[2] * ((j>>1) & 0x0001))
		  + ( hashpipe[3] * ((j>>2) & 0x0001))
		  + ( hashpipe[4] * ((j>>3) & 0x0001));
		if (h2 == 0) h2 = 0xdeadbeef;
#endif
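		//
		//  (Expository note, illustrative only: in each variant
		//  here, bit (h-1) of j decides whether pipeline slot h
		//  contributes to the feature; slot 0, the newest
		//  token, is always in.  E.g. with a 5-token window,
		//  j = 5 (binary 0101) builds the subphrase from slots
		//  0, 1, and 3, and sweeping j = 0..15 emits all 16
		//  subphrases that end at the current token.)
		//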
#ifdef TGB2
		//
		//  Token Grab Bag 2 - ignore sequence and distance;
		//  prevent aliasing (note that H2 is quadratic).
		//
		hindex = hashpipe[0]
		  + ( hashpipe[1] * ((j>>0) & 0x0001))
		  + ( hashpipe[2] * ((j>>1) & 0x0001))
		  + ( hashpipe[3] * ((j>>2) & 0x0001))
		  + ( hashpipe[4] * ((j>>3) & 0x0001));
		if (hindex == 0) hindex = 1;
		h1 = hindex;
		hindex = hindex % hfsize;
		if (hindex == 0) hindex = 1;
		//   this is the secondary (crosscut) hash, used for
		//   confirmation of the key value.
		h2 = hashpipe[0] * hashpipe[0]
		  + ( hashpipe[1] * hashpipe[1] * ((j>>0) & 0x0001))
		  + ( hashpipe[2] * hashpipe[2] * ((j>>1) & 0x0001))
		  + ( hashpipe[3] * hashpipe[3] * ((j>>2) & 0x0001))
		  + ( hashpipe[4] * hashpipe[4] * ((j>>3) & 0x0001));
		if (h2 == 0) h2 = 0xdeadbeef;
#endif
#ifdef TSS
		//
		//  Token Sequence Sensitive - ignore distance, prevent
		//  aliasing (quadratic H2)
		//
		hindex = hashpipe[0]
		  + ( hashpipe[1] * ((j>>0) & 0x0001))
		  + ( hashpipe[2] * ((j>>1) & 0x0001))
		  + ( hashpipe[3] * ((j>>2) & 0x0001))
		  + ( hashpipe[4] * ((j>>3) & 0x0001));
		if (hindex == 0) hindex = 1;
		h1 = hindex;
		hindex = hindex % hfsize;
		if (hindex == 0) hindex = 1;
		//   this is the secondary (crosscut) hash, used for
		//   confirmation of the key value.
		th = 2;
		h2 = hashpipe[0];
		if ((j>>0) & 0x0001) { h2 = h2 + hashpipe[1] * th; th++; };
		if ((j>>1) & 0x0001) { h2 = h2 + hashpipe[2] * th; th++; };
		if ((j>>2) & 0x0001) { h2 = h2 + hashpipe[3] * th; th++; };
		if ((j>>3) & 0x0001) { h2 = h2 + hashpipe[4] * th; th++; };
		if (h2 == 0) h2 = 0xdeadbeef;
#endif
#ifdef SBPH
		hindex = hashpipe[0]
		  + (  3 * hashpipe[1] * ((j>>0) & 0x0001))
		  + (  5 * hashpipe[2] * ((j>>1) & 0x0001))
		  + ( 11 * hashpipe[3] * ((j>>2) & 0x0001))
		  + ( 23 * hashpipe[4] * ((j>>3) & 0x0001));
		h1 = hindex;
		//   and what's our primary hash index?  Note that
		//   hindex = 0 is reserved for our version and
		//   usage flags, so we autobump those to hindex = 1
		hindex = hindex % hfsize;
		if (hindex == 0) hindex = 1;
		//   this is the secondary (crosscut) hash, used for
		//   confirmation of the key value.  Note that it shares
		//   no common coefficients with the previous hash.
		h2 = 7 * hashpipe[0]
		  + (  13 * hashpipe[1] * ((j>>0) & 0x0001))
		  + (  29 * hashpipe[2] * ((j>>1) & 0x0001))
		  + (  51 * hashpipe[3] * ((j>>2) & 0x0001))
		  + ( 101 * hashpipe[4] * ((j>>3) & 0x0001));
		if (h2 == 0) h2 = 0xdeadbeef;
#endif
#ifdef ARBITRARY_WINDOW_LENGTH
		//////////////////////////////////////////////////
		//
		//     Generic N-length hashing.
		//
		//     first term (0th) is always on
		h1 = hashpipe[0] * hctable[0];
		//     2nd and onward terms are variable.
		for (h = 1; h < MARKOVIAN_WINDOW_LEN; h++)
		  {
		    h1 = h1 + hashpipe[h] * hctable[h*2] * ((j >> (h-1)) & 0x0001);
		  };
		hindex = h1;
		hindex = hindex % hfsize;
		if (hindex == 0) hindex = 1;

		//     0th term is always turned on.
		h2 = hashpipe[0] * hctable[1];
		//     terms 1 through N are variable
		for (h = 1; h < MARKOVIAN_WINDOW_LEN; h++)
		  {
		    h2 = h2 + hashpipe[h] * hctable[h*2+1] * ((j >> (h-1)) & 0x0001);
		  };
		if (h2 == 0) h2 = 0xDEADBEEF;
#endif

		if (internal_trace)
		  fprintf (stderr, "Polynomial %ld has h1: %lu  h2: %lu\n",
			   j, h1, h2);

		//
		//   we now look at both the primary (h1) and
		//   crosscut (h2) indexes to see if we've got
		//   the right bucket or if we need to look further
		//
		incrs = 0;
		while ( hashes[hindex].key != 0
			&& ( hashes[hindex].hash != h1
			     || hashes[hindex].key  != h2 ))
		  {
		    //
		    incrs++;
		    //
		    //   If microgrooming is enabled, and we've found a
		    //   chain that's too long, we groom it down.
		    //
		    if (microgroom && (incrs > MICROGROOM_CHAIN_LENGTH))
		      {
			//   set the random number generator up...
			//   note that this is repeatable for a
			//   particular test set, yet dynamic.  That
			//   way, we don't always autogroom away the
			//   same feature; we depend on the previous
			//   feature's key.
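			//   (Expository note: because the seed is just
			//   h2, the same collision chain grooms the
			//   same way on every run - a chain whose
			//   crosscut key is, say, 0x0BADF00D always
			//   feeds rand() the identical sequence inside
			//   crm_microgroom - while different chains
			//   still groom in different, pseudo-random
			//   patterns.)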
			srand ( (unsigned int) h2);
			//
			//   and do the groom.
			crm_microgroom (hashes, hfsize, hindex);
			//   since things may have moved after a
			//   microgroom, restart our search
			hindex = h1 % hfsize;
			if (hindex == 0) hindex = 1;
			incrs = 0;
		      };
		    //   check to see if we've incremented ourself all
		    //   the way around the .css file.  If so, we're
		    //   full, and can hold no more features (this is
		    //   unrecoverable)
		    if (incrs > hfsize - 3)
		      {
			nonfatalerror ("Your program is stuffing too many "
				       "features into this size .css file.  "
				       "Adding any more features is "
				       "impossible in this file.",
				       "You are advised to build a larger "
				       ".css file and merge your data into "
				       "it.");
			goto learn_end_regex_loop;
		      };
		    hindex++;
		    if (hindex >= hfsize) hindex = 1;
		  };

		if (internal_trace)
		  {
		    if (hashes[hindex].value == 0)
		      {
			fprintf (stderr, "New feature at %lu\n", hindex);
		      }
		    else
		      {
			fprintf (stderr, "Old feature at %lu\n", hindex);
		      };
		  };

		//   always rewrite hash and key, as they may be incorrect
		//   (on a reused bucket) or zero (on a fresh one)
		//
		hashes[hindex].hash = h1;
		hashes[hindex].key  = h2;

		//   watch out - sense may be both + or -, so check
		//   before adding it...
		//
		if (sense > 0 &&
		    hashes[hindex].value + sense >= FEATUREBUCKET_VALUE_MAX-1)
		  hashes[hindex].value = FEATUREBUCKET_VALUE_MAX - 1;
		else if (sense < 0 &&
			 hashes[hindex].value <= -sense)
		  hashes[hindex].value = 0;
		else
		  hashes[hindex].value += sense;
	      };
	  };
      };
    };   //   end the while k==0

 learn_end_regex_loop:
 regcomp_failed:

  //   and remember to let go of the mmap and the pattern buffer
  msync (hashes, hfsize * sizeof (FEATUREBUCKET_TYPE), MS_SYNC);
  munmap (hashes, hfsize * sizeof (FEATUREBUCKET_TYPE));

  //   Because mmap/munmap doesn't set atime, nor set the "modified"
  //   flag, some network filesystems will fail to mark the file as
  //   modified and so their caching will make a mistake.
  //
  //   The fix is to do a trivial read/write on the .css file, to force
  //   the filesystem to repropagate its caches.
  //
  {
    FEATURE_HEADER_STRUCT foo;
    read (hfd, &foo, sizeof (foo));
    lseek (hfd, 0, SEEK_SET);
    write (hfd, &foo, sizeof (foo));
  }
  close (hfd);
  crm_regfree (&regcb);

  return (0);
}


//      How to do a Markovian CLASSIFY of some text.
//
int crm_expr_markov_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb)
{
  //      classify the sparse spectrum of this input window
  //      as belonging to a particular type.
  //
  //      This code should look very familiar - it's cribbed from
  //      the code for LEARN
  //
  long i, j, k;
  long h;          //  we use h for our hashpipe counter, as needed.
  char ptext[MAX_PATTERN];     //  the regex pattern
  long plen;
  char ltext[MAX_PATTERN];     //  the variable to classify
  long llen;
  //       the hash file names
  char htext[MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN];
  long htext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN;
  long hlen;
  //       the match statistics text buffer
  char stext[MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100)];
  long stext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100);
  long slen;
  char svrbl[MAX_PATTERN];     //  the match statistics variable name
  long svlen;
  long fnameoffset;
  char fname[MAX_FILE_NAME_LEN];
  long eflags;
  long cflags;
  long not_microgroom;         //  is microgrooming disabled (fast-quit
			       //  optimization if 0th feature gone)
  long vhtindex;
  struct stat statbuf;         //  for statting the hash file
  CSL_CELL *mdw;               //  the data window that contains the variable
  unsigned long hashpipe[MARKOVIAN_WINDOW_LEN+1];  //  longest association
						   //  set in the hashing
  regex_t regcb;
  regmatch_t match[5];         //  we only care about the outermost match
  unsigned long fcounts[MAX_CLASSIFIERS];  //  total counts for feature normalize
  unsigned long totalcount = 0;
  double cpcorr[MAX_CLASSIFIERS];    //  corpus correction factors
  double hits[MAX_CLASSIFIERS];      //  actual hits per feature per classifier
  long totalhits[MAX_CLASSIFIERS];   //  actual total hits per classifier
  long totalfeatures;                //  total features
  double htf;                        //  hits this feature got.
  double tprob;        //  total probability in the "success" domain.
  double textlen;      //  text length - roughly corresponds to
		       //  information content of the text to classify
  double ptc[MAX_CLASSIFIERS];    //  current running probability of this class
  double renorm = 0.0;
  double pltc[MAX_CLASSIFIERS];   //  current local probability of this class
  double plltc[MAX_CLASSIFIERS];  //  peak local probability of this class
  int hfds[MAX_CLASSIFIERS];
  FEATUREBUCKET_TYPE *hashes[MAX_CLASSIFIERS];
  long hashlens[MAX_CLASSIFIERS];
  char *hashname[MAX_CLASSIFIERS];
  long succhash;
  long vbar_seen;      //  did we see '|' in classify's args?
  long maxhash;
  long fnstart, fnlen;
  long fn_start_here;
  long textoffset;
  long textmaxoffset;
  long bestseen;
  long thistotal;
  long feature_weight = 1;

  double top10scores[10];
  long top10polys[10];
  char top10texts[10][MAX_PATTERN];

  if (internal_trace)
    fprintf (stderr, "executing a CLASSIFY\n");

  //           extract the variable name (if present)
  //
  crm_get_pgm_arg (ltext, MAX_PATTERN, apb->b1start, apb->b1len);
  llen = apb->b1len;
  llen = crm_nexpandvar (ltext, llen, MAX_PATTERN);

  //           extract the hash file names
  crm_get_pgm_arg (htext, htext_maxlen, apb->p1start, apb->p1len);
  hlen = apb->p1len;
  hlen = crm_nexpandvar (htext, hlen, htext_maxlen);

  //           extract the "this is a word" regex
  //
  crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len);
  plen = apb->s1len;
  plen = crm_nexpandvar (ptext, plen, MAX_PATTERN);

  //           extract the optional "match statistics" variable
  //
  crm_get_pgm_arg (svrbl, MAX_PATTERN, apb->p2start, apb->p2len);
  svlen = apb->p2len;
  svlen = crm_nexpandvar (svrbl, svlen, MAX_PATTERN);
  {
    long vstart, vlen;
    crm_nextword (svrbl, svlen, 0, &vstart, &vlen);
    memmove (svrbl, &svrbl[vstart], vlen);
    svlen = vlen;
    svrbl[vlen] = '\000';
  };

  //     status variable's text (used for output stats)
  //
  stext[0] = '\000';
  slen = 0;
  //            set our flags, if needed.  The default is
  //            "case".
  cflags = REG_EXTENDED;
  eflags = 0;

  if (apb->sflags & CRM_NOCASE)
    {
      cflags = cflags | REG_ICASE;
      eflags = 1;
    };

  not_microgroom = 1;
  if (apb->sflags & CRM_MICROGROOM)
    {
      not_microgroom = 0;
      if (user_trace)
	fprintf (stderr, " disabling fast-skip optimization.\n");
    };

  //    compile the word regex
  if (internal_trace)
    fprintf (stderr, "\nWordmatch pattern is %s", ptext);
  i = crm_regcomp (&regcb, ptext, plen, cflags);
  if (i > 0)
    {
      crm_regerror (i, &regcb, tempbuf, data_window_size);
      nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
      goto regcomp_failed;
    };

  //       Now, the loop to open the files.
  bestseen = 0;
  thistotal = 0;

  // goodcount = evilcount = 1;   // prevents a divide-by-zero error.
  //cpgood = cpevil = 0.0;
  //ghits = ehits = 0.0 ;
  //psucc = 0.5;
  //pfail = (1.0 - psucc);
  //pic = 0.5;
  //pnic = 0.5;

  //      initialize our arrays for N .css files
  for (i = 0; i < MAX_CLASSIFIERS; i++)
    {
      fcounts[i] = 0;    //  check later to prevent a divide-by-zero
			 //  error on empty .css file
      cpcorr[i] = 0.0;   //  corpus correction factors
      hits[i] = 0.0;     //  absolute hit counts
      totalhits[i] = 0;  //  absolute hit counts
      ptc[i] = 0.5;      //  a priori probability
      pltc[i] = 0.5;     //  local probability
    };

  for (i = 0; i < 10; i++)
    {
      top10scores[i] = 0;
      top10polys[i] = 0;
      strcpy (top10texts[i], "");
    };

  //        --  probabilistic evaluator ---
  //     S = success; A = a testable attribute of success
  //     ns = not success, na = not attribute
  //     the chain rule we use is:
  //
  //                    P(A|S) P(S)
  //  P (S|A) = -----------------------------
  //             P(A|S) P(S) + P(A|NS) P(NS)
  //
  //     and we apply it repeatedly to evaluate the final prob.  For
  //     the initial a-priori probability, we use 0.5.  The output
  //     value (here, P(S|A) ) becomes the new a-priori for the next
  //     iteration.
  //
  //     Extension - we generalize the above to N classes Ci and a
  //     feature F as follows:
  //
  //                          P(F|Ci) P(Ci)
  //    P(Ci|F) = ------------------------------------------
  //               Sum over all classes Cj of P(F|Cj) P(Cj)
  //
  //     We also correct for the unequal corpus sizes by multiplying
  //     the probabilities by a renormalization factor.  If Tg is the
  //     total number of good features, and Te is the total number of
  //     evil features, and Rg and Re are the raw relative scores,
  //     then the corrected relative scores Cg and Ce are
  //
  //     Cg = (Rg / Tg)
  //     Ce = (Re / Te)
  //
  //     or  Ci = (Ri / Ti)
  //
  //     Cg and Ce can now be used as "corrected" relative counts
  //     to calculate the naive Bayesian probabilities.
  //
  //     Lastly, the issue of "over-certainty" rears its ugly head.
  //     This is what happens when there's a zero raw count of either
  //     good or evil features at a particular place in the file; the
  //     strict but naive mathematical interpretation of this is that
  //     "feature A never/always occurs when in good/evil", hence this
  //     is conclusive evidence of good/evil and the probabilities go
  //     to 1.0 or 0.0, and are stuck there forevermore.  We use the
  //     somewhat ad-hoc interpretation that it is unreasonable to
  //     assume that any finite number of samples can appropriately
  //     represent an infinite continuum of spewage, so we can bound
  //     the certainty of any measure to be in the range:
  //
  //        limit: [ 1/(featurecount+2) , 1 - 1/(featurecount+2) ].
  //
  //     This bound is strictly made-up-on-the-spot and has NO
  //     strong theoretical basis.  It does have the nice behavior
  //     that for feature counts of 0 the probability is clipped to
  //     [0.5, 0.5], for feature counts of 1 to [0.333, 0.666],
  //     for feature counts of 2 to [0.25, 0.75], for 3 to
  //     [0.2, 0.8], for 4 to [0.166, 0.833], and so on.
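  //
  //     A worked single step of the chain rule above (expository
  //     note with illustrative numbers only): with two classes at
  //     the a-priori P(C1) = P(C2) = 0.5, a feature with local
  //     probabilities P(F|C1) = 0.75 and P(F|C2) = 0.25 updates
  //     class 1 to
  //
  //        P(C1|F) = (0.75 * 0.5) / ((0.75 * 0.5) + (0.25 * 0.5)) = 0.75
  //
  //     and that 0.75 then becomes the a-priori for the next feature.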
  //
  vbar_seen = 0;
  maxhash = 0;
  succhash = 0;
  fnameoffset = 0;

  //    now, get the file names and mmap each file
  //    get the file name (grody and non-8-bit-safe, but doesn't matter
  //    because the result is used for open() and nothing else.
  //    GROT GROT GROT  this isn't NULL-clean on filenames.  But then
  //    again, stdio.h itself isn't NULL-clean on filenames.
  if (user_trace)
    fprintf (stderr, "Classify list: -%s- \n", htext);
  fn_start_here = 0;
  fnlen = 1;
  while (fnlen > 0 && ((maxhash < MAX_CLASSIFIERS-1)))
    {
      crm_nextword (htext, hlen, fn_start_here, &fnstart, &fnlen);
      if (fnlen > 0)
	{
	  strncpy (fname, &htext[fnstart], fnlen);
	  fn_start_here = fnstart + fnlen + 1;
	  fname[fnlen] = '\000';
	  if (user_trace)
	    fprintf (stderr, "Classifying with file -%s- "
		     "succhash=%ld, maxhash=%ld\n",
		     fname, succhash, maxhash);
	  if (fname[0] == '|' && fname[1] == '\000')
	    {
	      if (vbar_seen)
		{
		  nonfatalerror ("Only one ' | ' allowed in a CLASSIFY. \n",
				 "We'll ignore it for now.");
		}
	      else
		{
		  succhash = maxhash;
		};
	      vbar_seen++;
	    }
	  else
	    {
	      //  be sure the file exists
	      //  stat the file to get its length
	      k = stat (fname, &statbuf);
	      //   quick check- does the file even exist?
	      if (k != 0)
		{
		  nonfatalerror ("Nonexistent Classify table named: ",
				 fname);
		}
	      else
		{
		  //  file exists - do the open/process/close
		  //
		  hashlens[maxhash] = statbuf.st_size;
		  //  mmap the hash file into memory so we can bitwhack it
		  hfds[maxhash] = open (fname, O_RDONLY);
		  if (hfds[maxhash] < 0)
		    {
		      nonfatalerror ("Couldn't open the table file", fname);
		    }
		  else
		    {
		      hashes[maxhash] = (FEATUREBUCKET_TYPE *)
			mmap (NULL, hashlens[maxhash],
			      PROT_READ, MAP_SHARED, hfds[maxhash], 0);
		      if (hashes[maxhash] == MAP_FAILED)
			{
			  nonfatalerror ("Couldn't memory-map the table file",
					 fname);
			}
		      else
			{
			  //
			  //  Check to see if this file is the right version
			  //
			  long fev;
			  if (hashes[maxhash][0].hash != 0
			      || hashes[maxhash][0].key != 0
			      || hashes[maxhash][0].value != 0)
			    {
			      fev = fatalerror ("The .css file is the wrong version!  Filename is: ",
						fname);
			      return (fev);
			    };
			  //  set this hashlens to the length in features
			  //  instead of the length in bytes.
			  hashlens[maxhash] =
			    hashlens[maxhash] / sizeof (FEATUREBUCKET_TYPE);
			  hashname[maxhash] = (char *) malloc (fnlen+10);
			  if (!hashname[maxhash])
			    untrappableerror
			      ("Couldn't malloc hashname[maxhash]\n",
			       "We need that part later, so we're stuck.  Sorry.");
			  strncpy (hashname[maxhash], fname, fnlen);
			  hashname[maxhash][fnlen] = '\000';
			  maxhash++;
			};
		    };
		};
	    };
	  if (maxhash > MAX_CLASSIFIERS-1)
	    nonfatalerror ("Too many classifier files.",
			   "Some may have been disregarded");
	};
    };

  //
  //    If there is no '|', then all files are "success" files.
  if (succhash == 0)
    succhash = maxhash;

  //    a CLASSIFY with no arguments is always a "success".
  if (maxhash == 0)
    return (0);

  //    now, set up the normalization factor fcounts[]
  if (user_trace)
    fprintf (stderr, "Running with %ld files for success out of %ld files\n",
	     succhash, maxhash);

  //    sanity checks...  Uncomment these for super-strict CLASSIFY.
  //
  //    do we have at least 1 valid .css file?
  if (maxhash == 0)
    {
      fatalerror ("Couldn't open at least 2 .css files for classify().", "");
    };
  //    do we have at least 1 valid .css file at both sides of '|'?
  //if (!vbar_seen || succhash < 0 || (maxhash < succhash + 2))
  //  {
  //    nonfatalerror (
  //      "Couldn't open at least 1 .css file per SUCC | FAIL class "
  //      " for classify().\n", "Hope you know what you are doing.");
  //  };

  {
    long ifile;
    long k;
    //      count up the total first
    for (ifile = 0; ifile < maxhash; ifile++)
      {
	fcounts[ifile] = 0;
	for (k = 1; k < hashlens[ifile]; k++)
	  fcounts[ifile] = fcounts[ifile] + hashes[ifile][k].value;
	if (fcounts[ifile] == 0) fcounts[ifile] = 1;
	totalcount = totalcount + fcounts[ifile];
      };
    //
    //     calculate cpcorr (count compensation correction)
    //
    for (ifile = 0; ifile < maxhash; ifile++)
      {
	// cpcorr [ifile] = ( totalcount / (fcounts[ifile] * (maxhash-1)));
	//
	//    disable cpcorr for now... unclear that it's useful.
	cpcorr[ifile] = 1.0;
      };
  };

  //
  //   now all of the files are mmapped into memory,
  //   and we can do the polynomials and add up points.
  i = 0;
  j = 0;
  k = 0;
  thistotal = 0;

  if (llen > 0)
    {
      vhtindex = crm_vht_lookup (vht, ltext, llen);
    }
  else
    {
      vhtindex = crm_vht_lookup (vht, ":_dw:", 5);
    };

  if (vht[vhtindex] == NULL)
    {
      return (fatalerror (" Attempt to CLASSIFY from a nonexistent variable ",
			  ltext));
    };
  mdw = NULL;
  if (tdw->filetext == vht[vhtindex]->valtxt)
    mdw = tdw;
  if (cdw->filetext == vht[vhtindex]->valtxt)
    mdw = cdw;
  if (mdw == NULL)
    return (fatalerror (" Bogus text block containing variable ", ltext));

  textoffset = vht[vhtindex]->vstart;
  textmaxoffset = textoffset + vht[vhtindex]->vlen;
  textlen = (vht[vhtindex]->vlen);
  if (textlen < 1.0) textlen = 1.0;

  //   init the hashpipe with 0xDEADBEEF
  for (h = 0; h < MARKOVIAN_WINDOW_LEN; h++)
    {
      hashpipe[h] = 0xDEADBEEF;
    };

  totalfeatures = 0;

  //   stop when we no longer get any regex matches
  //   possible edge effect here- last character must be matchable, yet
  //   it's also the "end of buffer".
  while (k == 0 && textoffset <= textmaxoffset)
    {
      long wlen;
      long slen;

      //  do the regex
      slen = vht[vhtindex]->vstart + vht[vhtindex]->vlen - textoffset;
      k = crm_regexec (&regcb, &(mdw->filetext[textoffset]),
		       slen, 1, match, 0, NULL);
      if (k != 0 || textoffset > textmaxoffset)
	goto classify_end_regex_loop;

      wlen = match[0].rm_eo - match[0].rm_so;
      memmove (tempbuf, &(mdw->filetext[textoffset + match[0].rm_so]),
	       wlen);
      tempbuf[wlen] = '\000';

      if (internal_trace)
	{
	  fprintf (stderr,
		   "  Classify #%ld t.o. %ld strt %ld end %ld len %ld is -%s-\n",
		   i, textoffset,
		   (long) match[0].rm_so, (long) match[0].rm_eo,
		   wlen, tempbuf);
	};
      if (match[0].rm_eo == 0)
	{
	  nonfatalerror ("The CLASSIFY pattern matched zero length! ",
			 "\n Forcing an increment to avoid an infinite loop.");
	  match[0].rm_eo = 1;
	};

      //  slide previous hashes up 1
      for (h = MARKOVIAN_WINDOW_LEN-1; h >= 1; h--)
	{
	  hashpipe[h] = hashpipe[h-1];
	};

      //  and put new hash into pipeline
      hashpipe[0] = strnhash (tempbuf, wlen);

      if (0)
	{
	  fprintf (stderr, "  Hashpipe contents: ");
	  for (h = 0; h < MARKOVIAN_WINDOW_LEN; h++)
	    fprintf (stderr, " %lu", hashpipe[h]);
	  fprintf (stderr, "\n");
	};

      //   account for the text we used up...
      textoffset = textoffset + match[0].rm_eo;
      i++;

      //   is the pipe full enough to do the hashing?
      if (1)   //  we init with 0xDEADBEEF, so the pipe is always full
	       //  (was:  i >= 5)
	{
	  int j, k, l;
	  unsigned th = 0;   //  a counter used only in TSS hashing
	  unsigned long hindex;
	  unsigned long h1, h2;
	  //unsigned long good, evil;
	  //
	  //     Hash polynomial: h0 + 3h1 + 5h2 + 11h3 + 23h4
	  //     (coefficients chosen by requiring superincreasing,
	  //     as well as prime)
	  //
	  //     GROT GROT GROT  make the order of the SBPH
	  //     a compile-time parameter
	  th = 0;
	  //
	  // for ( j = 0; j < 16 ; j++)
	  for (j = 0; j < (1 << (MARKOVIAN_WINDOW_LEN - 1)); j++)
	    {
#ifdef FOO
	      //
	      //    First Order Only - only use 1 token
	      //
	      hindex = hashpipe[0];
	      if (hindex == 0) hindex = 1;
	      h1 = hindex;
	      //   this is the secondary (crosscut) hash, used for
	      //   confirmation of the key value.
	      h2 = hashpipe[0];
	      if (h2 == 0) h2 = 0xdeadbeef;
	      j = 99999;
#endif
#ifdef TGB
	      //
	      //   Token Grab Bag - ignore sequence and distance; allow
	      //   aliasing ( note that H2 is linear ).
	      //
	      hindex = hashpipe[0]
		+ ( hashpipe[1] * ((j>>0) & 0x0001))
		+ ( hashpipe[2] * ((j>>1) & 0x0001))
		+ ( hashpipe[3] * ((j>>2) & 0x0001))
		+ ( hashpipe[4] * ((j>>3) & 0x0001));
	      if (hindex == 0) hindex = 1;
	      h1 = hindex;
	      //   this is the secondary (crosscut) hash, used for
	      //   confirmation of the key value.
	      h2 = hashpipe[0]
		+ ( hashpipe[1] * ((j>>0) & 0x0001))
		+ ( hashpipe[2] * ((j>>1) & 0x0001))
		+ ( hashpipe[3] * ((j>>2) & 0x0001))
		+ ( hashpipe[4] * ((j>>3) & 0x0001));
	      if (h2 == 0) h2 = 0xdeadbeef;
#endif
#ifdef TGB2
	      //
	      //   Token Grab Bag 2 - ignore sequence and distance;
	      //   prevent aliasing (note that H2 is quadratic ).
	      //
	      hindex = hashpipe[0]
		+ ( hashpipe[1] * ((j>>0) & 0x0001))
		+ ( hashpipe[2] * ((j>>1) & 0x0001))
		+ ( hashpipe[3] * ((j>>2) & 0x0001))
		+ ( hashpipe[4] * ((j>>3) & 0x0001));
	      if (hindex == 0) hindex = 1;
	      h1 = hindex;
	      //   this is the secondary (crosscut) hash, used for
	      //   confirmation of the key value.
	      h2 = hashpipe[0] * hashpipe[0]
		+ ( hashpipe[1] * hashpipe[1] * ((j>>0) & 0x0001))
		+ ( hashpipe[2] * hashpipe[2] * ((j>>1) & 0x0001))
		+ ( hashpipe[3] * hashpipe[3] * ((j>>2) & 0x0001))
		+ ( hashpipe[4] * hashpipe[4] * ((j>>3) & 0x0001));
	      if (h2 == 0) h2 = 0xdeadbeef;
#endif
#ifdef TSS
	      //
	      //   Token Sequence Sensitive - ignore distance, prevent
	      //   aliasing (quadratic H2)
	      //
	      hindex = hashpipe[0]
		+ ( hashpipe[1] * ((j>>0) & 0x0001))
		+ ( hashpipe[2] * ((j>>1) & 0x0001))
		+ ( hashpipe[3] * ((j>>2) & 0x0001))
		+ ( hashpipe[4] * ((j>>3) & 0x0001));
	      if (hindex == 0) hindex = 1;
	      h1 = hindex;
	      //   this is the secondary (crosscut) hash, used for
	      //   confirmation of the key value.
	      th = 2;
	      h2 = hashpipe[0];
	      if ((j>>0) & 0x0001) { h2 = h2 + hashpipe[1] * th; th++; };
	      if ((j>>1) & 0x0001) { h2 = h2 + hashpipe[2] * th; th++; };
	      if ((j>>2) & 0x0001) { h2 = h2 + hashpipe[3] * th; th++; };
	      if ((j>>3) & 0x0001) { h2 = h2 + hashpipe[4] * th; th++; };
	      if (h2 == 0) h2 = 0xdeadbeef;
#endif
#ifdef SBPH
	      hindex = hashpipe[0]
		+ (  3 * hashpipe[1] * ((j>>0) & 0x0001))
		+ (  5 * hashpipe[2] * ((j>>1) & 0x0001))
		+ ( 11 * hashpipe[3] * ((j>>2) & 0x0001))
		+ ( 23 * hashpipe[4] * ((j>>3) & 0x0001));
	      if (hindex == 0) hindex = 1;
	      h1 = hindex;
	      //   this is the secondary (crosscut) hash, used for
	      //   confirmation of the key value.
	      h2 = 7 * hashpipe[0]
		+ (  13 * hashpipe[1] * ((j>>0) & 0x0001))
		+ (  29 * hashpipe[2] * ((j>>1) & 0x0001))
		+ (  51 * hashpipe[3] * ((j>>2) & 0x0001))
		+ ( 101 * hashpipe[4] * ((j>>3) & 0x0001));
	      if (h2 == 0) h2 = 0xdeadbeef;
#endif
#ifdef ARBITRARY_WINDOW_LENGTH
	      //////////////////////////////////////////////////
	      //
	      //    Generic N-length hashing.
	      //
	      //    first term (0th) is always on
	      h1 = hashpipe[0] * hctable[0];
	      //    2nd and onward terms are variable.
	      for (h = 1; h < MARKOVIAN_WINDOW_LEN; h++)
		{
		  h1 = h1 + hashpipe[h] * hctable[h*2] * ((j >> (h-1)) & 0x0001);
		};
	      hindex = h1;
	      if (hindex == 0) hindex = 1;

	      //    0th term is always turned on.
	      h2 = hashpipe[0] * hctable[1];
	      //    terms 1 through N are variable
	      for (h = 1; h < MARKOVIAN_WINDOW_LEN; h++)
		{
		  h2 = h2 + hashpipe[h] * hctable[h*2+1] * ((j >> (h-1)) & 0x0001);
		};
	      if (h2 == 0) h2 = 0xDEADBEEF;
#endif

	      //
	      //    Note - a strict interpretation of Bayesian
	      //    chain probabilities should use 0 as the initial
	      //    state.  However, because we rapidly run out of
	      //    significant digits, we use a much less strong
	      //    initial state.  Note also that any nonzero
	      //    positive value prevents divide-by-zero.
	      //
#ifdef ENTROPIC_WEIGHTS
	      //
	      //    Calculate the entropic weight of this feature.
	      //    (because these are correlated features, this is
	      //    linear, not logarithmic)
	      //
	      //    These weights correspond to the number of elements
	      //    in the hashpipe that are used for this particular
	      //    calculation, == 1 + bitcount(Jval)
	      {
		long ew[16] =      // Jval
		  { 1,    //  0
		    2,    //  1
		    2,    //  2
		    3,    //  3
		    2,    //  4
		    3,    //  5
		    3,    //  6
		    4,    //  7
		    2,    //  8
		    3,    //  9
		    3,    // 10
		    4,    // 11
		    3,    // 12
		    4,    // 13
		    4,    // 14
		    5 };  // 15
		feature_weight = ew[j];
	      };
#endif
#ifdef MARKOV_WEIGHTS
	      //
	      //    Calculate the weight of this feature.
	      //    This one is based on a Hidden Markov Model
	      //    calculation.  Maybe it's right, maybe not.  It does
	      //    seem to work better than constant or entropic...
	      {
		long ew[16] =      // Jval
		  { 1,     //  0
		    2,     //  1
		    2,     //  2
		    4,     //  3
		    2,     //  4
		    4,     //  5
		    4,     //  6
		    8,     //  7
		    2,     //  8
		    4,     //  9
		    4,     // 10
		    8,     // 11
		    4,     // 12
		    8,     // 13
		    8,     // 14
		    16 };  // 15
		feature_weight = ew[j];
	      };
#endif
#ifdef SUPER_MARKOV_WEIGHTS
	      //
	      //    Calculate the weight of this feature.
	      //    This one is based on a more aggressive Hidden
	      //    Markov Model calculation.  Maybe it's right, maybe
	      //    not.  However, testing shows that Super-Markov is
	      //    more accurate than constant, entropic, or straight
	      //    Markov, with errcounts of 69, 69, 63, and 56 per 5k,
	      //    respectively.
	      //
	      //    hibits are  0, 1,  2,  3,   4,    5
	      //    weights are 1, 4, 16, 64, 256, 1024
	      {
		long ew[32] =      // Jval
		  { 1,      //  0
		    4,      //  1
		    4,      //  2
		    16,     //  3
		    4,      //  4
		    16,     //  5
		    16,     //  6
		    64,     //  7
		    4,      //  8
		    16,     //  9
		    16,     // 10 - A
		    64,     // 11 - B
		    16,     // 12 - C
		    64,     // 13 - D
		    64,     // 14 - E
		    256,    // 15 - F
		    4,      // 16 - 10
		    16,     // 17 - 11
		    16,     // 18 - 12
		    64,     // 19 - 13
		    16,     // 20 - 14
		    64,     // 21 - 15
		    64,     // 22 - 16
		    256,    // 23 - 17
		    16,     // 24 - 18
		    64,     // 25 - 19
		    64,     // 26 - 1A
		    256,    // 27 - 1B
		    64,     // 28 - 1C
		    256,    // 29 - 1D
		    256,    // 30 - 1E
		    1024    // 31 - 1F
		  };
		feature_weight = ew[j];
	      };
#endif
#ifdef BREYER_CHHABRA_SIEFKES_WEIGHTS
	      //
	      //    This uses the Breyer-Chhabra-Siefkes model, with
	      //    coefficients of 1, 3, 13, 75, and 541, which
	      //    assures complete override for any shorter string in
	      //    a single occurrence.
	      //
	      {
		long ew[16] =      // Jval
		  { 1,      //  0
		    3,      //  1
		    3,      //  2
		    13,     //  3
		    3,      //  4
		    13,     //  5
		    13,     //  6
		    75,     //  7
		    3,      //  8
		    13,     //  9
		    13,     // 10
		    75,     // 11
		    13,     // 12
		    75,     // 13
		    75,     // 14
		    541 };  // 15
		feature_weight = ew[j];
	      };
#endif
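	      //
	      //  (Expository note, not part of the original tables:
	      //  every weight table here is a function of popcount(j)
	      //  alone.  ENTROPIC is 1 + popcount(j), MARKOV is
	      //  2^popcount(j), SUPER_MARKOV is 4^popcount(j), and the
	      //  Breyer-Chhabra-Siefkes tables step through the
	      //  superincreasing sequence 1, 3, 13, 75, 541, ... by
	      //  popcount(j).)
	      //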
#ifdef BCS_MWS_WEIGHTS
	      //
	      //    This uses the Breyer-Chhabra-Siefkes model with
	      //    superincreasing coefficients of 1, 3, 13, 75, 541,
	      //    and 4683, which assures complete override for any
	      //    shorter string in a single occurrence.
	      //
	      {
		long ew[] =       // Jval
		  { 1,      //  0
		    3,      //  1
		    3,      //  2
		    13,     //  3
		    3,      //  4
		    13,     //  5
		    13,     //  6
		    75,     //  7
		    3,      //  8
		    13,     //  9
		    13,     // 10 - A
		    75,     // 11 - B
		    13,     // 12 - C
		    75,     // 13 - D
		    75,     // 14 - E
		    541,    // 15 - F
		    3,      // 16 - 10
		    13,     // 17 - 11
		    13,     // 18 - 12
		    75,     // 19 - 13
		    13,     // 20 - 14
		    75,     // 21 - 15
		    75,     // 22 - 16
		    541,    // 23 - 17
		    13,     // 24 - 18
		    75,     // 25 - 19
		    75,     // 26 - 1A
		    541,    // 27 - 1B
		    75,     // 28 - 1C
		    541,    // 29 - 1D
		    541,    // 30 - 1E
		    4683    // 31 - 1F
		  };
		feature_weight = ew[j];
	      };
#endif
#ifdef BCS_EXP_WEIGHTS
	      //
	      //    This uses an exponential (powers-of-eight) variant
	      //    of the Breyer-Chhabra-Siefkes model, with
	      //    coefficients of 1, 8, 64, 512, 4096, and 32768,
	      //    which assures complete override for any shorter
	      //    string in a single occurrence.
	      //
	      {
		long ew[] =       // Jval
		  { 1,      //  0
		    8,      //  1
		    8,      //  2
		    64,     //  3
		    8,      //  4
		    64,     //  5
		    64,     //  6
		    512,    //  7
		    8,      //  8
		    64,     //  9
		    64,     // 10 - A
		    512,    // 11 - B
		    64,     // 12 - C
		    512,    // 13 - D
		    512,    // 14 - E
		    4096,   // 15 - F
		    8,      // 16 - 10
		    64,     // 17 - 11
		    64,     // 18 - 12
		    512,    // 19 - 13
		    64,     // 20 - 14
		    512,    // 21 - 15
		    512,    // 22 - 16
		    4096,   // 23 - 17
		    64,     // 24 - 18
		    512,    // 25 - 19
		    512,    // 26 - 1A
		    4096,   // 27 - 1B
		    512,    // 28 - 1C
		    4096,   // 29 - 1D
		    4096,   // 30 - 1E
		    32768   // 31 - 1F
		  };
		feature_weight = ew[j];
	      };
#endif
#ifdef BREYER_CHHABRA_SIEFKES_BASE7_WEIGHTS
	      //
	      //    This uses the Breyer-Chhabra-Siefkes base-7 model,
	      //    with coefficients of 1, 7, 49, 343, and 2401, which
	      //    assures complete override for any shorter string in
	      //    a single occurrence.
	      //
	      {
		long ew[16] =      // Jval
		  { 1,       //  0
		    7,       //  1
		    7,       //  2
		    49,      //  3
		    7,       //  4
		    49,      //  5
		    49,      //  6
		    343,     //  7
		    7,       //  8
		    49,      //  9
		    49,      // 10
		    343,     // 11
		    49,      // 12
		    343,     // 13
		    343,     // 14
		    2401 };  // 15
		feature_weight = ew[j];
	      };
#endif

	      //  Zero out "Hits This Feature"
	      htf = 0;
	      totalfeatures++;

	      //
	      //    calculate the precursors to the local probabilities;
	      //    these are the hits[k] array, and the htf total.
	      //
	      for (k = 0; k < maxhash; k++)
		{
		  long lh, lh0;
		  lh = hindex % (hashlens[k]);
		  if (lh == 0) lh = 1;
		  lh0 = lh;
		  hits[k] = 0;
		  while ( hashes[k][lh].key != 0
			  && ( hashes[k][lh].hash != h1
			       || hashes[k][lh].key  != h2 ))
		    {
		      lh++;
		      if (lh >= hashlens[k]) lh = 1;
		      if (lh == lh0) break;   // wraparound
		    };
		  if (hashes[k][lh].hash == h1 && hashes[k][lh].key == h2)
		    {
		      //
		      l = hashes[k][lh].value * feature_weight;
		      totalhits[k] = totalhits[k] + l;  // remember totalhits
		      hits[k] = l * cpcorr[k];     // remember corr. hits
		      htf = htf + hits[k];         // and hits-this-feature
		    };
		};

	      //      now update the probabilities.
	      //
	      //     NOTA BENE: there are a bunch of different ways to
	      //     calculate local probabilities.  The text below
	      //     refers to an experiment that may or may not make it
	      //     as the "best way".
	      //
	      //     The hard part is this - what is the local in-class
	      //     versus local out-of-class probability given the
	      //     finding of a particular feature?
	      //
	      //     I'm guessing this - the validity is the differential
	      //     seen on the feature (that is, fgood - fevil )
	      //     times the entropy of that feature as seen in the
	      //     corpus (that is,
	      //
	      //          Pfeature*log2(Pfeature)
	      //
	      //        =
	      //          totalcount_this_feature
	      //          -----------------------  * log2 (totalcount_this_feature)
	      //          totalcount_all_features
	      //
	      //     (note, yes, the last term seems like it should be
	      //     relative to totalcount_all_features, but a bit of
	      //     algebra will show that if you view fgood and fevil
	      //     as two different signals, then you end up with
	      //     + and - totalcount inside the logarithm parenthesis,
	      //     and they cancel out.
	      //     (the 0.30102 converts "log10" to "log2" - it's not
	      //     a magic number, it's just that log2 isn't in glibc)
	      //
	      //     HACK ALERT - this code here is still under
	      //     development and should be viewed with the greatest
	      //     possible suspicion.  :=)
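	      //
	      //     A worked example of the STATIC_LOCAL_PROBABILITIES
	      //     branch below (expository note with illustrative
	      //     numbers; LOCAL_PROB_DENOM comes from the crm114
	      //     configuration): with two classes and hits[0] = 3,
	      //     hits[1] = 1, we have htf = 4, so
	      //
	      //        pltc[0] = 0.5 + (3 - 1) / (LOCAL_PROB_DENOM * 5.0)
	      //        pltc[1] = 0.5 + (1 - 3) / (LOCAL_PROB_DENOM * 5.0)
	      //
	      //     i.e. the better-matching class gets a small nudge
	      //     above 0.5 and the other gets the mirror-image
	      //     nudge below it.
	      //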
#ifdef STATIC_LOCAL_PROBABILITIES
	      //fgood = good;
	      //fevil = evil;
	      //pmag = (fgood - fevil) / ( 2 * (fgood + fevil + 1 ) );
	      //plnic = 0.5 - pmag;
	      //plic = 1.0 - plnic;
	      for (k = 0; k < maxhash; k++)
		{
		  pltc[k] = 0.5 +
		    (( hits[k] - (htf - hits[k]))
		     / (LOCAL_PROB_DENOM * (htf + 1.0)));
		};
#endif
#ifdef LENGTHBASED_LOCAL_PROBABILITIES
	      //fgood = good;
	      //fevil = evil;
	      //pmag = (fgood - fevil) / ( (fgood + fevil + 1) * textlen);
	      //plnic = 0.5 - pmag;
	      //plic = 1.0 - plnic;
	      for (k = 0; k < maxhash; k++)
		{
		  pltc[k] = 0.5 +
		    (( hits[k] - (htf - hits[k]))
		     / ((htf + 1) * textlen));
		};
#endif

	      //   Now, some funky magic.  Our formula above is
	      //   mathematically correct (if features were
	      //   independent - something we conveniently ignore),
	      //   but because of the limited word length in a real
	      //   computer, we can quickly run out of dynamic range
	      //   early in a CLASSIFY (P(S) indistinguishable from
	      //   1.00) and then there is no way to recover.  To
	      //   remedy this, we use two alternate forms of the
	      //   formula (in Psucc and Pfail) and use whichever
	      //   form yields the smaller probability to
	      //   recalculate the value of the larger.
	      //
	      //   The net result of this subterfuge is a nonuniform
	      //   representation of the probability, with a huge
	      //   dynamic range in two places - near 0.0 and near 1.0,
	      //   which are where we actually care.
	      //
	      //   update the global probabilities
	      // ptemp = ( pic*plic) / ((pic * plic) + (pnic * plnic));
	      // pnic = (pnic*plnic) / ((pic * plic) + (pnic * plnic));
	      // pic = ptemp;

	      //   calculate renormalizer (the Bayesian formula's
	      //   denominator)
	      renorm = 0.0;

	      //   now calculate the per-ptc numerators
	      plltc[0] = 0;  // keep gcc from complaining about unused vars.
#ifdef USE_PEAK
	      for (k = 0; k < maxhash; k++)
		renorm = renorm + (ptc[k] * plltc[k]);
	      if (j == 0)
		for (k = 0; k < maxhash; k++)
		  plltc[k] = pltc[k];
	      if (j < 15)
		for (k = 0; k < maxhash; k++)
		  if (pltc[k] > plltc[k])
		    plltc[k] = pltc[k];
	      if (j == 15)
		for (k = 0; k < maxhash; k++)
		  ptc[k] = (ptc[k] * plltc[k]) / renorm;
#else
	      for (k = 0; k < maxhash; k++)
		renorm = renorm + (ptc[k] * pltc[k]);
	      for (k = 0; k < maxhash; k++)
		ptc[k] = (ptc[k] * pltc[k]) / renorm;
#endif

	      //
	      //   Arne's Optimization: if the low-order feature
	      //   (that is, hashpipe[0]) isn't there, then there's no
	      //   possibility of other features being there either.
	      //   So, we can abort the rest of this classification
	      //   for this particular file, at least.
	      //
	      //   Of course this is only legal to do if the file has
	      //   not been microgroomed - otherwise the low-order
	      //   (single-word token) feature might already
	      //   have been deleted or microgroomed away, causing
	      //   the higher-order feature to be missed.
#define ARNE_OPTIMIZATION
#ifdef ARNE_OPTIMIZATION
	      if (not_microgroom && htf == 0 && j == 0)
		j = 999999;
#endif

	      //   if we have underflow (any probability == 0.0 ) then
	      //   bump the probability back up to 10^-308, or
	      //   whatever a small multiple of the minimum double
	      //   precision value is on the current platform.
	      //
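	      //   (Expository note: IEEE-754 doubles bottom out near
	      //   DBL_MIN = 2.2e-308, so the clamp below pins any
	      //   underflowed ptc[k] at 10 * DBL_MIN - about 2.2e-307 -
	      //   which keeps the later log10() calls finite.)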
	      for (k = 0; k < maxhash; k++)
		if (ptc[k] < 10*DBL_MIN)
		  ptc[k] = 10 * DBL_MIN;

	      //
	      //    part 2) renormalize to sum probabilities to 1.0
	      //
	      renorm = 0.0;
	      for (k = 0; k < maxhash; k++)
		renorm = renorm + ptc[k];
	      for (k = 0; k < maxhash; k++)
		ptc[k] = ptc[k] / renorm;

	      for (k = 0; k < maxhash; k++)
		if (ptc[k] < 10*DBL_MIN)
		  ptc[k] = 10 * DBL_MIN;

	      //if (pnic < pic)
	      //  { pic = 1.0 - pnic;} else { pnic = 1.0 - pic; };

	      if (internal_trace)
		{
		  for (k = 0; k < maxhash; k++)
		    {
		      fprintf (stderr,
			       " poly: %d  filenum: %d, HTF: %7.0f, "
			       "hits: %7.0f, Pl: %6.4e, Pc: %6.4e\n",
			       j, k, htf, hits[k], pltc[k], ptc[k]);
		    };
		};
	      //
	      //    avoid the fencepost error for window=1
	      if (MARKOVIAN_WINDOW_LEN == 1)
		{
		  j = 99999;
		};
	    };
	};
    };   //  end of the repeat-the-regex loop
 classify_end_regex_loop:

  //  cleanup time!
  //  remember to let go of the fd's and mmaps
  for (k = 0; k < maxhash; k++)
    {
      close (hfds[k]);
      munmap (hashes[k], hashlens[k] * sizeof (FEATUREBUCKET_TYPE));
    };
  //  and let go of the regex buffer
  crm_regfree (&regcb);

  //   and one last chance to force probabilities into the non-stuck zone
  //
  //  if (pic == 0.0 ) pic = DBL_MIN;
  //  if (pnic == 0.0 ) pnic = DBL_MIN;
  for (k = 0; k < maxhash; k++)
    if (ptc[k] < 10*DBL_MIN)
      ptc[k] = 10 * DBL_MIN;

  if (user_trace)
    {
      for (k = 0; k < maxhash; k++)
	fprintf (stderr, "Probability of match for file %ld: %f\n",
		 k, ptc[k]);
    };
  //
  tprob = 0.0;
  for (k = 0; k < succhash; k++)
    tprob = tprob + ptc[k];

  if (svlen > 0)
    {
      char buf[1024];
      double accumulator;
      double remainder;
      double overall_pR;
      long m;
      buf[0] = '\000';
      accumulator = 10 * DBL_MIN;
      for (m = 0; m < succhash; m++)
	{
	  accumulator = accumulator + ptc[m];
	};
      remainder = 10 * DBL_MIN;
      for (m = succhash; m < maxhash; m++)
	{
	  remainder = remainder + ptc[m];
	};
      overall_pR = log10 (accumulator) - log10 (remainder);

      //   note also that strcat _accumulates_ in stext.
      //   There would be a possible buffer overflow except that _we_
      //   control what gets written here.  So it's no biggie.
      if (tprob >= 0.5000)
	{
	  sprintf (buf,
		   "CLASSIFY succeeds; success probability: %6.4f  pR: %6.4f\n",
		   tprob, overall_pR);
	}
      else
	{
	  sprintf (buf,
		   "CLASSIFY fails; success probability: %6.4f  pR: %6.4f\n",
		   tprob, overall_pR);
	};
      if (strlen (stext) + strlen (buf) <= stext_maxlen)
	strcat (stext, buf);

      bestseen = 0;
      for (k = 0; k < maxhash; k++)
	if (ptc[k] > ptc[bestseen])
	  bestseen = k;
      remainder = 10 * DBL_MIN;
      for (m = 0; m < maxhash; m++)
	if (bestseen != m)
	  {
	    remainder = remainder + ptc[m];
	  };
      sprintf (buf, "Best match to file #%ld (%s) "
	       "prob: %6.4f  pR: %6.4f  \n",
	       bestseen, hashname[bestseen], ptc[bestseen],
	       (log10 (ptc[bestseen]) - log10 (remainder)));
      if (strlen (stext) + strlen (buf) <= stext_maxlen)
	strcat (stext, buf);

      sprintf (buf, "Total features in input file: %ld\n", totalfeatures);
      if (strlen (stext) + strlen (buf) <= stext_maxlen)
	strcat (stext, buf);

      for (k = 0; k < maxhash; k++)
	{
	  long m;
	  remainder = 10 * DBL_MIN;
	  for (m = 0; m < maxhash; m++)
	    if (k != m)
	      {
		remainder = remainder + ptc[m];
	      };
	  sprintf (buf,
		   "#%ld (%s):"
		   " features: %lu, hits: %ld, prob: %3.2e, pR: %6.2f \n",
		   k, hashname[k], fcounts[k], totalhits[k], ptc[k],
		   (log10 (ptc[k]) - log10 (remainder)));
	  // strcat (stext, buf);
	  if (strlen (stext) + strlen (buf) <= stext_maxlen)
	    strcat (stext, buf);
	};

      //  check here if we got enough room in stext to stuff everything
      //  perhaps we'd better raise a nonfatalerror, instead of just
      //  whining on stderr
      if (strcmp (&(stext[strlen (stext) - strlen (buf)]), buf) != 0)
	{
	  nonfatalerror ("WARNING: not enough room in the buffer to create "
			 "the statistics text.  Perhaps you could try bigger "
			 "values for MAX_CLASSIFIERS or MAX_FILE_NAME_LEN?",
			 " ");
	};
      crm_destructive_alter_nvariable (svrbl, svlen, stext, strlen (stext));
    };

  //
  //  Free the hashnames, to avoid a memory leak.
  //
  for (i = 0; i < maxhash; i++)
    free (hashname[i]);

  if (tprob < 0.5000)
    {
      if (user_trace)
	fprintf (stderr, "CLASSIFY was a FAIL, skipping forward.\n");
      //    and do what we do for a FAIL here
      csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
      csl->aliusstk[csl->mct[csl->cstmt]->nest_level] = -1;
      return (0);
    };

  //
  //   all done... if we got here, we should just continue execution
  if (user_trace)
    fprintf (stderr, "CLASSIFY was a SUCCESS, continuing execution.\n");
 regcomp_failed:
  return (0);
}
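
//////////////////////////////////////////////////////////////////////
//
//   Illustrative sketch only - the block below is fenced with #if 0,
//   is never compiled, and is not part of CRM114 proper.  It is a
//   minimal, self-contained demonstration of the subphrase feature
//   generation used above, assuming a 5-token window and the hctable
//   coefficients at the top of this file.  toy_hash() is a
//   hypothetical stand-in for strnhash(), not the real CRM114 hash.
//
#if 0
#include <stdio.h>

static unsigned long toy_hash (const char *s)
{
  unsigned long h = 5381;                  //  classic djb2, as a stand-in
  while (*s)
    h = h * 33 + (unsigned char) *s++;
  return h;
}

int main (void)
{
  static const long hc[] = { 1, 7, 3, 13, 5, 29, 11, 51, 23, 101 };
  const char *tokens[5] = { "now", "is", "the", "time", "for" };
  unsigned long pipe[5];
  long j, h;

  //  prime the pipe: slot 0 is the newest token, slot 4 the oldest
  for (h = 0; h < 5; h++)
    pipe[h] = toy_hash (tokens[4 - h]);

  //  2^(5-1) = 16 subphrases; bit (h-1) of j turns slot h on or off
  for (j = 0; j < 16; j++)
    {
      unsigned long h1 = pipe[0] * hc[0];  //  slot 0 is always on
      unsigned long h2 = pipe[0] * hc[1];
      for (h = 1; h < 5; h++)
	if ((j >> (h-1)) & 0x0001)
	  {
	    h1 = h1 + pipe[h] * hc[h*2];    //  even coefficients -> h1
	    h2 = h2 + pipe[h] * hc[h*2+1];  //  odd coefficients  -> h2
	  };
      printf ("j=%2ld   h1=%10lu   h2=%10lu\n", j, h1, h2);
    };
  return 0;
}
#endif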