// crm_correlate.c - Controllable Regex Mutilator, version v1.0 // Copyright 2001-2004 William S. Yerazunis, all rights reserved. // // This software is licensed to the public under the Free Software // Foundation's GNU GPL, version 2. You may obtain a copy of the // GPL by visiting the Free Software Foundations web site at // www.fsf.org, and a copy is included in this distribution. // // Other licenses may be negotiated; contact the // author for details. // // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the command line argc, argv extern int prog_argc; extern char **prog_argv; // the auxilliary input buffer (for WINDOW input) extern char *newinputbuf; // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *inbuf; extern char *outbuf; extern char *tempbuf; // How to learn correlation-style- just append the text to be // learned to the target file. We don't care about the /regexes/ // int crm_expr_correlate_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb) { // learn the given text as correlative text // belonging to a particular type. // learn (classname) /regex/ (regex is ignored) // long i, j, k; char ptext[MAX_PATTERN]; // the regex pattern long plen; char ltext[MAX_PATTERN]; // the variable to learn long llen; char htext[MAX_PATTERN]; // the hash name long hlen; long cflags, eflags; struct stat statbuf; // for statting the hash file int hfd; // hashfile fd long hfsize; // size of the hash file // regex_t regcb; long textoffset; long textlen; long sense; long vhtindex; long microgroom; long logboost; long fev; long made_new_file; if (internal_trace) fprintf (stderr, "executing a LEARN (correlation format) \n"); // Keep the gcc compiler from complaining about unused variables // i = hctable[0]; // extract the hash file name crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len); hlen = apb->p1len; hlen = crm_nexpandvar (htext, hlen, MAX_PATTERN); // // extract the variable name (if present) crm_get_pgm_arg (ltext, MAX_PATTERN, apb->b1start, apb->b1len); llen = apb->b1len; llen = crm_nexpandvar (ltext, llen, MAX_PATTERN); // get the "this is a word" regex crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len); plen = apb->s1len; plen = crm_nexpandvar (ptext, plen, MAX_PATTERN); // set our cflags, if needed. The defaults are // "case" and "affirm", (both zero valued). // and "microgroom" disabled. cflags = REG_EXTENDED; eflags = 0; sense = +1; if (apb->sflags & CRM_NOCASE) { cflags = cflags || REG_ICASE; eflags = 1; if (user_trace) fprintf (stderr, "turning oncase-insensitive match\n"); }; if (apb->sflags & CRM_REFUTE) { sense = -sense; if (user_trace) fprintf (stderr, " refuting learning\n"); }; microgroom = 0; if (apb->sflags & CRM_MICROGROOM) { microgroom = 1; if (user_trace) fprintf (stderr, " enabling microgrooming.\n"); }; logboost = 0; if (apb->sflags & CRM_LOGBOOST) { logboost = 1; if (user_trace) fprintf (stderr, " enabling LOGBOOST learning.\n"); }; // // grab the filename, and stat the file // note that neither "stat", "fopen", nor "open" are // fully 8-bit or wchar clean... i = 0; while (htext[i] < 0x021) i++; j = i; while (htext[j] >= 0x021) j++; // filename starts at i, ends at j. null terminate it. htext[j] = '\000'; // and stat it to get it's length k = stat (&htext[i], &statbuf); made_new_file = 0; // quick check- does the file even exist? if (k != 0) { // file didn't exist... create it FILE *f; if (user_trace) fprintf (stderr, "\nHad to create new correlate file %s\n", &htext[i]); f = fopen (&htext[i], "w"); if (!f) { fprintf (stderr, "\n Couldn't open your new correlate file %s for writing; errno=%d .\n", &htext[i], errno); exit (EXIT_FAILURE); }; // fputc ('\001', f); don't do any output at all. made_new_file = 1; // fclose (f); // and reset the statbuf to be correct k = stat (&htext[i], &statbuf); }; // hfsize = statbuf.st_size; if (user_trace) fprintf (stderr, "Correlation text file %s has length %ld characters\n", &htext[i], hfsize / sizeof (FEATUREBUCKET_TYPE)); // // open the text file into memory so we can bitwhack it // hfd = open (&(htext[i]), O_RDWR); if (hfd < 0) { fev = fatalerror ("Couldn't open the correlation file named: ", &htext[i]); return (fev); }; // // get the text to "learn" (well, append to the correlation file) // // This is the text that we'll append to the correlation file. k = 0; j = 0; i = 0; if (llen > 0) { vhtindex = crm_vht_lookup (vht, ltext, llen); } else { vhtindex = crm_vht_lookup (vht, ":_dw:", 5); }; if (vht[vhtindex] == NULL) { long q; q = fatalerror (" Attempt to LEARN from a nonexistent variable ", ltext); return (q); }; mdw = NULL; if (tdw->filetext == vht[vhtindex]->valtxt) mdw = tdw; if (cdw->filetext == vht[vhtindex]->valtxt) mdw = cdw; if (mdw == NULL) { long q; q = fatalerror (" Bogus text block containing variable ", ltext); return (q); } textoffset = vht[vhtindex]->vstart; textlen = vht[vhtindex]->vlen; if (user_trace) { fprintf (stderr, "learning the text (len %ld) :", textlen); fwrite (&(mdw->filetext[textoffset]), ((textlen < 128) ? textlen : 128),1,stderr); fprintf (stderr, "\n"); }; // append the "learn" text to the end of the file. // lseek (hfd, 0, SEEK_END); write (hfd, &(mdw->filetext[textoffset]), textlen); close (hfd); crm_regfree (®cb); return (0); } // How to do a correlate-style CLASSIFY on some text. // int crm_expr_correlate_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb) { // classify the sparse spectrum of this input window // as belonging to a particular type. // // This code should look very familiar- it's cribbed from // the code for LEARN // long i, j, k; char ptext[MAX_PATTERN]; // the regex pattern long plen; char ltext[MAX_PATTERN]; // the variable to classify long llen; // the hash file names char htext[MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN]; long htext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN; long hlen; // the match statistics variable inbuf char stext[MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100)]; long stext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100); long slen; char svrbl[MAX_PATTERN]; // the match statistics text buffer long svlen; long fnameoffset; char fname[MAX_FILE_NAME_LEN]; long eflags; long cflags; long vhtindex; struct stat statbuf; // for statting the hash file regex_t regcb; unsigned long fcounts[MAX_CLASSIFIERS]; // total counts for feature normalize double cpcorr[MAX_CLASSIFIERS]; // corpus correction factors long long linear_hits[MAX_CLASSIFIERS]; // actual hits per classifier long long square_hits[MAX_CLASSIFIERS]; // square of runlenths of match long incr_hits[MAX_CLASSIFIERS]; // 1+2+3... hits per classifier long long total_linear_hits; // actual total linear hits for all classifiers long long total_square_hits; // actual total square hits for all classifiers long long total_features; // total number of characters in the system long totalhits [MAX_CLASSIFIERS]; double tprob; // total probability in the "success" domain. double textlen; // text length - rougly corresponds to // information content of the text to classify double ptc[MAX_CLASSIFIERS]; // current running probability of this class double renorm = 0.0; int hfds[MAX_CLASSIFIERS]; char *hashes[MAX_CLASSIFIERS]; long hashlens[MAX_CLASSIFIERS]; char *hashname[MAX_CLASSIFIERS]; long succhash; long vbar_seen; // did we see '|' in classify's args? long maxhash; long fnstart, fnlen; long fn_start_here; long textoffset; long textmaxoffset; long bestseen; long thistotal; if (internal_trace) fprintf (stderr, "executing a CLASSIFY\n"); // extract the variable name (if present) // crm_get_pgm_arg (ltext, MAX_PATTERN, apb->b1start, apb->b1len); llen = apb->b1len; llen = crm_nexpandvar (ltext, llen, MAX_PATTERN); // extract the hash file names crm_get_pgm_arg (htext, htext_maxlen, apb->p1start, apb->p1len); hlen = apb->p1len; hlen = crm_nexpandvar (htext, hlen, htext_maxlen); // extract the "this is a word" regex // crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len); plen = apb->s1len; plen = crm_nexpandvar (ptext, plen, MAX_PATTERN); // extract the optional "match statistics" variable // crm_get_pgm_arg (svrbl, MAX_PATTERN, apb->p2start, apb->p2len); svlen = apb->p2len; svlen = crm_nexpandvar (svrbl, svlen, MAX_PATTERN); { long vstart, vlen; crm_nextword (svrbl, svlen, 0, &vstart, &vlen); memmove (svrbl, &svrbl[vstart], vlen); svlen = vlen; svrbl[vlen] = '\000'; }; // status variable's text (used for output stats) // stext[0] = '\000'; slen = 0; // set our flags, if needed. The defaults are // "case" cflags = REG_EXTENDED; eflags = 0; if (apb->sflags & CRM_NOCASE) { cflags += REG_ICASE; eflags = 1; }; // compile the word regex if ( internal_trace) fprintf (stderr, "\nWordmatch pattern is %s", ptext); i = crm_regcomp (®cb, ptext, plen, cflags); if ( i > 0) { crm_regerror ( i, ®cb, tempbuf, data_window_size); nonfatalerror ("Regular Expression Compilation Problem:", tempbuf); goto regcomp_failed; }; // Now, the loop to open the files. bestseen = 0; thistotal = 0; // initialize our arrays for N .css files for (i = 0; i < MAX_CLASSIFIERS; i++) { fcounts[i] = 0; // check later to prevent a divide-by-zero // error on empty .css file cpcorr[i] = 0.0; // corpus correction factors linear_hits[i] = 0; // linear hits square_hits[i] = 0; // square of the runlength incr_hits[i] = 0; // 1+2+3... hits hits totalhits[i] = 0.0; // absolute hit counts ptc[i] = 0.5; // priori probability }; // vbar_seen = 0; maxhash = 0; succhash = 0; fnameoffset = 0; // now, get the file names and mmap each file // get the file name (grody and non-8-bit-safe, but doesn't matter // because the result is used for open() and nothing else. // GROT GROT GROT this isn't NULL-clean on filenames. But then // again, stdio.h itself isn't NULL-clean on filenames. if (user_trace) fprintf (stderr, "Classify list: -%s- \n", htext); fn_start_here = 0; fnlen = 1; while ( fnlen > 0 && ((maxhash < MAX_CLASSIFIERS-1))) { crm_nextword (htext, hlen, fn_start_here, &fnstart, &fnlen); if (fnlen > 0) { strncpy (fname, &htext[fnstart], fnlen); fn_start_here = fnstart + fnlen + 1; fname[fnlen] = '\000'; if (user_trace) fprintf (stderr, "Classifying with file -%s- "\ "succhash=%ld, maxhash=%ld\n", fname, succhash, maxhash); if ( fname[0] == '|' && fname[1] == '\000') { if (vbar_seen) { nonfatalerror ("Only one ' | ' allowed in a CLASSIFY. \n" , "We'll ignore it for now."); } else { succhash = maxhash; }; vbar_seen ++; } else { // be sure the file exists // stat the file to get it's length k = stat (fname, &statbuf); // quick check- does the file even exist? if (k != 0) { nonfatalerror ("Nonexistent Classify table named: ", fname); } else { // file exists - do the open/process/close // hashlens[maxhash] = statbuf.st_size; // mmap the hash file into memory so we can bitwhack it hfds[maxhash] = open (fname, O_RDONLY); if (hfds[maxhash] < 0) { nonfatalerror ("Couldn't open the table file", fname); } else { hashes[maxhash] = (char *) mmap (NULL, hashlens[maxhash], PROT_READ, MAP_SHARED, hfds[maxhash], 0); if (hashes[maxhash] == MAP_FAILED ) { nonfatalerror ("Couldn't memory-map the table file", fname); } else { // // Check to see if this file is the right version // // FIXME : for now, there's no version number // associated with a .correllation file long fev; if (0) //(hashes[maxhash][0].hash != 1 || // hashes[maxhash][0].key != 0) { fev =fatalerror ("The .css file is the wrong version! Filename is: ", &htext[i]); return (fev); }; // // save the name for later... // hashname[maxhash] = (char *) malloc (fnlen+10); if (!hashname[maxhash]) untrappableerror( "Couldn't malloc hashname[maxhash]\n","We need that part later, so we're stuck. Sorry."); strncpy(hashname[maxhash],fname,fnlen); hashname[maxhash][fnlen]='\000'; maxhash++; }; }; }; }; if (maxhash > MAX_CLASSIFIERS-1) nonfatalerror ("Too many classifier files.", "Some may have been disregarded"); }; }; // // If there is no '|', then all files are "success" files. if (succhash == 0) succhash = maxhash; // a CLASSIFY with no arguments is always a "success". if (maxhash == 0) return (0); // now, set up the normalization factor fcount[] if (user_trace) fprintf (stderr, "Running with %ld files for success out of %ld files\n", succhash, maxhash ); // sanity checks... Uncomment for super-strict CLASSIFY. // // do we have at least 1 valid .css files? if (maxhash == 0) { fatalerror ("Couldn't open at least 2 .css files for classify().", ""); }; // do we have at least 1 valid .css file at both sides of '|'? //if (!vbar_seen || succhash < 0 || (maxhash < succhash + 2)) // { // nonfatalerror ( // "Couldn't open at least 1 .css file per SUCC | FAIL classes " // " for classify().\n","Hope you know what are you doing."); // }; // // now all of the files are mmapped into memory, // and we can do the correlations and add up matches. i = 0; j = 0; k = 0; thistotal = 0; if (llen > 0) { vhtindex = crm_vht_lookup (vht, ltext, llen ); } else { vhtindex = crm_vht_lookup (vht, ":_dw:", 5); } if (vht[vhtindex] == NULL) { return (fatalerror (" Attempt to CLASSIFY from a nonexistent variable ", ltext)); }; mdw = NULL; if (tdw->filetext == vht[vhtindex]->valtxt) mdw = tdw; if (cdw->filetext == vht[vhtindex]->valtxt) mdw = cdw; if (mdw == NULL) return ( fatalerror (" Bogus text block containing variable ", ltext)); textoffset = vht[vhtindex]->vstart; textmaxoffset = textoffset + vht[vhtindex]->vlen; textlen = (vht[vhtindex]->vlen); if (textlen < 1.0) textlen = 1.0; // // We keep track of the hits in these categories // linear_hits[MAX_CLASSIFIERS]; // actual hits per classifier // square_hits[MAX_CLASSIFIERS]; // square of runlenths of match // incr_hits[MAX_CLASSIFIERS]; // 1+2+3... hits per classifier // // Now we do the actual correllation. // for each file... // slide the incoming text (mdw->filetext[textofset]) // across the corpus text (hashes[] from 0 to hashlens[]) // and count the bytes that are the same, the runlengths, // etc. for (k = 0; k < maxhash; k++) { long it; // it is the start index into the tested text long ik; // ik is the start index into the known corpus text long ilm; // ilm is the "local" matches (N in a row) // for each displacement of the test text... for (ik = 0; ik < (hashlens[k] - vht[vhtindex]->vlen); ik++) { ilm = 0; // for each position in the test text... for (it = 0; it < vht[vhtindex]->vlen; it++) { // do the characters in this position match? if ( hashes[k][ik+it] == mdw->filetext[textoffset+it]) { // yes they matched linear_hits[k]++; ilm++; square_hits[k] = square_hits[k] + (ilm * ilm); } else { // nope, they didn't match. // So, we do the end-of-runlength stuff: ilm = 0; }; if (0) fprintf (stderr, "ik: %ld it: %ld c1: %c c2: %c ilm: %ld lin: %lld sqr: %lld\n", ik, it, hashes[k][ik+it], mdw->filetext[textoffset+it], ilm, linear_hits[k], square_hits[k]); }; }; }; // Now we have the total hits for each text corpus. We can then // turn that into a vague probability measure, and then renormalize // that to get probabilities. // // But first, let's reflect on what we've got here. We our test // text, and we have a corpus which is "nominally correllated", // and another corpus that is nominally uncorrellated. // // The uncorrellated text will have an average match rate of 1/256'th // in the linear domain (well, for random bytes; english text will match // a lot more often, due to the fact that ASCII only uses the low 7 // bits, most text is written in lower case, Zipf's law, etc. // // We can calculate a predicted total on a per-character basis for all // of the corpi, then use that as an average expectation. // Calculate total hits total_linear_hits = 0; total_square_hits = 0; total_features = 0; for (k = 0; k < maxhash; k++) { total_linear_hits += linear_hits[k]; total_square_hits += square_hits[k]; total_features += hashlens[k]; }; for (k = 0; k < maxhash; k++) { if (hashlens > 0 && total_features > 0 ) { // Note that we don't normalize the probabilities yet- we do // that down below. // // .00397 is not a magic number - it's the random coincidence // rate for 1 chance in 256, with run-length-squared boost. // .00806 is the random coincidence rate for 7-bit characters. // //ptc[k] = ((0.0+square_hits[k] - (.00397 * hashlens[k] ))); ptc[k] = ((0.0+square_hits[k] - (.00806 * hashlens[k] ))) / hashlens[k]; if (ptc[k] < 0) ptc[k] = 10*DBL_MIN; } else { ptc [k] = 0.5; }; } // ptc[k] = (sqrt (0.0 + square_hits[k])-linear_hits[k] ) / hashlens[k] ; // ptc[k] = (0.0 + square_hits[k] - linear_hits[k] ) ; // ptc[k] = ((0.0 + square_hits[k]) / hashlens[k]) ; // ptc[k] = sqrt ((0.0 + square_hits[k]) / hashlens[k]) ; // ptc[k] = ((0.0 + linear_hits[k]) / hashlens[k]) ; // calculate renormalizer (the Bayesian formula's denomenator) renorm = 0.0; // now calculate the per-ptc numerators for (k = 0; k < maxhash; k++) renorm = renorm + (ptc[k]); // check for a zero normalizer if (renorm == 0) renorm = 1.0; // and renormalize for (k = 0; k < maxhash; k++) ptc[k] = ptc[k] / renorm; // if we have underflow (any probability == 0.0 ) then // bump the probability back up to 10^-308, or // whatever a small multiple of the minimum double // precision value is on the current platform. // for (k = 0; k < maxhash; k++) if (ptc[k] < 10*DBL_MIN) ptc[k] = 10 * DBL_MIN; if (internal_trace) { for (k = 0; k < maxhash; k++) { fprintf (stderr, " file: %ld linear: %lld square: %lld RMS: %6.4e ptc[%ld] = %6.4e \n", k, linear_hits[k], square_hits[k], sqrt(0.0+square_hits[k]), k, ptc[k]); }; }; // ; // end of repeat-the-regex loop // cleanup time! // remember to let go of the fd's and mmaps for (k = 0; k < maxhash; k++) { close (hfds [k]); munmap (hashes[k], hashlens[k]); }; // and let go of the regex buffery crm_regfree (®cb); if (user_trace) { for (k = 0; k < maxhash; k++) fprintf (stderr, "Probability of match for file %ld: %f\n", k, ptc[k]); }; // tprob = 0.0; for (k = 0; k < succhash; k++) tprob = tprob + ptc[k]; if (svlen > 0) { char buf[1024]; double accumulator; double remainder; double overall_pR; long m; buf [0] = '\000'; accumulator = 10 * DBL_MIN; for (m = 0; m < succhash; m++) { accumulator = accumulator + ptc[m]; }; remainder = 10 * DBL_MIN; for (m = succhash; m < maxhash; m++) if (bestseen != m) { remainder = remainder + ptc[m]; }; overall_pR = log10 (accumulator) - log10 (remainder); // note also that strcat _accumulates_ in stext. // There would be a possible buffer overflow except that _we_ control // what gets written here. So it's no biggie. if (tprob > 0.5000) { sprintf (buf, "CLASSIFY succeeds; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR ); } else { sprintf (buf, "CLASSIFY fails; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR ); }; if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); bestseen = 0; for (k = 0; k < maxhash; k++) if (ptc[k] > ptc[bestseen] ) bestseen = k; remainder = 10 * DBL_MIN; for (m = 0; m < maxhash; m++) if (bestseen != m) { remainder = remainder + ptc[m]; }; sprintf (buf, "Best match to file #%ld (%s) "\ "prob: %6.4f pR: %6.4f \n", bestseen, hashname[bestseen], ptc[bestseen], (log10(ptc[bestseen]) - log10(remainder))); if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); sprintf (buf, "Total features in input file: %ld\n", hashlens[bestseen]); if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); for (k = 0; k < maxhash; k++) { long m; remainder = 10 * DBL_MIN; for (m = 0; m < maxhash; m++) if (k != m) { remainder = remainder + ptc[m]; }; sprintf (buf, "#%ld (%s):"\ " features: %ld, RMS hits: %lld, prob: %3.2e, pR: %6.2f \n", k, hashname[k], hashlens[k], square_hits[k], ptc[k], (log10 (ptc[k]) - log10 (remainder) ) ); // strcat (stext, buf); if (strlen(stext)+strlen(buf) <= stext_maxlen) strcat (stext, buf); }; // check here if we got enough room in stext to stuff everything // perhaps we'd better rise a nonfatalerror, instead of just // whining on stderr if (strcmp(&(stext[strlen(stext)-strlen(buf)]), buf) != 0) { nonfatalerror( "WARNING: not enough room in the buffer to create " "the statistics text. Perhaps you could try bigger " "values for MAX_CLASSIFIERS or MAX_FILE_NAME_LEN?", " "); }; crm_destructive_alter_nvariable (svrbl, svlen, stext, strlen (stext)); }; // // Free the hashnames, to avoid a memory leak. // for (i = 0; i < maxhash; i++) free (hashname[i]); if (tprob < 0.5000) { if (user_trace) fprintf (stderr, "CLASSIFY was a FAIL, skipping forward.\n"); // and do what we do for a FAIL here csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1; csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1; return (0); }; // // all done... if we got here, we should just continue execution if (user_trace) fprintf (stderr, "CLASSIFY was a SUCCESS, continuing execution.\n"); regcomp_failed: return (0); };