// crm_css_maintenance_.c - Controllable Regex Mutilator, version v1.0 // Copyright 2001-2004 William S. Yerazunis, all rights reserved. // // This software is licensed to the public under the Free Software // Foundation's GNU GPL, version 2. You may obtain a copy of the // GPL by visiting the Free Software Foundations web site at // www.fsf.org, and a copy is included in this distribution. // // Other licenses may be negotiated; contact the // author for details. // // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the command line argc, argv extern int prog_argc; extern char **prog_argv; // the auxilliary input buffer (for WINDOW input) extern char *newinputbuf; // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *inbuf; extern char *outbuf; extern char *tempbuf; // How to microgroom a .css file that's getting full // // NOTA BENE NOTA BENE NOTA BENE NOTA BENE // // This whole section of code is under intense develoment; right now // it "works" but not any better than nothing at all. Be warned // that any patches issued on it may well never see the light of // day, as intense testing and comparison may show that the current // algorithms are, well, suckful. // // // There are two steps to microgrooming - first, since we know we're // already too full, we execute a 'zero unity bins'. Then, we see // how the file looks, and if necessary, we do a "scale by R", where // R is the "MICROGROOM_RESCALE_FACTOR" // void crm_microgroom (FEATUREBUCKET_TYPE *h, long hs, unsigned long hindex) { long i, j, k; static long microgroom_count = 0; long steps; long packstart; long packlen; // for stochastic grooming we need a place for the random... unsigned long randy; long zeroed_countdown; long force_rescale; j = 0; k = 0; zeroed_countdown = MICROGROOM_STOP_AFTER; // i = j = k = 0; microgroom_count++; if (user_trace) { if (microgroom_count == 1) fprintf (stderr, "CSS file too full: microgrooming this css chain: "); fprintf (stderr, " %ld ", microgroom_count); }; // micropack - start at initial chain start, move to back of // chain that overflowed, then scale just that chain. i = j = hindex % hs; if (i == 0) i = 1; while (h[i].value != 0) { i--; if (i < 1) i = hs - 1; if (i == j) break; // don't hang if we have a 100% full .css file // fprintf (stderr, "-"); } // now, move our index to point to the first bucket in this chain. i++; if (i >= hs) i = 1; packstart = i; // set our stochastic amnesia matcher - note that we add // our microgroom count so that we _eventually_ can knock out anything // even if we get a whole string of buckets with hash keys that all alias // to the same value. // // We also keep track of how many buckets we've zeroed and we stop // zeroing additional buckets after that point. NO! BUG! That // messes up the tail length, and if we don't repack the tail, then // features in the tail can become permanently inaccessible! Therefore, // we really can't stop in the middle of the tail (well, we could // stop zeroing, but we need to pass the full length of the tail in. // // Note that we can't do this "adaptively" in packcss, because zeroes // there aren't necessarily overflow chain terminators (because -we- // might have inserted them here. // steps = 0; force_rescale = 0; while (h[i].value != 0 ) { // fprintf (stderr, "="); randy = rand() + microgroom_count; if ( force_rescale || (( h[i].key + randy ) & MICROGROOM_STOCHASTIC_MASK ) == MICROGROOM_STOCHASTIC_KEY) { h[i].value = h[i].value * MICROGROOM_RESCALE_FACTOR; }; if (h[i].value == 0) zeroed_countdown--; i++; if (i >= hs ) i = 1; steps++; } packlen = steps; // now we pack the buckets crm_packcss (h, hs, packstart, packlen); } void crm_packcss (FEATUREBUCKET_TYPE *h, long hs, long packstart, long packlen) { // How we pack... // // We look at each bucket, and attempt to reinsert it at the "best" // place. We know at worst it will end up where it already is, and // at best it will end up lower (at a lower index) in the file, except // if it's in wraparound mode, in which case we know it will not get // back up past us (since the file must contain at least one empty) // and so it's still below us in the file. //fprintf (stderr, "Packing %ld len %ld total %ld", // packstart, packlen, packstart+packlen); // if (packstart+packlen >= hs) // fprintf (stderr, " BLORTTTTTT "); if (packstart+packlen <= hs) // no wraparound in this case { crm_packseg (h, hs, packstart, packlen); } else // wraparound mode - do it as two separate repacks { crm_packseg (h, hs, packstart, (hs - packstart)); crm_packseg (h, hs, 1, (packlen - (hs - packstart))); }; } void crm_packseg (FEATUREBUCKET_TYPE *h, long hs, long packstart, long packlen) { unsigned long ifrom, ito; unsigned long thash, tkey, tvalue; if (internal_trace) fprintf (stderr, " < %ld %ld >", packstart, packlen); for (ifrom = packstart; ifrom < packstart + packlen; ifrom++) { // Is it an empty bucket? (remember, we're compressing out // all placeholder buckets, so any bucket that's zero-valued // is a valid target.) if ( h[ifrom].value == 0) { // Empty bucket - turn it from marker to empty if (internal_trace) fprintf (stderr, "x"); h[ifrom].key = 0; h[ifrom].hash = 0; } else { if (internal_trace) fprintf (stderr, "-");}; } // Our slot values are now somewhat in disorder because empty // buckets may now have been inserted into a chain where there used // to be placeholder buckets. We need to re-insert slot data in a // bucket where it will be found. // ito = 0; for (ifrom = packstart; ifrom < packstart+packlen; ifrom++) { // Now find the next bucket to place somewhere // thash = h[ifrom].hash; tkey = h[ifrom].key; tvalue = h[ifrom].value; if (tvalue == 0) { if (internal_trace) fprintf (stderr, "X"); } else { ito = thash % hs; if (ito == 0) ito = 1; // fprintf (stderr, "a %ld", ito); while ( ! ( (h[ito].value == 0) || ( h[ito].hash == thash && h[ito].key == tkey ))) { ito++; if (ito >= hs) ito = 1; // fprintf (stderr, "a %ld", ito); }; // // found an empty slot, put this value there, and zero the // original one. Sometimes this is a noop. We don't care. if (internal_trace) { if ( ifrom == ito ) fprintf (stderr, "="); if ( ito < ifrom) fprintf (stderr, "<"); if ( ito > ifrom ) fprintf (stderr, ">"); }; h[ifrom].hash = 0; h[ifrom].key = 0; h[ifrom].value = 0; h[ito].hash = thash; h[ito].key = tkey; h[ito].value = tvalue; }; }; } int crm_create_cssfile(char *cssfile, long buckets, long major, long minor, long spectrum_start) { FILE *f; long i; FEATUREBUCKET_STRUCT feature = {0, 0, 0}; f = fopen (cssfile, "w"); if (!f) { fprintf (stderr, "\n Couldn't open file %s for writing; errno=%d .\n", cssfile, errno); return (EXIT_FAILURE); } // Initialize CSS file - zero all buckets feature.hash = major; feature.key = minor; feature.value = spectrum_start; for (i=0; i < buckets; i++) { if (fwrite(&feature, sizeof(feature), 1, f) != 1) { fprintf (stderr, "\n Couldn't initialize .CSS file %s, " "errno=%d.\n", cssfile, errno); return (EXIT_FAILURE); } // // HACK ALERT HACK ALERT HACK ALERT // // yeah,there's more efficient ways to do this, but this will // stay in cache; an IF-statement will need at least three ops as // well. Probably six of one... feature.hash = 0; feature.key = 0; feature.value = 0; } fclose (f); return (EXIT_SUCCESS); }