// crm_winnow_maintenance.c - Controllable Regex Mutilator, version v1.0 // Copyright 2001-2004 William S. Yerazunis, all rights reserved. // // This software is licensed to the public under the Free Software // Foundation's GNU GPL, version 2. You may obtain a copy of the // GPL by visiting the Free Software Foundations web site at // www.fsf.org, and a copy is included in this distribution. // // Other licenses may be negotiated; contact the // author for details. // // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the command line argc, argv extern int prog_argc; extern char **prog_argv; // the auxilliary input buffer (for WINDOW input) extern char *newinputbuf; // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *inbuf; extern char *outbuf; extern char *tempbuf; // How to microgroom a .css file that's getting full // // NOTA BENE NOTA BENE NOTA BENE NOTA BENE // // This whole section of code is under intense develoment; right now // it "works" but not any better than nothing at all. Be warned // that any patches issued on it may well never see the light of // day, as intense testing and comparison may show that the current // algorithms are, well, suckful. // // // There are two steps to microgrooming - first, since we know we're // already too full, we execute a 'zero unity bins'. Then, we see // how the file looks, and if necessary, we do a "scale by R", where // R is the "MICROGROOM_RESCALE_FACTOR" // void crm_winnow_microgroom (WINNOW_FEATUREBUCKET_STRUCT *h, unsigned char *xhashes, unsigned long hfsize, unsigned long hindex) { long i, j, k; static long microgroom_count = 0; long steps; long packstart; long packlen; // for stochastic grooming we need a place for the random... unsigned long randy; long zeroed_countdown; long force_rescale; j = 0; k = 0; zeroed_countdown = MICROGROOM_STOP_AFTER; // // micropack - start at initial chain start, move to back of // chain that overflowed, then scale just that chain. k = 0; i = j = hindex; if (i == 0) i = 1; if (j == 0) j = 1; microgroom_count++; // if (user_trace) //if (microgroom_count == 1) // fprintf (stderr, "="); if (0) { if (microgroom_count == 1) fprintf (stderr, " OSBW file too full: microgrooming "); fprintf (stderr, " .. pass %ld", microgroom_count); fprintf (stderr, " at %lu ", i); fprintf (stderr, "(val) %f ", h[i].value); }; if (h[i].value == 0) { nonfatalerror ("Your program has thrown a willie during microgrooming", "... which should never happen in polite company. " " Please file a bug report."); }; while (h[i].value != 0) { i--; if (i < 1) i = hfsize - 1; if (i == j) break; // don't hang if we have a 100% full .css file } // now, move our index to point to the first bucket in this chain. i++; if (i >= hfsize ) i = 1; packstart = i; // fprintf (stderr, "start was %ld, backstep to %ld ", // hindex, packstart); // set our stochastic amnesia matcher - note that we add // our microgroom count so that we _eventually_ can knock out anything // even if we get a whole string of buckets with hash keys that all alias // to the same value. // // We also keep track of how many buckets we've zeroed and we stop // zeroing additional buckets after that point. NO! THAT'S A BUG! // FEATURES IN THE TAIL CAN NO LONGER BE ACCESSED UNLESS THE WHOLE TAIL // IS PACKED! // steps = 0; force_rescale = 0; while (h[i].value != 0 ) { // fprintf (stderr, "="); randy = rand() + microgroom_count; if ( (( h[i].key + randy ) & MICROGROOM_STOCHASTIC_MASK ) == MICROGROOM_STOCHASTIC_KEY) { // for winnow, we normalize toward 1.0 by averaging with 1. h[i].value = (h[i].value + 1.0 )/ 2.0; if ( (h[i].value > 0.9 && h[i].value < 1.1) || h[i].value == 0.0 ) { // fprintf (stderr, "|"); zeroed_countdown--; h[i].value = 0.0; h[i].key = 0; h[i].hash = 0; }; }; i++; if ( i >= hfsize ) i = 1; steps++; }; packlen = steps; //fprintf (stderr, "with steps %ld ", packlen); // now we pack the buckets crm_packcss_winnow ((FEATUREBUCKET_TYPE *)h, xhashes, hfsize, packstart, packlen); } void crm_packcss_winnow (FEATUREBUCKET_TYPE *h, unsigned char *xhashes, long hs, long packstart, long packlen) { // How we pack... // // We look at each bucket, and attempt to reinsert it at the "best" // place. We know at worst it will end up where it already is, and // at best it will end up lower (at a lower index) in the file, except // if it's in wraparound mode, in which case we know it will not get // back up past us (since the file must contain at least one empty) // and so it's still below us in the file. if (packstart+packlen <= hs) // no wraparound in this case { crm_packseg_winnow (h, xhashes, hs, packstart, packlen); } else // wraparound mode - do it as two separate repacks { crm_packseg_winnow (h, xhashes, hs, packstart, (hs - packstart)); crm_packseg_winnow (h, xhashes, hs, 1, (packlen - (hs - packstart))); }; } void crm_packseg_winnow (FEATUREBUCKET_TYPE *h, unsigned char *xhashes, long hs, long packstart, long packlen) { unsigned long ifrom, ito; unsigned long thash, tkey, tvalue, txhash; if (internal_trace) fprintf (stderr, " < %ld %ld >", packstart, packlen); for (ifrom = packstart; ifrom < packstart + packlen; ifrom++) { // Is it an empty bucket? (remember, we're compressing out // all placeholder buckets, so any bucket that's zero-valued // is a valid target.) if ( h[ifrom].value == 0) { // Empty bucket - turn it from marker to empty h[ifrom].key = 0; h[ifrom].hash = 0; xhashes[ifrom] = 0; } } // Our slot values are now somewhat in disorder because empty // buckets may now have been inserted into a chain where there used // to be placeholder buckets. We need to re-insert slot data in a // bucket where it will be found. // for (ifrom = packstart; ifrom < packstart+packlen; ifrom++) { // Now find the next bucket to place somewhere // thash = h[ifrom].hash; tkey = h[ifrom].key; tvalue = h[ifrom].value; txhash = xhashes[ifrom]; if (tvalue != 0) { ito = thash % hs; if (ito == 0) ito = 1; // fprintf (stderr, "a %ld", ito); while ( ! (h[ito].value == 0 || (h[ito].hash == thash && h[ito].key == tkey ))) { ito++; if (ito >= hs) ito = 1; // fprintf (stderr, "A %ld", ito); }; // // found an empty slot, put this value there, and zero the // original one. Sometimes this is a noop. We don't care. // fprintf (stderr, "b %ld", ito); h[ifrom].hash = 0; h[ifrom].key = 0; h[ifrom].value = 0; xhashes[ifrom] = 0; // fprintf (stderr, "c %ld ", ito ); h[ito].hash = thash; h[ito].key = tkey; h[ito].value = tvalue; xhashes[ito] = txhash; // fprintf (stderr, "d %ld", ito); }; }; }