// crm_preprocessor.c - Controllable Regex Mutilator, version v1.0 // Copyright 2001-2004 William S. Yerazunis, all rights reserved. // // This software is licensed to the public under the Free Software // Foundation's GNU GPL, version 2. You may obtain a copy of the // GPL by visiting the Free Software Foundations web site at // www.fsf.org, and a copy is included in this distribution. // // Other licenses may be negotiated; contact the // author for details. // // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the command line argc, argv extern int prog_argc; extern char **prog_argv; // the auxilliary input buffer (for WINDOW input) extern char *newinputbuf; // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *inbuf; extern char *outbuf; extern char *tempbuf; // crm preprocessor - pre-process a CRM file to make it // palatable to the sorry excuse we have for a compiler. int crm_preprocessor (CSL_CELL *csl, int flags) { int lflag; int status; long i, j; long done; regex_t preg; int numinserts = 0; int maxinserts = DEFAULT_MAX_INSERTS; regmatch_t matches[3]; // regex commentary: // we want to match both // "\n[ ]*insert[ ]*[file][ ]*\n" // and // "^[ ]*insert[ ]*[file][ ]*\n" // // This is the file insertion regex. Note that it does NOT allow // spaces in filenames, nor does it deal with embedded #comments // but then again, the "fixes" to deal with spaces in filenames // also don't deal wth embedded #comments, because #comments // themselves aren't dealt with till lower down in the code.. // // However, there's another problem with the above. The trailing // newline may not be there - consider: // // #insert foo.crm ; output /hello, world!\n/ // // which fails because we aren't regex_conforming. // So, what we really need is to grab the next nonblank token, then // either get a newline or a semicolon. char *insert_regex = // "\n[[:blank:]]*(insert)[[:blank:]]+([[:graph:]]+)[[:blank:]]*\n"; "\n[[:blank:]]*(insert)[[:blank:]]+([[:graph:]]+)[[:blank:]]*[\n;]"; // // if (internal_trace ) fprintf (stderr, " preprocessor - #insert processing...\n"); lflag = 0; i = 0; done = 0; // // Compile the insert regex // i = crm_regcomp (&preg, insert_regex, strlen (insert_regex), REG_EXTENDED | REG_ICASE | REG_NEWLINE); if ( i != 0) { crm_regerror ( i, &preg, tempbuf, data_window_size); untrappableerror ( "Regular Expression Compilation Problem during INSERT processing:", tempbuf); }; // // Do the initial breaking pass // crm_break_statements (0, csl->nchars, csl); if (internal_trace) fprintf (stderr, "After first pass, breaking statements we have -->>%s<<--\nlength %ld\n", csl->filetext, csl->nchars); while (!done) { j = crm_regexec ( &preg, csl->filetext, csl->nchars, 3, matches, lflag, NULL); if ( j != 0) { if (internal_trace) fprintf (stderr, "No insert files left to do.\n"); done = 1; } else { char insertfilename [MAX_FILE_NAME_LEN]; struct stat statbuf; for (j = 0; j < matches[2].rm_eo - matches[2].rm_so && j < MAX_FILE_NAME_LEN; j++) insertfilename[j] = csl->filetext[matches[2].rm_so + j]; insertfilename[j] = '\000'; // OK, we have a filename; check to see if it will blow the // gaskets on the filesystem or not: // if (matches[2].rm_eo - matches[2].rm_so > MAX_FILE_NAME_LEN-1) untrappableerror ("INSERT Filename was too long! Here's the" "first part of it: ", insertfilename); // stat the file - if 0, file exists status = stat ( insertfilename, &statbuf ); if (! status ) { // // OK, now we have to "insert" the file, but we have to // do it gracefully. In particular, the file itself // must be loaded, then newline-fixupped, then // we know it's actual size and can actually -insert- // it. // // We malloc a big hunk of memory, read the file in. // We expand it there (with impunity), then // we make a temporary copy in malloced memory, // and do the real insertion. CSL_CELL *ecsl; char *insert_buf; if (user_trace) { fprintf (stderr, "Inserting file '%s' .\n", insertfilename); }; // To keep the matcher from looping, we change the string // 'insert' to 'insert=' . Cool, eh? // csl->filetext[matches[1].rm_eo] = '='; // smash in an "=" ecsl = (CSL_CELL *) malloc (sizeof (CSL_CELL)); insert_buf = malloc (sizeof (char) * max_pgmsize); if (!insert_buf || !ecsl) untrappableerror ("Couldn't malloc enough memory to do" " the insert of file ", insertfilename); // Loop prevention check - too many inserts? // numinserts++; if (numinserts > maxinserts) untrappableerror ("Too many inserts! Limit exceeded with" "filename : ", insertfilename); ecsl->filetext = insert_buf; ecsl->nchars = 0; // OK, we now have a buffer. Read the file in... { int fd; fd = open (insertfilename, O_RDONLY); read (fd, ecsl->filetext, statbuf.st_size); close (fd); // // file's read in, put in a trailing newline ecsl->nchars = statbuf.st_size; ecsl->filetext[ecsl->nchars] = '\n'; ecsl->nchars ++; ecsl->filename = insertfilename; // // now do the statement-break thing on this file crm_break_statements (0, ecsl->nchars, ecsl); // // and we have the expanded text ready to insert. // // will it fit? // if ( (csl->nchars + ecsl->nchars + 64) > (sizeof (char) * max_pgmsize)) untrappableerror ( " Program file buffer overflow when " " INSERTing file ", insertfilename); // Does the result end with a newline? If not, fix it. if (ecsl->filetext[ecsl->nchars-1] != '\n') { ecsl->filetext [ecsl->nchars ] = '\n'; ecsl->nchars++; }; // Does the result end with two newlines? Fix // that, too. //if (ecsl->filetext[ecsl->nchars-1] == '\n' // && ecsl->filetext[ecsl->nchars-2] == '\n') // { // ecsl->nchars--; // }; // Make a hole in the csl->filetext // // (note- Fidelis' points out that we need to pace // off from the end of matches[0] so as to not smash // trailing stuff on the line. // memmove (&(csl->filetext[matches[0].rm_eo + ecsl->nchars]), &(csl->filetext[matches[0].rm_eo]), csl->nchars - matches[0].rm_eo + 1); // +1 for '\0'! // // and put the new text into that hole // memmove (&(csl->filetext[matches[0].rm_eo]), ecsl->filetext, ecsl->nchars); // Mark the new length of the csl text. if (internal_trace) fprintf (stderr, "Old length: %ld, ", csl->nchars); csl->nchars += ecsl->nchars; if (internal_trace) fprintf (stderr, "new length: %ld\n ", csl->nchars); // Now we clean up (de-malloc all that memory) free (ecsl->filetext); free (ecsl); } } else { untrappableerror (" I'm having a problem inserting file ", insertfilename); }; i = matches[1].rm_so + 1; }; if (internal_trace) fprintf (stderr, "----------Result after preprocessing-----\n" "%s" "\n-------------end preprocessing------\n", csl->filetext); }; // define a hash of the expanded program for sanity checking on bugreps: // { char myhash[16]; sprintf (myhash, "%08lX", strnhash (csl->filetext, csl->nchars)); myhash[8] = '\0'; crm_set_temp_var (":_pgm_hash:", myhash); }; /// GROT GROT GROT for some reason, Gnu Regex segfaults if it // tries to free this register. // crm_regfree (&preg); //fprintf (stderr, "returning\n"); return (0); }; // // Set up statement breaks. // // If we're not in a nesting (paren, angle, box, slash) then // we need to assure that there are newlines before and after // any { and }, and that there is a newline after every ; and // before every #. // // If we ARE in a nesting, then all characters pass unchanged. // // Note that this is an "in-place" mutilation, not a copying mutilation. // void crm_break_statements (long ini, long nchars, CSL_CELL *csl) { int seennewline; int in_comment; int neednewline; int paren_nest, angle_nest, box_nest, slash_nest; long i; seennewline = 1; neednewline = 0; in_comment = 0; paren_nest = slash_nest = angle_nest = box_nest = 0; if ( internal_trace ) fprintf (stderr, " preprocessor - breaking statmeents... \n"); for (i = ini; i < ini + nchars; i++) { // now, no matter what, we're looking at a non-quoted character. // // are we looking at a nonprinting character? if (csl->filetext[i] < 0x021 ) { if (csl->filetext[i] == '\n') { // get rid of extraneous newlines. //if (internal_trace) // fprintf (stderr, " newline ."); seennewline = 1; neednewline = 0; in_comment = 0; // Userbug containment - a newline closes all nests paren_nest = slash_nest = angle_nest = box_nest = 0; }; // other nonprinting characters do not change things. } else { // we don't do any processing inside a comment! if ( in_comment ) { // inside a comment, we don't do squat to printing chars. // unless it's an escaped hash; in that case // it's end-of-comment if (csl->filetext[i] == '#' && (i - 1) >= 0 && csl->filetext[i-1] == '\\') { neednewline = 1; seennewline = 0; in_comment = 0; }; } else { // we are looking at a printing character, so maybe we have // to add a newline. Or maybe not... if (neednewline) { if ((csl->nchars+1) > (sizeof(char) * max_pgmsize)) untrappableerror ( "Program file buffer overflow - " "post-inserting newline to: ", &(csl->filetext[i])); // we need a newline and are looking at a printingchar // so we need to insert a newline. memmove ( &(csl->filetext[i+1]), &(csl->filetext[i]), strlen (&csl->filetext[i])+1); csl->filetext[i] = '\n'; i++; csl->nchars++; nchars++; neednewline = 0; seennewline = 1; }; // switch (csl->filetext[i]) { case '\\': { // if it's a backslash at the end of a line, // delete +both+ the backslash and newline, making // one big line out of it. // // We do this whether or not we're in a nesting. if ( csl->filetext[i+1] == '\n' ) { if (internal_trace) fprintf (stderr, " backquoted EOL - splicing.\n"); memmove ( &(csl->filetext[i]), &(csl->filetext[i+2]), strlen (&csl->filetext[i+2])+1); csl->nchars--; csl->nchars--; nchars--; nchars--; i--; } else { // Otherwise, we _always_ step over the next // character- it can't change nesting, it can't // close a string. Thus, the preprocessor will // do nothing to it. // // // TRICKY BIT HERE !!! Notice that we do // this '\' step-over test _BEFORE_ we do // any other character testing, so the '\' // gets to do it's escape magic before // anything else can operate - and it // _preempts_ any other character's // actions. // i++; }; }; break; case '{': case '}': { // put an unquoted '{' or '}' onto it's own line. // do we need to put in a prefix new line? // if (internal_trace) // // Are we inside a nesting? if (paren_nest == 0 && angle_nest == 0 && box_nest == 0 && slash_nest == 0) { if ( !seennewline ) { if ((csl->nchars+1) > sizeof(char)*max_pgmsize) untrappableerror ( "Program buffer overflow when" "post-inserting newline on:", &csl->filetext[i]); if (internal_trace) fprintf (stderr, " preinserting a newline.\n"); memmove ( &(csl->filetext[i+1]), &(csl->filetext[i]), strlen (&csl->filetext[i])+1); csl->filetext[i] = '\n'; csl->nchars++; nchars++; i++; }; seennewline = 0; // and mark that we need a newline before any more // printable characters come through. neednewline = 1; } }; break; case ';': { // we can replace non-escaped semicolons with // newlines. if (paren_nest == 0 && angle_nest == 0 && box_nest == 0 && slash_nest == 0) { if ( seennewline ) // we just saw a newline { // was preceded by a newline so just get rid // of the ; if (internal_trace) fprintf (stderr, "superfluous semicolon, *poof*.\n"); memmove ( &(csl->filetext[i]), &(csl->filetext[i+1]), strlen (&csl->filetext[i])+1); csl->nchars--; nchars--; i--; neednewline = 0; seennewline = 1; } else { // this was not preceded by a newline, // so we just replace the semicolon with a // newline before any printed characters if (internal_trace) fprintf (stderr, " statement break semi.\n" "--> \\n \n"); csl->filetext[i] = '\n'; neednewline = 0; seennewline = 1; }; }; }; break; case '#': { // now, we're in a comment - everything should be // done only with the comment thing enabled. if (paren_nest == 0 && angle_nest == 0 && box_nest == 0 && slash_nest == 0) { in_comment = 1; }; }; break; case '(': { // Update nesting if necessary if (paren_nest == 0 && angle_nest == 0 && box_nest == 0 && slash_nest == 0) { paren_nest = 1; }; }; break; case ')': { // Update nesting if necessary if (paren_nest == 1 && angle_nest == 0 && box_nest == 0 && slash_nest == 0) { paren_nest = 0; }; }; break; case '<': { // Update nesting if necessary if (paren_nest == 0 && angle_nest == 0 && box_nest == 0 && slash_nest == 0) { angle_nest = 1; }; }; break; case '>': { // Update nesting if necessary if (paren_nest == 0 && angle_nest == 1 && box_nest == 0 && slash_nest == 0) { angle_nest = 0; }; }; break; case '[': { // Update nesting if necessary if (paren_nest == 0 && angle_nest == 0 && box_nest == 0 && slash_nest == 0) { box_nest = 1; }; }; break; case ']': { // Update nesting if necessary if (paren_nest == 0 && angle_nest == 0 && box_nest == 1 && slash_nest == 0) { box_nest = 0; }; }; break; case '/': { // Update nesting if necessary if (paren_nest == 0 && angle_nest == 0 && box_nest == 0 ) { if (slash_nest == 0) { slash_nest = 1; } else { slash_nest = 0; }; }; }; break; default: { // none of the above - it's a normal printing // character - we can just do the // clearing of all the "seen/need" flags seennewline = 0; neednewline = 0; }; break; }; }; }; }; };