diff options
author | Warner Losh <imp@FreeBSD.org> | 2023-11-26 15:12:04 +0000 |
---|---|---|
committer | Warner Losh <imp@FreeBSD.org> | 2023-11-26 15:12:04 +0000 |
commit | 18df98168fa7af30532be5c8988b80f2987c9b84 (patch) | |
tree | 7fa027d6f9d99ae0820175adb08771ef11585b5d | |
parent | 2e406c584fe40d654e4b8042006d2206eed016b3 (diff) |
ota: Import ota from 20241124 (930d75157063)vendor/one-true-awk/930d75157063
Minor bug fixes to man page, tests, fnematch (fixing erroneous output),
string escape sequences and improve flow control by optimizing gototab.
Nov 24, 2023:
Fix issue #199: gototab improvements to dynamically resize the
table, qsort and bsearch to improve the lookup speed as the
table gets larger for multibyte input. thanks to Arnold Robbins.
Nov 23, 2023:
Fix Issue #169, related to escape sequences in strings.
Thanks to Github user rajeevvp.
Fix Issue #147, reported by Github user drawkula, and fixed
by Miguel Pineiro Jr.
Nov 20, 2023:
rewrite of fnematch to fix a number of issues, including
extraneous output, out-of-bounds access, number of bytes
to push back after a failed match etc.
thanks to Miguel Pineiro Jr.
Nov 15, 2023:
Man page edit, regression test fixes. thanks to Arnold Robbins
consolidation of sub and gsub into dosub, removing duplicate
code. thanks to Miguel Pineiro Jr.
gcc replaced with cc everywhere.
Sponsored by: Netflix
-rw-r--r-- | FIXES | 24 | ||||
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | awk.1 | 3 | ||||
-rw-r--r-- | awk.h | 11 | ||||
-rw-r--r-- | b.c | 279 | ||||
-rwxr-xr-x | bugs-fixed/REGRESS | 2 | ||||
-rw-r--r-- | lex.c | 14 | ||||
-rw-r--r-- | main.c | 2 | ||||
-rw-r--r-- | makefile | 8 | ||||
-rw-r--r-- | maketab.c | 4 | ||||
-rw-r--r-- | proto.h | 3 | ||||
-rw-r--r-- | run.c | 273 | ||||
-rwxr-xr-x | testdir/Compare.tt | 2 | ||||
-rwxr-xr-x | testdir/REGRESS | 2 | ||||
-rwxr-xr-x | testdir/T.csv | 1 | ||||
-rwxr-xr-x | testdir/T.flags | 5 | ||||
-rwxr-xr-x | testdir/T.misc | 14 |
17 files changed, 354 insertions, 297 deletions
@@ -25,6 +25,29 @@ THIS SOFTWARE. This file lists all bug fixes, changes, etc., made since the second edition of the AWK book was published in September 2023. +Nov 24, 2023: + Fix issue #199: gototab improvements to dynamically resize the + table, qsort and bsearch to improve the lookup speed as the + table gets larger for multibyte input. thanks to Arnold Robbins. + +Nov 23, 2023: + Fix Issue #169, related to escape sequences in strings. + Thanks to Github user rajeevvp. + Fix Issue #147, reported by Github user drawkula, and fixed + by Miguel Pineiro Jr. + +Nov 20, 2023: + rewrite of fnematch to fix a number of issues, including + extraneous output, out-of-bounds access, number of bytes + to push back after a failed match etc. + thanks to Miguel Pineiro Jr. + +Nov 15, 2023: + Man page edit, regression test fixes. thanks to Arnold Robbins + consolidation of sub and gsub into dosub, removing duplicate + code. thanks to Miguel Pineiro Jr. + gcc replaced with cc everywhere. + Oct 30, 2023: multiple fixes and a minor code cleanup. disabled utf-8 for non-multibyte locales, such as C or POSIX. @@ -32,7 +55,6 @@ Oct 30, 2023: systems. also fixed an out-of-bounds read for empty CCL. fixed a buffer overflow in substr with utf-8 strings. many thanks to Todd C Miller. - Sep 24, 2023: fnematch and getrune have been overhauled to solve issues around diff --git a/README.md b/README.md index daace23e166e..84fb06e48833 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,6 @@ Aribtrary characters may be included with `\u` followed by 1 to 8 hexadecimal di ### Regular expressions ### Regular expressions may include UTF-8 code points, including `\u`. -Character classes are likely to be limited to about 256 characters -when expanded. ### CSV ### @@ -145,4 +143,4 @@ is not at the top of our priority list. #### Last Updated -Sun 15 Oct 2023 06:28:36 IDT +Mon 16 Oct 2023 11:23:08 IDT @@ -638,6 +638,9 @@ the syntax is worse. .PP Input is expected to be UTF-8 encoded. Other multibyte character sets are not handled. +However, in eight-bit locales, +.I awk +treats each input byte as a separate character. .SH UNUSUAL FLOATING-POINT VALUES .I Awk was designed before IEEE 754 arithmetic defined Not-A-Number (NaN) @@ -254,14 +254,19 @@ typedef struct rrow { int *lfollow; } rrow; -typedef struct gtt { /* gototab entry */ +typedef struct gtte { /* gototab entry */ unsigned int ch; unsigned int state; +} gtte; + +typedef struct gtt { /* gototab */ + size_t allocated; + size_t inuse; + gtte *entries; } gtt; typedef struct fa { - gtt **gototab; - int gototab_len; + gtt *gototab; uschar *out; uschar *restr; int **posns; @@ -96,9 +96,8 @@ extern int u8_nextlen(const char *s); mechanism of the goto table used 8-bit byte indices into the gototab entries to compute the next state. Unicode is a lot bigger, so the gototab entries are now structs with a character - and a next state, and there is a linear search of the characters - to find the state. (Yes, this is slower, by a significant - amount. Tough.) + and a next state. These are sorted by code point and binary + searched. Throughout the RE mechanism in b.c, utf-8 characters are converted to their utf-32 value. This mostly shows up in @@ -113,8 +112,10 @@ extern int u8_nextlen(const char *s); */ +static int entry_cmp(const void *l, const void *r); static int get_gototab(fa*, int, int); static int set_gototab(fa*, int, int, int); +static void clear_gototab(fa*, int); extern int u8_rune(int *, const uschar *); static int * @@ -142,7 +143,7 @@ resizesetvec(const char *f) static void resize_state(fa *f, int state) { - gtt **p; + gtt *p; uschar *p2; int **p3; int i, new_count; @@ -152,7 +153,7 @@ resize_state(fa *f, int state) new_count = state + 10; /* needs to be tuned */ - p = (gtt **) realloc(f->gototab, new_count * sizeof(f->gototab[0])); + p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt)); if (p == NULL) goto out; f->gototab = p; @@ -168,13 +169,14 @@ resize_state(fa *f, int state) f->posns = p3; for (i = f->state_count; i < new_count; ++i) { - f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab)); - if (f->gototab[i] == NULL) + f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte)); + if (f->gototab[i].entries == NULL) goto out; - f->out[i] = 0; + f->gototab[i].allocated = NCHARS; + f->gototab[i].inuse = 0; + f->out[i] = 0; f->posns[i] = NULL; } - f->gototab_len = NCHARS; /* should be variable, growable */ f->state_count = new_count; return; out: @@ -268,8 +270,7 @@ int makeinit(fa *f, bool anchor) } if ((f->posns[2])[1] == f->accept) f->out[2] = 1; - for (i = 0; i < NCHARS; i++) - set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */ + clear_gototab(f, 2); f->curstat = cgoto(f, 2, HAT); if (anchor) { *f->posns[2] = k-1; /* leave out position 0 */ @@ -595,32 +596,104 @@ int member(int c, int *sarg) /* is c in s? */ return(0); } +static void resize_gototab(fa *f, int state) +{ + size_t new_size = f->gototab[state].allocated * 2; + gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte)); + if (p == NULL) + overflo(__func__); + + // need to initialized the new memory to zero + size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size + memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out + + f->gototab[state].allocated = new_size; // update gotottab info + f->gototab[state].entries = p; +} + static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */ { - int i; - for (i = 0; i < f->gototab_len; i++) { - if (f->gototab[state][i].ch == 0) - break; - if (f->gototab[state][i].ch == ch) - return f->gototab[state][i].state; - } - return 0; + gtte key; + gtte *item; + + key.ch = ch; + key.state = 0; /* irrelevant */ + item = bsearch(& key, f->gototab[state].entries, + f->gototab[state].inuse, sizeof(gtte), + entry_cmp); + + if (item == NULL) + return 0; + else + return item->state; +} + +static int entry_cmp(const void *l, const void *r) +{ + const gtte *left, *right; + + left = (const gtte *) l; + right = (const gtte *) r; + + return left->ch - right->ch; } static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */ { - int i; - for (i = 0; i < f->gototab_len; i++) { - if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) { - f->gototab[state][i].ch = ch; - f->gototab[state][i].state = val; - return val; + if (f->gototab[state].inuse == 0) { + f->gototab[state].entries[0].ch = ch; + f->gototab[state].entries[0].state = val; + f->gototab[state].inuse++; + return val; + } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) { + // not seen yet, insert and return + gtt *tab = & f->gototab[state]; + if (tab->inuse + 1 >= tab->allocated) + resize_gototab(f, state); + + f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch; + f->gototab[state].entries[f->gototab[state].inuse-1].state = val; + f->gototab[state].inuse++; + return val; + } else { + // maybe we have it, maybe we don't + gtte key; + gtte *item; + + key.ch = ch; + key.state = 0; /* irrelevant */ + item = bsearch(& key, f->gototab[state].entries, + f->gototab[state].inuse, sizeof(gtte), + entry_cmp); + + if (item != NULL) { + // we have it, update state and return + item->state = val; + return item->state; } + // otherwise, fall through to insert and reallocate. } - overflo(__func__); + + gtt *tab = & f->gototab[state]; + if (tab->inuse + 1 >= tab->allocated) + resize_gototab(f, state); + ++tab->inuse; + f->gototab[state].entries[tab->inuse].ch = ch; + f->gototab[state].entries[tab->inuse].state = val; + + qsort(f->gototab[state].entries, + f->gototab[state].inuse, sizeof(gtte), entry_cmp); + return val; /* not used anywhere at the moment */ } +static void clear_gototab(fa *f, int state) +{ + memset(f->gototab[state].entries, 0, + f->gototab[state].allocated * sizeof(gtte)); + f->gototab[state].inuse = 0; +} + int match(fa *f, const char *p0) /* shortest match ? */ { int s, ns; @@ -759,59 +832,6 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */ #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long -// Read one rune at a time from the given FILE*. Return both -// the bytes and the actual rune. - -struct runedata { - int rune; - size_t len; - char bytes[6]; -}; - -struct runedata getrune(FILE *fp) -{ - struct runedata result; - int c, next; - - memset(&result, 0, sizeof(result)); - - c = getc(fp); - if (c == EOF) - return result; // result.rune == 0 --> EOF - else if (c < 128 || awk_mb_cur_max == 1) { - result.bytes[0] = c; - result.len = 1; - result.rune = c; - - return result; - } - - // need to get bytes and fill things in - result.bytes[0] = c; - result.len = 1; - - next = 1; - for (int i = 1; i < MAX_UTF_BYTES; i++) { - c = getc(fp); - if (c == EOF) - break; - result.bytes[next++] = c; - result.len++; - } - - // put back any extra input bytes - int actual_len = u8_nextlen(result.bytes); - while (result.len > actual_len) { - ungetc(result.bytes[--result.len], fp); - } - - result.bytes[result.len] = '\0'; - (void) u8_rune(& result.rune, (uschar *) result.bytes); - - return result; -} - - /* * NAME * fnematch @@ -829,58 +849,76 @@ struct runedata getrune(FILE *fp) bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) { - char *buf = *pbuf; + char *i, *j, *k, *buf = *pbuf; int bufsize = *pbufsize; - int i, j, k, ns, s; - struct runedata r; + int c, n, ns, s; s = pfa->initstat; patlen = 0; /* - * All indices relative to buf. - * i <= j <= k <= bufsize + * buf <= i <= j <= k <= buf+bufsize * - * i: origin of active substring (first byte of first character) - * j: current character (last byte of current character) - * k: destination of next getc() + * i: origin of active substring + * j: current character + * k: destination of the next getc */ - i = -1, k = 0; - do { - j = i++; - do { - r = getrune(f); - if ((++j + r.len) >= k) { - if (k >= bufsize) - if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) - FATAL("stream '%.30s...' too long", buf); - } - memcpy(buf + k, r.bytes, r.len); - j += r.len - 1; // incremented next time around the loop - k += r.len; - if ((ns = get_gototab(pfa, s, r.rune)) != 0) - s = ns; - else - s = cgoto(pfa, s, r.rune); + i = j = k = buf; - if (pfa->out[s]) { /* final state */ - patlen = j - i + 1; - if (r.rune == 0) /* don't count $ */ - patlen--; + do { + /* + * Call u8_rune with at least MAX_UTF_BYTES ahead in + * the buffer until EOF interferes. + */ + if (k - j < MAX_UTF_BYTES) { + if (k + MAX_UTF_BYTES > buf + bufsize) { + adjbuf((char **) &buf, &bufsize, + bufsize + MAX_UTF_BYTES, + quantum, 0, "fnematch"); } - } while (buf[j] && s != 1); + for (n = MAX_UTF_BYTES ; n > 0; n--) { + *k++ = (c = getc(f)) != EOF ? c : 0; + if (c == EOF) { + if (ferror(f)) + FATAL("fnematch: getc error"); + break; + } + } + } + + j += u8_rune(&c, (uschar *)j); + + if ((ns = get_gototab(pfa, s, c)) != 0) + s = ns; + else + s = cgoto(pfa, s, c); + + if (pfa->out[s]) { /* final state */ + patbeg = i; + patlen = j - i; + if (c == 0) /* don't count $ */ + patlen--; + } + + if (c && s != 1) + continue; /* origin i still viable, next j */ + if (patlen) + break; /* best match found */ + + /* no match at origin i, next i and start over */ + i += u8_rune(&c, (uschar *)i); + if (c == 0) + break; /* no match */ + j = i; s = 2; - if (r.len > 1) - i += r.len - 1; // i incremented around the loop - } while (buf[i] && !patlen); + } while (1); /* adjbuf() may have relocated a resized buffer. Inform the world. */ *pbuf = buf; *pbufsize = bufsize; if (patlen) { - patbeg = (char *) buf + i; /* * Under no circumstances is the last character fed to * the automaton part of the match. It is EOF's nullbyte, @@ -893,11 +931,10 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) * terminate the buffer. */ do - for (int ii = r.len; ii > 0; ii--) - if (buf[--k] && ungetc(buf[k], f) == EOF) - FATAL("unable to ungetc '%c'", buf[k]); - while (k > i + patlen); - buf[k] = '\0'; + if (*--k && ungetc(*k, f) == EOF) + FATAL("unable to ungetc '%c'", *k); + while (k > patbeg + patlen); + *k = '\0'; return true; } else @@ -1486,8 +1523,7 @@ int cgoto(fa *f, int s, int c) /* add tmpset to current set of states */ ++(f->curstat); resize_state(f, f->curstat); - for (i = 0; i < NCHARS; i++) - set_gototab(f, f->curstat, 0, 0); + clear_gototab(f, f->curstat); xfree(f->posns[f->curstat]); p = intalloc(setcnt + 1, __func__); @@ -1511,7 +1547,8 @@ void freefa(fa *f) /* free a finite automaton */ if (f == NULL) return; for (i = 0; i < f->state_count; i++) - xfree(f->gototab[i]) + xfree(f->gototab[i].entries); + xfree(f->gototab); for (i = 0; i <= f->curstat; i++) xfree(f->posns[i]); for (i = 0; i <= f->accept; i++) { diff --git a/bugs-fixed/REGRESS b/bugs-fixed/REGRESS index 07160031ca07..98d578ac22ef 100755 --- a/bugs-fixed/REGRESS +++ b/bugs-fixed/REGRESS @@ -1,4 +1,4 @@ -#! /bin/bash +#! /bin/sh if [ ! -f ../a.out ] then @@ -430,8 +430,12 @@ int string(void) { int i; + if (!isxdigit(peek())) { + unput(c); + break; + } n = 0; - for (i = 1; i <= 2; i++) { + for (i = 0; i < 2; i++) { c = input(); if (c == 0) break; @@ -442,13 +446,13 @@ int string(void) n += (c - '0'); else n += 10 + (c - 'a'); - } else + } else { + unput(c); break; + } } - if (n) + if (i) *bp++ = n; - else - unput(c); break; } @@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ****************************************************************/ -const char *version = "version 20231030"; +const char *version = "version 20231124"; #define DEBUG #include <stdio.h> @@ -28,10 +28,10 @@ CFLAGS = CFLAGS = -O2 # compiler options -#CC = gcc -Wall -g -Wwrite-strings -#CC = gcc -O4 -Wall -pedantic -fno-strict-aliasing -#CC = gcc -fprofile-arcs -ftest-coverage # then gcov f1.c; cat f1.c.gcov -HOSTCC = gcc -g -Wall -pedantic -Wcast-qual +#CC = cc -Wall -g -Wwrite-strings +#CC = cc -O4 -Wall -pedantic -fno-strict-aliasing +#CC = cc -fprofile-arcs -ftest-coverage # then gcov f1.c; cat f1.c.gcov +HOSTCC = cc -g -Wall -pedantic -Wcast-qual CC = $(HOSTCC) # change this is cross-compiling. # By fiat, to make our lives easier, yacc is now defined to be bison. diff --git a/maketab.c b/maketab.c index d4b756ad6706..3a80c87725ac 100644 --- a/maketab.c +++ b/maketab.c @@ -52,8 +52,8 @@ struct xx { ARRAY, "array", NULL }, { INDIRECT, "indirect", "$(" }, { SUBSTR, "substr", "substr" }, - { SUB, "sub", "sub" }, - { GSUB, "gsub", "gsub" }, + { SUB, "dosub", "sub" }, + { GSUB, "dosub", "gsub" }, { INDEX, "sindex", "sindex" }, { SPRINTF, "awksprintf", "sprintf " }, { ADD, "arith", " + " }, @@ -198,8 +198,7 @@ extern FILE *openfile(int, const char *, bool *); extern const char *filename(FILE *); extern Cell *closefile(Node **, int); extern void closeall(void); -extern Cell *sub(Node **, int); -extern Cell *gsub(Node **, int); +extern Cell *dosub(Node **, int); extern Cell *gensub(Node **, int); extern FILE *popen(const char *, const char *); @@ -1540,8 +1540,9 @@ Cell *assign(Node **a, int n) /* a[0] = a[1], a[0] += a[1], etc. */ if (x == y && !(x->tval & (FLD|REC)) && x != nfloc) ; /* self-assignment: leave alone unless it's a field or NF */ else if ((y->tval & (STR|NUM)) == (STR|NUM)) { + yf = getfval(y); setsval(x, getsval(y)); - x->fval = getfval(y); + x->fval = yf; x->tval |= NUM; } else if (isstr(y)) @@ -2492,169 +2493,143 @@ static void flush_all(void) void backsub(char **pb_ptr, const char **sptr_ptr); -Cell *sub(Node **a, int nnn) /* substitute command */ +Cell *dosub(Node **a, int subop) /* sub and gsub */ { - const char *sptr, *q; - Cell *x, *y, *result; - char *t, *buf, *pb; fa *pfa; + int tempstat; + char *repl; + Cell *x; + + char *buf = NULL; + char *pb = NULL; int bufsz = recsize; - if ((buf = (char *) malloc(bufsz)) == NULL) - FATAL("out of memory in sub"); - x = execute(a[3]); /* target string */ - t = getsval(x); - if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */ - pfa = (fa *) a[1]; /* regular expression */ - else { - y = execute(a[1]); - pfa = makedfa(getsval(y), 1); - tempfree(y); + const char *r, *s; + const char *start; + const char *noempty = NULL; /* empty match disallowed here */ + size_t m = 0; /* match count */ + size_t whichm; /* which match to select, 0 = global */ + int mtype; /* match type */ + + if (a[0] == NULL) { /* 0 => a[1] is already-compiled regexpr */ + pfa = (fa *) a[1]; + } else { + x = execute(a[1]); + pfa = makedfa(getsval(x), 1); + tempfree(x); } - y = execute(a[2]); /* replacement string */ - result = False; - if (pmatch(pfa, t)) { - sptr = t; - adjbuf(&buf, &bufsz, 1+patbeg-sptr, recsize, 0, "sub"); - pb = buf; - while (sptr < patbeg) - *pb++ = *sptr++; - sptr = getsval(y); - while (*sptr != '\0') { - adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "sub"); - if (*sptr == '\\') { - backsub(&pb, &sptr); - } else if (*sptr == '&') { - sptr++; - adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "sub"); - for (q = patbeg; q < patbeg+patlen; ) - *pb++ = *q++; - } else - *pb++ = *sptr++; + + x = execute(a[2]); /* replacement string */ + repl = tostring(getsval(x)); + tempfree(x); + + switch (subop) { + case SUB: + whichm = 1; + x = execute(a[3]); /* source string */ + break; + case GSUB: + whichm = 0; + x = execute(a[3]); /* source string */ + break; + default: + FATAL("dosub: unrecognized subop: %d", subop); + } + + start = getsval(x); + while (pmatch(pfa, start)) { + if (buf == NULL) { + if ((pb = buf = malloc(bufsz)) == NULL) + FATAL("out of memory in dosub"); + tempstat = pfa->initstat; + pfa->initstat = 2; } - *pb = '\0'; - if (pb > buf + bufsz) - FATAL("sub result1 %.30s too big; can't happen", buf); - sptr = patbeg + patlen; - if ((patlen == 0 && *patbeg) || (patlen && *(sptr-1))) { - adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "sub"); - while ((*pb++ = *sptr++) != '\0') - continue; + + /* match types */ + #define MT_IGNORE 0 /* unselected or invalid */ + #define MT_INSERT 1 /* selected, empty */ + #define MT_REPLACE 2 /* selected, not empty */ + + /* an empty match just after replacement is invalid */ + + if (patbeg == noempty && patlen == 0) { + mtype = MT_IGNORE; /* invalid, not counted */ + } else if (whichm == ++m || whichm == 0) { + mtype = patlen ? MT_REPLACE : MT_INSERT; + } else { + mtype = MT_IGNORE; /* unselected, but counted */ } - if (pb > buf + bufsz) - FATAL("sub result2 %.30s too big; can't happen", buf); - setsval(x, buf); /* BUG: should be able to avoid copy */ - result = True; - } - tempfree(x); - tempfree(y); - free(buf); - return result; -} -Cell *gsub(Node **a, int nnn) /* global substitute */ -{ - Cell *x, *y; - char *rptr, *pb; - const char *q, *t, *sptr; - char *buf; - fa *pfa; - int mflag, tempstat, num; - int bufsz = recsize; - int charlen = 0; + /* leading text: */ + if (patbeg > start) { + adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start), + recsize, &pb, "dosub"); + s = start; + while (s < patbeg) + *pb++ = *s++; + } - if ((buf = (char *) malloc(bufsz)) == NULL) - FATAL("out of memory in gsub"); - mflag = 0; /* if mflag == 0, can replace empty string */ - num = 0; - x = execute(a[3]); /* target string */ - t = getsval(x); - if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */ - pfa = (fa *) a[1]; /* regular expression */ - else { - y = execute(a[1]); - pfa = makedfa(getsval(y), 1); - tempfree(y); - } - y = execute(a[2]); /* replacement string */ - if (pmatch(pfa, t)) { - tempstat = pfa->initstat; - pfa->initstat = 2; - pb = buf; - rptr = getsval(y); - do { - if (patlen == 0 && *patbeg != '\0') { /* matched empty string */ - if (mflag == 0) { /* can replace empty */ - num++; - sptr = rptr; - while (*sptr != '\0') { - adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub"); - if (*sptr == '\\') { - backsub(&pb, &sptr); - } else if (*sptr == '&') { - sptr++; - adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub"); - for (q = patbeg; q < patbeg+patlen; ) - *pb++ = *q++; - } else - *pb++ = *sptr++; - } - } - if (*t == '\0') /* at end */ - goto done; - adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub"); - charlen = u8_nextlen(t); - while (charlen-- > 0) - *pb++ = *t++; - if (pb > buf + bufsz) /* BUG: not sure of this test */ - FATAL("gsub result0 %.30s too big; can't happen", buf); - mflag = 0; - } - else { /* matched nonempty string */ - num++; - sptr = t; - adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gsub"); - while (sptr < patbeg) - *pb++ = *sptr++; - sptr = rptr; - while (*sptr != '\0') { - adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub"); - if (*sptr == '\\') { - backsub(&pb, &sptr); - } else if (*sptr == '&') { - sptr++; - adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub"); - for (q = patbeg; q < patbeg+patlen; ) - *pb++ = *q++; - } else - *pb++ = *sptr++; - } - t = patbeg + patlen; - if (patlen == 0 || *t == '\0' || *(t-1) == '\0') - goto done; - if (pb > buf + bufsz) - FATAL("gsub result1 %.30s too big; can't happen", buf); - mflag = 1; + if (mtype == MT_IGNORE) + goto matching_text; /* skip replacement text */ + + r = repl; + while (*r != 0) { + adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub"); + if (*r == '\\') { + backsub(&pb, &r); + } else if (*r == '&') { + r++; + adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, + &pb, "dosub"); + for (s = patbeg; s < patbeg+patlen; ) + *pb++ = *s++; + } else { + *pb++ = *r++; } - } while (pmatch(pfa,t)); - sptr = t; - adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gsub"); - while ((*pb++ = *sptr++) != '\0') - continue; - done: if (pb < buf + bufsz) - *pb = '\0'; - else if (*(pb-1) != '\0') - FATAL("gsub result2 %.30s truncated; can't happen", buf); - setsval(x, buf); /* BUG: should be able to avoid copy + free */ + } + +matching_text: + if (mtype == MT_REPLACE || *patbeg == '\0') + goto next_search; /* skip matching text */ + + if (patlen == 0) + patlen = u8_nextlen(patbeg); + adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub"); + s = patbeg; + while (s < patbeg + patlen) + *pb++ = *s++; + +next_search: + start = patbeg + patlen; + if (m == whichm || *patbeg == '\0') + break; + if (mtype == MT_REPLACE) + noempty = start; + + #undef MT_IGNORE + #undef MT_INSERT + #undef MT_REPLACE + } + + xfree(repl); + + if (buf != NULL) { pfa->initstat = tempstat; + + /* trailing text */ + adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub"); + while ((*pb++ = *start++) != '\0') + ; + + setsval(x, buf); + free(buf); } + tempfree(x); - tempfree(y); x = gettemp(); x->tval = NUM; - x->fval = num; - free(buf); - return(x); + x->fval = m; + return x; } Cell *gensub(Node **a, int nnn) /* global selective substitute */ diff --git a/testdir/Compare.tt b/testdir/Compare.tt index ca828d258658..4b297d731c94 100755 --- a/testdir/Compare.tt +++ b/testdir/Compare.tt @@ -4,7 +4,7 @@ oldawk=${oldawk-awk} awk=${awk-../a.out} echo compiling time.c -gcc time.c -o time +cc time.c -o time time=./time echo time command = $time diff --git a/testdir/REGRESS b/testdir/REGRESS index 5c3667f5eede..b54ce3f68ea0 100755 --- a/testdir/REGRESS +++ b/testdir/REGRESS @@ -1,7 +1,7 @@ #!/bin/sh uname -a -gcc echo.c -o echo && echo echo compiled +cc echo.c -o echo && echo echo compiled oldawk=${oldawk-awk} awk=${awk-../a.out} diff --git a/testdir/T.csv b/testdir/T.csv index 10da1ea90b8b..79c15104cfb3 100755 --- a/testdir/T.csv +++ b/testdir/T.csv @@ -77,5 +77,4 @@ a''b [a''b] a, [a][] "", [][] , [][] -a"b [a"b] !!!! diff --git a/testdir/T.flags b/testdir/T.flags index 33d7c8db7fe6..17ce56133745 100755 --- a/testdir/T.flags +++ b/testdir/T.flags @@ -20,5 +20,6 @@ grep 'unknown option' foo >/dev/null || echo 'T.flags: bad unknown option' $awk -F >foo 2>&1 grep 'no field separator' foo >/dev/null || echo 'T.flags: bad missing field separator' -$awk -F '' >foo 2>&1 -grep 'field separator FS is empty' foo >/dev/null || echo 'T.flags: bad empty field separator' +### Awk is now like gawk and splits into separate characters if FS = "" +# $awk -F '' >foo 2>&1 +# grep 'field separator FS is empty' foo >/dev/null || echo 'T.flags: bad empty field separator' diff --git a/testdir/T.misc b/testdir/T.misc index 1e5c3c553d9e..b8ed3c1c45f9 100755 --- a/testdir/T.misc +++ b/testdir/T.misc @@ -510,3 +510,17 @@ cmp -s foo1 foo2 || echo 'BAD: T.misc exit status on I/O error' echo 1b >foo1 echo ab | $awk '{ sub(/a/, "b" ~ /b/); print }' >foo2 cmp -s foo1 foo2 || echo 'BAD: T.misc lexer regex buffer clobbered' + +# Check handling of octal \OOO and hex \xHH esc. seqs. in strings. +echo 'hello888 +hello +hello +helloxGOO +hello +0A' > foo1 +$awk 'BEGIN { print "hello\888" }' > foo2 +$awk 'BEGIN { print "hello\x000A" }' >> foo2 +$awk 'BEGIN { printf "hello\x0A" }' >> foo2 +$awk 'BEGIN { print "hello\xGOO" }' >> foo2 +$awk 'BEGIN { print "hello\x0A0A" }' >> foo2 +cmp -s foo1 foo2 || echo '�BAD: T.misc escape sequences in strings mishandled' |