From 1309450668aa571dee97f4373f9555b4fddcf1aa Mon Sep 17 00:00:00 2001 From: Fazlul Shahriar Date: Tue, 29 Oct 2019 10:04:06 -0400 Subject: awk: split record into runes for empty FS (#292) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit awk was splitting records into bytes instead of runes for empty FS. For example, this was printing only the first byte of the utf-8 encoding of é: echo é | awk 'BEGIN{FS=""}{print $1}' The change just copies how the `split` function handles runes. Originally reported by kris on twitter: https://twitter.com/p9luv/status/1180436083433201665 --- src/cmd/awk/lib.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'src/cmd/awk') diff --git a/src/cmd/awk/lib.c b/src/cmd/awk/lib.c index 6a6849c5..3eb30687 100644 --- a/src/cmd/awk/lib.c +++ b/src/cmd/awk/lib.c @@ -29,6 +29,7 @@ THIS SOFTWARE. #include #include #include +#include #include "awk.h" #include "y.tab.h" @@ -293,15 +294,19 @@ void fldbld(void) /* create fields from current record */ } *fr = 0; } else if ((sep = *inputFS) == 0) { /* new: FS="" => 1 char/field */ - for (i = 0; *r != 0; r++) { - char buf[2]; + int nb; + for (i = 0; *r != 0; r += nb) { + Rune rr; + char buf[UTFmax+1]; + i++; if (i > nfields) growfldtab(i); if (freeable(fldtab[i])) xfree(fldtab[i]->sval); - buf[0] = *r; - buf[1] = 0; + nb = chartorune(&rr, r); + memmove(buf, r, nb); + buf[nb] = '\0'; fldtab[i]->sval = tostring(buf); fldtab[i]->tval = FLD | STR; } -- cgit v1.2.3 From fa325e9b42b0bdfb48857d1958d9fb7ceac55151 Mon Sep 17 00:00:00 2001 From: Dan Cross Date: Fri, 10 Jan 2020 14:44:21 +0000 Subject: Trivial changes: whitespace and modes. Remove whitespace at the ends of lines. Remove blank lines from the ends of files. Change modes on source files so that they are not executable. 
Signed-off-by: Dan Cross --- src/cmd/awk/awk.h | 3 +-- src/cmd/awk/lex.c | 27 +++++++++++++-------------- src/cmd/awk/lib.c | 3 +-- src/cmd/awk/main.c | 1 - src/cmd/awk/maketab.c | 1 - src/cmd/awk/parse.c | 1 - src/cmd/awk/proto.h | 1 - src/cmd/awk/re.c | 3 +-- src/cmd/awk/run.c | 3 +-- src/cmd/awk/tran.c | 9 ++++----- 10 files changed, 21 insertions(+), 31 deletions(-) (limited to 'src/cmd/awk') diff --git a/src/cmd/awk/awk.h b/src/cmd/awk/awk.h index 1853381d..913f4509 100644 --- a/src/cmd/awk/awk.h +++ b/src/cmd/awk/awk.h @@ -134,7 +134,7 @@ extern Node *nullnode; #define CCOPY 6 #define CCON 5 #define CTEMP 4 -#define CNAME 3 +#define CNAME 3 #define CVAR 2 #define CFLD 1 #define CUNK 0 @@ -182,4 +182,3 @@ extern int pairstack[], paircnt; #define freeable(p) ( ((p)->tval & (STR|DONTFREE)) == STR ) #include "proto.h" - diff --git a/src/cmd/awk/lex.c b/src/cmd/awk/lex.c index 74a99030..0a051ede 100644 --- a/src/cmd/awk/lex.c +++ b/src/cmd/awk/lex.c @@ -140,7 +140,7 @@ int gettok(char **pbuf, int *psz) /* get next input token */ if (bp-buf >= sz) if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) FATAL( "out of space for number %.10s...", buf ); - if (isdigit(c) || c == 'e' || c == 'E' + if (isdigit(c) || c == 'e' || c == 'E' || c == '.' || c == '+' || c == '-') *bp++ = c; else { @@ -191,7 +191,7 @@ int yylex(void) /* should this also have STR set? 
*/ RET(NUMBER); } - + yylval.i = c; switch (c) { case '\n': /* {EOL} */ @@ -220,7 +220,7 @@ int yylex(void) case '&': if (peek() == '&') { input(); RET(AND); - } else + } else RET('&'); case '|': if (peek() == '|') { @@ -295,7 +295,7 @@ int yylex(void) input(); yylval.i = POWEQ; RET(ASGNOP); } else RET(POWER); - + case '$': /* BUG: awkward, if not wrong */ c = gettok(&buf, &bufsize); @@ -313,7 +313,7 @@ int yylex(void) unputstr(buf); RET(INDIRECT); } - + case '}': if (--bracecnt < 0) SYNTAX( "extra }" ); @@ -336,10 +336,10 @@ int yylex(void) case '(': parencnt++; RET('('); - + case '"': return string(); /* BUG: should be like tran.c ? */ - + default: RET(c); } @@ -369,7 +369,7 @@ int string(void) c = input(); switch (c) { case '"': *bp++ = '"'; break; - case 'n': *bp++ = '\n'; break; + case 'n': *bp++ = '\n'; break; case 't': *bp++ = '\t'; break; case 'f': *bp++ = '\f'; break; case 'r': *bp++ = '\r'; break; @@ -406,7 +406,7 @@ int string(void) break; } - default: + default: *bp++ = c; break; } @@ -416,7 +416,7 @@ int string(void) break; } } - *bp = 0; + *bp = 0; s = tostring(buf); *bp++ = ' '; *bp++ = 0; yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); @@ -442,7 +442,7 @@ int binsearch(char *w, Keyword *kp, int n) return -1; } -int word(char *w) +int word(char *w) { Keyword *kp; int c, n; @@ -504,11 +504,11 @@ int regexpr(void) if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0)) FATAL("out of space for reg expr %.10s...", buf); if (c == '\n') { - SYNTAX( "newline in regular expression %.10s...", buf ); + SYNTAX( "newline in regular expression %.10s...", buf ); unput('\n'); break; } else if (c == '\\') { - *bp++ = '\\'; + *bp++ = '\\'; *bp++ = input(); } else { *bp++ = c; @@ -567,4 +567,3 @@ void unputstr(char *s) /* put a string back on input */ for (i = strlen(s)-1; i >= 0; i--) unput(s[i]); } - diff --git a/src/cmd/awk/lib.c b/src/cmd/awk/lib.c index 3eb30687..30c3ddd4 100644 --- a/src/cmd/awk/lib.c +++ b/src/cmd/awk/lib.c @@ -435,7 +435,7 @@ int 
refldbld(char *rec, char *fs) /* build fields from reg expr in FS */ break; } } - return i; + return i; } void recbld(void) /* create $0 from $1..$NF if necessary */ @@ -715,4 +715,3 @@ int is_number(char *s) else return 0; } - diff --git a/src/cmd/awk/main.c b/src/cmd/awk/main.c index ea20f63e..532cc1f6 100644 --- a/src/cmd/awk/main.c +++ b/src/cmd/awk/main.c @@ -195,4 +195,3 @@ char *cursource(void) /* current source file name */ else return NULL; } - diff --git a/src/cmd/awk/maketab.c b/src/cmd/awk/maketab.c index 50908ce9..13d34e96 100644 --- a/src/cmd/awk/maketab.c +++ b/src/cmd/awk/maketab.c @@ -166,4 +166,3 @@ int main(int argc, char *argv[]) printf("}\n"); return 0; } - diff --git a/src/cmd/awk/parse.c b/src/cmd/awk/parse.c index d4c88324..2ee6eca6 100644 --- a/src/cmd/awk/parse.c +++ b/src/cmd/awk/parse.c @@ -269,4 +269,3 @@ Node *itonp(int i) /* and vice versa */ { return (Node *) (long) i; } - diff --git a/src/cmd/awk/proto.h b/src/cmd/awk/proto.h index f124f4e4..d12a9322 100644 --- a/src/cmd/awk/proto.h +++ b/src/cmd/awk/proto.h @@ -177,4 +177,3 @@ extern Cell *gsub(Node **, int); extern FILE *popen(const char *, const char *); extern int pclose(FILE *); - diff --git a/src/cmd/awk/re.c b/src/cmd/awk/re.c index a15d2f4d..2a226768 100644 --- a/src/cmd/awk/re.c +++ b/src/cmd/awk/re.c @@ -215,7 +215,7 @@ nematch(void *p, char *s, char *start) if (pmatch(p, s, start) == 1 && patlen > 0) return 1; patlen = -1; - patbeg = start; + patbeg = start; return 0; } /* in the parsing of regular expressions, metacharacters like . 
have */ @@ -322,4 +322,3 @@ overflow(void) { FATAL("%s", "regular expression too big"); } - diff --git a/src/cmd/awk/run.c b/src/cmd/awk/run.c index b145758c..9aa3e0e4 100644 --- a/src/cmd/awk/run.c +++ b/src/cmd/awk/run.c @@ -506,7 +506,7 @@ Cell *awkdelete(Node **a, int n) /* a[0] is symtab, a[1] is list of subscripts * s = getsval(y); if (!adjbuf(&buf, &bufsz, strlen(buf)+strlen(s)+nsub+1, recsize, 0, 0)) FATAL("out of memory deleting %s[%s...]", x->nval, buf); - strcat(buf, s); + strcat(buf, s); if (np->nnext) strcat(buf, *SUBSEP); tempfree(y); @@ -1914,4 +1914,3 @@ void backsub(char **pb_ptr, char **sptr_ptr) /* handle \\& variations */ *pb_ptr = pb; *sptr_ptr = sptr; } - diff --git a/src/cmd/awk/tran.c b/src/cmd/awk/tran.c index 272a7fdc..387bc00b 100644 --- a/src/cmd/awk/tran.c +++ b/src/cmd/awk/tran.c @@ -167,7 +167,7 @@ void freesymtab(Cell *ap) /* free a symbol table */ if (freeable(cp)) xfree(cp->sval); temp = cp->cnext; /* avoids freeing then using */ - free(cp); + free(cp); } tp->tab[i] = 0; } @@ -180,7 +180,7 @@ void freeelem(Cell *ap, char *s) /* free elem s from ap (i.e., ap["s"] */ Array *tp; Cell *p, *prev = NULL; int h; - + tp = (Array *) ap->sval; h = hash(s, tp->size); for (p = tp->tab[h]; p != NULL; prev = p, p = p->cnext) @@ -275,7 +275,7 @@ Awkfloat setfval(Cell *vp, Awkfloat f) /* set float val of a Cell */ { int fldno; - if ((vp->tval & (NUM | STR)) == 0) + if ((vp->tval & (NUM | STR)) == 0) funnyvar(vp, "assign to"); if (isfld(vp)) { donerec = 0; /* mark $0 invalid */ @@ -405,7 +405,7 @@ char *qstring(char *s, int delim) /* collect string up to next delim */ if (c == 0) { /* \ at end */ *bp++ = '\\'; break; /* for loop */ - } + } switch (c) { case '\\': *bp++ = '\\'; break; case 'n': *bp++ = '\n'; break; @@ -432,4 +432,3 @@ char *qstring(char *s, int delim) /* collect string up to next delim */ *bp++ = 0; return buf; } - -- cgit v1.2.3