I'll leave my progress on the Unicode identifiers patch here (it is also available as a pull request on GitHub, if you prefer: https://github.com/dualbus/bash/pull/2/files).
I won't have much time to work on this for a few weeks, so it's up to you all to complete it :-) The places that still need work are marked with `/* XXX: Update */' comments (mainly adding more `#ifdef HANDLE_MULTIBYTE' blocks and passing a wide character instead of a single-byte character to `legal_variable_starter' and `legal_variable_char'). diff --git a/expr.c b/expr.c index fee7a4aa..c7b9dc7e 100644 --- a/expr.c +++ b/expr.c @@ -1286,6 +1286,7 @@ readtok () } lasttp = tp = cp - 1; + /* XXX: Update */ if (legal_variable_starter (c)) { /* variable names not preceded with a dollar sign are shell variables. */ @@ -1293,6 +1294,7 @@ readtok () EXPR_CONTEXT ec; int peektok; + /* XXX: Update */ while (legal_variable_char (c)) c = *cp++; @@ -1417,6 +1419,7 @@ readtok () xp = cp; while (xp && *xp && cr_whitespace (*xp)) xp++; + /* XXX: Update */ if (legal_variable_starter ((unsigned char)*xp)) c = (c == '-') ? PREDEC : PREINC; else diff --git a/general.c b/general.c index 584e7859..bf9d683b 100644 --- a/general.c +++ b/general.c @@ -214,18 +214,51 @@ int legal_identifier (name) const char *name; { - register const char *s; - unsigned char c; +#ifdef HANDLE_MULTIBYTE + wchar_t *p, *s; + wchar_t c; + size_t n; +#else + char *p, *s; + char c; +#endif + + if (!name || *name == '\0') + return (0); + +#ifdef HANDLE_MULTIBYTE + n = mbstowcs (NULL, name, 0); + if ((size_t) -1 == n) + return (0); - if (!name || !(c = *name) || (legal_variable_starter (c) == 0)) + s = xmalloc (sizeof(wchar_t) * (n+1)); + if (!s) return (0); - for (s = name + 1; (c = *s) != 0; s++) + n = mbstowcs(s, name, n+1); + if ((size_t) -1 == n) + goto illegal_name; +#else + s = name; +#endif + + if (legal_variable_starter (*s) == 0) + goto illegal_name; + + for (p = s + 1; (c = *p) != 0; p++) { if (legal_variable_char (c) == 0) - return (0); + goto illegal_name; } + return (1); + + illegal_name: +#ifdef HANDLE_MULTIBYTE + if (s) + free (s); +#endif + return (0); } /* Return 1 if NAME is a valid value that can be assigned to a nameref @@ -349,6 +382,11 @@ legal_alias_name 
(string, flags) return 1; } +#ifdef HANDLE_MULTIBYTE +#define WC_OR_C(c) (L##c) +#else +#define WC_OR_C(c) (c) +#endif /* Returns non-zero if STRING is an assignment statement. The returned value is the index of the `=' sign. If FLAGS&1 we are expecting a compound assignment and don't want an array subscript before the `='. */ @@ -357,27 +395,45 @@ assignment (string, flags) const char *string; int flags; { - register unsigned char c; register int newi, indx; +#ifdef HANDLE_MULTIBYTE + wchar_t c; + wchar_t *ws; + int nb; + size_t n; +#else + char c; + char *ws; +#endif + +#ifdef HANDLE_MULTIBYTE + n = strlen(string); + nb = mbtowc (&c, &string[indx = 0], n); + if ((size_t) nb == -1) + return (0); + + indx += nb; n -= nb; +#else + indx++; +#endif - c = string[indx = 0]; #if defined (ARRAY_VARS) - if ((legal_variable_starter (c) == 0) && ((flags&1) == 0 || c != '[')) /* ] */ + if ((legal_variable_starter (c) == 0) && ((flags&1) == 0 || c != WC_OR_C ('['))) /* ] */ #else if (legal_variable_starter (c) == 0) #endif return (0); - while (c = string[indx]) + while ((nb=mbtowc (&c, &string[indx], n)) > 0) { /* The following is safe. Note that '=' at the start of a word is not an assignment statement. */ - if (c == '=') + if (c == WC_OR_C ('=')) return (indx); #if defined (ARRAY_VARS) - if (c == '[') + if (c == WC_OR_C ('[')) { newi = skipsubscript (string, indx, (flags & 2) ? 
1 : 0); if (string[newi++] != ']') @@ -389,7 +445,7 @@ assignment (string, flags) #endif /* ARRAY_VARS */ /* Check for `+=' */ - if (c == '+' && string[indx+1] == '=') + if (c == WC_OR_C ('+') && string[indx+1] == '=') return (indx + 1); /* Variable names in assignment statements may contain only letters, @@ -397,7 +453,11 @@ assignment (string, flags) if (legal_variable_char (c) == 0) return (0); +#ifdef HANDLE_MULTIBYTE + indx += nb; n -= nb; +#else indx++; +#endif } return (0); } diff --git a/general.h b/general.h index d55f26bf..5452e956 100644 --- a/general.h +++ b/general.h @@ -103,8 +103,13 @@ extern char *strcpy __P((char *, const char *)); /* Define exactly what a legal shell identifier consists of. */ +#ifdef HANDLE_MULTIBYTE +#define legal_variable_starter(wc) (iswalpha(wc) || (L'_' == wc)) +#define legal_variable_char(wc) (iswalnum(wc) || (L'_' == wc)) +#else #define legal_variable_starter(c) (ISALPHA(c) || (c == '_')) #define legal_variable_char(c) (ISALNUM(c) || c == '_') +#endif /* Definitions used in subst.c and by the `read' builtin for field splitting. */ diff --git a/subst.c b/subst.c index 3093309f..3bd399dd 100644 --- a/subst.c +++ b/subst.c @@ -6717,6 +6717,7 @@ parameter_brace_expand_rhs (name, value, op, quoted, pflags, qdollaratp, hasdoll free (t); /* bash-4.4/5.0 */ + /* XXX: Update */ vname = name; if (*name == '!' && (legal_variable_starter ((unsigned char)name[1]) || DIGIT (name[1]) || VALID_INDIR_PARAM (name[1]))) @@ -7070,6 +7071,7 @@ get_var_and_type (varname, value, ind, quoted, flags, varp, valp) SHELL_VAR *v; arrayind_t lind; + /* XXX: Update */ want_indir = *varname == '!' && (legal_variable_starter ((unsigned char)varname[1]) || DIGIT (varname[1]) || VALID_INDIR_PARAM (varname[1])); @@ -8217,6 +8219,7 @@ parameter_brace_expand (string, indexp, quoted, pflags, quoted_dollar_atp, conta sindex = *indexp; t_index = ++sindex; /* ${#var} doesn't have any of the other parameter expansions on it. 
*/ + /* XXX: Update */ if (string[t_index] == '#' && legal_variable_starter (string[t_index+1])) /* {{ */ name = string_extract (string, &t_index, "}", SX_VARNAME); else @@ -8330,6 +8333,7 @@ parameter_brace_expand (string, indexp, quoted, pflags, quoted_dollar_atp, conta /* Indirect expansion begins with a `!'. A valid indirect expansion is either a variable name, one of the positional parameters or a special variable that expands to one of the positional parameters. */ + /* XXX: Update */ want_indir = *name == '!' && (legal_variable_starter ((unsigned char)name[1]) || DIGIT (name[1]) || VALID_INDIR_PARAM (name[1])); @@ -8388,6 +8392,7 @@ parameter_brace_expand (string, indexp, quoted, pflags, quoted_dollar_atp, conta } /* Process ${!PREFIX*} expansion. */ + /* XXX: Update */ if (want_indir && string[sindex - 1] == RBRACE && (string[sindex - 2] == '*' || string[sindex - 2] == '@') && legal_variable_starter ((unsigned char) name[1])) @@ -9213,6 +9218,7 @@ comsub: /* Find the variable in VARIABLE_LIST. */ temp = (char *)NULL; + /* XXX: Update */ for (t_index = zindex; (c = string[zindex]) && legal_variable_char (c); zindex++) ; temp1 = (zindex > t_index) ? substring (string, t_index, zindex) : (char *)NULL; diff --git a/variables.c b/variables.c index a08313d7..a41f1ba0 100644 --- a/variables.c +++ b/variables.c @@ -4409,6 +4409,7 @@ valid_exportstr (v) internal_error (_("%s has null exportstr"), v->name); return (0); } + /* XXX: Update */ if (legal_variable_starter ((unsigned char)*s) == 0) { internal_error (_("invalid character %d in exportstr for %s"), *s, v->name); @@ -4418,6 +4419,7 @@ valid_exportstr (v) { if (*s == '=') break; + /* XXX: Update */ if (legal_variable_char ((unsigned char)*s) == 0) { internal_error (_("invalid character %d in exportstr for %s"), *s, v->name); -- Eduardo Bustamante https://dualbus.me/