Hi.

Attached is a diff with bits and pieces (fixes / changes) in gawk's
regex routines that you may wish to apply to the GNULIB version.

Much of this is cosmetic, but there are a few substantial fixes as well.

I *may* have already sent this to you, but I'm not sure, so I'm resending
just in case.

Thanks,

Arnold
diff --git a/lib/regcomp.c b/lib/regcomp.c
index 4cbb1b2..3b5c68d 100644
--- a/lib/regcomp.c
+++ b/lib/regcomp.c
@@ -129,7 +129,7 @@ static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
    POSIX doesn't require that we do anything for REG_NOERROR,
    but why not be nice?  */
 
-static const char __re_error_msgid[] =
+static const char __re_error_msgid[] attribute_hidden =
   {
 #define REG_NOERROR_IDX	0
     gettext_noop ("Success")	/* REG_NOERROR */
@@ -153,9 +153,9 @@ static const char __re_error_msgid[] =
     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
     "\0"
 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
-    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
+    gettext_noop ("Unmatched [, [^, [:, [., or [=")	/* REG_EBRACK */
     "\0"
-#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
+#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
     "\0"
 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
@@ -183,7 +183,7 @@ static const char __re_error_msgid[] =
     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
   };
 
-static const size_t __re_error_msgid_idx[] =
+static const size_t __re_error_msgid_idx[] attribute_hidden =
   {
     REG_NOERROR_IDX,
     REG_NOMATCH_IDX,
@@ -277,7 +277,7 @@ int
 re_compile_fastmap (bufp)
     struct re_pattern_buffer *bufp;
 {
-  re_dfa_t *dfa = bufp->buffer;
+  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
   char *fastmap = bufp->fastmap;
 
   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
@@ -311,7 +311,7 @@ static void
 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 			 char *fastmap)
 {
-  re_dfa_t *dfa = bufp->buffer;
+  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
   Idx node_cnt;
   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
@@ -665,7 +665,7 @@ void
 regfree (preg)
     regex_t *preg;
 {
-  re_dfa_t *dfa = preg->buffer;
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   if (BE (dfa != NULL, 1))
     {
       lock_fini (dfa->lock);
@@ -775,7 +775,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length,
   preg->regs_allocated = REGS_UNALLOCATED;
 
   /* Initialize the dfa.  */
-  dfa = preg->buffer;
+  dfa = (re_dfa_t *) preg->buffer;
   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
     {
       /* If zero allocated, but buffer is non-null, try to realloc
@@ -786,7 +786,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length,
       if (dfa == NULL)
 	return REG_ESPACE;
       preg->allocated = sizeof (re_dfa_t);
-      preg->buffer = dfa;
+      preg->buffer = (unsigned char *) dfa;
     }
   preg->used = sizeof (re_dfa_t);
 
@@ -1009,7 +1009,7 @@ init_word_char (re_dfa_t *dfa)
 static void
 free_workarea_compile (regex_t *preg)
 {
-  re_dfa_t *dfa = preg->buffer;
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   bin_tree_storage_t *storage, *next;
   for (storage = dfa->str_tree_storage; storage; storage = next)
     {
@@ -1193,7 +1193,7 @@ optimize_utf8 (re_dfa_t *dfa)
 static reg_errcode_t
 analyze (regex_t *preg)
 {
-  re_dfa_t *dfa = preg->buffer;
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   reg_errcode_t ret;
 
   /* Allocate arrays.  */
@@ -1205,7 +1205,12 @@ analyze (regex_t *preg)
 	  || dfa->eclosures == NULL, 0))
     return REG_ESPACE;
 
-  dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
+  /* some malloc()-checkers don't like zero allocations */
+  if (preg->re_nsub > 0)
+    dfa->subexp_map = re_malloc (int, preg->re_nsub);
+  else
+    dfa->subexp_map = NULL;
+
   if (dfa->subexp_map != NULL)
     {
       Idx i;
@@ -1374,7 +1379,7 @@ lower_subexps (void *extra, bin_tree_t *node)
 static bin_tree_t *
 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
 {
-  re_dfa_t *dfa = preg->buffer;
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   bin_tree_t *body = node->left;
   bin_tree_t *op, *cls, *tree1, *tree;
 
@@ -2155,7 +2160,7 @@ static bin_tree_t *
 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
        reg_errcode_t *err)
 {
-  re_dfa_t *dfa = preg->buffer;
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   bin_tree_t *tree, *eor, *root;
   re_token_t current_token;
   dfa->syntax = syntax;
@@ -2189,7 +2194,7 @@ static bin_tree_t *
 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
 {
-  re_dfa_t *dfa = preg->buffer;
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   bin_tree_t *tree, *branch = NULL;
   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
   tree = parse_branch (regexp, preg, token, syntax, nest, err);
@@ -2239,7 +2244,7 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
 {
   bin_tree_t *tree, *expr;
-  re_dfa_t *dfa = preg->buffer;
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   tree = parse_expression (regexp, preg, token, syntax, nest, err);
   if (BE (*err != REG_NOERROR && tree == NULL, 0))
     return NULL;
@@ -2283,7 +2288,7 @@ static bin_tree_t *
 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
 {
-  re_dfa_t *dfa = preg->buffer;
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   bin_tree_t *tree;
   switch (token->type)
     {
@@ -2507,7 +2512,7 @@ static bin_tree_t *
 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
 {
-  re_dfa_t *dfa = preg->buffer;
+  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   bin_tree_t *tree;
   size_t cur_nsub;
   cur_nsub = preg->re_nsub++;
@@ -2746,10 +2751,15 @@ build_range_exp (const reg_syntax_t syntax,
     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
 		 : 0));
+    /*
+     * Fedora Core 2, maybe others, have broken `btowc' that returns -1
+     * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
+     * unsigned, so we don't have sign extension problems.
+     */
     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
-		? __btowc (start_ch) : start_elem->opr.wch);
+		? start_ch : start_elem->opr.wch);
     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
-	      ? __btowc (end_ch) : end_elem->opr.wch);
+	      ? end_ch : end_elem->opr.wch);
     if (start_wc == WEOF || end_wc == WEOF)
       return REG_ECOLLATE;
     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
@@ -2779,7 +2789,14 @@ build_range_exp (const reg_syntax_t syntax,
 					new_nranges);
 
 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
-	      return REG_ESPACE;
+              {
+                 /* if one is not NULL, free it to avoid leaks */
+                 if (new_array_start != NULL)
+                     re_free(new_array_start);
+                 if (new_array_end != NULL)
+                     re_free(new_array_end);
+	         return REG_ESPACE;
+	      }
 
 	    mbcset->range_starts = new_array_start;
 	    mbcset->range_ends = new_array_end;
@@ -3517,6 +3534,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name)
 	return REG_ECOLLATE;
 
       /* Build single byte matching table for this equivalence class.  */
+      char_buf[1] = (unsigned char) '\0';
       len = weights[idx1 & 0xffffff];
       for (ch = 0; ch < SBC_MAX; ++ch)
 	{
@@ -3686,6 +3704,13 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
   if (BE (sbcset == NULL, 0))
 #endif /* not RE_ENABLE_I18N */
     {
+      /* if one is not NULL, free it to avoid leaks */
+      if (sbcset != NULL)
+         free(sbcset);
+#ifdef RE_ENABLE_I18N
+      if (mbcset != NULL)
+         free(mbcset);
+#endif
       *err = REG_ESPACE;
       return NULL;
     }
@@ -3728,6 +3753,7 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
 #endif
 
   /* Build a tree for simple bracket.  */
+  memset(& br_token, 0, sizeof(br_token));	/* silence "not initialized" errors from static checkers */
   br_token.type = SIMPLE_BRACKET;
   br_token.opr.sbcset = sbcset;
   tree = create_token_tree (dfa, NULL, NULL, &br_token);
@@ -3822,6 +3848,7 @@ create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
 	     re_token_type_t type)
 {
   re_token_t t;
+  memset(& t, 0, sizeof(t));	/* silence "not initialized" errors from static checkers */
   t.type = type;
   return create_token_tree (dfa, left, right, &t);
 }
diff --git a/lib/regex_internal.c b/lib/regex_internal.c
index 93d7ee9..c299759 100644
--- a/lib/regex_internal.c
+++ b/lib/regex_internal.c
@@ -512,11 +512,11 @@ re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 	  pstr->cur_state = prev_st;
 	}
       else
-	wc = wc2;
+	wc = (wint_t) wc2;
       /* Then proceed the next character.  */
       rawbuf_idx += mbclen;
     }
-  *last_wc = wc;
+  *last_wc = (wint_t) wc;
   return rawbuf_idx;
 }
 #endif /* RE_ENABLE_I18N  */
@@ -536,7 +536,10 @@ build_upper_buffer (re_string_t *pstr)
       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
       if (BE (pstr->trans != NULL, 0))
 	ch = pstr->trans[ch];
-      pstr->mbs[char_idx] = toupper (ch);
+      if (islower (ch))
+	pstr->mbs[char_idx] = toupper (ch);
+      else
+	pstr->mbs[char_idx] = ch;
     }
   pstr->valid_len = char_idx;
   pstr->valid_raw_len = char_idx;
@@ -966,6 +969,16 @@ static reg_errcode_t
 internal_function __attribute_warn_unused_result__
 re_node_set_alloc (re_node_set *set, Idx size)
 {
+  /*
+   * ADR: valgrind says size can be 0, which then doesn't
+   * free the block of size 0.  Harumph. This seems
+   * to work ok, though.
+   */
+  if (size == 0)
+    {
+       memset(set, 0, sizeof(*set));
+       return REG_NOERROR;
+    }
   set->alloc = size;
   set->nelem = 0;
   set->elems = re_malloc (Idx, size);
@@ -1428,7 +1441,18 @@ re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
       if (BE (new_nexts == NULL || new_indices == NULL
 	      || new_edests == NULL || new_eclosures == NULL, 0))
-	return REG_MISSING;
+        {
+	   /* if any are not NULL, free them, avoid leaks */
+	   if (new_nexts != NULL)
+              re_free(new_nexts);
+	   if (new_indices != NULL)
+              re_free(new_indices);
+	   if (new_edests != NULL)
+              re_free(new_edests);
+	   if (new_eclosures != NULL)
+              re_free(new_eclosures);
+	   return REG_MISSING;
+	}
       dfa->nexts = new_nexts;
       dfa->org_indices = new_indices;
       dfa->edests = new_edests;
diff --git a/lib/regex_internal.h b/lib/regex_internal.h
index 0307a34..c634a00 100644
--- a/lib/regex_internal.h
+++ b/lib/regex_internal.h
@@ -117,6 +117,10 @@
 # define BE(expr, val) __builtin_expect (expr, val)
 #else
 # define BE(expr, val) (expr)
+# ifdef inline
+# undef inline
+# endif
+# define inline
 #endif
 
 /* Number of ASCII characters.  */

Reply via email to