Hello Reuben, > > Before we can decide on this, IMO some analysis is needed: > > > > - What are the possible effects of reg_syntax_t on the string of > > characters to be escaped? I can see > > RE_BK_PLUS_QM -> +? > > RE_INTERVALS, RE_NO_BK_BRACES -> {} > > What other relations are there? > > RE_NO_BK_PARENS -> () > RE_NO_BK_VBAR -> |
Yup, thanks. > RE_NO_BK_REFS -> [:digit:] I don't know what you mean by that? '[' and ']' are already in the list of characters to be escaped. So no need to look at RE_NO_BK_REFS, right? > > - What characters need to be escaped in PCRE syntax? > > According to pcrepattern(3): > > ^$.[|()?*+{ Thanks. I'll add ] and } for symmetry. > > - Do Emacs and PCRE view a regex as a sequence of bytes or as a sequence > > of multibyte characters in the locale encoding (given by LC_CTYPE)? > > PCRE doesn't do locales; it treats strings as either bytes or, given a > specific flag, UTF-8. Weird! This means that the regex_quote task also needs to work on bytes when PCRE syntax is requested. > I don't really understand the question about Emacs: someone using > regex-quote in their own programs is worried about Emacs syntax There are two possible uses of regex_quote with EMACS syntax: - if your program wants to call re_compile_pattern, - if your program wants to pass such a regular expression to Emacs via command-line invocations or similar. In the first case, the result should be in locale encoding. In the second case, maybe not. How about this proposed API? 2011-03-06 Bruno Haible <br...@clisp.org> regex-quote: New API. * lib/regex-quote.h: Include <stdbool.h>. (struct regex_quote_spec): New type. (regex_quote_spec_posix, regex_quote_spec_gnu, regex_quote_spec_pcre): New declarations. (regex_quote_length, regex_quote_copy, regex_quote): Take a 'const struct regex_quote_spec *' argument. * lib/regex-quote.c (RE_*, PCRE_*): New macros. (pcre_special): New constant. (regex_quote_spec_posix, regex_quote_spec_gnu, regex_quote_spec_pcre): New functions. (regex_quote_length, regex_quote_copy, regex_quote): Take a 'const struct regex_quote_spec *' argument. * modules/regex-quote (Depends-on): Add stdbool. * tests/test-regex-quote.c (check): Update for new API. Add test for anchored results. * NEWS: Mention the API change. Reported by Reuben Thomas and Eric Blake. 
*** NEWS.orig Sun Mar 6 14:37:53 2011 --- NEWS Sun Mar 6 14:26:31 2011 *************** *** 12,17 **** --- 12,21 ---- Date Modules Changes + 2011-03-06 regex-quote The last argument is no longer an 'int cflags' + but instead a pointer to a previously constructed + 'struct regex_quote_spec'. + 2011-02-25 dirname These modules no longer put #defines for the dirname-lgpl following symbols into <config.h>: ISSLASH, backupfile FILE_SYSTEM_ACCEPTS_DRIVE_LETTER_PREFIX, *** lib/regex-quote.h.orig Sun Mar 6 14:37:53 2011 --- lib/regex-quote.h Sun Mar 6 14:26:31 2011 *************** *** 15,41 **** You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <stddef.h> ! /* regex_quote converts a literal string to a regular expression that will ! look for this literal string. ! cflags can be 0 or REG_EXTENDED. If it is 0, the result is a Basic Regular Expression (BRE) <http://www.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_03>. If it is REG_EXTENDED, the result is an Extended Regular Expression (ERE) <http://www.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04>. ! The result is not anchored; if you want it to match only complete lines, ! you need to add "^" at the beginning of the result and "$" at the end of the ! result. ! */ /* Returns the number of bytes needed for the quoted string. */ ! extern size_t regex_quote_length (const char *string, int cflags); /* Copies the quoted string to p and returns the incremented p. ! There must be room for regex_quote_length (string, cflags) + 1 bytes at p. ! */ ! extern char * regex_quote_copy (char *p, const char *string, int cflags); /* Returns the freshly allocated quoted string. */ ! extern char * regex_quote (const char *string, int cflags); --- 15,87 ---- You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ + #ifndef _REGEX_QUOTE_H + #define _REGEX_QUOTE_H + #include <stddef.h> + #include <stdbool.h> + + + /* Specifies a quotation task for converting a fixed string to a regular + expression pattern. */ + struct regex_quote_spec + { + /* True if the regular expression pattern consists of multibyte characters, + false if it consists of single bytes or UTF-8 characters. */ + unsigned int /*bool*/ multibyte : 1; + /* True if the regular expression pattern shall match only entire lines. */ + unsigned int /*bool*/ anchored : 1; + /* Set of characters that need to be escaped (all ASCII), as a + NUL-terminated string. */ + char special[30 + 1]; + }; ! ! /* Creates a quotation task that produces a POSIX regular expression, that is, ! a pattern that can be compiled with regcomp(). ! CFLAGS can be 0 or REG_EXTENDED. If it is 0, the result is a Basic Regular Expression (BRE) <http://www.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_03>. If it is REG_EXTENDED, the result is an Extended Regular Expression (ERE) <http://www.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04>. ! If ANCHORED is false, the regular expression will match substrings of lines. ! If ANCHORED is true, it will match only complete lines, */ ! extern struct regex_quote_spec ! regex_quote_spec_posix (int cflags, bool anchored); ! ! /* Creates a quotation task that produces a regular expression that can be ! compiled with the GNU API function re_compile_pattern(). ! SYNTAX describes the syntax of the regular expression (such as ! RE_SYNTAX_POSIX_BASIC, RE_SYNTAX_POSIX_EXTENDED, RE_SYNTAX_EMACS, all ! defined in <regex.h>). It must be the same value as 're_syntax_options' ! at the moment of the re_compile_pattern() call. ! If ANCHORED is false, the regular expression will match substrings of lines. ! If ANCHORED is true, it will match only complete lines, */ ! extern struct regex_quote_spec ! regex_quote_spec_gnu (unsigned long /*reg_syntax_t*/ syntax, bool anchored); ! ! 
/* Creates a quotation task that produces a PCRE regular expression, that is, ! a pattern that can be compiled with pcre_compile(). ! OPTIONS is the same value as the second argument passed to pcre_compile(). ! If ANCHORED is false, the regular expression will match substrings of lines. ! If ANCHORED is true, it will match only complete lines, */ ! extern struct regex_quote_spec ! regex_quote_spec_pcre (int options, bool anchored); ! /* Returns the number of bytes needed for the quoted string. */ ! extern size_t ! regex_quote_length (const char *string, const struct regex_quote_spec *spec); /* Copies the quoted string to p and returns the incremented p. ! There must be room for regex_quote_length (string, spec) + 1 bytes at p. */ ! extern char * ! regex_quote_copy (char *p, ! const char *string, const struct regex_quote_spec *spec); /* Returns the freshly allocated quoted string. */ ! extern char * ! regex_quote (const char *string, const struct regex_quote_spec *spec); ! ! ! #endif /* _REGEX_QUOTE_H */ *** lib/regex-quote.c.orig Sun Mar 6 14:37:53 2011 --- lib/regex-quote.c Sun Mar 6 14:26:58 2011 *************** *** 31,86 **** /* Characters that are special in an ERE. */ static const char ere_special[] = "$^.*[]\\+?{}()|"; size_t ! regex_quote_length (const char *string, int cflags) { ! const char *special = (cflags != 0 ? ere_special : bre_special); size_t length; - mbui_iterator_t iter; length = 0; ! for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter)) { ! /* We know that special contains only ASCII characters. */ ! if (mb_len (mbui_cur (iter)) == 1 ! && strchr (special, * mbui_cur_ptr (iter))) ! length += 1; ! length += mb_len (mbui_cur (iter)); } return length; } - /* Copies the quoted string to p and returns the incremented p. - There must be room for regex_quote_length (string, cflags) + 1 bytes at p. - */ char * ! regex_quote_copy (char *p, const char *string, int cflags) { ! const char *special = (cflags != 0 ? 
ere_special : bre_special); ! mbui_iterator_t iter; ! for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter)) { ! /* We know that special contains only ASCII characters. */ ! if (mb_len (mbui_cur (iter)) == 1 ! && strchr (special, * mbui_cur_ptr (iter))) ! *p++ = '\\'; ! memcpy (p, mbui_cur_ptr (iter), mb_len (mbui_cur (iter))); ! p += mb_len (mbui_cur (iter)); } return p; } - /* Returns the freshly allocated quoted string. */ char * ! regex_quote (const char *string, int cflags) { ! size_t length = regex_quote_length (string, cflags); char *result = XNMALLOC (length + 1, char); char *p; p = result; ! p = regex_quote_copy (p, string, cflags); *p = '\0'; return result; } --- 31,216 ---- /* Characters that are special in an ERE. */ static const char ere_special[] = "$^.*[]\\+?{}()|"; + struct regex_quote_spec + regex_quote_spec_posix (int cflags, bool anchored) + { + struct regex_quote_spec result; + + strcpy (result.special, cflags != 0 ? ere_special : bre_special); + result.multibyte = true; + result.anchored = anchored; + + return result; + } + + /* Syntax bit values, defined in GNU <regex.h>. We don't include it here, + otherwise this module would need to depend on gnulib module 'regex'. 
*/ + #define RE_BK_PLUS_QM 0x00000002 + #define RE_INTERVALS 0x00000200 + #define RE_LIMITED_OPS 0x00000400 + #define RE_NEWLINE_ALT 0x00000800 + #define RE_NO_BK_BRACES 0x00001000 + #define RE_NO_BK_PARENS 0x00002000 + #define RE_NO_BK_VBAR 0x00008000 + + struct regex_quote_spec + regex_quote_spec_gnu (unsigned long /*reg_syntax_t*/ syntax, bool anchored) + { + struct regex_quote_spec result; + char *p; + + p = result.special; + memcpy (p, bre_special, sizeof (bre_special) - 1); + p += sizeof (bre_special) - 1; + if ((syntax & RE_LIMITED_OPS) == 0 && (syntax & RE_BK_PLUS_QM) == 0) + { + *p++ = '+'; + *p++ = '?'; + } + if ((syntax & RE_INTERVALS) != 0 && (syntax & RE_NO_BK_BRACES) != 0) + { + *p++ = '{'; + *p++ = '}'; + } + if ((syntax & RE_NO_BK_PARENS) != 0) + { + *p++ = '('; + *p++ = ')'; + } + if ((syntax & RE_LIMITED_OPS) == 0 && (syntax & RE_NO_BK_VBAR) != 0) + *p++ = '|'; + if ((syntax & RE_NEWLINE_ALT) != 0) + *p++ = '\n'; + *p = '\0'; + + result.multibyte = true; + result.anchored = anchored; + + return result; + } + + /* Characters that are special in a PCRE. */ + static const char pcre_special[] = "$^.*[]\\+?{}()|"; + + /* Options bit values, defined in <pcre.h>. We don't include it here, because + it is not a standard header. */ + #define PCRE_ANCHORED 0x00000010 + #define PCRE_EXTENDED 0x00000008 + + struct regex_quote_spec + regex_quote_spec_pcre (int options, bool anchored) + { + struct regex_quote_spec result; + char *p; + + p = result.special; + memcpy (p, bre_special, sizeof (pcre_special) - 1); + p += sizeof (pcre_special) - 1; + if (options & PCRE_EXTENDED) + { + *p++ = ' '; + *p++ = '\t'; + *p++ = '\n'; + *p++ = '\v'; + *p++ = '\f'; + *p++ = '\r'; + *p++ = '#'; + } + *p = '\0'; + + /* PCRE regular expressions consist of UTF-8 characters if options contains + PCRE_UTF8 and of single bytes otherwise. */ + result.multibyte = false; + /* If options contains PCRE_ANCHORED, the anchoring is implicit. */ + result.anchored = (options & PCRE_ANCHORED ? 
0 : anchored); + + return result; + } + size_t ! regex_quote_length (const char *string, const struct regex_quote_spec *spec) { ! const char *special = spec->special; size_t length; length = 0; ! if (spec->anchored) ! length += 2; /* for '^' at the beginning and '$' at the end */ ! if (spec->multibyte) ! { ! mbui_iterator_t iter; ! ! for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter)) ! { ! /* We know that special contains only ASCII characters. */ ! if (mb_len (mbui_cur (iter)) == 1 ! && strchr (special, * mbui_cur_ptr (iter))) ! length += 1; ! length += mb_len (mbui_cur (iter)); ! } ! } ! else { ! const char *iter; ! ! for (iter = string; *iter != '\0'; iter++) ! { ! if (strchr (special, *iter)) ! length += 1; ! length += 1; ! } } + return length; } char * ! regex_quote_copy (char *p, const char *string, const struct regex_quote_spec *spec) { ! const char *special = spec->special; ! if (spec->anchored) ! *p++ = '^'; ! if (spec->multibyte) { ! mbui_iterator_t iter; ! ! for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter)) ! { ! /* We know that special contains only ASCII characters. */ ! if (mb_len (mbui_cur (iter)) == 1 ! && strchr (special, * mbui_cur_ptr (iter))) ! *p++ = '\\'; ! memcpy (p, mbui_cur_ptr (iter), mb_len (mbui_cur (iter))); ! p += mb_len (mbui_cur (iter)); ! } } + else + { + const char *iter; + + for (iter = string; *iter != '\0'; iter++) + { + if (strchr (special, *iter)) + *p++ = '\\'; + *p++ = *iter++; + } + } + if (spec->anchored) + *p++ = '$'; + return p; } char * ! regex_quote (const char *string, const struct regex_quote_spec *spec) { ! size_t length = regex_quote_length (string, spec); char *result = XNMALLOC (length + 1, char); char *p; p = result; ! 
p = regex_quote_copy (p, string, spec); *p = '\0'; return result; } *** modules/regex-quote.orig Sun Mar 6 14:37:53 2011 --- modules/regex-quote Sun Mar 6 14:26:31 2011 *************** *** 6,11 **** --- 6,12 ---- lib/regex-quote.c Depends-on: + stdbool xalloc mbuiter *** tests/test-regex-quote.c.orig Sun Mar 6 14:37:53 2011 --- tests/test-regex-quote.c Sun Mar 6 14:26:31 2011 *************** *** 29,46 **** static void check (const char *literal, int cflags, const char *expected) { char *result; size_t length; ! result = regex_quote (literal, cflags); ASSERT (strcmp (result, expected) == 0); ! length = regex_quote_length (literal, cflags); ASSERT (length == strlen (result)); free (result); result = (char *) xmalloc (1 + length + 1 + 1); result[0] = '^'; ! strcpy (regex_quote_copy (result + 1, literal, cflags), "$"); { regex_t regex; regmatch_t match[1]; --- 29,65 ---- static void check (const char *literal, int cflags, const char *expected) { + struct regex_quote_spec spec; char *result; size_t length; ! spec = regex_quote_spec_posix (cflags, false); ! result = regex_quote (literal, &spec); ASSERT (strcmp (result, expected) == 0); ! length = regex_quote_length (literal, &spec); ASSERT (length == strlen (result)); free (result); result = (char *) xmalloc (1 + length + 1 + 1); result[0] = '^'; ! strcpy (regex_quote_copy (result + 1, literal, &spec), "$"); ! { ! regex_t regex; ! regmatch_t match[1]; ! ! ASSERT (regcomp (&regex, result, cflags) == 0); ! ! ASSERT (regexec (&regex, literal, 1, match, 0) == 0); ! ASSERT (match[0].rm_so == 0); ! ASSERT (match[0].rm_eo == strlen (literal)); ! regfree (&regex); ! } ! free (result); ! ! spec = regex_quote_spec_posix (cflags, true); ! result = regex_quote (literal, &spec); ! length = regex_quote_length (literal, &spec); ! ASSERT (length == strlen (result)); { regex_t regex; regmatch_t match[1]; -- In memoriam Marie Politzer <http://fr.wikipedia.org/wiki/Marie_Politzer>