#------------------------------------------------------------------------
import re
import sys
def q(c):
    """Return a regular expression matching a region delimited by c.

    Inside the region any character, including c itself, may be escaped
    with a backslash; an escaped delimiter does not end the region.

    Args:
        c: The delimiter character, e.g. '"' or "'".

    Returns:
        The pattern as a string (not compiled).  Note that "." only
        matches a newline if the pattern is compiled with re.DOTALL.
    """
    delim = re.escape(c)
    # The character class excludes the backslash as well as the delimiter,
    # so a backslash can only ever be consumed by the escape alternative.
    # Keeping the two alternatives mutually exclusive means the engine can
    # neither backtrack into treating an escaped delimiter as the
    # terminator nor backtrack exponentially on an unterminated region
    # that contains many backslashes.
    return r"%s(\\.|[^%s\\])*%s" % (delim, delim, delim)
# String literals are delimited by double quotes and character constants
# by single quotes.  Both are matched only so that comment-like text
# inside them is recognised as part of the literal and left alone.
double_quoted_string = q('"')
single_quoted_string = q("'")
# /* ... */ comment; non-greedy so it ends at the first closing "*/".
# Relies on re.DOTALL below so that "." also spans newlines.
c_comment = r"/\*.*?\*/"
# // comment up to, but not including, the end of the line.  Leaving the
# newline out of the match keeps the following line on a line of its own
# (so a following preprocessor directive still starts its line) and lets
# a comment on an unterminated last line match as well.
cxx_comment = r"//[^\n]*"
rx = re.compile(
    "|".join([double_quoted_string, single_quoted_string,
              c_comment, cxx_comment]),
    re.DOTALL,
)
def replace(x):
    """Substitution callback: blank out comments, keep everything else.

    Args:
        x: Match object whose whole match (group 0) is one recognised
            token.

    Returns:
        A single space if the matched text starts with "/" (i.e. it is a
        comment), otherwise the matched text unchanged.
    """
    token = x.group(0)
    # Both comment forms ("/*" and "//") begin with a slash, whereas the
    # quoted tokens begin with their quote character.
    if token.startswith("/"):
        return ' '
    return token
# Act as a filter: read all of stdin, pass every token recognised by rx
# through replace(), and write the transformed text to stdout.
source_text = sys.stdin.read()
result = rx.sub(replace, source_text)
sys.stdout.write(result)
#------------------------------------------------------------------------
The regular expression matches ""-strings, ''-character-constants,
c-comments, and c++-comments. The replace function returns ' ' (space)
when the matched thing was a comment, or the original thing otherwise.
Depending on your use for this code, you may instead want replace() to
return as many '\n's as the matched comment contains (or ' ' if it
contains none), so that line numbers remain unchanged.
Basically, the regular expression is a tokenizer, and replace() chooses
what to do with each recognized token. Things not recognized as tokens
by the regular expression are left unchanged.
Jeff
PS this is the test file I used:
/* ... */ xyzzy;
456 // 123
const char *mystr = "This is /*trouble*/";
/* * */
/* /* */
// /* /* */
/* // /* */
/*
* */
pgp0CcH5aHF1o.pgp
Description: PGP signature
-- http://mail.python.org/mailman/listinfo/python-list
