Very often....make that very very very very very very very very very often, I find myself processing text in Python where, when .split()'ing a line, I'd like to keep a 'quoted' item from being split...quoted because it contains whitespace or the sep char.
For example:

    s = ' Chan: 11 SNR: 22 ESSID: "Spaced Out Wifi" Enc: On'

If I want to yank the ESSID in the above example, it's a pain. But with my new dandy split-quoted method, .split() takes a 3rd argument that specifies a quote delimiter: no splitting occurs between a pair of quote delimiters, and the quote chars are dropped:

    s.split(None,-1,'"')[5]
    'Spaced Out Wifi'

Attached is a proof-of-concept patch against Python-2.4.1/Objects/stringobject.c that implements this. It is limited to whitespace splitting only (sep == None).

As implemented, the quote delimiter also doubles as an additional separator for splitting out a substring. For example:

    'There is"no whitespace before these"quotes'.split(None,-1,'"')
    ['There', 'is', 'no whitespace before these', 'quotes']

This is useful, but possibly better put into practice as a separate method?? Comments please.

Dave
--- stringobject.c.orig	2006-05-17 16:12:13.000000000 -0400
+++ stringobject.c	2006-05-17 23:49:52.000000000 -0400
@@ -1336,6 +1336,104 @@
 	return NULL;
 }
 
+/* Whitespace split that honours one quote character (proof of concept).
+ *
+ * Splits s[0:len] on runs of whitespace, except that text between a pair
+ * of qsub[0] characters is kept as a single item with the quotes dropped.
+ * A quote character also acts as a separator, and an empty pair of quotes
+ * yields an empty item.  Only qsub[0] is examined.  maxsplit limits the
+ * number of non-empty items split off; once it is used up, the rest of the
+ * input is appended unsplit.  Returns a new list, or NULL on failure.
+ *
+ * Example:
+ *   'This string has "not only this" "and this" but"this mixed in string"as well as this "" empty one and two more at the end""""'.split(None,-1,'"')
+ * Expected:
+ *   ['This', 'string', 'has', 'not only this', 'and this', 'but', 'this mixed in string', 'as', 'well', 'as', 'this', '', 'empty', 'one', 'and', 'two', 'more', 'at', 'the', 'end', '', '']
+ */
+static PyObject *
+split_whitespace_quoted(const char *s, int len, int maxsplit, const char *qsub)
+{
+	/* quoted: 0 = outside quotes, 1 = opening quote seen,
+	   2 = matching closing quote seen. */
+	int i, j, quoted = 0;
+	PyObject *str;
+	PyObject *list = PyList_New(0);
+
+	if (list == NULL)
+		return NULL;
+
+	for (i = j = 0; i < len; ) {
+
+		if (!quoted) {
+			while (i < len && isspace(Py_CHARMASK(s[i])))
+				i++;
+		}
+
+		/* Bounds check first: the whitespace skip may have hit the end. */
+		if (i < len && Py_CHARMASK(s[i]) == Py_CHARMASK(qsub[0])) {
+			quoted = 1;
+			i++;
+		}
+
+		j = i;
+
+		while (i < len) {
+			if (Py_CHARMASK(s[i]) == Py_CHARMASK(qsub[0])) {
+				if (quoted)
+					quoted = 2; /* End of quotes found */
+				else {
+					/* Start of new quotes in the middle of a string */
+					quoted = 1;
+				}
+				break;
+			} else if (!quoted && isspace(Py_CHARMASK(s[i])))
+				break;
+			i++;
+		}
+
+		if (quoted == 2 && j == i) { /* Empty string in quotes */
+			SPLIT_APPEND("", 0, 0);
+			quoted = 0;
+			i++;
+			j = i;
+
+		} else if (j < i) {
+			if (maxsplit-- <= 0)
+				break;
+			SPLIT_APPEND(s, j, i);
+
+			if (quoted == 2) {
+				quoted = 0;
+				i++;
+			} else if (quoted == 1) {
+				i++;
+				/* Embedded empty string in quotes.  With a trailing
+				   or unterminated quote i is at or past the end
+				   here, so check the bound before reading s[i]. */
+				if (i < len &&
+				    Py_CHARMASK(s[i]) == Py_CHARMASK(qsub[0])) {
+					SPLIT_APPEND("", 0, 0);
+					quoted = 0;
+					i++;
+				}
+			} else {
+				while (i < len && isspace(Py_CHARMASK(s[i])))
+					i++;
+			}
+
+			j = i;
+		}
+	}
+	if (j < len) {
+		SPLIT_APPEND(s, j, len);
+	}
+	return list;
+  onError:
+	Py_DECREF(list);
+	return NULL;
+}
+
+
 static PyObject *
 split_char(const char *s, int len, char ch, int maxcount)
 {
@@ -1376,15 +1474,32 @@
 static PyObject *
 string_split(PyStringObject *self, PyObject *args)
 {
-	int len = PyString_GET_SIZE(self), n, i, j, err;
+	int len = PyString_GET_SIZE(self), n, qn, i, j, err;
 	int maxsplit = -1;
-	const char *s = PyString_AS_STRING(self), *sub;
-	PyObject *list, *item, *subobj = Py_None;
+	const char *s = PyString_AS_STRING(self), *sub, *qsub;
+	PyObject *list, *item, *subobj = Py_None, *qsubobj = Py_None;
 
-	if (!PyArg_ParseTuple(args, "|Oi:split", &subobj, &maxsplit))
+	if (!PyArg_ParseTuple(args, "|OiO:split", &subobj, &maxsplit, &qsubobj))
 		return NULL;
 	if (maxsplit < 0)
 		maxsplit = INT_MAX;
+	if (qsubobj != Py_None) {
+		/* qsub and qn are only set from a str; reject anything else
+		   rather than reading them uninitialized. */
+		if (!PyString_Check(qsubobj)) {
+			PyErr_SetString(PyExc_TypeError,
+					"quote delimiter must be a string");
+			return NULL;
+		}
+		qsub = PyString_AS_STRING(qsubobj);
+		qn = PyString_GET_SIZE(qsubobj);
+		if (qn == 0) {
+			PyErr_SetString(PyExc_ValueError, "empty delimiter");
+			return NULL;
+		}
+		if (subobj == Py_None)
+			return split_whitespace_quoted(s, len, maxsplit, qsub);
+	}
 	if (subobj == Py_None)
 		return split_whitespace(s, len, maxsplit);
 	if (PyString_Check(subobj)) {
_______________________________________________ Python-Dev mailing list Python-Dev@python.org http://mail.python.org/mailman/listinfo/python-dev Unsubscribe: http://mail.python.org/mailman/options/python-dev/archive%40mail-archive.com