Dear R core developers, One feature from Python that I have been wanting in R is the ability to capture groups in regular expressions using names. Consider the following example in R.
> notables <- c(" Ben Franklin and Jefferson Davis","\tMillard Fillmore") > name.rex <- "(?<first>[A-Z][a-z]+) (?<last>[A-Z][a-z]+)" > (parsed <- regexpr(name.rex,notables,perl=TRUE)) [1] 3 2 attr(,"match.length") [1] 12 16 attr(,"capture.start") [,1] [,2] [1,] 3 7 [2,] 2 10 attr(,"capture.length") [,1] [,2] [1,] 3 8 [2,] 7 8 attr(,"capture.names") [1] "first" "last" > parse.one(notables,parsed) first last [1,] "Ben" "Franklin" [2,] "Millard" "Fillmore" > parse.one(notables,parsed)[,"last"] [1] "Franklin" "Fillmore" The advantage to this approach is that you can tag groups by name, and then use the names later in the code to extract the matched substrings. I realized this is possible by using the PCRE library which ships with R, so in the last couple days I hacked a bit in src/main/grep.c in the R source code. I managed to get named capture to work with the standard gregexpr and regexpr functions. For backwards-compatibility, my strategy was to just add more attributes to the results of these functions, as shown above. Attached is the patch and some R code for testing the new features. It works fine for me with no memory problems. However, I noticed that there is some UTF8 handling code, which I did not touch (use_UTF8 is false on my machine). I presume we will need to make some small modifications to get it to work with unicode, but I'm not sure how to do them. Would you consider integrating this patch into the R source code for future releases, so the larger R community can take advantage of this feature? If there's anything else I can do to help please let me know. Sincerely, Toby Dylan Hocking http://cbio.ensmp.fr/~thocking/
Index: ../r-devel/src/main/grep.c =================================================================== --- ../r-devel/src/main/grep.c (revision 54562) +++ ../r-devel/src/main/grep.c (working copy) @@ -1635,28 +1635,38 @@ static SEXP gregexpr_perl(const char *pattern, const char *string, pcre *re_pcre, pcre_extra *re_pe, - Rboolean useBytes, Rboolean use_UTF8) + Rboolean useBytes, Rboolean use_UTF8, + int *ovector, int ovector_size, + int capture_count) { - int matchIndex = -1, st = 0, foundAll = 0, foundAny = 0, j, start=0; + int matchIndex = -1, st = 0, foundAll = 0, foundAny = 0, i,j, start=0; SEXP ans, matchlen; /* return vect and its attribute */ + SEXP capture,capturelen,capturebuf,capturelenbuf; SEXP matchbuf, matchlenbuf; /* buffers for storing multiple matches */ int bufsize = 1024; /* starting size for buffers */ + PROTECT(capturelenbuf = allocVector(INTSXP, bufsize*capture_count)); + PROTECT(capturebuf = allocVector(INTSXP, bufsize*capture_count)); PROTECT(matchbuf = allocVector(INTSXP, bufsize)); PROTECT(matchlenbuf = allocVector(INTSXP, bufsize)); while (!foundAll) { - int rc, ovector[3], slen = strlen(string); - rc = pcre_exec(re_pcre, re_pe, string, slen, start, 0, ovector, 3); + int rc, slen = strlen(string); + rc = pcre_exec(re_pcre, re_pe, string, slen, start, 0, + ovector, ovector_size); if (rc >= 0) { if ((matchIndex + 1) == bufsize) { - /* Reallocate match buffers */ + /* Reallocate match buffers + TODO: need to update this for new args + */ int newbufsize = bufsize * 2; SEXP tmp; + tmp = allocVector(INTSXP, 2 * bufsize); for (j = 0; j < bufsize; j++) INTEGER(tmp)[j] = INTEGER(matchlenbuf)[j]; UNPROTECT(1); matchlenbuf = tmp; PROTECT(matchlenbuf); + tmp = allocVector(INTSXP, 2 * bufsize); for (j = 0; j < bufsize; j++) INTEGER(tmp)[j] = INTEGER(matchbuf)[j]; @@ -1664,6 +1674,28 @@ UNPROTECT(2); PROTECT(matchbuf); PROTECT(matchlenbuf); + + tmp = allocVector(INTSXP, 2 * bufsize*capture_count); + for(j=0;j<bufsize;j++)for(i=0;i<capture_count;i++) + INTEGER(tmp)[j+2*bufsize*i] = + INTEGER(capturebuf)[j+bufsize*i]; + capturebuf=tmp; + UNPROTECT(3); + PROTECT(matchbuf); + PROTECT(matchlenbuf); + PROTECT(capturebuf); + + tmp = allocVector(INTSXP, 2 * bufsize*capture_count); + for(j=0;j<bufsize;j++)for(i=0;i<capture_count;i++) + INTEGER(tmp)[j+2*bufsize*i] = + INTEGER(capturelenbuf)[j+bufsize*i]; + capturelenbuf=tmp; + UNPROTECT(4); + PROTECT(matchbuf); + PROTECT(matchlenbuf); + PROTECT(capturebuf); + PROTECT(capturelenbuf); + bufsize = newbufsize; } matchIndex++; @@ -1677,21 +1709,31 @@ else start = ovector[1]; if (use_UTF8) { - int mlen = ovector[1] - st; - /* Unfortunately these are in bytes */ - if (st > 0) { - INTEGER(matchbuf)[matchIndex] = 1 + getNc(string, st); - if (INTEGER(matchbuf)[matchIndex] <= 0) { /* an invalid string */ - INTEGER(matchbuf)[matchIndex] = NA_INTEGER; - foundAll = 1; /* if we get here, we are done */ - } + int mlen = ovector[1] - st; + /* Unfortunately these are in bytes */ + if (st > 0) { + INTEGER(matchbuf)[matchIndex] = 1 + getNc(string, st); + if (INTEGER(matchbuf)[matchIndex] <= 0) { /* an invalid string */ + INTEGER(matchbuf)[matchIndex] = NA_INTEGER; + foundAll = 1; /* if we get here, we are done */ } - INTEGER(matchlenbuf)[matchIndex] = getNc(string+st, mlen); - if (INTEGER(matchlenbuf)[matchIndex] < 0) {/* an invalid string */ - INTEGER(matchlenbuf)[matchIndex] = NA_INTEGER; - foundAll = 1; - } + } + INTEGER(matchlenbuf)[matchIndex] = getNc(string+st, mlen); + if (INTEGER(matchlenbuf)[matchIndex] < 0) {/* an invalid string */ + INTEGER(matchlenbuf)[matchIndex] = NA_INTEGER; + foundAll = 1; + } } + /* also extract capture locations */ + int ind, *ov; + for(i=0;i<capture_count;i++){ + ov = ovector+2*(i+1); + ind=matchIndex+bufsize*i; + st=ov[0]+1; + INTEGER(capturebuf)[ind] = st; + INTEGER(capturelenbuf)[ind] = ov[1]-st+1; + } + /* TODO: if(use_UTF8)... */ if (start >= slen) foundAll = 1; } else { foundAll = 1; @@ -1700,15 +1742,28 @@ } PROTECT(ans = allocVector(INTSXP, matchIndex + 1)); PROTECT(matchlen = allocVector(INTSXP, matchIndex + 1)); + PROTECT(capture = allocMatrix(INTSXP,matchIndex+1,capture_count)); + PROTECT(capturelen = allocMatrix(INTSXP,matchIndex+1,capture_count)); + if (foundAny) { /* copy from buffers */ for (j = 0; j <= matchIndex; j++) { INTEGER(ans)[j] = INTEGER(matchbuf)[j]; INTEGER(matchlen)[j] = INTEGER(matchlenbuf)[j]; + for(i=0;i<capture_count;i++){ + int return_index = j+(matchIndex+1)*i; + int buffer_index = j+bufsize*i; + INTEGER(capture)[return_index] = + INTEGER(capturebuf)[buffer_index]; + INTEGER(capturelen)[return_index] = + INTEGER(capturelenbuf)[buffer_index]; + } } } else INTEGER(ans)[0] = INTEGER(matchlen)[0] = -1; setAttrib(ans, install("match.length"), matchlen); - UNPROTECT(4); + setAttrib(ans, install("capture.start"), capture); + setAttrib(ans, install("capture.length"), capturelen); + UNPROTECT(8); return ans; } @@ -1747,6 +1802,10 @@ const unsigned char *tables = NULL /* -Wall */; Rboolean use_UTF8 = FALSE, use_WC = FALSE; const void *vmax; + int capture_count,capture_num, *ovector, ovector_size, + name_count, name_entry_size, info_code; + char *name_table; + SEXP capture_names; checkArity(op, args); pat = CAR(args); args = CDR(args); @@ -1834,6 +1893,26 @@ if (errorptr) warning(_("PCRE pattern study error\n\t'%s'\n"), errorptr); } + /* also extract info for named groups */ + pcre_fullinfo(re_pcre,re_pe,PCRE_INFO_NAMECOUNT,&name_count); + pcre_fullinfo(re_pcre,re_pe,PCRE_INFO_NAMEENTRYSIZE,&name_entry_size); + pcre_fullinfo(re_pcre,re_pe,PCRE_INFO_NAMETABLE,&name_table); + info_code = + pcre_fullinfo(re_pcre,re_pe,PCRE_INFO_CAPTURECOUNT,&capture_count); + if(info_code<0){ + error(_("pcre_fullinfo returned '%d' "), info_code); + } + ovector_size = (capture_count+1)*3; + ovector = (int*)malloc(ovector_size*sizeof(int)); + SEXP thisname; + PROTECT(capture_names=allocVector(STRSXP,capture_count)); + for(i=0;i<name_count;i++){ + char *entry = name_table + name_entry_size * i; + PROTECT(thisname = mkChar((char*)(entry+2))); + capture_num = (entry[0]<<8)+entry[1]-1; + SET_STRING_ELT(capture_names,capture_num,thisname); + UNPROTECT(1); + } } else { int cflags = REG_EXTENDED; if (igcase_opt) cflags |= REG_ICASE; @@ -1845,10 +1924,16 @@ } if (PRIMVAL(op) == 0) { /* regexpr */ - SEXP matchlen; + SEXP matchlen,capture_start,capturelen; PROTECT(ans = allocVector(INTSXP, n)); matchlen = allocVector(INTSXP, n); /* protected by next line */ setAttrib(ans, install("match.length"), matchlen); + if(perl_opt){ + capture_start = allocMatrix(INTSXP,n,capture_count); + setAttrib(ans, install("capture.start"), capture_start); + capturelen = allocMatrix(INTSXP,n,capture_count); + setAttrib(ans, install("capture.length"), capturelen); + } vmax = vmaxget(); for (i = 0 ; i < n ; i++) { @@ -1886,8 +1971,9 @@ INTEGER(matchlen)[i] = INTEGER(ans)[i] >= 0 ? strlen(spat):-1; } else if (perl_opt) { - int rc, ovector[3]; - rc = pcre_exec(re_pcre, re_pe, s, strlen(s), 0, 0, ovector, 3); + int rc; + rc = pcre_exec(re_pcre, re_pe, s, strlen(s), 0, 0, + ovector, ovector_size); if (rc >= 0) { int st = ovector[0]; INTEGER(ans)[i] = st + 1; /* index from one */ @@ -1902,6 +1988,17 @@ nc = getNc(s + st, mlen); INTEGER(matchlen)[i] = (nc >= 0) ? nc : NA_INTEGER; } + /* also readout capture group locations */ + int ind, *ov; + for(capture_num=0;capture_num<capture_count; + capture_num++){ + ov = ovector+2*(capture_num+1); + ind=i+n*capture_num; + st=ov[0]+1; + INTEGER(capture_start)[ind] = st; + INTEGER(capturelen)[ind] = ov[1]-st+1; + } + /* TODO: add if(use_UTF8)... */ } else INTEGER(ans)[i] = INTEGER(matchlen)[i] = -1; } else { if (!use_WC) @@ -1940,7 +2037,9 @@ if (fixed_opt) elt = gregexpr_fixed(spat, s, useBytes, use_UTF8); else - elt = gregexpr_perl(spat, s, re_pcre, re_pe, useBytes, use_UTF8); + elt = gregexpr_perl(spat, s, re_pcre, re_pe, + useBytes, use_UTF8, ovector, + ovector_size, capture_count); } } else elt = gregexpr_Regexc(®, STRING_ELT(text, i), useBytes, use_WC); @@ -1955,6 +2054,9 @@ if (re_pe) pcre_free(re_pe); pcre_free(re_pcre); pcre_free((void *)tables); + setAttrib(ans, install("capture.names"), capture_names); + UNPROTECT(1); + free(ovector); } else tre_regfree(®);
### Toby Dylan Hocking, 25 feb 2011. Some R code to test my ### implementation of new named capture group features in gregexpr gctorture(FALSE)##for debugging ### Parse result of gregexpr(,string) result2list <- function(string,result){ extract.substrings <- function(string,starts,lengths,names){ subs <- substring(string,starts,starts+lengths-1) m <- matrix(subs,ncol=length(names)) colnames(m) <- names m } N <- attr(result,"capture.names") lapply(seq_along(result),function(i){ extract.substrings(string[i], attr(result[[i]],"capture.start"), attr(result[[i]],"capture.length"), N) }) } ### Parse result of regexpr(,string) parse.one <- function(string,result){ m <- do.call(rbind,lapply(seq_along(string),function(i){ st <- attr(result,"capture.start")[i,] substring(string[i],st,st+attr(result,"capture.length")[i,]-1) })) colnames(m) <- attr(result,"capture.names") m } string <- c("another foobar bazing string", "fbar baz foooobar baz", "no matches here", "my foobar baz st another fooooobar baz dude fobar baz") notables <- c(" Ben Franklin and Jefferson Davis","\tMillard Fillmore") name.rex <- "(?<first>[A-Z][a-z]+) (?<last>[A-Z][a-z]+)" for(i in 1:100) parsed <- gregexpr(name.rex,notables,perl=TRUE) parsed[[1]] result2list(notables,parsed) (parsed <- regexpr(name.rex,notables,perl=TRUE)) parse.one(notables,parsed) parse.one(notables,parsed)[,"last"] result <- gregexpr("f(?<os_in_foo>o*)b(?<as_in_bar>a*)r (baz)",string,perl=TRUE) print(result) s2 <- paste(rep("foobar",1030),collapse=" ") result <- gregexpr("f(?<os_in_foo>o*)b(?<as_in_bar>a*)r",s2,perl=TRUE) ## negative controls regexpr(name.rex,notables)##perl not TRUE, bad regexp regexpr("([A-Z][a-z]+) ([A-Z][a-z]+)",notables)##still works like usual
______________________________________________ R-devel@r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-devel