[Rd] Named capture in regexp

Toby Dylan Hocking Fri, 25 Feb 2011 13:16:46 -0800

Dear R core developers,

One feature from Python that I have been wanting in R is the ability
to capture groups in regular expressions using names. Consider the
following example in R.


> notables <- c("  Ben Franklin and Jefferson Davis","\tMillard Fillmore")
> name.rex <- "(?<first>[A-Z][a-z]+) (?<last>[A-Z][a-z]+)"
> (parsed <- regexpr(name.rex,notables,perl=TRUE))
[1] 3 2
attr(,"match.length")
[1] 12 16
attr(,"capture.start")
     [,1] [,2]
[1,]    3    7
[2,]    2   10
attr(,"capture.length")
     [,1] [,2]
[1,]    3    8
[2,]    7    8
attr(,"capture.names")
[1] "first" "last" 
> parse.one(notables,parsed)
     first     last      
[1,] "Ben"     "Franklin"
[2,] "Millard" "Fillmore"
> parse.one(notables,parsed)[,"last"]
[1] "Franklin" "Fillmore"

The advantage to this approach is that you can tag groups by name, and
then use the names later in the code to extract the matched substrings.

I realized this is possible by using the PCRE library which ships with
R, so in the last couple days I hacked a bit in src/main/grep.c in the
R source code. I managed to get named capture to work with the
standard gregexpr and regexpr functions. For backwards-compatibility,
my strategy was to just add more attributes to the results of these
functions, as shown above.

Attached is the patch and some R code for testing the new features. It
works fine for me with no memory problems. However, I noticed that
there is some UTF8 handling code, which I did not touch (use_UTF8 is
false on my machine). I presume we will need to make some small
modifications to get it to work with unicode, but I'm not sure how to
do them.

Would you consider integrating this patch into the R source code for
future releases, so the larger R community can take advantage of this
feature? If there's anything else I can do to help please let me know.

Sincerely,
Toby Dylan Hocking
http://cbio.ensmp.fr/~thocking/

Index: ../r-devel/src/main/grep.c
===================================================================
--- ../r-devel/src/main/grep.c	(revision 54562)
+++ ../r-devel/src/main/grep.c	(working copy)
@@ -1635,28 +1635,38 @@
 static SEXP
 gregexpr_perl(const char *pattern, const char *string,
 	      pcre *re_pcre, pcre_extra *re_pe,
-	      Rboolean useBytes, Rboolean use_UTF8)
+	      Rboolean useBytes, Rboolean use_UTF8,
+	      int *ovector, int ovector_size,
+	      int capture_count)
 {
-    int matchIndex = -1, st = 0, foundAll = 0, foundAny = 0, j, start=0;
+    int matchIndex = -1, st = 0, foundAll = 0, foundAny = 0, i,j, start=0;
     SEXP ans, matchlen;         /* return vect and its attribute */
+    SEXP capture,capturelen,capturebuf,capturelenbuf;
     SEXP matchbuf, matchlenbuf; /* buffers for storing multiple matches */
     int bufsize = 1024;         /* starting size for buffers */
+    PROTECT(capturelenbuf = allocVector(INTSXP, bufsize*capture_count));
+    PROTECT(capturebuf = allocVector(INTSXP, bufsize*capture_count));
     PROTECT(matchbuf = allocVector(INTSXP, bufsize));
     PROTECT(matchlenbuf = allocVector(INTSXP, bufsize));
     while (!foundAll) {
-	int rc, ovector[3], slen = strlen(string);
-	rc = pcre_exec(re_pcre, re_pe, string, slen, start, 0, ovector, 3);
+	int rc, slen = strlen(string);
+	rc = pcre_exec(re_pcre, re_pe, string, slen, start, 0, 
+		       ovector, ovector_size);
 	if (rc >= 0) {
 	    if ((matchIndex + 1) == bufsize) {
-		/* Reallocate match buffers */
+		/* Reallocate match buffers
+		   TODO: need to update this for new args
+		 */
 		int newbufsize = bufsize * 2;
 		SEXP tmp;
+
 		tmp = allocVector(INTSXP, 2 * bufsize);
 		for (j = 0; j < bufsize; j++)
 		    INTEGER(tmp)[j] = INTEGER(matchlenbuf)[j];
 		UNPROTECT(1);
 		matchlenbuf = tmp;
 		PROTECT(matchlenbuf);
+
 		tmp = allocVector(INTSXP, 2 * bufsize);
 		for (j = 0; j < bufsize; j++)
 		    INTEGER(tmp)[j] = INTEGER(matchbuf)[j];
@@ -1664,6 +1674,28 @@
 		UNPROTECT(2);
 		PROTECT(matchbuf);
 		PROTECT(matchlenbuf);
+
+		tmp = allocVector(INTSXP, 2 * bufsize*capture_count);
+		for(j=0;j<bufsize;j++)for(i=0;i<capture_count;i++)
+		  INTEGER(tmp)[j+2*bufsize*i] = 
+		    INTEGER(capturebuf)[j+bufsize*i];
+		capturebuf=tmp;
+		UNPROTECT(3);
+		PROTECT(matchbuf);
+		PROTECT(matchlenbuf);
+		PROTECT(capturebuf);
+
+		tmp = allocVector(INTSXP, 2 * bufsize*capture_count);
+		for(j=0;j<bufsize;j++)for(i=0;i<capture_count;i++)
+		  INTEGER(tmp)[j+2*bufsize*i] = 
+		    INTEGER(capturelenbuf)[j+bufsize*i];
+		capturelenbuf=tmp;
+		UNPROTECT(4);
+		PROTECT(matchbuf);
+		PROTECT(matchlenbuf);
+		PROTECT(capturebuf);
+		PROTECT(capturelenbuf);
+
 		bufsize = newbufsize;
 	    }
 	    matchIndex++;
@@ -1677,21 +1709,31 @@
 	    else
 		start = ovector[1];
 	    if (use_UTF8) {
-		int mlen = ovector[1] - st;
-		/* Unfortunately these are in bytes */
-		if (st > 0) {
-		    INTEGER(matchbuf)[matchIndex] = 1 + getNc(string, st);
-		    if (INTEGER(matchbuf)[matchIndex] <= 0) { /* an invalid string */
-			INTEGER(matchbuf)[matchIndex] = NA_INTEGER;
-			foundAll = 1; /* if we get here, we are done */
-		    }
+	      int mlen = ovector[1] - st;
+	      /* Unfortunately these are in bytes */
+	      if (st > 0) {
+		INTEGER(matchbuf)[matchIndex] = 1 + getNc(string, st);
+		if (INTEGER(matchbuf)[matchIndex] <= 0) { /* an invalid string */
+		  INTEGER(matchbuf)[matchIndex] = NA_INTEGER;
+		  foundAll = 1; /* if we get here, we are done */
 		}
-		INTEGER(matchlenbuf)[matchIndex] = getNc(string+st, mlen);
-		if (INTEGER(matchlenbuf)[matchIndex] < 0) {/* an invalid string */
-		    INTEGER(matchlenbuf)[matchIndex] = NA_INTEGER;
-		    foundAll = 1;
-		}
+	      }
+	      INTEGER(matchlenbuf)[matchIndex] = getNc(string+st, mlen);
+	      if (INTEGER(matchlenbuf)[matchIndex] < 0) {/* an invalid string */
+		INTEGER(matchlenbuf)[matchIndex] = NA_INTEGER;
+		foundAll = 1;
+	      }
 	    }
+	    /* also extract capture locations */
+	    int ind, *ov;
+	    for(i=0;i<capture_count;i++){
+	      ov = ovector+2*(i+1);
+	      ind=matchIndex+bufsize*i;
+	      st=ov[0]+1;
+	      INTEGER(capturebuf)[ind] = st;
+	      INTEGER(capturelenbuf)[ind] = ov[1]-st+1;
+	    }
+	    /* TODO: if(use_UTF8)... */
 	    if (start >= slen) foundAll = 1;
 	} else {
 	    foundAll = 1;
@@ -1700,15 +1742,28 @@
     }
     PROTECT(ans = allocVector(INTSXP, matchIndex + 1));
     PROTECT(matchlen = allocVector(INTSXP, matchIndex + 1));
+    PROTECT(capture = allocMatrix(INTSXP,matchIndex+1,capture_count));
+    PROTECT(capturelen = allocMatrix(INTSXP,matchIndex+1,capture_count));
+
     if (foundAny) {
 	/* copy from buffers */
 	for (j = 0; j <= matchIndex; j++) {
 	    INTEGER(ans)[j] = INTEGER(matchbuf)[j];
 	    INTEGER(matchlen)[j] = INTEGER(matchlenbuf)[j];
+	    for(i=0;i<capture_count;i++){
+	      int return_index = j+(matchIndex+1)*i;
+	      int buffer_index = j+bufsize*i;
+	      INTEGER(capture)[return_index] =
+		INTEGER(capturebuf)[buffer_index];
+	      INTEGER(capturelen)[return_index] =
+		INTEGER(capturelenbuf)[buffer_index];
+	    }
 	}
     } else INTEGER(ans)[0] = INTEGER(matchlen)[0] = -1;
     setAttrib(ans, install("match.length"), matchlen);
-    UNPROTECT(4);
+    setAttrib(ans, install("capture.start"), capture);
+    setAttrib(ans, install("capture.length"), capturelen);
+    UNPROTECT(8);
     return ans;
 }
 
@@ -1747,6 +1802,10 @@
     const unsigned char *tables = NULL /* -Wall */;
     Rboolean use_UTF8 = FALSE, use_WC = FALSE;
     const void *vmax;
+    int capture_count,capture_num, *ovector, ovector_size, 
+      name_count, name_entry_size, info_code;
+    char *name_table;
+    SEXP capture_names;
 
     checkArity(op, args);
     pat = CAR(args); args = CDR(args);
@@ -1834,6 +1893,26 @@
 	    if (errorptr)
 		warning(_("PCRE pattern study error\n\t'%s'\n"), errorptr);
 	}
+	/* also extract info for named groups */
+	pcre_fullinfo(re_pcre,re_pe,PCRE_INFO_NAMECOUNT,&name_count);
+	pcre_fullinfo(re_pcre,re_pe,PCRE_INFO_NAMEENTRYSIZE,&name_entry_size);
+	pcre_fullinfo(re_pcre,re_pe,PCRE_INFO_NAMETABLE,&name_table);
+	info_code = 
+	  pcre_fullinfo(re_pcre,re_pe,PCRE_INFO_CAPTURECOUNT,&capture_count);
+	if(info_code<0){
+	  error(_("pcre_fullinfo returned '%d' "), info_code);
+	}
+	ovector_size = (capture_count+1)*3;
+	ovector = (int*)malloc(ovector_size*sizeof(int));
+	SEXP thisname;
+	PROTECT(capture_names=allocVector(STRSXP,capture_count));
+	for(i=0;i<name_count;i++){
+	  char *entry = name_table + name_entry_size * i;
+	  PROTECT(thisname = mkChar((char*)(entry+2)));
+	  capture_num = (entry[0]<<8)+entry[1]-1;
+	  SET_STRING_ELT(capture_names,capture_num,thisname);
+	  UNPROTECT(1);
+	}
     } else {
 	int cflags = REG_EXTENDED;
 	if (igcase_opt) cflags |= REG_ICASE;
@@ -1845,10 +1924,16 @@
     }
 
     if (PRIMVAL(op) == 0) { /* regexpr */
-	SEXP matchlen;
+      SEXP matchlen,capture_start,capturelen;
 	PROTECT(ans = allocVector(INTSXP, n));
 	matchlen = allocVector(INTSXP, n); /* protected by next line */
 	setAttrib(ans, install("match.length"), matchlen);
+	if(perl_opt){
+	  capture_start = allocMatrix(INTSXP,n,capture_count);
+	  setAttrib(ans, install("capture.start"), capture_start);
+	  capturelen = allocMatrix(INTSXP,n,capture_count);
+	  setAttrib(ans, install("capture.length"), capturelen);
+	}
 
 	vmax = vmaxget();
 	for (i = 0 ; i < n ; i++) {
@@ -1886,8 +1971,9 @@
 			INTEGER(matchlen)[i] = INTEGER(ans)[i] >= 0 ?
 			    strlen(spat):-1;
 		} else if (perl_opt) {
-		    int rc, ovector[3];
-		    rc = pcre_exec(re_pcre, re_pe, s, strlen(s), 0, 0, ovector, 3);
+		    int rc;
+		    rc = pcre_exec(re_pcre, re_pe, s, strlen(s), 0, 0, 
+				   ovector, ovector_size);
 		    if (rc >= 0) {
 			int st = ovector[0];
 			INTEGER(ans)[i] = st + 1; /* index from one */
@@ -1902,6 +1988,17 @@
 			    nc = getNc(s + st, mlen);
 			    INTEGER(matchlen)[i] = (nc >= 0) ? nc : NA_INTEGER;
 			}
+			/* also readout capture group locations */
+			int ind, *ov;
+			for(capture_num=0;capture_num<capture_count;
+			    capture_num++){
+			  ov = ovector+2*(capture_num+1);
+			  ind=i+n*capture_num;
+			  st=ov[0]+1;
+			  INTEGER(capture_start)[ind] = st;
+			  INTEGER(capturelen)[ind] = ov[1]-st+1;
+			}
+			/* TODO: add if(use_UTF8)... */
 		    } else INTEGER(ans)[i] = INTEGER(matchlen)[i] = -1;
 		} else {
 		    if (!use_WC)
@@ -1940,7 +2037,9 @@
 			if (fixed_opt)
 			    elt = gregexpr_fixed(spat, s, useBytes, use_UTF8);
 			else
-			    elt = gregexpr_perl(spat, s, re_pcre, re_pe, useBytes, use_UTF8);
+			    elt = gregexpr_perl(spat, s, re_pcre, re_pe, 
+						useBytes, use_UTF8, ovector,
+						ovector_size, capture_count);
 		    }
 		} else
 		    elt = gregexpr_Regexc(&reg, STRING_ELT(text, i), useBytes, use_WC);
@@ -1955,6 +2054,9 @@
 	if (re_pe) pcre_free(re_pe);
 	pcre_free(re_pcre);
 	pcre_free((void *)tables);
+	setAttrib(ans, install("capture.names"), capture_names);
+	UNPROTECT(1);
+	free(ovector);
     } else
 	tre_regfree(&reg);

### Toby Dylan Hocking, 25 feb 2011. Some R code to test my
### implementation of new named capture group features in gregexpr
gctorture(FALSE)##for debugging

### Parse result of gregexpr(,string)
result2list <- function(string,result){
  extract.substrings <- function(string,starts,lengths,names){
    subs <- substring(string,starts,starts+lengths-1)
    m <- matrix(subs,ncol=length(names))
    colnames(m) <- names
    m
  }
  N <- attr(result,"capture.names")
  lapply(seq_along(result),function(i){
    extract.substrings(string[i],
                       attr(result[[i]],"capture.start"),
                       attr(result[[i]],"capture.length"),
                       N)
  })
}

### Parse result of regexpr(,string)
parse.one <- function(string,result){
  m <- do.call(rbind,lapply(seq_along(string),function(i){
    st <- attr(result,"capture.start")[i,]
    substring(string[i],st,st+attr(result,"capture.length")[i,]-1)
  }))
  colnames(m) <- attr(result,"capture.names")
  m
}
string <- c("another foobar bazing string",
            "fbar baz foooobar baz",
            "no matches here",
            "my foobar baz st another fooooobar baz dude fobar baz")
notables <- c("  Ben Franklin and Jefferson Davis","\tMillard Fillmore")
name.rex <- "(?<first>[A-Z][a-z]+) (?<last>[A-Z][a-z]+)"
for(i in 1:100)
parsed <- gregexpr(name.rex,notables,perl=TRUE)
parsed[[1]]
result2list(notables,parsed)
(parsed <- regexpr(name.rex,notables,perl=TRUE))
parse.one(notables,parsed)
parse.one(notables,parsed)[,"last"]
result <- gregexpr("f(?<os_in_foo>o*)b(?<as_in_bar>a*)r (baz)",string,perl=TRUE)
print(result)
s2 <- paste(rep("foobar",1030),collapse=" ")
result <- gregexpr("f(?<os_in_foo>o*)b(?<as_in_bar>a*)r",s2,perl=TRUE)

## negative controls
regexpr(name.rex,notables)##perl not TRUE, bad regexp
regexpr("([A-Z][a-z]+) ([A-Z][a-z]+)",notables)##still works like usual

______________________________________________
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

[Rd] Named capture in regexp

Reply via email to