[Rd] invert argument in grep

Romain Francois Thu, 09 Nov 2006 22:57:10 -0800

Hello,

What about an `invert` argument in grep, to return elements that are*not* matching a regular expression :


R> grep("pink", colors(), invert = TRUE, value = TRUE)

would essentially return the same as :

R> colors() [ - grep("pink", colors()) ]

I'm attaching the files that I modified (against today's tarball) forthat purpose.


Cheers,

Romain

--
*mangosolutions*
/data analysis that delivers/

Tel   +44 1249 467 467
Fax   +44 1249 467 468

grep <-
function(pattern, x, ignore.case = FALSE, extended = TRUE, perl = FALSE,
         value = FALSE, fixed = FALSE, useBytes = FALSE, invert = FALSE)
{
    pattern <- as.character(pattern)
    ## when value = TRUE we return names
    if(!is.character(x)) x <- structure(as.character(x), names=names(x))
    ## behaves like == for NA pattern
    if (is.na(pattern)) {
        if(value)
            return(structure(rep.int(as.character(NA), length(x)),
                             names = names(x)))
        else
            return(rep.int(NA, length(x)))
    }

    if(perl)
        .Internal(grep.perl(pattern, x, ignore.case, value, useBytes, invert))
    else
        .Internal(grep(pattern, x, ignore.case, extended, value, fixed,
                       useBytes, invert))
}

sub <-
function(pattern, replacement, x, ignore.case = FALSE, extended = TRUE,
         perl = FALSE, fixed = FALSE, useBytes = FALSE)
{
    pattern <- as.character(pattern)
    replacement <- as.character(replacement)
    if(!is.character(x)) x <- as.character(x)
    if (is.na(pattern))
        return(rep.int(as.character(NA), length(x)))

    if(perl)
        .Internal(sub.perl(pattern, replacement, x, ignore.case, useBytes))
    else
        .Internal(sub(pattern, replacement, x, ignore.case,
                      extended, fixed, useBytes))
}

gsub <-
function(pattern, replacement, x, ignore.case = FALSE, extended = TRUE,
         perl = FALSE, fixed = FALSE, useBytes = FALSE)
{
    pattern <- as.character(pattern)
    replacement <- as.character(replacement)
    if(!is.character(x)) x <- as.character(x)
    if (is.na(pattern))
        return(rep.int(as.character(NA), length(x)))

    if(perl)
        .Internal(gsub.perl(pattern, replacement, x, ignore.case, useBytes))
    else
        .Internal(gsub(pattern, replacement, x, ignore.case,
                       extended, fixed, useBytes))
}

regexpr <-
function(pattern, text, extended = TRUE, perl = FALSE,
         fixed = FALSE, useBytes = FALSE)
{
    pattern <- as.character(pattern)
    text <- as.character(text)
    if(perl)
        .Internal(regexpr.perl(pattern, text, useBytes))
    else
        .Internal(regexpr(pattern, text, extended, fixed, useBytes))
}

gregexpr <-
function(pattern, text, extended = TRUE, perl = FALSE,
         fixed = FALSE, useBytes = FALSE)
{
    pattern <- as.character(pattern)
    text <- as.character(text)
    if(perl)
      .Internal(gregexpr.perl(pattern, text, useBytes))
    else
      .Internal(gregexpr(pattern, text, extended, fixed, useBytes))
}

agrep <-
function(pattern, x, ignore.case = FALSE, value = FALSE,
         max.distance = 0.1)
{
    pattern <- as.character(pattern)
    if(!is.character(x)) x <- as.character(x)
    ## behaves like == for NA pattern
    if (is.na(pattern)){
        if (value)
            return(structure(rep.int(as.character(NA), length(x)),
                             names = names(x)))
        else
            return(rep.int(NA, length(x)))
    }

    if(!is.character(pattern)
       || (length(pattern) < 1)
       || ((n <- nchar(pattern)) == 0))
        stop("'pattern' must be a non-empty character string")

    if(!is.list(max.distance)) {
        if(!is.numeric(max.distance) || (max.distance < 0))
            stop("'max.distance' must be non-negative")
        if(max.distance < 1)            # transform percentages
            max.distance <- ceiling(n * max.distance)
        max.insertions <- max.deletions <- max.substitutions <-
            max.distance
    } else {
        ## partial matching
        table <- c("all", "deletions", "insertions", "substitutions")
        ind <- pmatch(names(max.distance), table)
        if(any(is.na(ind)))
            warning("unknown match distance components ignored")
        max.distance <- max.distance[!is.na(ind)]
        names(max.distance) <- table[ind]
        ## sanity checks
        comps <- unlist(max.distance)
        if(!all(is.numeric(comps)) || any(comps < 0))
            stop("'max.distance' components must be non-negative")
        ## extract restrictions
        if(is.null(max.distance$all))
            max.distance$all <- 0.1
        max.insertions <- max.deletions <- max.substitutions <-
            max.distance$all
        if(!is.null(max.distance$deletions))
            max.deletions <- max.distance$deletions
        if(!is.null(max.distance$insertions))
            max.insertions <- max.distance$insertions
        if(!is.null(max.distance$substitutions))
            max.substitutions <- max.distance$substitutions
        max.distance <- max.distance$all
        ## transform percentages
        if(max.distance < 1)
            max.distance <- ceiling(n * max.distance)
        if(max.deletions < 1)
            max.deletions <- ceiling(n * max.deletions)
        if(max.insertions < 1)
            max.insertions <- ceiling(n * max.insertions)
        if(max.substitutions < 1)
            max.substitutions <- ceiling(n * max.substitutions)
    }

    .Internal(agrep(pattern, x, ignore.case, value, max.distance,
                    max.deletions, max.insertions, max.substitutions))
}

\name{grep}
\title{Pattern Matching and Replacement}
\alias{grep}
\alias{sub}
\alias{gsub}
\alias{regexpr}
\alias{gregexpr}
\description{
  \code{grep} searches for matches to \code{pattern} (its first
  argument) within the character vector \code{x} (second argument).
  \code{regexpr} and \code{gregexpr} do too, but return more detail in
  a different format.

  \code{sub} and \code{gsub} perform replacement of matches determined
  by regular expression matching.
}
\usage{
grep(pattern, x, ignore.case = FALSE, extended = TRUE,
     perl = FALSE, value = FALSE, fixed = FALSE, 
     useBytes = FALSE, invert = FALSE)

sub(pattern, replacement, x,
    ignore.case = FALSE, extended = TRUE, perl = FALSE,
    fixed = FALSE, useBytes = FALSE)

gsub(pattern, replacement, x,
     ignore.case = FALSE, extended = TRUE, perl = FALSE,
     fixed = FALSE, useBytes = FALSE)

regexpr(pattern, text, extended = TRUE, perl = FALSE,
        fixed = FALSE, useBytes = FALSE)

gregexpr(pattern, text, extended = TRUE, perl = FALSE,
         fixed = FALSE, useBytes = FALSE)
}
\arguments{
  \item{pattern}{character string containing a \link{regular expression}
    (or character string for \code{fixed = TRUE}) to be matched
    in the given character vector.  Coerced by
    \code{\link{as.character}} to a character string if possible.}
  \item{x, text}{a character vector where matches are sought, or an
    object which can be coerced by \code{as.character} to a character vector.}
  \item{ignore.case}{if \code{FALSE}, the pattern matching is \emph{case
      sensitive} and if \code{TRUE}, case is ignored during matching.}
  \item{extended}{if \code{TRUE}, extended regular expression matching
    is used, and if \code{FALSE} basic regular expressions are used.}
  \item{perl}{logical. Should perl-compatible regexps be used?
    Has priority over \code{extended}.}
  \item{value}{if \code{FALSE}, a vector containing the (\code{integer})
    indices of the matches determined by \code{grep} is returned, and if
    \code{TRUE}, a vector containing the matching elements themselves is
    returned.}
  \item{fixed}{logical.  If \code{TRUE}, \code{pattern} is a string to be
    matched as is.  Overrides all conflicting arguments.}
  \item{useBytes}{logical.  If \code{TRUE} the matching is done
    byte-by-byte rather than character-by-character.  See Details.}
  \item{replacement}{a replacement for matched pattern in \code{sub} and
    \code{gsub}.  Coerced to character if possible.  For \code{fixed =
      FALSE} this can include backreferences \code{"\\1"} to
    \code{"\\9"} to parenthesized subexpressions of \code{pattern}.  For
    \code{perl = TRUE} only, it can also contain \code{"\\U"} or
    \code{"\\L"} to convert the rest of the replacement to upper or
    lower case.
  }
  \item{invert}{logical. If \code{TRUE}, then the match is reversed 
  in order to get the items (or the values) that are \emph{not} matching the 
  pattern}
}
\details{
  Arguments which should be character strings or character vectors are
  coerced to character if possible.

  The two \code{*sub} functions differ only in that \code{sub} replaces
  only the first occurrence of a \code{pattern} whereas \code{gsub}
  replaces all occurrences.

  For \code{regexpr} it is an error for \code{pattern} to be \code{NA},
  otherwise \code{NA} is permitted and matches only itself.

  The regular expressions used are those specified by POSIX 1003.2,
  either extended or basic, depending on the value of the
  \code{extended} argument, unless \code{perl = TRUE} when they are
  those of PCRE, \url{http://www.pcre.org/}.
  (The exact set of patterns supported may depend on the version of
  PCRE installed on the system in use, if \R was configured to use the
  system PCRE.  \R's internal copy used PCRE 6.7.)

  \code{useBytes} is only used if \code{fixed = TRUE} or \code{perl = TRUE}.
  For \code{grep} its main effect is to avoid errors/warnings about
  invalid inputs, but for \code{regexpr} it changes the interpretation
  of the output.
}
\value{
  For \code{grep} a vector giving either the indices of the elements of
  \code{x} that yielded a match or, if \code{value} is \code{TRUE}, the
  matched elements of \code{x} (after coercion, preserving names but no
  other attributes).

  For \code{sub} and \code{gsub} a character vector of the same length
  and with the same attributes as \code{x} (after possible coercion).

  For \code{regexpr} an integer vector of the same length as \code{text}
  giving the starting position of the first match, or \eqn{-1} if there
  is none, with attribute \code{"match.length"} giving the length of the
  matched text (or \eqn{-1} for no match).  In a multi-byte locale these
  quantities are in characters rather than bytes unless
  \code{useBytes = TRUE} is used with \code{fixed = TRUE} or
  \code{perl = TRUE}.

  For \code{gregexpr} a list of the same length as \code{text} each
  element of which is an integer vector as in \code{regexpr}, except
  that the starting positions of every match are given.

  If in a multi-byte locale the pattern or replacement is not a valid
  sequence of bytes, an error is thrown.  An invalid string in \code{x}
  or \code{text} is a non-match with a warning for \code{grep} or
  \code{regexpr}, but an error for \code{sub} or \code{gsub}.
}
\section{Warning}{
  The standard regular-expression code has been reported to be very slow
  when applied to extremely long character strings
  (tens of thousands of characters or more): the code used when
  \code{perl = TRUE} seems much faster and more reliable for such
  usages.

  The standard version of \code{gsub} does not substitute correctly
  repeated word-boundaries (e.g. \code{pattern = "\\b"}).
  Use \code{perl = TRUE} for such matches.

  The \code{perl = TRUE} option is only implemented for single-byte and
  UTF-8 encodings, and will warn if used in a non-UTF-8 multi-byte
  locale (unless \code{useBytes = TRUE}).
}
\references{
  Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
  \emph{The New S Language}.
  Wadsworth \& Brooks/Cole (\code{grep})
}
\seealso{
  \link{regular expression} (aka \code{\link{regexp}}) for the details
% the `aka' above is for ESS (and ?reg....) where a space is problematic
  of the pattern specification.

  \code{\link{glob2rx}} to turn wildcard matches into regular expressions.
  
  \code{\link{agrep}} for approximate matching.

  \code{\link{tolower}}, \code{\link{toupper}} and \code{\link{chartr}}
  for character translations.
  \code{\link{charmatch}}, \code{\link{pmatch}}, \code{\link{match}}.
  \code{\link{apropos}} uses regexps and has nice examples.
}
\examples{
grep("[a-z]", letters)

txt <- c("arm","foot","lefroo", "bafoobar")
if(any(i <- grep("foo",txt)))
   cat("'foo' appears at least once in\n\t",txt,"\n")
i # 2 and 4
txt[i]

## Double all 'a' or 'b's;  "\\" must be escaped, i.e., 'doubled'
%% and escaped even once more in this *.Rd file!
gsub("([ab])", "\\\\1_\\\\1_", "abc and ABC")

txt <- c("The", "licenses", "for", "most", "software", "are",
  "designed", "to", "take", "away", "your", "freedom",
  "to", "share", "and", "change", "it.",
   "", "By", "contrast,", "the", "GNU", "General", "Public", "License",
   "is", "intended", "to", "guarantee", "your", "freedom", "to",
   "share", "and", "change", "free", "software", "--",
   "to", "make", "sure", "the", "software", "is",
   "free", "for", "all", "its", "users")
( i <- grep("[gu]", txt) ) # indices
stopifnot( txt[i] == grep("[gu]", txt, value = TRUE) )

## Note that in locales such as en_US this includes B as the
## collation order is aAbBcCdEe ...
(ot <- sub("[b-e]",".", txt))
txt[ot != gsub("[b-e]",".", txt)]#- gsub does "global" substitution

txt[gsub("g","#", txt) !=
    gsub("g","#", txt, ignore.case = TRUE)] # the "G" words

regexpr("en", txt)

gregexpr("e", txt)

## trim trailing white space
str = 'Now is the time      '
sub(' +$', '', str)  ## spaces only
sub('[[:space:]]+$', '', str) ## white space, POSIX-style
sub('\\\\s+$', '', str, perl = TRUE) ## Perl-style white space

## capitalizing
gsub("(\\\\w)(\\\\w*)", "\\\\U\\\\1\\\\L\\\\2", "a test of capitalizing", 
perl=TRUE)
gsub("\\\\b(\\\\w)", "\\\\U\\\\1", "a test of capitalizing", perl=TRUE)
}
\keyword{character}
\keyword{utilities}

______________________________________________
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

[Rd] invert argument in grep

Reply via email to