On 24-02-2012, at 20:02, Petr Savicky wrote: > On Fri, Feb 24, 2012 at 08:59:44AM -0800, robertfeldt wrote: >> Hi, >> >> I have R code like so: >> >> num.columns.back.since.last.occurence <- function(m, outcome) { >> nrows <- dim(m)[1]; >> ncols <- dim(m)[2]; >> res <- matrix(rep.int(0, nrows*ncols), nrow=nrows); >> for(row in 1:nrows) { >> for(col in 2:ncols) { >> res[row,col] <- if(m[row,col-1]==outcome) {0} else >> {1+res[row,col-1]} >> } >> } >> res; >> } >> >> but on the very large matrices I apply this the execution times are a >> problem. I would appreciate any help to rewrite this with more >> "standard"/native R functions to speed things up. > > Hi. > > If the number of rows is large and the number of columns is not, > then try the following. > > # random matrix > A <- matrix((runif(49) < 0.2) + 0, nrow=7) > outcome <- 1 > > # transformation > B <- array(0, dim=dim(A)) > curr <- B[, 1] > for (i in seq.int(from=2, length=ncol(A)-1)) { > curr <- ifelse (A[, i-1] == outcome, 0, 1 + curr) > B[, i] <- curr > }
I've done some speed tests. t1 <- function(m, outcome) { nrows <- dim(m)[1] ncols <- dim(m)[2] res <- matrix(rep.int(0, nrows*ncols), nrow=nrows) for(row in 1:nrows) { for(col in 2:ncols) { res[row,col] <- if(m[row,col-1]==outcome) {0} else {1+res[row,col-1]} } } res } # by row t2 <- function(A, outcome) { oneRow <- function(x, outcome) { n <- length(x) y <- c(0, cumsum(x[-n] == outcome)) ave(x, y, FUN = function(z) seq.int(along=z) - 1) } t(apply(A, 1, oneRow, outcome=1)) } # transformation t3 <- function(A, outcome) { B <- array(0, dim=dim(A)) curr <- B[, 1] for (i in seq.int(from=2, length=ncol(A)-1)) { curr <- ifelse (A[, i-1] == outcome, 0, 1 + curr) B[, i] <- curr } B } # Compile functions to byte code library(compiler) t1.c <- cmpfun(t1) t2.c <- cmpfun(t2) t3.c <- cmpfun(t3) Nrow <- 100 Ncol <- 1000 A <- matrix((runif(Ncol*Nrow)<0.2)+0, nrow=Nrow) z1 <- system.time(res1 <- t1(A,outcome=1)) z2 <- system.time(res2 <- t2(A,outcome=1)) z3 <- system.time(res3 <- t3(A, outcome=1)) z4 <- system.time(res4 <- t1.c(A,outcome=1)) z5 <- system.time(res5 <- t2.c(A,outcome=1)) z6 <- system.time(res6 <- t3.c(A, outcome=1)) print(all(res2==res1)) print(all(res3==res1)) print(all(res4==res1)) print(all(res5==res1)) print(all(res6==res1)) print(rbind(z1,z2,z3,z4,z5,z6)) Nrow <- 1000 Ncol <- 100 A <- matrix((runif(Ncol*Nrow)<0.2)+0, nrow=Nrow) z1 <- system.time(res1 <- t1(A,outcome=1)) z2 <- system.time(res2 <- t2(A,outcome=1)) z3 <- system.time(res3 <- t3(A, outcome=1)) z4 <- system.time(res4 <- t1.c(A,outcome=1)) z5 <- system.time(res5 <- t2.c(A,outcome=1)) z6 <- system.time(res6 <- t3.c(A, outcome=1)) print(all(res2==res1)) print(all(res3==res1)) print(all(res4==res1)) print(all(res5==res1)) print(all(res6==res1)) print(rbind(z1,z2,z3,z4,z5,z6)) Nrow <- 1000 Ncol <- 1000 A <- matrix((runif(Ncol*Nrow)<0.2)+0, nrow=Nrow) z1 <- system.time(res1 <- t1(A,outcome=1)) z2 <- system.time(res2 <- t2(A,outcome=1)) z3 <- system.time(res3 <- t3(A, outcome=1)) z4 <- system.time(res4 <- t1.c(A,outcome=1)) z5 <- system.time(res5 <- t2.c(A,outcome=1)) z6 <- system.time(res6 <- t3.c(A, outcome=1)) print(all(res2==res1)) print(all(res3==res1)) print(all(res4==res1)) print(all(res5==res1)) print(all(res6==res1)) print(rbind(z1,z2,z3,z4,z5,z6)) ------------------------------------------ Summary of results > Nrow <- 100 > Ncol <- 1000 > A <- matrix((runif(Ncol*Nrow)<0.2)+0, nrow=Nrow) > print(rbind(z1,z2,z3,z4,z5,z6)) user.self sys.self elapsed user.child sys.child z1 0.543 0.005 0.551 0 0 z2 0.299 0.004 0.304 0 0 z3 0.070 0.012 0.082 0 0 z4 0.112 0.002 0.114 0 0 z5 0.263 0.007 0.271 0 0 z6 0.062 0.009 0.070 0 0 > Nrow <- 1000 > Ncol <- 100 > A <- matrix((runif(Ncol*Nrow)<0.2)+0, nrow=Nrow) > > z1 <- system.time(res1 <- t1(A,outcome=1)) > z2 <- system.time(res2 <- t2(A,outcome=1)) > z3 <- system.time(res3 <- t3(A, outcome=1)) > z4 <- system.time(res4 <- t1.c(A,outcome=1)) > z5 <- system.time(res5 <- t2.c(A,outcome=1)) > z6 <- system.time(res6 <- t3.c(A, outcome=1)) user.self sys.self elapsed user.child sys.child z1 0.543 0.005 0.551 0 0 z2 0.299 0.004 0.304 0 0 z3 0.070 0.012 0.082 0 0 z4 0.112 0.002 0.114 0 0 z5 0.263 0.007 0.271 0 0 z6 0.062 0.009 0.070 0 0 > Nrow <- 1000 > Ncol <- 1000 > A <- matrix((runif(Ncol*Nrow)<0.2)+0, nrow=Nrow) > print(rbind(z1,z2,z3,z4,z5,z6)) user.self sys.self elapsed user.child sys.child z1 5.381 0.019 5.417 0 0 z2 2.812 0.044 2.858 0 0 z3 0.307 0.049 0.357 0 0 z4 1.176 0.015 1.191 0 0 z5 2.686 0.038 2.728 0 0 z6 0.305 0.049 0.355 0 0 <End of timing results> The original function (t1) benefits a lot from compiling to byte code. The function that operates on columns (t3) seems to be always the quickest in this experiment. Berend ______________________________________________ R-help@r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.