As Don suggests, looking for ways to do the whole calculation at once is a big efficiency booster. Also, avoiding unnecessary calculations (e.g. mean of 1:n is (n+1)/2 and mean(x+a) where a is a constant is mean(x)+a.

Reproducible example:

####################
#library(tictoc)
library(microbenchmark)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#>     filter, lag
#> The following objects are masked from 'package:base':
#>
#>     intersect, setdiff, setequal, union
library(purrr)

func1 <- function( coord, A, B, C ) {

  X1 <- as.vector( coord[ 1 ] )
  Y1 <- as.vector( coord[ 2 ] )
  X2 <- as.vector( coord[ 3 ] )
  Y2 <- as.vector( coord[ 4 ] )

  if( C == 0 ) {
    res1 <- mean( c( ( X1 - A ) : ( X1 - 1 )
                   , ( Y1 + 1 ) : ( Y1 + 40 )
                   )
                )
    res2 <- mean( c( ( X2 - A ) : ( X2 - 1 )
                   , ( Y2 + 1 ) : ( Y2 + 40 )
                   )
                )
    res <- matrix( c( res1, res2 )
                 , ncol=2
                 , nrow=1
                 )

  } else {

    res1 <- mean( c( ( X1 - A ) : ( X1 - 1 )
                   , ( Y1 + 1 ) : ( Y1 + 40 )
                   )
                )*B
    res2 <- mean( c( ( X2 - A ) : ( X2 - 1 )
                   , ( Y2 + 1 ) : ( Y2 + 40 )
                   )
                )*B
    res <- matrix( c( res1, res2 )
                 , ncol=2
                 , nrow=1
                 )

  }

  res
}

#' @param coord is a one-row data frame
func2 <- function( coord, A, B, C ) {
  X1 <- coord[[ 1 ]]
  Y1 <- coord[[ 2 ]]
  X2 <- coord[[ 3 ]]
  Y2 <- coord[[ 4 ]]

  res <- matrix( c( mean( c( X1, Y1 ) )
                  , mean( c( X2, Y2 ) )
                  )
               , ncol=2
               , nrow=1
               ) + ( 40 - A ) / 2

  if ( C != 0 ) {
    res <- res * B
  }

  setNames( as.data.frame( res ), c( "V1", "V2" ) )
}

#' @param coord is a numeric vector of length 4
#' @return Numeric vector of length 2
func3 <- function( coord, A, B, C ) {
  res <- ( c( ( coord[ 1 ] + coord[ 2 ] )
            , ( coord[ 3 ] + coord[ 4 ] )
            )
         + ( 40 - A )
         ) / 2

  if ( C != 0 ) {
    res <- res * B
  }

  res
}

#' @param coord is a matrix with four columns
func4 <- function( coord, A, B, C ) {
  res <- ( cbind( ( coord[ , 1 ] + coord[ , 2 ] )
                , ( coord[ , 3 ] + coord[ , 4 ] )
                )
         + ( 40 - A )
         ) / 2

  if ( length( C ) == nrow( coord ) || length( C ) == 1 ) {
    idx <- C == 1
    res[ idx, ] <- res[ idx, ] * B
  }

  res
}

## Apply the function
set.seed( 1 )
n <- 1000
N <- 100
Nseq <- seq.int( N )
# Using T instead of TRUE is asking to get an unexpected result someday
tabDF <- data.frame( x1 = sample( Nseq, n, replace = TRUE )
                   , y1 = sample( Nseq, n, replace = TRUE )
                   , x2 = sample( Nseq, n, replace = TRUE )
                   , y2 = sample( Nseq, n, replace = TRUE )
                   )
tab <- as.matrix( tabDF )

fTest1 <- function() {
  test <- tab %>%
    split( 1:nrow(tab) ) %>%
    map(~ func1(.x, 40, 5, 1) ) %>%
    do.call( "rbind", . )
}

fTest2 <- function() {
  # conventional dplyr approach
  test <- tabDF %>%
    rowwise %>%
    do({
      func2( ., 40, 5, 1 )
    }) %>%
    ungroup
}

fTest3 <- function() {
  t( apply( tab, 1, func3, A=40, B=5, C=1 ) )
}

fTest4 <- function() {
  func4( tabDF, A=40, B=5, C=1 )
}

microbenchmark( result1 <- fTest1()
              , result2 <- fTest2()
              , result3 <- fTest3()
              , result4 <- fTest4()
              )
#> Unit: microseconds
#>                 expr        min         lq        mean      median
#>  result1 <- fTest1()  20305.562  23384.359  26939.6559  26262.8495
#>  result2 <- fTest2() 255441.229 276794.201 290628.3221 286046.6385
#>  result3 <- fTest3()   4869.288   5772.462   7242.2194   6615.7900
#>  result4 <- fTest4()     52.862     94.962    216.3508    105.7235
#>           uq        max neval
#>   29324.2775  46207.632   100
#>  294248.0795 473898.379   100
#>    7874.6455  21288.783   100
#>     127.0565   9253.006   100

stopifnot( result1[ , 1 ] == result2[[ 1 ]] )
stopifnot( result1[ , 2 ] == result2[[ 2 ]] )
stopifnot( result1 == result3 )
stopifnot( result1 == result4 )
####################

On Thu, 1 Nov 2018, MacQueen, Don via R-help wrote:

Without more study, I can only give some general pointers.

The as.vector() in X1 <- as.vector(coord[1]) is almost certainly not needed. It 
will add a little bit to your execution time.
Converting the output of func() to a one row matrix is almost certainly not 
needed. Just return c(res1, res2).

Your data frame appears to be entirely numeric, in which case you don't need to 
ever use a data frame.

Try
 apply( tab, 1, func, a=40, b=5, c=1 )
instead of all that dplyr stuff.


Your function can be redefined as

func <- function(coord, a, b, c){

         X1 <- as.vector(coord[1])
         Y1 <- as.vector(coord[2])
         X2 <- as.vector(coord[3])
         Y2 <- as.vector(coord[4])

          res1 <- mean(c((X1 - a) : (X1 - 1), (Y1 + 1) : (Y1 + 40)))
          res2 <- mean(c((X2 - a) : (X2 - 1), (Y2 + 1) : (Y2 + 40)))

           if (c==0) c(res1, res2) else c(res1, res2)*b
       }

I suspect you can operate on the entire matrix, without looping (which both the 
apply() method, and the split/rbind method do, in effect), and if so it will be 
much faster. But I can't say for sure without more study.

--
Don MacQueen
Lawrence Livermore National Laboratory
7000 East Ave., L-627
Livermore, CA 94550
925-423-1062
Lab cell 925-724-7509



On 11/1/18, 12:35 PM, "R-help on behalf of Nelly Reduan" 
<[email protected] on behalf of [email protected]> wrote:

   Hello,

   I have a input data frame with multiple rows. For each row, I want to apply a 
function. The input data frame has 1,000,000+ rows. How can I speed up my code ? I would 
like to keep the function "func".

   Here is a reproducible example with a simple function:

       library(tictoc)
       library(dplyr)

   func <- function(coord, a, b, c){

         X1 <- as.vector(coord[1])
         Y1 <- as.vector(coord[2])
         X2 <- as.vector(coord[3])
         Y2 <- as.vector(coord[4])

         if(c == 0) {

           res1 <- mean(c((X1 - a) : (X1 - 1), (Y1 + 1) : (Y1 + 40)))
           res2 <- mean(c((X2 - a) : (X2 - 1), (Y2 + 1) : (Y2 + 40)))
           res <- matrix(c(res1, res2), ncol=2, nrow=1)

         } else {

           res1 <- mean(c((X1 - a) : (X1 - 1), (Y1 + 1) : (Y1 + 40)))*b
           res2 <- mean(c((X2 - a) : (X2 - 1), (Y2 + 1) : (Y2 + 40)))*b
           res <- matrix(c(res1, res2), ncol=2, nrow=1)

         }

         return(res)
       }

       ## Apply the function
       set.seed(1)
       n = 10000000
       tab <- as.matrix(data.frame(x1 = sample(1:100, n, replace = T), y1 = 
sample(1:100, n, replace = T), x2 = sample(1:100, n, replace = T), y2 = 
sample(1:100, n, replace = T)))


     tic("test 1")
     test <- tab %>%
       split(1:nrow(tab)) %>%
       map(~ func(.x, 40, 5, 1)) %>%
       do.call("rbind", .)
     toc()

   test 1: 599.2 sec elapsed

   Thanks very much for your time
   Have a nice day
   Nell

        [[alternative HTML version deleted]]

   ______________________________________________
   [email protected] mailing list -- To UNSUBSCRIBE and more, see
   https://stat.ethz.ch/mailman/listinfo/r-help
   PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
   and provide commented, minimal, self-contained, reproducible code.


______________________________________________
[email protected] mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

---------------------------------------------------------------------------
Jeff Newmiller                        The     .....       .....  Go Live...
DCN:<[email protected]>        Basics: ##.#.       ##.#.  Live Go...
                                      Live:   OO#.. Dead: OO#..  Playing
Research Engineer (Solar/Batteries            O.O#.       #.O#.  with
/Software/Embedded Controllers)               .OO#.       .OO#.  rocks...1k
---------------------------------------------------------------------------
______________________________________________
[email protected] mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Reply via email to