Sorry, I was on my phone and did not see that you were already using but completely missing the vectorized nature of these functions.

Consider the following:

#############
# after executing your sample code
slowmethod <- function( whalines ) {
  lines   <- whalines
  mc_list <- NULL
  for (i in 1:length(lines)){
    # Look for start of water content
    if(grepl(srchStr1, lines[i])){
      mc_list <- c(mc_list, i)
    }
  }

  tmp_list <- NULL
  for (i in 1:length(lines)){
    # Look for start of temperature data
    if(grepl(srchStr2, lines[i])){
      tmp_list <- c(tmp_list, i)
    }
  }

  # Store the water content arrays
  wc <- list()
  # Read all the moisture content profiles
  for(i in 1:length(mc_list)){
    lineNum <- mc_list[i] + 3
    mct <- read.table( text = wha
                     , skip=lineNum
                     , nrows=5
                     , col.names=c('depth','wc')
                     )
    wc[[i]] <- mct
  }

  # Store the water temperature arrays
  tmp <- list()
  # Read all the temperature profiles
  for(i in 1:length(tmp_list)){
    lineNum <- tmp_list[i] + 3
    tmpt <- read.table(text = wha, skip=lineNum, nrows=5,
                      col.names=c('depth','tmp'))
    tmp[[i]] <- tmpt
  }
  list(mc = wc, wt = tmp )
}

library(data.table)

fastmethod <- function( whalines ) {
  # identify tabular formatted lines
  idx_tbl0 <- grepl( "^\\s*[.\\d+-]+\\s+[.\\d+-]+\\s*$"
                   , whalines
                   , perl=TRUE )
  # identify groups of lines according to moisture content
  mc_block <- grep( srchStr1, whalines, fixed = TRUE )
  mc_start <- rep( FALSE, length( whalines ) )
  mc_start[ mc_block+4L ] <- TRUE
  grp_mc <- cumsum( mc_start )
  # identify contiguous block of tabular lines in each block
  mc_tab <- 0 == ave( !idx_tbl0, grp_mc, FUN = cumsum )
  # extract moisture content data
  mc_dta <- fread( text = whalines[ mc_tab ], header=FALSE )
  mclist <- split( mc_dta, grp_mc[ mc_tab ] )

  # identify groups of lines according to moisture content
  wt_block <- grep( srchStr2, whalines, fixed = TRUE )
  wt_start <- rep( FALSE, length( whalines ) )
  wt_start[ wt_block+4L ] <- TRUE
  grp_wt <- cumsum( wt_start )
  # identify contiguous block of tabular lines in each block
  wt_tab <- 0 == ave( !idx_tbl0, grp_wt, FUN = cumsum )
  # extract data frames
  wt_dta <- fread( text = whalines[ wt_tab ], header=FALSE )
  wtlist <- split( wt_dta, grp_wt[ wt_tab ] )
  list(mc = mclist, wt = wtlist )
}
library(microbenchmark)
bigwhalines <- strsplit( paste( rep( wha, 100 )
                              , collapse = "\n" )
                       , "\n" )[[ 1 ]]
microbenchmark( slowresult <- slowmethod( bigwhalines )
              , fastresult <- fastmethod( bigwhalines )
              )


On Wed, 24 Jul 2019, Jeff Newmiller wrote:

?readLines
?grep
?textConnection

On July 24, 2019 11:54:07 AM PDT, "Morway, Eric via R-help" 
<r-help@r-project.org> wrote:
The small reproducible example below works, but is way too slow on the
real
problem.  The real problem is attempting to extract ~2920 repeated
arrays
from a 60 Mb file and takes ~80 minutes.  I'm wondering how I might
re-engineer the script to avoid opening and closing the file 2920 times
as
is the case now.  That is, is there a way to keep the file open and
peel
out the arrays and stuff them into a list of data.tables, as is done in
the
small reproducible example below, but in a significantly faster way?

wha <- "     INITIAL PRESSURE HEAD
    INITIAL TEMPERATURE SET TO 4.000E+00 DEGREES C
    VS2DH - MedSand for TL test

    TOTAL ELAPSED TIME =  0.000000E+00 sec
    TIME STEP         0

    MOISTURE CONTENT
 Z, IN
 m                       X OR R DISTANCE, IN m
               0.500
    0.075     0.1475
    0.225     0.1475
    0.375     0.1475
    0.525     0.1475
    0.675     0.1475
blah
blah
blah
    TEMPERATURE, IN DECREES C
 Z, IN
 m                       X OR R DISTANCE, IN m
               0.500
    0.075     1.1475
    0.225     2.1475
    0.375     3.1475
    0.525     4.1475
    0.675     5.1475
blah
blah
blah

    TOTAL ELAPSED TIME =  8.6400E+04 sec
    TIME STEP         0

    MOISTURE CONTENT
 Z, IN
 m                       X OR R DISTANCE, IN m
               0.500
    0.075     0.1875
    0.225     0.1775
    0.375     0.1575
    0.525     0.1675
    0.675     0.1475
blah
blah
blah     TEMPERATURE, IN DECREES C
 Z, IN
 m                       X OR R DISTANCE, IN m
               0.500
    0.075     1.1475
    0.225     2.1475
    0.375     3.1475
    0.525     4.1475
    0.675     5.1475
blah
blah
blah"

example_content <- textConnection(wha)

srchStr1 <- '     MOISTURE CONTENT'
srchStr2 <- 'TEMPERATURE, IN DECREES C'

lines   <- readLines(example_content)
mc_list <- NULL
for (i in 1:length(lines)){
 # Look for start of water content
 if(grepl(srchStr1, lines[i])){
   mc_list <- c(mc_list, i)
 }
}

tmp_list <- NULL
for (i in 1:length(lines)){
 # Look for start of temperature data
 if(grepl(srchStr2, lines[i])){
   tmp_list <- c(tmp_list, i)
 }
}

# Store the water content arrays
wc <- list()
# Read all the moisture content profiles
for(i in 1:length(mc_list)){
 lineNum <- mc_list[i] + 3
 mct <- read.table(text = wha, skip=lineNum, nrows=5,
                   col.names=c('depth','wc'))
 wc[[i]] <- mct
}

# Store the water temperature arrays
tmp <- list()
# Read all the temperature profiles
for(i in 1:length(tmp_list)){
 lineNum <- tmp_list[i] + 3
 tmpt <- read.table(text = wha, skip=lineNum, nrows=5,
                   col.names=c('depth','tmp'))
 tmp[[i]] <- tmpt
}

# quick inspection
length(wc)
wc[[1]]
# Looks like what I'm after, but too slow in real world problem

        [[alternative HTML version deleted]]

______________________________________________
R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide
http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

--
Sent from my phone. Please excuse my brevity.

______________________________________________
R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.


---------------------------------------------------------------------------
Jeff Newmiller                        The     .....       .....  Go Live...
DCN:<jdnew...@dcn.davis.ca.us>        Basics: ##.#.       ##.#.  Live Go...
                                      Live:   OO#.. Dead: OO#..  Playing
Research Engineer (Solar/Batteries            O.O#.       #.O#.  with
/Software/Embedded Controllers)               .OO#.       .OO#.  rocks...1k

______________________________________________
R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Reply via email to