Sorry, I was on my phone and did not see that you were already using but
completely missing the vectorized nature of these functions.
Consider the following:
#############
# after executing your sample code
slowmethod <- function( whalines ) {
lines <- whalines
mc_list <- NULL
for (i in 1:length(lines)){
# Look for start of water content
if(grepl(srchStr1, lines[i])){
mc_list <- c(mc_list, i)
}
}
tmp_list <- NULL
for (i in 1:length(lines)){
# Look for start of temperature data
if(grepl(srchStr2, lines[i])){
tmp_list <- c(tmp_list, i)
}
}
# Store the water content arrays
wc <- list()
# Read all the moisture content profiles
for(i in 1:length(mc_list)){
lineNum <- mc_list[i] + 3
mct <- read.table( text = wha
, skip=lineNum
, nrows=5
, col.names=c('depth','wc')
)
wc[[i]] <- mct
}
# Store the water temperature arrays
tmp <- list()
# Read all the temperature profiles
for(i in 1:length(tmp_list)){
lineNum <- tmp_list[i] + 3
tmpt <- read.table(text = wha, skip=lineNum, nrows=5,
col.names=c('depth','tmp'))
tmp[[i]] <- tmpt
}
list(mc = wc, wt = tmp )
}
library(data.table)
fastmethod <- function( whalines ) {
# identify tabular formatted lines
idx_tbl0 <- grepl( "^\\s*[.\\d+-]+\\s+[.\\d+-]+\\s*$"
, whalines
, perl=TRUE )
# identify groups of lines according to moisture content
mc_block <- grep( srchStr1, whalines, fixed = TRUE )
mc_start <- rep( FALSE, length( whalines ) )
mc_start[ mc_block+4L ] <- TRUE
grp_mc <- cumsum( mc_start )
# identify contiguous block of tabular lines in each block
mc_tab <- 0 == ave( !idx_tbl0, grp_mc, FUN = cumsum )
# extract moisture content data
mc_dta <- fread( text = whalines[ mc_tab ], header=FALSE )
mclist <- split( mc_dta, grp_mc[ mc_tab ] )
# identify groups of lines according to moisture content
wt_block <- grep( srchStr2, whalines, fixed = TRUE )
wt_start <- rep( FALSE, length( whalines ) )
wt_start[ wt_block+4L ] <- TRUE
grp_wt <- cumsum( wt_start )
# identify contiguous block of tabular lines in each block
wt_tab <- 0 == ave( !idx_tbl0, grp_wt, FUN = cumsum )
# extract data frames
wt_dta <- fread( text = whalines[ wt_tab ], header=FALSE )
wtlist <- split( wt_dta, grp_wt[ wt_tab ] )
list(mc = mclist, wt = wtlist )
}
library(microbenchmark)
bigwhalines <- strsplit( paste( rep( wha, 100 )
, collapse = "\n" )
, "\n" )[[ 1 ]]
microbenchmark( slowresult <- slowmethod( bigwhalines )
, fastresult <- fastmethod( bigwhalines )
)
On Wed, 24 Jul 2019, Jeff Newmiller wrote:
?readLines
?grep
?textConnection
On July 24, 2019 11:54:07 AM PDT, "Morway, Eric via R-help"
<r-help@r-project.org> wrote:
The small reproducible example below works, but is way too slow on the
real
problem. The real problem is attempting to extract ~2920 repeated
arrays
from a 60 Mb file and takes ~80 minutes. I'm wondering how I might
re-engineer the script to avoid opening and closing the file 2920 times
as
is the case now. That is, is there a way to keep the file open and
peel
out the arrays and stuff them into a list of data.tables, as is done in
the
small reproducible example below, but in a significantly faster way?
wha <- " INITIAL PRESSURE HEAD
INITIAL TEMPERATURE SET TO 4.000E+00 DEGREES C
VS2DH - MedSand for TL test
TOTAL ELAPSED TIME = 0.000000E+00 sec
TIME STEP 0
MOISTURE CONTENT
Z, IN
m X OR R DISTANCE, IN m
0.500
0.075 0.1475
0.225 0.1475
0.375 0.1475
0.525 0.1475
0.675 0.1475
blah
blah
blah
TEMPERATURE, IN DECREES C
Z, IN
m X OR R DISTANCE, IN m
0.500
0.075 1.1475
0.225 2.1475
0.375 3.1475
0.525 4.1475
0.675 5.1475
blah
blah
blah
TOTAL ELAPSED TIME = 8.6400E+04 sec
TIME STEP 0
MOISTURE CONTENT
Z, IN
m X OR R DISTANCE, IN m
0.500
0.075 0.1875
0.225 0.1775
0.375 0.1575
0.525 0.1675
0.675 0.1475
blah
blah
blah TEMPERATURE, IN DECREES C
Z, IN
m X OR R DISTANCE, IN m
0.500
0.075 1.1475
0.225 2.1475
0.375 3.1475
0.525 4.1475
0.675 5.1475
blah
blah
blah"
example_content <- textConnection(wha)
srchStr1 <- ' MOISTURE CONTENT'
srchStr2 <- 'TEMPERATURE, IN DECREES C'
lines <- readLines(example_content)
mc_list <- NULL
for (i in 1:length(lines)){
# Look for start of water content
if(grepl(srchStr1, lines[i])){
mc_list <- c(mc_list, i)
}
}
tmp_list <- NULL
for (i in 1:length(lines)){
# Look for start of temperature data
if(grepl(srchStr2, lines[i])){
tmp_list <- c(tmp_list, i)
}
}
# Store the water content arrays
wc <- list()
# Read all the moisture content profiles
for(i in 1:length(mc_list)){
lineNum <- mc_list[i] + 3
mct <- read.table(text = wha, skip=lineNum, nrows=5,
col.names=c('depth','wc'))
wc[[i]] <- mct
}
# Store the water temperature arrays
tmp <- list()
# Read all the temperature profiles
for(i in 1:length(tmp_list)){
lineNum <- tmp_list[i] + 3
tmpt <- read.table(text = wha, skip=lineNum, nrows=5,
col.names=c('depth','tmp'))
tmp[[i]] <- tmpt
}
# quick inspection
length(wc)
wc[[1]]
# Looks like what I'm after, but too slow in real world problem
[[alternative HTML version deleted]]
______________________________________________
R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide
http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.
--
Sent from my phone. Please excuse my brevity.
______________________________________________
R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.
---------------------------------------------------------------------------
Jeff Newmiller The ..... ..... Go Live...
DCN:<jdnew...@dcn.davis.ca.us> Basics: ##.#. ##.#. Live Go...
Live: OO#.. Dead: OO#.. Playing
Research Engineer (Solar/Batteries O.O#. #.O#. with
/Software/Embedded Controllers) .OO#. .OO#. rocks...1k
______________________________________________
R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.