Re: [R] index instead of loop?

Ben quant Tue, 06 Mar 2012 07:46:55 -0800

Unfortunately, your solution is does not scale well.  (Tough for you to
test this without my real data.)  If ua is my data and rd1 are my report
dates (same as the code below) and I use more columns, it appears that your
solution slows considerably. Remember I have ~11k columns in my real data,
so scalability is critical.


Here are the processing times using real data:

Use 4 columns:
ua = ua[,1:4]
rd1 = rd1[,1:4]
mine: 2.4 sec's
yours: 1.39 sec's   Note: yours is faster with 4 columns (like the mockup
data I provided.)

Use 150 columns:
ua = ua[,1:150]
rd1 = rd1[,1:150]
mine: 5 sec's
yours: 9 sec's

Use 300 columns:
ua = ua[,1:300]
rd1 = rd1[,1:300]
mine: 9.5 sec's
yours: 1 min


##################### data
Here is the mockup date and code used: (Anyone looking to test the
scalability may want to add more columns.)

Mockup date:
z.dates =
c("2007-03-31","2007-06-30","2007-09-30","2007-12-31","2008-03-31","2008-06-30","2008-09-30","2008-12-31")

nms = c("A","B","C","D")
# these are the report dates that are the real days the data was available
rd1 = matrix(c("20070514","20070814","20071115",   "20080213",
"20080514",  "20080814",  "20081114",  "20090217",
               "20070410","20070709","20071009",   "20080109",
"20080407",  "20080708",  "20081007",  "20090112",
               "20070426","--","--","--","--","--","--","20090319",
               "--","--","--","--","--","--","--","--"),
             nrow=8,ncol=4)
dimnames(rd1) = list(z.dates,nms)


######################## My code:

start_t_all = Sys.time()
nms = colnames(ua)

fix = function(x)
{
  year = substring(x, 1, 4);
  mo = substring(x, 5, 6);
  day = substring(x, 7, 8);
  ifelse(year=="--", "NA", paste(year, mo, day, sep = "-"))
}

rd = apply(rd1, 2, fix)
dimnames(rd) = dimnames(rd)

dt1 <- seq(from =as.Date(z.dates[1]), to =
as.Date(z.dates[length(z.dates)]), by =
  "day")
dt = sapply(dt1, as.character)

fin = dt
ck_rows = length(dt)
bad = character(0)

for(cn in 1:ncol(ua)){
  uac = ua[,cn]
  tkr = colnames(ua)[cn]
  rdc = rd[,cn]
  ua_rd = cbind(uac,rdc)
  colnames(ua_rd) = c(tkr,'rt_date')
  xx1 = merge(dt,ua_rd,by.x=1,by.y= 'rt_date',all.x=T)
  xx = as.character(xx1[,2])
  values <- c(NA, xx[!is.na(xx)])
  ind = cumsum(!is.na(xx)) + 1
  y <- values[ind]
  if(ck_rows == length(y)){
    fin  = data.frame(fin,y)
  }else{
    bad = c(bad,tkr)
  }
}
if(length(bad)){
  nms = nms[bad != nms]
}
colnames(fin) = c('daily_dates',nms)

print("over all time for loop")
print(Sys.time()-start_t_all)


 ################################### Your code:


z.dates = rownames(ua)

start_t_all = Sys.time()
fdate <- function(x, format="%Y%m%d"){
  DF <- data.frame(x)
  for(i in colnames(DF)){
    DF[, i] <- as.Date(DF[, i], format=format)
    class(DF[, i]) <- "Date"
  }
  DF
}

rd1 <- fdate(rd1)
# This is yours, use it.
dt1 <- seq(from =as.Date(z.dates[1]), to =
as.Date(z.dates[length(z.dates)]), by ="day")
# Set up the result, no time expensive 'cbind' inside a loop
fin1 <- data.frame(matrix(NA, nrow=length(dt1), ncol=ncol(ua) + 1))
fin1[, 1] <- dt1
nr <- nrow(rd1)

# And vectorize
for(tkr in 1:ncol(ua)){
  x  <- c(rd1[, tkr], as.Date("9999-12-31"))
  # inxlist <- lapply(1:nr, function(i) which(x[i] <= dt1 & dt1 < x[i + 1]))
  ix <- lapply(1:nr, function(i) which(x[i] <= dt1 & dt1 < x[i + 1]))
  sapply(1:length(ix), function(i) if(length(ix[[i]])) fin1[ix[[i]], tkr +
1] <<- ua[i, tkr])
}
colnames(fin1) <- c("daily_dates", colnames(ua))
print(Sys.time()-start_t_all)


Thanks for your efforts though,

ben

On Tue, Mar 6, 2012 at 7:39 AM, Ben quant <ccqu...@gmail.com> wrote:

> I think this is what you meant:
>
>
> z.dates =
> c("2007-03-31","2007-06-30","2007-09-30","2007-12-31","2008-03-31","2008-06-30","2008-09-30","2008-12-31")
>
> nms = c("A","B","C","D")
> # these are the report dates that are the real days the data was available
> rd1 = matrix(c("20070514","20070814","20071115",   "20080213",
> "20080514",  "20080814",  "20081114",  "20090217",
>                "20070410","20070709","20071009",   "20080109",
> "20080407",  "20080708",  "20081007",  "20090112",
>                "20070426","--","--","--","--","--","--","20090319",
>                "--","--","--","--","--","--","--","--"),
>              nrow=8,ncol=4)
> dimnames(rd1) = list(z.dates,nms)
>
> # this is the unadjusted raw data, that always has the same dimensions,
> rownames, and colnames as the report dates
> ua = matrix(c(640.35,636.16,655.91,657.41,682.06,702.90,736.15,667.65,
>
> 2625.050,2625.050,2645.000,2302.000,1972.000,1805.000,1547.000,1025.000,
>               NaN, NaN,-98.426,190.304,180.894,183.220,172.520, 144.138,
>               NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN),
>             nrow=8,ncol=4)
> dimnames(ua) = list(z.dates,nms)
>
> ##############################
>
>
> fdate <- function(x, format="%Y%m%d"){
>   DF <- data.frame(x)
>   for(i in colnames(DF)){
>     DF[, i] <- as.Date(DF[, i], format=format)
>     class(DF[, i]) <- "Date"
>   }
>   DF
> }
>
> rd1 <- fdate(rd1)
> # This is yours, use it.
> dt1 <- seq(from =as.Date(z.dates[1]), to = as.Date("2009-03-25"), by =
>   "day")
> # Set up the result, no time expensive 'cbind' inside a loop
> fin1 <- data.frame(matrix(NA, nrow=length(dt1), ncol=ncol(ua) + 1))
> fin1[, 1] <- dt1
> nr <- nrow(rd1)
>
> # And vectorize
> for(tkr in 1:ncol(ua)){
>   x  <- c(rd1[, tkr], as.Date("9999-12-31"))
>  # inxlist <- lapply(1:nr, function(i) which(x[i] <= dt1 & dt1 < x[i + 1]))
>   ix <- lapply(1:nr, function(i) which(x[i] <= dt1 & dt1 < x[i + 1]))
>   sapply(1:length(ix), function(i) if(length(ix[[i]])) fin1[ix[[i]], tkr +
> 1] <<- ua[i, tkr])
>
> }
> colnames(fin1) <- c("daily_dates", colnames(ua))
>
> # Check results
> str(fin1)
> head(fin1)
> tail(fin1)
>
>
> On Tue, Mar 6, 2012 at 7:34 AM, Ben quant <ccqu...@gmail.com> wrote:
>
>> Just looking at this, but it looks like ix doesn't exist:
>>
>>        sapply(1:length(inxlist), function(i) if(length(ix[[i]]))
>> fin1[ix[[i]], tkr
>> + 1] <<- ua[i, tkr])
>>
>>  Trying to sort it out now.
>>
>> Ben
>>
>>
>> On Mon, Mar 5, 2012 at 7:48 PM, Rui Barradas <rui1...@sapo.pt> wrote:
>>
>>> Hello,
>>>
>>> >
>>> > Mar 05, 2012; 8:53pm  by Ben quant Ben quant
>>> > Hello,
>>> >
>>> > Does anyone know of a way I can speed this up?
>>> >
>>>
>>> Maybe, let's see.
>>>
>>> >
>>> > ################################# change anything below.
>>> >
>>>
>>> # Yes.
>>> # First, start by using dates, not characters
>>>
>>> fdate <- function(x, format="%Y%m%d"){
>>>        DF <- data.frame(x)
>>>        for(i in colnames(DF)){
>>>                DF[, i] <- as.Date(DF[, i], format=format)
>>>                class(DF[, i]) <- "Date"
>>>        }
>>>        DF
>>> }
>>>
>>> rd1 <- fdate(rd1)
>>> # This is yours, use it.
>>> dt1 <- seq(from =as.Date(z.dates[1]), to = as.Date("2009-03-25"), by =
>>> "day")
>>> # Set up the result, no time expensive 'cbind' inside a loop
>>> fin1 <- data.frame(matrix(NA, nrow=length(dt1), ncol=ncol(ua) + 1))
>>> fin1[, 1] <- dt1
>>> nr <- nrow(rd1)
>>>
>>> # And vectorize
>>> for(tkr in 1:ncol(ua)){
>>>        x  <- c(rd1[, tkr], as.Date("9999-12-31"))
>>>        inxlist <- lapply(1:nr, function(i) which(x[i] <= dt1 & dt1 < x[i
>>> + 1]))
>>>        sapply(1:length(inxlist), function(i) if(length(ix[[i]]))
>>> fin1[ix[[i]], tkr
>>> + 1] <<- ua[i, tkr])
>>> }
>>> colnames(fin1) <- c("daily_dates", colnames(ua))
>>>
>>> # Check results
>>> str(fin)
>>> str(fin1)
>>> head(fin)
>>> head(fin1)
>>> tail(fin)
>>> tail(fin1)
>>>
>>>
>>> Note that 'fin' has facotrs, 'fin1' numerics.
>>> I haven't timed it but I believe it should be faster.
>>>
>>> Hope this helps,
>>>
>>> Rui Barradas
>>>
>>>
>>>
>>>
>>>
>>> --
>>> View this message in context:
>>> http://r.789695.n4.nabble.com/index-instead-of-loop-tp4447672p4448567.html
>>> Sent from the R help mailing list archive at Nabble.com.
>>>
>>> ______________________________________________
>>> R-help@r-project.org mailing list
>>> https://stat.ethz.ch/mailman/listinfo/r-help
>>> PLEASE do read the posting guide
>>> http://www.R-project.org/posting-guide.html
>>> and provide commented, minimal, self-contained, reproducible code.
>>>
>>
>>
>

        [[alternative HTML version deleted]]

______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Re: [R] index instead of loop?

Reply via email to