William, Try a rolling join in data.table, something like this (untested) :
setkey(Data, UnitID, TranDt) # sort by unit then date previous = transform(Data, TranDt=TranDt-1) Data[previous,roll=TRUE] # lookup the prevailing date before, if any, for each row within that row's UnitID Thats all it is, no loops required. That should be fast and memory efficient. 100's of times faster than a subquery in SQL. If you have trouble please follow up on datatable-help. Matthew "William Rogers" <whroger...@gmail.com> wrote in message news:aanlktikk_avupm7j108iseryo9fucpnjhanxpaqvt...@mail.gmail.com... I have a dataset of property transactions that includes the transaction ID (TranID), property ID (UnitID), and transaction date (TranDt). I need to create a data frame (or data table) that includes the previous transaction date, if one exists. This is an easy problem in SQL, where I just run a sub-query, but I'm trying to make R my one-stop-shopping program. The following code works on a subset of my data, but I can't run this on my full dataset because my computer runs out of memory after about 30 minutes. (Using a 32-bit machine.) Use the following synthetic data for example. n<- 100 TranID<- lapply(n:(2*n), function(x) ( as.matrix(paste(x, sample(seq(as.Date('2000-01-01'), as.Date('2010-01-01'), "days"), sample(1:5, 1)), sep= "D"), ncol= 1))) TranID<- do.call("rbind", TranID) UnitID<- substr(TranID, 1, nchar(n)) TranDt<- substr(TranID, nchar(n)+2, nchar(n)+11) Data<- data.frame(TranID= TranID, UnitID= UnitID, TranDt= as.Date(TranDt)) #First I create a list of all the previous transactions by unit TranList<- as.matrix(Data$TranID, ncol= 1) PreTran<- lapply(TranList, function(x) (with(Data, Data[ UnitID== substr(x, 1, nchar(n))& TranDt< Data[TranID== x, "TranDt"], ] )) ) #I do get warnings about missing data because some transactions have no predecessor. #Some transactions have no previous transactions, others have many so I pick the most recent BeforeTran<- lapply(seq_along(PreTran), function(x) ( with(PreTran[[x]], PreTran[[x]][which(TranDt== max(TranDt)), ]))) #I need to add the current transaction's TranID to the list so I can merge later BeforeTran<- lapply(seq_along(PreTran), function(x) ( transform(BeforeTran[[x]], TranID= TranList[x, 1]))) #Finally, I convert from a list to a data frame BeforeTran<- do.call("rbind", BeforeTran) #I have used a combination of data.table and for loops, but that seems cheesey and doesn't preform much better. library(data.table) #First I create a list of all the previous transactions by unit TranList2<- vector(nrow(Data), mode= "list") names(TranList2)<- levels(Data$TranID) DataDT<- data.table(Data) #Use a for loop and data.table to find the date of the previous transaction for (i in levels(Data$TranID)) { if (DataDT[UnitID== substr(i, 1, nchar(n))& TranDt<= (DataDT[TranID== i, TranDt]), length(TranDt)]> 1) TranList2[[i]]<- cbind(TranID= i, DataDT[UnitID== substr(i, 1, nchar(n))& TranDt< (DataDT[TranID== i, TranDt]), list(TranDt= max(TranDt))]) } #Finally, I convert from a list to a data table BeforeTran2<- do.call("rbind", TranList2) #My intution says that this code doesn't take advantage of data.table's attributes. #Are there any ideas out there? Thank you. #P.S. I've tried plyr and it does not help my memory problem. -- William H. Rogers ______________________________________________ R-help@r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.