Hi, Try: library(stringr) ##### Created the selected files (98) in a separate working folder (SubsetFiles1) (refer to my previous mail) filelst <- list.files() #Sublst <- filelst[1:2] res <- lapply(filelst,function(x) {con <- file(x) Lines1 <- readLines(con) close(con) Lines2 <- Lines1[-1] Lines3 <- str_split(Lines2,"-9999.9M") Lines4 <- str_trim(unlist(lapply(Lines3,function(x) {x[x==""] <- NA paste(x,collapse=" ")}))) Lines5 <- gsub("(\\d+)[A-Za-z]","\\1",Lines4) res1 <- read.table(text=Lines5,sep="",header=FALSE,fill=TRUE) res1})
##Created another folder "Modified" to store the "res" files lapply(seq_along(res),function(i) write.table(res[[i]],paste("/home/arunksa111/Zl/Modified",paste0("Mod_",filelst[i]),sep="/"),row.names=FALSE,quote=FALSE)) lstf1 <- list.files(path="/home/arunksa111/Zl/Modified") lst1 <- lapply(lstf1,function(x) readLines(paste("/home/arunksa111/Zl/Modified",x,sep="/"))) which(lapply(lst1,function(x) length(grep("\\d+-9999.9",x)))>0 ) #[1] 7 11 14 15 30 32 39 40 42 45 46 53 60 65 66 68 69 70 73 74 75 78 80 82 83 #[26] 86 87 90 91 93 lst2 <- lapply(lst1,function(x) gsub("(\\d+)(-9999.9)","\\1 \\2",x)) #lapply(lst2,function(x) x[grep("\\d+-9999.9",x)]) ##checking for the pattern lst3 <- lapply(lst2,function(x) {x<-gsub("(-9999.9)(-9999.9)","\\1 \\2",x)})# #lapply(lst3,function(x) x[grep("\\d+-9999.9",x)]) ##checking for the pattern # lapply(lst3,function(x) x[grep("-9999.9",x)]) ###second check lst4 <- lapply(lst3,function(x) gsub("(Day) (\\d+)","\\1_\\2", x[-1])) #removed the additional header "V1", "V2", etc. #sapply(lst4,function(x) length(strsplit(x[1]," ")[[1]])) #checking the number of columns that should be present lst5 <- lapply(lst4,function(x) unlist(lapply(x, function(y) word(y,1,33)))) lst6 <- lapply(lst5,function(x) read.table(text=x,header=TRUE,stringsAsFactors=FALSE,sep="",fill=TRUE)) # head(lst6[[94]],3) lst7 <- lapply(lst6,function(x) x[x$Year >=1961 & x$Year <=2005,]) #head(lst7[[45]],3) lst8 <- lapply(lst7,function(x) x[!is.na(x$Year),]) lst9 <- lapply(lst8,function(x) { if((min(x$Year)>1961)|(max(x$Year)<2005)){ n1<- (min(x$Year)-1961)*12 x1<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1)) n2<- (2005-max(x$Year))*12 x2<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2)) colnames(x1) <- colnames(x) colnames(x2) <- colnames(x) x3<- rbind(x1,x,x2) } else if((min(x$Year)==1961) & (max(x$Year)==2005)) { if((min(x$Mo[x$Year==1961])>1)|(max(x$Mo[x$Year==2005])<12)){ n1 <- min(x$Mo[x$Year==1961])-1 x1 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1)) n2 <- (12-max(x$Mo[x$Year==2005])) x2 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2)) colnames(x1) <- colnames(x) colnames(x2) <- colnames(x) x3 <- rbind(x1,x,x2) } else { x } } }) which(sapply(lst9,nrow)!=540) #[1] 45 46 54 64 65 66 70 75 97 lst10 <- lapply(lst9,function(x) {x1 <- x[!is.na(x$Year),] hx1 <- head(x1,1) tx1 <- tail(x1,1) x2 <- as.data.frame(matrix(NA, ncol=ncol(x), nrow=hx1$Mo-1)) x3 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=12-tx1$Mo)) colnames(x2) <- colnames(x) colnames(x3) <- colnames(x) if(nrow(x) < 540) rbind(x2,x,x3) else x }) which(sapply(lst10,nrow)!=540) #integer(0) lst11 <-lapply(lst10,function(x) data.frame(col1=unlist(data.frame(t(x)[-c(1:2),]),use.names=FALSE))) lst12<- lapply(seq_along(lst10),function(i){ x<- lst11[[i]] colnames(x)<- lstf1[i] row.names(x)<- 1:nrow(x) x }) res2 <- do.call(cbind,lst11) dim(res2) #[1] 16740 98 res2[res2==-9999.9]<-NA # change missing value identifier as in your data set which(res2==-9999.9) #integer(0) dates1<-seq.Date(as.Date('1Jan1961',format="%d%b%Y"),as.Date('31Dec2005',format="%d%b%Y"),by="day") dates2<- as.character(dates1) sldat<- split(dates2,list(gsub("-.*","",dates2))) lst12<-lapply(sldat,function(x) lapply(split(x,gsub(".*-(.*)-.*","\\1",x)), function(y){x1<-as.numeric(gsub(".*-.*-(.*)","\\1",y));if((31-max(x1))>0) {x2<-seq(max(x1)+1,31,1);x3<-paste0(unique(gsub("(.*-.*-).*","\\1",y)),x2);c(y,x3)} else y} )) any(sapply(lst12,function(x) any(lapply(x,length)!=31))) #[1] FALSE lst22<-lapply(lst12,function(x) unlist(x,use.names=FALSE)) sapply(lst22,length) dates3<-unlist(lst22,use.names=FALSE) length(dates3) res3 <- data.frame(dates=dates3,res2,stringsAsFactors=FALSE) str(res3) res3$dates<-as.Date(res3$dates) res4 <- res3[!is.na(res3$dates),] res4[1:3,1:3] dim(res4) #[1] 16436 99 A.K. On Friday, November 8, 2013 5:54 PM, Zilefac Elvis <zilefacel...@yahoo.com> wrote: Hi Ak, I think I figured out how to do the sub-setting. All I needed was to use column 3 in Temperature_inventory and select matching .txt files in the .zip file. The final result would be a subset of files whose IDs are in column 3 of temp_inventory. ************************************************************************* I also have this script which you developed for managing precipitation files. Now I want to use the same code for the temperature files I sent to you. I tried doing it with some errors. Please try these scripts on my temperature data. If you need further information let me know. Note here that -9999.99M is -9999.9M in the temperature files. library(stringr)# load it res<-lapply(temp,function(x) {con <- file(x); Lines1<- readLines(con); close(con); Lines2<-Lines1[-1];# myfiles contain headers in row 2, so I removed the headers Lines3<- str_split(Lines2,"-9999.99M"); Lines4<- str_trim(unlist(lapply(Lines3,function(x){x[x==""]<-NA;#replace missing identifier with NA paste(x,collapse=" ")}))); Lines5<- gsub("(\\d+)[A-Za-z]","\\1",Lines4); res<- read.table(text=Lines5,sep="",header=FALSE,fill=TRUE)}) lapply(res,head,2)# take a look at first two rows of res. lapply(seq_along(res),function(i) write.table(res[[i]],paste0(gsub(".txt","",temp[i]),".txt"),row.names=FALSE,quote=FALSE)) #******************************************************************************************************** # Then use the following as a continuation from the one above lstf1<- list.files(pattern=".txt") length(lstf1) fun2<- function(lstf){ lst1<-lapply(lstf,function(x) readLines(x)) lst2<-lapply(lst1,function(x) {gsub("(\\d+)(-9999.99)","\\1 \\2",x)})#change missing value identifier as in your data set lst3<-lapply(lst2,function(x) {x<-gsub("(\\d+)(-9999.99)","\\1 \\2",x)})#change missing value identifier as in your data set lst4<- lapply(lst3,function(x) read.table(text=x,header=TRUE,stringsAsFactors=FALSE,sep="",fill=TRUE)) lst5<- lapply(lst4,function(x) x[x$V1>=1961 & x$V1<=2005,]) lst6<- lapply(lst5,function(x) x[!is.na(x$V1),]) lst7<- lapply(lst6,function(x) { if((min(x$V1)>1961)|(max(x$V1)<2005)){ n1<- (min(x$V1)-1961)*12 x1<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1)) n2<- (2005-max(x$V1))*12 x2<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2)) x3<- rbind(x1,x,x2) } else { x } }) lst8<-lapply(lst7,function(x) data.frame(col1=unlist(data.frame(t(x)[-c(1:2),]),use.names=FALSE))) ####changed lst9<- lapply(seq_along(lst8),function(i){ x<- lst8[[i]] colnames(x)<- lstf1[i] row.names(x)<- 1:nrow(x) x }) do.call(cbind,lst9)} res<-fun2(lstf1) dim(res) res[res==-9999.99]<-NA # change missing value identifier as in your data set which(res==-9999.99)#change missing value identifier as in your data set dates1<-seq.Date(as.Date('1Jan1961',format="%d%b%Y"),as.Date('31Dec2005',format="%d%b%Y"),by="day") dates2<- as.character(dates1) sldat<- split(dates2,list(gsub("-.*","",dates2))) lst11<-lapply(sldat,function(x) lapply(split(x,gsub(".*-(.*)-.*","\\1",x)), function(y){x1<-as.numeric(gsub(".*-.*-(.*)","\\1",y));if((31-max(x1))>0) {x2<-seq(max(x1)+1,31,1);x3<-paste0(unique(gsub("(.*-.*-).*","\\1",y)),x2);c(y,x3)} else y} )) any(sapply(lst1,function(x) any(lapply(x,length)!=31))) lst22<-lapply(lst11,function(x) unlist(x,use.names=FALSE)) sapply(lst22,length) dates3<-unlist(lst22,use.names=FALSE) length(dates3) res1<- data.frame(dates=dates3,res,stringsAsFactors=FALSE) str(res1) res1$dates<-as.Date(res1$dates) res2<-res1[!is.na(res1$dates),] res2[1:3,1:3] dim(res2) write.csv(res2, file = "TemperatureAllstations.csv")# #*********************************************************************************** Waiting for your useful input. Thanks so much, Atem. On Friday, November 8, 2013 2:18 PM, arun <sm you wanted to do. If you want to transfer the subset of files from the main folder to a new location, then you may try: (make sure you create a copy of the original .txt folder before doing this) I created three sub folders and two files (BTemperature_Stations.txt and Tempearture inventory.csv) in my working directory. list.files() #[1] "BTemperature_Stations.txt" "Files1" ## Files1 folder contains all the .txt files; #SubsetFiles: created to subset the files that match the condition #[3] "FilesCopy" "SubsetFiles1" #FilesCopy. A copy of the Files1 folder #[5] "Tempearture inventory.csv" list.files(pattern="\\.") #[1] "BTemperature_Stations.txt" "Tempearture inventory.csv" fl1 <- list.files(pattern="\\.") dat1 <- read.table(fl1[1],header=TRUE,sep="",stringsAsFactors=FALSE,fill=TRUE,check.names=FALSE) dat2 <- read.csv(fl1[2],header=TRUE,sep=",",stringsAsFactors=FALSE,check.names=FALSE) vec1 <- dat1[,3][dat1[,3]%in% dat2[,3]] vec2 <- list.files(path="/home/arunksa111/Zl/Files1",recursive=TRUE) sum(gsub(".txt","",vec2) %in% vec1) #[1] 98 vec3 <- vec2[gsub(".txt","",vec2) %in% vec1] lapply(vec3, function(x) file.rename(paste("/home/arunksa111/Zl/Files1",x,sep="/"), paste("/home/arunksa111/Zl/SubsetFiles1",x,sep="/"))) #change the path accordingly. length(list.files(path="/home/arunksa111/Zl/SubsetFiles1")) #[1] 98 fileDim <- sapply(vec3,function(x) {x1 <-read.delim(paste("/home/arunksa111/Zl/SubsetFiles1",x,sep="/"),header=TRUE,stringsAsFactors=FALSE,sep=",",check.names=FALSE); dim(x1)}) fileDim[,1:3] # dn3011120.txt dn3011240.txt dn3011887.txt #[1,] 1151 791 1054 #[2,] 7 7 7 A.K. On Friday, November 8, 2013 1:41 PM, Zilefac Elvis < les from a list of files. All are text files. The index for selection is found in column 3 of both files. Attached are my data files. Btemperature_Stations is my main file. Temperature inventory is my 'wanted' file and is a subset of Btemperature_Stations. Using column 3 in both files, select the files in Temperature inventory from Btemperature_Stations. The .zip file contains the .txt files which you will extract to a folder and do the selection in R. Thanks, Atem. ______________________________________________ R-help@r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.