Indeed it was the issue with data.table. I converted it to data.frame and it worked like a charm. Thank you so much for your insight!
This is the code that worked: library(parallel) library(data.table) library(doSNOW) n <- parallel::detectCores() cl <- parallel::makeCluster(n, type = "SOCK") doSNOW::registerDoSNOW(cl) files <- list.files("/WEIGHTS1/Retina", pattern=".RDat", ignore.case=T) lst_out <- foreach::foreach(i = seq_along(files), .packages = c("data.table") ) %dopar% { tmp <- get(load(files[i])) a <- data.table::copy(tmp) a=as.data.frame(a) rm(tmp) gc() names <- rownames(a) if("blup" %in% colnames(a)) { data <- data.table(names, a["blup"]) nm1 <- c("rsid", "ref_allele", "eff_allele") data[, (nm1) := tstrsplit(names, ":")[-2]] out <- data[, .(rsid, weight = blup, ref_allele, eff_allele)][, WGT := files[i]][] } else { data <- data.table(names) nm1 <- c("rsid", "ref_allele", "eff_allele") data[, (nm1) := tstrsplit(names, ":")[-2]] out <- data[, .(rsid, ref_allele, eff_allele)][, WGT := files[i]][] } return(out) rm(data) gc() } parallel::stopCluster(cl) big_data <- rbindlist(lst_out, fill = TRUE) On Wed, Dec 16, 2020 at 9:31 AM Ana Marija <sokovic.anamar...@gmail.com> wrote: > > HI Jim, > > this is what I as running: > > library(parallel) > library(data.table) > library(foreach) > library(doSNOW) > > n <- parallel::detectCores() > cl <- parallel::makeCluster(n, type = "SOCK") > doSNOW::registerDoSNOW(cl) > files <- list.files("/WEIGHTS1/Retina", pattern=".RDat", ignore.case=T) > > lst_out <- foreach::foreach(i = seq_along(files), > .packages = c("data.table") ) %dopar% { > > a <- get(load(files[i])) > namesplit<-strsplit(rownames(a),":") > rsid<-unlist(lapply(namesplit,"[",1)) > ref_allele<-unlist(lapply(namesplit,"[",3)) > eff_allele<-unlist(lapply(namesplit,"[",4)) > WGT<-rep(files[i],length(rsid)) > data<-data.frame(rsid=rsid,weight=a$blup, #weight is "blup" column > ref_allele=ref_allele,eff_allele,WGT=WGT) > > return(data) > } > parallel::stopCluster(cl) > > big_data <- rbindlist(lst_out) > > and i got: > Error in { : task 4 failed - "$ operator is invalid for atomic vectors" > > parallel::stopCluster(cl) > > I uploaded 3 RDat file here if you want to try it > https://github.com/montenegrina/sample > > Thank you for looking into this > Ana > > On Tue, Dec 15, 2020 at 11:45 PM Jim Lemon <drjimle...@gmail.com> wrote: > > > > Hi Ana, > > Back on the job. I'm not sure how this will work in your setup, but > > here is a try: > > > > a<-read.table(text="top1 blup lasso enet > > rs4980905:184404:C:A 0.07692622 -1.881795e-04 0 0 > > rs7978751:187541:G:C 0.62411425 9.934994e-04 0 0 > > rs2368831:188285:C:T 0.69529158 1.211028e-03 0 0 > > rs12830904:188335:T:A 0.92793158 -9.143555e-05 0 0 > > rs1500098:189093:G:C 0.42032471 9.001814e-04 0 0 > > rs79410690:190097:G:A 0.26194244 5.019037e-04 0 0", > > header=TRUE,stringsAsFactors=FALSE) > > namesplit<-strsplit(rownames(a),":") > > rsid<-unlist(lapply(namesplit,"[",1)) > > ref_allele<-unlist(lapply(namesplit,"[",3)) > > eff_allele<-unlist(lapply(namesplit,"[",4)) > > # here I'm assuming that the filename > > # is stored in files[i] > > files<-"retina.ENSG00000135776.wgt.RDat" > > i<-1 > > WGT<-rep(files[i],length(rsid)) > > data<-data.frame(rsid=rsid,weight=a$top1, > > ref_allele=ref_allele,eff_allele,WGT=WGT) > > data > > > > Note that the output is a data frame, not a data table. I hope that > > the function for creating a data table is close enough to that for a > > data frame for you to work it out. If not I can probably have a look > > at it a bit later. > > > > Jim > > > > On Wed, Dec 16, 2020 at 1:45 PM Ana Marija <sokovic.anamar...@gmail.com> > > wrote: > > > > > > Hi Jim, > > > > > > as always you're completely right, this is what is happening: > > > > > > > head(a) > > > top1 blup lasso enet > > > rs4980905:184404:C:A 0.07692622 -1.881795e-04 0 0 > > > rs7978751:187541:G:C 0.62411425 9.934994e-04 0 0 > > > rs2368831:188285:C:T 0.69529158 1.211028e-03 0 0 > > > rs12830904:188335:T:A 0.92793158 -9.143555e-05 0 0 > > > rs1500098:189093:G:C 0.42032471 9.001814e-04 0 0 > > > rs79410690:190097:G:A 0.26194244 5.019037e-04 0 0 > > > > names <- rownames(a) > > > > data <- data.table(names, a["blup"]) > > > > head(data) > > > names V2 > > > 1: rs4980905:184404:C:A NA > > > 2: rs7978751:187541:G:C NA > > > 3: rs2368831:188285:C:T NA > > > 4: rs12830904:188335:T:A NA > > > 5: rs1500098:189093:G:C NA > > > 6: rs79410690:190097:G:A NA > > > > > > So my goal is to transform what is in "a" to this for every RDat file: > > > > > > rsid weight ref_allele eff_allele > > > 1: rs72763981 9.376766e-09 C G > > > 2: rs144383755 -2.093346e-09 A G > > > 3: rs1925717 1.511376e-08 T C > > > 4: rs61827307 -1.625302e-08 C A > > > 5: rs61827308 -1.625302e-08 G C > > > 6: rs199623136 -9.128354e-10 GC G > > > WGT > > > 1: retina.ENSG00000135776.wgt.RDat > > > 2: retina.ENSG00000135776.wgt.RDat > > > 3: retina.ENSG00000135776.wgt.RDat > > > 4: retina.ENSG00000135776.wgt.RDat > > > 5: retina.ENSG00000135776.wgt.RDat > > > 6: retina.ENSG00000135776.wgt.RDat > > > > > > so from rs4980905:184404:C:A I would take rs4980905 to be in column > > > "rsid", C in column "ref_allele" and A to be in column "eff_allele", > > > WGT column would just be filled with a name of the particular RDat > > > file. > > > > > > So the issue is in these lines: > > > > > > a <- get(load(files[i])) > > > names <- rownames(a) > > > data <- data.table(names, a["blup"]) > > > nm1 <- c("rsid", "ref_allele", "eff_allele") > > > > > > any idea how I can rewrite this? > > > > > > > > > > > > On Tue, Dec 15, 2020 at 8:30 PM Jim Lemon <drjimle...@gmail.com> wrote: > > > > > > > > Hi Ana, > > > > I would look at "data" in your second example and see if it contains a > > > > column named "blup" or just the values that were extracted from > > > > a$blup. Also, I assume that weight=blup looks for an object named > > > > "blup", which may not be there. > > > > > > > > Jim > > > > > > > > On Wed, Dec 16, 2020 at 1:20 PM Ana Marija > > > > <sokovic.anamar...@gmail.com> wrote: > > > > > > > > > > Hi Jim, > > > > > > > > > > Maybe my post is confusing. > > > > > > > > > > so "dd" came from my slow code and I don't use it again in > > > > > parallelized code. > > > > > > > > > > So for example for one of my files: > > > > > > > > > > if > > > > > i="retina.ENSG00000120647.wgt.RDat" > > > > > > a <- get(load(i)) > > > > > > head(a) > > > > > top1 blup lasso enet > > > > > rs4980905:184404:C:A 0.07692622 -1.881795e-04 0 0 > > > > > rs7978751:187541:G:C 0.62411425 9.934994e-04 0 0 > > > > > rs2368831:188285:C:T 0.69529158 1.211028e-03 0 0 > > > > > ... > > > > > > > > > > Slow code was posted just to show what was running very slow and it > > > > > was running. I really need help fixing parallelized version. > > > > > > > > > > On Tue, Dec 15, 2020 at 7:35 PM Jim Lemon <drjimle...@gmail.com> > > > > > wrote: > > > > > > > > > > > > Hi Ana, > > > > > > My guess is that in your second code fragment you are assigning the > > > > > > rownames of "a" and the _values_ contained in a$blup to the > > > > > > data.table > > > > > > "data". As I don't have much experience with data tables I may be > > > > > > wrong, but I suspect that the column name "blup" may not be visible > > > > > > or > > > > > > even present in "data". I don't see it in "dd" above this code > > > > > > fragment. > > > > > > > > > > > > Jim > > > > > > > > > > > > On Wed, Dec 16, 2020 at 11:12 AM Ana Marija > > > > > > <sokovic.anamar...@gmail.com> wrote: > > > > > > > > > > > > > > Hello, > > > > > > > > > > > > > > I made a terribly inefficient code which runs forever but it does > > > > > > > run. > > > > > > > > > > > > > > library(dplyr) > > > > > > > library(splitstackshape) > > > > > > > > > > > > > > datalist = list() > > > > > > > files <- list.files("/WEIGHTS1/Retina", pattern=".RDat", > > > > > > > ignore.case=T) > > > > > > > > > > > > > > for(i in files) > > > > > > > { > > > > > > > a<-get(load(i)) > > > > > > > names <- rownames(a) > > > > > > > data <- as.data.frame(cbind(names,a)) > > > > > > > rownames(data) <- NULL > > > > > > > dd=na.omit(concat.split.multiple(data = data, split.cols = > > > > > > > c("names"), > > > > > > > seps = ":")) > > > > > > > dd=select(dd,names_1,blup,names_3,names_4) > > > > > > > colnames(dd)=c("rsid","weight","ref_allele","eff_allele") > > > > > > > dd$WGT<-i > > > > > > > datalist[[i]] <- dd # add it to your list > > > > > > > } > > > > > > > > > > > > > > big_data = do.call(rbind, datalist) > > > > > > > > > > > > > > There is 17345 RDat files this loop has to go through. And each > > > > > > > file > > > > > > > has approximately 10,000 lines. All RDat files can be downloaded > > > > > > > from > > > > > > > here: > > > > > > > https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE115828 and > > > > > > > they are compressed in this file: > > > > > > > GSE115828_retina_TWAS_wgts.tar.gz . > > > > > > > And subset of 3 of those .RDat files is here: > > > > > > > https://github.com/montenegrina/sample > > > > > > > > > > > > > > For one of those files, say i="retina.ENSG00000135776.wgt.RDat" > > > > > > > dd looks like this: > > > > > > > > > > > > > > > head(dd) > > > > > > > rsid weight ref_allele eff_allele > > > > > > > 1: rs72763981 9.376766e-09 C G > > > > > > > 2: rs144383755 -2.093346e-09 A G > > > > > > > 3: rs1925717 1.511376e-08 T C > > > > > > > 4: rs61827307 -1.625302e-08 C A > > > > > > > 5: rs61827308 -1.625302e-08 G C > > > > > > > 6: rs199623136 -9.128354e-10 GC G > > > > > > > WGT > > > > > > > 1: retina.ENSG00000135776.wgt.RDat > > > > > > > 2: retina.ENSG00000135776.wgt.RDat > > > > > > > 3: retina.ENSG00000135776.wgt.RDat > > > > > > > 4: retina.ENSG00000135776.wgt.RDat > > > > > > > 5: retina.ENSG00000135776.wgt.RDat > > > > > > > 6: retina.ENSG00000135776.wgt.RDat > > > > > > > > > > > > > > so on attempt to parallelize this I did this: > > > > > > > > > > > > > > library(parallel) > > > > > > > library(data.table) > > > > > > > library(foreach) > > > > > > > library(doSNOW) > > > > > > > > > > > > > > n <- parallel::detectCores() > > > > > > > cl <- parallel::makeCluster(n, type = "SOCK") > > > > > > > doSNOW::registerDoSNOW(cl) > > > > > > > files <- list.files("/WEIGHTS1/Retina", pattern=".RDat", > > > > > > > ignore.case=T) > > > > > > > > > > > > > > lst_out <- foreach::foreach(i = seq_along(files), > > > > > > > .packages = c("data.table") ) %dopar% { > > > > > > > > > > > > > > a <- get(load(files[i])) > > > > > > > names <- rownames(a) > > > > > > > data <- data.table(names, a["blup"]) > > > > > > > nm1 <- c("rsid", "ref_allele", "eff_allele") > > > > > > > data[, (nm1) := tstrsplit(names, ":")[-2]] > > > > > > > return(data[, .(rsid, weight = blup, ref_allele, eff_allele)][, > > > > > > > WGT := files[i]][]) > > > > > > > } > > > > > > > parallel::stopCluster(cl) > > > > > > > > > > > > > > big_data <- rbindlist(lst_out) > > > > > > > > > > > > > > I am getting this Error: > > > > > > > > > > > > > > Error in { : task 7 failed - "object 'blup' not found" > > > > > > > > parallel::stopCluster(cl) > > > > > > > > > > > > > > Can you please advise, > > > > > > > Ana > > > > > > > > > > > > > > ______________________________________________ > > > > > > > R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see > > > > > > > https://stat.ethz.ch/mailman/listinfo/r-help > > > > > > > PLEASE do read the posting guide > > > > > > > http://www.R-project.org/posting-guide.html > > > > > > > and provide commented, minimal, self-contained, reproducible code. ______________________________________________ R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.