Dear All, I think I have found a fix developing the draft syntax I have provided yesterday, see below or https://www.dropbox.com/s/pbz9dcgxu6ljj8x/sample_code_1.txt?dl=0.
Only desirable improvement is related to the block where I compute the modified v4 (lines 46-60 in the attached file). Provided the real data are of the dimension 8x13x13 (v1xv2xv3), is there anyway to write that block sentence in an automated way? I recall some function that could do that but I can't remenber which one... Thanks to everybody and especially to Bert and David for trying to assist me with this one. And apologizes for not being so clear upfront but I was trying to figure it out myself too... Kind regards, Luca === rm(list=ls()) # this is usual (an extract of) the INPUT file I have: f1 <- structure(list(v1 = c("A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B"), v2 = c("A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C"), v3 = c("B", "B", "B", "C", "C", "C", "B", "B", "B", "C", "C", "C"), v4 = c(18.18530, 3.43806,0.00273, 1.42917, 1.05786, 0.00042, 2.37232, 3.01835, 0, 1.13430, 0.92872, 0)), .Names = c("v1", "v2", "v3", "v4"), class = "data.frame", row.names = c(2L, 9L, 11L, 41L, 48L, 50L, 158L, 165L, 167L, 197L, 204L, 206L)) #first I order the file such that I have 6 distinct v1xv2 combinations f1 <- f1[order(f1$v1,f1$v2),] #I compute the relative importance of each v1xv2 automatically t1 <- aggregate(v4~1,f1,sum) tXX <- aggregate(v4~v1*v2,f1,sum) tAA <- as.numeric(tXX$v4[tXX$v1=="A"&tXX$v2=="A"]/t1) tAB <- as.numeric(tXX$v4[tXX$v1=="A"&tXX$v2=="B"]/t1) tAC <- as.numeric(tXX$v4[tXX$v1=="A"&tXX$v2=="C"]/t1) tBA <- as.numeric(tXX$v4[tXX$v1=="B"&tXX$v2=="A"]/t1) tBB <- as.numeric(tXX$v4[tXX$v1=="B"&tXX$v2=="B"]/t1) tBC <- as.numeric(tXX$v4[tXX$v1=="B"&tXX$v2=="C"]/t1) tAA+tAB+tAC+tBA+tBB+tBC rm(t1) # Next, I compute the difference I need to compute for each C category (t1 <- aggregate(v4~v3,f1,sum)) # this is the actual distribution (t2 <- structure(list(v3 = c("B", "C"), v4 = c(29, 2.56723)), .Names = c("v3", "v4"), row.names = c(NA, -2L), class = "data.frame")) # this is the target distribution # I verify t1 & t2 total is the same aggregate(v4~1,t1,sum) aggregate(v4~1,t2,sum) # I determine the value to be added/subtracted to each instance of v3 t1 <- merge(t1,t2,by="v3") t1$dif <- t1$v4.y-t1$v4.x t1 <- t1[,c("v3","dif")] t1 # I merge the t1 file with the f1 f1 <- merge (f1,t1,by="v3") f1 rm(t1,t2) # I compute the modified v4 value f1$v4mod <- f1$v4 f1$v4mod <- ifelse (f1$v1=="A" & f1$v2=="A" & f1$v3=="B", f1$v4+(tAA*f1$dif), f1$v4mod) f1$v4mod <- ifelse (f1$v1=="A" & f1$v2=="A" & f1$v3=="C", f1$v4+(tAA*f1$dif), f1$v4mod) f1$v4mod <- ifelse (f1$v1=="A" & f1$v2=="B" & f1$v3=="B", f1$v4+(tAB*f1$dif), f1$v4mod) f1$v4mod <- ifelse (f1$v1=="A" & f1$v2=="B" & f1$v3=="C", f1$v4+(tAB*f1$dif), f1$v4mod) f1$v4mod <- ifelse (f1$v1=="A" & f1$v2=="C" & f1$v3=="B", f1$v4+(tAC*f1$dif), f1$v4mod) f1$v4mod <- ifelse (f1$v1=="A" & f1$v2=="C" & f1$v3=="C", f1$v4+(tAC*f1$dif), f1$v4mod) f1$v4mod <- ifelse (f1$v1=="B" & f1$v2=="A" & f1$v3=="B", f1$v4+(tBA*f1$dif), f1$v4mod) f1$v4mod <- ifelse (f1$v1=="B" & f1$v2=="A" & f1$v3=="C", f1$v4+(tBA*f1$dif), f1$v4mod) f1$v4mod <- ifelse (f1$v1=="B" & f1$v2=="B" & f1$v3=="B", f1$v4+(tBB*f1$dif), f1$v4mod) f1$v4mod <- ifelse (f1$v1=="B" & f1$v2=="B" & f1$v3=="C", f1$v4+(tBB*f1$dif), f1$v4mod) f1$v4mod <- ifelse (f1$v1=="B" & f1$v2=="C" & f1$v3=="B", f1$v4+(tBC*f1$dif), f1$v4mod) f1$v4mod <- ifelse (f1$v1=="B" & f1$v2=="C" & f1$v3=="C", f1$v4+(tBC*f1$dif), f1$v4mod) f1 # i compare original vs modified marginal distributions aggregate(v4~v1*v2,f1,sum) aggregate(v4mod~v1*v2,f1,sum) aggregate(v4~v3,f1,sum) aggregate(v4mod~v3,f1,sum) aggregate(v4~1,f1,sum) aggregate(v4mod~1,f1,sum) rm(list=ls()) 2015-03-23 9:10 GMT+01:00 Luca Meyer <lucam1...@gmail.com>: > Hi David, hello R-experts > > Thank you for your input. I have tried the syntax you suggested but > unfortunately the marginal distributions v1xv2 change after the > manipulation. Please see below or > https://www.dropbox.com/s/qhmpkkrejjkpbkx/sample_code.txt?dl=0 for the > syntax. > > > rm(list=ls()) > > > > # this is usual (an extract of) the INPUT file I have: > > f1 <- structure(list(v1 = c("A", "A", "A", "A", "A", "A", "B", "B", > + "B", "B", "B", "B"), v2 = c("A", "B", "C", "A", "B", "C", "A", > + "B", "C", "A", "B", "C"), v3 = c("B", "B", "B", "C", "C", "C", > + "B", "B", "B", "C", "C", "C"), v4 = c(18.18530, 3.43806,0.00273, > 1.42917, 1.05786, 0.00042, 2.37232, 3.01835, 0, 1.13430, 0.92872, > + 0)), .Names = c("v1", "v2", "v3", "v4"), class = "data.frame", row.names > = c(2L, > + 9L, 11L, 41L, 48L, 50L, 158L, 165L, 167L, 197L, 204L, 206L)) > > > > #first I order the file such that I have 6 distinct v1xv2 combinations > > f1 <- f1[order(f1$v1,f1$v2),] > > > > # then I compute (manually) the relative importance of each v1xv2 > combination: > > tAA <- > (18.18530+1.42917)/(18.18530+1.42917+3.43806+1.05786+0.00273+0.00042+2.37232+1.13430+3.01835+0.92872+0.00000+0.00000) > # this is for combination v1=A & v2=A > > tAB <- > (3.43806+1.05786)/(18.18530+1.42917+3.43806+1.05786+0.00273+0.00042+2.37232+1.13430+3.01835+0.92872+0.00000+0.00000) > # this is for combination v1=A & v2=B > > tAC <- > (0.00273+0.00042)/(18.18530+1.42917+3.43806+1.05786+0.00273+0.00042+2.37232+1.13430+3.01835+0.92872+0.00000+0.00000) > # this is for combination v1=A & v2=C > > tBA <- > (2.37232+1.13430)/(18.18530+1.42917+3.43806+1.05786+0.00273+0.00042+2.37232+1.13430+3.01835+0.92872+0.00000+0.00000) > # this is for combination v1=B & v2=A > > tBB <- > (3.01835+0.92872)/(18.18530+1.42917+3.43806+1.05786+0.00273+0.00042+2.37232+1.13430+3.01835+0.92872+0.00000+0.00000) > # this is for combination v1=B & v2=B > > tBC <- > (0.00000+0.00000)/(18.18530+1.42917+3.43806+1.05786+0.00273+0.00042+2.37232+1.13430+3.01835+0.92872+0.00000+0.00000) > # this is for combination v1=B & v2=C > > # and just to make sure I have not made mistakes the following should be > equal to 1 > > tAA+tAB+tAC+tBA+tBB+tBC > [1] 1 > > > > # procedure suggested by David Winsemius > > lookarr <- array(NA, > dim=c(length(unique(f1$v1)),length(unique(f1$v2)),length(unique(f1$v3)) ) , > dimnames=list( unique(f1$v1), unique(f1$v2), unique(f1$v3) ) ) > > lookarr[] <- c(tAA,tAA,tAB,tAB,tAC,tAC,tBA,tBA,tBB,tBB,tBC,tBC) > > lookarr["A","B","C"] > [1] 0.1250369 > > lookarr[ with(f1, cbind(v1, v2, v3)) ] > [1] 6.213554e-01 1.110842e-01 1.424236e-01 1.250369e-01 9.978703e-05 > 0.000000e+00 6.213554e-01 1.110842e-01 1.424236e-01 1.250369e-01 > 9.978703e-05 > [12] 0.000000e+00 > > f1$v4mod <- f1$v4*lookarr[ with(f1, cbind(v1,v2,v3)) ] > > > > # i compare original vs modified marginal distributions > > aggregate(v4~v1*v2,f1,sum) > v1 v2 v4 > 1 A A 19.61447 > 2 B A 3.50662 > 3 A B 4.49592 > 4 B B 3.94707 > 5 A C 0.00315 > 6 B C 0.00000 > > aggregate(v4mod~v1*v2,f1,sum) > v1 v2 v4mod > 1 A A 1.145829e+01 > 2 B A 1.600057e+00 > 3 A B 6.219326e-01 > 4 B B 5.460087e-01 > 5 A C 2.724186e-07 > 6 B C 0.000000e+00 > > aggregate(v4~v3,f1,sum) > v3 v4 > 1 B 27.01676 > 2 C 4.55047 > > aggregate(v4mod~v3,f1,sum) > v3 v4mod > 1 B 13.6931347 > 2 C 0.5331569 > > Any suggestion on how this can be fixed? Remember, I am searching for a > solution where by aggregate(v4~v1*v2,f1,sum)==aggregate(v4~v1*v2,f1,sum) > while aggregate(v4~v3,f1,sum)!=aggregate(v4mod~v3,f1,sum) by specified > amounts (see my earlier example). > > Thank you very much, > > Luca > > > 2015-03-22 22:11 GMT+01:00 David Winsemius <dwinsem...@comcast.net>: > >> >> On Mar 22, 2015, at 1:12 PM, Luca Meyer wrote: >> >> > Hi Bert, >> > >> > Maybe I did not explain myself clearly enough. But let me show you with >> a >> > manual example that indeed what I would like to do is feasible. >> > >> > The following is also available for download from >> > https://www.dropbox.com/s/qhmpkkrejjkpbkx/sample_code.txt?dl=0 >> > >> > rm(list=ls()) >> > >> > This is usual (an extract of) the INPUT file I have: >> > >> > f1 <- structure(list(v1 = c("A", "A", "A", "A", "A", "A", "B", "B", >> > "B", "B", "B", "B"), v2 = c("A", "B", "C", "A", "B", "C", "A", >> > "B", "C", "A", "B", "C"), v3 = c("B", "B", "B", "C", "C", "C", >> > "B", "B", "B", "C", "C", "C"), v4 = c(18.18530, 3.43806,0.00273, >> 1.42917, >> > 1.05786, 0.00042, 2.37232, 3.01835, 0, 1.13430, 0.92872, >> > 0)), .Names = c("v1", "v2", "v3", "v4"), class = "data.frame", >> row.names = >> > c(2L, >> > 9L, 11L, 41L, 48L, 50L, 158L, 165L, 167L, 197L, 204L, 206L)) >> > >> > This are the initial marginal distributions >> > >> > aggregate(v4~v1*v2,f1,sum) >> > aggregate(v4~v3,f1,sum) >> > >> > First I order the file such that I have nicely listed 6 distinct v1xv2 >> > combinations. >> > >> > f1 <- f1[order(f1$v1,f1$v2),] >> > >> > Then I compute (manually) the relative importance of each v1xv2 >> combination: >> > >> > tAA <- >> > >> (18.18530+1.42917)/(18.18530+1.42917+3.43806+1.05786+0.00273+0.00042+2.37232+1.13430+3.01835+0.92872+0.00000+0.00000) >> > # this is for combination v1=A & v2=A >> > tAB <- >> > >> (3.43806+1.05786)/(18.18530+1.42917+3.43806+1.05786+0.00273+0.00042+2.37232+1.13430+3.01835+0.92872+0.00000+0.00000) >> > # this is for combination v1=A & v2=B >> > tAC <- >> > >> (0.00273+0.00042)/(18.18530+1.42917+3.43806+1.05786+0.00273+0.00042+2.37232+1.13430+3.01835+0.92872+0.00000+0.00000) >> > # this is for combination v1=A & v2=C >> > tBA <- >> > >> (2.37232+1.13430)/(18.18530+1.42917+3.43806+1.05786+0.00273+0.00042+2.37232+1.13430+3.01835+0.92872+0.00000+0.00000) >> > # this is for combination v1=B & v2=A >> > tBB <- >> > >> (3.01835+0.92872)/(18.18530+1.42917+3.43806+1.05786+0.00273+0.00042+2.37232+1.13430+3.01835+0.92872+0.00000+0.00000) >> > # this is for combination v1=B & v2=B >> > tBC <- >> > >> (0.00000+0.00000)/(18.18530+1.42917+3.43806+1.05786+0.00273+0.00042+2.37232+1.13430+3.01835+0.92872+0.00000+0.00000) >> > # this is for combination v1=B & v2=C >> > # and just to make sure I have not made mistakes the following should be >> > equal to 1 >> > tAA+tAB+tAC+tBA+tBB+tBC >> > >> > Next, I know I need to increase v4 any time v3=B and the total increase >> I >> > need to have over the whole dataset is 29-27.01676=1.98324. In turn, I >> need >> > to dimish v4 any time V3=C by the same amount (4.55047-2.56723=1.98324). >> > This aspect was perhaps not clear at first. I need to move v4 across v3 >> > categories, but the totals will always remain unchanged. >> > >> > Since I want the data alteration to be proportional to the v1xv2 >> > combinations I do the following: >> > >> > f1$v4 <- ifelse (f1$v1=="A" & f1$v2=="A" & f1$v3=="B", >> f1$v4+(tAA*1.98324), >> > f1$v4) >> > f1$v4 <- ifelse (f1$v1=="A" & f1$v2=="A" & f1$v3=="C", >> f1$v4-(tAA*1.98324), >> > f1$v4) >> > f1$v4 <- ifelse (f1$v1=="A" & f1$v2=="B" & f1$v3=="B", >> f1$v4+(tAB*1.98324), >> > f1$v4) >> > f1$v4 <- ifelse (f1$v1=="A" & f1$v2=="B" & f1$v3=="C", >> f1$v4-(tAB*1.98324), >> > f1$v4) >> > f1$v4 <- ifelse (f1$v1=="A" & f1$v2=="C" & f1$v3=="B", >> f1$v4+(tAC*1.98324), >> > f1$v4) >> > f1$v4 <- ifelse (f1$v1=="A" & f1$v2=="C" & f1$v3=="C", >> f1$v4-(tAC*1.98324), >> > f1$v4) >> > f1$v4 <- ifelse (f1$v1=="B" & f1$v2=="A" & f1$v3=="B", >> f1$v4+(tBA*1.98324), >> > f1$v4) >> > f1$v4 <- ifelse (f1$v1=="B" & f1$v2=="A" & f1$v3=="C", >> f1$v4-(tBA*1.98324), >> > f1$v4) >> > f1$v4 <- ifelse (f1$v1=="B" & f1$v2=="B" & f1$v3=="B", >> f1$v4+(tBB*1.98324), >> > f1$v4) >> > f1$v4 <- ifelse (f1$v1=="B" & f1$v2=="B" & f1$v3=="C", >> f1$v4-(tBB*1.98324), >> > f1$v4) >> > f1$v4 <- ifelse (f1$v1=="B" & f1$v2=="C" & f1$v3=="B", >> f1$v4+(tBC*1.98324), >> > f1$v4) >> > f1$v4 <- ifelse (f1$v1=="B" & f1$v2=="C" & f1$v3=="C", >> f1$v4-(tBC*1.98324), >> > f1$v4) >> > >> >> Seems that this could be done a lot more simply with a lookup matrix and >> ordinary indexing >> >> > lookarr <- array(NA, >> dim=c(length(unique(f1$v1)),length(unique(f1$v2)),length(unique(f1$v3)) ) , >> dimnames=list( unique(f1$v1), unique(f1$v2), unique(f1$v3) ) ) >> > lookarr[] <- c(tAA,tAA,tAB,tAB,tAC,tAC,tBA,tBA, >> tBB, tBB, tBC, tBC) >> >> > lookarr[ "A","B","C"] >> [1] 0.1250369 >> >> > lookarr[ with(f1, cbind(v1, v2, v3)) ] >> [1] 6.213554e-01 1.110842e-01 1.424236e-01 1.250369e-01 9.978703e-05 >> [6] 0.000000e+00 6.213554e-01 1.110842e-01 1.424236e-01 1.250369e-01 >> [11] 9.978703e-05 0.000000e+00 >> > f1$v4mod <- f1$v4*lookarr[ with(f1, cbind(v1,v2,v3)) ] >> > f1 >> v1 v2 v3 v4 v4mod >> 2 A A B 18.18530 1.129954e+01 >> 41 A A C 1.42917 1.587582e-01 >> 9 A B B 3.43806 4.896610e-01 >> 48 A B C 1.05786 1.322716e-01 >> 11 A C B 0.00273 2.724186e-07 >> 50 A C C 0.00042 0.000000e+00 >> 158 B A B 2.37232 1.474054e+00 >> 197 B A C 1.13430 1.260028e-01 >> 165 B B B 3.01835 4.298844e-01 >> 204 B B C 0.92872 1.161243e-01 >> 167 B C B 0.00000 0.000000e+00 >> 206 B C C 0.00000 0.000000e+00 >> >> -- >> david. >> >> >> > This are the final marginal distributions: >> > >> > aggregate(v4~v1*v2,f1,sum) >> > aggregate(v4~v3,f1,sum) >> > >> > Can this procedure be made programmatic so that I can run it on the >> > (8x13x13) categories matrix? if so, how would you do it? I have really >> hard >> > time to do it with some (semi)automatic procedure. >> > >> > Thank you very much indeed once more :) >> > >> > Luca >> > >> > >> > 2015-03-22 18:32 GMT+01:00 Bert Gunter <gunter.ber...@gene.com>: >> > >> >> Nonsense. You are not telling us something or I have failed to >> >> understand something. >> >> >> >> Consider: >> >> >> >> v1 = c("a","b") >> >> v2 = "c("a","a") >> >> >> >> It is not possible to change the value of a sum of values >> >> corresponding to v2="a" without also changing that for v1, which is >> >> not supposed to change according to my understanding of your >> >> specification. >> >> >> >> So I'm done. >> >> >> >> -- Bert >> >> >> >> >> >> Bert Gunter >> >> Genentech Nonclinical Biostatistics >> >> (650) 467-7374 >> >> >> >> "Data is not information. Information is not knowledge. And knowledge >> >> is certainly not wisdom." >> >> Clifford Stoll >> >> >> >> >> >> >> >> >> >> On Sun, Mar 22, 2015 at 8:28 AM, Luca Meyer <lucam1...@gmail.com> >> wrote: >> >>> Sorry forgot to keep the rest of the group in the loop - Luca >> >>> ---------- Forwarded message ---------- >> >>> From: Luca Meyer <lucam1...@gmail.com> >> >>> Date: 2015-03-22 16:27 GMT+01:00 >> >>> Subject: Re: [R] Joining two datasets - recursive procedure? >> >>> To: Bert Gunter <gunter.ber...@gene.com> >> >>> >> >>> >> >>> Hi Bert, >> >>> >> >>> That is exactly what I am trying to achieve. Please notice that >> negative >> >> v4 >> >>> values are allowed. I have done a similar task in the past manually by >> >>> recursively alterating v4 distribution across v3 categories within fix >> >> each >> >>> v1&v2 combination so I am quite positive it can be achieved but >> honestly >> >> I >> >>> took me forever to do it manually and since this is likely to be an >> >>> exercise I need to repeat from time to time I wish I could learn how >> to >> >> do >> >>> it programmatically.... >> >>> >> >>> Thanks again for any further suggestion you might have, >> >>> >> >>> Luca >> >>> >> >>> >> >>> 2015-03-22 16:05 GMT+01:00 Bert Gunter <gunter.ber...@gene.com>: >> >>> >> >>>> Oh, wait a minute ... >> >>>> >> >>>> You still want the marginals for the other columns to be as >> originally? >> >>>> >> >>>> If so, then this is impossible in general as the sum of all the >> values >> >>>> must be what they were originally and you cannot therefore choose >> your >> >>>> values for V3 arbitrarily. >> >>>> >> >>>> Or at least, that seems to be what you are trying to do. >> >>>> >> >>>> -- Bert >> >>>> >> >>>> Bert Gunter >> >>>> Genentech Nonclinical Biostatistics >> >>>> (650) 467-7374 >> >>>> >> >>>> "Data is not information. Information is not knowledge. And knowledge >> >>>> is certainly not wisdom." >> >>>> Clifford Stoll >> >>>> >> >>>> >> >>>> >> >>>> >> >>>> On Sun, Mar 22, 2015 at 7:55 AM, Bert Gunter <bgun...@gene.com> >> wrote: >> >>>>> I would have thought that this is straightforward given my previous >> >>>> email... >> >>>>> >> >>>>> Just set z to what you want -- e,g, all B values to 29/number of >> B's, >> >>>>> and all C values to 2.567/number of C's (etc. for more categories). >> >>>>> >> >>>>> A slick but sort of cheat way to do this programmatically -- in the >> >>>>> sense that it relies on the implementation of factor() rather than >> its >> >>>>> API -- is: >> >>>>> >> >>>>> y <- f1$v3 ## to simplify the notation; could be done using with() >> >>>>> z <- (c(29,2.567)/table(y))[c(y)] >> >>>>> >> >>>>> Then proceed to z1 as I previously described >> >>>>> >> >>>>> -- Bert >> >>>>> >> >>>>> >> >>>>> Bert Gunter >> >>>>> Genentech Nonclinical Biostatistics >> >>>>> (650) 467-7374 >> >>>>> >> >>>>> "Data is not information. Information is not knowledge. And >> knowledge >> >>>>> is certainly not wisdom." >> >>>>> Clifford Stoll >> >>>>> >> >>>>> >> >>>>> >> >>>>> >> >>>>> On Sun, Mar 22, 2015 at 2:00 AM, Luca Meyer <lucam1...@gmail.com> >> >> wrote: >> >>>>>> Hi Bert, hello R-experts, >> >>>>>> >> >>>>>> I am close to a solution but I still need one hint w.r.t. the >> >> following >> >>>>>> procedure (available also from >> >>>>>> https://www.dropbox.com/s/qhmpkkrejjkpbkx/sample_code.txt?dl=0) >> >>>>>> >> >>>>>> rm(list=ls()) >> >>>>>> >> >>>>>> # this is (an extract of) the INPUT file I have: >> >>>>>> f1 <- structure(list(v1 = c("A", "A", "A", "A", "A", "A", "B", "B", >> >> "B", >> >>>>>> "B", "B", "B"), v2 = c("A", "B", "C", "A", "B", "C", "A", "B", "C", >> >> "A", >> >>>>>> "B", "C"), v3 = c("B", "B", "B", "C", "C", "C", "B", "B", "B", "C", >> >> "C", >> >>>>>> "C"), v4 = c(18.18530, 3.43806,0.00273, 1.42917, 1.05786, 0.00042, >> >>>> 2.37232, >> >>>>>> 3.01835, 0, 1.13430, 0.92872, 0)), .Names = c("v1", "v2", "v3", >> >> "v4"), >> >>>> class >> >>>>>> = "data.frame", row.names = c(2L, 9L, 11L, 41L, 48L, 50L, 158L, >> 165L, >> >>>> 167L, >> >>>>>> 197L, 204L, 206L)) >> >>>>>> >> >>>>>> # this is the procedure that Bert suggested (slightly adjusted): >> >>>>>> z <- rnorm(nrow(f1)) ## or anything you want >> >>>>>> z1 <- round(with(f1,v4 + z -ave(z,v1,v2,FUN=mean)), digits=5) >> >>>>>> aggregate(v4~v1*v2,f1,sum) >> >>>>>> aggregate(z1~v1*v2,f1,sum) >> >>>>>> aggregate(v4~v3,f1,sum) >> >>>>>> aggregate(z1~v3,f1,sum) >> >>>>>> >> >>>>>> My question to you is: how can I set z so that I can obtain >> specific >> >>>> values >> >>>>>> for z1-v4 in the v3 aggregation? >> >>>>>> In other words, how can I configure the procedure so that e.g. B=29 >> >> and >> >>>>>> C=2.56723 after running the procedure: >> >>>>>> aggregate(z1~v3,f1,sum) >> >>>>>> >> >>>>>> Thank you, >> >>>>>> >> >>>>>> Luca >> >>>>>> >> >>>>>> PS: to avoid any doubts you might have about who I am the following >> >> is >> >>>> my >> >>>>>> web page: http://lucameyer.wordpress.com/ >> >>>>>> >> >>>>>> >> >>>>>> 2015-03-21 18:13 GMT+01:00 Bert Gunter <gunter.ber...@gene.com>: >> >>>>>>> >> >>>>>>> ... or cleaner: >> >>>>>>> >> >>>>>>> z1 <- with(f1,v4 + z -ave(z,v1,v2,FUN=mean)) >> >>>>>>> >> >>>>>>> >> >>>>>>> Just for curiosity, was this homework? (in which case I should >> >>>>>>> probably have not provided you an answer -- that is, assuming >> that I >> >>>>>>> HAVE provided an answer). >> >>>>>>> >> >>>>>>> Cheers, >> >>>>>>> Bert >> >>>>>>> >> >>>>>>> Bert Gunter >> >>>>>>> Genentech Nonclinical Biostatistics >> >>>>>>> (650) 467-7374 >> >>>>>>> >> >>>>>>> "Data is not information. Information is not knowledge. And >> >> knowledge >> >>>>>>> is certainly not wisdom." >> >>>>>>> Clifford Stoll >> >>>>>>> >> >>>>>>> >> >>>>>>> >> >>>>>>> >> >>>>>>> On Sat, Mar 21, 2015 at 7:53 AM, Bert Gunter <bgun...@gene.com> >> >> wrote: >> >>>>>>>> z <- rnorm(nrow(f1)) ## or anything you want >> >>>>>>>> z1 <- f1$v4 + z - with(f1,ave(z,v1,v2,FUN=mean)) >> >>>>>>>> >> >>>>>>>> >> >>>>>>>> aggregate(v4~v1,f1,sum) >> >>>>>>>> aggregate(z1~v1,f1,sum) >> >>>>>>>> aggregate(v4~v2,f1,sum) >> >>>>>>>> aggregate(z1~v2,f1,sum) >> >>>>>>>> aggregate(v4~v3,f1,sum) >> >>>>>>>> aggregate(z1~v3,f1,sum) >> >>>>>>>> >> >>>>>>>> >> >>>>>>>> Cheers, >> >>>>>>>> Bert >> >>>>>>>> >> >>>>>>>> Bert Gunter >> >>>>>>>> Genentech Nonclinical Biostatistics >> >>>>>>>> (650) 467-7374 >> >>>>>>>> >> >>>>>>>> "Data is not information. Information is not knowledge. And >> >> knowledge >> >>>>>>>> is certainly not wisdom." >> >>>>>>>> Clifford Stoll >> >>>>>>>> >> >>>>>>>> >> >>>>>>>> >> >>>>>>>> >> >>>>>>>> On Sat, Mar 21, 2015 at 6:49 AM, Luca Meyer <lucam1...@gmail.com >> > >> >>>> wrote: >> >>>>>>>>> Hi Bert, >> >>>>>>>>> >> >>>>>>>>> Thank you for your message. I am looking into ave() and tapply() >> >> as >> >>>> you >> >>>>>>>>> suggested but at the same time I have prepared a example of >> input >> >>>> and >> >>>>>>>>> output >> >>>>>>>>> files, just in case you or someone else would like to make an >> >>>> attempt >> >>>>>>>>> to >> >>>>>>>>> generate a code that goes from input to output. >> >>>>>>>>> >> >>>>>>>>> Please see below or download it from >> >>>>>>>>> https://www.dropbox.com/s/qhmpkkrejjkpbkx/sample_code.txt?dl=0 >> >>>>>>>>> >> >>>>>>>>> # this is (an extract of) the INPUT file I have: >> >>>>>>>>> f1 <- structure(list(v1 = c("A", "A", "A", "A", "A", "A", "B", >> >> "B", >> >>>>>>>>> "B", "B", "B", "B"), v2 = c("A", "B", "C", "A", "B", "C", "A", >> >>>>>>>>> "B", "C", "A", "B", "C"), v3 = c("B", "B", "B", "C", "C", "C", >> >>>>>>>>> "B", "B", "B", "C", "C", "C"), v4 = c(18.18530, 3.43806,0.00273, >> >>>>>>>>> 1.42917, >> >>>>>>>>> 1.05786, 0.00042, 2.37232, 3.01835, 0, 1.13430, 0.92872, >> >>>>>>>>> 0)), .Names = c("v1", "v2", "v3", "v4"), class = "data.frame", >> >>>>>>>>> row.names = >> >>>>>>>>> c(2L, >> >>>>>>>>> 9L, 11L, 41L, 48L, 50L, 158L, 165L, 167L, 197L, 204L, 206L)) >> >>>>>>>>> >> >>>>>>>>> # this is (an extract of) the OUTPUT file I would like to >> obtain: >> >>>>>>>>> f2 <- structure(list(v1 = c("A", "A", "A", "A", "A", "A", "B", >> >> "B", >> >>>>>>>>> "B", "B", "B", "B"), v2 = c("A", "B", "C", "A", "B", "C", "A", >> >>>>>>>>> "B", "C", "A", "B", "C"), v3 = c("B", "B", "B", "C", "C", "C", >> >>>>>>>>> "B", "B", "B", "C", "C", "C"), v4 = c(17.83529, 3.43806,0.00295, >> >>>>>>>>> 1.77918, >> >>>>>>>>> 1.05786, 0.0002, 2.37232, 3.01835, 0, 1.13430, 0.92872, >> >>>>>>>>> 0)), .Names = c("v1", "v2", "v3", "v4"), class = "data.frame", >> >>>>>>>>> row.names = >> >>>>>>>>> c(2L, >> >>>>>>>>> 9L, 11L, 41L, 48L, 50L, 158L, 165L, 167L, 197L, 204L, 206L)) >> >>>>>>>>> >> >>>>>>>>> # please notice that while the aggregated v4 on v3 has changed … >> >>>>>>>>> aggregate(f1[,c("v4")],list(f1$v3),sum) >> >>>>>>>>> aggregate(f2[,c("v4")],list(f2$v3),sum) >> >>>>>>>>> >> >>>>>>>>> # … the aggregated v4 over v1xv2 has remained unchanged: >> >>>>>>>>> aggregate(f1[,c("v4")],list(f1$v1,f1$v2),sum) >> >>>>>>>>> aggregate(f2[,c("v4")],list(f2$v1,f2$v2),sum) >> >>>>>>>>> >> >>>>>>>>> Thank you very much in advance for your assitance. >> >>>>>>>>> >> >>>>>>>>> Luca >> >>>>>>>>> >> >>>>>>>>> 2015-03-21 13:18 GMT+01:00 Bert Gunter <gunter.ber...@gene.com >> >: >> >>>>>>>>>> >> >>>>>>>>>> 1. Still not sure what you mean, but maybe look at ?ave and >> >>>> ?tapply, >> >>>>>>>>>> for which ave() is a wrapper. >> >>>>>>>>>> >> >>>>>>>>>> 2. You still need to heed the rest of Jeff's advice. >> >>>>>>>>>> >> >>>>>>>>>> Cheers, >> >>>>>>>>>> Bert >> >>>>>>>>>> >> >>>>>>>>>> Bert Gunter >> >>>>>>>>>> Genentech Nonclinical Biostatistics >> >>>>>>>>>> (650) 467-7374 >> >>>>>>>>>> >> >>>>>>>>>> "Data is not information. Information is not knowledge. And >> >>>> knowledge >> >>>>>>>>>> is certainly not wisdom." >> >>>>>>>>>> Clifford Stoll >> >>>>>>>>>> >> >>>>>>>>>> >> >>>>>>>>>> >> >>>>>>>>>> >> >>>>>>>>>> On Sat, Mar 21, 2015 at 4:53 AM, Luca Meyer < >> >> lucam1...@gmail.com> >> >>>>>>>>>> wrote: >> >>>>>>>>>>> Hi Jeff & other R-experts, >> >>>>>>>>>>> >> >>>>>>>>>>> Thank you for your note. I have tried myself to solve the >> >> issue >> >>>>>>>>>>> without >> >>>>>>>>>>> success. >> >>>>>>>>>>> >> >>>>>>>>>>> Following your suggestion, I am providing a sample of the >> >>>> dataset I >> >>>>>>>>>>> am >> >>>>>>>>>>> using below (also downloadble in plain text from >> >>>>>>>>>>> >> >> https://www.dropbox.com/s/qhmpkkrejjkpbkx/sample_code.txt?dl=0): >> >>>>>>>>>>> >> >>>>>>>>>>> #this is an extract of the overall dataset (n=1200 cases) >> >>>>>>>>>>> f1 <- structure(list(v1 = c("A", "A", "A", "A", "A", "A", "B", >> >>>> "B", >> >>>>>>>>>>> "B", "B", "B", "B"), v2 = c("A", "B", "C", "A", "B", "C", "A", >> >>>>>>>>>>> "B", "C", "A", "B", "C"), v3 = c("B", "B", "B", "C", "C", "C", >> >>>>>>>>>>> "B", "B", "B", "C", "C", "C"), v4 = c(18.1853007621835, >> >>>>>>>>>>> 3.43806581506388, >> >>>>>>>>>>> 0.002733567617055, 1.42917483425029, 1.05786640463504, >> >>>>>>>>>>> 0.000420548864162308, >> >>>>>>>>>>> 2.37232740842861, 3.01835841813241, 0, 1.13430282139936, >> >>>>>>>>>>> 0.928725667117666, >> >>>>>>>>>>> 0)), .Names = c("v1", "v2", "v3", "v4"), class = "data.frame", >> >>>>>>>>>>> row.names >> >>>>>>>>>>> = >> >>>>>>>>>>> c(2L, >> >>>>>>>>>>> 9L, 11L, 41L, 48L, 50L, 158L, 165L, 167L, 197L, 204L, 206L)) >> >>>>>>>>>>> >> >>>>>>>>>>> I need to find a automated procedure that allows me to adjust >> >> v3 >> >>>>>>>>>>> marginals >> >>>>>>>>>>> while maintaining v1xv2 marginals unchanged. >> >>>>>>>>>>> >> >>>>>>>>>>> That is: modify the v4 values you can find by running: >> >>>>>>>>>>> >> >>>>>>>>>>> aggregate(f1[,c("v4")],list(f1$v3),sum) >> >>>>>>>>>>> >> >>>>>>>>>>> while maintaining costant the values you can find by running: >> >>>>>>>>>>> >> >>>>>>>>>>> aggregate(f1[,c("v4")],list(f1$v1,f1$v2),sum) >> >>>>>>>>>>> >> >>>>>>>>>>> Now does it make sense? >> >>>>>>>>>>> >> >>>>>>>>>>> Please notice I have tried to build some syntax that tries to >> >>>> modify >> >>>>>>>>>>> values >> >>>>>>>>>>> within each v1xv2 combination by computing sum of v4, row >> >>>> percentage >> >>>>>>>>>>> in >> >>>>>>>>>>> terms of v4, and there is where my effort is blocked. Not >> >> really >> >>>>>>>>>>> sure >> >>>>>>>>>>> how I >> >>>>>>>>>>> should proceed. Any suggestion? >> >>>>>>>>>>> >> >>>>>>>>>>> Thanks, >> >>>>>>>>>>> >> >>>>>>>>>>> Luca >> >>>>>>>>>>> >> >>>>>>>>>>> >> >>>>>>>>>>> 2015-03-19 2:38 GMT+01:00 Jeff Newmiller < >> >>>> jdnew...@dcn.davis.ca.us>: >> >>>>>>>>>>> >> >>>>>>>>>>>> I don't understand your description. The standard practice on >> >>>> this >> >>>>>>>>>>>> list >> >>>>>>>>>>>> is >> >>>>>>>>>>>> to provide a reproducible R example [1] of the kind of data >> >> you >> >>>> are >> >>>>>>>>>>>> working >> >>>>>>>>>>>> with (and any code you have tried) to go along with your >> >>>>>>>>>>>> description. >> >>>>>>>>>>>> In >> >>>>>>>>>>>> this case, that would be two dputs of your input data frames >> >>>> and a >> >>>>>>>>>>>> dput >> >>>>>>>>>>>> of >> >>>>>>>>>>>> an output data frame (generated by hand from your input data >> >>>>>>>>>>>> frame). >> >>>>>>>>>>>> (Probably best to not use the full number of input values >> >> just >> >>>> to >> >>>>>>>>>>>> keep >> >>>>>>>>>>>> the >> >>>>>>>>>>>> size down.) We could then make an attempt to generate code >> >> that >> >>>>>>>>>>>> goes >> >>>>>>>>>>>> from >> >>>>>>>>>>>> input to output. >> >>>>>>>>>>>> >> >>>>>>>>>>>> Of course, if you post that hard work using HTML then it will >> >>>> get >> >>>>>>>>>>>> corrupted (much like the text below from your earlier emails) >> >>>> and >> >>>>>>>>>>>> we >> >>>>>>>>>>>> won't >> >>>>>>>>>>>> be able to use it. Please learn to post from your email >> >> software >> >>>>>>>>>>>> using >> >>>>>>>>>>>> plain text when corresponding with this mailing list. >> >>>>>>>>>>>> >> >>>>>>>>>>>> [1] >> >>>>>>>>>>>> >> >>>>>>>>>>>> >> >>>>>>>>>>>> >> >>>> >> >> >> http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example >> >>>>>>>>>>>> >> >>>>>>>>>>>> >> >>>>>>>>>>>> >> >>>> >> >> >> --------------------------------------------------------------------------- >> >>>>>>>>>>>> Jeff Newmiller The ..... >> >>>> ..... Go >> >>>>>>>>>>>> Live... >> >>>>>>>>>>>> DCN:<jdnew...@dcn.davis.ca.us> Basics: ##.#. >> >> ##.#. >> >>>>>>>>>>>> Live >> >>>>>>>>>>>> Go... >> >>>>>>>>>>>> Live: OO#.. Dead: >> >> OO#.. >> >>>>>>>>>>>> Playing >> >>>>>>>>>>>> Research Engineer (Solar/Batteries O.O#. >> >> #.O#. >> >>>>>>>>>>>> with >> >>>>>>>>>>>> /Software/Embedded Controllers) .OO#. >> >> .OO#. >> >>>>>>>>>>>> rocks...1k >> >>>>>>>>>>>> >> >>>>>>>>>>>> >> >>>>>>>>>>>> >> >>>> >> >> >> --------------------------------------------------------------------------- >> >>>>>>>>>>>> Sent from my phone. Please excuse my brevity. >> >>>>>>>>>>>> >> >>>>>>>>>>>> On March 18, 2015 9:05:37 AM PDT, Luca Meyer < >> >>>> lucam1...@gmail.com> >> >>>>>>>>>>>> wrote: >> >>>>>>>>>>>>> Thanks for you input Michael, >> >>>>>>>>>>>>> >> >>>>>>>>>>>>> The continuous variable I have measures quantities (down to >> >> the >> >>>>>>>>>>>>> 3rd >> >>>>>>>>>>>>> decimal level) so unfortunately are not frequencies. >> >>>>>>>>>>>>> >> >>>>>>>>>>>>> Any more specific suggestions on how that could be tackled? >> >>>>>>>>>>>>> >> >>>>>>>>>>>>> Thanks & kind regards, >> >>>>>>>>>>>>> >> >>>>>>>>>>>>> Luca >> >>>>>>>>>>>>> >> >>>>>>>>>>>>> >> >>>>>>>>>>>>> === >> >>>>>>>>>>>>> >> >>>>>>>>>>>>> Michael Friendly wrote: >> >>>>>>>>>>>>> I'm not sure I understand completely what you want to do, >> >> but >> >>>>>>>>>>>>> if the data were frequencies, it sounds like task for >> >> fitting a >> >>>>>>>>>>>>> loglinear model with the model formula >> >>>>>>>>>>>>> >> >>>>>>>>>>>>> ~ V1*V2 + V3 >> >>>>>>>>>>>>> >> >>>>>>>>>>>>> On 3/18/2015 2:17 AM, Luca Meyer wrote: >> >>>>>>>>>>>>>> * Hello, >> >>>>>>>>>>>>> *>>* I am facing a quite challenging task (at least to me) >> >> and >> >>>> I >> >>>>>>>>>>>>> was >> >>>>>>>>>>>>> wondering >> >>>>>>>>>>>>> *>* if someone could advise how R could assist me to speed >> >> the >> >>>>>>>>>>>>> task >> >>>>>>>>>>>>> up. >> >>>>>>>>>>>>> *>>* I am dealing with a dataset with 3 discrete variables >> >> and >> >>>> one >> >>>>>>>>>>>>> continuous >> >>>>>>>>>>>>> *>* variable. The discrete variables are: >> >>>>>>>>>>>>> *>>* V1: 8 modalities >> >>>>>>>>>>>>> *>* V2: 13 modalities >> >>>>>>>>>>>>> *>* V3: 13 modalities >> >>>>>>>>>>>>> *>>* The continuous variable V4 is a decimal number always >> >>>> greater >> >>>>>>>>>>>>> than >> >>>>>>>>>>>>> zero in >> >>>>>>>>>>>>> *>* the marginals of each of the 3 variables but it is >> >>>> sometimes >> >>>>>>>>>>>>> equal >> >>>>>>>>>>>>> to zero >> >>>>>>>>>>>>> *>* (and sometimes negative) in the joint tables. >> >>>>>>>>>>>>> *>>* I have got 2 files: >> >>>>>>>>>>>>> *>>* => one with distribution of all possible combinations >> >> of >> >>>>>>>>>>>>> V1xV2 >> >>>>>>>>>>>>> (some of >> >>>>>>>>>>>>> *>* which are zero or neagtive) and >> >>>>>>>>>>>>> *>* => one with the marginal distribution of V3. >> >>>>>>>>>>>>> *>>* I am trying to build the long and narrow dataset >> >> V1xV2xV3 >> >>>> in >> >>>>>>>>>>>>> such >> >>>>>>>>>>>>> a way >> >>>>>>>>>>>>> *>* that each V1xV2 cell does not get modified and V3 fits >> >> as >> >>>>>>>>>>>>> closely >> >>>>>>>>>>>>> as >> >>>>>>>>>>>>> *>* possible to its marginal distribution. Does it make >> >> sense? >> >>>>>>>>>>>>> *>>* To be even more specific, my 2 input files look like >> >> the >> >>>>>>>>>>>>> following. >> >>>>>>>>>>>>> *>>* FILE 1 >> >>>>>>>>>>>>> *>* V1,V2,V4 >> >>>>>>>>>>>>> *>* A, A, 24.251 >> >>>>>>>>>>>>> *>* A, B, 1.065 >> >>>>>>>>>>>>> *>* (...) >> >>>>>>>>>>>>> *>* B, C, 0.294 >> >>>>>>>>>>>>> *>* B, D, 2.731 >> >>>>>>>>>>>>> *>* (...) >> >>>>>>>>>>>>> *>* H, L, 0.345 >> >>>>>>>>>>>>> *>* H, M, 0.000 >> >>>>>>>>>>>>> *>>* FILE 2 >> >>>>>>>>>>>>> *>* V3, V4 >> >>>>>>>>>>>>> *>* A, 1.575 >> >>>>>>>>>>>>> *>* B, 4.294 >> >>>>>>>>>>>>> *>* C, 10.044 >> >>>>>>>>>>>>> *>* (...) >> >>>>>>>>>>>>> *>* L, 5.123 >> >>>>>>>>>>>>> *>* M, 3.334 >> >>>>>>>>>>>>> *>>* What I need to achieve is a file such as the following >> >>>>>>>>>>>>> *>>* FILE 3 >> >>>>>>>>>>>>> *>* V1, V2, V3, V4 >> >>>>>>>>>>>>> *>* A, A, A, ??? >> >>>>>>>>>>>>> *>* A, A, B, ??? >> >>>>>>>>>>>>> *>* (...) >> >>>>>>>>>>>>> *>* D, D, E, ??? >> >>>>>>>>>>>>> *>* D, D, F, ??? >> >>>>>>>>>>>>> *>* (...) >> >>>>>>>>>>>>> *>* H, M, L, ??? >> >>>>>>>>>>>>> *>* H, M, M, ??? >> >>>>>>>>>>>>> *>>* Please notice that FILE 3 need to be such that if I >> >>>> aggregate >> >>>>>>>>>>>>> on >> >>>>>>>>>>>>> V1+V2 I >> >>>>>>>>>>>>> *>* recover exactly FILE 1 and that if I aggregate on V3 I >> >> can >> >>>>>>>>>>>>> recover >> >>>>>>>>>>>>> a file >> >>>>>>>>>>>>> *>* as close as possible to FILE 3 (ideally the same file). >> >>>>>>>>>>>>> *>>* Can anyone suggest how I could do that with R? >> >>>>>>>>>>>>> *>>* Thank you very much indeed for any assistance you are >> >>>> able to >> >>>>>>>>>>>>> provide. >> >>>>>>>>>>>>> *>>* Kind regards, >> >>>>>>>>>>>>> *>>* Luca* >> >>>>>>>>>>>>> >> >>>>>>>>>>>>> [[alternative HTML version deleted]] >> >> >> David Winsemius >> Alameda, CA, USA >> >> > [[alternative HTML version deleted]] ______________________________________________ R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.