Hi,
I found a bug in the function when testing it, so please try this version instead (one line added):
#Modified function
fun1 <- function(dat, n) {
  # Collapse every run of exactly `n` consecutive NA values in `Count`,
  # together with the non-NA rows directly before and after the run, into
  # a single row.
  #
  # Args:
  #   dat: data frame expected to hold exactly the columns `Position` and
  #        `Count`, in that order (merged rows are written back by column
  #        position).
  #   n:   length of the NA runs to collapse (a single number).
  #
  # Returns:
  #   `dat` with each matching run and its neighbours replaced by one row
  #   holding the largest neighbouring Position and the sum of the
  #   neighbouring Counts. Row names are renumbered 1..nrow. When nothing
  #   can be merged, `dat` is returned with only its row names renumbered.
  #
  # NOTE(review): the question in this thread asks for the Position that
  # belongs to the larger Count; this code keeps the largest Position
  # instead, which is what the posted results show -- confirm the intent.
  stopifnot(is.data.frame(dat), is.numeric(n), length(n) == 1L, !is.na(n))

  # Row positions are compared with row names further down, so force the
  # row names to be 1..nrow(dat) whatever the caller passed in.
  row.names(dat) <- seq_len(nrow(dat))

  is_na <- is.na(dat[, "Count"])
  rl <- rle(is_na)
  # For every NA row, the length of the NA run that contains it.
  run_len <- rep(rl$lengths[rl$values], rl$lengths[rl$values])
  indx <- which(is_na)[run_len == n]

  # Nothing to merge: no NA run of length n, or no non-NA row to merge into.
  # (Without this guard `dat[-integer(0), ]` would drop every row.)
  if (length(indx) == 0L || all(is_na)) {
    return(dat)
  }

  # All selected runs have length n, so consecutive chunks of n are the runs.
  runs <- split(indx, ((seq_along(indx) - 1) %/% n) + 1)
  # Each run plus the row before and the row after it. An index of 0 or
  # nrow + 1 (run at the very start/end) is harmless in the subsetting below.
  windows <- lapply(runs, function(x) c(min(x) - 1L, x, max(x) + 1L))

  lst1 <- lapply(windows, function(w) {
    x1 <- dat[w, ]
    x2 <- x1[!is.na(x1$Count), ]
    top <- max(x2$Position)
    datN <- data.frame(Position = top, Count = sum(x2$Count))
    # The merged row takes the place of the first neighbour holding `top`.
    row.names(datN) <- row.names(x2)[x2$Position %in% top][1]
    datN
  })
  names(lst1) <- NULL
  # Two adjacent runs can pick the same target row; keep the first only,
  # otherwise rbind() would invent a new row name for the second.
  lst1 <- lst1[!duplicated(vapply(lst1, row.names, character(1)))]
  dat2 <- do.call(rbind, lst1)
  kept <- row.names(dat2)

  indx2 <- sort(unlist(windows, use.names = FALSE))
  # Drop every row touched by a run except the rows that receive a merge.
  dat1New <- dat[-indx2[!indx2 %in% kept], ]
  dat1New[match(kept, row.names(dat1New)), ] <- dat2
  row.names(dat1New) <- seq_len(nrow(dat1New))
  dat1New
}
#Another function
fun2 <- function(dat, n) {
  # Alternative implementation of fun1: collapse every run of exactly `n`
  # consecutive NA values in `Count`, together with the non-NA rows that
  # flank the run, into a single row. Uses base R only (the earlier version
  # attached plyr from inside the function just for one left join).
  #
  # Args:
  #   dat: data frame with the columns `Position` and `Count`.
  #   n:   length of the NA runs to collapse (a single number).
  #
  # Returns:
  #   `dat` with, for each matching run, the run rows and the flanking row
  #   with the smaller Position removed, and the flanking row with the
  #   largest Position kept with Count set to the sum of both flanking
  #   Counts. Row names are renumbered 1..nrow. When nothing can be merged,
  #   `dat` is returned with only its row names renumbered.
  #
  # NOTE(review): the question in this thread asks for the Position that
  # belongs to the larger Count; this code keeps the largest Position
  # instead, which is what the posted results show -- confirm the intent.
  stopifnot(is.data.frame(dat), is.numeric(n), length(n) == 1L, !is.na(n))

  n_rows <- nrow(dat)
  is_na <- is.na(dat[, "Count"])

  # Consecutive rows with the same NA status share one run id.
  run_id <- cumsum(c(1, abs(diff(is_na))))
  run_size <- tabulate(run_id)
  na_rows <- which(is_na)
  wanted <- run_size[run_id[na_rows]] == n

  # Nothing to merge: no NA run of length n, or no non-NA row to merge into.
  if (!any(wanted) || all(is_na)) {
    row.names(dat) <- seq_len(n_rows)
    return(dat)
  }

  # Row indices of each selected NA run, in row order.
  runs <- split(na_rows[wanted], run_id[na_rows][wanted])

  position <- dat[, "Position"]
  count <- dat[, "Count"]

  # Runs are maximal, so the rows just outside a run (where they exist)
  # are exactly its non-NA neighbours.
  flanks <- lapply(runs, function(x) {
    f <- c(min(x) - 1L, max(x) + 1L)
    f[f >= 1L & f <= n_rows]
  })
  # Row that survives each merge: first neighbour with the largest Position.
  keep_row <- vapply(
    flanks,
    function(f) f[position[f] %in% max(position[f])][1],
    integer(1)
  )
  new_count <- unlist(lapply(flanks, function(f) sum(count[f])),
                      use.names = FALSE)

  # Two adjacent runs can pick the same surviving row; keep the first merge
  # only, which is the same rule fun1 applies.
  first_use <- !duplicated(keep_row)
  keep_row <- keep_row[first_use]
  new_count <- new_count[first_use]

  out <- dat
  out[keep_row, "Count"] <- new_count

  # Every row touched by a selected run is dropped unless it survives a merge.
  touched <- unlist(
    lapply(runs, function(x) c(min(x) - 1L, x, max(x) + 1L)),
    use.names = FALSE
  )
  drop_rows <- setdiff(touched, keep_row)
  out <- out[!(seq_len(n_rows) %in% drop_rows), , drop = FALSE]
  row.names(out) <- seq_len(nrow(out))
  out
}
# Cross-check: both implementations must give the same result on the example
# data `dat1`, for one pass and for chained passes with run lengths 1, 2, 3.
identical(fun1(dat1, 1), fun2(dat1, 1))
#[1] TRUE
identical(
  fun1(fun1(dat1, 1), 2),
  fun2(fun2(dat1, 1), 2)
)
#[1] TRUE
identical(
  fun1(fun1(fun1(dat1, 1), 2), 3),
  fun2(fun2(fun2(dat1, 1), 2), 3)
)
#[1] TRUE

# Timing on 1e5 randomly generated rows (seed fixed for reproducibility)
set.seed(185)
datT <- data.frame(
  Position = sample(10:80, size = 1e5, replace = TRUE),
  Count = sample(c(NA, 10:100), size = 1e5, replace = TRUE)
)
system.time(res <- fun1(datT, 1))
# user system elapsed
# 0.676 0.000 0.676
system.time(res2 <- fun2(datT, 1))
# user system elapsed
# 1.240 0.000 1.237
identical(res, res2)
#[1] TRUE
A.K.
On Friday, October 18, 2013 4:19 PM, arun <smartpink...@yahoo.com> wrote:
Hi,
Maybe this helps:
# Example data: 23 rows, every Position filled in, Count containing NA runs
# of length 1, 2 and 3.
dat1 <- data.frame(
  Position = c(
    15L, 22L, 38L, 49L, 55L, 61L, 62L, 14L, 29L, 63L, 46L, 22L,
    18L, 24L, 22L, 49L, 42L, 38L, 29L, 22L, 29L, 23L, 42L
  ),
  Count = c(
    15L, NA, NA, 5L, NA, 17L, 18L, NA, NA, NA, 8L, NA,
    20L, NA, NA, 16L, 19L, NA, NA, NA, 13L, NA, 33L
  )
)
#There might be simpler solutions.
fun1 <- function(dat, n) {
  # Collapse every run of exactly `n` consecutive NA values in `Count`,
  # together with the non-NA rows directly before and after the run, into
  # a single row.
  #
  # Args:
  #   dat: data frame expected to hold exactly the columns `Position` and
  #        `Count`, in that order (merged rows are written back by column
  #        position).
  #   n:   length of the NA runs to collapse (a single number).
  #
  # Returns:
  #   `dat` with each matching run and its neighbours replaced by one row
  #   holding the largest neighbouring Position and the sum of the
  #   neighbouring Counts. Row names are renumbered 1..nrow. When nothing
  #   can be merged, `dat` is returned with only its row names renumbered.
  #
  # NOTE(review): the question in this thread asks for the Position that
  # belongs to the larger Count; this code keeps the largest Position
  # instead, which is what the posted results show -- confirm the intent.
  stopifnot(is.data.frame(dat), is.numeric(n), length(n) == 1L, !is.na(n))

  # Row positions are compared with row names further down, so force the
  # row names to be 1..nrow(dat) whatever the caller passed in.
  row.names(dat) <- seq_len(nrow(dat))

  is_na <- is.na(dat[, "Count"])
  rl <- rle(is_na)
  # For every NA row, the length of the NA run that contains it.
  run_len <- rep(rl$lengths[rl$values], rl$lengths[rl$values])
  indx <- which(is_na)[run_len == n]

  # Nothing to merge: no NA run of length n, or no non-NA row to merge into.
  # (Without this guard `dat[-integer(0), ]` would drop every row.)
  if (length(indx) == 0L || all(is_na)) {
    return(dat)
  }

  # All selected runs have length n, so consecutive chunks of n are the runs.
  runs <- split(indx, ((seq_along(indx) - 1) %/% n) + 1)
  # Each run plus the row before and the row after it. An index of 0 or
  # nrow + 1 (run at the very start/end) is harmless in the subsetting below.
  windows <- lapply(runs, function(x) c(min(x) - 1L, x, max(x) + 1L))

  lst1 <- lapply(windows, function(w) {
    x1 <- dat[w, ]
    x2 <- x1[!is.na(x1$Count), ]
    top <- max(x2$Position)
    datN <- data.frame(Position = top, Count = sum(x2$Count))
    # The merged row takes the place of the first neighbour holding `top`.
    row.names(datN) <- row.names(x2)[x2$Position %in% top][1]
    datN
  })
  names(lst1) <- NULL
  # Two adjacent runs can pick the same target row; keep the first only,
  # otherwise rbind() would invent a new row name for the second and the
  # write-back below would hit a missing or unrelated row.
  lst1 <- lst1[!duplicated(vapply(lst1, row.names, character(1)))]
  dat2 <- do.call(rbind, lst1)
  kept <- row.names(dat2)

  indx2 <- sort(unlist(windows, use.names = FALSE))
  # Drop every row touched by a run except the rows that receive a merge.
  dat1New <- dat[-indx2[!indx2 %in% kept], ]
  dat1New[match(kept, row.names(dat1New)), ] <- dat2
  row.names(dat1New) <- seq_len(nrow(dat1New))
  dat1New
}
# Console transcript: each call is followed by the printed data frame.
# Pass 1: apply fun1 to the example data with run length 1.
dat1N <- fun1(dat1,1)
dat1N
Position Count
1 15 15
2 22 NA
3 38 NA
4 61 22
5 62 18
6 14 NA
7 29 NA
8 63 NA
9 46 28
10 24 NA
11 22 NA
12 49 16
13 42 19
14 38 NA
15 29 NA
16 22 NA
17 42 46
# Pass 2: apply fun1 to the result of pass 1 with run length 2.
dat2N <- fun1(dat1N,2)
dat2N
Position Count
1 61 37
2 62 18
3 14 NA
4 29 NA
5 63 NA
6 49 44
7 42 19
8 38 NA
9 29 NA
10 22 NA
11 42 46
# Pass 3: apply fun1 to the result of pass 2 with run length 3.
dat3N <- fun1(dat2N,3)
dat3N
Position Count
1 61 37
2 62 62
3 42 65
A.K.
Hi all, I have a dataset with 2 important columns, "Position" and
"Count". There are a total of 34,532 rows, but only 457 non-NA values in the "Count"
column (every cell in the "Position" column has a value). I
need to write a loop that marches down the rows and, wherever two non-NA
"Count" values are separated by exactly one NA row, sums the two values and
prints a single row with the summed Count value and the Position
value that corresponds to the larger Count value, thus collapsing the three
rows into one. For example:
Position Count
15 15
22 NA
38 NA
49 5
55 NA
61 17
would become
Position Count
15 15
22 NA
38 NA
61 22
After this step, I also need to write another script that marches
down the rows and looks for exactly two NA rows between non-NA rows
in Count. This would make the previous data become
Position Count
61 37
Ideally I would like a loop in which the number of NAs allowed
between adjacent non-NA values can be freely changed.
I would greatly appreciate any insight into this.
______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.