Hi,
# Found a bug in the function when tested. So, try this (added one more line):

# Modified function

#' Collapse runs of exactly `n` consecutive NA counts (rle-based version)
#'
#' For every run of exactly `n` rows whose `Count` is NA, the run together
#' with the row directly before and directly after it is replaced by a single
#' row: the neighbour with the largest `Position`, carrying the sum of the
#' neighbours' counts.
#'
#' Behaviour worth knowing:
#' * The surviving row is chosen by the largest `Position` (first one on
#'   ties), not by the larger `Count`.
#' * A run that touches the first or last row has only one neighbour; that
#'   neighbour is kept with its own count and the NA rows are dropped.
#' * If two runs pick the same surviving row, only the first run's sum is
#'   used (this is the line added in this revision).
#' * If no run of length `n` exists, `dat` is returned with its rows
#'   untouched.
#'
#' @param dat Data frame with columns `Position` and `Count`.
#' @param n Length of the NA runs to collapse.
#' @return `dat` without the collapsed rows; row names are reset to
#'   `1..nrow`.
fun1 <- function(dat, n) {
  is_gap <- is.na(dat[, "Count"])
  runs <- rle(is_gap)
  gap_len <- runs$lengths[runs$values]
  # Row positions of the NA rows that sit in a run of exactly n NAs.
  gap_rows <- which(is_gap)[rep(gap_len, gap_len) == n]
  total <- nrow(dat)

  merges <- list()
  if (length(gap_rows) > 0L) {
    # Every selected run has exactly n rows, so chunks of n are whole runs.
    chunk <- (seq_along(gap_rows) - 1L) %/% n + 1L
    merges <- lapply(unname(split(gap_rows, chunk)), function(rows) {
      span <- c(min(rows) - 1L, rows, max(rows) + 1L)
      span <- span[span >= 1L & span <= total]
      donors <- span[!is.na(dat[span, "Count"])]
      if (length(donors) == 0L) {
        return(NULL)
      }
      at_peak <- dat[donors, "Position"] %in% max(dat[donors, "Position"])
      list(
        span = span,
        keep = as.integer(donors[at_peak][1L]),
        count = sum(dat[donors, "Count"])
      )
    })
    merges <- Filter(Negate(is.null), merges)
  }

  # Nothing to collapse: previously this path failed instead of returning dat.
  if (length(merges) == 0L) {
    row.names(dat) <- seq_len(total)
    return(dat)
  }

  keep <- vapply(merges, function(m) m$keep, integer(1))
  first <- !duplicated(keep)
  # unlist() instead of vapply() so an integer Count column stays integer.
  new_count <- unlist(lapply(merges[first], function(m) m$count))
  drop <- setdiff(unlist(lapply(merges, function(m) m$span)), keep)

  # The surviving row already holds the largest Position; only Count changes.
  dat[["Count"]][keep[first]] <- new_count
  out <- dat[-drop, , drop = FALSE]
  row.names(out) <- seq_len(nrow(out))
  out
}

# Another function

#' Collapse runs of exactly `n` consecutive NA counts (run-id/lookup version)
#'
#' Same contract as `fun1`, reached by a different route: rows are labelled
#' with a run id via `cumsum(diff(...))`, the runs of length `n` are picked
#' with `table()`, and the merged counts are written back through a
#' `match()` lookup on the surviving row's index. Base R only.
#'
#' @param dat Data frame whose first two columns are `Position` and `Count`.
#' @param n Length of the NA runs to collapse.
#' @return The first two columns of `dat` without the collapsed rows; row
#'   names are reset to `1..nrow`.
fun2 <- function(dat, n) {
  is_gap <- is.na(dat[, "Count"])
  total <- nrow(dat)
  # The run id increases by one every time the NA status flips.
  run_id <- cumsum(c(1L, abs(diff(is_gap))))
  gap_run <- run_id[is_gap]
  names(gap_run) <- which(is_gap)
  run_size <- table(gap_run)
  wanted <- gap_run[as.character(gap_run) %in% names(run_size)[run_size == n]]

  merges <- list()
  if (length(wanted) > 0L) {
    run_rows <- unname(split(as.integer(names(wanted)), wanted))
    merges <- lapply(run_rows, function(rows) {
      span <- c(min(rows) - 1L, rows, max(rows) + 1L)
      span <- span[span >= 1L & span <= total]
      donors <- span[!is.na(dat[span, "Count"])]
      if (length(donors) == 0L) {
        return(NULL)
      }
      at_peak <- dat[donors, "Position"] %in% max(dat[donors, "Position"])
      list(
        span = span,
        id = as.integer(donors[at_peak][1L]),
        count = sum(dat[donors, "Count"])
      )
    })
    merges <- Filter(Negate(is.null), merges)
  }

  if (length(merges) == 0L) {
    out <- dat[, 1:2, drop = FALSE]
    row.names(out) <- seq_len(total)
    return(out)
  }

  ids <- vapply(merges, function(m) m$id, integer(1))
  first <- !duplicated(ids)
  totals <- unlist(lapply(merges[first], function(m) m$count))

  # Left-join by row index: hit[i] points at the merged total for row i.
  hit <- match(seq_len(total), ids[first])
  is_keeper <- !is.na(hit)
  touched <- seq_len(total) %in% unlist(lapply(merges, function(m) m$span))

  dat[["Count"]][is_keeper] <- totals[hit[is_keeper]]
  out <- dat[!(touched & !is_keeper), 1:2, drop = FALSE]
  row.names(out) <- seq_len(nrow(out))
  out
}

# Checks and timings as reported in the original message. They are kept as a
# transcript because dat1 is only defined further down, and the timings were
# measured on the poster's machine with the implementations as first posted.
#
# identical(fun1(dat1,1),fun2(dat1,1))
# #[1] TRUE
# identical(fun1(fun1(dat1,1),2),fun2(fun2(dat1,1),2))
# #[1] TRUE
# identical(fun1(fun1(fun1(dat1,1),2),3),fun2(fun2(fun2(dat1,1),2),3))
# #[1] TRUE
#
# #Speed
# set.seed(185)
# datT <- data.frame(Position = sample(10:80,1e5,replace=TRUE),Count= sample(c(NA, 10:100),1e5, replace=TRUE))
# system.time(res <- fun1(datT,1))
# #  user  system elapsed
# # 0.676   0.000   0.676
# system.time(res2 <- fun2(datT,1))
# #  user  system elapsed
# # 1.240   0.000   1.237
# identical(res,res2)
# #[1] TRUE
#
# A.K.
#
# On Friday, October 18, 2013 4:19 PM, arun <smartpink...@yahoo.com> wrote:
# Hi,
# May be this helps:
dat1 <- structure(
  list(
    Position = c(
      15L, 22L, 38L, 49L, 55L, 61L, 62L, 14L, 29L, 63L, 46L, 22L,
      18L, 24L, 22L, 49L, 42L, 38L, 29L, 22L, 29L, 23L, 42L
    ),
    Count = c(
      15L, NA, NA, 5L, NA, 17L, 18L, NA, NA, NA, 8L, NA,
      20L, NA, NA, 16L, 19L, NA, NA, NA, 13L, NA, 33L
    )
  ),
  .Names = c("Position", "Count"),
  class = "data.frame",
  row.names = c(NA, -23L)
)
# There might be simple solutions.
#' Collapse runs of exactly `n` consecutive NA counts
#'
#' For every run of exactly `n` rows whose `Count` is NA, the run together
#' with the row directly before and directly after it is replaced by a single
#' row: the neighbour with the largest `Position`, carrying the sum of the
#' neighbours' counts.
#'
#' Behaviour worth knowing:
#' * The surviving row is chosen by the largest `Position` (first one on
#'   ties). NOTE(review): the request quoted below asks for the position of
#'   the larger `Count`; the printed results below follow the `Position`
#'   rule, so it is kept.
#' * A run that touches the first or last row has only one neighbour; that
#'   neighbour is kept with its own count and the NA rows are dropped.
#' * If two runs pick the same surviving row, only the first run's sum is
#'   used, so row labels never collide.
#' * If no run of length `n` exists, `dat` is returned with its rows
#'   untouched.
#'
#' @param dat Data frame with columns `Position` and `Count`.
#' @param n Length of the NA runs to collapse.
#' @return `dat` without the collapsed rows; row names are reset to
#'   `1..nrow`.
fun1 <- function(dat, n) {
  is_gap <- is.na(dat[, "Count"])
  runs <- rle(is_gap)
  gap_len <- runs$lengths[runs$values]
  # Row positions of the NA rows that sit in a run of exactly n NAs.
  gap_rows <- which(is_gap)[rep(gap_len, gap_len) == n]
  total <- nrow(dat)

  merges <- list()
  if (length(gap_rows) > 0L) {
    # Every selected run has exactly n rows, so chunks of n are whole runs.
    chunk <- (seq_along(gap_rows) - 1L) %/% n + 1L
    merges <- lapply(unname(split(gap_rows, chunk)), function(rows) {
      span <- c(min(rows) - 1L, rows, max(rows) + 1L)
      span <- span[span >= 1L & span <= total]
      donors <- span[!is.na(dat[span, "Count"])]
      if (length(donors) == 0L) {
        return(NULL)
      }
      at_peak <- dat[donors, "Position"] %in% max(dat[donors, "Position"])
      list(
        span = span,
        keep = as.integer(donors[at_peak][1L]),
        count = sum(dat[donors, "Count"])
      )
    })
    merges <- Filter(Negate(is.null), merges)
  }

  # Nothing to collapse: previously this path failed instead of returning dat.
  if (length(merges) == 0L) {
    row.names(dat) <- seq_len(total)
    return(dat)
  }

  keep <- vapply(merges, function(m) m$keep, integer(1))
  first <- !duplicated(keep)
  # unlist() instead of vapply() so an integer Count column stays integer.
  new_count <- unlist(lapply(merges[first], function(m) m$count))
  drop <- setdiff(unlist(lapply(merges, function(m) m$span)), keep)

  # The surviving row already holds the largest Position; only Count changes.
  dat[["Count"]][keep[first]] <- new_count
  out <- dat[-drop, , drop = FALSE]
  row.names(out) <- seq_len(nrow(out))
  out
}

# dat1N <- fun1(dat1,1)
# dat1N
#    Position Count
# 1        15    15
# 2        22    NA
# 3        38    NA
# 4        61    22
# 5        62    18
# 6        14    NA
# 7        29    NA
# 8        63    NA
# 9        46    28
# 10       24    NA
# 11       22    NA
# 12       49    16
# 13       42    19
# 14       38    NA
# 15       29    NA
# 16       22    NA
# 17       42    46
#
# dat2N <- fun1(dat1N,2)
# dat2N
#    Position Count
# 1        61    37
# 2        62    18
# 3        14    NA
# 4        29    NA
# 5        63    NA
# 6        49    44
# 7        42    19
# 8        38    NA
# 9        29    NA
# 10       22    NA
# 11       42    46
#
# dat3N <- fun1(dat2N,3)
# dat3N
#   Position Count
# 1       61    37
# 2       62    62
# 3       42    65
#
# A.K.
#
# Hi all,
# I have a dataset with 2 important columns, "Position" and "Count". There are
# a total of 34,532 rows, but only 457 non-NA values in the "Count" column
# (every cell in "Position" column has a value). I need to write a loop to
# march down the rows, and if there are 2 rows in "Count" where there is only
# 1 NA row between them, sum the two values up and print only one row with the
# summed Count value and the Position value that corresponds to the larger
# Count value, thus making the three rows into one. For example:
#
# Position Count
# 15       15
# 22       NA
# 38       NA
# 49       5
# 55       NA
# 61       17
#
# would become
#
# Position Count
# 15       15
# 22       NA
# 38       NA
# 61       22
#
# After this step, I also need to write another script to march down the rows
# and look for rows with only two NA's between non-NA rows in Count.
This would make the previous data become:
Position Count
61 37
Ideally, I would like a loop in which the number of NAs allowed between adjacent non-NA values can be freely changed. I would greatly appreciate any insight on this.
______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.