Looks like a right parenthesis was dropped. Corrected:
removeNNAs <- function( dat, N, lessOrEqual=FALSE ) { N1 <- N+1 rx <- rle( !is.na( dat$Count ) ) # indexes of the ends of each run of NAs or non-NAs cs <- cumsum( rx$lengths ) # indexes of the ends of runs of NAs or non-NAs cs2 <- cs[ !rx$values ] # If the first Count is NA, then drop first run of NAs if ( !rx$values[1] ) { cs2 <- cs2[ -1 ] } # If the last Count is NA, then drop last run of NAs if ( !rx$values[ length( rx$values ) ] ) { cs2 <- cs2[ -length( cs2 ) ] } # cs2 is indexes of rows to potentially receive deleted Counts # after collapse cs2 <- cs2 + 1 # cs1 is indexes of non-NA Counts to be deleted cs1 <- cs[ rx$values ][ seq.int( length( cs2 ) ) ] # identify the indexes of the Count values before the strings # of NAs that meet the criteria if ( lessOrEqual ) { idx0 <- N1 >= ( cs2 - cs1 ) } else { idx0 <- N1 == ( cs2 - cs1 ) } idx1 <- cs1[ idx0 ] # identify the indexes of the Count values after the strings of # NAs that meet the criteria idx2 <- cs2[ idx0 ] # Identify which indexes are both sources and destinations idx1c <-c( idx2[ -length( idx2 ) ] == idx1[ -1 ], FALSE ) # identify groups of indexes that need to be merged idx1g <- rev( cumsum( rev( !idx1c ) ) ) # find which elements of idx1 represent the beginning of a # sequence of indexes to be replaced (meta-indexes) srcmidxs <- which( -1 == diff( c( idx1g[ 1 ] + 1, idx1g ) ) ) # find which elements of idx2 represent the end of a sequence # to be replaced (meta-indexes) destmidxs <- which( 1 == rev( diff( rev( c( idx1g, 0 ) ) ) ) ) # add counts from before NAs to destination rows result <- dat srcidxList <- vector( mode="list", length=length( destmidxs ) ) for ( i in seq.int( length( destmidxs ) ) ) { # row to which data will be copied destidx <- idx2[ destmidxs[ i ] ] # sequence of indexes of source rows srcidxss <- seq.int( from=idx1[ srcmidxs[ i ] ], to=destidx - 1 ) result[ destidx, "Count" ] <- ( dat[ destidx, "Count" ] + sum( dat[ srcidxss, "Count" ], na.rm=TRUE ) ) # keep a list of indexes to be removed srcidxList[ i ] <- list( srcidxss ) } # remove source rows result <- result[ -unlist( srcidxList ), ] result } On Sun, 20 Oct 2013, Jeff Newmiller wrote:
I thought this question looked interesting enough to make my own stab at it, but in hindsight I think this business of combining the counts seems quite unlikely to be necessary... it would be simpler and less damaging to the original data pattern to just remove groups of rows having fewer than "N" NAs.removeNNAs <- function( dat, N, lessOrEqual=FALSE ) { N1 <- N+1 rx <- rle( !is.na( dat$Count ) ) # indexes of the ends of each run of NAs or non-NAs cs <- cumsum( rx$lengths ) # indexes of the ends of runs of NAs or non-NAs cs2 <- cs[ !rx$values ] # If the first Count is NA, then drop first run of NAs if ( !rx$values[1] ) { cs2 <- cs2[ -1 ] } # If the last Count is NA, then drop last run of NAs if ( !rx$values[ length( rx$values ) ] ) { cs2 <- cs2[ -length( cs2 ) ] } # cs2 is indexes of rows to potentially receive deleted Counts # after collapse cs2 <- cs2 + 1 # cs1 is indexes of non-NA Counts to be deleted cs1 <- cs[ rx$values ][ seq.int( length( cs2 ) ) ] # identify the indexes of the Count values before the strings # of NAs that meet the criteria if ( lessOrEqual ) { idx0 <- N1 >= ( cs2 - cs1 ) } else { idx0 <- N1 == ( cs2 - cs1 ) } idx1 <- cs1[ idx0 ] # identify the indexes of the Count values after the strings of # NAs that meet the criteria idx2 <- cs2[ idx0 ] # Identify which indexes are both sources and destinations idx1c <-c( idx2[ -length( idx2 ) ] == idx1[ -1 ], FALSE ) # identify groups of indexes that need to be merged idx1g <- rev( cumsum( rev( !idx1c ) ) ) # find which elements of idx1 represent the beginning of a # sequence of indexes to be replaced (meta-indexes) srcmidxs <- which( -1 == diff( c( idx1g[ 1 ] + 1, idx1g ) ) ) # find which elements of idx2 represent the end of a sequence # to be replaced (meta-indexes) destmidxs <- which( 1 == rev( diff( rev( c( idx1g, 0 ) ) ) ) ) # add counts from before NAs to destination rows result <- dat srcidxList <- vector( mode="list", length=length( destmidxs ) ) for ( i in seq.int( length( destmidxs ) ) ) { # row to which data will be copied destidx <- idx2[ destmidxs[ i ] ] # sequence of indexes of source rows srcidxss <- seq.int( from=idx1[ srcmidxs[ i ] ], to=destidx - 1 ) result[ destidx, "Count" ] <- ( dat[ destidx, "Count" ] + sum( dat[ srcidxss, "Count" ], na.rm=TRUE ) # keep a list of indexes to be removed srcidxList[ i ] <- list( srcidxss ) } # remove source rows result <- result[ -unlist( srcidxList ), ] result } On Fri, 18 Oct 2013, arun wrote:Hi,Found a bug in the function when tested. So, try this (added one more line):#Modified function fun1 <- function(dat,n) { rl <- rle(is.na(dat[,"Count"]))indx <- which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) { x1 <- dat[c(min(x)-1L,x,max(x)+1L),] x2 <- x1[!is.na(x1$Count),]datN <- data.frame(Position=max(x2$Position),Count=sum(x2$Count)) rowN <- row.names(x2)[x2$Position %in% max(x2$Position)] row.names(datN) <- if(length(rowN)>1) rowN[1] else rowNdatN }) names(lst1) <- NULL lst1 <- lst1[!duplicated(sapply(lst1,row.names))] ######added dat2 <- do.call(rbind,lst1)indx2 <- sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],] dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2 row.names(dat1New) <- 1:nrow(dat1New) dat1New } #Another function fun2 <- function(dat,n){ indx <- cumsum(c(1,abs(diff(is.na(dat[,"Count"]))))) indx1 <- indx[is.na(dat[,"Count"])] names(indx1) <- which(is.na(dat[,"Count"])) indx2 <- indx1[indx1 %in% names(table(indx1))[table(indx1)==n]] lst1 <- tapply(seq_along(indx2),list(indx2),FUN=function(i) { x1 <- indx2[i] x2 <- as.numeric(names(x1)) x3 <- dat[c(min(x2)-1L,x2,max(x2)+1L),] x4 <- subset(x3, !is.na(Count))x5 <- data.frame(Position=max(x4$Position),Count=sum(x4$Count))ind <- x4$Position %in% max(x4$Position)row.names(x5) <- if(sum(ind)>1) row.names(x4)[ind][1] else row.names(x4)[ind]x5 }) attr(lst1,"dimnames") <- NULL dat2 <- do.call(rbind,lst1)indx3 <- sort(unlist(tapply(seq_along(indx2),list(indx2),FUN=function(i) {x1 <- indx2[i]x2 <- as.numeric(names(x1))c(min(x2)-1L, x2, max(x2)+1L)}),use.names=FALSE))dat$id <- 1:nrow(dat) dat2$id <- as.numeric(row.names(dat2)) library(plyr) res <- join(dat,dat2[,-1],by="id",type="left") res1 <- res[!((row.names(res) %in% indx3) & is.na(res[,4])),] res1[,2][!is.na(res1[,4])] <- res1[,4][!is.na(res1[,4])] res2 <- res1[,1:2] row.names(res2) <- 1:nrow(res2) res2 } identical(fun1(dat1,1),fun2(dat1,1)) #[1] TRUE identical(fun1(fun1(dat1,1),2),fun2(fun2(dat1,1),2)) #[1] TRUE identical(fun1(fun1(fun1(dat1,1),2),3),fun2(fun2(fun2(dat1,1),2),3)) #[1] TRUE #Speed set.seed(185)datT <- data.frame(Position = sample(10:80,1e5,replace=TRUE),Count= sample(c(NA, 10:100),1e5, replace=TRUE))system.time(res <- fun1(datT,1)) # user system elapsed # 0.676 0.000 0.676 system.time(res2 <- fun2(datT,1)) # user system elapsed # 1.240 0.000 1.237 identical(res,res2) #[1] TRUE A.K. On Friday, October 18, 2013 4:19 PM, arun <smartpink...@yahoo.com> wrote: Hi, May be this helps: dat1 <- structure(list(Position = c(15L, 22L, 38L, 49L, 55L, 61L, 62L, 14L, 29L, 63L, 46L, 22L, 18L, 24L, 22L, 49L, 42L, 38L, 29L, 22L, 29L, 23L, 42L), Count = c(15L, NA, NA, 5L, NA, 17L, 18L, NA, NA, NA, 8L, NA, 20L, NA, NA, 16L, 19L, NA, NA, NA, 13L, NA, 33L)), .Names = c("Position", "Count"), class = "data.frame", row.names = c(NA,-23L)) #There might be simple solutions. fun1 <- function(dat,n) { rl <- rle(is.na(dat[,"Count"]))indx <- which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) { x1 <- dat[c(min(x)-1L,x,max(x)+1L),] x2 <- x1[!is.na(x1$Count),]datN <- data.frame(Position=max(x2$Position),Count=sum(x2$Count)) rowN <- row.names(x2)[x2$Position %in% max(x2$Position)] row.names(datN) <- if(length(rowN)>1) rowN[1] else rowNdatN }) names(lst1) <- NULL dat2 <- do.call(rbind,lst1)indx2 <- sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],] dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2 row.names(dat1New) <- 1:nrow(dat1New) dat1New } dat1N <- fun1(dat1,1) dat1N Position Count 1 15 15 2 22 NA 3 38 NA 4 61 22 5 62 18 6 14 NA 7 29 NA 8 63 NA 9 46 28 10 24 NA 11 22 NA 12 49 16 13 42 19 14 38 NA 15 29 NA 16 22 NA 17 42 46 dat2N <- fun1(dat1N,2) dat2N Position Count 1 61 37 2 62 18 3 14 NA 4 29 NA 5 63 NA 6 49 44 7 42 19 8 38 NA 9 29 NA 10 22 NA 11 42 46 dat3N <- fun1(dat2N,3) dat3N Position Count 1 61 37 2 62 62 3 42 65 A.K. Hi all, I have a dataset with 2 important columns, "Position" and"Count". There are a total of 34,532 rows, but only 457 non-NA values in the "Count" column (every cell in "Position" column has a value). Ineed to write a loop to march down the rows, and if there are 2 rows in"Count" where there is only 1 NA row between them, sum the two values up and print only one row with the summed Count value and the Positionvalue that corresponds to the larger Count value, thus making the three rows into one. For example: Position Count 15 15 22 NA 38 NA 49 5 55 NA 61 17 would become Position Count 15 15 22 NA 38 NA 61 22 After this step, I also need to write another script to march down the rows and look for rows with only two NA's between non-NA rows in Count. This would make the previous data become Position Count 61 37 Ideally I would like a loop that can be flexibly adjusted to the number of NA's in between adjacent non-NA values that can be freely changed. I would greatly appreciate any insight for this. ______________________________________________ R-help@r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-helpPLEASE do read the posting guide http://www.R-project.org/posting-guide.htmland provide commented, minimal, self-contained, reproducible code.--------------------------------------------------------------------------- Jeff Newmiller The ..... ..... Go Live... DCN:<jdnew...@dcn.davis.ca.us> Basics: ##.#. ##.#. Live Go... Live: OO#.. Dead: OO#.. Playing Research Engineer (Solar/Batteries O.O#. #.O#. with /Software/Embedded Controllers) .OO#. .OO#. rocks...1k ---------------------------------------------------------------------------
--------------------------------------------------------------------------- Jeff Newmiller The ..... ..... Go Live... DCN:<jdnew...@dcn.davis.ca.us> Basics: ##.#. ##.#. Live Go... Live: OO#.. Dead: OO#.. Playing Research Engineer (Solar/Batteries O.O#. #.O#. with /Software/Embedded Controllers) .OO#. .OO#. rocks...1k ---------------------------------------------------------------------------
______________________________________________ R-help@r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.