Re: [R] Loop for taking sum of rows based on proximity to other non-NA rows

Jeff Newmiller Sun, 20 Oct 2013 14:14:00 -0700

I thought this question looked interesting enough to make my own stab atit, but in hindsight I think this business of combining the counts seemsquite unlikely to be necessary... it would be simpler and less damaging tothe original data pattern to just remove groups of rows having fewer than"N" NAs.


removeNNAs <- function( dat, N, lessOrEqual=FALSE ) {
  N1 <- N+1
  rx <- rle( !is.na( dat$Count ) )
  # indexes of the ends of each run of NAs or non-NAs
  cs <- cumsum( rx$lengths )
  # indexes of the ends of runs of NAs or non-NAs
  cs2 <- cs[ !rx$values ]
  # If the first Count is NA, then drop first run of NAs
  if ( !rx$values[1] ) {
    cs2 <- cs2[ -1 ]
  }
  # If the last Count is NA, then drop last run of NAs
  if ( !rx$values[ length( rx$values ) ] ) {
    cs2 <- cs2[ -length( cs2 ) ]
  }
  # cs2 is indexes of rows to potentially receive deleted Counts
  # after collapse
  cs2 <- cs2 + 1
  # cs1 is indexes of non-NA Counts to be deleted
  cs1 <- cs[ rx$values ][ seq.int( length( cs2 ) ) ]
  # identify the indexes of the Count values before the strings
  # of NAs that meet the criteria
  if ( lessOrEqual ) {
    idx0 <- N1 >= ( cs2 - cs1 )
  } else {
    idx0 <- N1 == ( cs2 - cs1 )
  }
  idx1 <- cs1[ idx0 ]
  # identify the indexes of the Count values after the strings of
  # NAs that meet the criteria
  idx2 <- cs2[ idx0 ]
  # Identify which indexes are both sources and destinations
  idx1c <-c( idx2[ -length( idx2 ) ] == idx1[ -1 ], FALSE )
  # identify groups of indexes that need to be merged
  idx1g <- rev( cumsum( rev( !idx1c ) ) )
  # find which elements of idx1 represent the beginning of a
  # sequence of indexes to be replaced (meta-indexes)
  srcmidxs <- which( -1 == diff( c( idx1g[ 1 ] + 1, idx1g ) ) )
  # find which elements of idx2 represent the end of a sequence
  # to be  replaced (meta-indexes)
  destmidxs <- which( 1 == rev( diff( rev( c( idx1g, 0 ) ) ) ) )
  # add counts from before NAs to destination rows
  result <- dat
  srcidxList <- vector( mode="list", length=length( destmidxs ) )
  for ( i in seq.int( length( destmidxs ) ) ) {
    # row to which data will be copied
    destidx <- idx2[ destmidxs[ i ] ]
    # sequence of indexes of source rows
    srcidxss <- seq.int( from=idx1[ srcmidxs[ i ] ], to=destidx - 1 )
    result[ destidx, "Count" ] <- ( dat[ destidx, "Count" ]
                       + sum( dat[ srcidxss, "Count" ], na.rm=TRUE )
    # keep a list of indexes to be removed
    srcidxList[ i ] <- list( srcidxss )
  }
  # remove source rows
  result <- result[ -unlist( srcidxList ), ]
  result
}



On Fri, 18 Oct 2013, arun wrote:



Hi,

Found a bug in the function when tested.  So, try this (added one more line):

#Modified function
fun1 <- function(dat,n) {
 rl <- rle(is.na(dat[,"Count"]))
indx <- 
which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
 lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
                         x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
                     x2 <- x1[!is.na(x1$Count),]
                     datN <- 
data.frame(Position=max(x2$Position),Count=sum(x2$Count))
                     rowN <- row.names(x2)[x2$Position %in% max(x2$Position)]  
                     row.names(datN) <- if(length(rowN)>1) rowN[1] else rowN
                     datN
                    })
names(lst1) <- NULL
lst1 <- lst1[!duplicated(sapply(lst1,row.names))] ######added
dat2 <- do.call(rbind,lst1)
indx2 <-  
sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) 
c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))

dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
row.names(dat1New) <- 1:nrow(dat1New)
dat1New
}



#Another function
fun2 <- function(dat,n){
 indx <- cumsum(c(1,abs(diff(is.na(dat[,"Count"])))))
 indx1 <- indx[is.na(dat[,"Count"])]
 names(indx1) <- which(is.na(dat[,"Count"]))
indx2 <- indx1[indx1 %in% names(table(indx1))[table(indx1)==n]]
lst1 <- tapply(seq_along(indx2),list(indx2),FUN=function(i) {
                            x1 <- indx2[i]
                             x2 <- as.numeric(names(x1))
                             x3 <- dat[c(min(x2)-1L,x2,max(x2)+1L),]
                             x4 <- subset(x3, !is.na(Count))
                             x5 <- 
data.frame(Position=max(x4$Position),Count=sum(x4$Count))
                            ind <- x4$Position %in% max(x4$Position)
                             row.names(x5) <- if(sum(ind)>1) 
row.names(x4)[ind][1] else row.names(x4)[ind]
                            x5
                        })
attr(lst1,"dimnames") <- NULL
 dat2 <- do.call(rbind,lst1)
indx3 <- sort(unlist(tapply(seq_along(indx2),list(indx2),FUN=function(i) {x1 <- 
indx2[i]
                                     x2 <- as.numeric(names(x1))
                                     c(min(x2)-1L, x2, 
max(x2)+1L)}),use.names=FALSE))

dat$id <- 1:nrow(dat)
dat2$id <- as.numeric(row.names(dat2))
library(plyr)
res <- join(dat,dat2[,-1],by="id",type="left")
res1 <- res[!((row.names(res) %in% indx3) & is.na(res[,4])),]
res1[,2][!is.na(res1[,4])] <- res1[,4][!is.na(res1[,4])]
res2 <- res1[,1:2]
row.names(res2) <- 1:nrow(res2)
res2
}


identical(fun1(dat1,1),fun2(dat1,1))
#[1] TRUE
identical(fun1(fun1(dat1,1),2),fun2(fun2(dat1,1),2))
#[1] TRUE

identical(fun1(fun1(fun1(dat1,1),2),3),fun2(fun2(fun2(dat1,1),2),3))
#[1] TRUE


#Speed
set.seed(185)
datT <- data.frame(Position = sample(10:80,1e5,replace=TRUE),Count= 
sample(c(NA, 10:100),1e5, replace=TRUE))
 system.time(res <- fun1(datT,1))
 #  user  system elapsed
 # 0.676   0.000   0.676
 system.time(res2 <- fun2(datT,1))
#   user  system elapsed
#  1.240   0.000   1.237
 identical(res,res2)
#[1] TRUE

A.K.









On Friday, October 18, 2013 4:19 PM, arun <smartpink...@yahoo.com> wrote:
Hi,

May be this helps:

dat1 <- structure(list(Position = c(15L, 22L, 38L, 49L, 55L, 61L, 62L,
14L, 29L, 63L, 46L, 22L, 18L, 24L, 22L, 49L, 42L, 38L, 29L, 22L,
29L, 23L, 42L), Count = c(15L, NA, NA, 5L, NA, 17L, 18L, NA,
NA, NA, 8L, NA, 20L, NA, NA, 16L, 19L, NA, NA, NA, 13L, NA, 33L
)), .Names = c("Position", "Count"), class = "data.frame", row.names = c(NA,
-23L))


#There might be simple solutions.

fun1 <- function(dat,n) {
 rl <- rle(is.na(dat[,"Count"]))
indx <- 
which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
 lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
                         x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
                     x2 <- x1[!is.na(x1$Count),]
                     datN <- 
data.frame(Position=max(x2$Position),Count=sum(x2$Count))
                     rowN <- row.names(x2)[x2$Position %in% max(x2$Position)]   
                     row.names(datN) <- if(length(rowN)>1) rowN[1] else rowN
                     datN
                    })
names(lst1) <- NULL
dat2 <- do.call(rbind,lst1)
indx2 <-  
sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) 
c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))

dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
row.names(dat1New) <- 1:nrow(dat1New)
dat1New
}

dat1N <- fun1(dat1,1)
dat1N
   Position Count
1        15    15
2        22    NA
3        38    NA
4        61    22
5        62    18
6        14    NA
7        29    NA
8        63    NA
9        46    28
10       24    NA
11       22    NA
12       49    16
13       42    19
14       38    NA
15       29    NA
16       22    NA
17       42    46

dat2N <- fun1(dat1N,2)
dat2N
   Position Count
1        61    37
2        62    18
3        14    NA
4        29    NA
5        63    NA
6        49    44
7        42    19
8        38    NA
9        29    NA
10       22    NA
11       42    46
dat3N <- fun1(dat2N,3)
dat3N
  Position Count
1       61    37
2       62    62
3       42    65

A.K.









Hi all, I have a dataset with 2 important columns, "Position" and
"Count". There are a total of 34,532 rows, but only 457 non-NA values in the "Count" 
column (every cell in "Position" column has a value). I
need to write a loop to march down the rows, and if there are 2 rows in
"Count" where there is only 1 NA row between them, sum the two values up and 
print only one row with the summed Count value and the Position
value that corresponds to the larger Count value, thus making the three
rows into one. For example:

Position Count
15 15
22 NA
38 NA
49 5
55 NA
61 17

would become

Position Count
15 15
22 NA
38 NA
61 22

After this step, I also need to write another script to march
down the rows and look for rows with only two NA's between non-NA rows
in Count. This would make the previous data become

Position Count
61 37

Ideally I would like a loop that can be flexibly adjusted to the
number of NA's in between adjacent non-NA values that can be freely
changed. I would greatly appreciate any insight for this.

______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.


---------------------------------------------------------------------------
Jeff Newmiller                        The     .....       .....  Go Live...
DCN:<jdnew...@dcn.davis.ca.us>        Basics: ##.#.       ##.#.  Live Go...
                                      Live:   OO#.. Dead: OO#..  Playing
Research Engineer (Solar/Batteries            O.O#.       #.O#.  with
/Software/Embedded Controllers)               .OO#.       .OO#.  rocks...1k
---------------------------------------------------------------------------

______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Re: [R] Loop for taking sum of rows based on proximity to other non-NA rows

Reply via email to