Hi List,

I am looking for a variable selection procedure with a forward-backward 
selection method.
Firstly, it is meant to work with the cophenetic
correlation coefficient (CPCC) and intended to find the variable combination 
with the
highest cophenetic correlation. Secondly, it is aimed at Gower metric with
wards method (though this could be easily extended) aimed at categorical data.

What I have so far is a function for backward selection that returns the 
variables
deleted and associated CPCC.

My current approach is cumbersome and very slow when working with large data 
sets (mostly
because of the proximity matrix calculation). There are also problems with 
using only
backward selection, so a way of combining forward-backward would be much 
better. I was hoping that someone has a better /faster selection procedure that 
can be adapted to using the CPCC.

Below my backward selection function and example.

Thanks and cheers
Herry

################################################

require(cluster)

cophenCbw<-function(dta){
# cophenetic variable selection backward
if(!is.data.frame(dta)) {print("x must be a dataframe with variables as 
columns, cases as rows")}
else if(ncol(dta) <3) {pring("input dataframe must have at least 3 columns")}
else {
#currently function only performs cophenC on gower with ward, but this can be 
adjusted easily to other metrics/methods
require(cluster)
require(ade4)
dta->dta.sic
lhs<-dta
for(j in 1:ncol(dta)){
 print(paste("round", j))
 as.data.frame(matrix(ncol=4, nrow=0))->testm

 for(i in 0:ncol(lhs)) {
  if(i == 0){
   daisy(lhs, metric="gower")->d.all
     agnes(lingoes(d.all),method="ward")->agnes.d.all
     cophenetic(agnes.d.all)->d1
     cor(d1,d.all)->cc
     testm<-data.frame(varID=0,cophenC=round(cc,3),varsdel=NA,round=0)
  }
  else {
     daisy(lhs[,-i], metric="gower")->d.all
     agnes(lingoes(d.all),method="ward")->agnes.d.all
     cophenetic(agnes.d.all)->d1
     cor(d1,d.all)->cc
     
testm<-rbind(testm,data.frame(varID=i,cophenC=round(cc,3),varsdel=colnames(dta)[i],round=j))
     #print(paste("var", i, "out of",ncol(lhs),"nrows", 
nrow(lhs),"rowsInTestm:",nrow(testm)))
   }
 }
 if(j == 1) {
  testm[testm[,2] ==  min(testm[,2]),][1,]->varsdel    #use only the first if 
there are several
   vars2del<-varsdel[j,3]
  lhs<-dta[,-which(colnames(dta) %in% vars2del)]
   
print(paste("var2delete",varsdel[j,1],varsdel[j,3],"cophenC=",varsdel[j,2],"rowsInTestm:",nrow(testm)))


 }   # put exclusion variable into record
 else {
  rbind(varsdel,testm[testm[,2] ==  min(testm[,2]),])->varsdel
   vars2del<-rbind(vars2del,varsdel[j,3])
  lhs<-dta[,-which(colnames(dta) %in% vars2del)]
   
print(paste("var2delete",varsdel[j,1],varsdel[j,3],"cophenC=",varsdel[j,2],"rowsInTestm:",nrow(testm)))
  }
 if(is.na(varsdel[j,3])) break

  }
  
 }
return(varsdel)
}

cophenCbw(plantTraits)


########################################################
______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Reply via email to