Hi

I am new to R so I apologize if this is trivial.

I am trying to predict the resistance or susceptibility of my sequences to a certain drug with a randomForest function from a file with amino acids on each of the positions in the protein. I ran the following:

library(randomForest)

path <- "C:\\..."
path2 <- "..."
name <- "..."

actualFileName <- paste(path, path2, name, ".txt", sep="")

# reading in the training dataset
dat1 <- read.table(actualFileName, header=TRUE, sep="\t", colClasses="character")

head(dat1)
  X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 ... SR
1  M  K  V  K  L  L  V  L  L   C   T   F   T   A   T   Y   A ... suscep
2  M  K  V  K  L  L  V  L  L   C   T   F   A   A   T   Y   A ... suscep
3  M  K  V  K  L  L  V  L  L   C   T   F   T   A   T   Y   A ... resist
...


# some of the important sites identified by fisher test
dat1$X13 <- as.factor(dat1$X13)
dat1$X52 <- as.factor(dat1$X52)
dat1$X53 <- as.factor(dat1$X53)
dat1$X64 <- as.factor(dat1$X64)
dat1$X85 <- as.factor(dat1$X85)
dat1$X99 <- as.factor(dat1$X99)
dat1$X111 <- as.factor(dat1$X111)
dat1$X142 <- as.factor(dat1$X142)
dat1$X157 <- as.factor(dat1$X157)
dat1$X158 <- as.factor(dat1$X158)
dat1$X162 <- as.factor(dat1$X162)
dat1$X169 <- as.factor(dat1$X169)
dat1$X200 <- as.factor(dat1$X200)
dat1$X202 <- as.factor(dat1$X202)
dat1$X203 <- as.factor(dat1$X203)
dat1$X205 <- as.factor(dat1$X205)
dat1$X206 <- as.factor(dat1$X206)
dat1$X209 <- as.factor(dat1$X209)
dat1$X210 <- as.factor(dat1$X210)
dat1$X225 <- as.factor(dat1$X225)
dat1$X269 <- as.factor(dat1$X269)
dat1$X283 <- as.factor(dat1$X283)
dat1$X290 <- as.factor(dat1$X290)
dat1$X432 <- as.factor(dat1$X432)
dat1$X434 <- as.factor(dat1$X434)
dat1$X455 <- as.factor(dat1$X455)
dat1$X467 <- as.factor(dat1$X467)
dat1$X512 <- as.factor(dat1$X512)
dat1$SR <- as.factor(dat1$SR)



dat1.rf <-randomForest(SR ~ X13+ X52+ X53+ X64+ X85+ X99+ X111+ X142+ X157+ X158+ X162+ X169+ X200+
+ X202+ X203+ X205+ X206+ X209+ X210+ X225+ X269+ X283+ X290+ X432+ X434+ X455+ X467+ X512, data=dat1, importance=TRUE,
+ proximity=TRUE, varUsed=TRUE, ntree=5000, varImpPlot=TRUE)


print(dat1.rf)

varImpPlot(dat1.rf)

varUsed(dat1.rf, by.tree=FALSE, count=TRUE)


MDSplot(dat1.rf, dat1$SR, palette=rep(1, 2),
+   pch=as.numeric(dat1$SR))


path3 <- "C:\\Users..."
path4 <- "..."
name2 <- "..."

# reading in the test dataset
actualFileName2 <- paste(path3, path4, name2, ".txt", sep="")

dat2 <- read.table(actualFileName2, header=TRUE, sep="\t", colClasses="character")



dat2$X13 <- as.factor(dat2$X13)
dat2$X52 <- as.factor(dat2$X52)
dat2$X53 <- as.factor(dat2$X53)
dat2$X64 <- as.factor(dat2$X64)
dat2$X85 <- as.factor(dat2$X85)
dat2$X99 <- as.factor(dat2$X99)
dat2$X111 <- as.factor(dat2$X111)
dat2$X142 <- as.factor(dat2$X142)
dat2$X157 <- as.factor(dat2$X157)
dat2$X158 <- as.factor(dat2$X158)
dat2$X162 <- as.factor(dat2$X162)
dat2$X169 <- as.factor(dat2$X169)
dat2$X200 <- as.factor(dat2$X200)
dat2$X202 <- as.factor(dat2$X202)
dat2$X203 <- as.factor(dat2$X203)
dat2$X205 <- as.factor(dat2$X205)
dat2$X206 <- as.factor(dat2$X206)
dat2$X209 <- as.factor(dat2$X209)
dat2$X210 <- as.factor(dat2$X210)
dat2$X225 <- as.factor(dat2$X225)
dat2$X269 <- as.factor(dat2$X269)
dat2$X283 <- as.factor(dat2$X283)
dat2$X290 <- as.factor(dat2$X290)
dat2$X432 <- as.factor(dat2$X432)
dat2$X434 <- as.factor(dat2$X434)
dat2$X455 <- as.factor(dat2$X455)
dat2$X467 <- as.factor(dat2$X467)
dat2$X512 <- as.factor(dat2$X512)
dat2$SR <- as.factor(dat2$SR)



dat2.pred<-predict(dat1.rf, dat2, type="response", norm.votes=TRUE, predict.all=FALSE, proximity=FALSE, nodes=FALSE)

Error in predict.randomForest(dat1.rf, dat2, type = "response", norm.votes = TRUE, :
  New factor levels not present in the training data

The thing is that each of the amino acid positions in the training dataset is present also in the training dataset. So I don't know how to deal with the error.

Thank you very much.

Kind regards,

Mojca Zelnikar

--
The University of Edinburgh is a charitable body, registered in
Scotland, with registration number SC005336.

______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Reply via email to