Hi all,
I'm trying to learn how to use xgboost. I'm working with the 2016 GSS dataset 
(attached), trying to determine which variables influence number of children. 
I've already used this dataset to teach myself GLMs, decision trees, and 
random forests.
My problem is that I can't get the xgboost script (attached) to predict number 
of children in the testing set. Predictions range from 0.504 to 0.518, but 
number of children should be an integer anywhere from 0 to 8. If anyone can 
show me what I'm doing wrong, I will be very grateful.
Thanks,
Josh
# First, let's load our libraries.
library(ggplot2)
library(caret)
library(xgboost)
# Import GSS_SM and do all necessary data cleaning
gss <- read.csv("gss_sm.csv")
head(gss)  # Checking the variables
ncol(gss)  # 33
summary(gss)
nrow(gss)  # 2867 - we will see that only about 300 are usable for our purposes
gss <- gss[, 4:33]  # Removing identification numbers and the year 2016
# Check for outliers
boxplot(gss$age)  # no outliers
boxplot(gss$childs)  # one outlier, 8 children - leave it in
boxplot(gss$sibs)  # Many outliers, 9 and over - most likely errors
gss <- subset(gss, sibs < 9)  # Remove these errors
levels(gss$marital)  # Will collapse this below to two categories: never married vs. ever married
levels(gss$padeg)  # Should transform this to numbers 1-5
levels(gss$madeg)  # same
levels(gss$partyid)  # Should transform this to numbers 1-8
levels(gss$polviews)  # Same, 1-7
levels(gss$happy)  # Will collapse this to a binary indicator below
levels(gss$partners)  # Transform to numbers as well - 1-8 after dropping one category below
table(gss$partners)
# Remove "1 or more, # Unknown", there are only 9, assigning could bias results
gss <- subset(gss, partners != "1 or More, # Unknown")
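# Also drop the now-empty factor level so it doesn't linger as a zero count
# in later tables (a small extra step, not strictly required)
gss$partners <- droplevels(gss$partners)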
boxplot(gss$wtssall)
# The 11 outliers have weights over 3; they ended up biasing results,
# so I removed them
gss <- subset(gss, wtssall < 3)
# I checked the other variables as well; no changes were needed, so I deleted those checks
# Let's convert some variables to numeric
# Names are based on original data's naming method, adding "num" when necessary
gss$degnum <- sapply(as.character(gss$degree), switch, "Lt High School" = 1, 
                     "High School" = 2, "Junior College" = 3, 
                     "Bachelor" = 4, "Graduate" = 5, USE.NAMES = F)
gss$padegnum <- sapply(as.character(gss$padeg), switch, "Lt High School" = 1, 
                       "High School" = 2, "Junior College" = 3, 
                       "Bachelor" = 4, "Graduate" = 5, USE.NAMES = F)
gss$madegnum <- sapply(as.character(gss$madeg), switch, "Lt High School" = 1, 
                       "High School" = 2, "Junior College" = 3, 
                       "Bachelor" = 4, "Graduate" = 5, USE.NAMES = F)
gss$partynum <- sapply(as.character(gss$partyid), switch, 
                       "Strong Republican" = 1, 
                       "Not Str Republican" = 2, "Ind,near Rep" = 3, 
                       "Independent" = 4, "Other Party" = 5,
                       "Ind,near Dem" = 6, "Not Str Democrat" = 7,
                       "Strong Democrat" = 8, USE.NAMES = F)
gss$polnum <- sapply(as.character(gss$polviews), switch, 
                     "Extremely Conservative" = 1, 
                     "Conservative" = 2, "Slightly Conservative" = 3, 
                     "Moderate" = 4, "Slightly Liberal" = 5,
                     "Liberal" = 6, "Extremely Liberal" = 7, USE.NAMES = F)
gss$partrnum <- sapply(as.character(gss$partners), switch, "No Partners" = 1, 
                       "1 Partner" = 2, "2 Partners" = 3, 
                       "3 Partners" = 4, "4 Partners" = 5,
                       "5-10 Partners" = 6, "11-20 Partners" = 7,
                       "21-100 Partners" = 8, USE.NAMES = F)
# Convert the lower end of each income range to a number
gss$incnum <- as.integer(substr(as.character(gss$income_rc), 5, 
                                nchar(as.character(gss$income_rc))))
boxplot(gss$incnum)  # Only one has 170K or over, leave that one in
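# Spot-check the parsing against the original labels (this assumes the numeric
# part of every income_rc label starts at character 5)
head(unique(data.frame(gss$income_rc, gss$incnum)))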
# Remove the original variables, plus redundant variables
gss[, c("degree", "padeg", "madeg", "partners", "partyid", "polviews",
        "income_rc", "pres12", "kids", "siblings", "ageq")] <- NULL
gss$evermar <- ifelse(gss$marital != "Never Married", 1, 0)
gss$marital <- NULL
# Fill in the NULLs: assume "High School" for education, middle-of-the-road categories for politics
gss$degnum <- as.integer(gsub("NULL", 2, gss$degnum))
gss$padegnum <- as.integer(gsub("NULL", 2, gss$padegnum))
gss$madegnum <- as.integer(gsub("NULL", 2, gss$madegnum))
gss$partynum <- as.integer(gsub("NULL", 4, gss$partynum))
gss$polnum <- as.integer(gsub("NULL", 4, gss$polnum))
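# Verify the imputation left no NAs behind in these columns
colSums(is.na(gss[, c("degnum", "padegnum", "madegnum", "partynum", "polnum")]))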
# Let's look at number of children by parent's age
ggplot(data = gss, mapping = aes(x = age, y = childs)) + geom_count() + 
  geom_smooth(method = "loess") + facet_wrap(~ sex)
# Increases steadily until the early 40s, drops, then rises again after 60
# Why would this be?
# First, people usually have children in their twenties and thirties.
# Second, people over 60 (born in the 1950s and before) had more children.
# Cultural change has led people born since then to have fewer.
# To look at completed families after this change, let's take a subset.
# This subset of our data will only include people ages 40 to 60, inclusive.
mid.age <- subset(gss, age >= 40 & age <= 60)
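nrow(mid.age)  # this subset, plus the na.omit() below, is where we get down
               # to the ~300 usable rows mentioned near the top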

# Let's use a gradient boosting machine.
# xgboost only accepts numeric features, so convert as many as possible
# Some of the split points below came from earlier iterations of this analysis
mid.age$male <- ifelse(mid.age$sex == "Male", 1, 0)
mid.age$white <- ifelse(mid.age$race == "White", 1, 0)
mid.age$happy <- ifelse(mid.age$happy == "Not Too Happy", 0, 1)
mid.age$legal <- ifelse(mid.age$grass == "Legal", 1, 0)
mid.age$central <- ifelse(as.character(mid.age$region) %in% 
                            c("E. Nor. Central", "E. Sou. Central", 
                              "W. Nor. Central", "W. Sou. Central"), 1, 0)
mid.age$coastal <- ifelse(as.character(mid.age$region) %in% 
                            c("Pacific", "Middle Atlantic"), 1, 0)
mid.age[, c("partrnum", "race", "sex", "region", "relig", "grass", 
            "zodiac", "income16", "agegrp", "religion", "bigregion", 
            "partners_rc", "polnum")] <- NULL

# Now for the boosting
set.seed(15131)  # Number another person would be unlikely to repeat
mid.complete <- na.omit(mid.age)  # drop rows with any remaining NAs
mid.train.rows <- createDataPartition(mid.complete$childs, p = 0.8, 
                                      list = FALSE)
mid.train <- mid.complete[mid.train.rows, ]
mid.test <- mid.complete[-mid.train.rows, ]
# Convert data to a format usable by XGBoost
# A model frame contains a formula and our data frame columns
data.training.mf  <- model.frame(childs ~ ., data = mid.train)

# A model (or design) matrix only contains numerical values; factors are
# dummy-coded by default
data.training.mm <- model.matrix(attr(data.training.mf, "terms"),
                                 data = mid.train)

# An XGB dense matrix wraps an R matrix plus optional metadata
data.training.dm <- xgb.DMatrix(data.training.mm, label = mid.train$childs,
                                missing = -1)
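# Structural check: rows should match mid.train, columns the model matrix
dim(data.training.dm)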

# Testing set
data.testing.mf <- model.frame(childs ~ ., data = mid.test)
data.testing.mm <- model.matrix(attr(data.testing.mf, "terms"), data = mid.test)
data.testing.dm <- xgb.DMatrix(data.testing.mm, label = mid.test$childs,
                               missing = -1)

# Cross-validation
par <- list(nrounds = 500, 
            max_depth = 3,
            eta = 0.1,
            gamma = 0,
            colsample_bytree = 0.6,
            min_child_weight = 0.1,
            subsample = 0.6) 
xgbGrid <- expand.grid(nrounds = 500, 
                       max_depth = c(1:5),
                       eta = c(0.01, 0.1),
                       gamma = 0,
                       colsample_bytree = c(0.6, 0.9),
                       min_child_weight = 0.1,
                       subsample = 0.6)
model.xgb.cv <- xgb.cv(params = par,
                       data = data.training.dm,
                       nrounds = 500,
                       prediction = FALSE,
                       print_every_n = 25,
                       early_stopping_rounds = 50,
                       maximize = TRUE,
                       nfold = 5)
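# Before reusing best_iteration below, it's worth seeing where early stopping
# actually landed (both fields come from xgb.cv's return value)
model.xgb.cv$best_iteration
tail(model.xgb.cv$evaluation_log)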
model.xgb <- xgb.train(params = par,
                       data = data.training.dm,
                       nrounds = model.xgb.cv$best_iteration)
ctrl <- trainControl(method = "cv", number = 2)
# I chose 2 in the interest of computation speed
model.xgb.tuned <- train(childs ~ ., data = mid.train,
                         method = "xgbTree", # This is so we use xgboost
                         trControl = ctrl,
                         tuneGrid = xgbGrid)
ggplot(model.xgb.tuned)
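# The combination caret scored best, for comparison with the plot
model.xgb.tuned$bestTune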
# RMSE is minimized at eta = 0.01, colsample_bytree = 0.9
# At eta = 0.01 the minimum is at max_depth = 1 - a degenerate stump -
# so I went with max_depth = 2 instead
par.tuned <- list(nrounds = 500, 
                  max_depth = 2,
                  eta = 0.01,
                  gamma = 0,
                  colsample_bytree = 0.9,
                  min_child_weight = 0.1,
                  subsample = 0.6) 
model.xgb.final <- xgboost(params = par.tuned,
                           data = data.training.dm,
                           nrounds = model.xgb.cv$best_iteration)
importance <- xgb.importance(feature_names = dimnames(data.training.dm)[[2]], 
                             model = model.xgb.final)
xgb.plot.importance(importance)
prediction <- predict(model.xgb.final, newdata = data.testing.dm)
hist(prediction)
# Predictions range from 0.504 to 0.518. I'm trying to predict the number
# of children.
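# Some numbers for anyone trying to reproduce the problem:
range(prediction)
range(mid.test$childs)
mean(mid.train$childs)  # I'd expect predictions to at least center near this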