library(rpart)
library(tidyverse)
library(broom)
# Load & Prepare Data -----------------------------------------------------
# Based on data from a study conducted at the University of Texas in Austin
# (https://www.openintro.org/stat/data/evals.php) we investigate associations
# between teacher evalutions (scores between 1-5) and attributes of the
# professor including:
# -their rank
# -their ethnicity
# -their gender
# -whether or not english was their mother tongue
# -their age
# -their average "beauty" rating
load(url("http://www.openintro.org/stat/data/evals.RData"))
evals <- evals %>%
tbl_df() %>%
select(score, ethnicity, gender, language, age, bty_avg, rank)
View(evals)
# Beauty average? For real? Apparently so:
# 1. The red line is the mean teaching score by itself (using no other information
# about the teachers)
# 2. The blue is the the mean teaching score controlling for beauty. There is
# a significant non-zero (in fact positive) slope.
# i.e. holding all other variables constant, there appears to be a positive
# correlation betwewn beauty score and teaching score. Is it causal? We can't
# say with the info we have.
ggplot(evals, aes(x=bty_avg, y=score)) +
geom_jitter() +
geom_hline(yintercept=mean(evals$score), col="red", size=1) +
geom_smooth(method="lm") +
labs(x="Beauty Score", y="Teaching Score")
# We must first convert all our categorical variables to numerical ones. We use
# the match() function to have the specified coding of the levels of the
# categorical variables.
evals <- evals %>%
mutate(
# Rank: 0 = teaching, 1 = tenure track, 2 = tenure
rank = match(rank, c("teaching", "tenure track", "tenured")) - 1,
# Ethnicity: 0 = non-minority, 1 = minority
ethnicity = match(ethnicity, c("not minority", "minority")) - 1,
# Gender: 0 = male, 1 = female
gender = match(gender, c("male", "female")) - 1,
# Language: 0 = native english speaker, 1 = non-native english speaker
language = match(language, c("english", "non-english")) - 1
)
# Fit CART model ----------------------------------------------------------
# The "depth of tree" is among the many knobs we can control.
knobs <- rpart.control(maxdepth = 3)
# Run the following to see other knobs and stopping criteria
?rpart.control
# We fit the CART model:
model_formula <- as.formula("score ~ ethnicity + gender + language + age + bty_avg + rank")
model_CART <- rpart(model_formula, data = evals, control=knobs)
# Let's study the output:
print(model_CART)
summary(model_CART)
# Blech... Hideous. broom package to the...
broom::tidy(model_CART)
broom::augment(model_CART)
broom::glance(model_CART)
# Alas, the broom package does not recognize objects of type rpart! We cannot
# convert the output to tidy data! The authors of the broom package are aware
# of this shortcoming however:
# https://github.com/tidyverse/broom/issues/153
# So we need to use base R plotting tools. You may need to toy with the margin
# parameter in the plot():
plot(model_CART, margin=0.25)
text(model_CART, use.n = TRUE)
box()
# Wait, what? Recall: Yes go left, no go right and
# Rank: 0 = teaching, 1 = tenure track, 2 = tenure
# Ethnicity: 0 = non-minority, 1 = minority
# Gender: 0 = male, 1 = female
# Language: 0 = native english speaker, 1 = non-native english speaker
evals %>%
filter(bty_avg >= 2.167) %>%
filter(gender >= 0.5) %>%
group_by(old = age >= 34.5) %>%
summarise(count=n(), mean_eval=mean(score))
# The usual predict function for all the original data and for newdata. Note
# the %>% pipe needs to send the 20 randomly sampled rows from evals to the 2nd
# argument of predict(), and not the first as is the default, so we use the
# period to indicate where it should be piped to:
predict(model_CART)
evals %>%
sample_n(20) %>%
predict(model_CART, newdata=.)
# Exercises ---------------------------------------------------------------
# 1. Describe who the above 162+27 people from the above filter()s are
# 2. Which model, model_CART above or model_CART_2 below, do you think will better
# "fit" the data? Quantify this!
# Max depth of now 5, not 3:
knobs_2 <- rpart.control(maxdepth = 1)
model_CART_2 <- rpart(model_formula, data = evals, control=knobs_2)
# Note: This is base R facetting:
par(mfrow=c(1, 2))
plot(model_CART, margin=0.25)
text(model_CART, use.n = TRUE)
box()
plot(model_CART_2, margin=0.25)
text(model_CART_2, use.n = TRUE)
box()
# Extras ------------------------------------------------------------------
# Full details on rpart can be found here:
# http://www.statmethods.net/advstats/cart.html