Reporting Analysis done with R into a Word Document
We will use R to analyse the Bank Marketing dataset and apply a Logistic Regression model to it to predict whether the client will subscribe to a term deposit (variable y), then deliver the results in a Word document using the R officer & flextable libraries.
body_add_par(docx, "", style = "Normal")
body_add_par(docx,
             "- spotting days, months and jobs in which users tend to subscribe",
             style = "Normal")

# Recode `month`: capitalize the first letter ("jan" -> "Jan") and order the
# factor levels by calendar position via month.abb.
data <- data %>%
  mutate(month = factor(gsub("(^[[:alpha:]])", "\\U\\1", month, perl = TRUE),
                        month.abb))

# Frequency polygon of contact counts per month, colored by outcome y.
month_plot <- ggplot(data, aes(x = as.numeric(month), color = y)) +
  geom_freqpoly(stat = "count") +
  scale_x_continuous(breaks = seq(1, 12, 1), labels = month.abb) +
  scale_y_continuous(breaks = seq(0, 1500, 250)) +
  theme_minimal() +
  scale_color_brewer(palette = "Set1") +
  xlab("last contact month of the year")

# Bar chart of job categories, filled by outcome y.
job_plot <- ggplot(data, aes(x = job, fill = y)) +
  geom_bar(width = 0.8, color = "white", alpha = 0.8) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1")

# Histogram of the day-of-month of the last contact, filled by outcome y.
day_plot <- ggplot(data, aes(x = day, fill = y)) +
  geom_histogram(bins = 30, col = "white") +
  theme_minimal() +
  scale_fill_brewer(palette = "Accent") +
  xlab("last contact day of the month")

# Stack the three charts into one image and embed it in the Word document.
ggsave("daymonthjob.png",
       arrangeGrob(month_plot, job_plot, day_plot),
       height = 4, width = 7)
body_add_img(docx, src = "daymonthjob.png", height = 4, width = 7)
body_add_par(docx, "", style = "Normal")
body_add_par(docx,
             "- Visualizing frequency of categorical variables and their relationship with y: ",
             style = "Normal")
# "not in" infix operator: negation of %in%; TRUE where x is absent from y.
# (Restored: the paste had fused the definition into the comment line,
# leaving an unbound `!('%in%'(x, y))` expression.)
'%!in%' <- function(x, y) {
  !('%in%'(x, y))
}
# All column names of the dataset (predictors plus the outcome y).
vars<- colnames(data)
# summary of numeric data
# Select only the columns whose class is numeric or integer and print the
# five-number summary + mean for each; the ## lines below are the recorded
# console output for the Bank Marketing sample used here.
summary(data[, vars[sapply(data[, vars], class) %in%
c("numeric", "integer")]])
## age balance day duration
## Min. :19.00 Min. :-3313 Min. : 1.00 Min. : 4
## 1st Qu.:33.00 1st Qu.: 69 1st Qu.: 9.00 1st Qu.: 104
## Median :39.00 Median : 444 Median :16.00 Median : 185
## Mean :41.17 Mean : 1423 Mean :15.92 Mean : 264
## 3rd Qu.:49.00 3rd Qu.: 1480 3rd Qu.:21.00 3rd Qu.: 329
## Max. :87.00 Max. :71188 Max. :31.00 Max. :3025
## campaign pdays previous
## Min. : 1.000 Min. : -1.00 Min. : 0.0000
## 1st Qu.: 1.000 1st Qu.: -1.00 1st Qu.: 0.0000
## Median : 2.000 Median : -1.00 Median : 0.0000
## Mean : 2.794 Mean : 39.77 Mean : 0.5426
## 3rd Qu.: 3.000 3rd Qu.: -1.00 3rd Qu.: 0.0000
## Max. :50.000 Max. :871.00 Max. :25.0000
# Start the "Modeling Data" section on a new page of the Word document.
body_add_break(docx)
body_add_par(docx, "Modeling Data", style = "heading 1")
body_add_par(docx, "Preparing data for modeling", style = "heading 2")
body_add_par(docx,
             "- one hot encoding of categorical variables and normalizing numeric ones",
             style = "Normal")
# Record the dataset dimensions before encoding (typo fixed: "dimenstions").
body_add_par(docx,
             paste("dimensions of original data:",
                   paste(dim(data), collapse = " , ")),
             style = "Normal")
# Treat day-of-month as categorical so it gets one-hot encoded below.
data$day <- factor(data$day)
# predictors and outcome
# (Restored: the paste had fused these comments with the code, leaving
# pred_vars / cat / num undefined and "for (iincat)" as a syntax error.)
pred_vars <- setdiff(vars, "y")
# One-hot encode factor/character predictors and min-max normalize numeric
# ones. `cat_vars` renamed from `cat` to avoid shadowing base::cat.
cat_vars <- pred_vars[sapply(data[, pred_vars], class) %in%
                        c("factor", "character")]
num_vars <- pred_vars[sapply(data[, pred_vars], class) %in%
                        c("numeric", "integer")]
for (i in cat_vars) {
  dict <- unique(data[, i])
  for (key in dict) {
    # 0/1 indicator column named "<var>_<level>"
    data[[paste0(i, "_", key)]] <- 1.0 * (data[, i] == key)
  }
}
# Min-max scale each numeric column to [0, 1].
data[, num_vars] <- apply(data[, num_vars], 2, function(x) {
  (x - min(x)) / (max(x) - min(x))
})
# Drop the original (now encoded) categorical columns.
data <- data[, -which(colnames(data) %in% cat_vars)]
# Record the post-encoding dimensions (typo fixed: "dimenstions").
body_add_par(docx,
             paste("dimensions of new encoded data:",
                   paste(dim(data), collapse = " , ")),
             style = "Normal")
cursor_end(docx)
body_add_par(docx, "Train/Test Splitting", style = "heading 2")
body_add_par(docx, "- splitting data to train and test datasets (80/20)",
             style = "Normal")
# 80% of rows go to training.
sample_size <- floor(0.8 * nrow(data))
## set the seed to make your splits reproducible
set.seed(13)
train_indices <- sample(seq(nrow(data)), size = sample_size)
train <- data[train_indices, ]
test <- data[-train_indices, ]
body_add_par(docx,
             paste("nrows of training dataset:", nrow(train)),
             style = "Normal")
body_add_par(docx,
             paste("nrows of test dataset:", nrow(test)),
             style = "Normal")
body_add_par(docx, "Fitting Classification Model", style = "heading 2")
body_add_par(docx, "- Applying Logistic Regression algorithm to data: ",
             style = "Normal")
# Fit a logistic regression on all predictors of the training set.
glmmodel <- glm(y ~ ., data = train, family = binomial(link = "logit"))
# how well the model fits the data it has seen
# (Restored: the paste had fused this comment with the assignment, so
# train$pred — used below and at the confusion table — was never created.)
train$pred <- predict(glmmodel, newdata = train, type = "response")
# visualizing classification threshold
body_add_par(docx, "", style = "Normal")
body_add_par(docx, "Spotting Classification Threshold Probability Value: ",
             style = "Normal")
# Density of in-sample predicted probabilities per class, to eyeball a
# separation threshold.
ggplot(train, aes(x = pred, fill = y)) +
  geom_density(alpha = 0.6, col = "white") +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal()
# Typo fixed in the user-facing string: "probablity" -> "probability".
body_add_par(docx, "The model seems to classify well at probability: 0.12",
             style = "Normal")
body_add_par(docx, "", style = "Normal")
body_add_par(docx, "Evaluating the Model", style = "heading 2")
body_add_par(docx, "- Measuring Model Accuracy: ", style = "Normal")
# Confusion matrix at the chosen 0.12 threshold (printed via outer parens).
(tab <- table(train$pred >= 0.12, train$y))