-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStudent_Placement_Prediction.Rmd
132 lines (109 loc) · 4.09 KB
/
Student_Placement_Prediction.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
---
output:
  pdf_document: default
  html_document: default
---
```{r}
library(readxl)
library(dplyr)
library(caret)
library(pROC)
library(rpart)
```
```{r}
# Read the source spreadsheet into `data`. "/file/path" looks like a
# placeholder -- point it at the actual Excel file before knitting.
data <- read_excel(path = "/file/path")
```
# (2)
# best approach is to exclude such parameters from the model
# parameters that are not available at the time of decision making are not present in the model
```{r}
# Turn the categorical columns into factors, then drop every row that has a
# missing value in any column (na.omit looks at all columns, not only the
# ones used by the models below).
data <- data %>%
  mutate(
    across(
      all_of(c(
        "Gender",
        "Board_SSC",
        "Board_HSC",
        "Stream_HSC",
        "Course_Degree",
        "Entrance_Test",
        "Specialization_MBA"
      )),
      as.factor
    )
  ) %>%
  na.omit()
```
```{r}
# (3) Logistic regression of Placement_B on the SSC percentage alone.
model_ssc <- glm(
  formula = Placement_B ~ Percent_SSC,
  family = "binomial",
  data = data
)
# Coefficient table and fit statistics for the single-predictor model.
summary(model_ssc)
```
```{r}
# (4) Predicted placement probabilities at SSC percentages of 60 and 80,
# taken from the SSC-only model on the response (probability) scale.
new_data <- data.frame(Percent_SSC = c(60, 80))
predicted_probabilities <- predict(
  object = model_ssc,
  newdata = new_data,
  type = "response"
)
print(predicted_probabilities)
```
```{r}
# (5) ROC-based cutoff for the SSC-only model, using its fitted
# probabilities against the observed Placement_B values.
# Recorded result of a previous run: best cutoff probability =
# 0.780510400018327
roc_curve <- roc(
  response = data$Placement_B,
  predictor = fitted(model_ssc)
)
# "best" asks pROC for the optimal threshold (Youden's J by default).
best_cutoff <- coords(roc_curve, x = "best", ret = "threshold")
print(paste("Best Cutoff:", best_cutoff))
```
```{r}
# (6) Logistic regression on all numeric predictors considered appropriate:
# the four academic percentages, work experience and the entrance-test
# percentile.
model_full <- glm(
  formula = Placement_B ~ Percent_SSC + Percent_HSC + Percent_Degree +
    Experience_Yrs + Percentile_ET + Percent_MBA,
  family = "binomial",
  data = data
)
summary(model_full)
```
```{r}
# (7) Sensitivity and specificity of the full model at a 0.7 cut-off.
# Predictions strictly above 0.7 are labelled 1, everything else 0.
predicted_class <- ifelse(predict(model_full, type = "response") > 0.7, 1, 0)
# Both factors get explicit 0/1 levels (as calculate_cost does) so the matrix
# stays 2x2 even when every prediction lands in a single class.
# NOTE: without a `positive` argument caret uses the first level ("0") as the
# positive class, so "Sensitivity" below is the hit rate on class 0 and
# "Specificity" the hit rate on class 1.
conf_matrix <- confusionMatrix(
  factor(predicted_class, levels = c(0, 1)),
  factor(data$Placement_B, levels = c(0, 1))
)
sensitivity_specificity_0_7 <-
  conf_matrix$byClass[c("Sensitivity", "Specificity")]
# Show the two metrics in the knitted output.
sensitivity_specificity_0_7
```
```{r}
# (8) optimal threshold considering cost

#' Misclassification cost of a fitted classifier at one probability threshold.
#'
#' @param threshold Probability cut-off; predictions strictly above it are
#'   labelled 1, all others 0.
#' @param data Data frame to score. Must contain the 0/1 outcome column
#'   `Placement_B` and the predictors used by `model`.
#' @param model Fitted model whose `predict(type = "response")` returns
#'   probabilities (e.g. a binomial `glm`).
#' @param fp_weight Cost of one false positive (predicted 1, actual 0).
#' @param fn_weight Cost of one false negative (predicted 0, actual 1).
#' @return The total weighted cost as a single number.
calculate_cost <- function(threshold, data, model,
                           fp_weight = 4, fn_weight = 1) {
  # Score the rows of `data` itself so predictions and labels always line up,
  # including when `data` is not the set the model was fitted on.
  probabilities <- predict(model, newdata = data, type = "response")
  predicted_class <- ifelse(probabilities > threshold, 1, 0)
  # Fixed 0/1 levels keep the table 2x2 even if one class never occurs.
  # Rows are predictions, columns are observed outcomes.
  confusion <- table(
    factor(predicted_class, levels = c(0, 1)),
    factor(data$Placement_B, levels = c(0, 1))
  )
  false_positives <- confusion[2, 1]
  false_negatives <- confusion[1, 2]
  fp_weight * false_positives + fn_weight * false_negatives
}
# Evaluate the cost of the full model on a grid of thresholds from 0 to 1 in
# steps of 0.01.
thresholds <- seq(0, 1, by = 0.01)
# vapply enforces exactly one numeric cost per threshold.
costs <- vapply(
  thresholds,
  calculate_cost,
  FUN.VALUE = numeric(1),
  data = data,
  model = model_full
)
# Print the full threshold/cost table, one line per threshold.
for (i in seq_along(thresholds)) {
  print(paste("Threshold:", thresholds[i], "Cost:", costs[i]))
}
# which.min returns the first minimum, i.e. the lowest threshold among ties.
optimal_threshold <- thresholds[which.min(costs)]
print(paste("Optimal Threshold:", optimal_threshold))
# Recorded result of a previous run: Optimal Threshold: 0.76
```
```{r}
# (9) decision tree model
# The tree is restricted to the predictors named elsewhere in this analysis
# (the factor columns plus the numeric inputs of the full logistic model).
# `Placement_B ~ .` would hand the tree every other column of `data` as well,
# presumably including identifier or outcome-derived fields (e.g. a placement
# label or salary -- TODO confirm against the sheet), which leaks the answer
# and would explain perfect training scores.
# NOTE: metrics quoted from earlier runs with `~ .` must be regenerated.
tree_model <- rpart(
  Placement_B ~ Gender + Percent_SSC + Board_SSC + Percent_HSC + Board_HSC +
    Stream_HSC + Percent_Degree + Course_Degree + Experience_Yrs +
    Entrance_Test + Percentile_ET + Percent_MBA + Specialization_MBA,
  data = data,
  method = "class"
)
# Predicted class labels for the same rows the tree was fitted on, so the
# resulting metrics are in-sample and therefore optimistic.
tree_pred <- predict(tree_model, newdata = data, type = "class")
tree_conf_matrix <- confusionMatrix(tree_pred, factor(data$Placement_B))
```
```{r}
# model comparison
# Put accuracy, sensitivity and specificity of the two confusion matrices
# side by side: the full logistic model at the 0.7 cut-off vs. the tree.
log_conf_matrix <- conf_matrix
comparison <- data.frame(
  Model = c("Logistic Regression", "Decision Tree"),
  Accuracy = c(
    log_conf_matrix$overall[["Accuracy"]],
    tree_conf_matrix$overall[["Accuracy"]]
  ),
  Sensitivity = c(
    log_conf_matrix$byClass[["Sensitivity"]],
    tree_conf_matrix$byClass[["Sensitivity"]]
  ),
  Specificity = c(
    log_conf_matrix$byClass[["Specificity"]],
    tree_conf_matrix$byClass[["Specificity"]]
  )
)
print(comparison)
```
```{r}
# Recap of the key results: the ROC-based cutoff, the cost-based threshold
# and the model comparison table.
invisible(lapply(
  list(
    paste("Best cutoff (ROC):", best_cutoff),
    paste("Optimal threshold (Cost):", optimal_threshold),
    "Model Comparison:"
  ),
  print
))
print(comparison)
```
# (10) recommendation
# logistic reg had good accuracy (0.7654321) and good specificity (0.8965517), but low sensitivity (0.2222222)
# model needs improvement with sensitivity
# to improve sensitivity can:
# use more relevant predictors, adjust model parameters, or use different modeling techniques
# decision tree had perfect (1.0) accuracy, sensitivity, and specificity
# however, these perfect results indicate overfitting to the training data (may not be good for new unseen data)
#
# model choice = logistic regression
# but with logistic reg, be sure to include
# regular model updates, sensitivity improvements, and consideration of non-quantifiable factors