Machine Learning

I had the worst luck with this exercise. Something went wrong with my ordinal factors. I peeked at other students’ code and mine matched, but the code did not run. If the model code was the same, I can only assume something went wrong with the data wrangling, but all the steps I tried and the different methods for dummy/ordered factors didn’t change the outcome. Maybe ML and I aren’t meant to be :(

Set it Up

Read in previously cleaned data

# Restore the objects saved by the earlier cleaning step into the workspace.
# NOTE(review): `balanced_symptoms` (used below) is presumably one of them --
# confirm against the script that wrote this .RData file.
load(file = "../../fluanalysis/data/clean_symptoms.RData")

# Attempt #15 at ordering/de-ordering the variables so the models run well.
# balanced_symptoms$Weakness<-as.factor(balanced_symptoms$Weakness)
# balanced_symptoms$CoughIntensity<-as.factor(balanced_symptoms$CoughIntensity)
# balanced_symptoms$Myalgia<-as.factor(balanced_symptoms$Myalgia)

Split the data

# Fix the RNG state so the train/test partition is reproducible.
set.seed(123)

# Hold out 30% of the rows for testing, stratifying on the outcome (BodyTemp).
data_split <- initial_split(
  data = balanced_symptoms,
  prop = 0.7,
  strata = BodyTemp
)

# Pull the two partitions out of the split object.
train_data <- data_split %>% training()
test_data <- data_split %>% testing()

Set up the 5x5 split

# 5-fold cross-validation repeated 5 times on the training data,
# stratified on the outcome (BodyTemp).
folds <- train_data %>%
  vfold_cv(v = 5, repeats = 5, strata = BodyTemp)

Create a recipe

# Preprocessing recipe: BodyTemp is the outcome, every other column is a
# predictor, and all nominal predictors are dummy-coded.
model_recipe3 <- step_dummy(
  recipe(BodyTemp ~ ., data = train_data),
  all_nominal_predictors()
)
# step_ordinalscore(Myalgia, CoughIntensity, Weakness) is left out because it
# could not find the variables except under VERY specific conditions.
# NOTE(review): recipe steps run in the order they are added, so a step placed
# after step_dummy() may no longer see the original factor columns -- TODO
# confirm whether moving it before step_dummy() resolves this.

# model_recipe3 <- prep(model_recipe3, training = train_data)
# An attempt seen in someone else's code; it did not work here.

Model Creation

Null Model

Build the model

# Null-model specification using the "parsnip" engine in regression mode.
null <- null_model() %>%
  set_engine("parsnip") %>%
  set_mode("regression") %>%
  translate()

# Bundle the shared recipe and the null model into one workflow.
null_wf <- workflow() %>%
  add_recipe(model_recipe3) %>%
  add_model(null)

# Fit the workflow on the training data.
null_fit <- fit(null_wf, data = train_data)

# Show the fitted value stored by the null model.
tidy(extract_fit_parsnip(null_fit))
# A tibble: 1 × 1
  value
  <dbl>
1  98.9

View Trained RMSE

# Attach the null model's predictions to the training rows.
null_train_aug <- null_fit %>%
  augment(new_data = train_data)

# RMSE of the null model on the data it was trained on.
null_train_aug %>%
  yardstick::rmse(truth = BodyTemp, estimate = .pred)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        1.22

Test the Null Model

# Predictions of the fitted null workflow for the held-out test rows.
null_fit %>% predict(new_data = test_data)
# A tibble: 223 × 1
   .pred
   <dbl>
 1  98.9
 2  98.9
 3  98.9
 4  98.9
 5  98.9
 6  98.9
 7  98.9
 8  98.9
 9  98.9
10  98.9
# … with 213 more rows
# Attach the null model's predictions to the test rows.
null_test_aug <- null_fit %>%
  augment(new_data = test_data)

# RMSE of the null model on the held-out test data.
null_test_aug %>%
  yardstick::rmse(truth = BodyTemp, estimate = .pred)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        1.14

Decision Tree

# Regression tree (rpart) with cost-complexity and depth left open for tuning.
tune_spec <- decision_tree(cost_complexity = tune(), tree_depth = tune()) %>%
  set_engine("rpart") %>%
  set_mode("regression")

# Regular grid with 5 levels for each of the two tuning parameters.
tree_grid <- grid_regular(
  cost_complexity(),
  tree_depth(),
  levels = 5
)

# Workflow pairing the shared recipe with the tunable tree specification.
tree_wf <- workflow() %>%
  add_recipe(model_recipe3) %>%
  add_model(tune_spec)

## Tuning is commented out: it currently errors, which stops the document
## from knitting.
# tree_res <-
#   tree_wf %>%
#   tune_grid(
#     resamples = folds, # the CV folds created earlier
#     grid = tree_grid
#     )

# tree_res %>% collect_metrics()
#
# tree_res %>%
#   show_best("rmse")

LASSO

# Reset the RNG state before the LASSO section.
set.seed(123)

# Linear regression via glmnet; mixture = 1 requests a pure LASSO penalty,
# and the penalty strength is left open for tuning.
lr_mod <- set_engine(
  linear_reg(penalty = tune(), mixture = 1),
  "glmnet"
)

# Workflow pairing the shared recipe with the LASSO specification.
lr_workflow <- workflow() %>%
  add_recipe(model_recipe3) %>%
  add_model(lr_mod)

# 30 candidate penalties, evenly spaced on the log10 scale from 1e-4 to 1e-1.
lr_reg_grid <- tibble(
  penalty = 10^seq(from = -4, to = -1, length.out = 30)
)

top_n(lr_reg_grid, -5) # lowest penalty values
Selecting by penalty
# A tibble: 5 × 1
   penalty
     <dbl>
1 0.0001  
2 0.000127
3 0.000161
4 0.000204
5 0.000259
top_n(lr_reg_grid, 5) # highest penalty values
Selecting by penalty
# A tibble: 5 × 1
  penalty
    <dbl>
1  0.0386
2  0.0489
3  0.0621
4  0.0788
5  0.1   
## Commenting out because otherwise it will run errors and won't knit
# lr_res <- lr_workflow %>% 
#   tune_grid(resamples = folds,
#             grid = lr_reg_grid)   #Had more options here but other students didn't and theirs ran ok, mine did not regardless
# 
# lr_res %>% show_best("rmse")
# best_lasso <- lr_res %>%
#   select_best("rmse")

Random Forest

# Random forest (ranger) with 1000 trees; mtry and min_n are left open for
# tuning.
rf_mod <- rand_forest(trees = 1000, mtry = tune(), min_n = tune()) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# Plain recipe for the forest: same outcome and predictors, with no
# preprocessing steps added.
ml_model2 <- recipe(BodyTemp ~ ., data = train_data)

# Workflow pairing the plain recipe with the random forest specification.
rf_workflow <- workflow() %>%
  add_recipe(ml_model2) %>%
  add_model(rf_mod)

## Tuning is commented out: it currently errors, which stops the document
## from knitting.
# rf_res <-
#   rf_workflow %>%
#   tune_grid(folds)