-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_sampling_file.R
43 lines (35 loc) · 2.31 KB
/
run_sampling_file.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#### DO NOT EDIT THIS FILE ###############################################################
# Tommy copied this code to the CBS RA on 2024-09-07.
# All further edits will be made within the CBS RA environment unless discussed otherwise.
# We will export the code from the CBS RA and push updates to GitHub when appropriate.
##########################################################################################
# To run this code outside of the CBS RA, first run the following files:
# 1. fake_data_for_code_testing/fake_inputs_for_code_testing.R
# 2. run_training_set.R
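# This file creates run_sampling_file(). It takes a sampling_file name and
# builds the path to that file (the file itself is read later, inside
# run_training_set()). It also gets the training_sets, in the form of a vector
# of unique training_set names taken from data_splits$training_sets (data_splits
# is presumably built from the jobfile and must already be defined before
# run_sampling_file() is called), and generates partial results
# by applying run_training_set() to each training_set. It takes the partial
# results dataframe and adds a column for the sampling_file.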
library(tidyverse)
library(data.table)
library(furrr)
#' Run every training set against one sampling file
#'
#' Builds the path to the sampling file, applies `run_training_set()` to each
#' unique entry of `data_splits$training_sets`, and tags the combined results
#' with the name of the sampling file they came from.
#'
#' This function relies on two objects that are NOT defined in this file and
#' must exist before it is called: `data_splits` (an object with a
#' `training_sets` element) and `run_training_set()` (see run_training_set.R).
#'
#' @param sampling_file_name File name of the sampling file, without its
#'   directory, e.g. "pmt_train_and_evaluation_samples_seed_1.csv".
#' @param data_path Passed through unchanged to `run_training_set()`.
#' @param location_of_sampling_file Directory that contains the sampling file.
#'   A trailing "/" is optional. Defaults to the local fake-data folder used
#'   for code testing.
#' @return The results of `run_training_set()` for every training set, bound
#'   together by row, with an added `sampling_file` column holding
#'   `sampling_file_name`.
run_sampling_file <- function(
    sampling_file_name,
    data_path,
    # TODO: Change this default to reflect the file path that will be used on OSSC
    location_of_sampling_file =
      "~/Documents/GitHub/stork_oracle_cbs/fake_data_for_code_testing/") {
  # Identify the sampling file path (it will be read later within the
  # run_training_set function). Only insert a separator when the caller's
  # directory does not already end in one, so the default path is unchanged.
  needs_separator <- nzchar(location_of_sampling_file) &&
    !endsWith(location_of_sampling_file, "/")
  if (needs_separator) {
    location_of_sampling_file <- paste0(location_of_sampling_file, "/")
  }
  sampling_file_path <- paste0(location_of_sampling_file, sampling_file_name)

  # Get the training_sets in the form of a vector of training_set names.
  # NOTE: data_splits is not an argument; it is looked up outside this function.
  training_sets <- unique(data_splits$training_sets)

  # Generate partial results by applying run_training_set() to each
  # training_set (row-bound into one data frame), then add a column for the
  # sampling_file.
  future_map_dfr(
    training_sets,
    ~ run_training_set(
      .x,
      sampling_file_path = sampling_file_path,
      data_path = data_path
    )
  ) %>%
    mutate(sampling_file = sampling_file_name)
}
# Example
# run_sampling_file(sampling_file_name = "pmt_train_and_evaluation_samples_seed_1.csv", data_path = fake_data_path)