def save_checkpoint(state, full_file_path):
    """
    Persist `state` to `full_file_path` using `torch.save`.

    @param state: Object to serialise (passed straight to `torch.save`).
    @param full_file_path: Destination path of the checkpoint file.
    @return: Void. Prints the destination path before writing.
    """
    # Make sure the destination (including its parent folders) exists first.
    create_file_if_does_not_exist(full_file_path)

    print("Saving a new best at file path: {}".format(full_file_path))
    torch.save(state, full_file_path)


def create_file_if_does_not_exist(full_file_path):
    """
    Create an empty file at `full_file_path` unless something already exists there.

    Missing parent directories are created as well, so that a checkpoint can be
    written into a run folder that has not been created yet. An existing file is
    left untouched.

    @param full_file_path: Path of the file to create.
    @return: Void.
    """
    parent_dir = os.path.dirname(full_file_path)
    # `dirname` is empty for a bare file name; `makedirs('')` would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if not os.path.exists(full_file_path):
        # Context manager guarantees the handle is closed even on error.
        with open(full_file_path, mode="w+"):
            pass
def plot_score_over_n_epochs(scores_over_n_epochs: dict,
                             score_type='f1',
                             title=None,
                             file_path=None,
                             fig_size: tuple = (10, 6)):
    """
    Plot one curve per split for the chosen score over the epochs.

    @param scores_over_n_epochs: Dictionary mapping a split name to a list with one
           entry per epoch; each entry is indexed with `SCORE_KEY_MAP[score_type]`.
    @param score_type: One of the keys of `SCORE_KEY_MAP`.
    @param title: Optional title for the figure.
    @param file_path: File path relative to `PLOTTING_ROOT`; the figure is saved
           there when given.
    @param fig_size: Tuple for Figure Size.
    @return: Void, instead plots the figure and saves it if given a file path.
    """
    assert score_type in SCORE_KEY_MAP.keys(), "Invalid Score type."

    fig = plt.figure(figsize=fig_size)
    ax = fig.add_subplot(111)
    ax.set_xlabel('Epochs')
    ax.set_ylabel('{} Score'.format(score_type))
    if title:
        ax.set_title(title)

    score_idx = SCORE_KEY_MAP[score_type]

    for split_name, epoch_scores in scores_over_n_epochs.items():
        # If nothing to plot just skip that split.
        if len(epoch_scores) == 0:
            continue

        # Use this split's own length: splits may hold a different number of
        # epochs (or the first split may be empty).
        series = [scores[score_idx] for scores in epoch_scores]
        ax.plot(range(1, len(series) + 1), series, label=split_name)

    plt.legend()

    # Save before showing, same order as `plot_loss_over_n_epochs`: with a
    # blocking backend the figure may already be closed once `show` returns.
    if file_path:
        file_path = os.path.join(PLOTTING_ROOT, file_path)
        fig.savefig(file_path)

    plt.show()
def line_plot_as_pdf(*y, x, xlabel, ylabel, file_name,
                     line_lw=1, fig_size=(9, 5),
                     labelsize='large', markersize=8,
                     y_lim=(0.4, 0.8)):
    """
    Draw several labelled lines over a shared x-axis and save the figure.

    @param y: One `(y_values, label)` pair per line to draw.
    @param x: Shared x values for every line.
    @param xlabel: Label of the x-axis.
    @param ylabel: Label of the y-axis.
    @param file_name: Path handed to `fig.savefig`.
    @param line_lw: Line width written to the `lines` rc group.
    @param fig_size: Figure size written to the `figure` rc group.
    @param labelsize: Tick label size for both axes.
    @param markersize: Marker size written to the `lines` rc group.
    @param y_lim: `(low, high)` limits of the y-axis, or None to autoscale.
           Defaults to the previously hard-coded (0.4, 0.8).
    @return: Void, the figure is written to `file_name`.
    """
    matplotlib.use('PDF')

    # rc settings only affect objects created afterwards, so they have to be in
    # place before the figure and its axes are created.
    plt.rc('figure', figsize=fig_size)

    plt.rc('xtick', labelsize=labelsize)
    plt.rc('ytick', labelsize=labelsize)

    plt.rc('font', weight='bold', size='12', family='serif')
    plt.rc('axes', linewidth=1)
    plt.rc('lines',
           linewidth=line_lw,
           markersize=markersize)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # Set the limits on the axes that is actually plotted on, instead of on an
    # implicit "current axes" created before this one existed.
    if y_lim is not None:
        ax.set_ylim(*y_lim)

    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax.set_prop_cycle(color=['r', 'b', 'g', 'y'],
                      marker=["v", "o", "d", "+"])

    ax.set_xlabel(xlabel, weight='bold', size=22)
    ax.set_ylabel(ylabel, weight='bold', size=22)

    for y_val, label in y:
        ax.plot(x, y_val, label=label)
    ax.legend(loc='upper right', prop={'size': 14})
    fig.savefig(file_name, bbox_inches='tight', dpi=1000)

    # The figure is not returned, so release it; otherwise pyplot keeps every
    # figure alive across repeated calls.
    plt.close(fig)
def get_statistics_on_data_dict(data: dict, feature_list: list):
    """
    @attention Statistics returned are ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'].
    @param data: Data in classic dictionary format.
    @param feature_list: Feature list for the data. Used as column names unless
           `data_manager.FLATTEN_SEQUENCE_TO_COLS` is set.
    @return: Statistics on whole data and raw appended data.
    """
    validations.validate_data_dict_keys(data)
    validations.validate_all_data_present_in_data_dict(data)

    # Element 0 of every entry is the unit sequence. Build all frames first and
    # concatenate once: `DataFrame.append` was removed in pandas 2.0 and copied
    # the accumulated frame on every iteration.
    frames = [pd.DataFrame(data['data'][key][0]) for key in data['data']]
    df_for_statistics = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if not data_manager.FLATTEN_SEQUENCE_TO_COLS:
        df_for_statistics.columns = feature_list
    # -1 is replaced by NaN so it does not enter the statistics.
    df_for_statistics.replace(to_replace=-1, value=np.nan, inplace=True)
    return df_for_statistics.describe(percentiles=[0.25, 0.5, 0.75]), df_for_statistics
def tensor_preds_to_scalar_counter(tensor_preds):
    """
    Count how often each predicted value occurs.

    @param tensor_preds: Iterable whose elements expose `.item()` (e.g. a 1-D
           tensor or a list of single-element tensors).
    @return: `Counter` mapping each scalar prediction to its number of occurrences.
    """
    # `.item()` unwraps each element to a hashable Python scalar.
    return Counter(pred.item() for pred in tensor_preds)
def get_sequence_length_and_num_features_from_data(data: dict, print_output=True):
    """
    Report the dimensions of the first entry stored under `data['data']`.

    @param data: Data dictionary; only the first entry of `data['data']` is inspected.
    @param print_output: If true, the five values are also printed.
    @return: Tuple `(sequence_len, num_features, histogram_seq_len,
             histogram_num_features, covariate_len)`.
    """
    entries = data['data']
    sample = entries[next(iter(entries))]

    series = sample[definitions.ACTUAL_DATA_IDX]
    hist = sample[definitions.HISTOGRAM_IDX]
    covs = sample[definitions.COVARIATE_DATA_IDX]

    # Outer length and length of the first row for the sequence and the
    # histogram; plain length for the covariates.
    dims = (len(series), len(series[0]), len(hist), len(hist[0]), len(covs))

    if print_output:
        print("sequence_len: {} num_features: {} histogram_seq_len: {} histogram_num_feats: {} covariate_len: {}".format(
            *dims))

    return dims
def get_data_and_label_tensor(data: dict, key, cuda_enabled):
    """
    Convert a single entry of `data['data']` into tensors.

    @param data: Data dict containing the data in our rich data structure.
    @param key: Key in the data, usually time series key.
    @param cuda_enabled: If true, returns cuda tensors.
    @return: Tuple `(tensor_data, covariate_data, histogram_data, train_label)`.
             The first three are float tensors, the label is a long tensor
             holding one element.
    """
    record = data['data'][key]

    # Everything stored before the covariate slot forms the main data tensor.
    sequence = torch.tensor(list(record[:definitions.COVARIATE_DATA_IDX]),
                            dtype=torch.float)
    covariates = torch.tensor(list(record[definitions.COVARIATE_DATA_IDX]),
                              dtype=torch.float)
    histogram = torch.tensor(list(record[definitions.HISTOGRAM_IDX]),
                             dtype=torch.float)

    # Unwrap the label to a Python scalar, then re-wrap it as a one-element tensor.
    label_scalar = torch.tensor(record[definitions.LABELS_IDX]).item()
    label = torch.tensor([label_scalar], dtype=torch.long)

    tensors = (sequence, covariates, histogram, label)
    if cuda_enabled:
        tensors = tuple(tensor.cuda() for tensor in tensors)

    return tensors
def _day_index_from_key(key):
    """Return the day-of-year (non-leap year) read from fields 1 (month) and 2 (day) of `key` split on '_'."""
    # Index 0 is a zero placeholder so that `days_in_month[:month]` sums only
    # the months that come *before* `month`.
    days_in_month = (0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
    fields = key.split('_')
    month, day = int(fields[1]), int(fields[2])
    return sum(days_in_month[:month]) + day


def _split_keys_by_first_days(keys, begin_interval):
    """Split `keys` into those within `begin_interval` days of the first key and those after it."""
    early_keys, later_keys = [], []
    start_day = None
    for key in keys:
        curr_day = _day_index_from_key(key)
        if start_day is None:
            # The first key defines day zero and belongs to the early window.
            start_day = curr_day
        if curr_day - start_day >= begin_interval:
            later_keys.append(key)
        else:
            early_keys.append(key)
    return early_keys, later_keys


def _roc_auc_or_zero(y_true, y_score, average):
    """Return `metrics.roc_auc_score`, or 0.0 when the score is undefined for the given labels."""
    try:
        return metrics.roc_auc_score(y_true, y_score, average=average)
    except ValueError:
        # Raised e.g. when a class column of `y_true` contains a single value.
        return 0.0


def branching_on_leaved_out_with_exist_heads(
    data,
    key_set: str,
    epochs,
    model,
    classification_criterion,
    device,
    use_covariates=True,
    use_histogram=False
):
    """
    Fit a head selector for a left-out student on top of the existing user heads.

    Keys in `data[key_set]` are split by date: keys within the first 7 days
    (counted from the first key) are used to train a `softmax_select` module that
    picks one of the heads in `model.user_heads.user_layer`; the remaining keys
    are used for validation. Only the selector's parameters are given to the
    optimizer.

    NOTE(review): keys are assumed to arrive in chronological order and to look
    like `<prefix>_<month>_<day>...` -- confirm against the caller.

    @param data: Data dictionary; `data['data'][key]` must unpack into
           `(actual_data, covariate_data, histogram_data, label)`.
    @param key_set: Name of the entry of `data` that lists the keys to use.
    @param epochs: Number of epochs to run.
    @param model: Multitask model, called as `model(head_key, '-1', x, covariates)`
           and returning `(decoded_output, y_pred)`.
    @param classification_criterion: Loss called as `criterion(y_pred, label)`.
    @param device: Accepted for interface compatibility; not used here.
    @param use_covariates: If false, `None` is passed to the model instead of covariates.
    @param use_histogram: If true, the histogram is fed to the model instead of the sequence.
    @return: `(train_loss, val_loss, leaved_out_head, f1_train_scores, f1_val_scores,
             val_roc_macros, val_roc_micros, val_roc_weighteds, best_confmat, best_out)`.
             Lists hold one value per epoch; `best_confmat` / `best_out` are the
             validation confusion matrix and raw validation outputs of the best epoch.
    """
    # How many days of data are used for the left-out student.
    begin_interval = 7
    min_data_need, val_keys = _split_keys_by_first_days(data[key_set], begin_interval)

    # Position -> head key, so the selector's integer choice can be mapped back.
    heads_ind = dict(enumerate(model.user_heads.user_layer))

    leaved_out_head = softmax_select(len(heads_ind))

    optimizer = torch.optim.Adam(
        [
            {'params': leaved_out_head.parameters()},
        ],
        lr=1e-5,
        weight_decay=1e-4
    )
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.996)

    # The binarizer is fitted on the fixed label set {0, 1, 2}; fit it once
    # instead of once per epoch.
    mlb = MultiLabelBinarizer()
    mlb.fit([[0], [1], [2]])

    train_loss, val_loss = list(), list()
    f1_train_scores, f1_val_scores = list(), list()
    val_roc_macros, val_roc_micros, val_roc_weighteds = list(), list(), list()
    best_split_score = -1
    best_confmat = None
    best_out = None

    for epoch in range(epochs):
        train_labels, train_preds = list(), list()
        val_labels, val_preds = list(), list()

        # train the selector
        total_train_loss = 0
        leaved_out_head.train()
        for key in min_data_need:
            actual_data, covariate_data, histogram_data, train_label = data['data'][key]
            actual_data = actual_data[0].unsqueeze(0)
            if use_histogram:
                actual_data = histogram_data.unsqueeze(0)

            # select branch
            prob_out, branch_ind = leaved_out_head()
            chosen_key = heads_ind[int(branch_ind)]

            _, y_pred = model(
                chosen_key,
                '-1',
                actual_data,
                covariate_data if use_covariates else None
            )

            # Scale by the selection probability so the loss depends on the selector.
            y_pred *= prob_out

            classification_loss = classification_criterion(y_pred, train_label)
            total_train_loss += classification_loss.item()

            # optimize
            model.zero_grad()
            leaved_out_head.zero_grad()
            classification_loss.backward()
            optimizer.step()

            train_labels.append(train_label)
            train_preds.append(get_predicted_class(y_pred))
        train_loss.append(total_train_loss)

        # Do not leave gradients from the last training step on the shared model.
        model.zero_grad()
        leaved_out_head.zero_grad()

        # validate (no gradients needed, so no graph is built)
        total_val_loss = 0
        leaved_out_head.eval()
        outs = list()
        with torch.no_grad():
            for key in val_keys:
                actual_data, covariate_data, histogram_data, train_label = data['data'][key]
                actual_data = actual_data[0].unsqueeze(0)
                if use_histogram:
                    actual_data = histogram_data.unsqueeze(0)

                # select branch
                prob_out, branch_ind = leaved_out_head()
                chosen_key = heads_ind[int(branch_ind)]

                _, y_pred = model(
                    chosen_key,
                    '-1',
                    actual_data,
                    covariate_data if use_covariates else None
                )

                outs.append(y_pred.cpu().detach().numpy())

                classification_loss = classification_criterion(y_pred, train_label)
                total_val_loss += classification_loss.item()

                val_labels.append(train_label)
                val_preds.append(get_predicted_class(y_pred))
        val_loss.append(total_val_loss)

        # learning rate decay
        scheduler.step()

        # update score
        train_label_list = conversions.tensor_list_to_int_list(train_labels)
        train_pred_list = conversions.tensor_list_to_int_list(train_preds)
        val_label_list = conversions.tensor_list_to_int_list(val_labels)
        val_pred_list = conversions.tensor_list_to_int_list(val_preds)

        train_scores = metrics.precision_recall_fscore_support(train_label_list, train_pred_list, average='weighted')
        val_scores = metrics.precision_recall_fscore_support(val_label_list, val_pred_list, average='weighted')

        f1_train_scores.append(train_scores[2])
        f1_val_scores.append(val_scores[2])

        # compute val AUC scores on one-hot encoded labels / predictions
        y_true = mlb.transform([[i] for i in val_label_list])
        y_pred = mlb.transform([[i] for i in val_pred_list])
        print("confusion matrix: ")
        con_matrix = metrics.confusion_matrix(val_label_list, val_pred_list, labels=[0, 1, 2])
        print(con_matrix)

        val_roc_macros.append(_roc_auc_or_zero(y_true, y_pred, 'macro'))
        val_roc_micros.append(_roc_auc_or_zero(y_true, y_pred, 'micro'))
        val_roc_weighteds.append(_roc_auc_or_zero(y_true, y_pred, 'weighted'))

        # NOTE(review): the best epoch is picked by *training* F1, not validation
        # F1 -- kept as in the original, confirm this is intended.
        if train_scores[2] > best_split_score:
            best_split_score = train_scores[2]
            best_confmat = con_matrix
            best_out = outs

        print('epoch {}, val_score {}'.format(epoch, val_scores))

    return train_loss, val_loss, leaved_out_head, f1_train_scores, f1_val_scores, val_roc_macros, val_roc_micros, val_roc_weighteds, best_confmat, best_out
validations.validate_data_dict_keys(data) + # validate_key_set_str(key_set) + + total_reconstruction_loss = 0 + total_classification_loss = 0 + total_joint_loss = 0 + + labels = list() + predictions = list() + users = list() + + if not optimizer: + multitask_lerner_model.eval() + multitask_lerner_model.user_heads.branching_probs.eval() + else: + multitask_lerner_model.train() + multitask_lerner_model.user_heads.branching_probs.train() + + # extract keys from first week (can be optimized by number of days, at begin_interval) + month_days = {0: 0, 1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30, 7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31} # map: month -> # days + + begin_interval = 7 # how many data used for the leaved out student + start_day = -1 + + min_data_need = list() + val_keys = list() + for key in data[key_set]: + month = int(key.split('_')[1]) + day = int(key.split('_')[2]) + curr_day = sum([month_days[i] for i in range(month + 1)]) + day + if start_day < 0: + start_day = curr_day + else: + if curr_day - start_day >= begin_interval: + val_keys.append(key) + else: + min_data_need.append(key) + + # find best existing head + heads_score = dict() + for head in multitask_lerner_model.user_heads.user_layer: + heads_score[head] = 0 + + # inner loop, loop over existing heads + for key in min_data_need: + student_id = conversions.extract_student_id_from_key(key) + actual_data, covariate_data, histogram_data, train_label = data['data'][key] + + if ordinal_regression: + train_label_vector = get_target_vector_for_ordinal_regression(train_label, num_classes, device) + + actual_data = actual_data[0].unsqueeze(0) + if use_histogram: + if histogram_seq_len: + histogram_data = histogram_data[:max(histogram_seq_len, len(histogram_data))] + actual_data = histogram_data.unsqueeze(0) + + # forward + min_loss = np.inf + min_key = None + for id_ in ids: + if str(id_) == str(student_id): + continue + student_key = 'student_' + str(id_) + decoded_output, y_pred = multitask_lerner_model( 
+ student_key, + '-1', + actual_data, + covariate_data if use_covariates else None + ) + # decoded output is `None` if training on only co-variates. + reconstruction_loss = reconstruction_criterion(actual_data, decoded_output) if decoded_output is not None else object_generator.get_tensor_on_correct_device([0]) + + # compute loss + classification_loss = None + if ordinal_regression: + classification_loss = classification_criterion(y_pred, train_label_vector) + else: + classification_loss = classification_criterion(y_pred, train_label) + + joint_loss = alpha * reconstruction_loss + beta * classification_loss + curr_loss = joint_loss.item() + + if min_key == None: + min_key = student_key + min_loss = curr_loss + elif curr_loss < min_loss: + min_key = student_key + min_loss = curr_loss + heads_score[min_key] += 1 + + chosen_key = None + chosen_score = -1 + for skey in heads_score: + if heads_score[skey] > chosen_score: + chosen_score = heads_score[skey] + chosen_key = skey + + for key in val_keys: + student_id = conversions.extract_student_id_from_key(key) + student_key = 'student_' + str(student_id) + actual_data, covariate_data, histogram_data, train_label = data['data'][key] + + if ordinal_regression: + train_label_vector = get_target_vector_for_ordinal_regression(train_label, num_classes, device) + + actual_data = actual_data[0].unsqueeze(0) + if use_histogram: + if histogram_seq_len: + histogram_data = histogram_data[:max(histogram_seq_len, len(histogram_data))] + actual_data = histogram_data.unsqueeze(0) + + # forward + decoded_output, y_pred = multitask_lerner_model( + chosen_key, + '-1', + actual_data, + covariate_data if use_covariates else None + ) + + # decoded output is `None` if training on only co-variates. 
def evaluate_multitask_learner_with_branching(data,
                                              key_set: str,
                                              num_classes,
                                              num_branches,
                                              branching_scores,
                                              multitask_lerner_model,
                                              reconstruction_criterion,
                                              classification_criterion,
                                              device,
                                              optimizer=None,
                                              alpha=1,
                                              beta=1,
                                              use_histogram=False,
                                              histogram_seq_len=None,
                                              ordinal_regression=False,
                                              use_covariates=True):
    """
    Run one pass of the multitask learner over `data[key_set]`.

    With an optimizer the model is put in train mode and updated after every
    key; without one the model is put in eval mode and only the losses and
    predictions are collected.

    @param data: Data dictionary; `data['data'][key]` must unpack into
           `(actual_data, covariate_data, histogram_data, label)`.
    @param key_set: Name of the entry of `data` that lists the keys to use.
    @param num_classes: Number of classes, used for the ordinal regression target.
    @param num_branches: Accepted for interface compatibility; not used here.
    @param branching_scores: Accepted for interface compatibility; not used here.
    @param multitask_lerner_model: Model called as
           `model(student_key, '-1', x, covariates)` returning `(decoded_output, y_pred)`.
    @param reconstruction_criterion: Loss called as `criterion(x, decoded_output)`.
    @param classification_criterion: Loss called as `criterion(y_pred, target)`.
    @param device: Device handed to `get_target_vector_for_ordinal_regression`.
    @param optimizer: If given, the model is trained; otherwise only evaluated.
    @param alpha: Weight of the reconstruction loss in the joint loss.
    @param beta: Weight of the classification loss in the joint loss.
    @param use_histogram: If true, the histogram is fed to the model instead of the sequence.
    @param histogram_seq_len: If set, the histogram is cut to at most this length.
    @param ordinal_regression: If true, the label is converted to an ordinal target vector.
    @param use_covariates: If false, `None` is passed to the model instead of covariates.
    @return: `(total_joint_loss, total_reconstruction_loss, total_classification_loss,
             labels, predictions, users)`.
    """
    validations.validate_data_dict_keys(data)
    validate_key_set_str(key_set)

    total_reconstruction_loss = 0
    total_classification_loss = 0
    total_joint_loss = 0

    labels = list()
    predictions = list()
    users = list()

    training = bool(optimizer)
    if training:
        multitask_lerner_model.train()
        multitask_lerner_model.user_heads.branching_probs.train()
    else:
        multitask_lerner_model.eval()
        multitask_lerner_model.user_heads.branching_probs.eval()

    # Gradients are only needed when an optimizer will consume them.
    with torch.set_grad_enabled(training):
        for key in data[key_set]:
            student_id = conversions.extract_student_id_from_key(key)
            student_key = 'student_' + str(student_id)
            actual_data, covariate_data, histogram_data, train_label = data['data'][key]

            if ordinal_regression:
                target = get_target_vector_for_ordinal_regression(train_label, num_classes, device)
            else:
                target = train_label

            actual_data = actual_data[0].unsqueeze(0)
            if use_histogram:
                if histogram_seq_len:
                    # Slicing past the end is safe, so this caps the length. The
                    # former `max(histogram_seq_len, len(...))` never truncated.
                    histogram_data = histogram_data[:histogram_seq_len]
                actual_data = histogram_data.unsqueeze(0)

            # forward
            decoded_output, y_pred = multitask_lerner_model(
                student_key,
                '-1',
                actual_data,
                covariate_data if use_covariates else None
            )

            # decoded output is `None` if training on only co-variates.
            if decoded_output is not None:
                reconstruction_loss = reconstruction_criterion(actual_data, decoded_output)
            else:
                reconstruction_loss = object_generator.get_tensor_on_correct_device([0])
            total_reconstruction_loss += reconstruction_loss.item()

            # compute loss
            classification_loss = classification_criterion(y_pred, target)
            total_classification_loss += classification_loss.item()

            joint_loss = alpha * reconstruction_loss + beta * classification_loss
            total_joint_loss += joint_loss.item()

            # if training, optimize
            if training:
                multitask_lerner_model.zero_grad()
                joint_loss.backward()
                optimizer.step()

            labels.append(train_label)
            predicted_class = get_predicted_class(y_pred, ordinal_regression=ordinal_regression)
            predictions.append(predicted_class)
            users.append(student_id)

    return total_joint_loss, total_reconstruction_loss, total_classification_loss, labels, predictions, users
ordinal_regression=False, +# use_covariates=True): +# validations.validate_data_dict_keys(data) +# validate_key_set_str(key_set) + +# total_reconstruction_loss = 0 +# total_classification_loss = 0 +# total_joint_loss = 0 + +# labels = list() +# predictions = list() +# users = list() + +# if not optimizer: +# multitask_lerner_model.eval() +# else: +# multitask_lerner_model.train() + +# for key in data[key_set]: +# student_id = conversions.extract_student_id_from_key(key) +# student_key = 'student_' + str(student_id) +# actual_data, covariate_data, histogram_data, train_label = data['data'][key] + +# if ordinal_regression: +# train_label_vector = get_target_vector_for_ordinal_regression(train_label, num_classes, device) + +# actual_data = actual_data[0].unsqueeze(0) +# if use_histogram: +# if histogram_seq_len: +# histogram_data = histogram_data[:max(histogram_seq_len, len(histogram_data))] +# actual_data = histogram_data.unsqueeze(0) + +# # if not in training process +# if not optimizer: +# decoded_output, y_pred, shared_out = multitask_lerner_model(student_key, +# str(np.argmax(branching_scores[student_key])), +# actual_data, +# covariate_data if use_covariates else None) + +# # decoded output is `None` if training on only co-variates. 
+# reconstruction_loss = reconstruction_criterion(actual_data, decoded_output) if decoded_output is not None else object_generator.get_tensor_on_correct_device([0]) +# total_reconstruction_loss += reconstruction_loss.item() + +# classification_loss = None +# if ordinal_regression: +# classification_loss = classification_criterion(y_pred, train_label_vector) +# else: +# classification_loss = classification_criterion(y_pred, train_label) + +# total_classification_loss += classification_loss.item() + +# joint_loss = alpha * reconstruction_loss + beta * classification_loss +# total_joint_loss += joint_loss.item() + +# labels.append(train_label) +# predicted_class = get_predicted_class(y_pred, ordinal_regression=ordinal_regression) +# predictions.append(predicted_class) +# users.append(student_id) + +# continue + +# # compute out for first branching block, with the autoencoder out +# decoded_output, y_pred, shared_out = multitask_lerner_model(student_key, +# '0', +# actual_data, +# covariate_data if use_covariates else None) + +# # decoded output is `None` if training on only co-variates. 
+# reconstruction_loss = reconstruction_criterion(actual_data, decoded_output) if decoded_output is not None else object_generator.get_tensor_on_correct_device([0]) +# total_reconstruction_loss += reconstruction_loss.item() + +# classification_loss = None +# if ordinal_regression: +# classification_loss = classification_criterion(y_pred, train_label_vector) +# else: +# classification_loss = classification_criterion(y_pred, train_label) + +# # Iterate over the rest branching blocks, find the best block +# best_branch_ind = 0 +# best_branch_loss = classification_loss +# for branch_id in range(1, num_branches): +# y_pred = multitask_lerner_model( +# student_key, +# str(branch_id), +# actual_data, +# covariate_data if use_covariates else None, +# shared_out=shared_out, +# has_shared=True +# ) + +# # compute loss +# classification_loss = None +# if ordinal_regression: +# classification_loss = classification_criterion(y_pred, train_label_vector) +# else: +# classification_loss = classification_criterion(y_pred, train_label) + +# if classification_loss.item() > best_branch_loss.item(): +# best_branch_loss = classification_loss +# best_branch_ind = branch_id + +# branching_scores[student_key][best_branch_ind] += 1 + +# total_classification_loss += best_branch_loss.item() + +# joint_loss = alpha * reconstruction_loss + beta * best_branch_loss +# total_joint_loss += joint_loss.item() + +# # Optimize +# multitask_lerner_model.zero_grad() +# joint_loss.backward() +# optimizer.step() + +# labels.append(train_label) +# predicted_class = get_predicted_class(y_pred, ordinal_regression=ordinal_regression) +# predictions.append(predicted_class) +# users.append(student_id) + +# return total_joint_loss, total_reconstruction_loss, total_classification_loss, labels, predictions, users + +######################################################################################################################## + +def validate_key_set_str(key_set: str): + assert key_set in ['test_ids', 
'val_ids', 'train_ids'], "Invalid Key Set. Must be one of 'test_ids', 'val_ids' or 'train_ids'!" + + +def evaluate_set(data, key_set: str, model, criterion, optimizer=None, train_covariates=False): + validations.validate_data_dict_keys(data) + validate_key_set_str(key_set) + total_loss = 0 + labels = [] + predictions = [] + + if not optimizer: + model.eval() + else: + model.train() + + for key in data[key_set]: + actual_data, covariate_data, train_label = data['data'][key] + y_pred = model(actual_data, covariate_data) if train_covariates else model(actual_data) + y_pred_unqueezed = y_pred.unsqueeze(0) + loss = criterion(y_pred_unqueezed, train_label) + total_loss += loss.item() + + # Check if training + if criterion and optimizer: + model.zero_grad() + loss.backward() + optimizer.step() + + labels.append(train_label) + _, max_idx = y_pred.max(0) + predictions.append(max_idx) + + return total_loss, labels, predictions + + +def evaluate_autoencoder_set(data, key_set: str, autoencoder, criterion, optimizer, use_histogram=False): + validate_key_set_str(key_set) + + total_loss = 0 + decoded_outputs = {} + + for key in data[key_set]: + if use_histogram: + input_seq = data['data'][key][HISTOGRAM_IDX_AFTER_TENSORIFY] + else: + input_seq = data['data'][key][0][0].unsqueeze(0) + + decoded_output = autoencoder(input_seq) + decoded_outputs[key] = decoded_output + + loss = criterion(input_seq, decoded_output) + total_loss += loss.item() + + # Clear gradients left over from the previous sample so they do not accumulate across steps. + autoencoder.zero_grad() + loss.backward() + optimizer.step() + + return total_loss, decoded_outputs + + +def evaluate_multitask_learner(data, + key_set: str, + num_classes, + multitask_lerner_model, + reconstruction_criterion, + classification_criterion, + device, + optimizer=None, + alpha=1, + beta=1, + use_histogram=False, + histogram_seq_len=None, + ordinal_regression=False, + use_covariates=True): + validations.validate_data_dict_keys(data) + validate_key_set_str(key_set) + + total_reconstruction_loss = 0 + total_classification_loss = 0 + total_joint_loss = 0 + + labels = [] +
predictions = [] + users = [] + + if not optimizer: + multitask_lerner_model.eval() + else: + multitask_lerner_model.train() + + outs = list() + for key in data[key_set]: + student_id = conversions.extract_student_id_from_key(key) + student_key = 'student_' + str(student_id) + actual_data, covariate_data, histogram_data, train_label = data['data'][key] + + if ordinal_regression: + train_label_vector = get_target_vector_for_ordinal_regression(train_label, num_classes, device) + + actual_data = actual_data[0].unsqueeze(0) + if use_histogram: + if histogram_seq_len: + histogram_data = histogram_data[:min(histogram_seq_len, len(histogram_data))] + actual_data = histogram_data.unsqueeze(0) + + decoded_output, y_pred = multitask_lerner_model(student_key, + '-1', + actual_data, + covariate_data if use_covariates else None) + outs.append(y_pred.cpu().detach().numpy()) + + # decoded output is `None` if training on only co-variates. + reconstruction_loss = reconstruction_criterion(actual_data, decoded_output) if decoded_output is not None else object_generator.get_tensor_on_correct_device([0]) + total_reconstruction_loss += reconstruction_loss.item() + + if ordinal_regression: + classification_loss = classification_criterion(y_pred, train_label_vector) + else: + classification_loss = classification_criterion(y_pred, train_label) + + total_classification_loss += classification_loss.item() + + joint_loss = alpha * reconstruction_loss + beta * classification_loss + total_joint_loss += joint_loss.item() + + # Check if training + if optimizer: + multitask_lerner_model.zero_grad() + joint_loss.backward() + optimizer.step() + + labels.append(train_label) + predicted_class = get_predicted_class(y_pred, ordinal_regression=ordinal_regression) + predictions.append(predicted_class) + users.append(student_id) + + return total_joint_loss, total_reconstruction_loss, total_classification_loss, labels, predictions, users, outs + + +def evaluate_multitask_learner_per_user(data, + key_set: 
str, + num_classes, + multitask_lerner_model_dict, + reconstruction_criterion, + classification_criterion, + device, + optimize=False, + alpha=1, + beta=1, + use_histogram=False, + histogram_seq_len=None, + ordinal_regression=False, + use_covariates=True): + validations.validate_data_dict_keys(data) + validate_key_set_str(key_set) + + total_reconstruction_loss = 0 + total_classification_loss = 0 + total_joint_loss = 0 + + labels = [] + predictions = [] + users = [] + + for key in data[key_set]: + student_id = conversions.extract_student_id_from_key(key) + + multitask_lerner_model, optimizer = multitask_lerner_model_dict[student_id] + + if not optimize: + multitask_lerner_model.eval() + else: + multitask_lerner_model.train() + + student_key = 'student_' + str(student_id) + actual_data, covariate_data, histogram_data, train_label = data['data'][key] + + if ordinal_regression: + train_label_vector = get_target_vector_for_ordinal_regression(train_label, num_classes, device) + + actual_data = actual_data[0].unsqueeze(0) + if use_histogram: + if histogram_seq_len: + histogram_data = histogram_data[:min(histogram_seq_len, len(histogram_data))] + actual_data = histogram_data.unsqueeze(0) + + decoded_output, y_pred = multitask_lerner_model(student_key, + actual_data, + covariate_data if use_covariates else None) + + # decoded output is `None` if training on only co-variates. 
+ reconstruction_loss = reconstruction_criterion(actual_data, decoded_output) if decoded_output is not None else object_generator.get_tensor_on_correct_device([0]) + total_reconstruction_loss += reconstruction_loss.item() + + if ordinal_regression: + classification_loss = classification_criterion(y_pred, train_label_vector) + else: + classification_loss = classification_criterion(y_pred, train_label) + + total_classification_loss += classification_loss.item() + + joint_loss = alpha * reconstruction_loss + beta * classification_loss + total_joint_loss += joint_loss.item() + + # Check if training + if optimize: + multitask_lerner_model.zero_grad() + joint_loss.backward() + optimizer.step() + + labels.append(train_label) + predicted_class = get_predicted_class(y_pred, ordinal_regression=ordinal_regression) + predictions.append(predicted_class) + users.append(student_id) + + return total_joint_loss, total_reconstruction_loss, total_classification_loss, labels, predictions, users + + +def evaluate_multitask_lstm_learner(data, + key_set: str, + multitask_lerner_model, + classification_criterion, + optimizer=None, + use_histogram=False): + validations.validate_data_dict_keys(data) + validate_key_set_str(key_set) + + total_classification_loss = 0 + + labels = [] + predictions = [] + users = [] + + if not optimizer: + multitask_lerner_model.eval() + else: + multitask_lerner_model.train() + + for key in data[key_set]: + student_id = conversions.extract_student_id_from_key(key) + student_key = 'student_' + str(student_id) + actual_data, covariate_data, histogram_data, train_label = data['data'][key] + actual_data = actual_data[0].unsqueeze(0) + if use_histogram: + actual_data = histogram_data.unsqueeze(0) + y_pred = multitask_lerner_model(student_key, actual_data, covariate_data) + + classification_loss = classification_criterion(y_pred, train_label) + total_classification_loss += classification_loss.item() + + # Check if training + if optimizer: + 
multitask_lerner_model.zero_grad() + classification_loss.backward() + optimizer.step() + + labels.append(train_label) + y_pred_squeezed = y_pred.squeeze(0) + _, max_idx = y_pred_squeezed.max(0) + predictions.append(max_idx) + users.append(student_id) + + return total_classification_loss, labels, predictions, users + + +def is_reconstruction_loss_available(y_pred): + if isinstance(y_pred, tuple) and len(y_pred) == 2: + return True + return False + + +def get_target_vector_for_ordinal_regression(train_label, num_classes, device): + label_val = train_label.item() + 1 + new_target_vector = torch.ones(label_val, dtype=torch.float, device=device) + + if new_target_vector.shape[-1] < num_classes: + zeroes = torch.zeros(num_classes - label_val, dtype=torch.float, device=device) + new_target_vector = torch.cat([new_target_vector, zeroes], 0) + + return new_target_vector.unsqueeze(0) + + +def get_predicted_class(y_pred, ordinal_regression=False, or_threshold=0.5): + y_pred_squeezed = y_pred.squeeze(0) + + if ordinal_regression: + predicted_class = y_pred_squeezed.ge(or_threshold).sum().int() + + else: + _, predicted_class = y_pred_squeezed.max(0) + + return predicted_class + diff --git a/src/bin/user_statistics.py b/src/bin/user_statistics.py new file mode 100644 index 0000000..4799960 --- /dev/null +++ b/src/bin/user_statistics.py @@ -0,0 +1,36 @@ +from sklearn import metrics +from src import definitions +from src.bin import validations + + +def user_confusion_matrix(user_data): + """ + + @param user_data: Data for single user + @return: Confusion matrix as list of lists. 
+ """ + validations.validate_user_data(user_data) + validations.validate_single_values_column_in_df(user_data, 'user') + + return metrics.confusion_matrix(user_data['label'], user_data['prediction'], labels=definitions.LABELS) + + +def user_f1_score(user_data): + validations.validate_user_data(user_data) + validations.validate_single_values_column_in_df(user_data, 'user') + + return metrics.f1_score(user_data['label'], user_data['prediction'], average='weighted') + + +def user_accuracy(user_data): + validations.validate_user_data(user_data) + validations.validate_single_values_column_in_df(user_data, 'user') + + return metrics.accuracy_score(user_data['label'], user_data['prediction']) + + +def label_count(user_data): + validations.validate_user_data(user_data) + validations.validate_single_values_column_in_df(user_data, 'user') + + return len(user_data['label']) diff --git a/src/bin/validations.py b/src/bin/validations.py new file mode 100644 index 0000000..4f46c45 --- /dev/null +++ b/src/bin/validations.py @@ -0,0 +1,114 @@ +""" +Stores all validations required by the Lib. +""" +import pandas as pd + +from src import definitions + +DATA_DICT_KEYS = ['data', 'train_ids', 'test_ids', 'val_ids'] + + +def check_if_enough_indices_in_data_frame(training_vales: pd.DataFrame, time_indices_to_keep): + """ + + @brief: Checks if the data frame has the indices required. + This is done by intersection operation of the indices. + @return: True, if enough data available. 
+ """ + required_len = len(time_indices_to_keep) + intersection_len = len(training_vales.index.intersection(time_indices_to_keep)) + + return required_len == intersection_len + + +def check_if_all_columns_present_in_df(df: pd.DataFrame, columns: list): + return all(col in df.columns for col in columns) + + +def check_if_element_in_list(element, src_list: list): + assert element in src_list, "Element: {} not present is the given list".format(element) + + +def check_if_key_present_in_dict(key, src_dict: dict): + check_if_element_in_list(key, list(src_dict.keys())) + + +def validate_student_id_in_data(*data: pd.DataFrame): + for df in data: + assert "student_id" in df.columns, "Invalid data. missing column 'student_id'." + + +def validate_config_key(*keys: str, config): + for key in keys: + assert key in config, "Invalid config!. Key: {} not present in config.".format(key) + + +def validate_student_id_in_data_as_first_col(*data: pd.DataFrame): + validate_student_id_in_data(*data) + + for df in data: + assert "student_id" == df.columns[0], "First Column in DataFrame is not 'student_id'." + + +def validate_data_integrity_for_len(*data_frame: pd.DataFrame): + data_frames = list(data_frame) + + for df in data_frames[1:]: + assert len(df) == len(data_frame[0]), "Lengths of the DataFrame do not match." + + +def validate_data_dict_keys(data_dict): + assert all([k in DATA_DICT_KEYS for k in data_dict.keys()]) + + +def validate_all_data_present_in_data_dict_for_key(data_dict: dict, key): + validate_data_dict_keys(data_dict) + # Check the tuple stored under the requested key; the first entry used to be checked regardless of `key`. + assert len(data_dict['data'][key]) == definitions.DATA_TUPLE_LEN, \ + "Data Tuple len mismatch. Expected: {} Found: {}. 
If found less than expected, one of these could be missing -'Actual Data', 'Covariate','Missing Flags', 'Time Deltas', 'Label'".format( + definitions.DATA_TUPLE_LEN, + len(data_dict['data'][key])) + + +def validate_all_data_present_in_data_dict(data_dict: dict): + for key in data_dict['data']: + validate_all_data_present_in_data_dict_for_key(data_dict, key) + + +def validate_no_nans_in_tensor(tensor): + assert not (tensor != tensor).any(), "null exists in input!" + + +def validate_all_columns_present_in_data_frame(*data_frames: pd.DataFrame, columns: list): + for df in list(data_frames): + assert len(df.columns) >= len(columns), "More columns requested than available in data frame." + assert all([column in df.columns for column in columns] + ), "These columns missing in data frame: {}".format( + [col for col in columns if col not in df.columns]) + + +def validate_integrity_of_covariates(covariates, covariate_data): + assert (covariates == 0 and covariate_data is None + ) or (covariates > 0 and covariate_data is not None + ), "Mismatch in covariate initialization and covariate data." + + +def validate_single_values_column_in_df(df: pd.DataFrame, column): + """ + Validate that the passed column of the data frame holds exactly one distinct value. + """ + + assert len(df[column].value_counts()) == 1, "Column: {} has multiple value. This must have only one value.".format(column) + + +def validate_user_data(user_data): + assert check_if_all_columns_present_in_df(user_data, ['label', 'user', 'prediction']) + + +def validate_sequential_model_size_parameters(*size_params): + size_params = list(size_params) + for i in range(2, len(size_params), 2): + previous_size = size_params[i - 1] + cur_size = size_params[i] + + assert previous_size == cur_size, "Miss-match in size parameters of the network." 
diff --git a/src/configurations/data_manager_config.yaml b/src/configurations/data_manager_config.yaml new file mode 100644 index 0000000..1ff26ee --- /dev/null +++ b/src/configurations/data_manager_config.yaml @@ -0,0 +1,67 @@ +student_life_var_binned_data: + # Two splitting strategy supported - Days and Time Delta of hours from the labels. + default_splitting_strategy: 'time_delta' + normalize_strategy: 'mean' + # Default student list for data manager. + student_list: [35] + feature_list: ['epoch_of_day', 'time_since_last_label' , 'time_to_next_label', + 'time_to_next_deadline', 'activity_inference_mode','audio_activity_inference_mode', + 'conv_duration_min_inferred_mode','phonecharge_duration_min_inferred_mode', 'phonelock_duration_min_inferred_mode', + 'dark_duration_min_inferred_mode'] + # The covariates are flattened by taking the nearest value. If empty nothing will be flattened. + covariate_list: ['day_of_week', + 'sleep_rating_robust_sum', + 'hours_slept_robust_sum'] + process_covariates_as_regular_features: False + label_list: ['stress_level_mode'] + train_set_size: 60 + val_set_size: 20 + # Test set is here for just a consistency, it will contain what ever is remaining. + test_set_size: 20 + time_deltas: + time_delta_behind_from_label_h: 24 + time_delta_ahead_from_label_h: 0 + adjust_labels_wrt_median: True + flatten_sequence_to_cols: False + sub_sampling: + sub_sample_count: 5 + # This cannot be more than the actual sequence length. 
+ output_sequence_len: 36 + use_histogram: False + histogram: + epoch_of_day: + simple_aggregates: [] + custom_aggregates: ['mode'] + time_since_last_label: + simple_aggregates: [] + custom_aggregates: ['time'] + time_to_next_label: + simple_aggregates: [] + custom_aggregates: ['time'] + time_to_next_deadline: + simple_aggregates: [] + custom_aggregates: ['time'] + activity_inference_mode: + simple_aggregates: [] + custom_aggregates: ['1', '2'] +# activity_inference_inferred_feature: +# simple_aggregates: [] +# custom_aggregates: ['0', '1', '2', '3'] +# audio_activity_inference_mode: +# simple_aggregates: [] +# custom_aggregates: ['0', '1', '2', '3'] +# audio_activity_inference_inferred_feature: +# simple_aggregates: [] +# custom_aggregates: ['0', '1', '2'] + conv_duration_min_inferred_mode: + simple_aggregates: [] + custom_aggregates: ['1'] + phonecharge_duration_min_inferred_mode: + simple_aggregates: [] + custom_aggregates: ['1'] + phonelock_duration_min_inferred_mode: + simple_aggregates: [] + custom_aggregates: ['1'] +# dark_duration_min_inferred_mode: +# simple_aggregates: [] +# custom_aggregates: ['1'] \ No newline at end of file diff --git a/src/configurations/feature_processing.yaml b/src/configurations/feature_processing.yaml new file mode 100644 index 0000000..30e3098 --- /dev/null +++ b/src/configurations/feature_processing.yaml @@ -0,0 +1,144 @@ +features: + activity_details: + resample_freq_min: 1 + simple_aggregates: [] + custom_aggregates: ["mode", "inferred_feature"] + + audio_details: + resample_freq_min: 1 + simple_aggregates: [] + custom_aggregates: ["mode", "inferred_feature"] + + sms_details: + resample_freq_min: 1 + simple_aggregates: ["count"] + custom_aggregates: [] + + conversation_details_inferred: + resample_freq_min: 1 + simple_aggregates: [] + custom_aggregates: ["mode"] + + phonecharge_details_inferred: + resample_freq_min: 1 + simple_aggregates: [] + custom_aggregates: ["mode"] + + phonelock_details_inferred: + resample_freq_min: 1 + 
simple_aggregates: [] + custom_aggregates: ["mode"] + + dark_details_inferred: + resample_freq_min: 1 + simple_aggregates: [] + custom_aggregates: ["mode"] + +# conversation_details: +# resample_freq_min: 1 +# simple_aggregates: ["sum"] +# custom_aggregates: [] +# +# phonecharge_details: +# resample_freq_min: 5 # Daily +# simple_aggregates: [] +# custom_aggregates: ["robust_sum"] +# +# phonelock_details: +# resample_freq_min: 5 +# simple_aggregates: [] +# custom_aggregates: ["robust_sum"] +# +# dark_details: +# resample_freq_min: 5 +# simple_aggregates: [] +# custom_aggregates: ["robust_sum"] + + sleep_details: + resample_freq_min: 5 + simple_aggregates: [] + custom_aggregates: ["robust_sum"] + + gps_details: + resample_freq_min: 5 + simple_aggregates: [] + custom_aggregates: ["mode"] + + stress_details: + resample_freq_min: 1 + simple_aggregates: [] + custom_aggregates: ["mode"] + +covariates: + day_of_week: True + epoch_of_day: True + time_since_last_label: True + time_to_next_label: True + gender: False + previous_stress_label: True + time_to_next_deadline: True + +feature_imputation_strategy: + # Imputed on the base bin. + # Don't impute feature if not set to true. + impute_features: True + # Always no-op for student_id. 
+ student_id: [] + day_of_week: [] + epoch_of_day: [] + time_since_last_label: ['mean_fill'] + time_to_next_label: ['mean_fill'] + previous_stress_label: ['mode_fill'] + time_to_next_deadline: [] + activity_inference_mode: ['mode_fill'] + activity_inference_inferred_feature: ['mode_fill'] + audio_activity_inference_mode: ['mode_fill'] + audio_activity_inference_inferred_feature: ['mode_fill'] + conv_duration_min_sum: ['forward_fill'] + conv_duration_min_inferred_mode: ['mode_fill'] + phonecharge_duration_min_robust_sum: ['forward_fill', 'mean_fill'] + phonecharge_duration_min_inferred_mode: ['mode_fill'] + phonelock_duration_min_robust_sum: ['forward_fill'] + phonelock_duration_min_inferred_mode: ['mode_fill'] + dark_duration_min_robust_sum: ['forward_fill'] + dark_duration_min_inferred_mode: ['mode_fill'] + hours_slept_robust_sum: ['forward_fill', 'mean_fill'] + sleep_rating_robust_sum: ['forward_fill', 'mean_fill'] + latitude_mode: ['forward_fill'] + longitude_mode: ['forward_fill'] + sms_instance_count: ['mode_fill'] + stress_level_mode: [] + +explode_duration_based_features: + conversation_details: + explode_freq: 1 + phonecharge_details: + explode_freq: 1 + phonelock_details: + explode_freq: 1 + dark_details: + explode_freq: 1 + +# Todo(@abhinavshaw) : Check this feature out. Data Doesn't make sense. +# call_log_details: +# resample_freq_min: 60 +# simple_aggregates: ["count"] +# custom_aggregates: [] +# +# sms_details: +# resample_freq_min: 15 +# simple_aggregates: [] +# custom_aggregates: ["mode"] + +students: + # We use data only for students that have good data. +# student_list: [53, 46, 7, 49, 22, 24, 2] + student_list: [ 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 27, 30, 31, 32, 33, 34, 35, 41, 42, 44, 45, 46,47, 49, 50, 51, 52, 53, 54, 57, 58, 59] +# This list will override the student_list while selecting students. 
+ student_ignore_list: [0,3,13] + +data_paths: + cluster_data_path: "/mnt/nfs/scratch1/abhinavshaw/data" + +# This is true in gypsum cluster. +cluster_mode: True diff --git a/src/configurations/grid_search.yaml b/src/configurations/grid_search.yaml new file mode 100644 index 0000000..1ef8210 --- /dev/null +++ b/src/configurations/grid_search.yaml @@ -0,0 +1,28 @@ +#model_name: +# hyper_parameter_1: [List of things] +# hyper_parameter_2: [List of things] +# hyper_parameter_3: SomeThing + +data_file_name: "training_date_normalized_shuffled_splits_select_features_no_prev_stress_all_students.pkl" + +multitask_learner_auto_encoder: + alpha: [0.0001, 0.005] + beta: [1] + autoencoder_bottle_neck_feature_size: [128, 256] + autoencoder_num_layers: [1] + shared_hidden_layer_size: [256, 512] + user_dense_layer_hidden_size: [64, 128, 256] + num_classes: [3] + shared_layer_dropout_prob: [0] + user_head_dropout_prob: [0, 0.15] + learning_rate: [0.00005] + n_epochs: [300] + decay: [0.0001] + # Data + hitogram_seq_len: [24] + # Loss + reconstruction_loss_reduction: ['sum'] + use_histogram: [True] + ordinal_regression_head: [False] + train_only_with_covariates: [False] + class_weights: [[0.95, 0.8, 1]] diff --git a/src/configurations/model_config.yaml b/src/configurations/model_config.yaml new file mode 100644 index 0000000..d3ebbee --- /dev/null +++ b/src/configurations/model_config.yaml @@ -0,0 +1,62 @@ +gru_d: + classes: 5 + # This is actually the Number of sequences to be considered. 
+ num_layers: 1440 + x_mean: [0.0, 0.0, 10.0, 10.0, 0.0, 600.0, 0.0, 0.0, 8.0, 3.0] + learning_rate: 0.2 + learning_rate_decay: 100 + epochs: 20 + student_list: [35] +lstm_n_multitask: + use_histogram: True + autoencoder_bottle_neck_feature_size: 128 + autoencoder_num_layers: 1 + shared_hidden_layer_size: 256 + user_dense_layer_hidden_size: 64 + num_classes: 3 + decay: 0.0001 + shared_layer_dropout_prob: 0.00 + user_head_dropout_prob: 0.00 + + lstm_classifier: + alpha: 0 + beta: 1 + learning_rate: 0.000005 + n_epochs: 500 + bidirectional: True + lstm_classifier_per_user: + alpha: 0 + beta: 1 + learning_rate: 0.000005 + n_epochs: 300 + bidirectional: True + autoencoder_classifier: + alpha: 0.0001 + beta: 1 + learning_rate: 0.000005 + n_epochs: 500 + bidirectional: True + multitask_lstm: + alpha: 0 + beta: 1 + learning_rate: 0.000001 + n_epochs: 500 + bidirectional: True + multitask_autoencoder: + alpha: 0.001 + beta: 1 + learning_rate: 0.000001 + n_epochs: 500 + bidirectional: True + multitask_covariate: + alpha: 0.001 + beta: 1 + learning_rate: 0.000001 + n_epochs: 500 + bidirectional: True + multitask_no_covariate: + alpha: 0.0001 + beta: 1 + learning_rate: 0.000001 + n_epochs: 500 + bidirectional: True \ No newline at end of file diff --git a/src/data_manager/__init__.py b/src/data_manager/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/data_manager/__pycache__/__init__.cpython-39.pyc b/src/data_manager/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..337fd50 Binary files /dev/null and b/src/data_manager/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/data_manager/__pycache__/helper.cpython-39.pyc b/src/data_manager/__pycache__/helper.cpython-39.pyc new file mode 100644 index 0000000..c2d661c Binary files /dev/null and b/src/data_manager/__pycache__/helper.cpython-39.pyc differ diff --git a/src/data_manager/__pycache__/splitter.cpython-39.pyc b/src/data_manager/__pycache__/splitter.cpython-39.pyc new 
file mode 100644 index 0000000..70ce9b3 Binary files /dev/null and b/src/data_manager/__pycache__/splitter.cpython-39.pyc differ diff --git a/src/data_manager/cross_val.py b/src/data_manager/cross_val.py new file mode 100644 index 0000000..7f0037d --- /dev/null +++ b/src/data_manager/cross_val.py @@ -0,0 +1,155 @@ +import numpy as np + +from random import shuffle +from sklearn.model_selection import train_test_split +from sklearn.model_selection import StratifiedKFold +from src.utils import data_conversion_utils as conversions + + +SPLITTER_RANDOM_STATE = 100 + + +def random_stratified_splits(data: dict, stratify_by="labels"): + """ + + @param data: Data in classic dict format. + @param stratify_by: By what the splits need to be stratified. + Accepts - `labels` and `students`. + @return: Return splits which are randomize and stratified by labels. + """ + keys, labels = conversions.extract_keys_and_labels_from_dict(data) + keys, labels = np.array(keys), np.array(labels) + + if stratify_by == "students": + student_ids = conversions.extract_student_ids_from_keys(keys) + stratification_array = np.array(student_ids) + else: + stratification_array = labels + + (X_train, X_test, + y_train, y_test, + stratification_array, new_stratification_array) = train_test_split(keys, + labels, + stratification_array, + test_size=0.40, + shuffle=True, + stratify=stratification_array) + X_val, X_test = train_test_split(X_test, + test_size=0.40, + shuffle=True, + stratify=new_stratification_array) + + return X_train.tolist(), X_val.tolist(), X_test.tolist() + +# helper function, extract first n weeks/days data: +def get_first_n_data(keys, n): + month_days = {0: 0, 1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30, 7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31} # map: month -> # days + + begin_interval = n # how many data used for the leaved out student + start_day = -1 + + key_list = list() + val_keys = list() + for key in keys: + month = int(key.split('_')[1]) + day = int(key.split('_')[2]) + 
curr_day = sum([month_days[i] for i in range(month + 1)]) + day + if start_day < 0: + start_day = curr_day + else: + if curr_day - start_day >= begin_interval: + val_keys.append(key) + else: + key_list.append(key) + return key_list, val_keys + +def leave_one_subject_out_split(data: dict, days_include=0): + """ + @param data: data for which the splits are needed to be generated. + @param days_include: the number of days of + left-out data included in the training + @return: Return list of dictionary (map: train_ids, val_ids -> data_keys) + """ + print('########## leave one out split ##########') + splits = list() + + data_keys = data['data'].keys() + + student_key = dict() # map: id -> keys + for key in data_keys: + try: + student_key[key.split('_')[0]].append(key) + except KeyError: + student_key[key.split('_')[0]] = [key] + + #for student in student_key: + for student in student_key: + splitting_dict = dict() + splitting_dict['train_ids'] = list() + for rest_student in student_key: + if rest_student != student: + splitting_dict['train_ids'] += student_key[rest_student] + else: + loo_train_keys, loo_val_keys = get_first_n_data(student_key[rest_student], days_include) + splitting_dict['train_ids'] += loo_train_keys + splitting_dict['val_ids'] = loo_val_keys + splits.append(splitting_dict) + + return splits + + +def get_k_fod_cross_val_splits_stratified_by_students(data: dict, groups:dict, n_splits=5, + stratification_type="students"): + """ + @param data: data for which the splits are needed to be generated. 
+ @param groups: map: student_ids -> groups ids + @param n_splits: number of split + @param stratification_type: determine the criteria for stratified split + @return: Return list of dictionary (map: train_ids, val_ids -> data_keys) + """ + + print('########## k_fold stratification split, stratified by: ' + stratification_type + '############') + print('split n: ' + str(n_splits)) + splits = list() + + data_keys = data['data'].keys() + + # determine values in stratified column + stratification_column = list() + pos = 0 if stratification_type == "students" else -1 if stratification_type == 'labels' else None + if pos is not None: + for key in data_keys: + stratification_column.append(int(key.split('_')[pos])) + elif stratification_type == 'student_label': + keys, labels = conversions.extract_keys_and_labels_from_dict(data) + student_ids = conversions.extract_student_ids_from_keys(keys) + for i in range(len(student_ids)): + stratification_column.append(str(student_ids[i]) + "_" + str(labels[i])) + else: + print('No such kind of criteria for splitting!!!') + exit() + + # splitting (shuffle=True is needed for random_state to take effect; scikit-learn rejects random_state without it) + data_keys = np.array(list(data_keys)) + stratification_column = np.array(list(stratification_column)) + splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SPLITTER_RANDOM_STATE) + for train_index, val_index in splitter.split(X=data_keys, y=stratification_column): + + splitting_dict = dict() + splitting_dict['train_ids'] = data_keys[train_index].tolist() + splitting_dict['val_ids'] = data_keys[val_index].tolist() + splits.append(splitting_dict) + + return splits + + +def filter_student_id(filter_by_student_ids, student_id_list, labels_list, data_keys): + filtered_student_id_list, filtered_labels_list, filtered_data_keys = [], [], [] + + for idx, student_id in enumerate(student_id_list): + if conversions.convert_to_int_if_str(student_id) in filter_by_student_ids: + filtered_student_id_list.append(student_id) + filtered_labels_list.append(labels_list[idx]) + 
filtered_data_keys.append(data_keys[idx]) + + return filtered_student_id_list, filtered_labels_list, filtered_data_keys diff --git a/src/data_manager/generate_data.py b/src/data_manager/generate_data.py new file mode 100644 index 0000000..6e81413 --- /dev/null +++ b/src/data_manager/generate_data.py @@ -0,0 +1,11 @@ +from src.data_manager import student_life_var_binned_data_manager as data_manager +from src.bin import statistics +from src.utils import write_utils + +student_list = [4, 7, 8, 10, 14, 16, 17, 19, 22, 23, 24, 32, 33, 35, 36, 43, 44, 49, 51, 52, 53, 57, 58] +data = data_manager.get_data_for_training_in_dict_format(*student_list, normalize=True, fill_na=True, + flatten_sequence=False, split_type='percentage') +print(statistics.get_train_test_val_label_counts_from_raw_data(data)) + +write_utils.data_structure_to_pickle(data, + '../data/training_data/shuffled_splits/training_data_normalized_no_prev_stress_students_greater_than_40_labels.pkl') diff --git a/src/data_manager/helper.py b/src/data_manager/helper.py new file mode 100644 index 0000000..5687d8e --- /dev/null +++ b/src/data_manager/helper.py @@ -0,0 +1,81 @@ +import pandas as pd + +from src import definitions +from src.bin import validations +from src.utils import read_utils +from src.data_processing import helper as processing_helper + +VAR_BINNED_DATA_CONFIG = read_utils.read_yaml(definitions.DATA_MANAGER_CONFIG_FILE_PATH)[ + definitions.VAR_BINNED_DATA_MANAGER_ROOT] + +TIME_DELTA_BEHIND_FROM_LABEL_H = VAR_BINNED_DATA_CONFIG['time_deltas']['time_delta_behind_from_label_h'] +TIME_DELTA_AHEAD_FROM_LABEL_H = VAR_BINNED_DATA_CONFIG['time_deltas']['time_delta_ahead_from_label_h'] +TIME_DELTA_BEHIND_FROM_LABEL_H = pd.Timedelta(str(TIME_DELTA_BEHIND_FROM_LABEL_H) + ' hours') +TIME_DELTA_AHEAD_FROM_LABEL_H = pd.Timedelta(str(TIME_DELTA_AHEAD_FROM_LABEL_H) + ' hours') +USE_HISTOGRAM = VAR_BINNED_DATA_CONFIG['use_histogram'] +HISTOGRAM_CONFIGS = VAR_BINNED_DATA_CONFIG['histogram'] + + +def 
def get_data_for_single_day(training_values, covariate_values, missing_values,
                            time_delta, y_labels, label_idx):
    """
    Return the split for a single day, i.e. one label corresponds to several data points.

    All rows of training_values, missing_values and time_delta that fall on the
    calendar day of label_idx are selected; covariates and the label are read
    at label_idx itself.

    @param label_idx: timestamp of the label for which the split is calculated.
    @return: tuple (training rows, missing rows, time delta rows,
             covariate row, label value).
    """
    day_string = label_idx.to_pydatetime().strftime('%Y-%m-%d')

    # todo(@abhinavshaw): For covariates adjust with delta as you can carry the original label here.
    day_training = training_values.loc[day_string, :].values.tolist()
    day_missing = missing_values.loc[day_string, :].values.tolist()
    day_time_delta = time_delta.loc[day_string, :].values.tolist()
    label_covariates = covariate_values.loc[label_idx, :].values.tolist()
    label_value = y_labels.loc[label_idx].values.tolist()[0]

    return day_training, day_missing, day_time_delta, label_covariates, label_value


def get_data_for_single_label_based_on_time_delta(training_values, covariate_values, missing_values,
                                                  time_delta, y_labels, label_idx):
    """
    Return the data in the window
    [label_idx - TIME_DELTA_BEHIND_FROM_LABEL_H, label_idx + TIME_DELTA_AHEAD_FROM_LABEL_H).

    @param label_idx: timestamp of the label for which the window is calculated.
    @return: None if `validations.check_if_enough_indices_in_data_frame` reports
             that not enough data is available; otherwise tuple (training rows,
             missing rows, time delta rows, covariate row one step before the
             label, histogram rows, label value).
    """
    window_end = label_idx + TIME_DELTA_AHEAD_FROM_LABEL_H
    time_indices_to_keep = pd.date_range(label_idx - TIME_DELTA_BEHIND_FROM_LABEL_H,
                                         window_end,
                                         freq=definitions.DEFAULT_BASE_FREQ)
    # Left-closed window: drop the end point if it falls on the grid. This is
    # what `closed="left"` did; that keyword was removed in pandas 2.0, and this
    # form works on every pandas version.
    if len(time_indices_to_keep) > 0 and time_indices_to_keep[-1] == window_end:
        time_indices_to_keep = time_indices_to_keep[:-1]

    # No-op if enough data is not available.
    if not validations.check_if_enough_indices_in_data_frame(training_values,
                                                             time_indices_to_keep):
        return

    training_values = training_values.reindex(time_indices_to_keep)
    missing_values = missing_values.reindex(time_indices_to_keep)
    time_delta = time_delta.reindex(time_indices_to_keep)
    # Step between two consecutive rows; covariates are read one step before the label.
    delta = training_values.index[1] - training_values.index[0]
    histogram_values = get_histogram(training_values)

    return (training_values.values.tolist(),
            missing_values.values.tolist(),
            time_delta.values.tolist(),
            covariate_values.loc[label_idx - delta, :].values.tolist(),
            histogram_values.values.tolist(),
            y_labels.loc[label_idx].values.tolist()[0])


def get_histogram(training_values: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate training_values into 60 minute bins using the rules configured in
    HISTOGRAM_CONFIGS. Features that are not present in the config are skipped.

    @return: resampled data frame whose multi-level column names are joined with '_'.
    """
    # "60min" is the same rule as the legacy "60T" alias, which newer pandas deprecates.
    resampler = training_values.resample(rule="60min")
    rule = {}

    for feature in training_values.columns:
        # Skip if not in config. A simple way to control what feature to process.
        if feature not in HISTOGRAM_CONFIGS.keys():
            continue

        rule.update(processing_helper.get_aggregation_rule_for_histogram(feature,
                                                                         HISTOGRAM_CONFIGS[feature]))

    resampled_data = resampler.agg(rule)
    resampled_data.columns = ['_'.join(col).strip() if 'student_id' not in col else 'student_id'
                              for col in resampled_data.columns.values]

    return resampled_data
def split_data_by_percentage(data_list, start_index: int = 0, percent: float = -1):
    """
    Slice a percentage of data_list and return the keys of the slice.

    @param data_list: The data for which slice is required, a list of (key, data) pairs.
    @param start_index: all indices before this are not considered for slicing.
    @param percent: Percentage of data that contributes to the slice. If percent = -1,
                    then everything from start_index to len(data) is returned.
    @return: tuple (keys of the slice, end_index). For a percentage slice end_index is
             the exclusive end of the slice (the next start_index); for percent = -1 it
             is len(data_list) - 1.
    """
    data_len = len(data_list)

    assert 0 < percent <= 100 or percent == -1, "Percent value must be between 1 and 100 but got {}".format(percent)
    assert 0 <= start_index < data_len

    if percent == -1:
        data_slice = data_list[start_index:]
        # NOTE(review): this is the last valid index, not an exclusive end like in
        # the percentage branch. Kept as is because callers may rely on it.
        end_index = data_len - 1
    else:
        slice_length = round(data_len * percent / 100)
        # `<=` (previously `<`): a slice that ends exactly at the end of the list,
        # e.g. percent=100 from start_index=0, is not an overflow.
        assert start_index + slice_length <= data_len, "Over flow of data list. " \
                                                       "Enter smaller percent value or reduce the start_index."
        end_index = start_index + slice_length
        data_slice = data_list[start_index:end_index]

    data_slice_keys = [month_day_hour_key for month_day_hour_key, data in data_slice]

    return data_slice_keys, end_index


def split_data_by_date_range(data_list, start_date: str = None, end_date: str = None):
    """
    @attention end_date is not included in the slice. When end_date is None the date of
               the last entry is used as end_date, so the last entry is excluded as well.
    @param data_list: Data list for which a slice is required, a list of (date_key, data) pairs.
    @param start_date: Start date key of the slice. Defaults to the date of the first entry.
    @param end_date: End date key of the slice. Defaults to the date of the last entry.
    @return: keys of data_list whose date lies in [start_date, end_date).
    """
    # Nothing to slice; also avoids IndexError when a bound defaults to the first/last entry.
    if not data_list:
        return []

    first_key, _ = data_list[0]
    last_key, _ = data_list[-1]
    range_start = datetime_key_to_date(first_key if start_date is None else start_date)
    range_end = datetime_key_to_date(last_key if end_date is None else end_date)

    return [date_key for date_key, data in data_list
            if range_start <= datetime_key_to_date(date_key) < range_end]


def datetime_key_to_date(date_key, year: int = 2013):
    """
    Convert a "month_day_hour" key into a datetime.

    @param date_key: string of the form "<month>_<day>_<hour>".
    @param year: year of the resulting datetime; the keys carry no year themselves.
    @return: datetime for the key in the given year.
    """
    month, day, hour = tuple(map(int, date_key.split("_")))
    return datetime(year=year, month=month, day=day, hour=hour)


def get_data_split_by_percentage(data_list):
    """
    Split data_list into train, val and test keys using three consecutive rounds
    of percentage slices. The final test slice takes everything that is left.

    @return: tuple (train keys, val keys, test keys).
    """
    # (train %, val %, test %) per round; -1 means "rest of the data".
    percent_plan = ((25, 15, 1), (25, 15, 1), (10, 1, -1))

    train_set, val_set, test_set = [], [], []
    end_idx = 0
    for train_percent, val_percent, test_percent in percent_plan:
        keys, end_idx = split_data_by_percentage(data_list, start_index=end_idx, percent=train_percent)
        train_set += keys
        keys, end_idx = split_data_by_percentage(data_list, start_index=end_idx, percent=val_percent)
        val_set += keys
        keys, end_idx = split_data_by_percentage(data_list, start_index=end_idx, percent=test_percent)
        test_set += keys

    return train_set, val_set, test_set


def get_data_split_by_date(data_list):
    """
    Split data_list into train, val and test keys using fixed date windows
    before, during and after the midterm.

    @return: tuple (train keys, val keys, test keys).
    """
    date_windows = (
        # Before midterm.
        ('train', None, '04_10_0'),
        ('val', '04_10_0', '04_16_0'),
        ('test', '04_16_0', '04_17_0'),
        # During midterm.
        ('train', '04_17_0', '04_27_0'),
        ('val', '04_27_0', '04_30_0'),
        ('test', '04_30_0', '05_2_0'),
        # After midterm.
        ('train', '05_2_0', '05_11_0'),
        ('val', '05_11_0', '05_13_0'),
        # NOTE(review): data between 05_13 and 05_16 is in no split - confirm the gap is intended.
        ('test', '05_16_0', None),
    )

    split_keys = {'train': [], 'val': [], 'test': []}
    for split_name, start_date, end_date in date_windows:
        split_keys[split_name] += split_data_by_date_range(data_list,
                                                           start_date=start_date,
                                                           end_date=end_date)

    return split_keys['train'], split_keys['val'], split_keys['test']
def get_data_based_on_labels_and_splitting_strategy(training_values, covariate_values,
                                                    missing_values, time_delta,
                                                    y_labels, splitting_strategy,
                                                    flatten_sequence_to_cols, normalize=False):
    """
    Build one (key, data) entry per non-null label using the given splitting strategy.

    @param training_values: Training values of students.
    @param covariate_values: Values that need to be processed as covariates.
    @param missing_values: Missing values for one student.
    @param time_delta: Time deltas for one student.
    @param y_labels: Labels for training. Can have null values.
    @param splitting_strategy: Splitting strategy for the data, a key of
           SPLITTING_STRATEGY_FUNCTION_MAP. Current support for
           1) day - Each label will have one day's worth of data.
           2) time_delta - Each label will contain data x hours behind and y hours ahead
              (configurable by data_manager.yaml)
    @param flatten_sequence_to_cols: If true, the sequences are flattened into columns.
    @param normalize: If true, the data list is passed through
           `normalizer.normalize_data_list`. Expensive operation.
    @return: list of ("month_day_hour" key, data) tuples, normalized if requested.
    """
    validations.validate_data_integrity_for_len(training_values, missing_values, time_delta, y_labels)
    assert splitting_strategy in SPLITTING_STRATEGY_FUNCTION_MAP.keys(), \
        "Invalid splitting strategy must be one of: {}".format(SPLITTING_STRATEGY_FUNCTION_MAP.keys())
    split_function = SPLITTING_STRATEGY_FUNCTION_MAP[splitting_strategy]

    # todo(abhinavshaw): make it general for all the labels.
    y_labels = y_labels[y_labels['stress_level_mode'].notnull()]

    data_list = []
    # todo(abihnavshaw): Process on whole data once fixed issue with last label.
    # The last label is deliberately ignored.
    for label_timestamp in y_labels.index[:-1]:
        data = split_function(training_values,
                              covariate_values,
                              missing_values,
                              time_delta,
                              y_labels,
                              label_timestamp)
        # Strategies return nothing when no data is available for the label.
        if not data:
            continue

        month_day_hour_key = str(label_timestamp.month) + '_' + str(label_timestamp.day) + '_' \
                             + str(label_timestamp.hour)
        data = conversions.flatten_data(data) if flatten_sequence_to_cols else data
        data_list.append((month_day_hour_key, data))

    return normalizer.normalize_data_list(data_list, normalize_strat=NORMALIZE_STRAT) if normalize else data_list


def process_student_data(raw_data, student_id: int,
                         splitting_strategy,
                         normalize: bool,
                         fill_na: bool,
                         flatten_sequence: bool,
                         split_type='percentage'):
    """
    Processes student data from a large DF of all students. This data is then transformed
    to the kind acceptable by DBM and VDB.

    @param raw_data: raw data of all students, unpacked into the validations and
           into the actual / missing / time delta extraction.
    @param student_id: student whose data is processed.
    @param splitting_strategy: key of SPLITTING_STRATEGY_FUNCTION_MAP.
    @param normalize: forwarded to get_data_based_on_labels_and_splitting_strategy.
    @param fill_na: if true, missing training values are replaced by -1.
    @param flatten_sequence: if true, sequences are flattened into columns.
    @param split_type: 'percentage' for the percentage split, anything else for the date split.
    @return: tuple (data_list, train keys, val keys, test keys).
    """
    # The message previously said "Feature List" although the label list is checked.
    assert len(LABEL_LIST) == 1, "Label List greater than one, check logic to generate labels."
    validations.validate_student_id_in_data(*raw_data)
    validations.validate_data_integrity_for_len(*raw_data)

    student_data, missing_data, time_delta = conversions.extract_actual_missing_and_time_delta_from_raw_data_for_student(
        raw_data, student_id=student_id)

    validations.validate_all_columns_present_in_data_frame(student_data, missing_data, time_delta, columns=FEATURE_LIST)
    validations.validate_all_columns_present_in_data_frame(student_data, columns=LABEL_LIST)

    # Explicit copies: these frames are modified below and must not be treated
    # as views of student_data (avoids chained-assignment warnings).
    training_values = student_data.loc[:, FEATURE_LIST].copy()

    covariate_values = student_data.loc[:, COVARIATE_LIST].copy()
    covariate_values = covariates.exam_period(covariate_values)

    missing_values = missing_data.loc[:, FEATURE_LIST]
    time_deltas = time_delta.loc[:, FEATURE_LIST]
    y_labels = student_data.loc[:, LABEL_LIST].copy()

    # Additional flags for data processing.
    if ADJUST_LABELS_WRT_MEDIAN:
        y_labels['stress_level_mode'] = y_labels['stress_level_mode'].map(conversions.adjust_classes_wrt_median,
                                                                          na_action='ignore')
        if 'previous_stress_label' in COVARIATE_LIST:
            covariate_values['previous_stress_label'] = covariate_values['previous_stress_label'].map(
                conversions.adjust_classes_wrt_median,
                na_action='ignore')

    # Filling missing Values
    if fill_na:
        training_values = training_values.fillna(value=-1)

    data_list = get_data_based_on_labels_and_splitting_strategy(training_values,
                                                                covariate_values,
                                                                missing_values,
                                                                time_deltas,
                                                                y_labels,
                                                                splitting_strategy,
                                                                flatten_sequence,
                                                                normalize)

    if split_type == 'percentage':
        train_set, val_set, test_set = splitter.get_data_split_by_percentage(data_list)
    else:
        train_set, val_set, test_set = splitter.get_data_split_by_date(data_list)

    return data_list, train_set, val_set, test_set


def get_data_for_training_in_dict_format(*student_ids,
                                         splitting_strategy=DEFAULT_SPLITTING_STRATEGY,
                                         normalize=False,
                                         fill_na=True,
                                         flatten_sequence=False,
                                         split_type='percentage'):
    """
    Process the data of the given students and collect it in a single dictionary.

    @attention: If no student_ids given to function the default students are returned.
    @return: dictionary with the keys "train_ids", "val_ids", "test_ids" (lists of data
             keys prefixed with the student id) and "data" (map: data key -> processed data).
    """
    if not student_ids:
        student_ids = DEFAULT_STUDENT_LIST
    else:
        student_ids = list(student_ids)

    data_dict = {}
    # todo(abhinavshaw) Change to a function.
    # 'data' is registered up front so the key exists even when there is no student to process.
    data = {"train_ids": [], "val_ids": [], "test_ids": [], "data": data_dict}

    raw_data = student_utils.get_var_binned_data_for_students(*student_ids)

    for student_id in student_ids:
        print("Student: {}".format(student_id))
        data_list, train_ids, val_ids, test_ids = process_student_data(raw_data,
                                                                       student_id,
                                                                       splitting_strategy=splitting_strategy,
                                                                       normalize=normalize,
                                                                       fill_na=fill_na,
                                                                       flatten_sequence=flatten_sequence,
                                                                       split_type=split_type)

        # Prefixing the IDs with student_id.
        for month_day, daily_data in data_list:
            data_dict[str(student_id) + "_" + month_day] = daily_data

        train_ids, val_ids, test_ids = student_utils.prefix_list_of_strings_or_ids_with_student_id(train_ids,
                                                                                                   val_ids,
                                                                                                   test_ids,
                                                                                                   student_id=student_id)

        data['train_ids'] += train_ids
        data['val_ids'] += val_ids
        data['test_ids'] += test_ids

    return data
def find_set_for_key(data: dict, key):
    """
    Look up which id list of the data dictionary contains the key.

    @param data: data dictionary with the lists 'train_ids', 'val_ids' and 'test_ids'.
    @param key: data key to look for.
    @return: name of the first list (train, val, test order) that contains the key,
             None if no list contains it.
    """
    for set_name in ('train_ids', 'val_ids', 'test_ids'):
        if key in data[set_name]:
            return set_name

    return None


def get_sub_sampled_sequences(data: dict):
    """
    Create a new data dictionary that holds sub sampled sequences for every key of
    data['data'] that belongs to the train, val or test ids.

    @param data: validated data dictionary.
    @return: new data dictionary filled by `sub_sample_sequences`.
    """
    validations.validate_all_data_present_in_data_dict(data)
    validations.validate_data_dict_keys(data)
    sub_sampled_data = object_generator_utils.get_empty_data_dict()

    for key, data_tuple in data['data'].items():
        owning_set = find_set_for_key(data, key)
        # Keys that are in none of the id lists are dropped.
        if not owning_set:
            continue
        sub_sample_sequences(data_tuple, key, owning_set, sub_sampled_data)

    return sub_sampled_data


def sub_sample_sequences(data_tuple, key, key_set, new_data):
    """
    Draw SUB_SAMPLE_COUNT random sub sequences of length OUT_SEQUENCE_LEN from one data tuple.

    @param data_tuple: data of a single key; indexed with the positions from definitions.
    @param key: original data key, the sample number is appended to it.
    @param key_set: name of the id list in new_data the new keys are appended to.
    @param new_data: Modifies this dictionary in place.
    """
    upper_start_bound = SEQUENCE_LEN - OUT_SEQUENCE_LEN - 1
    start_positions = np.random.randint(0, upper_start_bound, size=SUB_SAMPLE_COUNT)

    sequences = (data_tuple[definitions.ACTUAL_DATA_IDX],
                 data_tuple[definitions.MISSING_FLAGS_IDX],
                 data_tuple[definitions.TIME_DELTA_IDX])
    # Covariates and label are not sequences and are copied over unchanged.
    static_parts = (data_tuple[definitions.COVARIATE_DATA_IDX],
                    data_tuple[definitions.LABELS_IDX])

    for sample_number, start_position in enumerate(start_positions):
        sliced_sequences = slice_sequence(*sequences,
                                          start_idx=start_position,
                                          output_seq_length=OUT_SEQUENCE_LEN)
        sample_key = key + "_" + str(sample_number)
        new_data['data'][sample_key] = sliced_sequences + static_parts
        new_data[key_set].append(sample_key)


def slice_sequence(*data: list, start_idx, output_seq_length):
    """
    Cut the same window out of every given sequence.

    @param data: sequences to slice.
    @param start_idx: first position of the window.
    @param output_seq_length: length of the window.
    @return: tuple with one slice per input sequence.
    """
    validate_if_data_slice_out_of_bound(*data,
                                        start_idx=start_idx,
                                        output_seq_length=output_seq_length)
    stop_idx = start_idx + output_seq_length

    return tuple(datum[start_idx:stop_idx] for datum in data)


def validate_if_data_slice_out_of_bound(*data: list, start_idx, output_seq_length):
    """
    Assert that the window [start_idx, start_idx + output_seq_length) fits into
    every given sequence and that start_idx is not negative.
    """
    assert start_idx >= 0, "Start idx cannot be negative!"
    required_length = start_idx + output_seq_length
    for datum in data:
        assert len(datum) >= required_length, "Sequence length out of bound in list."
a/src/data_processing/__pycache__/aggregates.cpython-39.pyc b/src/data_processing/__pycache__/aggregates.cpython-39.pyc new file mode 100644 index 0000000..3fdbb18 Binary files /dev/null and b/src/data_processing/__pycache__/aggregates.cpython-39.pyc differ diff --git a/src/data_processing/__pycache__/covariates.cpython-39.pyc b/src/data_processing/__pycache__/covariates.cpython-39.pyc new file mode 100644 index 0000000..3ed0400 Binary files /dev/null and b/src/data_processing/__pycache__/covariates.cpython-39.pyc differ diff --git a/src/data_processing/__pycache__/helper.cpython-39.pyc b/src/data_processing/__pycache__/helper.cpython-39.pyc new file mode 100644 index 0000000..6870d4d Binary files /dev/null and b/src/data_processing/__pycache__/helper.cpython-39.pyc differ diff --git a/src/data_processing/__pycache__/imputation.cpython-39.pyc b/src/data_processing/__pycache__/imputation.cpython-39.pyc new file mode 100644 index 0000000..b2a8a28 Binary files /dev/null and b/src/data_processing/__pycache__/imputation.cpython-39.pyc differ diff --git a/src/data_processing/__pycache__/normalizer.cpython-39.pyc b/src/data_processing/__pycache__/normalizer.cpython-39.pyc new file mode 100644 index 0000000..9d2c70c Binary files /dev/null and b/src/data_processing/__pycache__/normalizer.cpython-39.pyc differ diff --git a/src/data_processing/aggregates.py b/src/data_processing/aggregates.py new file mode 100644 index 0000000..2ec7cf0 --- /dev/null +++ b/src/data_processing/aggregates.py @@ -0,0 +1,155 @@ +""" +Package for different aggregate functions. These are used to aggregate +sequential data. With respect to student life this data is usually sequence of +activity, coversation or noise inference values. 
def linear_fit(array_like):
    """Return [slope, intercept] of a degree one fit over the positions, [0, 0] for empty input."""
    if len(array_like) == 0:
        return [0, 0]
    slope, intercept = np.polyfit(np.arange(len(array_like)), array_like, 1)
    return [slope, intercept]


def poly_fit(array_like):
    """Return the three coefficients of a degree two fit over the positions, [0, 0, 0] for empty input."""
    if len(array_like) == 0:
        return [0, 0, 0]
    quadratic, linear, constant = np.polyfit(np.arange(len(array_like)), array_like, 2)
    return [quadratic, linear, constant]


def iqr(array_like):
    """Inter quartile range of the values, 0 if it is not a number."""
    result = quartile_range(array_like)
    return 0 if math.isnan(result) else result


def kurt(array_like):
    """Kurtosis of the values, 0 if it is not a number."""
    result = kurtosis(array_like)
    return 0 if math.isnan(result) else result


def mcr(array_like):
    """Returns how many times the mean has been crossed."""
    centered = array_like - np.mean(array_like)
    sign_changes = np.diff(np.sign(centered)).astype(bool)
    return np.sum(sign_changes)


def fourier_transform(array_like):
    """
    Runs the Fast Fourier transform of the array but does not use its result.

    @return: always 0.
    """
    fft(array_like)
    return 0


def mode(array_like):
    """
    @param array_like: Array like data structure (accepts numpy array pandas series etc) of which
    mode has to be calculated.
    @return: Mode of the array, np.nan if the array is empty.
    """
    ranked_values = Counter(array_like).most_common()
    if len(ranked_values) > 0:
        return ranked_values[0][0]
    return np.nan


def inferred_feature(array_like):
    """
    @brief: Smart inference aggregation for features like conversation, audio and activity.
    @param array_like: array supporting boolean mask indexing.
    @return: np.nan for empty input, otherwise the mode of the non zero values,
             or 0 if there is no non zero value.
    """
    if len(array_like) == 0:
        return np.nan

    # Zeros are dropped before the mode is taken, so any occurrence of the
    # feature wins over "nothing inferred".
    non_zero_values = array_like[array_like != 0]
    mode_value = mode(non_zero_values)

    return 0 if math.isnan(mode_value) else mode_value


def robust_sum(array_like):
    """Sum of the values, np.nan for empty input."""
    return np.nan if len(array_like) == 0 else np.sum(array_like)


def extend_complex_features(feature_name, resampled_df, columns=None):
    """
    Adds complex features such as slope and intercept to the dataset.

    @param feature_name: one of 'linear', 'poly', 'iqr', 'kurtosis', 'mcr', 'fft'.
    @param resampled_df: object whose `agg` is called with the aggregate function.
    @param columns: columns to expand. If None (or empty) all columns of resampled_df are used.
    @return: aggregated data; for aggregates with several outputs the outputs
             are expanded into one column per output name.
    """
    # map: feature name -> (aggregate function, names of the outputs).
    # Please give empty list if no columns name required.
    function_mapper = {
        'linear': (linear_fit, ['linear_m', 'linear_c']),
        'poly': (poly_fit, ['poly_a', 'poly_b', 'poly_c']),
        'iqr': (iqr, []),
        'kurtosis': (kurt, []),
        'mcr': (mcr, []),
        'fft': (fourier_transform, []),
    }
    aggregate_function, output_names = function_mapper[feature_name]

    source_columns = columns if columns else resampled_df.columns.values
    complex_cols = [(feature_name, source_column) for source_column in source_columns]

    complex_feature = resampled_df.agg({feature_name: aggregate_function})

    if len(output_names) == 0:
        return complex_feature

    expanded_frames = []
    for complex_col in complex_cols:
        expanded_frames.append(
            pd.DataFrame(complex_feature[complex_col].values.tolist(),
                         columns=[(output_name, complex_col[1]) for output_name in output_names],
                         index=complex_feature.index))

    return pd.concat(expanded_frames, axis=1)


def count_int_element(int_element):
    """Return a function that counts how often int_element occurs in an array."""

    def count_ele(array_like):
        return len(array_like[array_like == int_element])

    return count_ele


def count_0(array_like):
    """Number of elements equal to 0."""
    return len(array_like[array_like == 0])


def count_1(array_like):
    """Number of elements equal to 1."""
    return len(array_like[array_like == 1])


def count_2(array_like):
    """Number of elements equal to 2."""
    return len(array_like[array_like == 2])


def count_3(array_like):
    """Number of elements equal to 3."""
    return len(array_like[array_like == 3])


def time_group(array_like):
    """First value of the array divided by 60."""
    return array_like[0] / 60
def _timedelta_to_whole_minutes(deltas):
    """Convert timedeltas to floored whole minutes as floats; missing values become NaN."""
    # `astype('timedelta64[m]')` did this conversion on older pandas but is
    # rejected by pandas >= 2; dividing and flooring works on every version.
    return np.floor(deltas / pd.Timedelta(minutes=1))


def day_of_week(flattened_student_data: pd.DataFrame) -> pd.DataFrame:
    """
    @param flattened_student_data: Flattened student data in form of Data Frame with a datetime index.
    @return: The same data frame with the day of week (integer value from 0-6) inserted
             as column 'day_of_week' at position 1.
    """
    flattened_student_data.insert(loc=1, column='day_of_week', value=flattened_student_data.index.dayofweek)

    return flattened_student_data


def epoch_of_day(flattened_student_data: pd.DataFrame) -> pd.DataFrame:
    """
    @brief: Day has been defined into 4 epoch of 6 hours each.
    @param flattened_student_data: Flattened student data in form of Data Frame with a datetime index.
    @return: The same data frame with the epoch of day (integer value from 0-3) inserted
             as column 'epoch_of_day' at position 2.
    """
    flattened_student_data.insert(loc=2, column='epoch_of_day', value=flattened_student_data.index.hour)
    flattened_student_data['epoch_of_day'] = flattened_student_data['epoch_of_day'].map(evaluate_epoch)

    return flattened_student_data


def evaluate_epoch(hour):
    """
    @param hour: hour of the day.
    @return: 0 for hours [0, 6), 1 for [6, 12), 2 for [12, 18) and 3 for everything else.
    """
    for epoch, upper_bound in enumerate((6, 12, 18)):
        if upper_bound - 6 <= hour < upper_bound:
            return epoch

    return 3


def time_since_last_label_min(flattened_student_data: pd.DataFrame) -> pd.DataFrame:
    """
    Insert column 'time_since_last_label' (position 3): whole minutes between each row
    and the most recent row at or before it with a non-null 'stress_level_mode'.
    Rows before the first label get NaN.

    @param flattened_student_data: data frame with a datetime index and a 'stress_level_mode' column.
    @return: The same data frame with the new column.
    """
    column = 'time_since_last_label'
    null_mask = flattened_student_data['stress_level_mode'].isnull()

    # Start with every row's own timestamp, blank out rows without a label and
    # carry the last label timestamp forward.
    flattened_student_data.insert(loc=3, column=column, value=deepcopy(flattened_student_data.index))
    flattened_student_data.loc[null_mask, column] = pd.NaT
    # Assign the result back; an in-place fillna on a column selection does not
    # reliably update the frame on newer pandas.
    flattened_student_data[column] = flattened_student_data[column].ffill()
    flattened_student_data[column] = _timedelta_to_whole_minutes(
        flattened_student_data.index - flattened_student_data[column])

    return flattened_student_data


def time_to_next_label_min(flattened_student_data: pd.DataFrame) -> pd.DataFrame:
    """
    Insert column 'time_to_next_label' (position 4): whole minutes between each row
    and the next row at or after it with a non-null 'stress_level_mode'.
    Rows after the last label get NaN.

    @param flattened_student_data: data frame with a datetime index and a 'stress_level_mode' column.
    @return: The same data frame with the new column.
    """
    column = 'time_to_next_label'
    null_mask = flattened_student_data['stress_level_mode'].isnull()

    flattened_student_data.insert(loc=4, column=column, value=deepcopy(flattened_student_data.index))
    flattened_student_data.loc[null_mask, column] = pd.NaT
    flattened_student_data[column] = flattened_student_data[column].bfill()
    flattened_student_data[column] = _timedelta_to_whole_minutes(
        flattened_student_data[column] - flattened_student_data.index)

    return flattened_student_data


def previous_stress_label(flattened_student_data: pd.DataFrame) -> pd.DataFrame:
    """
    Insert column 'previous_stress_label' (position 5): the last column of the data frame
    passed through `imputation.forward_fill`.

    @param flattened_student_data: data frame whose last column holds the values to carry forward
           (presumably the stress label - verify against caller).
    @return: The same data frame with the new column.
    """
    flattened_student_data.insert(loc=5,
                                  column='previous_stress_label',
                                  value=flattened_student_data.iloc[:, -1])
    flattened_student_data['previous_stress_label'] = imputation.forward_fill(
        flattened_student_data['previous_stress_label'])

    return flattened_student_data


def time_to_next_deadline(flattened_student_data: pd.DataFrame) -> pd.DataFrame:
    """
    Insert column 'time_to_next_deadline' (position 6): whole minutes between each row and
    the next row at or after it whose deadline value in PROCESSED_DEADLINES is >= 1.
    Rows after the last deadline get NaN.

    @param flattened_student_data: data frame with a datetime index and a 'student_id' column;
           the id of the first row selects the deadline column.
    @return: New data frame (joined with the deadlines, sorted by index) with the new column.
    """
    column = 'time_to_next_deadline'
    student_id = flattened_student_data['student_id'].values[0]
    # Non in-place rename so the shared PROCESSED_DEADLINES frame is left untouched.
    deadlines = PROCESSED_DEADLINES.loc[:, str(student_id)].rename("deadlines")
    flattened_student_data = flattened_student_data.join(deadlines, how='left', sort=True)

    flattened_student_data.insert(loc=6,
                                  column=column,
                                  value=deepcopy(flattened_student_data.index))
    # Rows without a deadline (including missing deadline values) are blanked out.
    null_mask = np.logical_not(flattened_student_data['deadlines'] >= 1)
    flattened_student_data.loc[null_mask, column] = pd.NaT
    flattened_student_data[column] = flattened_student_data[column].bfill()
    flattened_student_data[column] = _timedelta_to_whole_minutes(
        flattened_student_data[column] - flattened_student_data.index)

    return flattened_student_data.drop(columns='deadlines')


def exam_period(flattened_student_data: pd.DataFrame) -> pd.DataFrame:
    """
    Add column 'exam_period_inferred': 1 for rows whose timestamp lies between
    definitions.MIDTERM_START_DATE and definitions.MIDTERM_END_DATE (both inclusive), else 0.

    @attention: the passed data frame is modified in place (helper column 'exam_period'
                and 'exam_period_inferred' are added to it).
    @param flattened_student_data: data frame with a datetime index.
    @return: New data frame without the helper column 'exam_period'.
    """
    flattened_student_data.insert(loc=0,
                                  column='exam_period',
                                  value=deepcopy(flattened_student_data.index))

    before_midterm = flattened_student_data['exam_period'] < definitions.MIDTERM_START_DATE
    after_midterm = flattened_student_data['exam_period'] > definitions.MIDTERM_END_DATE
    non_exam_mask = before_midterm | after_midterm

    flattened_student_data['exam_period_inferred'] = 1
    flattened_student_data.loc[non_exam_mask, 'exam_period_inferred'] = 0

    return flattened_student_data.drop(columns='exam_period')


def evaluate_gender(flattened_student_data: pd.DataFrame) -> pd.DataFrame:
    """Not implemented yet, returns None."""
    # todo(abhinavshaw): Implement this.
    return
def explode_values(feature_values: pd.Series, feature_name):
    """
    Expand duration values into one row per base-frequency step they cover.

    Every entry of `feature_values` is read as "a duration of <value> minutes starting at
    <index>". Each entry becomes a run of timestamps from its start (inclusive) to
    start + duration (exclusive), spaced by definitions.DEFAULT_EXPLODING_BASE_FREQ and
    filled with DEFAULT_INFERENCE_VALUE_WHEN_INFERRED.

    @param feature_values: Series of durations in minutes, indexed by start timestamp.
           The index labels must be unique (a duplicated label makes `.item()` fail).
    @param feature_name: Name of the single column of the returned data frame.
    @return: Data frame of the exploded values with duplicate timestamps removed by
             `conversions.drop_duplicate_indices_from_df`; an empty data frame when
             `feature_values` is empty.
    """
    exploded_frames = []

    for start_date in feature_values.index:
        value = feature_values.loc[start_date].item()
        end_date = start_date + pd.Timedelta(str(value) + " min")
        exploded_df_index = pd.date_range(start_date,
                                          end_date,
                                          freq=definitions.DEFAULT_EXPLODING_BASE_FREQ)
        # Left-closed interval without the `closed=` keyword, which pandas 2.0 removed.
        exploded_df_index = exploded_df_index[exploded_df_index < end_date]
        exploded_frames.append(pd.DataFrame(np.full(len(exploded_df_index),
                                                    DEFAULT_INFERENCE_VALUE_WHEN_INFERRED),
                                            index=exploded_df_index,
                                            columns=[feature_name]))

    if not exploded_frames:
        return pd.DataFrame()

    # One concat and one de-duplication instead of an append + de-duplication per entry.
    # Every exploded value is the same constant, so the surviving rows are the same
    # whichever duplicate the helper keeps.
    final_exploded_df = pd.concat(exploded_frames)

    return conversions.drop_duplicate_indices_from_df(final_exploded_df)
def rename_postfix_from_cols(data_frame: pd.DataFrame):
    """
    Rename aggregate columns so that "_robust_sum" reads "_inferred".

    @param data_frame: Data frame whose column labels are strings.
    @return: A renamed copy of the data frame; the input is left untouched.
    """
    renamed_columns = {
        column: column.replace("_robust_sum", "_inferred")
        for column in data_frame.columns
    }

    return data_frame.rename(columns=renamed_columns)
+""" + +import pandas as pd +import numpy as np + +from src import definitions +from src.utils import read_utils +from src.bin import validations as validations +from src.data_processing import aggregates +from src.data_processing import covariates as covariate_processor +from src.data_processing import imputation + +FEATURE_IMPUTATION_STRATEGY = read_utils.read_yaml(definitions.FEATURE_CONFIG_FILE_PATH)[ + 'feature_imputation_strategy'] + +COVARIATE_FUNC_MAPPING = { + 'day_of_week': covariate_processor.day_of_week, + 'epoch_of_day': covariate_processor.epoch_of_day, + 'time_since_last_label': covariate_processor.time_since_last_label_min, + 'time_to_next_label': covariate_processor.time_to_next_label_min, + 'gender': covariate_processor.evaluate_gender, + 'previous_stress_label': covariate_processor.previous_stress_label, + 'time_to_next_deadline': covariate_processor.time_to_next_deadline +} + +AGGREGATE_FUNC_MAPPING = { + 'mode': aggregates.mode, + 'inferred_feature': aggregates.inferred_feature, + 'robust_sum': aggregates.robust_sum, + 'time': aggregates.time_group, + "0": aggregates.count_0, + "1": aggregates.count_1, + "2": aggregates.count_2, + "3": aggregates.count_3 + +} + +INTERPOLATION_FUNC_MAPPING = { + 'linear': imputation.linear_interpolation, + 'forward_fill': imputation.forward_fill, + 'mean_fill': imputation.mean_fill, + 'mode_fill': imputation.mode_fill, + 'none': None +} + + +def get_aggregation_rule(feature_inference_cols, feature_config, student_id): + """ + + @param feature_inference_cols: + @param feature_config: + @return: Return Aggregation Rule for the feature based on the configuration. + """ + def value(array_like): + return student_id + + # List of custom aggregate function. 
+ custom_aggregates = [] + simple_aggregates = feature_config['simple_aggregates'] + + for custom_aggregate in feature_config['custom_aggregates']: + custom_aggregates.append(AGGREGATE_FUNC_MAPPING[custom_aggregate]) + + rule = {"student_id": value} + + for col in feature_inference_cols: + rule[col] = simple_aggregates + custom_aggregates + + return rule + + +def get_aggregation_rule_for_histogram(feature_name, feature_config): + simple_aggregate = feature_config['simple_aggregates'] + custom_aggregate = [] + + for agg in feature_config['custom_aggregates']: + custom_aggregate.append(AGGREGATE_FUNC_MAPPING[agg]) + + rule = {feature_name: simple_aggregate + custom_aggregate} + + return rule + + +def get_resampled_aggregated_data(feature_data: pd.DataFrame, feature_config, student_id) -> pd.DataFrame: + """ + + @param feature_data: Un-resampled data for the feature. + @param feature_config: Configs for the specific feature. + @return: Aggregated data on the resampled frequency. + """ + validations.validate_config_key('resample_freq_min', config=feature_config) + + # Extracting columns other than student id (These are the feature inference columns) + feature_inference_cols = list(feature_data.columns.values) + feature_inference_cols.remove("student_id") + # Resampling and applying aggregate rule. + resample_freq_min = feature_config[definitions.RESAMPLE_FREQ_CONFIG_KEY] + resampled_feature_data = feature_data.resample(rule=str(resample_freq_min) + "T") + aggregation_rule = get_aggregation_rule(feature_inference_cols, feature_config, student_id) + aggregated_data = resampled_feature_data.agg(aggregation_rule) + + # Flattening all the columns. 
+ aggregated_data.columns = ['_'.join(col).strip() if 'student_id' not in col else 'student_id' + for col in aggregated_data.columns.values] + + return aggregated_data + + +def get_flattened_student_data_from_list(student_data: pd.DataFrame, student_id) -> pd.DataFrame: + """ + + @param student_data: A list of data frame with various features from the student_life data-set. + @param student_id: Student id of the student. + @return: flattened data-set after applying a left join. + """ + validations.validate_student_id_in_data(*student_data) + + # Pre-processing + feature_data_first = student_data[0] + start_date = feature_data_first.index[0].floor("D") + # todo(abhinavshaw): add one more day to end date to give some room while data processing. Verify end to end. + end_date = feature_data_first.index[-1].floor("D") + flattened_df_index = pd.date_range(start_date, end_date, freq=definitions.DEFAULT_BASE_FREQ) + flattened_df = pd.DataFrame(np.full(len(flattened_df_index), student_id), + index=flattened_df_index, + columns=["student_id"]) + + for idx, feature_df in enumerate(student_data): + feature_df_dropped_student_id = feature_df.drop("student_id", axis=1, inplace=False) + flattened_df = flattened_df.join(feature_df_dropped_student_id, how='left', sort=True) + + return flattened_df + + +def impute_missing_feature(flattened_student_data: pd.DataFrame) -> pd.DataFrame: + # TODO(abhinavshaw): allow multiple sequential imputation for features and clean up this code. 
def replace_neg_one_with_nan(df):
    """
    Convert the -1 "missing value" marker to NaN.

    @param df: DataFrame to be processed.
    @return: A new data frame where every -1 (int) or -1.0 (double) is replaced by np.nan.
             The input frame is not modified.
    """
    # Scalar form on purpose: a dict combined with an explicit `value=None` is interpreted
    # column-wise by recent pandas versions and then replaces nothing. -1 and -1.0 compare
    # equal, so a single scalar covers both.
    return df.replace(-1, np.nan)


def remove_days_with_no_stress_label(flattened_student_data: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the calendar days on which at least one stress label exists.

    @param flattened_student_data: Flattened data of student. Must contain stress_level_mode as
           one of the columns.
    @return: Processed data frame containing only the rows whose timestamp lies on the
             definitions.DEFAULT_BASE_FREQ grid of a day that has a stress label. When no row
             has a label, every row is removed and an empty data frame is returned.
    """
    validations.validate_student_id_in_data(flattened_student_data)

    has_label = flattened_student_data['stress_level_mode'].notnull().to_numpy()
    # One entry per labelled day, so each day's range is generated exactly once.
    labelled_days = flattened_student_data.index[has_label].floor("D").unique()
    one_day = pd.Timedelta('1 days')

    day_ranges = []
    for day in labelled_days:
        day_range = pd.date_range(day, day + one_day, freq=definitions.DEFAULT_BASE_FREQ)
        # Left-closed interval without the `closed=` keyword, which pandas 2.0 removed.
        day_ranges.append(day_range[day_range < day + one_day])

    if day_ranges:
        time_indices_to_keep = day_ranges[0].append(day_ranges[1:])
    else:
        # No labels at all: keep nothing rather than failing on an unbound variable.
        time_indices_to_keep = flattened_student_data.index[:0]

    indices_to_be_dropped = flattened_student_data.index.difference(time_indices_to_keep)

    return flattened_student_data.drop(indices_to_be_dropped)
def get_missing_data_mask(flattened_student_data: pd.DataFrame) -> pd.DataFrame:
    """
    @attention: will not calculate missing flag for the student id column. Since that is an identifier
    and not a feature or a label.
    @param flattened_student_data: Flattened data of a student with `student_id` as the first column.
    @return: A copy of the data frame in which every column except the first holds
             value = 1 where the data is missing (null) and value = 0 where it is present.
             The first column keeps its original values.
    """
    validations.validate_student_id_in_data_as_first_col(flattened_student_data)

    # Calculate masks on all but the "student_id" col.
    missing_value_mask = flattened_student_data.copy()
    missing_value_mask.iloc[:, 1:] = flattened_student_data.iloc[:, 1:].isnull().astype(int)

    return missing_value_mask


def process_covariates(flattened_student_data: pd.DataFrame, covariates: dict) -> pd.DataFrame:
    """
    Apply every enabled covariate function, in the iteration order of `covariates`.

    @param flattened_student_data: Flattened data of a student.
    @param covariates: Dictionary of covariates and their boolean flags. Every enabled name
           must be a key of COVARIATE_FUNC_MAPPING.
    @return: Data frame after processing covariates. A covariate function that returns None
             is treated as "nothing to add" and the current data frame is carried forward.
    """
    for covariate, bool_flag in covariates.items():
        if not bool_flag:
            continue

        processed_flattened_student_data = COVARIATE_FUNC_MAPPING[covariate](flattened_student_data)
        if processed_flattened_student_data is not None:
            flattened_student_data = processed_flattened_student_data

    return flattened_student_data
# --- src/data_processing/imputation.py ---

def linear_interpolation(series: pd.Series):
    """Fill missing values by linear interpolation between the neighbouring observations."""
    return series.interpolate(method='linear')


def forward_fill(series: pd.Series):
    """Fill missing values with the last observed value; leading gaps stay missing."""
    # `ffill()` is the supported spelling of the deprecated `fillna(method='ffill')`.
    return series.ffill()


def mean_fill(series: pd.Series):
    """Fill missing values with the mean of the observed values."""
    mean = series.mean(skipna=True)
    return series.fillna(mean)


def mode_fill(series: pd.Series):
    """
    Fill missing values with the most frequent observed value.

    When several values are tied, the first one reported by `Series.mode()` is used.
    A series without any observed value has no mode and is returned as an unchanged copy.
    """
    modes = series.mode()
    if modes.empty:
        return series.copy()

    return series.fillna(modes.iloc[0])


# --- src/data_processing/normalizer.py ---

def normalize(data_frame: pd.DataFrame, norm_type="mean",
              df_mean: pd.Series = None, df_std: pd.Series = None,
              df_min: pd.Series = None, df_max: pd.Series = None) -> pd.DataFrame:
    """
    Normalize a data frame column-wise.

    @param data_frame: Data to normalize.
    @param norm_type: "min_max" scales with (x - min) / (max - min); any other value
           standardizes with (x - mean) / std.
    @param df_mean: Per-column mean to use; computed from `data_frame` when None.
    @param df_std: Per-column standard deviation to use; computed from `data_frame` when None.
    @param df_min: Per-column minimum to use; computed from `data_frame` when None.
    @param df_max: Per-column maximum to use; computed from `data_frame` when None.
    @return: Normalized data frame. Undefined results (e.g. 0 / 0 for a constant column)
             are replaced by 0.
    """
    if norm_type == "min_max":
        if df_min is None:
            df_min = data_frame.min()
        if df_max is None:
            df_max = data_frame.max()

        result = (data_frame - df_min) / (df_max - df_min)
    else:
        if df_mean is None:
            df_mean = data_frame.mean()
        # Each statistic is defaulted independently, so a caller may supply only one of them.
        if df_std is None:
            df_std = data_frame.std()

        result = (data_frame - df_mean) / df_std

    return result.fillna(0)


def _global_statistics(frames: list):
    """Stack the per-sample frames and return their (mean, std, min, max)."""
    return evaluate_df_statistic(pd.concat(frames, ignore_index=True))


def normalize_data_list(data_list: list, normalize_strat='mean'):
    """
    This function calculates the global mean, stdev, min, max and normalizes each data tuple based
    on the global statistic.

    Only the training values, the covariates and the histogram are normalized; missing
    values, time deltas and labels are passed through untouched.

    @param data_list: Accepts a list of tuple with the first element being
           month_day_hour_key and second being data tuple, laid out as
           (training_values, missing_values, time_delta, covariates, histogram, y_labels).
    @param normalize_strat: Forwarded to `normalize` as `norm_type`.
    @return: Normalized data list, in the same order and with the same keys.
    """
    if not data_list:
        return []

    train_frames, histogram_frames, covariate_frames = [], [], []
    for _, data_tuple in data_list:
        training_values, _, _, covariates, histogram, _ = data_tuple
        train_frames.append(pd.DataFrame(training_values))
        histogram_frames.append(pd.DataFrame(histogram))
        # Covariates are a single flat record per sample, hence the one-row frame.
        covariate_frames.append(pd.DataFrame([covariates]))

    # A single concat per group replaces the row-by-row `DataFrame.append`,
    # which was quadratic and no longer exists in pandas 2.0.
    train_stats = _global_statistics(train_frames)
    histogram_stats = _global_statistics(histogram_frames)
    covariate_stats = _global_statistics(covariate_frames)

    def normalize_with(frame, stats):
        df_mean, df_std, df_min, df_max = stats
        return normalize(frame,
                         norm_type=normalize_strat,
                         df_mean=df_mean,
                         df_std=df_std,
                         df_min=df_min,
                         df_max=df_max)

    new_data_list = []
    for (month_day_hour_key, data_tuple), train_frame, histogram_frame, covariate_frame in zip(
            data_list, train_frames, histogram_frames, covariate_frames):
        _, missing_values, time_delta, _, _, y_labels = data_tuple

        new_data_tuple = (normalize_with(train_frame, train_stats).values.tolist(),
                          missing_values,
                          time_delta,
                          normalize_with(covariate_frame, covariate_stats).values.tolist()[0],
                          normalize_with(histogram_frame, histogram_stats).values.tolist(),
                          y_labels)

        new_data_list.append((month_day_hour_key, new_data_tuple))

    return new_data_list


def evaluate_df_statistic(df: pd.DataFrame):
    """Return the column-wise (mean, std, min, max) of a data frame."""
    df_mean, df_std, df_min, df_max = df.mean(), df.std(), df.min(), df.max()
    return df_mean, df_std, df_min, df_max
def get_stress_query_for_student(student_id):
    """
    Build the stress label query for one student.

    @param student_id: StudentId for the stress labels are required. Must be an integer or
           an integer-like string; anything else raises ValueError.
    @return: Query for StressDetails for the given student.
    """
    # The id is spliced into the SQL text, so it is forced through an integer round trip
    # first. Going via str() also rejects floats and booleans instead of truncating them.
    student_id = int(str(student_id))
    stress_query = "select student_id, response_time, adjusted_stress_level as stress_level from stress_details where student_id = "

    return stress_query + str(student_id)


# Create a connection with SQL server to get data.
def exec_sql_query(query, param=None):
    """
    Run a query against the SQL server described by the module-level `connection_string`.

    NOTE(review): `connection_string` embeds the database credentials in source code;
    consider loading them from the environment or a local, untracked config file.

    @param query: The query to be executed.
    @param param: If executing a stored procedure, pass the list of parameters in params.
    @return: DataFrame of the result set from the query.
    """
    # Import the submodule explicitly: a bare `import urllib` does not guarantee
    # that `urllib.parse` has been loaded.
    import urllib.parse

    # Create Database Connection.
    params = urllib.parse.quote_plus(connection_string)
    engine = create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)

    try:
        connection = engine.raw_connection()
        try:
            cursor = connection.cursor()
            try:
                if param:
                    cursor.execute(query, param)
                else:
                    cursor.execute(query)

                results = cursor.fetchall()
                columns = [column[0] for column in cursor.description]
                df = pd.DataFrame.from_records(results, columns=columns)
            finally:
                # Release the cursor even when the query or the fetch fails.
                cursor.close()
            connection.commit()
        finally:
            connection.close()
    finally:
        # Dispose the per-call engine so its connection pool does not outlive the call.
        engine.dispose()

    return df
+FEATURE_CONFIG = read_yaml(FEATURE_CONFIG_FILE_PATH)['features'] +AVAILABLE_FEATURE = FEATURE_CONFIG.keys() +COVARIATES = read_yaml(FEATURE_CONFIG_FILE_PATH)['covariates'] +STUDENT_CONFIG = read_yaml(FEATURE_CONFIG_FILE_PATH)['students'] +AVAILABLE_STUDENTS = student_utils.get_available_students(MINIMAL_PROCESSED_DATA_PATH) +students = read_yaml(FEATURE_CONFIG_FILE_PATH)['students']['student_list'] + +if students: + AVAILABLE_STUDENTS = list(set(students).intersection(set(AVAILABLE_STUDENTS))) + +############## Main Loop To Process Data ################## + +for student_id in AVAILABLE_STUDENTS: + + student_data = [] + + for idx, feature in enumerate(AVAILABLE_FEATURE): + feature_data_path = os.path.join(MINIMAL_PROCESSED_DATA_PATH, + STUDENT_FOLDER_NAME_PREFIX + str(student_id), + feature + ".csv") + feature_data = pd.read_csv(feature_data_path, index_col=[0]) + feature_data.index = pd.to_datetime(feature_data.index) + aggregated_data = helper.get_resampled_aggregated_data(feature_data, FEATURE_CONFIG[feature], student_id) + student_data.append(aggregated_data) + + student_data_flattened = helper.get_flattened_student_data_from_list(student_data, student_id) + student_data_flattened = helper.replace_neg_one_with_nan(student_data_flattened) + student_data_flattened_processed = helper.process_covariates(student_data_flattened, COVARIATES) + missing_value_mask = helper.get_missing_data_mask(student_data_flattened_processed) + time_deltas_min = helper.get_time_deltas_min(student_data_flattened_processed) + student_data_flattened_processed = helper.impute_missing_feature(student_data_flattened_processed) + + ############################### Writing the files to csv ############################# + student_binned_data_dir_path = os.path.join( + BINNED_ON_VAR_FREQ_DATA_PATH, + "student_{}".format(student_id) + ) + df_to_csv(student_data_flattened_processed, file_name="var_binned_data.csv", + path_to_folder=student_binned_data_dir_path) + df_to_csv(missing_value_mask, 
"""
Script to generate minimally processed raw data.

For every student that has stress labels, the stress labels and every raw feature are
pulled from the database, the features are truncated to the time span of the stress
labels, and one csv per feature is written to the student's output folder.
"""
import datetime
import os
import numpy as np

from pathlib import Path
from src.data_processing.query_generator import get_feature_query_for_student, get_stress_query_for_student
from src.data_processing.query_processor import exec_sql_query

# Collecting distinct students.
distinct_students = exec_sql_query("select distinct student_id from stress_details")
distinct_students = distinct_students.values.T.tolist()
distinct_students = distinct_students[0]
distinct_students.sort()

# getting current working directory for creating directories later.
cwd = Path(os.getcwd())

print("Students: ", distinct_students)

for student in distinct_students:

    # Built from components so the path also works on platforms that do not use "\" separators.
    newpath = os.path.join(cwd, "..", "..", "data",
                           "student_life_minimal_processed_data",
                           "student_" + str(student))
    os.makedirs(newpath, exist_ok=True)

    # Getting stress levels for the current student. This will be merged with other features.
    stress_details_raw = exec_sql_query(get_stress_query_for_student(student))
    stress_details = stress_details_raw.loc[:, ["response_time", "student_id", "stress_level"]]
    stress_details = stress_details.sort_values(by="response_time")
    stress_details = stress_details.rename({"response_time": "time"}, axis='columns')

    # Extracting the earliest and latest stress label. Positional access is required here:
    # sort_values keeps the original row labels, so label 0 is not necessarily the earliest row.
    first_date = stress_details['time'].iloc[0]
    last_date = stress_details['time'].iloc[-1]

    # delta to back and ahead, in days.
    # NOTE(review): an earlier comment said "1 day behind and 1 day ahead", but the look-ahead
    # delta is 0 days; kept as is - confirm which one is intended.
    first_date = first_date - datetime.timedelta(days=1)
    last_date = last_date + datetime.timedelta(days=0)
    feature_map = get_feature_query_for_student(student)

    for key, feature_query in feature_map.items():
        # Data processing begins..
        feature_data = exec_sql_query(feature_query)

        # Selecting Time Col and renaming time column from *_time to time.
        # When several columns match, the last one wins.
        time_columns = [col for col in feature_data.columns if "time" in col]
        if not time_columns:
            print("No time column for Student {} for feature {}".format(student, key))
            continue
        feature_data = feature_data.rename({time_columns[-1]: "time"}, axis='columns')
        time_column = "time"

        # Sorting by values of time.
        feature_data = feature_data.sort_values(by=time_column)

        # Truncating extra features that do not lie in the time frame.
        feature_data = feature_data[
            np.logical_and(feature_data[time_column] > first_date, feature_data[time_column] < last_date)]

        if feature_data.empty:
            print("Empty DataFrame for Student {} for feature {}".format(student, key))
            continue

        # Writing Feature Data.
        feature_data_file_name = os.path.join(newpath, key + ".csv")
        feature_data.to_csv(feature_data_file_name, index=False, header=True)

    # Writing Stress Data.
    stress_data_file_name = os.path.join(newpath, "stress_details.csv")
    stress_details.to_csv(stress_data_file_name, index=False)
+""" +import datetime +import os +import numpy as np +import pandas as pd + +from pathlib import Path +# from src.data_processing.query_generator import get_feature_query_for_student, get_stress_query_for_student +# from src.data_processing.query_processor import exec_sql_query + +# Collecting distinct students. +# distinct_students = exec_sql_query("select distinct student_id from stress_details") +# distinct_students = distinct_students.values.T.tolist() +# distinct_students = distinct_students[0] +# distinct_students.sort() +distinct_students = [int(p.split('_')[1].split('.')[0][1:]) for p in os.listdir("data/dataset/sensing/gps")] + +# getting current working directory for creating directories later. +cwd = Path(os.getcwd()) + +print("Students: ", distinct_students) + +# adj_stress = { +# "1": "0", +# "2": "0", +# "3": "1", +# "4": "2", +# "5": "2", +# } + +for student in distinct_students: + + newpath = Path(r'data\student_life_minimal_processed_data\student_' + str(student)) + newpath = os.path.join(cwd, newpath) + + if not os.path.exists(newpath): + os.makedirs(newpath) + + # Getting Stress levels for only student_id = 1. This will be merged with other features. + # stress_details_raw = exec_sql_query(get_stress_query_for_student(student)) + # stress_details = stress_details_raw.loc[:, ["response_time", "student_id", "stress_level"]] + + stress_details = pd.read_json('data/dataset/EMA/response/Stress/Stress_u{}.json'.format(student)) + stress_details = stress_details.sort_values(by="resp_time") + stress_details.rename({"resp_time": "time"}, axis='columns', inplace=True) + stress_details.rename({"level": "stress_level"}, axis='columns', inplace=True) + # try: + # for i in range(len(stress_details["stress_level"])): + # try: + # stress_details["stress_level"][i] = adj_stress[stress_details["stress_level"][i]] + # except: + # continue + # except: + # print(student) + + # Extracting first and last index of stress level. 
+ # We will truncate other features 1 day behind and 1 day ahead. + first_date = stress_details.loc[0, 'time'] + last_date = stress_details.loc[len(stress_details) - 1, 'time'] + + # delta to back and ahead, in days. + first_date = first_date - datetime.timedelta(days=1) + last_date = last_date + datetime.timedelta(days=0) + + # for i in range(len(stress_details['time'])): + # stress_details['time'][i] = int(round(stress_details['time'][i].timestamp())) + # first_date = int(round(first_date.timestamp())) + # last_date = int(round(last_date.timestamp())) + + # feature_map = get_feature_query_for_student(student) + feature_map = {"gps_details": "select wifi_timestamp as time, student_id, latitude, longitude from gps_details "} + + for key in feature_map.keys(): + feature_query = feature_map[key] + # Data processing begins.. + # feature_data = exec_sql_query(feature_query) + feature_data = pd.read_csv("data/dataset/sensing/gps/gps_u{}.csv".format(student), index_col=False) + + # Selecting Time Col and renaming time column from *_time to time. + train_col_list = [] + # for col in feature_data.columns: + # if "time" in col: + # time_column = col + # else: + # train_col_list.append(col) + feature_data.rename({'time': "time"}, axis='columns', inplace=True) + time_column = "time" + feature_data["time"] = [datetime.datetime.fromtimestamp(t) for t in feature_data["time"]] + + # Sorting by values of time. + feature_data = feature_data.sort_values(by='time') + + # Truncating extra features that do not lie in the time frame. + feature_data = feature_data[ + np.logical_and(feature_data[time_column] > first_date, feature_data[time_column] < last_date) + ] + + if feature_data.empty: + print("Empty DataFrame for Student {} for feature {}".format(student, key)) + continue + + # Writing Feature Data. + feature_data_file_name = os.path.join(newpath, key+".csv") + feature_data.to_csv(feature_data_file_name, index=False, header=True) + + # Writing Stress Data. 
import os
import pathlib

import pandas as pd
import yaml


def read_yaml(file_path):
    """Parse the YAML file at `file_path` and hand back the loaded object."""
    with open(file_path, "r") as yaml_stream:
        parsed = yaml.safe_load(yaml_stream)
    return parsed


# Root directory of the project. It is the empty string, so every path joined
# onto it below is relative to the current working directory.
ROOT_DIR = ''
USER_HOME = pathlib.Path.home()

# File and key names.
STUDENT_FOLDER_NAME_PREFIX = "student_"
BINNED_DATA_FILE_NAME = "var_binned_data"
BINNED_DATA_MISSING_VALES_FILE_NAME = "missing_values_mask"
BINNED_DATA_TIME_DELTA_FILE_NAME = "time_deltas_min"

# Configuration file paths.
FEATURE_CONFIG_FILE_PATH = os.path.join(
    ROOT_DIR, "src/configurations/feature_processing.yaml")
DATA_MANAGER_CONFIG_FILE_PATH = os.path.join(
    ROOT_DIR, "src/configurations/data_manager_config.yaml")
MODEL_CONFIG_FILE_PATH = os.path.join(
    ROOT_DIR, "src/configurations/model_config.yaml")
GRID_SEARCH_CONFIG_FILE_PATH = os.path.join(
    ROOT_DIR, "src/configurations/grid_search.yaml")

# Frequency constants.
DEFAULT_BASE_FREQ = '1 min'
DEFAULT_EXPLODING_BASE_FREQ = '1 min'

# Data manager config keys.
VAR_BINNED_DATA_MANAGER_ROOT = "student_life_var_binned_data"

# Universal config keys.
STUDENT_LIST_CONFIG_KEY = "student_list"
FEATURE_LIST_CONFIG_KEY = "feature_list"
LABEL_LIST_CONFIG_KEY = "label_list"
COVARIATE_LIST_CONFIG_KEY = "covariate_list"
RESAMPLE_FREQ_CONFIG_KEY = "resample_freq_min"

# Data folder paths (local layout). Cluster-mode overrides of these paths
# existed only as commented-out code and are not active.
DATA_DIR = os.path.join(ROOT_DIR, "data")
MINIMAL_PROCESSED_DATA_PATH = os.path.join(
    ROOT_DIR, "data/student_life_minimal_processed_data")
BINNED_ON_VAR_FREQ_DATA_PATH = os.path.join(
    ROOT_DIR, "data/student_life_var_binned_data")
SURVEYS_AND_COVARIATES_DATA_PATH = os.path.join(
    ROOT_DIR, "data/surveys_and_covariates")
STUDENT_RAW_DATA_ANALYSIS_ROOT = os.path.join(
    ROOT_DIR, "data/raw_student_data_information")
# Both names are bound to the same path, exactly as the module always did.
data_file_path = os.path.join(DATA_DIR, 'training_data/shuffled_splits')
SHUFFLED_DATA_ROOT = data_file_path

# Positions inside a data tuple.
DATA_TUPLE_LEN = 6
ACTUAL_DATA_IDX = 0
MISSING_FLAGS_IDX = 1
TIME_DELTA_IDX = 2
COVARIATE_DATA_IDX = 3
HISTOGRAM_IDX = 4
LABELS_IDX = -1  # Always last!

# Labels: the label range depends on a flag read from the data manager
# configuration when this module is imported.
ADJUST_WRT_MEDIAN = read_yaml(DATA_MANAGER_CONFIG_FILE_PATH)[
    'student_life_var_binned_data']['adjust_labels_wrt_median']
LABELS = list(range(3 if ADJUST_WRT_MEDIAN else 5))

# Dates.
MIDTERM_START_DATE = pd.to_datetime('2013-04-17')
MIDTERM_END_DATE = pd.to_datetime('2013-05-02')

# Warning strings.
LOW_MODEL_CAPACITY_WARNING = "Input size greater than hidden size. This may result in a low capacity network"
# ---- src/experiments/analysis.py ----
import pickle
import numpy as np
from copy import deepcopy
import torch
from sklearn import metrics

import src.utils.tensorify as tensorify
from src.utils.train_val_utils import *


def _mean_best_val_f1(records):
    """Return the mean, over non-empty fold records, of each fold's best micro val-F1."""
    return np.array([np.max(i['val_f1']['micro']) for i in records if len(i) != 0]).mean()


def _load_data_and_groups(clusters_name):
    """Load the tensorified shuffled-split data and the student -> group map."""
    data_file_path = 'data/training_data/shuffled_splits/training_date_normalized_shuffled_splits_select_features_no_prev_stress_all_students.pkl'
    data = read_data(data_file_path)
    data = tensorify.tensorify_data_gru_d(deepcopy(data), torch.cuda.is_available())

    print('The groups: ' + clusters_name)
    groups_file_path = 'src/experiments/clustering/student_groups/' + clusters_name + '.pkl'
    student_groups = read_data(groups_file_path)
    return data, student_groups


def _nth_split(splits, fold_index):
    """Return the split at position `fold_index` of the `splits` iterable.

    Raises IndexError instead of leaving the caller with an unbound variable
    when the requested fold does not exist.
    """
    for position, split in enumerate(splits):
        if position == fold_index:
            return split
    raise IndexError('fold {} is out of range for the available splits'.format(fold_index))


def _assigned_branch(model, student_groups, student_id):
    """Return the index of the most probable branch for the student's group."""
    group_id = student_groups['student_{}'.format(student_id)]
    return model.branching.probabilities[group_id].argmax().cpu().detach().item()


def check_BCN():
    """Print, for one fold of the 5-fold run, each student's validation F1 and branch.

    The printed value per student is the list [micro_f1, branch_id].
    """
    # NOTE: pickle.load can execute code embedded in the file; only open
    # score files produced by this project.
    with open('data/cross_val_scores/calm_net_with_branching_5fold_3_with_generic.pkl', 'rb') as f:
        var = pickle.load(f)
    print('f1', _mean_best_val_f1(var))

    # 'one_for_each' or 'all_in_one'
    data, student_groups = _load_data_and_groups('one_for_each')
    splits = get_splits('5fold', data, student_groups, days_include=0)

    # Fold whose record is inspected.
    chosen_fold = 4
    train_val_record = var[chosen_fold]
    model = train_val_record['model']
    # Use the outputs of the epoch with the best validation F1.
    max_epoch = np.argmax(train_val_record['val_f1']['micro'])
    output = train_val_record['outputs'][max_epoch]
    y_pred = np.argmax(output, axis=1)

    val_ids = _nth_split(splits, chosen_fold)['val_ids']

    # map: student_id -> [[y_pred list, y_true list]]; turned into
    # [f1, branch_id] below.
    student_ys = dict()
    for i, key in enumerate(val_ids):
        _, _, _, label = data['data'][key]
        student_id = key.split('_')[0]
        preds_and_labels = student_ys.setdefault(student_id, [[[], []]])[0]
        preds_and_labels[0].append(y_pred[i])
        preds_and_labels[1].append(label.squeeze().cpu().detach().item())

    for student_id, record in student_ys.items():
        record.append(_assigned_branch(model, student_groups, student_id))
        student_preds, student_labels = record[0]
        record[0] = metrics.f1_score(student_labels, student_preds, average='micro')

    for student_id in student_ys:
        print(student_id, student_ys[student_id])


def check_loocv():
    """Print, for one fold of the leave-one-out run, the branch assigned to each student.

    The printed value per student is the list [0, branch_id].
    """
    # NOTE: pickle.load can execute code embedded in the file; only open
    # score files produced by this project.
    with open('data/cross_val_scores/semi_online_learning/calm_net_with_branching_generic_head_loocv_3_7.pkl', 'rb') as f:
        var = pickle.load(f)
    print('f1', _mean_best_val_f1(var))

    # Fold whose record is inspected.
    chosen_fold = 8
    model = var[chosen_fold]['model']

    # 'one_for_each' or 'all_in_one'
    data, student_groups = _load_data_and_groups('one_for_each')
    splits = get_splits('loocv', data, student_groups, days_include=0)

    # NOTE(review): the ids are taken from 'train_ids' although the original
    # comment called them validation ids -- confirm this is intended.
    val_ids = _nth_split(splits, chosen_fold)['train_ids']

    student_ys = dict()  # map: student_id -> [0, branch_id]
    for key in val_ids:
        student_ys.setdefault(key.split('_')[0], [0])

    for student_id in student_ys:
        student_ys[student_id].append(_assigned_branch(model, student_groups, student_id))

    for student_id in student_ys:
        print(student_id, student_ys[student_id])


if __name__ == '__main__':
    # check_loocv()
    check_BCN()


# ---- src/experiments/clustering/check_groups.py ----
# ----------------------------------------------------------------
# Independent Study 496, Student Stress Prediction
#
# file_name: check_groups.py
# Functionality: print how students are distributed over the groups
#   stored in a student_groups pickle
# Author: Yunfei Luo
# Start date: EST Feb.22th.2020
# Last update: EST Feb.27th.2020
# ----------------------------------------------------------------

import os
import sys
from src.utils.read_utils import read_pickle

if __name__ == '__main__':
    # First command line argument names the grouping; default otherwise.
    method = sys.argv[1] if len(sys.argv) > 1 else 'one_for_each'

    groups_file_path = 'src/experiments/clustering/student_groups/' + method + '.pkl'
    print('get group file from: ' + groups_file_path)
    student_groups = read_pickle(groups_file_path)

    # Invert student -> group into group -> [students].
    print("student distribution: ")
    rev_groups = dict()
    for student in student_groups:
        rev_groups.setdefault(student_groups[student], []).append(student)
    for group in rev_groups:
        print(group + ': ' + str(rev_groups[group]))
# ----------------------------------------------------------------
# Independent Study 496, Student Stress Prediction
#
# file_name: clustering_students.py
# Functionality: clustering students,
#   every method returns dict: map: student_id(str) -> group_id(str)
# Author: Yunfei Luo
# Start date: EST Feb.22th.2020
# Last update: EST Apr.1st.2020
# ----------------------------------------------------------------

import sys

import numpy as np
import pandas as pd

# Colours / markers used to tell groups apart in the debug scatter plots.
# They are cycled, so any number of groups can be drawn.
_PLOT_COLORS = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'cyan', 'deeppink']
_PLOT_MARKERS = ['o', '^', 's', '*', '1', 'p', '_', 'X', 'P']


def get_features(student_list, features):  # helper function for kmeans_features
    '''
    Read the survey table and extract the requested columns per student.
    Missing values of the listed students are replaced by the mean of the
    listed students that do have a value.

    @param student_list: list of student ids in the form "<prefix>_<number>"
    @param features: list of (string) column names of the survey table
    @return: (df, student_features) where df is the imputed survey table and
             student_features maps student number(int) -> np.array of features
    '''
    survey = 'Data/data/surveys_and_covariates/high_lelvel_aggregated_data.csv'
    df = pd.read_csv(survey)

    # Row of every requested student in the survey table.
    ids = [int(student.split('_')[1]) for student in student_list]
    ind_pos = {int(sid): pos for pos, sid in enumerate(df['student_id'])}
    ind = [ind_pos[id_] for id_ in ids]

    # Cleansing: replace NaN by the average over the requested students.
    isna = df.isnull()
    for feature in features:
        have_data = [df.at[i, feature] for i in ind if not isna.at[i, feature]]
        if not have_data:
            raise ValueError('no student has a value for feature {!r}'.format(feature))
        avg = sum(have_data) / len(have_data)
        missing_rows = [i for i in ind if isna.at[i, feature]]
        if missing_rows:
            # .loc writes into df itself; df[feature][i] = avg is chained
            # indexing and does not reliably modify the frame.
            df.loc[missing_rows, feature] = avg

    student_features = dict()
    for i in ind:
        row = np.append(np.array([]), [df.at[i, feature] for feature in features])
        student_features[int(df.at[i, 'student_id'])] = row

    return df, student_features


def _plot_survey_groups(points, labels):
    '''Show a 3D scatter plot of the first three feature columns, one style per group.'''
    from mpl_toolkits import mplot3d  # noqa: F401
    import matplotlib.pyplot as plt

    points = np.asarray(points)
    labels = np.asarray(labels)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    for label in sorted(set(labels.tolist())):
        members = points[labels == label]
        ax.scatter(members[:, 0], members[:, 1], members[:, 2],
                   c=_PLOT_COLORS[label % len(_PLOT_COLORS)],
                   marker=_PLOT_MARKERS[label % len(_PLOT_MARKERS)])

    ax.set_xlabel('avg_hours_slept')
    ax.set_ylabel('mode_sleep_rating')
    ax.set_zlabel('avg_ddl_per_week')
    plt.show()


def kmeans_features(student_list, features, eps, min_samples):
    '''
    Group students by density-based clustering of their normalized survey
    features (the name is historical; no k-means is run).

    @param student_list: list of student id, in the form (string)student_id
    @param features: list of (string)feature; the plot expects at least three
    @param eps: distance for clustering
    @param min_samples: min # of samples for a point to be considered as a core
    @return: dict, map: student_id -> 'group_<label>'
    '''
    from sklearn import preprocessing
    import src.experiments.clustering.density_based_clustering as dbc

    _, student_features = get_features(student_list, features)
    A = preprocessing.normalize(np.array(list(student_features.values())))

    model = dbc.density_based_clustering(
        eps=eps, min_samples=min_samples, cluster_method='xi', metric='l2').fit(A)
    groups = {student: 'group_' + str(model.labels_[i])
              for i, student in enumerate(student_list)}

    _plot_survey_groups(A, model.labels_)
    return groups


# original model
def one_for_each(student_list):
    '''Put every student in a group of its own: student_list[i] -> "group_<i>".'''
    return {student: 'group_' + str(i) for i, student in enumerate(student_list)}


def _plot_avg_stress_groups(avgs, labels):
    '''Show the average stress values on a line, one style per group.'''
    import matplotlib.pyplot as plt

    groups_pts = dict()  # map: group label -> list of averages
    for avg, label in zip(avgs, labels):
        groups_pts.setdefault(label, []).append(avg[0])

    for label, x in groups_pts.items():
        plt.scatter(x, [0] * len(x),
                    c=_PLOT_COLORS[label % len(_PLOT_COLORS)],
                    marker=_PLOT_MARKERS[label % len(_PLOT_MARKERS)])
    plt.show()


# clustering based on average stress
def avg_stress_cluster(student_list, data, eps, min_samples):
    '''
    @param student_list: list of student id (not used; students are taken from the data keys)
    @param data: actual data, dict: (string)keys -> data; the key starts with
                 the student number and the last item of the value is the label
    @param eps: distance for clustering
    @param min_samples: min # of samples for a point to be considered as a core
    @return: dict, map: 'student_<number>' -> 'group_<label>'
    '''
    import src.experiments.clustering.density_based_clustering as dbc

    # Collect the labels of every student, then average them.
    labels_by_student = dict()
    for key in data:
        labels_by_student.setdefault('student_' + key.split('_')[0], []).append(data[key][-1])
    stress = {student: sum(values) / len(values)
              for student, values in labels_by_student.items()}
    avgs = [[stress[student]] for student in stress]

    model = dbc.density_based_clustering(
        eps=eps, min_samples=min_samples, cluster_method='xi', metric='l1').fit(avgs)
    groups = {student: 'group_' + str(model.labels_[i])
              for i, student in enumerate(stress)}

    _plot_avg_stress_groups(avgs, model.labels_)
    return groups


# time warping clustering
def time_warping(student_list, data, feature, eps, min_samples, plot_group=4):
    '''
    @param student_list: list of student id (not used; students are taken from the data keys)
    @param data: actual data, dict: (string)keys -> data, keys of the form
                 "<student>_<month>_<day>_<hour>..."
    @param feature: only -1 (the stress label) is implemented
    @param eps: distance for clustering
    @param min_samples: min # of samples for a point to be considered as a core
    @param plot_group: label of the group whose series are plotted; nothing
                       is plotted when no series got that label
    @return: dict, map: 'student_<number>' -> 'group_<label>'
    '''
    import src.experiments.clustering.density_based_clustering as dbc

    if feature != -1:
        # Other features were never implemented; fail early instead of
        # fitting the model on an empty point list.
        raise NotImplementedError('only the stress label (feature=-1) is supported')

    # map: month -> # days (index 0 is a placeholder so months are 1-based)
    month_days = {0: 0, 1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30,
                  7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31}

    student_key = dict()  # map: student -> [(key, time in days)]
    for key in data:
        curr = key.split('_')
        # days of the preceding months + day + hour as a fraction of a day
        time = sum(month_days[m] for m in range(int(curr[1]))) + int(curr[2]) + (int(curr[3]) / 24)
        student_key.setdefault(curr[0], []).append((key, time))

    # One [time, label] series per student, ordered by time.
    series = []
    for student in student_key:
        ordered = sorted(student_key[student], key=lambda x: x[1])
        series.append(np.array([[t, data[k][-1]] for k, t in ordered]))

    # The series have different lengths, so they are stored in an object
    # array; np.array(list_of_ragged_arrays) is rejected by current NumPy.
    pts = np.empty(len(series), dtype=object)
    for i, one_series in enumerate(series):
        pts[i] = one_series

    print('fitting...')
    model = dbc.density_based_clustering(
        eps=eps, min_samples=min_samples, cluster_method='xi', metric='dtw').fit(pts)

    print('predicting...')
    groups = {'student_' + student: 'group_' + str(model.labels_[i])
              for i, student in enumerate(student_key)}

    # Visualize the series of one group.
    members = [pts[i] for i in range(len(pts)) if model.labels_[i] == plot_group]
    if members:
        import matplotlib.pyplot as plt
        for pt in members:
            plt.plot([i[0] for i in pt], [i[1] for i in pt])
        plt.show()

    return groups


# do clustering works
def clustering(student_list, data, method):
    '''
    Cluster the students with the chosen method and write the result to
    'Data/student_groups/<method>.pkl'.

    @param student_list: list of student id, in the form (string)student_id
    @param data: the actual data of the students, with data_key->data
    @param method: string from command line argument(s), decide how to clustering:
                   'one_for_each', 'avg_stress_<eps>_<min-samples>',
                   'surveys_<eps>_<min-samples>' or 'dtw_<eps>_<min-samples>';
                   anything else falls back to 'one_for_each'
    @return: dict, map: student_id -> group_id (the same map that is written)
    '''
    from src.utils import write_utils

    parts = method.split('_')
    if method.startswith('avg_stress'):
        groups = avg_stress_cluster(student_list=student_list, data=data,
                                    eps=float(parts[-2]), min_samples=int(parts[-1]))
    elif method.startswith('surveys'):
        features = ['avg_hours_slept', 'mode_sleep_rating', 'avg_dead_line_per_week']
        groups = kmeans_features(student_list, features, float(parts[1]), int(parts[2]))
    elif method.startswith('dtw'):
        feature = -1  # stress label
        groups = time_warping(student_list, data, feature, float(parts[1]), int(parts[2]))
    else:
        groups = one_for_each(student_list)

    filepath = 'Data/student_groups/' + method + '.pkl'
    print('write to the file: ' + filepath)
    write_utils.data_structure_to_pickle(groups, filepath)
    return groups


if __name__ == '__main__':
    import src.utils.data_conversion_utils as conversions
    from src.utils.read_utils import read_pickle

    # ##### Pickle #####
    data_file_path = 'Data/training_data/shuffled_splits/training_date_normalized_shuffled_splits_select_features_no_prev_stress_all_students.pkl'
    data = read_pickle(data_file_path)
    student_list = conversions.extract_distinct_student_idsfrom_keys(data['data'].keys())
    student_list = conversions.prepend_ids_with_string(student_list, "student_")

    # First command line argument names the method; default otherwise.
    method = sys.argv[1] if len(sys.argv) > 1 else 'one_for_each'
    clustering(student_list, data['data'], method)

    groups_file_path = 'Data/student_groups/' + method + '.pkl'
    print('get group file from: ' + groups_file_path)
    student_groups = read_pickle(groups_file_path)

    # check how students are distributed
    print("student distribution: ")
    rev_groups = dict()
    for student in student_groups:
        rev_groups.setdefault(student_groups[student], []).append(student)
    for group in rev_groups:
        print(group + ': ' + str(rev_groups[group]))
# ----------------------------------------------------------------
# Independent Study 496, Student Stress Prediction
#
# file_name: density_based_clustering.py
# Functionality: Class, density_based_clustering: do clustering based on density
# Author: Yunfei Luo
# Start date: EST Apr.9th.2020
# Last update: EST Apr.9th.2020
# ----------------------------------------------------------------

import numpy as np


class density_based_clustering:
    '''
    DBSCAN over a pairwise distance matrix, followed by a pass that assigns
    every point DBSCAN marked as an outlier (label -1) to some group, so the
    final `labels_` contain no -1.
    '''

    def __init__(self, eps, min_samples, cluster_method, metric):
        '''
        @param eps: neighbourhood radius handed to DBSCAN
        @param min_samples: min # of samples for a point to be considered as a core
        @param cluster_method: stored only; fit() always runs DBSCAN
        @param metric: 'l1', 'l2', 'dtw', or 'precomputed' (fit() then
                       receives the distance matrix itself)
        '''
        self.eps = eps
        self.min_samples = min_samples
        self.cluster_method = cluster_method  # xi or dbscan
        self.metric = metric
        self.dist_matrix = list()
        self.labels_ = list()

    def fit(self, pts):
        '''
        Cluster `pts` and store the group of every point in `self.labels_`.

        @param pts: the points, or the distance matrix when metric is 'precomputed'
        @return: self
        '''
        from sklearn.cluster import DBSCAN

        self.pts = np.array(pts)
        if self.metric == 'precomputed':
            self.dist_matrix = self.pts
        else:
            self.dist_matrix = self.compute_dist_matrix(self.pts)

        print('############################ Predicted by DBSCAN ############################')
        clusters = DBSCAN(eps=self.eps, min_samples=self.min_samples,
                          metric='precomputed').fit(self.dist_matrix)
        self.labels_ = clusters.labels_

        # Give the outliers a group as well.
        outliers_ind = [i for i, label in enumerate(self.labels_) if label == -1]
        print('Num of outliers: ', len(outliers_ind))
        self.cluster_outliers(outliers_ind)

        return self

    def _closest(self, i, candidates):
        '''Return the index among `candidates` (excluding `i`) nearest to point `i`, or -1 if there is none.'''
        min_ind = -1
        min_dist = np.inf
        for j in candidates:
            if i != j and self.dist_matrix[i][j] < min_dist:
                min_dist = self.dist_matrix[i][j]
                min_ind = j
        return min_ind

    def cluster_outliers(self, outliers_ind):
        '''
        a. Put each outlier into the cluster its closest point belongs to;
        b. Repeat a. until no change is made;
        c. Group the remaining outliers greedily: start a new group at the
           first remaining outlier and keep adding the closest outlier of the
           point added last, until that closest outlier already has a group.

        @param outliers_ind: indices of the points whose label is -1
        '''
        # Steps a/b: attach outliers whose nearest point is already grouped.
        n_points = len(self.pts)
        has_change = True
        while has_change:
            has_change = False
            for i in outliers_ind:
                if self.labels_[i] != -1:
                    continue
                nearest = self._closest(i, range(n_points))
                # nearest == -1 means there is no other point to follow.
                if nearest != -1 and self.labels_[nearest] != -1:
                    self.labels_[i] = self.labels_[nearest]
                    has_change = True

        # Step c: every remaining outlier has an outlier as its closest point.
        outliers_ind = [i for i in outliers_ind if self.labels_[i] == -1]
        left = list(outliers_ind)
        while left:
            group_id = max(self.labels_) + 1
            i = left[0]
            self.labels_[i] = group_id
            # With a single outlier left it simply joins its closest outlier.
            only_one_left = len(left) == 1

            while True:
                nearest = self._closest(i, outliers_ind)
                if nearest == -1:
                    # No other outlier exists; the point keeps its new group.
                    break
                if only_one_left or self.labels_[nearest] != -1:
                    self.labels_[i] = self.labels_[nearest]
                    break
                self.labels_[nearest] = group_id
                i = nearest

            # Drop the outliers that received a group.
            left = [j for j in left if self.labels_[j] == -1]

    # helper functions
    def compute_dist_matrix(self, pts):
        '''
        Return the matrix of pairwise distances between `pts` under self.metric.

        @raise ValueError: when self.metric is not 'l1', 'l2' or 'dtw'
        '''
        metrics_by_name = {
            'l1': lambda x1, x2: np.linalg.norm(x1 - x2, 1),
            'l2': lambda x1, x2: np.linalg.norm(x1 - x2, 2),
            'dtw': self.dtw_dist,
        }
        if self.metric not in metrics_by_name:
            raise ValueError(
                'The distance computing type is not available yet: {!r}'.format(self.metric))
        compute_dist = metrics_by_name[self.metric]
        return np.array([[compute_dist(pt1, pt2) for pt2 in pts] for pt1 in pts])

    ## DTW distance ##############################################################
    def dist(self, p1, p2):
        '''Euclidean distance between two points.'''
        return np.linalg.norm(p1 - p2, ord=2)

    def dtw_dist(self, ts1, ts2):
        '''
        Dynamic-time-warping distance between two series of points.

        NOTE(review): the cost of aligning the two first points is not
        counted (the table starts at 0), so two one-point series always have
        distance 0. Kept as is because existing groupings were produced with it.

        @raise ValueError: when one of the series is empty
        '''
        n, m = len(ts1), len(ts2)
        if n == 0 or m == 0:
            raise ValueError('dtw_dist needs two non-empty series')

        acc = np.zeros((n, m))  # acc[i, j]: cheapest alignment of ts1[:i+1], ts2[:j+1]
        for i in range(n):
            for j in range(m):
                if i == 0 and j == 0:
                    continue
                cost = self.dist(ts1[i], ts2[j])
                if i > 0 and j > 0:
                    best = min(acc[i - 1, j], acc[i, j - 1], acc[i - 1, j - 1])
                elif i > 0:
                    best = acc[i - 1, j]
                else:
                    best = acc[i, j - 1]
                acc[i, j] = cost + best

        return acc[n - 1, m - 1]
    ##############################################################################
import numpy as np


class DTW_clusters:
    '''
    Cluster time series by their pairwise DTW distance and assign new series
    to the group of their nearest fitted series.
    '''

    def __init__(self, eps, min_samples):
        '''
        @param eps: distance threshold for clustering
        @param min_samples: min # of samples for a point to be considered as a core
        '''
        self.eps = eps
        self.min_samples = min_samples
        self.random_state = 0
        self.dist_matrix = list()  # 2D array of distance matrix
        self.groups = dict()  # dictionary, map: pts_ind -> group
        self.pts = list()

    # helper functions
    def cluster_by_construct_graph(self):
        '''
        Construct graph where each node represent each data point;
        Add Edge between two nodes if their distance is at most eps;
        Collect cluster information by retrieving the connected components.
        The result is stored in self.groups.
        '''
        n_points = len(self.pts)
        chosen = [False] * n_points
        group_id = -1
        for start in range(n_points):
            if chosen[start]:
                continue
            # `start` is the first point without a group: open a new one.
            group_id += 1
            chosen[start] = True
            self.groups[start] = group_id
            # Explicit stack instead of recursion, so a large component
            # cannot hit the interpreter's recursion limit.
            stack = [start]
            while stack:
                ind = stack.pop()
                for i in range(n_points):
                    if not chosen[i] and self.dist_matrix[ind][i] <= self.eps:
                        chosen[i] = True
                        self.groups[i] = group_id
                        stack.append(i)

    def density_based_clustering(self):
        '''
        Cluster the fitted distance matrix with the project's density based
        clustering and store the label of every point in self.groups.
        '''
        import src.experiments.clustering.density_based_clustering as dbc

        clusters = dbc.density_based_clustering(
            eps=self.eps, min_samples=self.min_samples,
            cluster_method='xi', metric='precomputed').fit(self.dist_matrix)

        for i in range(len(self.pts)):
            self.groups[i] = clusters.labels_[i]

    def fit(self, data):
        '''
        Compute the pairwise DTW distances of `data` and cluster them.

        @param data: sequence of series, each a sequence of points
        @return: self
        '''
        self.dist_matrix = np.array(
            [[self.dtw_dist(pt1, pt2) for pt2 in data] for pt1 in data])
        self.pts = data

        self.density_based_clustering()

        return self

    def predict(self, pts):
        '''
        @param pts: sequence of series
        @return: list with, for every series, the group of its nearest fitted
                 series (-1 when nothing was fitted)
        '''
        res = list()
        for pt in pts:
            group = -1
            min_dist = np.inf
            for i, fitted in enumerate(self.pts):
                curr_dist = self.dtw_dist(pt, fitted)
                if curr_dist < min_dist:
                    min_dist = curr_dist
                    group = self.groups[i]
            res.append(group)
        return res

    # helper functions
    def dist(self, p1, p2):
        '''Euclidean distance between two points.'''
        return np.linalg.norm(p1 - p2, ord=2)

    def dtw_dist(self, ts1, ts2):
        '''
        Dynamic-time-warping distance between two series of points.

        NOTE(review): the cost of aligning the two first points is not
        counted (the table starts at 0), so two one-point series always have
        distance 0. Kept as is because existing groupings were produced with it.

        @raise ValueError: when one of the series is empty
        '''
        n, m = len(ts1), len(ts2)
        if n == 0 or m == 0:
            raise ValueError('dtw_dist needs two non-empty series')

        acc = np.zeros((n, m))  # acc[i, j]: cheapest alignment of ts1[:i+1], ts2[:j+1]
        for i in range(n):
            for j in range(m):
                if i == 0 and j == 0:
                    continue
                cost = self.dist(ts1[i], ts2[j])
                if i > 0 and j > 0:
                    best = min(acc[i - 1, j], acc[i, j - 1], acc[i - 1, j - 1])
                elif i > 0:
                    best = acc[i - 1, j]
                else:
                    best = acc[i, j - 1]
                acc[i, j] = cost + best

        return acc[n - 1, m - 1]
b/src/experiments/clustering/student_groups/avg_stress_0.1_5.pkl differ diff --git a/src/experiments/clustering/student_groups/avg_stress_0.2_5.pkl b/src/experiments/clustering/student_groups/avg_stress_0.2_5.pkl new file mode 100644 index 0000000..b083149 Binary files /dev/null and b/src/experiments/clustering/student_groups/avg_stress_0.2_5.pkl differ diff --git a/src/experiments/clustering/student_groups/avg_stress_1_20.pkl b/src/experiments/clustering/student_groups/avg_stress_1_20.pkl new file mode 100644 index 0000000..ca32f6b Binary files /dev/null and b/src/experiments/clustering/student_groups/avg_stress_1_20.pkl differ diff --git a/src/experiments/clustering/student_groups/dtw_25_3.pkl b/src/experiments/clustering/student_groups/dtw_25_3.pkl new file mode 100644 index 0000000..ec34565 Binary files /dev/null and b/src/experiments/clustering/student_groups/dtw_25_3.pkl differ diff --git a/src/experiments/clustering/student_groups/dtw_50_3.pkl b/src/experiments/clustering/student_groups/dtw_50_3.pkl new file mode 100644 index 0000000..cdd4c05 Binary files /dev/null and b/src/experiments/clustering/student_groups/dtw_50_3.pkl differ diff --git a/src/experiments/clustering/student_groups/dtw_75_3.pkl b/src/experiments/clustering/student_groups/dtw_75_3.pkl new file mode 100644 index 0000000..01ac51c Binary files /dev/null and b/src/experiments/clustering/student_groups/dtw_75_3.pkl differ diff --git a/src/experiments/clustering/student_groups/dtw_db/dtw_25_3.pkl b/src/experiments/clustering/student_groups/dtw_db/dtw_25_3.pkl new file mode 100644 index 0000000..ec34565 Binary files /dev/null and b/src/experiments/clustering/student_groups/dtw_db/dtw_25_3.pkl differ diff --git a/src/experiments/clustering/student_groups/dtw_db/dtw_50_3.pkl b/src/experiments/clustering/student_groups/dtw_db/dtw_50_3.pkl new file mode 100644 index 0000000..cdd4c05 Binary files /dev/null and b/src/experiments/clustering/student_groups/dtw_db/dtw_50_3.pkl differ diff --git 
a/src/experiments/clustering/student_groups/dtw_db/dtw_75_3.pkl b/src/experiments/clustering/student_groups/dtw_db/dtw_75_3.pkl new file mode 100644 index 0000000..01ac51c Binary files /dev/null and b/src/experiments/clustering/student_groups/dtw_db/dtw_75_3.pkl differ diff --git a/src/experiments/clustering/student_groups/one_for_each.pkl b/src/experiments/clustering/student_groups/one_for_each.pkl new file mode 100644 index 0000000..1b2f300 Binary files /dev/null and b/src/experiments/clustering/student_groups/one_for_each.pkl differ diff --git a/src/experiments/clustering/student_groups/pre_survey_scores_2.pkl b/src/experiments/clustering/student_groups/pre_survey_scores_2.pkl new file mode 100644 index 0000000..b172639 Binary files /dev/null and b/src/experiments/clustering/student_groups/pre_survey_scores_2.pkl differ diff --git a/src/experiments/clustering/student_groups/pre_survey_scores_3.pkl b/src/experiments/clustering/student_groups/pre_survey_scores_3.pkl new file mode 100644 index 0000000..1f445e7 Binary files /dev/null and b/src/experiments/clustering/student_groups/pre_survey_scores_3.pkl differ diff --git a/src/experiments/clustering/student_groups/pre_survey_scores_4.pkl b/src/experiments/clustering/student_groups/pre_survey_scores_4.pkl new file mode 100644 index 0000000..6e805d0 Binary files /dev/null and b/src/experiments/clustering/student_groups/pre_survey_scores_4.pkl differ diff --git a/src/experiments/clustering/student_groups/pre_survey_scores_5.pkl b/src/experiments/clustering/student_groups/pre_survey_scores_5.pkl new file mode 100644 index 0000000..7529f74 Binary files /dev/null and b/src/experiments/clustering/student_groups/pre_survey_scores_5.pkl differ diff --git a/src/experiments/clustering/student_groups/pre_survey_scores_6.pkl b/src/experiments/clustering/student_groups/pre_survey_scores_6.pkl new file mode 100644 index 0000000..97411f0 Binary files /dev/null and b/src/experiments/clustering/student_groups/pre_survey_scores_6.pkl 
differ diff --git a/src/experiments/clustering/student_groups/pre_survey_scores_7.pkl b/src/experiments/clustering/student_groups/pre_survey_scores_7.pkl new file mode 100644 index 0000000..0a0c834 Binary files /dev/null and b/src/experiments/clustering/student_groups/pre_survey_scores_7.pkl differ diff --git a/src/experiments/clustering/student_groups/pre_survey_scores_normalized_2.pkl b/src/experiments/clustering/student_groups/pre_survey_scores_normalized_2.pkl new file mode 100644 index 0000000..1793502 Binary files /dev/null and b/src/experiments/clustering/student_groups/pre_survey_scores_normalized_2.pkl differ diff --git a/src/experiments/clustering/student_groups/pre_survey_scores_normalized_3.pkl b/src/experiments/clustering/student_groups/pre_survey_scores_normalized_3.pkl new file mode 100644 index 0000000..1f7f378 Binary files /dev/null and b/src/experiments/clustering/student_groups/pre_survey_scores_normalized_3.pkl differ diff --git a/src/experiments/clustering/student_groups/pre_survey_scores_normalized_4.pkl b/src/experiments/clustering/student_groups/pre_survey_scores_normalized_4.pkl new file mode 100644 index 0000000..a725ae1 Binary files /dev/null and b/src/experiments/clustering/student_groups/pre_survey_scores_normalized_4.pkl differ diff --git a/src/experiments/clustering/student_groups/pre_survey_scores_normalized_5.pkl b/src/experiments/clustering/student_groups/pre_survey_scores_normalized_5.pkl new file mode 100644 index 0000000..9504c64 Binary files /dev/null and b/src/experiments/clustering/student_groups/pre_survey_scores_normalized_5.pkl differ diff --git a/src/experiments/clustering/student_groups/pre_survey_scores_normalized_6.pkl b/src/experiments/clustering/student_groups/pre_survey_scores_normalized_6.pkl new file mode 100644 index 0000000..93bf730 Binary files /dev/null and b/src/experiments/clustering/student_groups/pre_survey_scores_normalized_6.pkl differ diff --git 
a/src/experiments/clustering/student_groups/survey_scores_2.pkl b/src/experiments/clustering/student_groups/survey_scores_2.pkl new file mode 100644 index 0000000..f5680c5 Binary files /dev/null and b/src/experiments/clustering/student_groups/survey_scores_2.pkl differ diff --git a/src/experiments/clustering/student_groups/survey_scores_3.pkl b/src/experiments/clustering/student_groups/survey_scores_3.pkl new file mode 100644 index 0000000..014d0bb Binary files /dev/null and b/src/experiments/clustering/student_groups/survey_scores_3.pkl differ diff --git a/src/experiments/clustering/student_groups/survey_scores_4.pkl b/src/experiments/clustering/student_groups/survey_scores_4.pkl new file mode 100644 index 0000000..72ca9d8 Binary files /dev/null and b/src/experiments/clustering/student_groups/survey_scores_4.pkl differ diff --git a/src/experiments/clustering/student_groups/survey_scores_5.pkl b/src/experiments/clustering/student_groups/survey_scores_5.pkl new file mode 100644 index 0000000..8b360e0 Binary files /dev/null and b/src/experiments/clustering/student_groups/survey_scores_5.pkl differ diff --git a/src/experiments/clustering/student_groups/survey_scores_normalized_2.pkl b/src/experiments/clustering/student_groups/survey_scores_normalized_2.pkl new file mode 100644 index 0000000..090835a Binary files /dev/null and b/src/experiments/clustering/student_groups/survey_scores_normalized_2.pkl differ diff --git a/src/experiments/clustering/student_groups/survey_scores_normalized_3.pkl b/src/experiments/clustering/student_groups/survey_scores_normalized_3.pkl new file mode 100644 index 0000000..f7c7880 Binary files /dev/null and b/src/experiments/clustering/student_groups/survey_scores_normalized_3.pkl differ diff --git a/src/experiments/clustering/student_groups/survey_scores_normalized_4.pkl b/src/experiments/clustering/student_groups/survey_scores_normalized_4.pkl new file mode 100644 index 0000000..ecd5560 Binary files /dev/null and 
b/src/experiments/clustering/student_groups/survey_scores_normalized_4.pkl differ diff --git a/src/experiments/clustering/student_groups/survey_scores_normalized_5.pkl b/src/experiments/clustering/student_groups/survey_scores_normalized_5.pkl new file mode 100644 index 0000000..e09653b Binary files /dev/null and b/src/experiments/clustering/student_groups/survey_scores_normalized_5.pkl differ diff --git a/src/experiments/clustering/student_groups/survey_scores_normalized_6.pkl b/src/experiments/clustering/student_groups/survey_scores_normalized_6.pkl new file mode 100644 index 0000000..2385ab5 Binary files /dev/null and b/src/experiments/clustering/student_groups/survey_scores_normalized_6.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_0.02_5.pkl b/src/experiments/clustering/student_groups/surveys_0.02_5.pkl new file mode 100644 index 0000000..6ae80f3 Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_0.02_5.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_0.03_5.pkl b/src/experiments/clustering/student_groups/surveys_0.03_5.pkl new file mode 100644 index 0000000..555b480 Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_0.03_5.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_0.05_5.pkl b/src/experiments/clustering/student_groups/surveys_0.05_5.pkl new file mode 100644 index 0000000..fc144e1 Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_0.05_5.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_db/surveys_0.3_2.pkl b/src/experiments/clustering/student_groups/surveys_db/surveys_0.3_2.pkl new file mode 100644 index 0000000..0f3ca05 Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_db/surveys_0.3_2.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_db/surveys_0.3_5.pkl 
b/src/experiments/clustering/student_groups/surveys_db/surveys_0.3_5.pkl new file mode 100644 index 0000000..e14bb40 Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_db/surveys_0.3_5.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_db/surveys_0.5_6.pkl b/src/experiments/clustering/student_groups/surveys_db/surveys_0.5_6.pkl new file mode 100644 index 0000000..b19b7b4 Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_db/surveys_0.5_6.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_db/surveys_0.7_5.pkl b/src/experiments/clustering/student_groups/surveys_db/surveys_0.7_5.pkl new file mode 100644 index 0000000..b6d8199 Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_db/surveys_0.7_5.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_kmeans/surveys_10_groups.pkl b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_10_groups.pkl new file mode 100644 index 0000000..4ab0511 Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_10_groups.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_kmeans/surveys_12_groups.pkl b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_12_groups.pkl new file mode 100644 index 0000000..0719fa7 Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_12_groups.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_kmeans/surveys_3_groups.pkl b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_3_groups.pkl new file mode 100644 index 0000000..82ee5f6 Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_3_groups.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_kmeans/surveys_4_groups.pkl 
b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_4_groups.pkl new file mode 100644 index 0000000..248dd93 Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_4_groups.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_kmeans/surveys_5_groups.pkl b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_5_groups.pkl new file mode 100644 index 0000000..7a33060 Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_5_groups.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_kmeans/surveys_6_groups.pkl b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_6_groups.pkl new file mode 100644 index 0000000..ecf87de Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_6_groups.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_kmeans/surveys_7_groups.pkl b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_7_groups.pkl new file mode 100644 index 0000000..4251620 Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_7_groups.pkl differ diff --git a/src/experiments/clustering/student_groups/surveys_kmeans/surveys_8_groups.pkl b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_8_groups.pkl new file mode 100644 index 0000000..38be6c1 Binary files /dev/null and b/src/experiments/clustering/student_groups/surveys_kmeans/surveys_8_groups.pkl differ diff --git a/src/experiments/clustering/survey/BigFive.csv b/src/experiments/clustering/survey/BigFive.csv new file mode 100644 index 0000000..a232e9b --- /dev/null +++ b/src/experiments/clustering/survey/BigFive.csv @@ -0,0 +1,86 @@ +uid,type,I see myself as someone who... - 1. Is talkative,I see myself as someone who... - 2. Tends to find fault with others,I see myself as someone who... - 3. 
Does a thorough job,"I see myself as someone who... - 4. Is depressed, blue","I see myself as someone who... - 5. Is original, comes up with new ideas",I see myself as someone who... - 6. Is reserved,I see myself as someone who... - 7. Is helpful and unselfish with others,I see myself as someone who... - 8. Can be somewhat careless,"I see myself as someone who... - 9. Is relaxed, handles stress well.",I see myself as someone who... - 10. Is curious about many different things,I see myself as someone who... - 11.Is full of energy,I see myself as someone who... - 12. Starts quarrels with others,I see myself as someone who... - 13. Is a reliable worker,I see myself as someone who... - 14. Can be tense,"I see myself as someone who... - 15. Is ingenious, a deep thinker",I see myself as someone who... - 16. Generates a lot of enthusiasm,I see myself as someone who... - 17. Has a forgiving nature,I see myself as someone who... - 18. Tends to be disorganized,I see myself as someone who... - 19. Worries a lot,I see myself as someone who... - 20. Has an active imagination,I see myself as someone who... - 21. Tends to be quiet,I see myself as someone who... - 22. Is generally trusting,I see myself as someone who... - 23. Tends to be lazy,"I see myself as someone who... - 24. Is emotionally stable, not easily upset",I see myself as someone who... - 25. Is inventive,I see myself as someone who... - 26. Has an assertive personality,I see myself as someone who... - 27. Can be cold and aloof,I see myself as someone who... - 28. Perseveres until the task is finished,I see myself as someone who... - 29. Can be moody,"I see myself as someone who... - 30. Values artistic, aesthetic experiences","I see myself as someone who... - 31. Is sometimes shy, inhibited",I see myself as someone who... - 32. Is considerate and kind to almost everyone,I see myself as someone who... - 33. Does things efficiently,I see myself as someone who... - 34. 
Remains calm in tense situations,I see myself as someone who... - 35. Prefers work that is routine,"I see myself as someone who... - 36. Is outgoing, sociable",I see myself as someone who... - 37. Is sometimes rude to others,I see myself as someone who... - 38. Makes plans and follows through with them,I see myself as someone who... - 39. Gets nervous easily,"I see myself as someone who... - 40. Likes to reflect, play with ideas",I see myself as someone who... - 41. Has few artistic interests,I see myself as someone who... - 42. Likes to cooperate with others,I see myself as someone who... - 43. Is easily distracted,"I see myself as someone who... - 44. Is sophisticated in art, music, or literature" +u00,pre,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Agree strongly,Agree strongly,Agree strongly,Agree a little,Agree strongly,Agree a little,Agree a little,Agree a little,Disagree a little,Disagree a little,Disagree a little,Disagree Strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Disagree a little,Agree a little,Disagree a little,Agree a little,Agree a little,Agree a little +u01,pre,Neither agree nor disagree,Agree a little,Agree strongly,Agree strongly,Agree a little,Agree a little,Agree strongly,Disagree a little,Agree a little,Agree strongly,Agree strongly,Disagree Strongly,Agree strongly,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree strongly,Agree a 
little,Agree strongly,Agree a little,Disagree a little,Agree a little,Agree strongly,Agree strongly,Agree strongly,Disagree a little,Neither agree nor disagree,Disagree a little,Agree a little,Disagree a little,Agree strongly,Disagree a little,Neither agree nor disagree,Disagree a little,Agree a little +u02,pre,Disagree a little,Agree a little,Agree strongly,Disagree Strongly,Neither agree nor disagree,Agree strongly,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree strongly,Disagree a little,Disagree a little,Disagree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Agree a little,Agree a little,Agree strongly,Disagree a little,Agree strongly,Disagree a little,Disagree a little,Agree a little,Agree a little,Agree strongly,Agree a little,Agree a little,Agree a little,Neither agree nor disagree +u03,pre,Disagree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Disagree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Disagree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Disagree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Disagree a little,Disagree a little +u04,pre,Disagree a little,Neither agree nor 
disagree,Neither agree nor disagree,Disagree a little,Disagree a little,Agree a little,Agree strongly,Agree a little,Disagree Strongly,Agree a little,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Agree a little,Agree strongly,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Disagree Strongly,Neither agree nor disagree,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Disagree a little,Agree strongly,Disagree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree +u05,pre,Agree a little,Neither agree nor disagree,Agree strongly,Disagree Strongly,Neither agree nor disagree,Agree a little,Agree strongly,Disagree Strongly,Agree a little,Agree a little,Agree a little,Disagree Strongly,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Disagree Strongly,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree a little +u07,pre,Disagree a little,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree strongly,Disagree a little,Agree a little,Disagree a little,Disagree a little,Disagree a little,Agree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Disagree a 
little,Disagree a little,Disagree a little,Neither agree nor disagree,Agree strongly,Disagree a little,Agree a little,Agree a little,Agree strongly,Disagree a little,Neither agree nor disagree,,Disagree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Disagree Strongly +u08,pre,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Agree a little,Agree a little,Agree strongly,Agree a little,Disagree a little,Agree strongly,Agree a little,Agree strongly,Disagree a little,Agree a little,Disagree a little,Agree strongly,Agree a little,Agree strongly,Neither agree nor disagree,Disagree a little,Agree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Agree strongly,Agree a little,Agree strongly,Agree a little,Disagree a little,Agree a little,Disagree Strongly,Agree strongly,Agree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Disagree Strongly +u09,pre,Disagree a little,Disagree Strongly,Agree a little,Disagree Strongly,Agree a little,Agree strongly,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Disagree Strongly,Agree a little,Disagree a little,Agree strongly,Neither agree nor disagree,Agree strongly,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Disagree Strongly,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Neither agree nor disagree,Disagree a little,Disagree Strongly,Agree a little,Agree a little,Neither agree nor disagree,Agree a 
little,Agree a little,Disagree a little +u10,pre,Disagree Strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Agree a little,Agree strongly,Disagree a little,Disagree Strongly,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Disagree Strongly,Disagree a little,Agree strongly,Agree a little,Agree strongly,Disagree a little,Disagree Strongly,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Agree strongly,Agree strongly,Disagree Strongly,Agree a little,Disagree Strongly,Agree strongly,Agree a little,Agree a little,Disagree Strongly,Agree strongly,Agree strongly,Agree a little,Agree a little,Disagree Strongly,Disagree Strongly,Disagree a little,Neither agree nor disagree +u12,pre,Disagree Strongly,Disagree a little,Agree a little,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Agree a little,Agree a little,Disagree Strongly,Agree a little,Disagree a little,Agree a little,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Disagree a little +u13,pre,Agree a little,Agree a little,Disagree a little,Disagree a little,Agree strongly,Disagree Strongly,Agree a little,Agree strongly,Disagree Strongly,Agree strongly,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Disagree a little,Agree strongly,Agree 
strongly,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Agree a little,Disagree a little,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree strongly,Agree a little,Agree a little,Agree a little +u14,pre,Agree a little,Agree strongly,Agree strongly,Disagree Strongly,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Disagree Strongly,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,,Neither agree nor disagree,Agree a little,Neither agree nor disagree,,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Agree a little,Agree a little,Agree a little,Agree strongly,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree +u15,pre,Agree a little,Disagree a little,Agree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree Strongly,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor 
disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree a little +u16,pre,Disagree Strongly,Agree a little,Agree strongly,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree strongly,Neither agree nor disagree,Disagree Strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Disagree Strongly,Agree a little,Agree strongly,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree strongly,Agree strongly,Disagree Strongly,Agree a little,Agree a little,Disagree a little +u17,pre,Agree strongly,Agree strongly,Agree strongly,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Disagree a little,Agree strongly,Neither agree nor disagree,Agree a little,Agree strongly,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Agree strongly,Agree strongly,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Disagree Strongly,Agree strongly,Disagree Strongly,Agree strongly,Neither agree nor disagree,Agree strongly +u18,pre,Disagree a little,Agree a little,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree 
nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Disagree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree Strongly,Disagree a little,Disagree a little,Disagree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree Strongly +u19,pre,Disagree Strongly,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Agree strongly,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree +u20,pre,Agree strongly,Disagree a little,Agree a little,Disagree a little,Agree strongly,Disagree a little,Agree strongly,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree,Disagree a 
little,Agree strongly,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Agree strongly,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Agree strongly,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little +u22,pre,Neither agree nor disagree,Disagree a little,Agree strongly,Disagree a little,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree strongly,Agree strongly,Disagree Strongly,Agree strongly,Agree a little,Agree strongly,Agree strongly,Disagree a little,Disagree Strongly,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree Strongly,Neither agree nor disagree,Agree strongly,Agree a little,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Agree strongly,Disagree a little,Disagree a little,Agree a little,Agree a little,Agree strongly,Disagree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Disagree a little +u23,pre,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Disagree a little,Disagree a little,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor 
disagree,Disagree a little,Neither agree nor disagree,Agree strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Agree strongly +u24,pre,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Disagree a little,Agree a little,Disagree a little,Agree a little,Agree strongly,Neither agree nor disagree,Disagree a little,Disagree a little,Agree a little,Agree a little,Disagree a little,Agree strongly,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little +u27,pre,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree Strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little +u30,pre,Agree strongly,Disagree a little,Agree strongly,Disagree Strongly,Agree a little,Disagree a 
little,Agree a little,Disagree a little,Disagree a little,Agree a little,Agree a little,Disagree Strongly,Agree strongly,Disagree a little,Agree a little,Agree strongly,Agree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Disagree Strongly,Agree a little,Agree a little,Agree strongly,Disagree a little,Agree strongly,Disagree a little,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree,Agree strongly,Disagree a little,Agree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little +u31,pre,Agree strongly,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree Strongly,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Disagree Strongly,Disagree a little,Agree strongly,Agree a little,Agree a little,Agree a little,Disagree Strongly,Agree strongly,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree a little,Agree strongly,Disagree Strongly,Agree strongly,Neither agree nor disagree,Disagree a little,Disagree Strongly,Agree strongly,Disagree a little,Neither agree nor disagree,Disagree a little,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Agree strongly +u32,pre,Agree strongly,Agree a little,Agree strongly,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Disagree Strongly,Agree strongly,Agree strongly,Agree strongly,Disagree Strongly,Agree a little,Disagree a little,Agree a little,Agree strongly,Agree strongly,Disagree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Disagree Strongly,Agree strongly,Disagree a little,Agree a 
little,Neither agree nor disagree,Agree a little,Agree strongly,Agree a little,Disagree a little,Agree strongly,Disagree a little,Agree strongly,Disagree a little,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree,Disagree a little +u33,pre,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Disagree a little,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Neither agree nor disagree,Agree a little,Agree strongly,Neither agree nor disagree,Agree strongly,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Disagree a little,Disagree Strongly,Neither agree nor disagree,Agree strongly,Disagree Strongly,Disagree a little,Agree strongly,Agree strongly +u34,pre,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree Strongly,Neither agree nor disagree,Disagree a little,Agree strongly,Agree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree a little,Disagree Strongly,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree a little,Agree a little,Disagree a little,Neither agree nor disagree +u35,pre,Neither agree nor disagree,Disagree 
a little,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Disagree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Disagree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Disagree a little +u36,pre,Agree a little,Disagree a little,Agree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree strongly,Agree a little,Neither agree nor disagree,Disagree Strongly,Agree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree +u39,pre,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a 
little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree +u42,pre,Agree a little,Disagree Strongly,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree strongly,Agree a little,Disagree Strongly,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Disagree a little,Disagree a little,Agree a little,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree +u43,pre,Disagree a little,Disagree a little,Agree strongly,Disagree a little,Agree a little,Agree strongly,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Disagree Strongly,Agree strongly,Agree a little,Agree a little,Agree a little,Agree strongly,Disagree a little,Disagree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree Strongly,Agree a little,Agree a little,Disagree a little,Disagree a little,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree 
strongly,Neither agree nor disagree,Disagree a little,Disagree a little,Agree strongly,Agree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Disagree a little +u44,pre,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree strongly,Agree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree strongly,Disagree Strongly,Disagree Strongly,Agree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Disagree a little,Agree a little,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Agree a little,Disagree Strongly +u45,pre,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Disagree Strongly,Neither agree nor disagree,Neither agree nor disagree,Disagree Strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree Strongly,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Agree strongly,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Disagree Strongly,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Disagree Strongly,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Disagree Strongly,Agree a little,Disagree a little,Agree a little +u46,pre,Agree a little,Agree strongly,Agree strongly,Disagree a little,Agree a little,Agree a little,Agree strongly,Neither 
agree nor disagree,Disagree a little,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree strongly,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree +u47,pre,Agree strongly,Neither agree nor disagree,Agree a little,Disagree a little,Agree strongly,Disagree a little,Agree a little,Agree a little,Agree strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Disagree a little,Disagree a little,Disagree a little,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree strongly,Agree strongly,Neither agree nor disagree,Agree strongly,Disagree a little,Disagree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree +u49,pre,Agree a little,Disagree a little,Agree strongly,Disagree Strongly,Agree a little,Agree strongly,Agree a little,Agree a little,Agree strongly,Agree strongly,Agree a little,Disagree Strongly,Agree strongly,Agree a little,Agree strongly,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Neither agree nor 
disagree,Agree a little,Agree a little,Agree strongly,Agree a little,Agree strongly,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Disagree a little,Agree strongly +u50,pre,Agree strongly,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree +u51,pre,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Agree a little,Disagree a little,Agree a little,Disagree a little,Agree a little,Disagree a little,Agree a little,Disagree a little,Agree a 
little +u52,pre,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Disagree Strongly,Neither agree nor disagree,Disagree a little,Agree strongly,Neither agree nor disagree,Disagree Strongly,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree,Agree a little,Agree strongly,Agree strongly,Neither agree nor disagree,Agree strongly,Disagree Strongly,Neither agree nor disagree,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree strongly,Disagree a little,Agree strongly,Agree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Disagree Strongly,Disagree a little,Disagree a little,Agree strongly,Agree strongly,Agree a little,Agree a little,Agree strongly,Disagree Strongly +u53,pre,Agree a little,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Agree strongly,Agree strongly,Agree a little,Agree strongly,Disagree a little,Agree a little,Disagree a little,Disagree Strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Disagree a little,Agree strongly,Agree strongly,Agree strongly,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Disagree Strongly,Disagree a little,Neither agree nor disagree,Disagree a little,Disagree a little,Agree strongly,Agree strongly,Disagree Strongly,Agree a little,Agree strongly,Agree strongly +u56,pre,Disagree a little,Disagree Strongly,Agree a little,Disagree Strongly,Agree strongly,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Disagree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Agree strongly,Disagree a little,Neither agree nor disagree,Agree strongly,Agree a 
little,Neither agree nor disagree,Agree strongly,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Agree a little,Agree a little,Agree a little,Disagree Strongly,Disagree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Agree a little,Disagree a little,Agree a little,Disagree a little,Neither agree nor disagree +u57,pre,Disagree Strongly,Disagree a little,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Agree strongly,Agree a little,Disagree a little,Neither agree nor disagree,Agree strongly,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Disagree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Disagree Strongly,Neither agree nor disagree,Disagree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree a little +u58,pre,Disagree Strongly,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree a little,Agree strongly,Agree strongly,Neither agree nor disagree,Disagree a little,Agree a little,Neither agree nor disagree,Disagree Strongly,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Neither agree nor disagree,Disagree a little,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Disagree a little,Agree strongly,Disagree Strongly,Disagree Strongly,Agree strongly,Agree strongly,Agree 
strongly,Disagree a little,Agree a little,Agree a little,Agree strongly +u59,pre,Agree strongly,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree strongly,Agree strongly,Disagree Strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Agree a little,Disagree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Agree strongly,Agree a little,Agree strongly,Agree a little,Agree strongly,Disagree a little,Agree a little,Agree strongly,Agree strongly,Disagree a little,Disagree a little,Agree strongly,Neither agree nor disagree +u00,post,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Agree a little,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Agree a little,Agree a little,Agree strongly,Agree a little,Agree strongly,Neither agree nor disagree,Agree strongly,Disagree a little,Neither agree nor disagree,Agree strongly,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Disagree Strongly,Agree strongly,Disagree a little,Agree a little,Agree a little,Agree a little +u01,post,Agree a little,Agree a little,Agree strongly,Agree strongly,Agree a little,Agree strongly,Agree strongly,Disagree a little,Agree a little,Agree strongly,Agree a little,Disagree Strongly,Agree strongly,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Disagree a little,Neither agree nor 
disagree,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Agree a little,Disagree a little,Agree a little,Agree strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little +u02,post,Disagree Strongly,Agree a little,Agree a little,Disagree a little,Agree a little,Agree strongly,Agree a little,Disagree a little,Disagree a little,Agree a little,Disagree a little,Agree a little,Agree strongly,Agree a little,Disagree a little,Disagree Strongly,Neither agree nor disagree,Disagree a little,Agree a little,Disagree a little,Agree strongly,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Agree strongly,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree +u03,post,Disagree Strongly,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree Strongly,Neither agree nor disagree,Disagree a little,Agree a little,Neither agree nor disagree,Disagree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Agree a 
little,Disagree Strongly,Disagree a little,Agree a little,Agree a little,Disagree Strongly,Neither agree nor disagree,Disagree a little,Disagree a little,Disagree Strongly +u04,post,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree Strongly,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree,Agree strongly,Disagree a little,Agree a little,Disagree Strongly,Agree a little,Agree strongly,Agree a little,Agree a little,Disagree a little,Disagree a little,Disagree Strongly,Neither agree nor disagree,Disagree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Disagree Strongly,Agree strongly,Disagree a little,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Agree a little +u05,post,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Disagree Strongly,Neither agree nor disagree,Agree a little,Agree a little,Disagree Strongly,Agree a little,Agree strongly,Agree a little,Disagree Strongly,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little +u07,post,Neither agree nor disagree,Disagree Strongly,Agree a little,Neither agree nor disagree,Disagree a 
little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Disagree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Disagree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Disagree a little,Agree a little,Agree strongly,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Disagree a little,Disagree a little,Agree a little,Disagree a little +u09,post,Agree a little,Disagree Strongly,Agree a little,Disagree Strongly,Agree a little,Agree strongly,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree,Agree strongly,Agree a little,Disagree a little,Disagree Strongly,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree a little,Neither agree nor disagree,Disagree a little,Disagree a little,Disagree a little,Agree strongly,Disagree a little,Agree a little,Agree a little,Disagree a little +u10,post,Disagree Strongly,Agree strongly,Agree strongly,Disagree a little,Agree strongly,Agree strongly,Disagree Strongly,Disagree Strongly,Agree a little,Agree strongly,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Disagree Strongly,Disagree Strongly,Agree a little,Agree strongly,Agree strongly,Neither agree nor disagree,Disagree Strongly,Neither agree nor disagree,Agree a 
little,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Disagree Strongly,Agree a little,Disagree Strongly,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Disagree Strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Agree a little,Disagree Strongly,Disagree Strongly,Disagree Strongly,Neither agree nor disagree +u14,post,Agree a little,Agree a little,Agree strongly,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Agree a little,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Neither agree nor disagree,Disagree a little,Agree strongly,Disagree a little,Agree a little,Agree a little,Agree strongly,Agree a little,Agree strongly,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree +u15,post,Agree a little,Disagree Strongly,Agree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree 
nor disagree,Disagree a little +u16,post,Disagree Strongly,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Disagree Strongly,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Disagree a little,Agree strongly,Agree strongly,Agree strongly,Disagree a little,Agree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Agree strongly,Agree strongly,Agree a little,Agree strongly,Agree strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree a little,Agree strongly,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree +u17,post,Agree strongly,Agree strongly,Agree a little,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree a little,Disagree a little,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Disagree a little,Agree strongly,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Disagree a little,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree strongly,Agree strongly,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree strongly +u18,post,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,,Disagree Strongly,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree a 
little,Disagree Strongly,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Disagree Strongly,Disagree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Disagree a little +u19,post,Disagree Strongly,Disagree a little,Agree strongly,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Agree strongly,Disagree Strongly,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Neither agree nor disagree,Disagree a little,Disagree a little,Agree a little,Disagree Strongly,Agree a little,Agree strongly,Agree a little,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Disagree a little,Neither agree nor disagree +u20,post,Neither agree nor disagree,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Disagree Strongly,Disagree Strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree strongly,Disagree a little,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Agree a little,Agree a little,Disagree Strongly,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a 
little,Agree strongly,Agree strongly,Disagree Strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree +u23,post,Agree a little,Agree a little,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree a little,Agree a little,Disagree Strongly,Agree a little,Agree a little,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Agree a little,Neither agree nor disagree,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Disagree a little,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Agree strongly,Agree strongly,Agree strongly,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,,,Agree a little,Agree strongly,Agree a little,Agree strongly +u24,post,Agree a little,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Agree strongly,Agree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree strongly,Agree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree strongly,Agree strongly +u27,post,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Disagree Strongly,Neither 
agree nor disagree,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Agree a little,Agree strongly,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little +u30,post,Agree a little,Disagree a little,Agree strongly,Disagree Strongly,Agree a little,Disagree a little,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Disagree Strongly,Agree strongly,Disagree a little,Agree a little,Agree strongly,Neither agree nor disagree,Disagree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Agree a little,Disagree Strongly,Agree a little,Agree a little,Agree a little,Disagree a little,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Disagree Strongly,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Disagree Strongly,Agree strongly,Disagree Strongly,Agree strongly,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree +u31,post,Agree a little,Agree a little,Agree strongly,Disagree Strongly,Neither agree nor disagree,Disagree a little,Disagree a little,Disagree a little,Agree a little,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree strongly,,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Neither agree nor 
disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Agree strongly +u32,post,Agree strongly,Agree a little,Agree strongly,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Disagree Strongly,Agree strongly,Agree strongly,Agree a little,Disagree Strongly,Agree strongly,Disagree a little,Agree a little,Agree strongly,Agree a little,Disagree Strongly,Disagree a little,Agree strongly,Disagree a little,Agree a little,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Disagree Strongly,Agree strongly,Disagree a little,Agree a little,Disagree a little,Agree a little,Agree strongly,Agree strongly,Neither agree nor disagree,Agree strongly,Disagree Strongly,Agree strongly,Disagree a little,Agree a little,Neither agree nor disagree,Agree strongly,Disagree a little,Disagree a little +u33,post,Agree strongly,Disagree a little,Agree a little,Agree a little,Agree strongly,Disagree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Disagree a little,Disagree Strongly,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree strongly,Agree a little,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Agree strongly,Agree strongly,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Disagree a little,Agree a little,Agree strongly,Disagree Strongly,Agree a little,Agree strongly,Agree strongly +u34,post,Agree a little,Neither agree nor disagree,Agree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Agree a 
little,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Agree strongly,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Disagree a little,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Disagree Strongly +u35,post,Disagree a little,Disagree Strongly,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Disagree a little,Disagree a little,Neither agree nor disagree,Agree a little,Disagree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree +u36,post,Agree a little,Disagree Strongly,Agree strongly,Disagree a little,Agree a little,Disagree a little,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Disagree Strongly,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Disagree Strongly,Disagree Strongly,Disagree a little,Agree a little,Agree a little,Agree strongly,Disagree a little,Agree a little,Agree a little,Agree a little,Disagree Strongly,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree 
a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Disagree Strongly,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree +u42,post,Agree a little,Disagree Strongly,Disagree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree strongly,Agree strongly,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Disagree a little,Disagree a little,Disagree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Disagree a little,Disagree a little,Agree a little,Disagree a little,Agree a little,Disagree a little,Agree a little +u43,post,Disagree a little,Disagree a little,Agree strongly,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Agree a little,Disagree Strongly,Agree strongly,Disagree a little,Agree a little,Disagree a little,Agree a little,Disagree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Disagree a little,Agree strongly,Neither agree nor disagree,Disagree a little,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree,Disagree a little +u44,post,Disagree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Agree a 
little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Disagree a little +u45,post,Agree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Agree a little,Agree strongly,Agree strongly,Neither agree nor disagree,Agree a little,Disagree a little,Disagree a little,Agree a little,Agree strongly,Disagree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Disagree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree strongly,Disagree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree +u47,post,Agree a little,Disagree a little,Agree a little,Disagree Strongly,Agree strongly,Neither agree nor disagree,Agree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Agree a little,Disagree a little,Disagree a little,Agree strongly,Agree strongly,Neither agree nor disagree,Disagree Strongly,Disagree a little,Agree strongly,Disagree Strongly,Agree strongly,Disagree 
Strongly,Agree strongly,Agree strongly,Agree a little,Neither agree nor disagree,Disagree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Disagree a little,Disagree a little,Neither agree nor disagree,Agree strongly,Disagree Strongly,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree +u49,post,Agree a little,Neither agree nor disagree,Agree strongly,Disagree Strongly,Agree strongly,Agree a little,Agree a little,Agree a little,Agree strongly,Agree strongly,Agree a little,Disagree Strongly,Agree strongly,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree a little,Disagree a little,Agree strongly,Agree strongly,Agree strongly,Agree a little,Agree strongly,Agree strongly,Neither agree nor disagree,Disagree a little,Agree strongly,Disagree a little,Agree strongly,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Disagree a little,Agree strongly +u51,post,Disagree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Disagree Strongly,Agree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Agree a little,Disagree a little,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Disagree a little,Agree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Neither agree nor 
disagree +u52,post,Disagree Strongly,Agree a little,Agree a little,Agree strongly,Disagree a little,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Disagree Strongly,Agree strongly,Neither agree nor disagree,Disagree Strongly,Neither agree nor disagree,Agree strongly,Agree strongly,Neither agree nor disagree,Agree strongly,Disagree Strongly,Agree strongly,Agree strongly,Agree strongly,Disagree Strongly,Agree strongly,Disagree Strongly,Disagree Strongly,Neither agree nor disagree,Agree strongly,Agree a little,Agree strongly,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree Strongly,Agree strongly,Agree strongly,Agree strongly,Neither agree nor disagree,Agree strongly,Disagree Strongly +u53,post,Agree a little,Agree a little,Agree strongly,Disagree a little,Agree strongly,Agree a little,Agree a little,Agree a little,Disagree Strongly,Neither agree nor disagree,Disagree a little,Disagree Strongly,Agree strongly,Agree strongly,Agree a little,Disagree a little,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Agree a little,Agree a little,Neither agree nor disagree,Disagree a little,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree strongly,Agree strongly,Agree strongly,Agree a little,Neither agree nor disagree,Disagree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Agree strongly,Disagree a little,Agree a little,Agree strongly,Agree strongly +u54,post,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor 
disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree +u56,post,Disagree Strongly,Disagree a little,Agree a little,Disagree Strongly,Agree a little,Agree strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Disagree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree strongly,Agree strongly,Agree strongly,Agree a little,Agree a little,Disagree a little,Disagree a little,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Disagree a little,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree +u58,post,Disagree Strongly,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Agree a little,Agree strongly,,Disagree Strongly,Agree strongly,Agree strongly,Disagree Strongly,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Agree strongly,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree strongly,Agree 
strongly,Disagree Strongly,Agree a little,Agree a little,Agree strongly,Disagree Strongly,Agree strongly,Neither agree nor disagree,Agree strongly,Agree strongly,Disagree a little,Agree strongly,Disagree a little,Agree strongly,Disagree Strongly,Disagree Strongly,Agree strongly,Agree strongly,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little +u59,post,Agree strongly,Disagree a little,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Agree a little,Agree strongly,Agree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,Disagree a little,Agree strongly,Agree strongly,Agree a little,Neither agree nor disagree,Agree a little,Disagree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,Agree a little,Agree a little,Agree strongly,Neither agree nor disagree diff --git a/src/experiments/clustering/survey/FlourishingScale.csv b/src/experiments/clustering/survey/FlourishingScale.csv new file mode 100644 index 0000000..92bb644 --- /dev/null +++ b/src/experiments/clustering/survey/FlourishingScale.csv @@ -0,0 +1,84 @@ +uid,type,I lead a purposeful and meaningful life,My social relationships are supportive and rewarding,I am engaged and interested in my daily activities,I actively contribute to the happiness and well-being of others,I am competent and capable in the activities that are important to me,I am a good person and live a good life,I am optimistic about my future,People respect me +u00,pre,7,6,,6,6,7,7,4 +u01,pre,4,6,6,6,7,6,4,6 +u02,pre,6,7,5,6,5,6,4,7 +u03,pre,6,2,2,4,5,6,5,4 +u04,pre,3,5,4,3,2,2,3,5 +u05,pre,6,6,6,6,6,6,6,6 +u07,pre,7,5,6,5,7,7,7,5 
+u08,pre,5,5,5,5,5,,4,4 +u09,pre,5,4,6,6,6,7,7,5 +u10,pre,6,3,6,1,7,4,6,6 +u12,pre,7,6,5,6,7,7,7,4 +u13,pre,6,6,4,5,6,6,7,4 +u14,pre,7,7,7,6,6,7,6,6 +u15,pre,5,6,4,4,6,6,6,6 +u16,pre,6,6,7,5,3,7,4,4 +u17,pre,5,5,5,4,4,7,3,4 +u18,pre,6,7,5,4,5,2,3,5 +u19,pre,6,6,6,5,5,4,5,5 +u20,pre,7,7,5,6,4,6,6,4 +u22,pre,6,5,5,6,6,6,5,7 +u23,pre,5,4,5,5,3,4,4,5 +u24,pre,6,5,3,3,6,7,7,4 +u27,pre,4,4,3,3,4,4,4,5 +u30,pre,7,7,6,6,7,7,7,5 +u31,pre,3,2,2,1,4,1,1,2 +u32,pre,7,7,6,7,7,7,7,6 +u33,pre,2,3,2,6,2,5,5,6 +u34,pre,6,6,5,6,6,7,7,6 +u35,pre,7,6,6,6,5,6,6,6 +u36,pre,6,6,6,6,6,5,6,5 +u39,pre,1,1,2,2,2,2,2,3 +u42,pre,5,6,6,6,4,7,6,5 +u43,pre,6,6,6,5,6,7,6,5 +u44,pre,6,6,6,6,6,6,6,6 +u45,pre,5,7,4,5,6,7,7,7 +u46,pre,5,4,5,6,7,5,6,4 +u47,pre,7,7,,,,7,7,7 +u49,pre,7,6,6,5,6,7,7,7 +u50,pre,6,6,6,5,6,7,7,5 +u51,pre,5,5,,5,5,5,5,5 +u52,pre,3,2,4,3,7,5,6,4 +u53,pre,7,7,6,6,6,6,6,6 +u56,pre,5,6,5,5,6,6,7,6 +u57,pre,6,6,7,6,6,7,6,6 +u58,pre,7,5,7,6,7,7,7,5 +u59,pre,5,5,4,5,5,6,7,6 +u00,post,6,5,6,6,6,6,5,5 +u01,post,5,5,6,5,7,6,6,6 +u02,post,5,6,4,6,5,6,5,7 +u03,post,3,5,5,5,4,3,3,3 +u04,post,5,5,4,3,4,4,3,5 +u05,post,6,7,6,6,6,6,6,7 +u07,post,6,6,5,5,7,6,6,6 +u09,post,6,5,7,5,6,7,6,5 +u10,post,7,1,7,1,7,4,6,6 +u14,post,7,7,7,6,6,7,7,6 +u15,post,6,6,6,6,6,6,6,6 +u16,post,6,6,3,5,5,7,5,4 +u17,post,3,6,4,3,4,7,4,7 +u19,post,5,5,6,5,5,5,5,6 +u20,post,7,7,4,6,4,7,6,4 +u23,post,6,5,5,5,6,5,5,5 +u24,post,6,4,3,3,7,7,7,6 +u27,post,4,3,4,4,4,4,4,4 +u30,post,7,7,7,7,7,7,7,7 +u31,post,5,4,2,4,5,7,7,2 +u32,post,7,7,7,7,7,7,7,7 +u33,post,3,2,2,5,5,4,3,4 +u34,post,7,2,4,3,3,1,1,2 +u35,post,6,6,4,6,5,6,6,5 +u36,post,6,6,4,6,6,6,6,6 +u42,post,2,2,2,2,2,2,2,2 +u43,post,6,5,5,4,5,6,6,5 +u44,post,5,5,5,5,5,5,5,6 +u45,post,7,6,5,6,6,7,6,6 +u46,post,7,6,5,6,5,6,5,4 +u47,post,7,7,7,7,5,7,5,7 +u49,post,7,7,7,6,7,7,7,7 +u51,post,6,6,6,6,6,6,6,6 +u52,post,4,6,4,4,7,6,2,4 +u53,post,7,6,6,6,6,6,6,6 +u56,post,5,6,6,7,6,6,6,6 +u59,post,6,6,5,6,7,7,7,6 diff --git a/src/experiments/clustering/survey/LonelinessScale.csv 
b/src/experiments/clustering/survey/LonelinessScale.csv new file mode 100644 index 0000000..abd82be --- /dev/null +++ b/src/experiments/clustering/survey/LonelinessScale.csv @@ -0,0 +1,84 @@ +uid,type,1. I feel in tune with the people around me,2. I lack companionship,3. There is no one I can turn to,4. I do not feel alone,5. I feel part of a group of friends,6. I have a lot in common with the people around me,7. I am no longer close to anyone,8. My interests and ideas are not shared by those around me,9. I am an outgoing person,10. There are people I feel close to,11. I feel left out,12. My social relationships are superficial,13. No one really knows me well,14. I feel isolated from others,15. I can find companionship when I want it,16. There are people who really understand me,17. I am unhappy being so withdrawn,18. People are around me but not with me,19. There are people I can talk to,20. There are people I can turn to +u00,pre,Sometimes,Rarely,Never,Never,Often,Sometimes,Never,Never,Often,Often,Rarely,Rarely,Never,Never,Often,Rarely,Often,Sometimes,Often,Often +u01,pre,Sometimes,Rarely,Rarely,Sometimes,Often,Often,Never,Never,Sometimes,Often,Rarely,Rarely,Never,Rarely,Often,Often,Sometimes,Never,Often,Often +u02,pre,Sometimes,Rarely,Rarely,Often,Often,Often,Never,Often,Rarely,Often,Rarely,Never,Never,Never,Often,Often,Never,Never,Often,Often +u03,pre,Sometimes,Rarely,Never,Often,Rarely,Rarely,Sometimes,Sometimes,Rarely,Rarely,Rarely,Rarely,Sometimes,Sometimes,Sometimes,Often,Sometimes,Sometimes,Sometimes,Sometimes +u04,pre,Sometimes,Sometimes,Sometimes,Sometimes,Rarely,Sometimes,Never,Rarely,Rarely,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes,Rarely,Rarely,Sometimes,Sometimes,Rarely,Rarely +u05,pre,Sometimes,Rarely,Never,Often,Often,Sometimes,Never,Never,Sometimes,Often,Rarely,Rarely,Never,Rarely,Often,Sometimes,Never,Never,Often,Often 
+u07,pre,Rarely,Never,Never,Often,Sometimes,Rarely,Never,Sometimes,Rarely,Often,Sometimes,Rarely,Rarely,Often,Sometimes,Often,Never,Sometimes,Often,Often +u08,pre,Sometimes,Sometimes,Rarely,Sometimes,Often,Often,Sometimes,Never,Sometimes,Often,Never,Never,Never,Never,Sometimes,Sometimes,Never,Rarely,Sometimes,Sometimes +u09,pre,Sometimes,Sometimes,Rarely,Often,Often,Often,Sometimes,Rarely,Rarely,Sometimes,Rarely,Rarely,Sometimes,Rarely,Often,Often,Sometimes,Never,Often,Often +u10,pre,Rarely,Often,Often,Rarely,Never,Rarely,Often,Often,Never,Rarely,Sometimes,Often,Often,Often,Often,Never,Never,Rarely,Rarely,Rarely +u12,pre,Often,Rarely,Rarely,Sometimes,Often,Never,Rarely,Often,Sometimes,Often,Rarely,Rarely,Sometimes,Sometimes,Often,Sometimes,Rarely,Sometimes,Sometimes,Sometimes +u13,pre,Sometimes,Sometimes,Sometimes,Often,Often,Sometimes,Sometimes,Rarely,Rarely,Sometimes,Rarely,Often,Often,Sometimes,Often,Often,Never,Often,Often,Often +u14,pre,Sometimes,Never,Never,Often,Often,Sometimes,Never,Never,Sometimes,Often,Rarely,Never,Never,Never,Rarely,Often,Never,Never,Often,Often +u15,pre,Sometimes,Never,Never,Often,Often,Often,Never,Never,Sometimes,Often,Sometimes,Rarely,Never,Never,Often,Often,Never,Never,Often,Often +u16,pre,Sometimes,Never,Never,Often,Often,Often,Rarely,Rarely,Rarely,Often,Sometimes,Never,Rarely,Sometimes,Often,Often,Sometimes,Never,Often,Often +u17,pre,Rarely,Sometimes,Never,Rarely,Never,Never,Never,Often,Sometimes,Often,Often,Sometimes,Often,Often,Often,Often,Sometimes,Often,Often,Often +u18,pre,Sometimes,Sometimes,Often,Sometimes,Sometimes,Rarely,Sometimes,Sometimes,Rarely,Sometimes,Often,Sometimes,Sometimes,Sometimes,Sometimes,Rarely,Sometimes,Sometimes,Sometimes,Rarely +u19,pre,Rarely,Sometimes,Often,Rarely,Sometimes,Sometimes,Sometimes,Never,Never,Sometimes,Sometimes,Sometimes,Often,Often,Sometimes,Rarely,Sometimes,Sometimes,Sometimes,Sometimes 
+u20,pre,Often,Rarely,Rarely,Often,Often,Often,Rarely,Rarely,Often,Often,Rarely,Rarely,Rarely,Rarely,Often,Often,Rarely,Rarely,Often,Often +u22,pre,Sometimes,Sometimes,Sometimes,Sometimes,Rarely,Rarely,Sometimes,Often,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes,Rarely,Rarely,Sometimes,Sometimes,Rarely +u23,pre,Often,Sometimes,Rarely,Sometimes,Sometimes,Sometimes,Rarely,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes,Rarely,Rarely,Often,Sometimes,Sometimes,Sometimes,Sometimes +u24,pre,Sometimes,Rarely,Sometimes,Sometimes,Sometimes,Rarely,Often,Often,Sometimes,Sometimes,Often,Sometimes,Sometimes,Rarely,Often,Sometimes,Rarely,Often,Sometimes,Sometimes +u27,pre,Sometimes,Sometimes,Sometimes,Often,Often,Sometimes,Sometimes,Rarely,Rarely,Often,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes,Rarely,Rarely,Rarely,Sometimes,Sometimes +u30,pre,Often,Never,Never,Often,Often,Often,Never,Rarely,Often,Often,Rarely,Never,Never,Rarely,Often,Often,Never,Never,Often,Often +u31,pre,Rarely,Rarely,Sometimes,Often,Sometimes,Sometimes,Never,Rarely,Often,Often,Rarely,Sometimes,Never,Never,Sometimes,Rarely,Sometimes,Rarely,Rarely,Rarely +u32,pre,Often,Sometimes,Never,Often,Often,Often,Never,Never,Often,Often,Rarely,Rarely,Never,Rarely,Often,Often,Never,Never,Often,Often +u33,pre,Sometimes,Often,Sometimes,Never,Rarely,Rarely,Often,Sometimes,Sometimes,Rarely,Often,Sometimes,Often,Often,Never,Sometimes,Often,Often,Sometimes,Rarely +u34,pre,Often,Rarely,Never,Often,Often,Sometimes,Sometimes,Rarely,Often,Often,Sometimes,Sometimes,Sometimes,Sometimes,Often,Often,Never,Sometimes,Often,Often +u35,pre,Often,Rarely,Never,Sometimes,Often,Often,Rarely,Rarely,Rarely,Often,Rarely,Never,Rarely,Rarely,Sometimes,Rarely,Rarely,Rarely,Sometimes,Sometimes +u36,pre,Often,Never,Never,Often,Often,Often,Never,Never,Sometimes,Sometimes,Never,Sometimes,Sometimes,Rarely,Often,Sometimes,Rarely,Rarely,Often,Sometimes 
+u39,pre,Sometimes,Rarely,Rarely,Often,Often,Often,Rarely,Rarely,Sometimes,Often,Never,Sometimes,Rarely,Rarely,Often,Often,Rarely,Rarely,Often,Often +u42,pre,Often,Never,Never,Often,Often,Often,Never,Never,Sometimes,Often,Never,Sometimes,Rarely,Never,Often,Sometimes,Never,Never,Often,Often +u43,pre,Sometimes,Sometimes,Never,Rarely,Often,Often,Never,Rarely,Rarely,Often,Sometimes,Rarely,Sometimes,Rarely,Rarely,Sometimes,Sometimes,Sometimes,Often,Often +u44,pre,Often,Sometimes,Never,Often,Often,Often,Never,Never,Often,Often,Never,Rarely,Never,Never,Often,Sometimes,Never,Never,Often,Often +u45,pre,Often,Never,Sometimes,Sometimes,Sometimes,Sometimes,Rarely,Sometimes,Often,Sometimes,Rarely,Sometimes,Often,Rarely,Sometimes,Rarely,Rarely,Sometimes,Rarely,Rarely +u46,pre,Often,Rarely,Never,Often,Often,Often,Never,Sometimes,Sometimes,Often,Rarely,Sometimes,Rarely,Never,Often,Often,Never,Rarely,Often,Often +u47,pre,Sometimes,Rarely,Never,Often,Often,Often,Never,Never,Often,Often,Never,Never,Never,Never,Often,Often,Never,Never,Often,Often +u49,pre,Often,Rarely,Never,Sometimes,Often,Often,Never,Never,Sometimes,Often,Rarely,Rarely,Rarely,Rarely,Often,Sometimes,Rarely,Sometimes,Often,Often +u50,pre,Sometimes,Rarely,Never,Sometimes,Sometimes,Sometimes,Rarely,Sometimes,Often,Often,Rarely,Sometimes,Rarely,Never,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes +u51,pre,Sometimes,Rarely,Rarely,Sometimes,Sometimes,Sometimes,Rarely,Rarely,Sometimes,Sometimes,Rarely,Rarely,Rarely,Rarely,Sometimes,Sometimes,Rarely,Rarely,Sometimes,Sometimes +u52,pre,Sometimes,Often,Often,Never,Sometimes,Often,Sometimes,Sometimes,Never,Sometimes,Sometimes,Often,Often,Sometimes,Never,Rarely,Often,Often,Sometimes,Rarely +u53,pre,Sometimes,Never,Never,Often,Often,Sometimes,Never,Rarely,Sometimes,Often,Sometimes,Sometimes,Never,Rarely,Often,Sometimes,Never,Rarely,Often,Often 
+u56,pre,Sometimes,Rarely,Rarely,Sometimes,Rarely,Sometimes,Rarely,Rarely,Never,Sometimes,Rarely,Rarely,Rarely,Rarely,Often,Sometimes,Never,Never,Often,Often +u57,pre,Sometimes,Rarely,Sometimes,Often,Often,Sometimes,Sometimes,Rarely,Rarely,Often,Rarely,Rarely,Rarely,Rarely,Sometimes,Sometimes,Rarely,Sometimes,Sometimes,Sometimes +u58,pre,Sometimes,Sometimes,Never,Often,Rarely,Sometimes,Never,Sometimes,Never,Sometimes,Sometimes,Never,Often,Sometimes,Rarely,Never,Sometimes,Never,Sometimes,Sometimes +u59,pre,Sometimes,Rarely,Never,Often,Sometimes,Sometimes,Rarely,Rarely,Often,Often,Rarely,Never,Rarely,Sometimes,Sometimes,Sometimes,Rarely,Rarely,Often,Often +u00,post,Sometimes,Rarely,Rarely,Often,Sometimes,Sometimes,Never,Rarely,Sometimes,Sometimes,Rarely,Sometimes,Never,Never,Sometimes,Sometimes,Sometimes,Sometimes,Often,Often +u01,post,Sometimes,Rarely,Rarely,Often,Often,Sometimes,Never,Rarely,Often,Often,Sometimes,Rarely,Rarely,Rarely,Often,Sometimes,Sometimes,Sometimes,Often,Often +u02,post,Often,Rarely,Rarely,Often,Often,Often,Never,Never,Rarely,Often,Rarely,Never,Never,Never,Often,Often,Never,Never,Often,Often +u03,post,Rarely,Never,Never,Sometimes,Never,Rarely,Often,Sometimes,Never,Rarely,Rarely,Rarely,Sometimes,Rarely,Sometimes,Often,Rarely,Rarely,Sometimes,Rarely +u04,post,Sometimes,Sometimes,Sometimes,Sometimes,Rarely,Sometimes,Sometimes,Sometimes,Rarely,Often,Often,Rarely,Sometimes,Rarely,Rarely,Sometimes,Rarely,Sometimes,Sometimes,Sometimes +u07,post,Rarely,Sometimes,Never,Often,Sometimes,Rarely,Never,Sometimes,Sometimes,Often,Sometimes,Sometimes,Never,Sometimes,Often,Often,Sometimes,Sometimes,Often,Often +u09,post,Often,Sometimes,Rarely,Rarely,Often,Often,Never,Rarely,Rarely,Often,Rarely,Rarely,Never,Rarely,Often,Rarely,Rarely,Sometimes,Often,Often +u10,post,Rarely,Rarely,Rarely,Rarely,Rarely,Never,Often,Often,Never,Never,Never,Often,Often,Often,Sometimes,Never,Never,Often,Rarely,Rarely 
+u14,post,Often,Rarely,Never,Often,Often,Often,Never,Never,Often,Often,Rarely,Rarely,Never,Never,Sometimes,Often,Never,Never,Sometimes,Often +u15,post,Often,Rarely,Never,Often,Often,Often,Never,Never,Sometimes,Often,Rarely,Never,Never,Never,Often,Often,Never,Never,Often,Often +u16,post,Sometimes,Sometimes,Rarely,Often,Often,Often,Rarely,Rarely,Rarely,Sometimes,Sometimes,Rarely,Rarely,Rarely,Often,Often,Sometimes,Sometimes,Often,Often +u17,post,Sometimes,Often,Rarely,Often,Never,Rarely,Never,Often,Sometimes,Often,Often,Sometimes,Often,Often,Rarely,Often,Sometimes,Often,Often,Often +u18,post,Rarely,Sometimes,Sometimes,Rarely,Sometimes,Rarely,Often,Sometimes,Rarely,Rarely,Sometimes,Often,Often,Sometimes,Rarely,Never,Sometimes,Sometimes,Never,Never +u19,post,Rarely,Often,Sometimes,Rarely,Sometimes,Rarely,Rarely,Rarely,Rarely,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes,Rarely,Rarely,Sometimes,Rarely,Sometimes,Sometimes +u20,post,Often,Rarely,Rarely,Often,Often,Often,Rarely,Often,Often,Sometimes,Rarely,Rarely,Rarely,Rarely,Sometimes,Sometimes,Sometimes,Sometimes,Often,Sometimes +u23,post,Sometimes,Sometimes,Sometimes,Rarely,Rarely,Rarely,Rarely,Sometimes,Sometimes,Sometimes,Sometimes,Rarely,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes +u24,post,Sometimes,Sometimes,Sometimes,Sometimes,Rarely,Rarely,Rarely,Sometimes,Sometimes,Rarely,Sometimes,Often,Often,Sometimes,Often,Rarely,Sometimes,Often,Sometimes,Sometimes +u27,post,Sometimes,Sometimes,Sometimes,Sometimes,Sometimes,Rarely,Sometimes,Rarely,Rarely,Sometimes,Rarely,Sometimes,Sometimes,Rarely,Sometimes,Rarely,Rarely,Sometimes,Sometimes,Sometimes +u30,post,Often,Rarely,Rarely,Often,Often,Often,Never,Never,Often,Often,Rarely,Never,Never,Never,Often,Often,Never,Never,Often,Often +u31,post,Sometimes,Never,Rarely,Often,Often,Sometimes,Never,Rarely,Sometimes,Sometimes,Sometimes,Sometimes,Never,Never,Never,Rarely,Never,Rarely,Rarely,Rarely 
+u32,post,Often,Often,Never,Often,Often,Often,Never,Rarely,Often,Often,Rarely,Rarely,Never,Never,Often,Often,Rarely,Rarely,Often,Often +u33,post,Often,Often,Often,Sometimes,Sometimes,Rarely,Often,Often,Often,Rarely,Sometimes,Sometimes,Often,Often,Rarely,Rarely,Often,Often,Sometimes,Sometimes +u34,post,Often,Rarely,Rarely,Often,Often,Often,Never,Rarely,Sometimes,Often,Sometimes,Rarely,Sometimes,Rarely,Sometimes,Sometimes,Rarely,Rarely,Often,Often +u35,post,Often,Rarely,Rarely,Sometimes,Rarely,Sometimes,Sometimes,Rarely,Sometimes,Rarely,Rarely,Sometimes,Never,Never,Sometimes,Sometimes,Never,Rarely,Often,Sometimes +u36,post,Often,Never,Never,Sometimes,Often,Often,Never,Never,Sometimes,Sometimes,Rarely,Sometimes,Rarely,Rarely,Sometimes,Sometimes,Rarely,Rarely,Often,Often +u42,post,Often,Sometimes,Never,Often,Often,Often,Never,Rarely,Sometimes,Often,Rarely,Sometimes,Rarely,Never,Sometimes,Sometimes,Rarely,Rarely,Often,Often +u43,post,Sometimes,Sometimes,Never,Often,Sometimes,Sometimes,Sometimes,Rarely,Rarely,Often,Rarely,Sometimes,Sometimes,Rarely,Often,Often,Sometimes,Sometimes,Often,Often +u44,post,Sometimes,Sometimes,Rarely,Sometimes,Sometimes,Sometimes,Rarely,Rarely,Sometimes,Sometimes,Rarely,Sometimes,Rarely,Rarely,Sometimes,Sometimes,Rarely,Sometimes,Sometimes,Sometimes +u45,post,Often,Rarely,Sometimes,Rarely,Sometimes,Sometimes,Rarely,Rarely,Often,Rarely,Sometimes,Rarely,Rarely,Rarely,Sometimes,Rarely,Rarely,Sometimes,Sometimes,Sometimes +u47,post,Often,Often,Never,Often,Often,Often,Never,Never,Often,Often,Never,Never,Never,Never,Often,Often,Never,Never,Often,Often +u49,post,Sometimes,Rarely,Never,Often,Often,Often,Never,Never,Sometimes,Often,Never,Never,Never,Never,Often,Often,Rarely,Sometimes,Often,Often +u51,post,Sometimes,Rarely,Rarely,Often,Often,Sometimes,Never,Never,Sometimes,Sometimes,Never,Rarely,Rarely,Rarely,Sometimes,Sometimes,Rarely,Rarely,Sometimes,Sometimes 
+u52,post,Often,Sometimes,Never,Often,Rarely,Often,Never,Never,Sometimes,Often,Sometimes,Never,Never,Never,Often,Sometimes,Never,Never,Often,Sometimes +u53,post,Often,Rarely,Never,Often,Often,Sometimes,Never,Rarely,Sometimes,Often,Sometimes,Rarely,Rarely,Rarely,Often,Sometimes,Rarely,Rarely,Often,Often +u56,post,Sometimes,Rarely,Never,Sometimes,Sometimes,Rarely,Rarely,Rarely,Never,Sometimes,Rarely,Never,Rarely,Rarely,Often,Often,Rarely,Rarely,Often,Often +u58,post,Often,Never,Never,Never,Sometimes,Never,Rarely,Sometimes,Never,Sometimes,Never,Rarely,Often,Rarely,Sometimes,Often,Never,Rarely,Sometimes,Sometimes +u59,post,Often,Never,Never,Sometimes,Often,Often,Never,Rarely,Often,Often,Never,Never,Rarely,Rarely,Sometimes,Sometimes,Rarely,Rarely,Often,Sometimes diff --git a/src/experiments/clustering/survey/PHQ-9.csv b/src/experiments/clustering/survey/PHQ-9.csv new file mode 100644 index 0000000..8e06bc1 --- /dev/null +++ b/src/experiments/clustering/survey/PHQ-9.csv @@ -0,0 +1,85 @@ +uid,type,Little interest or pleasure in doing things,"Feeling down, depressed, hopeless.","Trouble falling or staying asleep, or sleeping too much.",Feeling tired or having little energy,Poor appetite or overeating,Feeling bad about yourself or that you are a failure or have let yourself or your family down,"Trouble concentrating on things, such as reading the newspaper or watching television",Moving or speaking so slowly that other people could have noticed. 
Or the opposite being so figety or restless that you have been moving around a lot more than usual,"Thoughts that you would be better off dead, or of hurting yourself",Response +u00,pre,Not at all,Several days,Not at all,Several days,Not at all,Not at all,Not at all,Not at all,Not at all,Not difficult at all +u01,pre,Several days,Several days,Several days,Several days,Not at all,Several days,Not at all,Not at all,Not at all,Very difficult +u02,pre,More than half the days,Several days,More than half the days,More than half the days,More than half the days,Several days,Several days,More than half the days,Not at all,Somewhat difficult +u03,pre,Not at all,Several days,Not at all,Not at all,Not at all,Not at all,Not at all,Several days,Not at all,Somewhat difficult +u04,pre,Several days,Several days,Not at all,Several days,Several days,Several days,Several days,Not at all,Not at all,Somewhat difficult +u05,pre,Not at all,Not at all,Several days,Not at all,Several days,Not at all,Not at all,Not at all,Not at all, +u07,pre,Several days,Several days,Not at all,Several days,More than half the days,Several days,Several days,Not at all,Not at all,Not difficult at all +u08,pre,Several days,Several days,Not at all,Not at all,Several days,Several days,Several days,Not at all,Not at all,Not difficult at all +u09,pre,Not at all,Not at all,Several days,Several days,Several days,Not at all,Several days,Not at all,Not at all,Not difficult at all +u10,pre,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all, +u12,pre,Not at all,Not at all,Not at all,Several days,Not at all,Not at all,Not at all,Not at all,Not at all,Not difficult at all +u13,pre,Not at all,Several days,Not at all,Several days,Several days,Several days,Not at all,Not at all,Not at all,Not difficult at all +u14,pre,Not at all,Not at all,Not at all,Several days,Not at all,Not at all,Not at all,Not at all,Not at all,Not difficult at all +u15,pre,Several days,Not at all,Not at 
all,Several days,Several days,Not at all,Not at all,Not at all,Not at all,Not difficult at all +u16,pre,Several days,Several days,Several days,Not at all,Not at all,Nearly every day,Not at all,Not at all,Not at all,Somewhat difficult +u17,pre,Several days,Several days,Nearly every day,Nearly every day,Nearly every day,Several days,Not at all,Several days,Not at all,Somewhat difficult +u18,pre,More than half the days,More than half the days,Several days,More than half the days,More than half the days,Nearly every day,Several days,Not at all,More than half the days,Very difficult +u19,pre,Several days,Several days,Not at all,Not at all,Several days,Several days,Not at all,Not at all,Several days,Somewhat difficult +u20,pre,Several days,Several days,Several days,Several days,Several days,Several days,Several days,Several days,Not at all,Somewhat difficult +u22,pre,Not at all,Several days,Several days,Not at all,Not at all,Several days,Not at all,Not at all,Not at all,Not difficult at all +u23,pre,Several days,More than half the days,Several days,More than half the days,Several days,More than half the days,More than half the days,Not at all,Not at all,Somewhat difficult +u24,pre,Several days,Several days,Several days,Several days,Several days,Not at all,Not at all,Not at all,Not at all,Somewhat difficult +u27,pre,Not at all,Not at all,More than half the days,Several days,Not at all,Not at all,More than half the days,Not at all,Not at all,Somewhat difficult +u30,pre,Not at all,Not at all,Several days,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not difficult at all +u31,pre,More than half the days,Several days,Several days,Several days,Several days,More than half the days,Several days,Several days,More than half the days,Somewhat difficult +u32,pre,Not at all,Not at all,More than half the days,Several days,Several days,Not at all,Not at all,Not at all,Not at all,Not difficult at all +u33,pre,Nearly every day,Nearly every day,Nearly every day,Nearly 
every day,Nearly every day,Nearly every day,Nearly every day,More than half the days,Not at all,Extremely difficult +u34,pre,Not at all,Not at all,Several days,Not at all,Several days,Several days,Not at all,Not at all,Not at all,Not difficult at all +u35,pre,Several days,Several days,Several days,Several days,Several days,Several days,Several days,Not at all,Not at all,Somewhat difficult +u36,pre,Several days,Not at all,Not at all,Several days,Not at all,Not at all,Not at all,Not at all,Not at all,Not difficult at all +u39,pre,Not at all,Not at all,Not at all,Several days,Not at all,Several days,Not at all,Several days,Not at all,Somewhat difficult +u42,pre,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Several days,Not at all,Not at all,Not difficult at all +u43,pre,Several days,Several days,Several days,Several days,Several days,Several days,Several days,Not at all,Not at all,Somewhat difficult +u44,pre,Not at all,Not at all,Not at all,Several days,Not at all,Not at all,Not at all,Not at all,Not at all,Somewhat difficult +u45,pre,Several days,Several days,More than half the days,Several days,Several days,Several days,Not at all,Not at all,Not at all,Not difficult at all +u46,pre,More than half the days,Several days,More than half the days,Several days,Several days,More than half the days,Several days,Not at all,Not at all,Somewhat difficult +u47,pre,More than half the days,Several days,Not at all,Not at all,Not at all,Not at all,More than half the days,Not at all,Not at all,Not difficult at all +u49,pre,Not at all,Not at all,Not at all,Several days,Several days,Not at all,Not at all,Not at all,Not at all,Somewhat difficult +u50,pre,Not at all,Several days,Not at all,More than half the days,Several days,Several days,Not at all,Several days,Several days,Not difficult at all +u51,pre,Not at all,Not at all,Not at all,Several days,Not at all,Not at all,Not at all,Not at all,Not at all,Not difficult at all +u52,pre,Several days,Several days,Nearly 
every day,Nearly every day,Nearly every day,Several days,Not at all,Not at all,Not at all,Not difficult at all +u53,pre,Several days,Several days,More than half the days,More than half the days,Several days,Not at all,Several days,Not at all,Not at all,Somewhat difficult +u56,pre,Not at all,Not at all,Not at all,More than half the days,Not at all,Not at all,Not at all,Not at all,Not at all,Not difficult at all +u57,pre,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not difficult at all +u58,pre,Several days,Several days,Not at all,Not at all,Several days,Not at all,Several days,Several days,Not at all,Not difficult at all +u59,pre,Not at all,Not at all,Several days,More than half the days,Several days,Not at all,Not at all,Several days,Not at all,Not difficult at all +u00,post,Not at all,Not at all,Not at all,Several days,Not at all,Several days,Several days,Not at all,Not at all,Somewhat difficult +u01,post,Not at all,Several days,Several days,Several days,Not at all,Not at all,Not at all,Several days,Not at all,Somewhat difficult +u02,post,More than half the days,Not at all,Not at all,Several days,Several days,Not at all,Several days,Not at all,Not at all,Somewhat difficult +u03,post,Several days,Several days,Not at all,Several days,Not at all,Several days,Not at all,Not at all,Not at all,Somewhat difficult +u04,post,Several days,Several days,Several days,Several days,Several days,More than half the days,Several days,Not at all,Not at all,Somewhat difficult +u05,post,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not difficult at all +u07,post,Several days,Several days,More than half the days,More than half the days,More than half the days,Not at all,Not at all,Not at all,Not at all,Somewhat difficult +u09,post,Not at all,Not at all,Not at all,Not at all,Several days,Several days,Not at all,Not at all,Not at all,Not difficult at all +u10,post,Not at all,Not at 
all,Not at all,Several days,Several days,Not at all,Not at all,More than half the days,Not at all,Not difficult at all +u14,post,Not at all,Not at all,Several days,Several days,Several days,Not at all,Not at all,Not at all,Not at all, +u15,post,Not at all,Not at all,Several days,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not difficult at all +u16,post,Several days,More than half the days,More than half the days,Nearly every day,More than half the days,More than half the days,Not at all,Not at all,Not at all,Somewhat difficult +u17,post,More than half the days,More than half the days,Nearly every day,Nearly every day,More than half the days,Nearly every day,Nearly every day,Not at all,Not at all,Somewhat difficult +u18,post,More than half the days,More than half the days,Several days,More than half the days,More than half the days,Several days,Several days,Not at all,Several days,Somewhat difficult +u19,post,Not at all,Several days,Not at all,Not at all,Several days,Several days,Not at all,Not at all,Several days,Not difficult at all +u20,post,Several days,More than half the days,Several days,Several days,Several days,More than half the days,Not at all,Not at all,Not at all,Somewhat difficult +u23,post,More than half the days,Nearly every day,Nearly every day,More than half the days,More than half the days,Nearly every day,More than half the days,More than half the days,More than half the days,Very difficult +u24,post,More than half the days,Several days,Not at all,Several days,Several days,Several days,Several days,Not at all,Not at all,Somewhat difficult +u27,post,Several days,Several days,Not at all,More than half the days,Not at all,Several days,More than half the days,Not at all,Not at all,Somewhat difficult +u30,post,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all, +u31,post,Nearly every day,Not at all,Not at all,Not at all,Not at all,Not at all,Several days,Several days,Not at all,Not 
difficult at all +u32,post,Not at all,Not at all,Several days,Several days,Not at all,Not at all,Not at all,Not at all,Not at all,Not difficult at all +u33,post,Nearly every day,Nearly every day,Nearly every day,Nearly every day,Nearly every day,Nearly every day,Nearly every day,Nearly every day,Several days,Extremely difficult +u34,post,Several days,Not at all,Several days,Several days,Several days,Not at all,More than half the days,Not at all,Not at all,Not difficult at all +u35,post,Several days,Several days,Not at all,Several days,Several days,Several days,Several days,Several days,Not at all,Not difficult at all +u36,post,Not at all,Not at all,Not at all,Several days,Not at all,Not at all,Not at all,Not at all,Not at all,Somewhat difficult +u42,post,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not difficult at all +u43,post,Not at all,Not at all,Several days,Several days,Several days,Not at all,Several days,Not at all,Not at all,Somewhat difficult +u44,post,Several days,Not at all,Not at all,Several days,Not at all,Not at all,Not at all,Not at all,Not at all,Somewhat difficult +u45,post,Not at all,Not at all,Several days,Not at all,Not at all,Not at all,Several days,Not at all,Not at all,Not difficult at all +u47,post,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Several days,Not at all,Not difficult at all +u49,post,Nearly every day,Not at all,More than half the days,More than half the days,Several days,Not at all,Not at all,Not at all,Not at all,Somewhat difficult +u51,post,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all,Not at all, +u52,post,Several days,Nearly every day,Nearly every day,More than half the days,Nearly every day,Several days,Not at all,Several days,Several days,Extremely difficult +u53,post,Not at all,Several days,Nearly every day,More than half the days,Nearly every day,Not at all,More than half the days,Not at 
all,Not at all,Somewhat difficult +u56,post,Not at all,Not at all,Not at all,More than half the days,Not at all,Not at all,Several days,Not at all,Not at all,Somewhat difficult +u58,post,Not at all,Several days,Not at all,More than half the days,More than half the days,Several days,Several days,Several days,Not at all,Not difficult at all +u59,post,Not at all,Not at all,More than half the days,More than half the days,Several days,Not at all,Several days,Several days,Not at all,Somewhat difficult diff --git a/src/experiments/clustering/survey/PerceivedStressScale.csv b/src/experiments/clustering/survey/PerceivedStressScale.csv new file mode 100644 index 0000000..666036b --- /dev/null +++ b/src/experiments/clustering/survey/PerceivedStressScale.csv @@ -0,0 +1,86 @@ +uid,type,"1. In the last month, how often have you been upset because of something that happened unexpectedly?","2. In the last month, how often have you felt that you were unable to control the important things in your life?","3. In the last month, how often have you felt nervous and ""stressed""?","4. In the last month, how often have you felt confident about your ability to handle your personal problems?","5. In the last month, how often have you felt that things were going your way?","6. In the last month, how often have you found that you could not cope with all the things that you had to do?","7. In the last month, how often have you been able to control irritations in your life?","8. In the last month, how often have you felt that you were on top of things?","9. In the last month, how often have you been angered because of things that were outside of your control?","10. In the last month, how often have you felt difficulties were piling up so high that you could not overcome them?" 
+u00,pre,Sometime,Sometime,Fairly often,Fairly often,Sometime,Very often,Sometime,Sometime,Fairly often,Fairly often +u01,pre,Sometime,Sometime,Sometime,Sometime,Fairly often,Sometime,Fairly often,Fairly often,Almost never,Almost never +u02,pre,Fairly often,Sometime,Sometime,Fairly often,Almost never,Sometime,Almost never,Sometime,Sometime,Almost never +u03,pre,Sometime,Almost never,Sometime,Almost never,Sometime,Never,Almost never,Never,Never,Never +u04,pre,Almost never,Almost never,Fairly often,Sometime,Sometime,Fairly often,Sometime,Almost never,Sometime,Sometime +u05,pre,Almost never,Sometime,Almost never,Very often,Very often,Almost never,Fairly often,Very often,Almost never,Never +u07,pre,Sometime,Almost never,Fairly often,Very often,Very often,Almost never,Fairly often,Fairly often,Sometime,Almost never +u08,pre,Sometime,Fairly often,Very often,Sometime,Sometime,Sometime,Fairly often,Sometime,Sometime,Fairly often +u09,pre,Almost never,Almost never,Sometime,Sometime,Fairly often,Almost never,Fairly often,Sometime,Almost never,Almost never +u10,pre,Fairly often,Fairly often,Fairly often,Fairly often,Fairly often,Sometime,Almost never,Very often,Fairly often,Almost never +u12,pre,Almost never,Almost never,Sometime,Fairly often,Fairly often,Almost never,Fairly often,Fairly often,Almost never,Almost never +u13,pre,Almost never,Fairly often,Very often,Almost never,Almost never,Almost never,Almost never,Never,Sometime,Almost never +u14,pre,Almost never,Never,Sometime,Fairly often,Sometime,Almost never,Very often,Very often,Almost never,Almost never +u15,pre,Fairly often,Sometime,Fairly often,Sometime,Sometime,Sometime,Sometime,Sometime,Sometime,Sometime +u16,pre,Fairly often,Very often,Very often,Fairly often,Sometime,Sometime,Sometime,Fairly often,Sometime,Fairly often +u17,pre,Fairly often,Very often,Very often,Sometime,Almost never,Very often,Sometime,Never,Fairly often,Very often +u18,pre,Fairly often,Fairly often,Fairly often,Fairly often,Almost never,Fairly 
often,Almost never,Sometime,Fairly often,Fairly often +u19,pre,Almost never,Sometime,Sometime,Almost never,Sometime,Almost never,Almost never,Sometime,Sometime,Sometime +u20,pre,Almost never,Almost never,Sometime,Fairly often,Sometime,Sometime,Very often,Sometime,Almost never,Sometime +u22,pre,Almost never,Never,Almost never,Very often,Sometime,Never,Fairly often,Very often,Almost never,Never +u23,pre,Fairly often,Fairly often,Very often,Sometime,Sometime,Fairly often,Sometime,Sometime,Fairly often,Fairly often +u24,pre,Sometime,Almost never,Fairly often,Fairly often,Fairly often,Sometime,Sometime,Almost never,Sometime,Almost never +u27,pre,Sometime,Sometime,Fairly often,Fairly often,Sometime,Sometime,Sometime,Sometime,Almost never,Almost never +u30,pre,Sometime,Sometime,Sometime,Very often,Fairly often,Almost never,Very often,Very often,Almost never,Never +u31,pre,Very often,Very often,,Almost never,Sometime,Almost never,Fairly often,Almost never,Sometime,Sometime +u32,pre,Never,Never,Sometime,Very often,Fairly often,Never,Very often,Very often,Never,Never +u33,pre,Fairly often,Very often,Very often,Almost never,Almost never,Very often,Almost never,Never,Sometime,Fairly often +u34,pre,Almost never,Almost never,Sometime,Fairly often,Fairly often,Sometime,Fairly often,Sometime,Sometime,Almost never +u35,pre,Sometime,Sometime,Almost never,Fairly often,Fairly often,Sometime,Very often,Sometime,Almost never,Sometime +u36,pre,Almost never,Almost never,Sometime,Fairly often,Fairly often,Almost never,Fairly often,Sometime,Almost never,Almost never +u42,pre,Sometime,Fairly often,Very often,Almost never,Almost never,Sometime,Sometime,Sometime,Almost never,Sometime +u43,pre,Sometime,Fairly often,Fairly often,Fairly often,Sometime,Sometime,Fairly often,Sometime,Fairly often,Sometime +u44,pre,Almost never,Sometime,Sometime,Sometime,Sometime,Almost never,Sometime,Almost never,Sometime,Almost never +u45,pre,Sometime,Fairly often,Sometime,Almost never,,Almost 
never,Never,Sometime,Fairly often,Almost never +u46,pre,Very often,Fairly often,Sometime,Almost never,Sometime,Very often,Never,Sometime,Very often,Fairly often +u47,pre,Sometime,Almost never,Fairly often,Fairly often,Fairly often,Almost never,Sometime,Fairly often,Almost never,Almost never +u49,pre,Sometime,Sometime,Sometime,Fairly often,Fairly often,Sometime,Very often,Sometime,Sometime,Sometime +u50,pre,Almost never,Almost never,Almost never,Never,Sometime,Almost never,Sometime,Sometime,Fairly often,Sometime +u51,pre,Almost never,Almost never,Sometime,Sometime,Sometime,Almost never,Fairly often,Sometime,Almost never,Almost never +u52,pre,Sometime,Very often,Very often,Almost never,Fairly often,Sometime,Very often,Sometime,Almost never,Fairly often +u53,pre,Fairly often,Fairly often,Very often,Sometime,Sometime,Sometime,Fairly often,Sometime,Sometime,Sometime +u54,pre,Sometime,Sometime,Fairly often,Fairly often,Sometime,Sometime,Sometime,Sometime,Almost never,Sometime +u56,pre,Almost never,Almost never,Almost never,Fairly often,Fairly often,Sometime,Fairly often,Fairly often,Almost never,Almost never +u57,pre,Almost never,Sometime,Sometime,Very often,Fairly often,Never,Fairly often,Fairly often,Almost never,Never +u58,pre,Sometime,Fairly often,Very often,Fairly often,Almost never,Almost never,Very often,Almost never,Sometime,Almost never +u59,pre,Sometime,Fairly often,Sometime,Sometime,Sometime,Fairly often,Very often,Fairly often,Sometime,Almost never +u00,post,Almost never,Sometime,Sometime,Fairly often,Sometime,Sometime,Fairly often,Sometime,Almost never,Almost never +u01,post,Almost never,Sometime,Sometime,Fairly often,Fairly often,Sometime,Sometime,Fairly often,Sometime,Sometime +u02,post,Sometime,Sometime,Very often,Fairly often,Sometime,Very often,Sometime,Sometime,Fairly often,Fairly often +u03,post,Almost never,Sometime,Very often,Sometime,Almost never,Fairly often,Almost never,Almost never,Almost never,Sometime +u04,post,Sometime,Almost never,Very 
often,Fairly often,Sometime,Almost never,Fairly often,Sometime,Almost never,Sometime +u05,post,Sometime,Almost never,Almost never,Fairly often,Fairly often,Never,Very often,Fairly often,Never,Never +u07,post,Sometime,Fairly often,Very often,Fairly often,Almost never,Fairly often,Almost never,Almost never,Fairly often,Fairly often +u09,post,Almost never,Sometime,Very often,Fairly often,Fairly often,Sometime,Fairly often,Sometime,Almost never,Sometime +u10,post,Sometime,Almost never,Fairly often,Very often,Fairly often,Sometime,Sometime,Very often,Sometime,Sometime +u14,post,Sometime,Almost never,Fairly often,Sometime,Fairly often,Almost never,Very often,Very often,Sometime,Sometime +u15,post,Almost never,Almost never,Sometime,Fairly often,Fairly often,Never,Fairly often,Sometime,Sometime,Never +u16,post,Very often,Very often,Very often,Sometime,Sometime,Sometime,Fairly often,Fairly often,Sometime,Very often +u17,post,Fairly often,Very often,Very often,Sometime,Almost never,Very often,Almost never,Never,Very often,Fairly often +u18,post,Fairly often,Fairly often,Very often,Sometime,Sometime,Sometime,Fairly often,Sometime,Sometime,Fairly often +u19,post,Fairly often,Fairly often,Very often,Fairly often,Fairly often,Sometime,Sometime,Fairly often,Sometime,Sometime +u20,post,Sometime,Sometime,Fairly often,Sometime,Fairly often,Sometime,Very often,Sometime,Sometime,Sometime +u23,post,Fairly often,Sometime,Fairly often,Fairly often,Sometime,Sometime,Sometime,Sometime,Fairly often,Sometime +u24,post,Almost never,Almost never,Sometime,Sometime,Fairly often,Almost never,Fairly often,Sometime,Sometime,Almost never +u27,post,Sometime,Sometime,Very often,Sometime,Sometime,Fairly often,Sometime,Sometime,Sometime,Sometime +u30,post,Almost never,Almost never,Sometime,Fairly often,Fairly often,Sometime,Fairly often,Fairly often,Almost never,Almost never +u31,post,Fairly often,Sometime,Almost never,Almost never,Almost never,Sometime,Sometime,Never,Almost never,Never +u32,post,Almost 
never,Fairly often,Sometime,Very often,Fairly often,Almost never,Very often,Very often,Never,Never +u33,post,Fairly often,Very often,Very often,Almost never,Almost never,Very often,Almost never,Almost never,Sometime,Very often +u34,post,Almost never,Almost never,Sometime,Fairly often,Fairly often,Sometime,Fairly often,Sometime,Almost never,Sometime +u35,post,Almost never,Sometime,Sometime,Sometime,Sometime,Almost never,Fairly often,Sometime,Almost never,Sometime +u36,post,Sometime,Almost never,Sometime,Fairly often,Sometime,Sometime,Almost never,Almost never,Almost never,Almost never +u42,post,Almost never,Sometime,Almost never,Sometime,Sometime,Almost never,Sometime,Sometime,Almost never,Almost never +u43,post,Almost never,Sometime,Almost never,Sometime,Fairly often,Almost never,Fairly often,Fairly often,Sometime,Almost never +u44,post,Sometime,Sometime,Sometime,Sometime,Sometime,Sometime,Sometime,Almost never,Sometime,Almost never +u45,post,Almost never,Sometime,Almost never,Almost never,Sometime,Sometime,Almost never,Sometime,Sometime,Sometime +u46,post,Fairly often,Sometime,Very often,Sometime,Almost never,Very often,Sometime,Sometime,Very often,Very often +u47,post,Sometime,Sometime,Fairly often,Almost never,Almost never,Fairly often,Almost never,Almost never,Very often,Sometime +u49,post,Almost never,Almost never,Sometime,Fairly often,Very often,Never,Fairly often,Very often,Almost never,Almost never +u51,post,Almost never,Almost never,Fairly often,Fairly often,Fairly often,Almost never,Fairly often,Fairly often,Almost never,Almost never +u52,post,Sometime,Very often,Very often,Almost never,Fairly often,Fairly often,Almost never,Almost never,Sometime,Very often +u53,post,Sometime,Sometime,Fairly often,Sometime,Sometime,Sometime,Fairly often,Sometime,Sometime,Sometime +u54,post,Very often,Very often,Fairly often,Sometime,Sometime,Sometime,Fairly often,Never,Sometime,Sometime +u56,post,Almost never,Sometime,Almost never,Fairly often,Very often,Almost 
never,Fairly often,Fairly often,Never,Almost never +u59,post,Sometime,Sometime,Sometime,Fairly often,Fairly often,Sometime,Fairly often,Fairly often,Almost never,Never diff --git a/src/experiments/clustering/survey/backup/scores.csv b/src/experiments/clustering/survey/backup/scores.csv new file mode 100644 index 0000000..e86af20 --- /dev/null +++ b/src/experiments/clustering/survey/backup/scores.csv @@ -0,0 +1,50 @@ +student_id,pre_PHQ_9,post_PHQ_9,pre_PSS,post_PSS,pre_lonliness_scale,post_longliness_scale,pre_flourishing_scale,post_flourishing_scale,pre_panas_positive,post_panas_positive,avg_hours_slept,mode_sleep_rating,avg_dead_line_per_week,E_pre,A_pre,C_pre,N_pre,O_pre,E_post,A_post,C_post,N_post,O_post +0,2,3,24,15,31,33,,45,,32,6,,,,,,,,,,,, +1,5,4,15,16,34,39,45,46,29,24,7.222222222,1.592592593,0.909090909,62.5,75.55555556,88.88888889,55,78,60,75.55555556,84.44444444,57.5,74 +2,13,5,21,25,33,31,46,44,20,21,7.76,2,1.090909091,42.5,60,84.44444444,70,64,37.5,60,77.77777778,65,60 +3,2,4,17,24,33,25,34,31,28,22,7.5,,,,,,,,,,,, +4,6,8,21,17,31,36,27,33,22,27,6.515151515,2.060606061,0.636363636,47.5,62.22222222,53.33333333,82.5,58,55,60,53.33333333,92.5,62 +5,,0,7,7,30,,48,50,25,25,7.4,1.6,1.545454545,62.5,75.55555556,88.88888889,55,66,60,75.55555556,84.44444444,47.5,68 +7,7,8,12,28,33,38,49,47,25,16,7.235294118,1.882352941,1.818181818,62.5,84.44444444,75.55555556,45,46,65,82.22222222,62.22222222,57.5,54 +8,5,,23,,29,,,,28,,7,1.902439024,1.636363636,52.5,75.55555556,84.44444444,65,66,,,,, +9,4,2,13,17,39,34,46,47,21,23,8,1.666666667,0.636363636,47.5,91.11111111,53.33333333,42.5,66,52.5,95.55555556,62.22222222,40,72 +10,,4,20,15,33,27,39,39,29,38,7.840909091,2.113636364,1.636363636,40,26.66666667,95.55555556,67.5,72,45,26.66666667,100,62.5,80 +12,1,,11,,37,,49,,31,,7.45,1.75,2.454545455,57.5,71.11111111,73.33333333,57.5,68,,,,, +13,4,,25,,44,,44,,18,,7,,,,,,,,,,,, 
+14,1,,9,14,26,31,52,53,43,24,6.966666667,1.666666667,2.272727273,65,73.33333333,66.66666667,50,66,65,77.77777778,73.33333333,45,68 +15,3,1,22,11,31,31,43,48,27,29,7.5625,1.8125,1.181818182,65,71.11111111,62.22222222,55,60,70,68.88888889,62.22222222,50,58 +16,6,12,24,26,36,40,42,41,18,22,8.142857143,1.910714286,1.363636364,35,66.66666667,71.11111111,85,76,37.5,66.66666667,64.44444444,80,76 +17,13,18,33,34,40,44,37,38,24,30,4.769230769,2.692307692,1.090909091,52.5,77.77777778,66.66666667,62.5,90,62.5,75.55555556,64.44444444,85,90 +18,15,12,27,24,38,31,37,,22,19,6.444444444,1.703703704,2.181818182,52.5,44.44444444,64.44444444,75,50,57.5,40,62.22222222,77.5,58 +19,5,4,20,21,36,32,42,42,25,25,7.836363636,1.654545455,2.454545455,35,62.22222222,68.88888889,72.5,64,42.5,73.33333333,75.55555556,80,74 +20,8,8,14,18,40,40,45,45,21,24,6.454545455,1.818181818,1.363636364,77.5,86.66666667,66.66666667,47.5,72,65,86.66666667,66.66666667,50,64 +22,3,,6,,36,,46,,24,,8.6875,1.375,1.272727273,72.5,68.88888889,91.11111111,57.5,76,,,,, +23,11,21,27,22,38,35,35,42,,26,7.739130435,1.608695652,1,60,73.33333333,60,75,72,62.5,71.11111111,71.11111111,85,80 +24,5,7,18,14,41,39,41,43,19,21,7.818181818,2.227272727,1.727272727,70,53.33333333,46.66666667,62.5,70,70,62.22222222,57.77777778,62.5,84 +27,5,7,18,23,38,33,31,31,19,18,5.266666667,2.533333333,1.818181818,50,77.77777778,53.33333333,62.5,66,42.5,73.33333333,55.55555556,65,66 +30,1,,9,12,33,33,52,56,35,27,7.09375,1.53125,2.363636364,90,82.22222222,88.88888889,40,72,87.5,82.22222222,91.11111111,40,72 +31,12,5,,21,29,24,16,36,22,26,5.454545455,2.545454545,1.090909091,85,77.77777778,57.77777778,62.5,74,62.5,60,64.44444444,67.5,68 +32,4,2,3,8,35,38,54,56,21,32,8.022727273,2.363636364,2.272727273,85,84.44444444,86.66666667,35,74,82.5,84.44444444,91.11111111,30,76 +33,23,25,33,33,39,46,31,28,26,27,6.853658537,2.292682927,2.545454545,57.5,66.66666667,37.77777778,72.5,92,67.5,77.77777778,46.66666667,75,90 
+34,3,6,14,14,43,38,49,23,20,20,6.8,1.2,1,65,86.66666667,62.22222222,57.5,66,65,84.44444444,62.22222222,60,64 +35,7,7,14,16,30,29,48,44,23,18,7.083333333,1.375,1,52.5,77.77777778,71.11111111,52.5,68,65,73.33333333,64.44444444,47.5,72 +36,2,1,12,18,33,32,46,46,24,27,7.5625,1.96875,,70,82.22222222,75.55555556,35,70,72.5,86.66666667,82.22222222,45,72 +39,3,,,,38,,15,,17,,7.777777778,1,,67.5,66.66666667,62.22222222,60,70,,,,, +42,1,0,24,15,31,36,45,16,25,30,7.5,1.583333333,1,62.5,84.44444444,48.88888889,50,78,75,86.66666667,55.55555556,42.5,74 +43,7,4,21,13,35,40,47,42,26,26,7.48,2.24,,47.5,82.22222222,84.44444444,50,66,45,84.44444444,82.22222222,45,64 +44,1,2,18,20,32,33,48,41,25,26,9.01754386,1.140350877,1.363636364,67.5,73.33333333,75.55555556,45,72,55,66.66666667,68.88888889,65,66 +45,7,2,,20,34,32,48,49,31,24,7.727272727,1.090909091,1,75,73.33333333,68.88888889,62.5,74,70,73.33333333,73.33333333,50,70 +46,10,,31,30,37,,42,44,27,32,6.260869565,2.347826087,1.545454545,62.5,62.22222222,71.11111111,70,84,,,,, +47,5,1,14,28,30,33,,52,32,24,7.25,1.625,1.363636364,87.5,64.44444444,68.88888889,40,76,72.5,66.66666667,73.33333333,37.5,80 +49,2,8,16,8,35,32,51,55,28,,6.921052632,2,7.636363636,57.5,84.44444444,73.33333333,50,86,62.5,86.66666667,73.33333333,42.5,90 +50,7,,19,,34,,48,,,,7.2,2.2,1.363636364,77.5,66.66666667,71.11111111,50,70,,,,, +51,1,,14,12,30,29,,48,20,20,7.148148148,1.851851852,1,67.5,75.55555556,75.55555556,55,80,60,75.55555556,73.33333333,45,74 +52,12,15,22,29,39,29,34,37,37,28,7.304347826,1.260869565,1.454545455,32.5,66.66666667,44.44444444,82.5,70,40,57.77777778,53.33333333,95,62 +53,8,11,23,20,33,36,50,49,27,25,6.555555556,2.814814815,0.636363636,45,80,57.77777778,90,88,45,75.55555556,60,87.5,88 +56,2,3,11,9,28,29,46,48,18,25,9.4,2,,32.5,82.22222222,66.66666667,40,82,30,80,66.66666667,47.5,80 +57,0,,9,,35,,50,,24,,7.525423729,1.949152542,1.363636364,42.5,80,64.44444444,42.5,54,,,,, 
+58,5,8,20,,28,25,51,,25,,7.25,1.538461538,0.636363636,42.5,86.66666667,71.11111111,77.5,72,52.5,84.44444444,84.44444444,75,74 +59,5,7,18,13,34,31,43,50,29,31,6.134328358,2.119402985,4.727272727,80,82.22222222,51.11111111,57.5,70,77.5,82.22222222,57.77777778,60,64 +54,,,18,13,,,,,,24,5.583333333,2.25,2.818181818,,,,,,60,66.66666667,62.22222222,55,62 +25,,,,,,,,,,,7.5,2,1,,,,,,,,,, +41,,,,,,,,,,,7.692307692,1.846153846,1.272727273,,,,,,,,,, diff --git a/src/experiments/clustering/survey/generate_survey_score_covariate.py b/src/experiments/clustering/survey/generate_survey_score_covariate.py new file mode 100644 index 0000000..766bd6e --- /dev/null +++ b/src/experiments/clustering/survey/generate_survey_score_covariate.py @@ -0,0 +1,43 @@ +import numpy as np +import pandas as pd +import pickle + +from sklearn.preprocessing import MinMaxScaler + +if __name__ == "__main__": + # read survey scores + student_list = [4, 7, 8, 10, 14, 16, 17, 19, 22, 23, 24, 32, 33, 35, 36, 43, 44, 49, 51, 52, 53, 57, 58] + df = pd.read_csv("scores.csv", index_col="student_id") + + # print(np.nan_to_num(np.array([df['pre_PHQ_9'][10]]))) + + # form covariate vectors + scores_matrix = list() + for id_ in student_list: + vec = list() + for key in df: + vec.append(df[key][id_]) + scores_matrix.append(vec) + + # pad nan, min_max normalization + scores_matrix = np.nan_to_num(np.array(scores_matrix)) + scaler = MinMaxScaler() + scaler.fit(scores_matrix) + scores_matrix = scaler.transform(scores_matrix).tolist() + + # formatting + scores_covariates = dict() # id -> scores_vectors + for i in range(len(student_list)): + scores_covariates[str(student_list[i])] = scores_matrix[i] + + # save the vectors + with open('survey_scores_covariate', 'wb') as f: + pickle.dump(scores_covariates, f) + + # check saved data + scores_covariates = None + with open('survey_scores_covariate', 'rb') as f: + scores_covariates = pickle.load(f) + + for id_ in scores_covariates: + print(id_, scores_covariates[id_]) diff 
--git a/src/experiments/clustering/survey/panas.csv b/src/experiments/clustering/survey/panas.csv new file mode 100644 index 0000000..b85cedf --- /dev/null +++ b/src/experiments/clustering/survey/panas.csv @@ -0,0 +1,86 @@ +uid,type,Interested,Distressed,Upset,Strong,Guilty,Scared,Hostile ,Enthusiastic,Proud,Irritable,Alert,Inspired,Nervous,Determined ,Attentive,Jittery,Active ,Afraid +u00,pre,5,4,3,4,3,5,5,3,3,3,4,4,,4,3,2,2,5 +u01,pre,4,2,2,2,5,1,1,3,3,2,4,3,3,4,3,2,4,1 +u02,pre,4,1,1,2,2,2,1,3,2,2,3,2,2,2,3,3,2,1 +u03,pre,4,2,2,4,5,5,5,3,1,1,5,3,1,3,3,5,4,1 +u04,pre,4,2,1,3,3,1,1,3,3,2,3,1,2,3,3,1,4,1 +u05,pre,4,2,1,4,5,5,5,4,2,2,4,3,1,3,4,1,4,5 +u07,pre,3,2,2,4,2,2,3,4,5,4,3,2,3,3,3,1,3,2 +u08,pre,3,4,3,3,1,4,1,3,3,5,3,3,4,5,3,1,4,3 +u09,pre,4,1,1,3,1,2,1,3,3,1,3,2,1,3,3,2,4,1 +u10,pre,4,2,2,4,1,1,3,3,3,5,4,4,2,4,4,3,3,1 +u12,pre,5,2,2,5,1,1,3,5,5,2,4,3,2,5,5,3,5,2 +u13,pre,2,3,1,1,3,1,1,3,1,4,2,1,3,1,1,4,1,1 +u14,pre,5,5,5,3,5,5,5,4,3,5,5,5,3,5,5,5,5,5 +u15,pre,3,3,3,4,1,3,3,3,3,3,3,3,3,3,3,3,5,3 +u16,pre,3,3,3,2,2,2,1,2,1,2,1,2,4,2,3,1,2,2 +u17,pre,4,2,1,4,1,1,2,5,3,3,1,4,4,5,4,2,1,1 +u18,pre,3,5,5,1,1,2,3,2,1,4,2,1,3,2,1,2,3,3 +u19,pre,4,2,2,3,3,3,1,4,2,1,4,3,4,4,4,3,3,3 +u20,pre,4,2,1,3,1,1,1,4,3,1,3,3,3,3,3,2,3,1 +u22,pre,4,3,3,3,1,1,1,2,1,3,2,3,1,4,3,1,4,1 +u23,pre,2,4,3,3,,1,2,3,3,3,4,,5,4,4,4,3,3 +u24,pre,4,3,1,4,1,1,2,2,3,2,4,2,3,2,2,1,3,1 +u27,pre,3,3,2,3,1,2,1,3,2,3,2,2,3,2,3,1,3,2 +u30,pre,4,5,5,4,5,1,5,3,4,5,4,4,1,4,4,1,3,5 +u31,pre,4,2,1,5,1,2,1,4,1,1,3,2,2,4,3,3,5,4 +u32,pre,4,3,1,3,1,1,1,4,4,2,3,2,3,3,3,1,3,1 +u33,pre,3,5,4,1,4,4,3,2,2,3,3,3,3,2,2,4,1,4 +u34,pre,4,1,1,4,1,1,1,3,2,1,2,4,2,3,3,1,3,1 +u35,pre,4,1,1,3,3,1,1,3,3,1,2,3,2,3,2,1,4,1 +u36,pre,5,1,1,3,1,1,1,4,4,1,1,4,3,3,4,1,4,1 +u39,pre,1,1,1,1,1,1,1,2,1,1,2,4,2,4,2,1,3,1 +u42,pre,2,4,4,3,4,4,2,3,4,3,3,3,3,2,2,1,2,4 +u43,pre,4,2,1,2,3,2,1,4,4,2,3,3,3,5,4,2,2,2 +u44,pre,4,1,2,3,1,1,1,4,3,2,2,4,1,4,4,1,4,1 +u45,pre,3,3,4,3,1,1,1,3,2,5,1,2,2,5,4,5,4,1 
+u46,pre,4,4,2,5,1,1,3,4,4,5,3,1,1,4,4,1,5,1 +u47,pre,4,1,4,4,1,5,3,5,4,4,1,4,5,3,4,3,5,5 +u49,pre,5,4,3,3,2,2,1,4,3,2,4,4,2,4,4,1,4,1 +u50,pre,1,1,2,3,2,1,1,3,3,1,3,3,2,3,3,,3,1 +u51,pre,3,2,2,2,1,1,1,3,3,1,2,2,3,3,3,2,3,1 +u52,pre,5,5,4,3,1,4,2,5,5,4,5,5,5,5,4,5,3,5 +u53,pre,4,2,3,1,1,2,2,3,3,2,3,4,1,4,3,3,3,1 +u56,pre,4,1,1,2,1,1,1,2,2,1,2,4,1,2,2,1,2,1 +u57,pre,5,2,1,4,1,1,2,4,3,2,4,3,1,3,4,1,5,1 +u58,pre,1,1,1,2,4,1,1,3,3,2,1,4,3,5,5,1,4,1 +u59,pre,4,2,1,4,3,1,1,4,4,2,3,3,5,5,4,4,3,2 +u00,post,4,3,3,3,4,1,2,3,4,4,4,3,3,3,3,3,4,1 +u01,post,3,2,2,3,2,1,1,3,3,3,3,2,2,3,4,3,3,1 +u02,post,4,2,2,4,1,1,1,3,3,1,4,4,2,4,1,1,1,2 +u03,post,4,3,2,3,1,3,2,3,3,2,1,4,3,2,2,1,3,3 +u04,post,3,2,2,4,1,1,2,4,4,3,4,3,4,4,3,2,5,2 +u05,post,4,2,2,3,3,3,1,4,3,1,4,3,3,4,4,1,4,1 +u07,post,2,4,3,3,1,1,2,1,3,2,2,1,2,1,1,1,2,1 +u09,post,3,1,2,2,1,1,1,4,4,1,4,3,2,3,4,1,5,2 +u10,post,5,5,1,4,4,1,4,5,5,5,4,5,4,5,5,3,5,1 +u14,post,4,1,1,3,1,1,1,4,4,2,5,5,3,5,5,1,1,1 +u15,post,4,4,4,4,2,2,2,4,3,3,4,3,3,4,3,2,4,3 +u16,post,2,5,4,2,4,5,1,1,2,4,2,1,4,3,3,1,1,5 +u17,post,2,5,5,4,1,4,4,5,5,5,1,2,5,5,1,3,2,5 +u18,post,2,5,3,3,1,5,5,1,1,4,3,1,5,3,2,3,1,4 +u19,post,4,2,1,4,2,2,1,5,4,1,3,5,2,3,3,2,3,2 +u20,post,2,4,4,4,3,3,1,2,2,1,3,3,4,3,3,3,3,3 +u23,post,2,2,3,2,1,2,4,4,4,4,3,4,4,3,4,3,2,1 +u24,post,2,4,2,2,1,1,1,2,3,2,3,1,2,4,2,3,3,2 +u27,post,2,3,1,2,1,3,1,2,2,4,3,2,4,2,3,3,1,3 +u30,post,4,1,1,4,5,5,1,3,4,1,4,3,1,4,4,1,4,5 +u31,post,2,3,4,2,5,5,5,3,2,4,2,3,4,3,2,2,1,5 +u32,post,5,3,2,4,1,1,1,4,5,3,4,5,1,5,5,2,4,1 +u33,post,4,5,5,2,2,2,2,4,4,2,3,3,4,3,2,3,1,3 +u34,post,3,3,2,2,1,1,1,3,2,1,2,4,3,3,2,1,3,1 +u35,post,2,4,3,2,1,1,1,3,3,1,1,3,2,2,2,1,2,1 +u36,post,4,2,2,4,1,2,1,3,3,3,3,3,3,4,4,3,4,2 +u42,post,5,3,3,5,1,1,3,4,4,3,4,4,2,4,3,3,3,2 +u43,post,4,3,2,3,2,3,1,4,4,2,3,3,3,5,4,2,2,3 +u44,post,4,3,3,1,1,2,1,4,2,3,3,3,3,3,4,3,4,3 +u45,post,2,4,1,3,3,1,1,3,3,3,3,4,2,2,3,2,4,1 +u46,post,2,5,5,3,1,2,5,5,3,5,3,5,4,4,4,5,2,2 +u47,post,3,1,1,5,1,1,2,4,4,1,1,5,2,4,5,1,4,1 
+u49,post,5,3,2,3,1,1,1,5,5,2,3,5,2,5,,2,3,2 +u51,post,3,2,1,3,1,1,1,3,3,2,2,3,2,3,3,1,3,1 +u52,post,2,5,5,2,1,5,3,5,4,5,3,2,5,3,4,5,1,5 +u53,post,5,3,2,3,1,1,2,4,4,2,3,4,4,5,4,1,1,1 +u54,post,2,5,5,2,3,2,1,3,2,3,4,2,4,3,1,2,2,1 +u56,post,4,2,2,3,2,1,1,3,3,2,3,4,2,3,3,2,3,1 +u59,post,4,1,2,4,2,5,2,5,5,3,4,3,5,5,4,4,3,5 diff --git a/src/experiments/clustering/survey/psqi.csv b/src/experiments/clustering/survey/psqi.csv new file mode 100644 index 0000000..b12b913 --- /dev/null +++ b/src/experiments/clustering/survey/psqi.csv @@ -0,0 +1,85 @@ +uid,type,"During the past month, what time have you usually gone to bed at night? ","During the past month, how long (in minutes) has it usually taken you to fall asleep each night?",When have you usually gotten up in the morning?,"During the past month, how many hours of actual sleep did you get at night? (This may be different than the number of hours you spent in bed.)",a. Cannot get to sleep within 30 minutes,b. Wake up in the middle of the night or early morning,c. Have to get up to use the bathroom,d. Cannot breathe comfortably,e. Cough or snore loudly,f. Feel too cold,g. Feel too hot,h. Have bad dreams,i. Have pain,j. Other reason(s),"Other reason(s), please describe, including how often you have had trouble sleeping because of this reason(s):","During the past month, how often have you taken medicine (prescribed or over the counter) to help you sleep?","During the past month, how often have you had trouble staying awake while driving, eating meals, or engaging in social activity?","During the past month, how much of a problem has it been for you to keep up enthusiasm to get things done?","During the past month, how would you rate your sleep quality overall?" 
+u00,pre,2:00 AM,10 mins,7:00 AM,6 hours,Not during the past month,Three or a more times week,Three or a more times week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,Once or a twice week,headaches,Not during the past month,Not during the past month,Once or a twice week,Fairly bad +u01,pre,1AM,20,11AM,10-Sep,Not during the past month,Three or a more times week,Not during the past month,Not during the past month,Not during the past month,Once or a twice week,Once or a twice week,Less than once week,Not during the past month,Not during the past month,,Less than once week,Once or a twice week,Once or a twice week,Very good +u02,pre,around 1 am,"I fall asleep really quickly, maybe 5-10minutes",about 9am,about 7-8hours,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Once or a twice week,Once or a twice week,Fairly good +u03,pre,1am,10min,8:30am,7hours,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,Not during the past month,,Not during the past month,Less than once week,Less than once week,Fairly good +u04,pre,11:30,10,6:00,6:00,Not during the past month,Less than once week,Less than once week,Not during the past month,,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Once or a twice week,Fairly good +u05,pre,11:30pm,15,8:00am,8,Less than once week,Less than once week,Less than once week,Not during the past month,Not during the past 
month,Not during the past month,Once or a twice week,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Less than once week,Not during the past month,Very good +u07,pre,Between 12a and 2a,20 minutes,between 9a and 11a,8,Once or a twice week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Less than once week,Not during the past month,,Not during the past month,Once or a twice week,Once or a twice week,Fairly good +u08,pre,1:00 AM,15 minutes,9:00 AM,7 Hours,Once or a twice week,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Less than once week,Fairly good +u09,pre,12:00,10,8:30,7.5,Not during the past month,Not during the past month,Once or a twice week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Three or a more times week,Very good +u10,pre,12-1 AM,30,8:10 AM,8-Jul,Three or a more times week,Three or a more times week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Once or a twice week,Not during the past month,Once or a twice week,Trying to solve coding problems while in bed (several times a week),Not during the past month,Less than once week,Less than once week,Fairly good +u12,pre,10pm,10,3am,7.5,Less than once week,Not during the past month,Three or a more times week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past 
month,Not during the past month,Not during the past month,,Once or a twice week,Not during the past month,Not during the past month,Very good +u13,pre,2:00 AM,<10 mins,8:45am,7,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,,Not during the past month,Less than once week,Once or a twice week,Fairly good +u14,pre,2:00am,10 minutes,10am,9 hours,Not during the past month,Once or a twice week,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Not during the past month,Fairly good +u15,pre,1,5,8:15,7,Not during the past month,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Less than once week,Three or a more times week,Fairly good +u16,pre,11 or 12pm,5 min,7:30am,7-Jun,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Less than once week,Three or a more times week,Class work or all-nighters,Not during the past month,Not during the past month,Once or a twice week,Fairly bad +u17,pre,"During the beginning of the month around 4am. But now, after spring break, I try to be in bed around 12, latest 1:30am.","It has become longer and longer, right now around an hour, sometimes longer some nights.","It varies. 
I know that I regularly need at least 6-7 hrs of sleep, so ideally 7-8am, but because of my late sleep time, usually around 8:30 or 9am.","Difficult to say, because I did have a few all-nighters during finals. More than 130 hours, at least.",Three or a more times week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Once or a twice week,Once or a twice week,Less than once week,Three or a more times week,Three or a more times week,bed feels uncomfortable,Not during the past month,Once or a twice week,Three or a more times week,Fairly bad +u18,pre,12AM,10 minutes,8am,7 hours,Less than once week,Less than once week,Three or a more times week,Less than once week,Once or a twice week,Not during the past month,Not during the past month,Less than once week,Not during the past month,Not during the past month,,Not during the past month,Three or a more times week,Once or a twice week,Fairly good +u19,pre,Midnight,less than 20,8:30,7,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Less than once week,Less than once week,Fairly good +u20,pre,11pm-4pm,5minutes,7am-10am,7-Jun,Less than once week,Once or a twice week,Less than once week,Not during the past month,Not during the past month,Less than once week,Less than once week,Less than once week,Less than once week,,,Not during the past month,Less than once week,Less than once week,Fairly good +u22,pre,1 or 2 am,15-20min,9am to 11am,8 hours,Once or a twice week,Not during the past month,Once or a twice week,Not during the past month,Not during the past month,Less than once week,Less than once week,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Not during the past 
month,Fairly good +u23,pre,13:00,30 minutes,8:00,6 hours,,Three or a more times week,,,,,,,,Once or a twice week,Depressed and anxious,Three or a more times week,Less than once week,Three or a more times week,Fairly good +u24,pre,2:00 AM,20,9:00 AM,7,Not during the past month,Not during the past month,Less than once week,Not during the past month,Not during the past month,Once or a twice week,Not during the past month,Once or a twice week,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Three or a more times week,Fairly good +u27,pre,1-2AM,1 hour,7:30,6-May,Once or a twice week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Once or a twice week,,Fairly bad +u30,pre,around midnight,30,8am,7,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Not during the past month,Fairly good +u31,pre,Never,30 mins,Just hour before first class,"Not regularly, but 3 to 5",Less than once week,Three or a more times week,Not during the past month,Once or a twice week,Three or a more times week,Three or a more times week,Not during the past month,Three or a more times week,Three or a more times week,Once or a twice week,God damn cs39,Less than once week,Once or a twice week,Once or a twice week,Very bad +u32,pre,Midnight,45,8:30am,7,Three or a more times week,Three or a more times week,Once or a twice week,Not during the past month,Not during the past month,Less than once week,Less than once week,Not during the past month,Not during the past month,Not during the past 
month,,Not during the past month,Less than once week,Once or a twice week,Fairly bad +u33,pre,1:30 AM,30 minutes,9:45 AM,7,Once or a twice week,Not during the past month,Once or a twice week,Less than once week,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,,Less than once week,Not during the past month,Three or a more times week,Fairly bad +u34,pre,2AM,1 minutes,10AM,8 hours,Not during the past month,Not during the past month,Less than once week,Not during the past month,Not during the past month,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Less than once week,Once or a twice week,Very good +u35,pre,around 1:00 AM,5~10 mins,8:30~9:00 AM,6~7hours,Less than once week,Not during the past month,Less than once week,Not during the past month,Not during the past month,Less than once week,Not during the past month,Less than once week,Not during the past month,Not during the past month,,Not during the past month,Less than once week,Not during the past month,Fairly good +u36,pre,12:30AM,10min,8AM,7.5hrs,Not during the past month,Less than once week,Less than once week,Not during the past month,Less than once week,Less than once week,Not during the past month,Less than once week,Not during the past month,,,Not during the past month,Not during the past month,Less than once week,Fairly good +u39,pre,2:00 AM,30 min,9:00 AM,8-Jul,Less than once week,Less than once week,Less than once week,Once or a twice week,Not during the past month,Not during the past month,Not during the past month,Less than once week,Less than once week,Not during the past month,,Not during the past month,Not during the past month,Less than once week,Fairly good +u42,pre,3 or 4 am,in 10 min,9 or 10 am,5 hours,Less than once week,Less than once week,Not during the past month,Not during the past month,Not during the past 
month,Not during the past month,Not during the past month,Less than once week,Not during the past month,,,Not during the past month,Once or a twice week,Once or a twice week,Fairly bad +u43,pre,11:30,12,8:30-9,7.5,Less than once week,Once or a twice week,Less than once week,Not during the past month,Not during the past month,Less than once week,Less than once week,Not during the past month,Once or a twice week,Not during the past month,,Less than once week,Less than once week,Once or a twice week,Fairly good +u44,pre,12:00 m,20mins,8:00am,7hours approximately,Less than once week,Less than once week,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Less than once week,Less than once week,Fairly good +u45,pre,1:00 AM,15min,9:30 AM,8 hours,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,Less than once week,Not during the past month,,,Not during the past month,Less than once week,Less than once week,Fairly good +u46,pre,1:30,20,9:00,7,Once or a twice week,Three or a more times week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Once or a twice week,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Less than once week,Fairly good +u47,pre,12AM,1 Hour,9AM,8 Hours,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,,Not during the past month,Not during the past month,Not during the past month,Very good +u49,pre,2-4am,15-20 min,7-8am,5-Apr,Not 
during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Once or a twice week,Needing to do work due for the next day.,Not during the past month,Not during the past month,Less than once week,Fairly good +u50,pre,2am,10min,9am,7hours,Not during the past month,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,,,Not during the past month,Not during the past month,Not during the past month,Fairly good +u51,pre,2am,15min,9:30am,6,Less than once week,Less than once week,Less than once week,Not during the past month,Not during the past month,Once or a twice week,Less than once week,Less than once week,Not during the past month,Not during the past month,,Not during the past month,Less than once week,Less than once week,Fairly bad +u52,pre,3:00 AM,60,1:00 PM,9 Hours,Three or a more times week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Three or a more times week,Three or a more times week,Very good +u53,pre,2:00 AM,30 min,9:00am,200 hrs,Once or a twice week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Once or a twice week,Once or a twice week,Less than once week,Not during the past month,Once or a twice week,Stress,Less than once week,Three or a more times week,Once or a twice week,Fairly bad +u56,pre,1am,15,10:30am,8,Not during the past month,Less than once week,Less than once week,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during 
the past month,Less than once week,Not during the past month,,Not during the past month,Not during the past month,Once or a twice week,Fairly good +u57,pre,Midnight or 1 AM,14 minutes,9 am or 10 am,Around 9 hours on average,Not during the past month,Less than once week,Less than once week,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Not during the past month,Very good +u58,pre,0:30,10,8:00,7:30,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Less than once week,Once or a twice week,Once or a twice week,Fairly good +u59,pre,2:00 AM,20 minutes,8:00 AM,5 hours,Once or a twice week,Not during the past month,Less than once week,Not during the past month,Less than once week,Once or a twice week,Once or a twice week,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Three or a more times week,Three or a more times week,Fairly bad +u00,post,Midnight,15 mins,7:00 AM,6,Less than once week,Three or a more times week,Three or a more times week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,,,Not during the past month,Not during the past month,Less than once week,Fairly good +u01,post,3AM,20,11AM,7,Not during the past month,Three or a more times week,Less than once week,Not during the past month,Not during the past month,Once or a twice week,Once or a twice week,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Once or a twice week,Once or a twice week,Fairly good 
+u02,post,1am,15min,9am,8,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Less than once week,Less than once week,Fairly good +u03,post,2am,15,8am,8,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Less than once week,Once or a twice week,Fairly good +u04,post,11:30pm,10,7am,7,Not during the past month,Less than once week,Less than once week,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Less than once week,Fairly good +u05,post,1:00am,10,8:00am,6 to 7,Not during the past month,Less than once week,Less than once week,Not during the past month,Not during the past month,Not during the past month,Once or a twice week,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Less than once week,Not during the past month,Fairly good +u07,post,5am,10,2pm,8,Less than once week,Less than once week,Less than once week,Less than once week,Less than once week,Less than once week,Less than once week,Less than once week,Less than once week,Less than once week,,Not during the past month,Less than once week,Once or a twice week,Fairly bad +u09,post,Midnight,20,8:30am,7.5-8 hours,Less than once week,Less than once week,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past 
month,Not during the past month,,Not during the past month,Not during the past month,Less than once week,Fairly good +u10,post,12-1AM,10,8AM,8,Not during the past month,Three or a more times week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Once or a twice week,Once or a twice week,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Not during the past month,Fairly good +u14,post,2:30am,5,11am,6,Less than once week,Three or a more times week,Not during the past month,Not during the past month,Not during the past month,Less than once week,Once or a twice week,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Once or a twice week,Less than once week,Fairly bad +u15,post,12,5,830,8,Not during the past month,Less than once week,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,,Not during the past month,Less than once week,Not during the past month,Fairly good +u16,post,12am,10,8am,7,Not during the past month,Once or a twice week,Once or a twice week,Not during the past month,Less than once week,Not during the past month,Once or a twice week,Three or a more times week,Not during the past month,Not during the past month,,Not during the past month,Less than once week,Three or a more times week,Fairly good +u17,post,5am; most nights I've not gone to bed at all since the start of May,"at, first, when I had to pull all-nighters, it took me at least 2 hrs. Now, because I'm overtired, I sleep immediately.","Usually I don't get up, I'm awake from the night before. 
But often I go to sleep at 6 and wake up at 8","probably 2hrs per night, 4 if I'm lucky",Three or a more times week,Three or a more times week,Not during the past month,Not during the past month,Not during the past month,Three or a more times week,Not during the past month,Three or a more times week,Three or a more times week,Three or a more times week,Worried about classes; assignments not finished the next morning; that coding will not be finished in time for deadline,Not during the past month,Three or a more times week,Three or a more times week,Very bad +u18,post,2am,5mins,8am,5,Less than once week,Three or a more times week,Three or a more times week,Less than once week,Once or a twice week,Not during the past month,Once or a twice week,Less than once week,Less than once week,Not during the past month,,Less than once week,Once or a twice week,Three or a more times week,Very good +u19,post,12:00,10,8:00,7-Jun,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Not during the past month,Very good +u20,post,3am in the morning,10minutes,9:30am,7hours/day,Once or a twice week,Three or a more times week,Three or a more times week,Less than once week,Less than once week,Less than once week,Less than once week,Once or a twice week,Less than once week,Not during the past month,,Once or a twice week,Less than once week,Once or a twice week,Fairly good +u23,post,1:30 AM,20 minutes,8:00,5 hours,,,,,,,,,,Three or a more times week,OCD,Three or a more times week,Once or a twice week,Three or a more times week,Fairly bad +u24,post,3:00 AM,30 min,1:00 PM,6:30 hours,Less than once week,Not during the past month,Less than once week,Not during the past month,Not during the past month,Less than once week,Less than once week,Not 
during the past month,Not during the past month,,,Not during the past month,Less than once week,Three or a more times week,Fairly good +u27,post,3-4am,20-30,7:30am,4-Mar,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,,Less than once week,Once or a twice week,Once or a twice week,Very bad +u30,post,12am (midnight),30,8:30am,7,Once or a twice week,Less than once week,Less than once week,Not during the past month,Not during the past month,Less than once week,Less than once week,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Less than once week,Not during the past month,Fairly good +u31,post,usually dawn,10,9:00 AM,6 to 7,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Less than once week,Less than once week,Less than once week,Not during the past month,Not during the past month,,Less than once week,Less than once week,Less than once week,Fairly good +u32,post,12:00am,30,9:00am,7 hours,Three or a more times week,Once or a twice week,Once or a twice week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,,Not during the past month,Not during the past month,Not during the past month,Fairly good +u33,post,2,20,10,7,Three or a more times week,Less than once week,Not during the past month,Less than once week,Less than once week,Less than once week,Less than once week,Less than once week,Not during the past month,Not during the past month,,Not during the past month,Once or a twice week,Three or a more times week,Fairly bad +u34,post,2AM,5 mins,10AM,8 hours,Not during the past month,Not during the past month,Not during the past month,Not during the past 
month,Not during the past month,Less than once week,Not during the past month,Not during the past month,Not during the past month,,,Not during the past month,Less than once week,Less than once week,Very good +u35,post,2:00am,half a hour,"9""00 am",6,Less than once week,Not during the past month,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Not during the past month,Fairly good +u36,post,12:30 AM,10 minutes,8:30 AM,7.5 hours,Not during the past month,Less than once week,Once or a twice week,Not during the past month,Less than once week,Not during the past month,Not during the past month,Less than once week,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Once or a twice week,Fairly good +u42,post,4:00 AM,10 min,11:00 AM,6 hours,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Less than once week,Fairly good +u43,post,midnight,12,8:30,8,Less than once week,Once or a twice week,Less than once week,Not during the past month,Not during the past month,Not during the past month,Less than once week,Less than once week,Less than once week,Not during the past month,,Not during the past month,Less than once week,Once or a twice week,Fairly good +u44,post,12:30,20 minutes,8:00,8,Less than once week,Less than once week,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Less than once 
week,Less than once week,Fairly good +u45,post,1am,less than half an hour,9:00 AM,7hours,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,,,Not during the past month,Not during the past month,Less than once week,Fairly good +u46,post,,,,,,,,,,,,,,,,,,, +u47,post,2am,15,10am,420,Not during the past month,Less than once week,Not during the past month,Not during the past month,Not during the past month,Once or a twice week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Not during the past month,Very good +u49,post,2-5 am,5-10 min,8:40 AM,8-Mar,Less than once week,Once or a twice week,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Once or a twice week,Not during the past month,Fairly bad +u51,post,2am,10,9am,6~7,Not during the past month,Once or a twice week,Less than once week,Not during the past month,Not during the past month,Not during the past month,Less than once week,Once or a twice week,Not during the past month,Not during the past month,,Not during the past month,Less than once week,Not during the past month,Fairly good +u52,post,2:00 AM,30,10:00 AM,8,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,,Not during the past month,Once or a twice week,Three or a more times week,Very good +u53,post,3am,1 hr,9am,6 hr,Three or a more times week,Less than once week,Not during the past month,Not during the 
past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Once or a twice week,Three or a more times week,stress,Three or a more times week,Three or a more times week,Once or a twice week,Fairly bad +u56,post,1am,10,10am,8.5,Not during the past month,Less than once week,Not during the past month,Not during the past month,Not during the past month,Less than once week,Less than once week,Less than once week,Not during the past month,Not during the past month,,Not during the past month,Not during the past month,Once or a twice week,Fairly good +u59,post,2:00 AM,15 Minutes,7:00 AM,4-5 Hours,Less than once week,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Not during the past month,Less than once week,Not during the past month,Less than once week,,,Not during the past month,Three or a more times week,Less than once week,Fairly good diff --git a/src/experiments/clustering/survey/scores.csv b/src/experiments/clustering/survey/scores.csv new file mode 100644 index 0000000..abfb391 --- /dev/null +++ b/src/experiments/clustering/survey/scores.csv @@ -0,0 +1,48 @@ +student_id,pre_PHQ_9,post_PHQ_9,pre_PSS,post_PSS,pre_lonliness_scale,post_longliness_scale,pre_flourishing_scale,post_flourishing_scale,pre_panas_positive,post_panas_positive,pre_panas_negative,post_panas_negative,A_pre,C_pre,N_pre,O_pre,E_post,A_post,C_post,N_post,O_post +0,2,3,24,15,31,33,,45,13,16,,15,18,25,21,27,18,22,26,24,24 +1,5,4,15,16,34,39,45,46,15,11,13,13,26,26,20,29,16,26,24,19,25 +2,13,5,21,25,33,31,46,44,10,7,11,12,27,26,26,22,19,19,25,24,28 +3,2,4,17,24,33,25,34,31,14,10,18,11,29,29,21,29,23,28,27,24,27 +4,6,8,21,17,31,36,27,33,13,12,12,17,24,22,29,29,22,29,24,27,27 +5,,0,7,7,30,,48,50,15,12,18,15,26,28,20,27,20,30,26,23,30 +7,7,8,12,28,33,38,49,47,12,7,16,10,24,28,,31,24,29,26,23,33 +8,5,,23,,29,,,,13,,14,,26,24,20,21,,,,, +9,4,2,13,17,39,34,46,47,10,10,11,13,29,26,23,23,21,29,22,24,24 
+10,,4,20,15,33,27,39,39,13,19,17,21,24,27,21,28,16,24,29,23,26 +12,1,,11,,37,,49,,13,,19,,30,25,25,28,,,,, +13,4,,25,,44,,44,,10,,8,,20,22,23,21,,,,, +14,1,,9,14,26,31,52,53,20,8,21,17,27,,,23,20,27,23,22,24 +15,3,1,22,11,31,31,43,48,12,13,16,16,28,22,22,26,22,25,22,22,27 +16,6,12,24,26,36,40,42,41,9,11,11,12,26,24,22,22,17,28,23,24,22 +17,13,18,33,34,40,44,37,38,9,10,15,15,19,22,23,23,21,20,21,22,23 +18,15,12,27,24,38,31,37,,11,8,10,18,26,27,22,25,23,24,,23,27 +19,5,4,20,21,36,32,42,42,11,10,16,13,28,23,23,26,17,27,24,24,25 +20,8,8,14,18,40,40,45,45,9,9,13,15,25,28,23,20,22,33,26,22,24 +22,3,,6,,36,,46,,12,,10,,29,25,25,24,,,,, +23,11,21,27,22,38,35,35,42,,9,18,17,27,23,20,28,15,20,24,, +24,5,7,18,14,41,39,41,43,10,8,15,10,22,23,27,25,20,22,20,21,30 +27,5,7,18,23,38,33,31,31,10,8,12,13,27,26,23,25,21,27,23,24,27 +30,1,,9,12,33,33,52,56,17,14,18,14,29,28,28,28,23,31,27,26,26 +31,12,5,,21,29,24,16,36,11,12,14,15,23,24,25,29,23,25,21,25, +32,4,2,3,8,35,38,54,56,10,13,13,15,26,23,24,23,23,28,25,22,22 +33,23,25,33,33,39,46,31,28,11,9,12,13,28,21,23,28,23,27,19,20,27 +34,3,6,14,14,43,38,49,23,9,8,12,10,27,26,21,29,20,24,26,22,24 +35,7,7,14,16,30,29,48,44,12,6,10,8,27,28,25,28,22,25,,23,26 +36,2,1,12,18,33,32,46,46,11,12,12,15,29,28,24,25,21,33,29,24,24 +39,3,,,,38,,15,,6,,8,,26,22,20,25,,,,, +42,1,0,24,15,31,36,45,16,11,12,13,17,28,24,24,25,24,29,29,25,27 +43,7,4,21,13,35,40,47,42,11,10,13,14,29,26,18,25,22,30,25,22,26 +44,1,2,18,20,32,33,48,41,11,12,11,12,23,24,22,22,18,22,25,22,21 +45,7,2,,20,34,32,48,49,13,12,11,12,27,29,29,29,22,23,27,24,25 +46,10,,31,30,37,,42,44,15,10,16,19,22,22,26,24,,,,, +47,5,1,14,28,30,33,,52,14,9,17,15,25,25,20,26,27,24,31,27,28 +49,2,8,16,8,35,32,51,55,13,11,14,,26,23,20,21,17,25,21,21,21 +50,7,,19,,34,,48,,,,12,,22,24,20,25,,,,, +51,1,,14,12,30,29,,48,8,9,11,11,28,26,26,28,20,28,25,22,25 +52,12,15,22,29,39,29,34,37,13,9,19,17,28,24,23,21,16,26,26,22,17 +53,8,11,23,20,33,36,50,49,10,9,10,16,26,20,24,28,24,24,21,25,26 
+56,2,3,11,9,28,29,46,48,8,11,8,12,29,24,22,27,20,30,24,23,26 +57,0,,9,,35,,50,,13,,15,,28,27,21,27,,,,, +58,5,8,20,,28,25,51,,11,,12,,27,24,23,24,13,26,,24,21 +59,5,7,18,13,34,31,43,50,12,12,17,19,31,21,17,25,19,27,20,20,24 +54,,,18,13,,,,,,10,,12,,,,,24,28,28,26,29 diff --git a/src/experiments/clustering/survey/scores_pre.csv b/src/experiments/clustering/survey/scores_pre.csv new file mode 100644 index 0000000..cfee138 --- /dev/null +++ b/src/experiments/clustering/survey/scores_pre.csv @@ -0,0 +1,48 @@ +student_id,pre_PHQ_9,pre_PSS,pre_lonliness_scale,pre_flourishing_scale,pre_panas_positive,pre_panas_negative,E_pre,A_pre,C_pre,N_pre,O_pre +0,2,24,31,,13,,16,18,25,21,27 +1,5,15,34,45,15,13,17,26,26,20,29 +2,13,21,33,46,10,11,19,27,26,26,22 +3,2,17,33,34,14,18,23,29,29,21,29 +4,6,21,31,27,13,12,19,24,22,29,29 +5,,7,30,48,15,18,19,26,28,20,27 +7,7,12,33,49,12,16,23,24,28,,31 +8,5,23,29,,13,14,19,26,24,20,21 +9,4,13,39,46,10,11,19,29,26,23,23 +10,,20,33,39,13,17,18,24,27,21,28 +12,1,11,37,49,13,19,17,30,25,25,28 +13,4,25,44,44,10,8,19,20,22,23,21 +14,1,9,26,52,20,21,18,27,,,23 +15,3,22,31,43,12,16,24,28,22,22,26 +16,6,24,36,42,9,11,18,26,24,22,22 +17,13,33,40,37,9,15,19,19,22,23,23 +18,15,27,38,37,11,10,23,26,27,22,25 +19,5,20,36,42,11,16,18,28,23,23,26 +20,8,14,40,45,9,13,23,25,28,23,20 +22,3,6,36,46,12,10,17,29,25,25,24 +23,11,27,38,35,,18,,27,23,20,28 +24,5,18,41,41,10,15,20,22,23,27,25 +27,5,18,38,31,10,12,22,27,26,23,25 +30,1,9,33,52,17,18,22,29,28,28,28 +31,12,,29,16,11,14,30,23,24,25,29 +32,4,3,35,54,10,13,20,26,23,24,23 +33,23,33,39,31,11,12,25,28,21,23,28 +34,3,14,43,49,9,12,22,27,26,21,29 +35,7,14,30,48,12,10,23,27,28,25,28 +36,2,12,33,46,11,12,22,29,28,24,25 +39,3,,38,15,6,8,23,26,22,20,25 +42,1,24,31,45,11,13,23,28,24,24,25 +43,7,21,35,47,11,13,19,29,26,18,25 +44,1,18,32,48,11,11,21,23,24,22,22 +45,7,,34,48,13,11,26,27,29,29,29 +46,10,31,37,42,15,16,25,22,22,26,24 +47,5,14,30,,14,17,19,25,25,20,26 +49,2,16,35,51,13,14,19,26,23,20,21 
+50,7,19,34,48,,12,21,22,24,20,25 +51,1,14,30,,8,11,21,28,26,26,28 +52,12,22,39,34,13,19,19,28,24,23,21 +53,8,23,33,50,10,10,22,26,20,24,28 +56,2,11,28,46,8,8,21,29,24,22,27 +57,0,9,35,50,13,15,21,28,27,21,27 +58,5,20,28,51,11,12,15,27,24,23,24 +59,5,18,34,43,12,17,18,31,21,17,25 +54,,18,,,,,,,,, diff --git a/src/experiments/clustering/survey/survey_scores_covariate b/src/experiments/clustering/survey/survey_scores_covariate new file mode 100644 index 0000000..83be961 Binary files /dev/null and b/src/experiments/clustering/survey/survey_scores_covariate differ diff --git a/src/experiments/clustering/survey/survey_scores_covariate_zero_pad b/src/experiments/clustering/survey/survey_scores_covariate_zero_pad new file mode 100644 index 0000000..a3b23a4 Binary files /dev/null and b/src/experiments/clustering/survey/survey_scores_covariate_zero_pad differ diff --git a/src/experiments/clustering/survey/vr_12.csv b/src/experiments/clustering/survey/vr_12.csv new file mode 100644 index 0000000..7f6b16d --- /dev/null +++ b/src/experiments/clustering/survey/vr_12.csv @@ -0,0 +1,84 @@ +uid,type,"In general, would you say your health is","Moderate activities, such as moving a table, pushing a vacuum cleaner, bowling or playing golf?",Climbing several flights of stairs?,Accomplished less than you would like.,Were limited in the kind of work or other activities.,Accomplished less than you would like.,Didn't do work or other activities as carefully as usual.,"During the past 4 weeks, how much did pain interfere with your normal work (including both work outside the home and housework)?",How much of the time during the past 4 weeks: Have you felt calm and peaceful?,How much of the time during the past 4 weeks: Did you have a lot of energy?,How much of the time during the past 4 weeks: Have you felt downhearted and blue?,"During the past 4 weeks, how much of the time has your physical health or emotional problems interfered with your social activities (like visiting with friends, 
relatives, etc.)?","Compared to one year ago, how would you rate your physical health in general now?","Compared to one year ago, how would you rate your emotional problems (such as feeling anxious, depressed or irritable) now?" +u00,pre,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, some of the time","Yes, some of the time",Not at all,A good bit of the time,Some of the time,Some of the time,Some of the time,Slightly worse,About the same +u01,pre,Excellent,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, some of the time","Yes, some of the time",Not at all,A good bit of the time,A good bit of the time,Some of the time,A little of the time,About the same,Much better +u02,pre,Good,"No, not limited at all","No, not limited at all","Yes, some of the time","No, none of the time","No, none of the time","Yes, a little of the time",Not at all,Most of the time,Some of the time,A little of the time,None of the time,About the same,Slightly better +u03,pre,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, a little of the time","Yes, a little of the time",Not at all,A good bit of the time,A good bit of the time,Some of the time,A little of the time,Slightly better,Slightly worse +u04,pre,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, a little of the time","Yes, a little of the time",Not at all,A good bit of the time,A good bit of the time,A little of the time,None of the time,Slightly better,About the same +u05,pre,Excellent,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at all,Most of the time,Most of the time,A little of the time,None of the time,About the same,About the same +u07,pre,Good,"No, not limited at all","No, not limited at 
all","No, none of the time","No, none of the time","Yes, a little of the time","No, none of the time",Not at all,A good bit of the time,Some of the time,Some of the time,A little of the time,Slightly worse,Much better +u08,pre,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, a little of the time","Yes, a little of the time",A little bit,Some of the time,Some of the time,Some of the time,A little of the time,Slightly worse,Slightly worse +u09,pre,Excellent,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at all,A good bit of the time,Most of the time,A little of the time,None of the time,About the same,Slightly worse +u10,pre,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, a little of the time","Yes, a little of the time",A little bit,Some of the time,Some of the time,A good bit of the time,None of the time,About the same,Slightly better +u12,pre,Excellent,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at all,A good bit of the time,Most of the time,A little of the time,None of the time,About the same,About the same +u13,pre,Good,"No, not limited at all","No, not limited at all","No, none of the time","Yes, a little of the time","Yes, a little of the time","Yes, a little of the time",Not at all,A little of the time,A little of the time,Some of the time,Some of the time,About the same,Much better +u14,pre,Excellent,"No, not limited at all","No, not limited at all","Yes, a little of the time","Yes, some of the time","No, none of the time","No, none of the time",Not at all,Most of the time,All of the time,None of the time,None of the time,Slightly better,Slightly better +u15,pre,Very good,"No, not limited at all","No, not limited at all","No, none of 
the time","No, none of the time","Yes, some of the time","Yes, a little of the time",Not at all,Some of the time,Some of the time,Some of the time,None of the time,Slightly better,About the same +u16,pre,Good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, some of the time","Yes, a little of the time",Not at all,Some of the time,A good bit of the time,A good bit of the time,Some of the time,Slightly better,Slightly better +u17,pre,Good,"No, not limited at all","No, not limited at all","Yes, some of the time","No, none of the time","Yes, a little of the time","Yes, some of the time",A little bit,Some of the time,Some of the time,A little of the time,None of the time,About the same,Slightly better +u18,pre,Good,"No, not limited at all","Yes, limited a little","Yes, some of the time","Yes, some of the time","Yes, most of the time","Yes, most of the time",Not at all,Some of the time,A little of the time,Most of the time,A little of the time,About the same,Much worse +u19,pre,Good,"No, not limited at all","No, not limited at all","Yes, a little of the time","Yes, a little of the time","Yes, some of the time","Yes, a little of the time",Not at all,Some of the time,A good bit of the time,Some of the time,Some of the time,About the same,Slightly worse +u20,pre,Good,"Yes, limited a little","Yes, limited a little","Yes, some of the time","Yes, a little of the time","No, none of the time","No, none of the time",A little bit,Most of the time,Most of the time,A little of the time,A little of the time,About the same,About the same +u22,pre,Excellent,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, a little of the time","No, none of the time",Not at all,Some of the time,A good bit of the time,Some of the time,A little of the time,Slightly better,Slightly better +u23,pre,Fair,"Yes, limited a little","Yes, limited a little","Yes, some of the time","Yes, some of the time","Yes, 
most of the time","Yes, most of the time",Quite a bit,Some of the time,A good bit of the time,A good bit of the time,Most of the time,Much worse,Much worse +u24,pre,Very good,"No, not limited at all","No, not limited at all","Yes, some of the time","Yes, some of the time","Yes, a little of the time","Yes, a little of the time",Not at all,Most of the time,Some of the time,Some of the time,A little of the time,Slightly better,Much worse +u27,pre,Good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at all,A good bit of the time,A good bit of the time,Some of the time,A little of the time,About the same,About the same +u30,pre,Excellent,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at all,Most of the time,Most of the time,None of the time,None of the time,About the same,About the same +u31,pre,Good,"Yes, limited a little","Yes, limited a little","Yes, a little of the time","Yes, a little of the time","Yes, some of the time","Yes, a little of the time",Quite a bit,A little of the time,None of the time,A little of the time,Most of the time,Slightly worse,Much worse +u32,pre,Very good,"No, not limited at all","No, not limited at all","No, none of the time","Yes, some of the time","No, none of the time","No, none of the time",A little bit,Most of the time,A good bit of the time,A little of the time,A little of the time,Much better,About the same +u33,pre,Good,"No, not limited at all","No, not limited at all","No, none of the time","Yes, a little of the time","Yes, most of the time","Yes, most of the time",A little bit,A little of the time,A good bit of the time,A good bit of the time,Some of the time,About the same,Slightly worse +u34,pre,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of 
the time",Not at all,A good bit of the time,A good bit of the time,A little of the time,None of the time,Slightly better,About the same +u35,pre,Good,"No, not limited at all","Yes, limited a little","Yes, a little of the time","No, none of the time","Yes, some of the time","Yes, some of the time",A little bit,A good bit of the time,A good bit of the time,A little of the time,A little of the time,About the same,Slightly better +u36,pre,Good,"No, not limited at all","No, not limited at all","Yes, a little of the time","Yes, a little of the time","Yes, a little of the time","Yes, a little of the time",A little bit,Most of the time,A good bit of the time,A little of the time,A little of the time,About the same,About the same +u39,pre,Fair,"No, not limited at all","No, not limited at all","Yes, a little of the time","Yes, a little of the time","Yes, a little of the time","Yes, a little of the time",,Some of the time,A little of the time,A little of the time,None of the time,Slightly worse,Slightly worse +u42,pre,Very good,"No, not limited at all","Yes, limited a little","Yes, a little of the time","Yes, a little of the time","Yes, a little of the time","Yes, some of the time",Not at all,A good bit of the time,Most of the time,A good bit of the time,A little of the time,About the same,About the same +u43,pre,Fair,"Yes, limited a little","Yes, limited a little","Yes, most of the time","Yes, most of the time","Yes, a little of the time","No, none of the time",Extremely,Some of the time,A little of the time,Some of the time,Some of the time,Much worse,Slightly worse +u44,pre,Excellent,"No, not limited at all","No, not limited at all","Yes, a little of the time","No, none of the time","Yes, a little of the time","Yes, a little of the time",A little bit,A good bit of the time,A good bit of the time,A little of the time,A little of the time,About the same,Much better +u45,pre,Very good,"Yes, limited a little","No, not limited at all","Yes, a little of the time","Yes, a little 
of the time","No, none of the time","No, none of the time",A little bit,Some of the time,A good bit of the time,Some of the time,A little of the time,Slightly worse,Much worse +u46,pre,Very good,"No, not limited at all","No, not limited at all","Yes, a little of the time","No, none of the time","Yes, some of the time","Yes, most of the time",Not at all,A good bit of the time,A good bit of the time,Some of the time,A little of the time,Slightly worse,About the same +u47,pre,Excellent,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at all,Most of the time,All of the time,Some of the time,A little of the time,Slightly better,Slightly better +u49,pre,Very good,"Yes, limited a lot","Yes, limited a little","No, none of the time","No, none of the time","Yes, a little of the time","No, none of the time",Not at all,A good bit of the time,Some of the time,Some of the time,A little of the time,Slightly better,About the same +u50,pre,Good,"Yes, limited a little","Yes, limited a little","Yes, a little of the time","Yes, some of the time","No, none of the time","Yes, a little of the time",Moderately,Most of the time,A good bit of the time,Some of the time,Some of the time,Slightly better,Slightly better +u51,pre,Good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at all,Some of the time,Some of the time,A little of the time,None of the time,About the same,About the same +u52,pre,Good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, some of the time","No, none of the time",Not at all,Some of the time,A good bit of the time,A little of the time,Some of the time,Slightly better,Much better +u53,pre,Good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, some of the time","Yes, some of 
the time",A little bit,A little of the time,Some of the time,A good bit of the time,Some of the time,About the same,Slightly worse +u56,pre,Very good,"No, not limited at all","No, not limited at all","Yes, some of the time","No, none of the time","No, none of the time","Yes, a little of the time",Moderately,A good bit of the time,A little of the time,None of the time,None of the time,Slightly better,Much better +u57,pre,Excellent,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at all,Most of the time,A good bit of the time,A little of the time,A little of the time,Slightly better,About the same +u58,pre,Excellent,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, some of the time","Yes, most of the time",Not at all,Most of the time,Some of the time,None of the time,Some of the time,Much better,Slightly better +u59,pre,Good,"No, not limited at all","No, not limited at all","Yes, a little of the time","No, none of the time","Yes, a little of the time","Yes, a little of the time",A little bit,A good bit of the time,A good bit of the time,Some of the time,Some of the time,Much better,Slightly worse +u00,post,Good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, a little of the time","Yes, a little of the time",Not at all,A good bit of the time,Most of the time,A little of the time,None of the time,Slightly worse,Slightly better +u01,post,Excellent,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, some of the time","Yes, some of the time",Not at all,Some of the time,Some of the time,Some of the time,Some of the time,About the same,Slightly better +u02,post,Good,"No, not limited at all","No, not limited at all","Yes, a little of the time","Yes, a little of the time","Yes, some of the time","Yes, a little of the time",Not 
at all,Some of the time,A little of the time,A little of the time,None of the time,Slightly better,Slightly better +u03,post,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","Yes, a little of the time",A little bit,Some of the time,A good bit of the time,A good bit of the time,A little of the time,Slightly better,Slightly worse +u04,post,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, a little of the time","Yes, a little of the time",Not at all,Some of the time,A good bit of the time,A little of the time,A little of the time,Slightly better,About the same +u05,post,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, a little of the time","No, none of the time",Not at all,Most of the time,Most of the time,None of the time,None of the time,Slightly better,About the same +u07,post,Fair,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, a little of the time","Yes, a little of the time",A little bit,Some of the time,A little of the time,A good bit of the time,None of the time,Slightly worse,Slightly better +u09,post,Excellent,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at all,A good bit of the time,Some of the time,A little of the time,None of the time,About the same,Slightly worse +u10,post,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at all,Most of the time,A good bit of the time,None of the time,None of the time,About the same,Much better +u14,post,Excellent,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at 
all,Some of the time,A good bit of the time,A little of the time,None of the time,Slightly better,About the same +u15,post,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Moderately,A good bit of the time,A good bit of the time,A little of the time,None of the time,Slightly worse,About the same +u16,post,Fair,"No, not limited at all","No, not limited at all","No, none of the time","Yes, a little of the time","Yes, most of the time","Yes, most of the time",Not at all,Some of the time,Some of the time,A good bit of the time,A little of the time,About the same,Slightly worse +u17,post,Good,"Yes, limited a little","Yes, limited a little","Yes, all of the time","Yes, all of the time","Yes, some of the time","Yes, all of the time",Moderately,None of the time,None of the time,A good bit of the time,A little of the time,Slightly worse,Slightly better +u18,post,Fair,"No, not limited at all","Yes, limited a little","Yes, a little of the time","Yes, a little of the time","Yes, some of the time","Yes, a little of the time",A little bit,A little of the time,Some of the time,A good bit of the time,A little of the time,About the same,Slightly worse +u19,post,Very good,"Yes, limited a little","Yes, limited a lot","No, none of the time","No, none of the time","Yes, some of the time","Yes, some of the time",Not at all,Some of the time,A good bit of the time,Some of the time,A little of the time,About the same,Slightly worse +u20,post,Very good,"Yes, limited a little","Yes, limited a little","Yes, some of the time","Yes, some of the time","No, none of the time","Yes, a little of the time",A little bit,A good bit of the time,A good bit of the time,Some of the time,A little of the time,Slightly better,Slightly worse +u23,post,Fair,"Yes, limited a lot","Yes, limited a lot","Yes, a little of the time","Yes, a little of the time","Yes, a little of the time","Yes, a little of the 
time",Quite a bit,A little of the time,Some of the time,A good bit of the time,Some of the time,Much worse,Much worse +u24,post,Good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at all,Some of the time,Some of the time,A good bit of the time,Most of the time,Slightly worse,Slightly better +u27,post,Fair,"No, not limited at all","No, not limited at all","Yes, some of the time","Yes, most of the time","Yes, some of the time","Yes, some of the time",Not at all,A little of the time,A little of the time,Some of the time,A little of the time,About the same,Slightly worse +u30,post,Excellent,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at all,Some of the time,Most of the time,None of the time,None of the time,About the same,About the same +u31,post,Very good,"Yes, limited a little","No, not limited at all","Yes, a little of the time","Yes, a little of the time","Yes, some of the time","Yes, some of the time",A little bit,All of the time,Most of the time,A little of the time,Some of the time,About the same,About the same +u32,post,Very good,"No, not limited at all","Yes, limited a little","No, none of the time","Yes, some of the time","No, none of the time","No, none of the time",Moderately,Some of the time,A good bit of the time,None of the time,A little of the time,About the same,About the same +u33,post,Good,"No, not limited at all","No, not limited at all","Yes, a little of the time","No, none of the time","Yes, all of the time","Yes, all of the time",A little bit,Some of the time,Some of the time,Most of the time,Most of the time,Slightly worse,Slightly worse +u34,post,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at all,A good bit of the time,A good bit of the 
time,A little of the time,None of the time,Slightly better,About the same +u35,post,Good,"No, not limited at all","No, not limited at all","Yes, a little of the time","Yes, a little of the time","Yes, a little of the time","Yes, a little of the time",A little bit,Most of the time,A good bit of the time,Some of the time,A little of the time,About the same,About the same +u36,post,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, a little of the time","Yes, a little of the time",A little bit,Most of the time,A good bit of the time,A little of the time,A little of the time,About the same,Slightly better +u42,post,Excellent,"Yes, limited a little","Yes, limited a little","Yes, a little of the time","No, none of the time","No, none of the time","Yes, a little of the time",A little bit,Most of the time,Most of the time,A little of the time,A little of the time,About the same,About the same +u43,post,Fair,"Yes, limited a little","Yes, limited a little","Yes, some of the time","Yes, some of the time","Yes, a little of the time","No, none of the time",Moderately,A good bit of the time,Some of the time,A little of the time,A little of the time,Much worse,Slightly worse +u44,post,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, a little of the time","Yes, a little of the time",A little bit,A good bit of the time,Some of the time,Some of the time,A little of the time,Slightly worse,Much better +u45,post,Excellent,"Yes, limited a little","Yes, limited a little","Yes, a little of the time","Yes, a little of the time","Yes, a little of the time","Yes, a little of the time",Not at all,All of the time,All of the time,A little of the time,A little of the time,Slightly better,Slightly worse +u47,post,Excellent,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at 
all,A good bit of the time,All of the time,None of the time,All of the time,About the same,About the same +u49,post,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","No, none of the time","No, none of the time",Not at all,Most of the time,A good bit of the time,A little of the time,A little of the time,Slightly worse,Slightly better +u51,post,Good,"No, not limited at all","No, not limited at all","No, none of the time","Yes, a little of the time","No, none of the time","No, none of the time",Not at all,A good bit of the time,A good bit of the time,A little of the time,None of the time,About the same,About the same +u52,post,Fair,"No, not limited at all","Yes, limited a little","No, none of the time","No, none of the time","Yes, all of the time","No, none of the time",Not at all,Some of the time,A good bit of the time,A good bit of the time,Some of the time,Slightly better,Slightly worse +u53,post,Fair,"No, not limited at all","Yes, limited a little","Yes, some of the time","Yes, some of the time","Yes, some of the time","Yes, some of the time",A little bit,A little of the time,Some of the time,Some of the time,A little of the time,Much worse,Slightly worse +u56,post,Very good,"No, not limited at all","No, not limited at all","No, none of the time","No, none of the time","Yes, a little of the time","Yes, a little of the time",Not at all,Most of the time,Some of the time,A little of the time,None of the time,About the same,About the same +u59,post,Very good,"No, not limited at all","No, not limited at all","Yes, a little of the time","Yes, a little of the time","Yes, a little of the time","Yes, a little of the time",Moderately,Most of the time,Most of the time,Some of the time,A little of the time,About the same,About the same diff --git a/src/experiments/clustering/survey_scores.py b/src/experiments/clustering/survey_scores.py new file mode 100644 index 0000000..5cf41fc --- /dev/null +++ 
b/src/experiments/clustering/survey_scores.py @@ -0,0 +1,309 @@ +import pandas as pd +import numpy as np +import pickle + +# ---------------------------------------------------------------- +# Independent Study, Student Stree Prediction +# +# File_name: survey_scores +# Functionality: calculate the scores of surveys +# Author: Yunfei Luo +# Start date: EST May.14th.2020 +# Last update: EST May.14th.2020 +# ---------------------------------------------------------------- + +class survey_scores: + def __init__(self, csv_file_path, neg_scale, pos_scale, pos_term, pre_col, post_col, filter_q): + self.neg_scale = neg_scale + self.pos_scale = pos_scale + self.pos_term = pos_term + self.scores = dict() # map: student_id -> {pre: score, post: score} + df = pd.read_csv(csv_file_path) + + self.missing_data = list() + + # indexing questions + self.questions = list() + for i in range(len(df.columns[2:])): + if i+1 in filter_q: + self.questions.append(df.columns[2:][i]) + print(df.columns[2:][i]) + # indexing scores + for uid in df["uid"]: + key = int(uid[1:]) + # check if key exist + if self.scores.get(key) != None: + continue + + self.scores[key] = {"pre": -1, "post": -1} + + # calc scores + rows = df.loc[df['uid'] == uid] + row1 = rows.loc[rows['type'] == "pre"] + row2 = rows.loc[rows['type'] == "post"] + + # if len(rows[self.questions[0]]) == 1: + # self.missing_data.append(key) + # continue + + pre_score = 0 + post_score = 0 + for i in range(len(self.questions)): + pos = False + if i+1 in self.pos_term: + pos = True + + if pre_score != None: + cum = -1 + try: + # calc pre score + for j in row1[self.questions[i]]: + cum += 1 + if pos: + pre_score += self.pos_scale[j] + else: + pre_score += self.neg_scale[j] + except: + pre_score = None + self.missing_data.append(key) + if cum < 0: + pre_score = None + self.missing_data.append(key) + + if post_score != None: + cum = -1 + try: + # calc post score + for j in row2[self.questions[i]]: + cum += 1 + if pos: + post_score += 
self.pos_scale[j] + else: + post_score += self.neg_scale[j] + except: + post_score = None + self.missing_data.append(key) + if cum < 0: + post_score = None + self.missing_data.append(key) + + # store scores + self.scores[key]['pre'] = pre_score + self.scores[key]['post'] = post_score + + # # check + # for key in self.scores: + # print("pre score of {} is {}".format(key, self.scores[key]["pre"])) + # print("post score of {} is {}".format(key, self.scores[key]["post"])) + # print(' ') + + # # write to a new file + # df_out = {pre_col: list(), post_col: list()} + # ind = list() + # for key in self.scores: + # ind.append(key) + # df_out[pre_col].append(self.scores[key]["pre"]) + # df_out[post_col].append(self.scores[key]["post"]) + # df_out = pd.DataFrame(df_out, index=ind) + # df_out.index.name = "student_id" + + # write to a exist file + df_out = pd.read_csv("src/experiments/clustering/survey/scores.csv", index_col="student_id") + pre_scores = list() + post_scores = list() + has = list() + for i, row in df_out.iterrows(): + try: + pre_scores.append(self.scores[i]['pre']) + except: + pre_scores.append(None) + try: + post_scores.append(self.scores[i]['post']) + except: + post_scores.append(None) + has.append(i) + df_out[pre_col] = pre_scores + df_out[post_col] = post_scores + + # add a new row + new = list() + for key in self.scores: + if key not in has: + new.append(key) + print('new', new) + for id_ in new: + new_row = list() + for i in df_out.columns: + if i == pre_col: + new_row.append(self.scores[key]["pre"]) + elif i == post_col: + new_row.append(self.scores[key]["post"]) + else: + new_row.append(None) + df_out.loc[id_] = new_row + #print(df_out) + + self.missing_data = [i for i in set(self.missing_data)] + print('missing data', self.missing_data) + df_out.to_csv('src/experiments/clustering/survey/scores.csv') + +def calc_PSS(): + neg_scale = {"Never": 0, "Almost never": 1, "Sometime": 2, "Fairly often": 3, "Very often": 4} + pos_scale = {"Never": 4, "Almost 
never": 3, "Sometime": 2, "Fairly often": 1, "Very often": 0} + pos_term = [4,5,7,8] + csv_file_path = "src/experiments/clustering/survey/PerceivedStressScale.csv" + PSS_score = survey_scores(csv_file_path, neg_scale, pos_scale, pos_term, "pre_PSS", "post_PSS") + +def calc_PHQ_9(): + neg_scale = {"Extremely difficult": 0, "Very difficult":0, "Somewhat difficult": 0, "Not difficult at all": 0} + pos_scale = {"Not at all": 0, "Several days": 1, "More than half the days": 2, "Nearly every day": 3} + pos_term = [i for i in range(10)] + csv_file_path = "src/experiments/clustering/survey/PHQ-9.csv" + PSS_score = survey_scores(csv_file_path, neg_scale, pos_scale, pos_term, "pre_PHQ_9", "post_PHQ_9") + +def calc_lonliness_scale(): + neg_scale = {} + pos_scale = {"Never": 0, "Rarely": 1, "Sometimes": 2, "Often": 3} + pos_term = [i for i in range(21)] + csv_file_path = "src/experiments/clustering/survey/LonelinessScale.csv" + PSS_score = survey_scores(csv_file_path, neg_scale, pos_scale, pos_term, "pre_lonliness_scale", "post_longliness_scale") + +def calc_flourishing_scale(): + neg_scale = {} + pos_scale = {} + for i in range(8): + pos_scale[i+1] = i+1 + pos_term = [i for i in range(9)] + csv_file_path = "src/experiments/clustering/survey/FlourishingScale.csv" + PSS_score = survey_scores(csv_file_path, neg_scale, pos_scale, pos_term, "pre_flourishing_scale", "post_flourishing_scale") + +def calc_panas(): + neg_scale = {} + pos_scale = {} + for i in range(5): + pos_scale[i+1] = i+1 + neg_scale[i+1] = 0 + # filter_q = [1, 3, 5, 9, 10, 12, 14, 16, 17, 19] + # pos_term = [1, 3, 5, 9, 10, 12, 14, 16, 17, 19] + filter_q = [2, 4, 6, 7, 8, 11, 13, 15, 18, 20] + pos_term = [2, 4, 6, 7, 8, 11, 13, 15, 18, 20] + csv_file_path = "src/experiments/clustering/survey/panas.csv" + PSS_score = survey_scores(csv_file_path, neg_scale, pos_scale, pos_term, "pre_panas_negative", "post_panas_negative", filter_q) + +def calc_big_five(): + neg_scale = {"Disagree Strongly": 5, "Disagree a little": 
4, "Neither agree nor disagree": 3, "Agree a little": 2, "Agree strongly": 1} + pos_scale = {"Disagree Strongly": 1, "Disagree a little": 2, "Neither agree nor disagree": 3, "Agree a little": 4, "Agree strongly": 5} + # filter_q = [1, 6, 11, 16, 21, 26, 31, 36] + # pos_term = [1, 11, 16, 26, 36] + # filter_q = [2, 7, 12, 17, 22, 27, 32, 37, 42] + # pos_term = [7, 17, 22, 32, 42] + # filter_q = [3, 8, 13, 18, 23, 28, 33, 38, 43] + # pos_term = [3, 13, 28, 33, 38] + # filter_q = [4, 9, 14, 19, 24, 29, 34, 39] + # pos_term = [4, 14, 19, 29, 39] + filter_q = [5, 10, 15, 20, 25, 30, 35, 40, 41, 44] + pos_term = [5, 10, 15, 20, 25, 30, 40, 44] + csv_file_path = "src/experiments/clustering/survey/BigFive.csv" + PSS_score = survey_scores(csv_file_path, neg_scale, pos_scale, pos_term, "O_pre", "O_post", filter_q) + +if __name__ == "__main__": + # calc_PHQ_9() + # calc_PSS() + # calc_lonliness_scale() + # calc_flourishing_scale() + # calc_panas() + # calc_big_five() + + ''' + group form: map: student_id -> group_id + ''' + from sklearn.cluster import KMeans + student_list = [4, 7, 8, 10, 14, 16, 17, 19, 22, 23, 24, 32, 33, 35, 36, 43, 44, 49, 51, 52, 53, 57, 58] + df = pd.read_csv("src/experiments/clustering/survey/scores_pre.csv", index_col="student_id") + student_survey_scores = dict() + missing_students = list() + for i, row in df.iterrows(): + if i not in student_list: + continue + curr = list() + na = 0 + for j in row: + if "n" in str(j): + curr.append(-1) + na += 1 + else: + curr.append(j) + student_survey_scores[i] = curr + if na > 0: + missing_students.append(i) + + print("missing students", missing_students) + ms = np.array([student_survey_scores[i] for i in student_survey_scores if i in missing_students]) + + num_groups = 4 + # kmeans on clean dataset + X = np.array([student_survey_scores[i] for i in student_survey_scores if i not in missing_students]) + kmeans = KMeans(n_clusters=num_groups, random_state=0).fit(X) + + # store group info + student_group = dict() # 
map: student_id -> group + for i in student_list: + if i in missing_students: + continue + student_group[i] = kmeans.predict([np.array(student_survey_scores[i])])[0] + + # clustering the missing students, to the closest center + def clustering_missing_students(centers, missing_students, ms, student_group): + for i in range(len(ms)): + student_vec = [num for num in ms[i] if num >= 0] + close_ind = -1 + close_dist = np.inf + for j in range(len(centers)): + center_vec = [centers[j][ind] for ind in range(len(ms[i])) if ms[i][ind] >= 0] + dist = np.linalg.norm(np.array(center_vec) - np.array(student_vec)) + if dist < close_dist: + close_dist = dist + close_ind = j + + student_group[missing_students[i]] = close_ind + + clustering_missing_students(kmeans.cluster_centers_, missing_students, ms, student_group) + + # check + group_student = dict() # map group -> list of student + for student in student_group: + try: + group_student[student_group[student]].append(student) + except: + group_student[student_group[student]] = [student] + for group in group_student: + print(group_student[group]) + + # formalize + output_group = dict() # copy student_group with correct form + for i in student_group: + output_group['student_{}'.format(i)] = "group_{}".format(student_group[i]) + + # write to file + with open("src/experiments/clustering/student_groups/pre_survey_scores_{}.pkl".format(num_groups), 'wb') as f: + pickle.dump(output_group, f) + + # check + with open("src/experiments/clustering/student_groups/pre_survey_scores_{}.pkl".format(num_groups), 'rb') as f: + readed_file = pickle.load(f) + for stu in readed_file: + print("{}, {}".format(stu, readed_file[stu])) + + + # check missing data + # missing_ids = list() + # #df = pd.read_csv("src/experiments/clustering/survey/scores.csv", index_col="student_id") + # df = pd.read_csv("src/experiments/clustering/survey/BigFive.csv", index_col="uid") + # df = df.isna() + # for i, row in df.iterrows(): + # for j in row: + # if j: + # 
missing_ids.append(i) + # #print(df) + # missing_ids = [i for i in set(missing_ids)] + # print("missing ids", missing_ids) diff --git a/src/experiments/config.py b/src/experiments/config.py new file mode 100644 index 0000000..ec712f6 --- /dev/null +++ b/src/experiments/config.py @@ -0,0 +1,65 @@ + +def get_config(job_name, device, num_features, student_groups, num_branches): + config = { + 'model_params': { + 'device': device, + 'in_size': num_features, + 'AE': 'lstm', + 'AE_num_layers': 1, + 'AE_hidden_size': 128, + 'shared_in_size': 128, # same with AE hidden + 'shared_hidden_size': 256, + 'num_branches': num_branches, # when equal to 1, it is equivalent to CALM_Net + 'groups': student_groups, + 'heads_hidden_size': 64, + 'num_classes': 3 + # 'num_classes': 2 + }, + 'training_params': { + 'device': device, + 'loss_weight': { + 'alpha': 1e-4, + 'beta': 1, + 'theta': 1 / 22, # 1 over number of students + }, + 'class_weights': [0.6456, 0.5635, 1.0000], + # 'class_weights': [0.6456, 0.5635+1.0000], + 'global_lr': 1e-5, + 'branching_lr': 1e-5, + 'weight_decay': 1e-4, + 'epochs': 2, + 'batch_size': 1, + 'use_histogram': True, + 'use_covariates': True, + 'use_decoder': True, + } + } + + if job_name == 'calm_net': + config['training_params']['global_lr'] = 1e-5 # 1e-6 + config['training_params']['branching_lr'] = 1e-5 # 1e-6 + config['training_params']['epochs'] = 200 # 500 + elif job_name == 'calm_net_with_branching': + config['training_params']['global_lr'] = 1e-5 + config['training_params']['branching_lr'] = 1e-5 + config['training_params']['epochs'] = 200 + elif job_name in ['trans', 'trans_calm_net', 'trans_calm_net_with_branching']: + config['model_params']['AE'] = 'trans' + config['training_params']['global_lr'] = 1e-5 + config['training_params']['branching_lr'] = 1e-5 + config['training_params']['epochs'] = 100 + config['training_params']['use_decoder'] = False + elif job_name == 'calm_net_no_cov': + config['training_params']['global_lr'] = 1e-5 + 
config['training_params']['branching_lr'] = 1e-5 + config['training_params']['epochs'] = 200 + config['training_params']['use_covariates'] = False + elif job_name in ['lm_net', 'lstm']: + config['training_params']['epochs'] = 200 + config['training_params']['use_decoder'] = False + elif "test_only" in job_name: + config["training_params"]["epochs"] = 1 + + print('Num branches:', config['model_params']['num_branches']) + return config + \ No newline at end of file diff --git a/src/experiments/get_split_stats.py b/src/experiments/get_split_stats.py new file mode 100644 index 0000000..9d130f6 --- /dev/null +++ b/src/experiments/get_split_stats.py @@ -0,0 +1,93 @@ +import sys +from copy import deepcopy + +from src.experiments.config import * +from src.utils.train_val_utils import * +import src.utils.tensorify as tensorify +import src.utils.data_conversion_utils as conversions + +if __name__ == '__main__': + # read command line arguments + job_name = sys.argv[1] # choice: 'test', 'calm_net', 'calm_net_with_branching' + split_name = sys.argv[2] # choice: ['5fold', 'loocv'] + num_branches = int(sys.argv[3]) # any interger + print('Job name:', job_name) + print('Type:', split_name) + + # use gpu if available + device = torch.device('cuda') if torch.cuda.is_available else torch.device('cpu') + print("Device: ", device) + + # load data + print('Loading Data...') + data_file_path = 'data/training_data/shuffled_splits/training_date_normalized_shuffled_splits_select_features_no_prev_stress_all_students.pkl' + data = read_data(data_file_path) + tensorified_data = tensorify.tensorify_data_gru_d(deepcopy(data), torch.cuda.is_available()) + student_list = conversions.extract_distinct_student_idsfrom_keys(data['data'].keys()) + + # load groups + clusters_name = 'all_in_one' # 'one_for_each', 'all_in_one' + print('The groups: ' + clusters_name) + groups_file_path = 'src/experiments/clustering/student_groups/' + clusters_name + '.pkl' + student_groups = read_data(groups_file_path) # 
student groups + + # check how students are distributed + print("student distribution: ") + rev_groups = dict() # map: group_id -> student_ids + for student in student_groups: + if rev_groups.get(student_groups[student]) != None: + rev_groups[student_groups[student]].append(student) + else: + rev_groups[student_groups[student]] = [student] + for group in rev_groups: + print(group + ': ' + str(rev_groups[group])) + + ############ SETTINGS ############ + use_historgram = True + first_key = next(iter(data['data'].keys())) + if use_historgram: + num_features = len(data['data'][first_key][4][0]) + else: + num_features = len(data['data'][first_key][0][0]) + num_covariates = len(data['data'][first_key][3]) # 3 is the covariate index + + # get split + days = [7, 14, 21, 28] + stats = dict() # map: day -> dictionary(map: student -> [train_num, val_num]) + for day in days: + stats[day] = dict() # map: student -> [train_num, val_num] + + print('Validation Type:', split_name) + splits = get_splits(split_name, data, student_groups, days_include=day) + + # get configurations for model and training + config = get_config(job_name, device, num_features, student_groups, num_branches) + model_params = config['model_params'] + training_params = config['training_params'] + + if training_params['use_covariates']: + model_params['shared_in_size'] += num_covariates + #################################### + + for split_no, split in enumerate(splits): + # if the days include exceed the max range of date of keys, skip + # this three lines only for loocv. 
A empty dict will be appended + if len(split['val_ids']) == 0: + continue + + for key in split['val_ids']: + student = int(key.split('_')[0]) + break + + train_num = 0 + for key in split['train_ids']: + curr_s = int(key.split('_')[0]) + if curr_s == student: + train_num += 1 + val_num = len(split['val_ids']) + + stats[day][student] = [train_num, val_num] + + # save results + with open('data/check/train_val_num_stats.pkl', 'wb') as f: + pickle.dump(stats, f) \ No newline at end of file diff --git a/src/experiments/layers.py b/src/experiments/layers.py new file mode 100644 index 0000000..04ab6d1 --- /dev/null +++ b/src/experiments/layers.py @@ -0,0 +1,274 @@ +import torch +from torch import nn + +from src.models.layers import * + +class autoencoder(nn.Module): + def __init__(self, AE_type, in_size, hidden_size, num_layers, device): + super().__init__() + self.device = device + self.hidden_size = hidden_size + self.AE_type = AE_type + + if AE_type == 'lstm': + # LSTM encoder + self.encoder = nn.LSTM( + input_size=in_size, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + bidirectional=False, + ) + elif AE_type == 'trans': + # Transformer encoder + self.encoder = nn.Sequential( + Transformer( + emb_size=hidden_size, # was 16 + num_heads=8, + dropout=0.1, + hidden_size=256, + add_norm=True, + # data related + in_channel=in_size, + seq_length=24, + ), + CheckShape(None, key=lambda x: (x, (None, None))) # match previous in/out pipeline + ) + + # original code below + self.encoder_act = nn.ReLU() + + self.decoder = nn.LSTM( + input_size=hidden_size, + hidden_size=in_size, + num_layers=num_layers, + batch_first=True, + ) + self.decoder_act = nn.Sigmoid() + + def forward(self, x, inds): + # inds: index of the samples + # x: (N, ?, D) + # decoder_out: (N, ?, D) + # bottle_neck (encoder_out[:, -1, :]): (N, H) + + bottle_neck = torch.zeros(len(inds), self.hidden_size).to(self.device) + out = list() + + for i in range(len(inds)): + if self.AE_type == 
'trans': + # encoder forward + curr_len = x[inds[i]].shape[1] + encoder_out, (hidden, cell) = self.encoder(x[inds[i]][:, abs(24-curr_len):, :]) + else: + encoder_out, (hidden, cell) = self.encoder(x[inds[i]]) + + encoder_out = self.encoder_act(encoder_out) + bottle_neck[i] = encoder_out[:, -1, :].squeeze() + + if self.AE_type == 'lstm': + # decoder forward, tentatively comment out for transformer + decoder_out, (hidden, cell) = self.decoder(encoder_out) + out.append(self.decoder_act(decoder_out)) + + return out, bottle_neck + +class branching(nn.Module): + def __init__(self, groups, num_branches, device): + super().__init__() + self.device = device + self.num_branches = num_branches + self.t = 10.0 + + self.groups = dict() # map: ids -> group_id + group_nodes = set() + for student in groups: + self.groups[student.split('_')[1]] = groups[student] + group_nodes.add(groups[student]) + + self.probabilities = dict() # map: str(ind) -> probability params + for group in group_nodes: + self.probabilities[group] = nn.Parameter(torch.ones(self.num_branches, device=self.device), requires_grad=True) + self.probabilities = nn.ParameterDict(self.probabilities) + + def forward(self, x, ids): + # x: (N, D) + # ids: (N, ) + # out: (N, P) + if self.t > 0.5: + self.t *= 0.98 + + N = x.shape[0] + + # fetch probability params + out = torch.zeros(N, self.num_branches).to(self.device) + for i in range(len(ids)): + out[i] = torch.log(self.probabilities[self.groups[ids[i]]]) + + # gumbel-trick softmax + eps = -torch.log(-torch.log(torch.rand((N, self.num_branches), device=self.device))) + out = torch.exp((out + eps) / self.t) + + # # vanilla softmax + # out = torch.exp(out) + + # final normalize step of softmax + return out / out.sum(dim=1).view(-1, 1) + +class branch_layer(nn.Module): + def __init__(self, num_branches, in_size, hidden_size, out_size, device): + super().__init__() + self.device = device + self.num_branches = num_branches + self.out_size = out_size + + self.branches = 
dict() # map: str(ind) -> sequential + for i in range(self.num_branches): + self.branches[str(i)] = nn.Sequential( + nn.Linear(in_size, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, self.out_size), + nn.ReLU() + ) + self.branches = nn.ModuleDict(self.branches) + + def forward(self, x, probabilities): + # x: (N, D) + # branch_out: (N, H_B) + # probabilities: (N, B) + # out: (N, H_B) + out = torch.zeros(x.shape[0], self.out_size).to(self.device) + for i in range(self.num_branches): + out += self.branches[str(i)](x) * probabilities[:, i].view(-1, 1) + return out + +class out_heads(nn.Module): + def __init__(self, groups, in_size, hidden_size, out_size, device, transfer_learn=False): + super().__init__() + self.device = device + self.transfer_learn = transfer_learn + self.hidden_size = hidden_size + self.out_size = out_size + self.groups = dict() # map: ids -> group_id + group_nodes = set() + for student in groups: + self.groups[student.split('_')[1]] = groups[student] + group_nodes.add(groups[student]) + self.num_groups = len([i for i in group_nodes]) + + self.out_heads = dict() # map: group_ids -> sequential + for group in group_nodes: + self.out_heads[group] = nn.Sequential( + nn.Linear(in_size, hidden_size), + nn.ReLU(), + ) + self.out_heads = nn.ModuleDict(self.out_heads) + + self.final_out = dict() # map: group_ids -> sequential + for group in group_nodes: + self.final_out[group] = nn.Sequential( + nn.Linear(hidden_size, out_size), + ) + self.final_out = nn.ModuleDict(self.final_out) + + + def forward(self, x, ids): + # x: (N, D) + # ids: (N, ) + out_size = self.hidden_size if self.transfer_learn else self.out_size + out = torch.zeros(x.shape[0], out_size).to(self.device) + for i in range(len(ids)): + curr_out = self.out_heads[self.groups[ids[i]]](x[i]) + if self.transfer_learn: + out[i] = curr_out + else: + out[i] = self.final_out[self.groups[ids[i]]](curr_out) + return out + +class out_heads_with_generic(nn.Module): + def __init__(self, groups, in_size, 
hidden_size, out_size, device, transfer_learn=False): + super().__init__() + self.device = device + self.transfer_learn = transfer_learn + self.hidden_size = hidden_size + self.out_size = out_size + self.groups = dict() # map: ids -> group_id + group_nodes = set() + for student in groups: + self.groups[student.split('_')[1]] = groups[student] + group_nodes.add(groups[student]) + self.num_groups = len([i for i in group_nodes]) + + self.out_heads = dict() # map: group_ids -> sequential + for group in group_nodes: + self.out_heads[group] = nn.Sequential( + nn.Linear(in_size, hidden_size), + nn.ReLU(), + ) + self.out_heads = nn.ModuleDict(self.out_heads) + + self.final_out = dict() # map: group_ids -> sequential + for group in group_nodes: + self.final_out[group] = nn.Sequential( + nn.Linear(hidden_size, out_size), + ) + self.final_out = nn.ModuleDict(self.final_out) + + self.generic_out = nn.Sequential( + nn.Linear(in_size, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, out_size), + ) + + def forward(self, x, ids): + # x: (N, D) + # ids: (N, ) + out_size = self.hidden_size if self.transfer_learn else self.out_size + out = torch.zeros(x.shape[0], out_size).to(self.device) + gen_out = torch.zeros(x.shape[0], out_size).to(self.device) + for i in range(len(ids)): + gen_out[i] = self.generic_out(x[i]) + curr_out = self.out_heads[self.groups[ids[i]]](x[i]) + if self.transfer_learn: + out[i] = curr_out + else: + out[i] = self.final_out[self.groups[ids[i]]](curr_out) + return out, gen_out + +class personal_head(nn.Module): + def __init__(self, num_heads, in_size, hidden_size, out_size, device): + super().__init__() + self.t = 10.0 + self.device = device + self.num_heads = num_heads + self.probabilities = nn.Parameter(torch.ones(num_heads, device=self.device), requires_grad=True) + + self.liner = nn.Sequential( + nn.Linear(in_size, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, out_size), + ) + + def forward(self, x): + # branching + eps = 
-torch.log(-torch.log(torch.rand(self.probabilities.shape, device=self.device))) + prob = torch.exp((torch.log(self.probabilities) + eps) / self.t) + prob = (prob / prob.sum()).view(1, -1, 1) + out = (x * prob).sum(dim=1) + + # forward + return self.liner(out) + +# others +class GTS_Linear(nn.Module): # gumbel-trick softmax linear unit + def __init__(self, in_size, out_size, t=1e1): + super().__init__() + self.t = t + self.W = nn.Parameter(torch.ones(in_size, out_size), requires_grad=True) + + def forward(self, x): + eps = -torch.log(-torch.log(torch.rand(self.W.shape))) + LOG_W = torch.log(self.W) + eps + EXP_W = torch.exp(LOG_W / self.t) + W = EXP_W / EXP_W.sum(dim=1).view(-1, 1) + return torch.matmul(x, W) \ No newline at end of file diff --git a/src/experiments/location_mlp.py b/src/experiments/location_mlp.py new file mode 100644 index 0000000..568d00d --- /dev/null +++ b/src/experiments/location_mlp.py @@ -0,0 +1,284 @@ +import tqdm +import pickle +import sys + +import numpy as np +import torch +from torch import nn +from sklearn.model_selection import StratifiedKFold +from sklearn import metrics + +from src.utils.train_val_utils import * + +SPLITTER_RANDOM_STATE = 100 + +def cv(student_data, n_splits=5): + data = list() + samples = list() + labels = list() + stratification_column = list() + for s in student_data: + for i in range(len(student_data[s]['features'])): + samples.append(student_data[s]['features'][i]) + labels.append(student_data[s]['labels'][i]) + stratification_column.append('{}_{}'.format(s, student_data[s]['labels'][i])) + + samples = np.array(samples) + labels = np.array(labels) + stratification_column = np.array(stratification_column) + splitter = StratifiedKFold(n_splits=n_splits, random_state=SPLITTER_RANDOM_STATE) + for train_index, val_index in splitter.split(X=samples, y=stratification_column): + curr_fold = { + 'train_data': samples[train_index], + 'train_labels': labels[train_index], + 'val_data': samples[val_index], + 'val_labels': 
labels[val_index], + } + + for i in curr_fold: + print(curr_fold[i].shape) + print('++++++++++++++++++++++++++++') + + data.append(curr_fold) + return data + +def loocv(student_data, key_data, days_include=0): + data = list() + splits = cross_val.leave_one_subject_out_split(key_data, days_include=days_include) + for split in splits: + curr_fold = { + 'train_data': np.array([key_data['data'][split['train_ids'][0]]['sample']]), + 'train_labels': np.array([key_data['data'][split['train_ids'][0]]['label']]), + 'val_data': np.array([key_data['data'][split['val_ids'][0]]['sample']]), + 'val_labels': np.array([key_data['data'][split['val_ids'][0]]['label']]) + } + for i in range(1, len(split['train_ids'])): + curr_fold['train_data'] = np.concatenate((curr_fold['train_data'], np.array([key_data['data'][split['train_ids'][i]]['sample']])), axis=0) + # print(curr_fold['train_labels']) + # print(key_data['data'][split['train_ids'][i]]['label']) + curr_fold['train_labels'] = np.concatenate((curr_fold['train_labels'], np.array([key_data['data'][split['train_ids'][i]]['label']])), axis=0) + for i in range(1, len(split['val_ids'])): + curr_fold['val_data'] = np.concatenate((curr_fold['val_data'], np.array([key_data['data'][split['val_ids'][i]]['sample']])), axis=0) + curr_fold['val_labels'] = np.concatenate((curr_fold['val_labels'], np.array([key_data['data'][split['val_ids'][i]]['label']])), axis=0) + # for s in student_data: + # curr_fold = { + # 'train_data': None, + # 'train_labels': None, + # 'val_data': None, + # 'val_labels': None + # } + + # for other_s in student_data: + # if other_s == s: + # loo_train_keys, loo_val_keys = get_first_n_data(key_data[s], days_include) + # if curr_fold['train_data'] is None: + # curr_fold['train_data'] = student_data[other_s]['features'] + # curr_fold['train_labels'] = student_data[other_s]['labels'] + # if curr_fold['train_data'] is None: + # curr_fold['train_data'] = student_data[other_s]['features'] + # curr_fold['train_labels'] = 
student_data[other_s]['labels'] + # else: + # curr_fold['train_data'] = np.concatenate((curr_fold['train_data'], student_data[other_s]['features']), axis=0) + # curr_fold['train_labels'] = np.concatenate((curr_fold['train_labels'], student_data[other_s]['labels']), axis=0) + for i in curr_fold: + print(curr_fold[i].shape) + print('++++++++++++++++++++++++++++') + print(' ') + data.append(curr_fold) + # exit() + return data + +def load_data(days_include=0): + # student_data = { + # 'student_1': { + # 'features': np.ones((5, 12)), + # 'labels': np.ones(5) + # }, + # 'student_2': { + # 'features': np.ones((5, 12)), + # 'labels': np.ones(5) + # }, + # 'student_3': { + # 'features': np.ones((5, 12)), + # 'labels': np.ones(5) + # } + # } + with open("data/location_data/gatis-new-23.pkl", 'rb') as f: + all_data = pickle.load(f) + + student_data = dict() + key_data = {'data': dict()} + students = set([s for s in all_data["student_id"]]) + for s in students: + student_data[s] = dict() + curr_data = all_data.loc[all_data['student_id'] == s].sort_values(by="time").to_numpy() + student_data[s]["features"] = curr_data[:, 1:13].astype('float') + student_data[s]["labels"] = curr_data[:, 13].astype('int') + student_data[s]["times"] = curr_data[:, 0].astype('str') + for i in range(len(student_data[s]["features"])): + times = student_data[s]["times"][i].split()[0].split('-')[1:] + key_data['data']["{}_{}_{}".format(s, int(times[0]), int(times[1]))] = { + 'sample': student_data[s]["features"][i], + 'label': student_data[s]["labels"][i] + } + + # return loocv(student_data, key_data, days_include=days_include) + return cv(student_data) + +class LocationMLP(nn.Module): + def __init__(self, device): + super().__init__() + + self.fc_liner = nn.Sequential( + nn.Linear(12, 57), + nn.Tanh(), + nn.BatchNorm1d(57), + nn.Dropout(p=0.35), + nn.Linear(57, 35), + nn.Tanh(), + nn.BatchNorm1d(35), + nn.Dropout(p=0.25), + nn.Linear(35, 35), + nn.Tanh(), + nn.BatchNorm1d(35), + nn.Dropout(p=0.15), + 
nn.Linear(35, 3), + nn.Softmax(dim=1), + nn.BatchNorm1d(3), + ) + + self.loss_func = torch.nn.CrossEntropyLoss(weight=torch.tensor([0.6456, 0.5635, 1.0000], device=device) + ) + + def forward(self, x): + return self.fc_liner(x) + + def loss(self, samples, labels): + out = self.forward(samples) + return self.loss_func(out, labels) + + def predict(self, samples): + return self.forward(samples).argmax(dim=1) + +# ================= SPLIT LINE ==================================================== + +def train_val( + model, + optimizer, + train_data, + train_labels, + val_data, + val_labels, + epochs, + batch_size + ): + + saved_records = { + 'val_f1': { + 'micro': list(), + 'macro': list(), + 'weighted': list() + }, + 'val_auc': { + 'micro': list(), + 'macro': list(), + 'weighted': list() + }, + 'outputs': list(), + 'confmats': list(), + } + + train_inds = [i for i in range(len(train_data))] + val_inds = [i for i in range(len(val_data))] + + for e in tqdm.tqdm(range(epochs)): + # train + model.train() + batch_inds = get_mini_batchs(batch_size, train_inds) + for batch_ind in batch_inds: + if len(batch_ind) == 1: + continue + # forward + loss = model.loss(train_data[batch_ind], train_labels[batch_ind]) + + # backpropagation + model.zero_grad() + loss.backward() + optimizer.step() + + # validation + model.eval() + y_pred = list() + batch_inds = get_mini_batchs(batch_size, val_inds) + for batch_ind in batch_inds: + y_pred += model.predict(val_data[batch_ind]).cpu().detach().numpy().tolist() + saved_records['outputs'].append(model(val_data).cpu().detach().numpy()) + saved_records['confmats'].append(metrics.confusion_matrix(val_labels, y_pred, labels=[0, 1, 2])) + + # evaluate + for avg_type in ['micro', 'macro', 'weighted']: + saved_records['val_f1'][avg_type].append(eval_f1_score(y_pred, val_labels, avg_type)) + saved_records['val_auc'][avg_type].append(eval_auc_score(saved_records['outputs'][-1], val_labels, [[0], [1], [2]], avg_type)) + print("best weighted val f1", 
def run(data, device, remark):
    """Train and validate a fresh ``LocationMLP`` on every fold and pickle the records.

    @param data: Iterable of fold dicts, each providing 'train_data',
        'train_labels', 'val_data' and 'val_labels'.
    @param device: torch device the tensors and the model are moved to.
    @param remark: Suffix used in the output file name.
    @return: None. The list of per-fold records returned by ``train_val`` is
        written to ``data/cross_val_scores/location_mlp_5fold_0_0_<remark>.pkl``.
    """
    records = list()

    # config hyper-param
    epochs = 300
    batch_size = 32
    for fold in data:
        train_data = torch.Tensor(fold['train_data']).to(device).float()
        train_labels = torch.Tensor(fold['train_labels']).to(device).long()
        val_data = torch.Tensor(fold['val_data']).to(device).float()
        # Validation labels are passed to train_val as stored in the fold
        # (not converted to a tensor).
        val_labels = fold['val_labels']

        print('train shape', train_data.shape, train_labels.shape)
        print('val shape', val_data.shape, val_labels.shape)

        model = LocationMLP(device).to(device)

        # Alternative previously tried:
        # optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.98)

        records.append(train_val(
            model,
            optimizer,
            train_data,
            train_labels,
            val_data,
            val_labels,
            epochs,
            batch_size
        ))

    # save results
    with open('data/cross_val_scores/location_mlp_5fold_0_0_{}.pkl'.format(remark), 'wb') as f:
        pickle.dump(records, f)


if __name__ == '__main__':
    # Use the GPU only when one is really available. `is_available` has to be
    # *called*: the bare function object is always truthy, which would select
    # 'cuda' even on CPU-only machines.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("Device: ", device)

    # start train-val process
    days_include = int(sys.argv[1])  # 7, 14, 21, 28
    data = load_data(days_include=days_include)

    # Repeat the whole cross-validation 10 times; the repetition index becomes
    # the remark in the saved file name.
    for i in range(10):
        run(data, device, str(i))
+import torch +from torch import nn + +from src.experiments.layers import * + +class MultitaskAutoencoder(nn.Module): + def __init__(self, params, use_covariates): + super().__init__() + self.transfer_learn = False + self.with_generic_head = False + self.use_covariates = use_covariates + + self.params = params + self.autoencoder = autoencoder( + AE_type=params['AE'], + in_size=params['in_size'], + hidden_size=params['AE_hidden_size'], + num_layers=params['AE_num_layers'], + device=params['device'], + ) + + self.branching = branching( + groups=params['groups'], + num_branches=params['num_branches'], + device=params['device'], + ) + + self.branch_layer = branch_layer( + num_branches=params['num_branches'], + in_size=params['shared_in_size'], + hidden_size=params['shared_hidden_size'], + out_size=params['shared_hidden_size'] // 2, + device=params['device'], + ) + + self.out_heads = out_heads( + groups=params['groups'], + in_size=params['shared_hidden_size'] // 2, + hidden_size=params['heads_hidden_size'], + out_size=params['num_classes'], + device=params['device'], + ) + + # if self.with_generic_head: + # self.out_heads = out_heads_with_generic( + # groups=params['groups'], + # in_size=params['shared_hidden_size'] // 2, + # hidden_size=params['heads_hidden_size'], + # out_size=params['num_classes'], + # device=params['device'], + # ) + # else: + # self.out_heads = out_heads( + # groups=params['groups'], + # in_size=params['shared_hidden_size'] // 2, + # hidden_size=params['heads_hidden_size'], + # out_size=params['num_classes'], + # device=params['device'], + # ) + + if self.with_generic_head: + self.generic_out = nn.Sequential( + nn.Linear(params['shared_hidden_size'] // 2, params['heads_hidden_size']), + nn.ReLU(), + nn.Linear(params['heads_hidden_size'], params['num_classes']), + ) + + # if with prob param + self.generic_prob = nn.Parameter(torch.ones((1, params['num_branches']), device=params['device']), requires_grad=True) + + def forward(self, x, inds, ids, 
covariate_data): + # autoencoder forward + AE_out, bottle_neck = self.autoencoder(x, inds) + if covariate_data is not None and self.use_covariates: + bottle_neck = torch.cat((bottle_neck, covariate_data), dim=1) + + if self.with_generic_head: + # generic_out = self.generic_out(bottle_neck) + + # if with prob param + eps = -torch.log(-torch.log(torch.rand((1, self.params['num_branches']), device=self.params['device']))) # gumbel noise + gen_prob = torch.log(self.generic_prob) # take log + gen_prob = torch.exp((gen_prob + eps) / self.branching.t) # softmax + gen_prob /= gen_prob.sum(dim=1).view(-1, 1) + generic_branch_out = self.branch_layer(bottle_neck, gen_prob) # take input from branches + + generic_out = self.generic_out(generic_branch_out) # final generic out + + # select branches + if not self.transfer_learn: + probabilities = self.branching(bottle_neck, ids) + + # branches forward + branch_out = self.branch_layer(bottle_neck, probabilities) + final_out = self.out_heads(branch_out, ids) + + # if not self.with_generic_head: + # final_out = self.out_heads(branch_out, ids) + # else: + # final_out, generic_out = self.out_heads(branch_out, ids) + + # for transfer learning + else: + B = self.downstream_layers.num_heads + H = self.params['heads_hidden_size'] + out = torch.zeros(bottle_neck.shape[0], B, H).to(self.autoencoder.device) + # extract all the existing key of heads + id_ = [key for key in self.branching.groups] + + # get output from each head one-by-one + ind = 0 + for i in id_: + ids = [i for _ in range(bottle_neck.shape[0])] + + probabilities = self.branching(bottle_neck, ids) + + # branches forward + branch_out = self.branch_layer(bottle_neck, probabilities) + final_out = self.out_heads(branch_out, ids) + out[:, ind, :] = final_out + ind += 1 + + final_out = self.downstream_layers(out) + + # return + if not self.with_generic_head: + return final_out, AE_out + else: + return final_out, AE_out, generic_out + + def set_transfer_learn(self, downstream_layers): 
class LocationMLP(nn.Module):
    """Feed-forward network mapping 12 input features to 3 output scores.

    Three ``Linear -> Tanh -> BatchNorm1d -> Dropout`` stages (12->57, 57->35,
    35->35) are followed by ``Linear(35, 3) -> Softmax(dim=1) -> BatchNorm1d(3)``.
    """

    def __init__(self):
        super().__init__()

        # Hidden stages, flattened in declaration order so the Sequential
        # indices (and therefore the state_dict keys) stay 0..11.
        hidden_layers = [
            layer
            for n_in, n_out, drop_p in ((12, 57, 0.35), (57, 35, 0.25), (35, 35, 0.15))
            for layer in (
                nn.Linear(n_in, n_out),
                nn.Tanh(),
                nn.BatchNorm1d(n_out),
                nn.Dropout(p=drop_p),
            )
        ]
        output_layers = [nn.Linear(35, 3), nn.Softmax(dim=1), nn.BatchNorm1d(3)]

        self.fc_liner = nn.Sequential(*hidden_layers, *output_layers)

    def forward(self, x):
        """Return the output of the layer stack for the batch ``x``."""
        result = self.fc_liner(x)
        return result
available + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + # device = torch.device('cpu') + print("Device: ", device) + + # ====== LOAD STUDENTLIFE DATA =================================== + + # load data + print('Loading Data...') + data_file_path = 'data/training_data/shuffled_splits/training_date_normalized_shuffled_splits_select_features_no_prev_stress_all_students.pkl' + data = read_data(data_file_path) + tensorified_data = tensorify.tensorify_data_gru_d(deepcopy(data), torch.cuda.is_available()) + student_list = conversions.extract_distinct_student_idsfrom_keys(data['data'].keys()) + + # load groups + clusters_name = clusters_name # 'one_for_each', 'all_in_one', 'pre_survey_scores_7 + print('The groups: ' + clusters_name) + groups_file_path = 'src/experiments/clustering/student_groups/' + clusters_name + '.pkl' + student_groups = read_data(groups_file_path) # student groups + + # ================================================================ + + # ====== LOAD SAMPLE DATA ======================================== + + # with open("data/training_data/sample_input.pkl", "rb") as f: + # data = pickle.load(f) + # tensorified_data = data + # student_groups = data["one_for_each"] + + # ================================================================ + + # check how students are distributed + print("student distribution: ") + rev_groups = dict() # map: group_id -> student_ids + for student in student_groups: + if rev_groups.get(student_groups[student]) != None: + rev_groups[student_groups[student]].append(student) + else: + rev_groups[student_groups[student]] = [student] + for group in rev_groups: + print(group + ': ' + str(rev_groups[group])) + + ############ SETTINGS ############ + use_historgram = True + first_key = next(iter(data['data'].keys())) + if use_historgram: + num_features = len(tensorified_data['data'][first_key][2][0]) + else: + num_features = len(tensorified_data['data'][first_key][0][0]) + num_covariates = 
len(tensorified_data['data'][first_key][1]) # 1 is the covariate index + + # get split + print('Validation Type:', split_name) + splits = get_splits(split_name, data, student_groups, days_include=days_include) + # print(splits) + + # get configurations for model and training + config = get_config(job_name, device, num_features, student_groups, num_branches) + model_params = config['model_params'] + training_params = config['training_params'] + + if training_params['use_covariates']: + model_params['shared_in_size'] += num_covariates + #################################### + + # ======== FOR UPWEIGHT ====================================== + # # start training + # student_id_list = [4, 7, 8, 10, 14, 16, 17, 19, 22, 23, 24, 32, 33, 35, 36, 43, 44, 49, 51, 52, 53, 57, 58] + # # chosen_student = [4, 7, 8, 16, 22, 52] + # chosen_student = student_id_list + + # # for determine up-weight coefficient + # if split_name == 'loocv' and days_include > 0: + # with open('data/check/train_val_num_stats.pkl', 'rb') as f: + # train_val_stats = pickle.load(f)[days_include] + + # ============================================================ + + saved_records = list() # list of results from each split + for split_no, split in enumerate(splits): + # if the days include exceed the max range of date of keys, skip + # this three lines only for loocv. 
A empty dict will be appended + if len(split['val_ids']) == 0: + saved_records.append(dict()) + continue + + print("Split No: ", split_no) + + # fetch train and val data + tensorified_data['train_ids'] = split['train_ids'] + tensorified_data['val_ids'] = split['val_ids'] + + # fetch leaved out student + if split_name == 'loocv' and days_include > 0: + leaved_student = split['val_ids'][0].split('_')[0] + + # # for upweight + # if int(leaved_student) not in chosen_student: + # continue + + # # determine up-weight coefficient + # leaved_train = train_val_stats[int(leaved_student)][0] # #of train from leaved_student + # rest_train = len(split['train_ids']) - leaved_train + # up_weight_k = int(rest_train / leaved_train) + + up_weight_k = 1.0 + else: # 5-fold + leaved_student = -1 + up_weight_k = 1.0 + + # Training and Validation + curr_record = train_and_val( + data=tensorified_data, + model_params=model_params, + training_params=training_params, + leaved_student=leaved_student, + up_weight_k=up_weight_k, + ) + curr_record['model'] = None # tentative, since EmbConvBlock contain lambda. 
import sys
from copy import deepcopy

from src.experiments.config import *
from src.utils.train_val_utils import *
import src.utils.tensorify as tensorify
import src.utils.data_conversion_utils as conversions

if __name__ == '__main__':
    # Leave-one-student-out experiment with a single shared ('all_in_one')
    # group, where the number of training samples of the left-out student
    # determines an up-weighting coefficient.

    # read command line arguments
    job_name = sys.argv[1]  # choice: 'test', 'calm_net', 'calm_net_with_branching'
    split_name = sys.argv[2]  # choice: ['5fold', 'loocv']
    num_branches = int(sys.argv[3])  # any integer
    days_include = int(sys.argv[4])  # any integer
    print('Job name:', job_name)
    print('Type:', split_name)

    # Use the GPU only when one is really available. `is_available` has to be
    # *called*: the bare function object is always truthy.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("Device: ", device)

    # load data
    print('Loading Data...')
    data_file_path = 'data/training_data/shuffled_splits/training_date_normalized_shuffled_splits_select_features_no_prev_stress_all_students.pkl'
    data = read_data(data_file_path)
    tensorified_data = tensorify.tensorify_data_gru_d(deepcopy(data), torch.cuda.is_available())
    student_list = conversions.extract_distinct_student_idsfrom_keys(data['data'].keys())

    # load groups
    clusters_name = 'all_in_one'  # 'one_for_each', 'all_in_one'
    print('The groups: ' + clusters_name)
    groups_file_path = 'src/experiments/clustering/student_groups/' + clusters_name + '.pkl'
    student_groups = read_data(groups_file_path)  # student groups

    # check how students are distributed
    print("student distribution: ")
    rev_groups = dict()  # map: group_id -> student_ids
    for student in student_groups:
        rev_groups.setdefault(student_groups[student], []).append(student)
    for group in rev_groups:
        print(group + ': ' + str(rev_groups[group]))

    ############ SETTINGS ############
    use_historgram = True
    first_key = next(iter(data['data'].keys()))
    if use_historgram:
        num_features = len(data['data'][first_key][4][0])
    else:
        num_features = len(data['data'][first_key][0][0])
    num_covariates = len(data['data'][first_key][3])  # 3 is the covariate index

    # get split
    print('Validation Type:', split_name)
    splits = get_splits(split_name, data, student_groups, days_include=days_include)

    # get configurations for model and training
    config = get_config(job_name, device, num_features, student_groups, num_branches)
    model_params = config['model_params']
    training_params = config['training_params']

    if training_params['use_covariates']:
        model_params['shared_in_size'] += num_covariates
    ####################################

    # start training
    student_id_list = [4, 7, 8, 10, 14, 16, 17, 19, 22, 23, 24, 32, 33, 35, 36, 43, 44, 49, 51, 52, 53, 57, 58]
    # chosen_student = [4, 7, 8, 16, 22, 52]
    chosen_student = student_id_list
    with open('data/check/train_val_num_stats.pkl', 'rb') as f:
        train_val_stats = pickle.load(f)[days_include]

    saved_records = list()  # list of results from each split
    for split_no, split in enumerate(splits):
        # If days_include exceeds the date range of the keys the split has no
        # validation ids (loocv only); record an empty dict as a placeholder.
        if len(split['val_ids']) == 0:
            saved_records.append(dict())
            continue

        print("Split No: ", split_no)

        # fetch train and val data
        tensorified_data['train_ids'] = split['train_ids']
        tensorified_data['val_ids'] = split['val_ids']

        # The student id is the prefix of the first validation key.
        leaved_student = split['val_ids'][0].split('_')[0]
        if int(leaved_student) not in chosen_student:
            continue

        # Up-weight coefficient: ratio of the other students' training samples
        # to the left-out student's own training samples.
        leaved_train = train_val_stats[int(leaved_student)][0]  # number of train samples from leaved_student
        rest_train = len(split['train_ids']) - leaved_train
        up_weight_k = int(rest_train / leaved_train)

        # Training and Validation
        saved_records.append(train_and_val(
            data=tensorified_data,
            model_params=model_params,
            training_params=training_params,
            leaved_student=leaved_student,
            up_weight_k=up_weight_k,
        ))

    # save results
    # NOTE(review): `chosen_student` is a list, so its repr ends up in the file
    # name -- confirm this is the intended naming.
    saved_filename = 'data/cross_val_scores/{}_{}_{}_{}_{}.pkl'.format(job_name, split_name, num_branches, days_include, chosen_student)
    with open(saved_filename, 'wb') as f:
        pickle.dump(saved_records, f)
import sys
from copy import deepcopy

from src.experiments.config import *
from src.utils.train_val_utils import *
import src.utils.tensorify as tensorify
import src.utils.data_conversion_utils as conversions
from src.experiments.layers import *

if __name__ == '__main__':
    # Transfer-learning experiment: for every split, pre-train on all students
    # except the validation student, attach a personal head, then fine-tune on
    # the validation student's own training samples.

    # read command line arguments
    job_name = sys.argv[1]  # choice: 'test', 'calm_net', 'calm_net_with_branching'
    split_name = sys.argv[2]  # choice: ['5fold', 'loocv']
    num_branches = int(sys.argv[3])  # any integer
    print('Job name:', job_name)
    print('Type:', split_name)

    # Use the GPU only when one is really available. `is_available` has to be
    # *called*: the bare function object is always truthy.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("Device: ", device)

    # load data
    print('Loading Data...')
    data_file_path = 'data/training_data/shuffled_splits/training_date_normalized_shuffled_splits_select_features_no_prev_stress_all_students.pkl'
    data = read_data(data_file_path)
    tensorified_data = tensorify.tensorify_data_gru_d(deepcopy(data), torch.cuda.is_available())
    student_list = conversions.extract_distinct_student_idsfrom_keys(data['data'].keys())

    # load groups
    clusters_name = 'one_for_each'
    print('The groups: ' + clusters_name)
    groups_file_path = 'src/experiments/clustering/student_groups/' + clusters_name + '.pkl'
    student_groups = read_data(groups_file_path)  # student groups

    # check how students are distributed
    print("student distribution: ")
    rev_groups = dict()  # map: group_id -> student_ids
    for student in student_groups:
        rev_groups.setdefault(student_groups[student], []).append(student)
    for group in rev_groups:
        print(group + ': ' + str(rev_groups[group]))

    ############ SETTINGS ############
    use_historgram = True
    first_key = next(iter(data['data'].keys()))
    if use_historgram:
        num_features = len(data['data'][first_key][4][0])
    else:
        num_features = len(data['data'][first_key][0][0])
    num_covariates = len(data['data'][first_key][3])  # 3 is the covariate index

    # get split
    print('Validation Type:', split_name)
    splits = get_splits(split_name, data, student_groups)

    # get configurations for model and training
    config = get_config(job_name, device, num_features, student_groups, num_branches)
    model_params = config['model_params']
    training_params = config['training_params']

    if training_params['use_covariates']:
        model_params['shared_in_size'] += num_covariates
    ####################################

    # start training
    saved_records = list()  # list of results from each split
    for split_no, split in enumerate(splits):
        print("Split No: ", split_no)

        # The student id is the prefix of the first validation key.
        val_student = split['val_ids'][0].split('_')[0]
        print('leaved student:', val_student)

        # Temporarily remove the left-out student's group; it is put back at
        # the end of this iteration.
        key = 'student_{}'.format(val_student)
        val = model_params['groups'].pop(key)

        # Pre-training uses every other student's samples. A single training
        # id doubles as the validation set for this phase.
        tensorified_data['train_ids'] = [i for i in split['train_ids'] if i.split('_')[0] != val_student]
        tensorified_data['val_ids'] = tensorified_data['train_ids'][:1]

        print('Pre-Training...')
        pre_record = train_and_val(
            data=tensorified_data,
            model_params=model_params,
            training_params=training_params,
        )

        # assemble downstream task
        # NOTE(review): num_heads is hard-coded to 22 -- presumably the number
        # of groups left after removing one student; confirm against the data.
        downstream_layers = personal_head(
            num_heads=22,
            in_size=model_params['heads_hidden_size'],
            hidden_size=model_params['heads_hidden_size'],
            out_size=model_params['num_classes'],
            device=model_params['device'],
        )

        pre_record['model'].set_transfer_learn(downstream_layers)

        # Fine-tuning uses only the left-out student's own training samples.
        tensorified_data['train_ids'] = [i for i in split['train_ids'] if i.split('_')[0] == val_student]
        tensorified_data['val_ids'] = split['val_ids']

        print('Fine-Tuning...')
        saved_records.append(train_and_val(
            data=tensorified_data,
            model_params=model_params,
            training_params=training_params,
            pre_record=pre_record,
        ))

        # retrieve the deleted key
        model_params['groups'][key] = val

    # save results
    saved_filename = 'data/cross_val_scores/transfer_learn_{}_{}_{}.pkl'.format(job_name, split_name, num_branches)
    with open(saved_filename, 'wb') as f:
        pickle.dump(saved_records, f)
def search_best_params_for_experiment(experiment, data: dict):
    """Run the grid search that belongs to ``experiment``.

    Only "multitask_learner_auto_encoder" is handled; for any other name the
    function returns without doing anything.

    @param experiment: Name of the experiment section in the grid-search config.
    @param data: Raw data dictionary handed on to the search routine.
    @return: None.
    """
    if experiment != "multitask_learner_auto_encoder":
        return

    candidate_configs = get_hyper_parameter_list_for_grid_search(experiment)
    search_multitask_auto_encoder(candidate_configs, data)
tensorified_data['test_ids'] = [] + + model, reconstruction_criterion, classification_criterion, optimizer = helper.init_multitask_autoencoder_learner( + num_features, + autoencoder_bottle_neck_feature_size, + autoencoder_num_layers, + shared_hidden_layer_size, + user_dense_layer_hidden_size, + num_classes, + num_covariates, + shared_layer_dropout_prob, + user_head_dropout_prob, + learning_rate, + decay, + class_weights, + student_list) + + total_loss_over_epochs, scores_over_epochs = plotting.get_empty_stat_over_n_epoch_dictionaries() + reconstruction_loss_over_epochs = copy.deepcopy(total_loss_over_epochs) + classification_loss_over_epochs = copy.deepcopy(total_loss_over_epochs) + + for epoch in tqdm.tqdm(range(n_epochs)): + + (train_total_loss, + train_total_reconstruction_loss, + train_total_classification_loss, + train_labels, + train_preds, + train_users), (val_total_loss, + val_total_reconstruction_loss, + val_total_classification_loss, + val_labels, + val_preds, + val_users) = helper.train_for_one_epoch(tensorified_data, + num_classes, + model, + reconstruction_criterion, + classification_criterion, + device, + optimizer, + alpha, + beta, + use_histogram) + + ######## Appending losses ######## + total_loss_over_epochs['train_loss'].append(train_total_loss) + total_loss_over_epochs['val_loss'].append(val_total_loss) + + reconstruction_loss_over_epochs['train_loss'].append(train_total_reconstruction_loss) + reconstruction_loss_over_epochs['val_loss'].append(val_total_reconstruction_loss) + + classification_loss_over_epochs['train_loss'].append(train_total_classification_loss) + classification_loss_over_epochs['val_loss'].append(val_total_classification_loss) + + ######## Appending Metrics ######## + train_label_list = conversions.tensor_list_to_int_list(train_labels) + train_pred_list = conversions.tensor_list_to_int_list(train_preds) + val_label_list = conversions.tensor_list_to_int_list(val_labels) + val_pred_list = 
def run_grid_search():
    """Load the configured training pickle and grid-search the multitask autoencoder.

    The data file is ``<DATA_DIR>/training_data/shuffled_splits/<TRAINING_DATA_FILE_NAME>``.
    """
    search_best_params_for_experiment(
        "multitask_learner_auto_encoder",
        read_utils.read_pickle(
            os.path.join(definitions.DATA_DIR, 'training_data/shuffled_splits', TRAINING_DATA_FILE_NAME)
        ),
    )


# NOTE(review): this call runs at import time, so merely importing the module
# starts a full grid search -- consider an `if __name__ == '__main__':` guard.
run_grid_search()
def get_params_from_model(params, data):
    """Unpack one grid-search configuration into the flat tuple used by the search loop.

    Most values are read straight from ``params``. ``num_features`` and
    ``num_covariates`` are measured on the first entry of ``data['data']``,
    and ``device`` is chosen from CUDA availability.

    @param params: Dictionary holding one hyper-parameter configuration.
    @param data: Raw data dictionary; ``data['data']`` maps keys to per-sample
        sequences from which the feature and covariate sizes are read.
    @return: Tuple ``(use_histogram, autoencoder_bottle_neck_feature_size,
        autoencoder_num_layers, alpha, beta, decay, num_features,
        num_covariates, shared_hidden_layer_size, user_dense_layer_hidden_size,
        num_classes, learning_rate, n_epochs, shared_layer_dropout_prob,
        user_head_dropout_prob, class_weights, device)``.
    """
    use_histogram = params['use_histogram']
    autoencoder_bottle_neck_feature_size = params['autoencoder_bottle_neck_feature_size']
    autoencoder_num_layers = params['autoencoder_num_layers']
    alpha, beta = params['alpha'], params['beta']
    decay = params['decay']

    # Feature width comes from slot 4 of the first sample when histograms are
    # used, from slot 0 otherwise.
    first_key = next(iter(data['data'].keys()))
    if use_histogram:
        num_features = len(data['data'][first_key][4][0])
    else:
        num_features = len(data['data'][first_key][0][0])

    num_covariates = len(data['data'][first_key][definitions.COVARIATE_DATA_IDX])
    shared_hidden_layer_size = params['shared_hidden_layer_size']
    user_dense_layer_hidden_size = params['user_dense_layer_hidden_size']
    num_classes = params['num_classes']
    learning_rate = params['learning_rate']
    n_epochs = params['n_epochs']
    shared_layer_dropout_prob = params["shared_layer_dropout_prob"]
    user_head_dropout_prob = params["user_head_dropout_prob"]
    class_weights = params['class_weights']

    # `is_available` has to be *called*: the bare function object is always
    # truthy and would select 'cuda' even on CPU-only machines.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    return (use_histogram,
            autoencoder_bottle_neck_feature_size,
            autoencoder_num_layers,
            alpha, beta,
            decay,
            num_features,
            num_covariates,
            shared_hidden_layer_size,
            user_dense_layer_hidden_size,
            num_classes,
            learning_rate,
            n_epochs,
            shared_layer_dropout_prob,
            user_head_dropout_prob,
            class_weights,
            device)
class AdditiveAlignment(nn.Module):
    """
    Align function for additive attention (aka concat attention).
    --------------------------------------------------------------------------
    s_t: The decoder hidden state used to calculate the energy against every encoder
    output. Initially this is the last hidden state of the Encoder.
    --------------------------------------------------------------------------
    Equation - score(s_t, h_i) = v_a^T * tanh(W_a[s_t; h_i])
    """

    def __init__(self, encoder_output_size, context_vector_size):
        """
        @param encoder_output_size: Size of the last dimension of the encoder outputs.
        @param context_vector_size: Size of the decoder hidden state (context vector).
        """
        super(AdditiveAlignment, self).__init__()
        self.encoder_output_size = encoder_output_size
        self.context_vector_size = context_vector_size

        # The scoring layer maps the concatenated [s_t; h_i] to a vector of context_vector_size.
        self.score = nn.Linear(self.encoder_output_size + self.context_vector_size,
                               self.context_vector_size)

        # v = [context_vector_dim, 1]; it reduces every energy vector to a single score.
        self.v = nn.Parameter(torch.rand(self.context_vector_size, 1), requires_grad=True)

    def forward(self, encoder_outputs, decoder_hidden_state):
        """
        @param encoder_outputs(batch_size, seq_len, encoder_output_dim): The encoder outputs
        to score against the decoder hidden state.
        @param decoder_hidden_state(batch_size, context_vector_dim): Last hidden state of
        the decoder.
        @return(batch_size, seq_len): Unnormalized alignment scores; they still have to go
        through a softmax to become attention weights.
        """
        # Unpacking also enforces that the encoder outputs are three dimensional.
        _, seq_len, _ = encoder_outputs.shape

        # expand() creates a broadcast view instead of copying the state seq_len times.
        # expanded_state = [batch_size, seq_len, context_vector_dim]
        expanded_state = decoder_hidden_state.unsqueeze(1).expand(-1, seq_len, -1)

        # concatenated = [batch_size, seq_len, context_vector_dim + encoder_output_dim]
        concatenated = torch.cat((expanded_state, encoder_outputs), dim=2)
        # energy = [batch_size, seq_len, context_vector_dim]
        energy = torch.tanh(self.score(concatenated))

        # matmul broadcasts v over the batch, so v is not replicated once per sample.
        # attention = [batch_size, seq_len]
        return torch.matmul(energy, self.v).squeeze(2)
class DotAlignment(nn.Module):
    """
    Dot attention.
    --------------------------------------------------------------------------
    Equation - s_t^T * h_i
    --------------------------------------------------------------------------
    """

    def __init__(self, encoder_output_size, context_vector_size):
        """
        @param encoder_output_size: Size of the last dimension of the encoder outputs.
        @param context_vector_size: Size of the decoder hidden state.
        @attention: context_vector_size must be the same as encoder_output_size, because the
        dot product is only defined between vectors of the same dimension.
        """
        super(DotAlignment, self).__init__()
        assert (encoder_output_size == context_vector_size), "context_vector_size must be same as the encoder_output_size"
        self.encoder_output_size = encoder_output_size
        self.context_vector_size = context_vector_size

    def forward(self, encoder_outputs, decoder_hidden_state):
        """
        @param encoder_outputs(batch_size, seq_len, encoder_output_dim): The encoder outputs
        to score against the decoder hidden state.
        @param decoder_hidden_state(batch_size, context_vector_dim): Last hidden state of
        the decoder.
        @return(batch_size, seq_len): Dot product of the hidden state with every encoder output.
        """
        # decoder_hidden_state = [batch_size, context_vector_size, 1], i.e. one column
        # vector per sample so that bmm computes the dot product with every encoder output.
        decoder_hidden_state = decoder_hidden_state.unsqueeze(2)
        # attention = [batch_size, seq_len]
        attention = torch.bmm(encoder_outputs, decoder_hidden_state).squeeze(2)

        return attention
class Attention(nn.Module):
    """
    Wrapper that turns the scores of an align function into attention weights
    by normalizing them with a softmax over the sequence dimension.
    """

    def __init__(self, align_function):
        """
        @param align_function: Callable taking (encoder_outputs, decoder_hidden_state). It must
        always return a tensor of size [batch_size, seq_len].
        """
        super(Attention, self).__init__()
        self.align_function = align_function
        self.softmax = nn.Softmax(dim=1)

    def forward(self, encoder_outputs, decoder_hidden_state):
        """
        @param encoder_outputs(batch_size, seq_len, encoder_output_dim): The encoder outputs
        that are scored against the decoder hidden state.
        @param decoder_hidden_state(batch_size, context_vector_dim): Last hidden state of
        the decoder.
        @return(batch_size, seq_len): The weights used to calculate the expected context vector.
        """
        scores = self.align_function(encoder_outputs, decoder_hidden_state)

        # The align function is pluggable, so its output shape is verified before the softmax.
        assert scores.dim() == 2, "Attention tensor must have exactly two dimensions, being of the shape [batch_size, seq_len]"
        batch_size, seq_len, _ = encoder_outputs.shape
        assert tuple(scores.shape) == (batch_size, seq_len), "Attention vector must have shape of [batch_size, seq_len]."

        return self.softmax(scores)
class AttentionDecoder(nn.Module):
    """
    Single step decoder that attends over the encoder outputs.

    Every call decodes one input vector: the attention-weighted encoder output is
    concatenated with the input, passed through an LSTM and projected to the output size.
    """

    def __init__(self,
                 output_size,
                 encoder_output_size,
                 decoder_hidden_size,
                 attention,
                 input_size,
                 context_vector_size=None,
                 dropout_p=0):
        """
        @param output_size: This is equal to input size if the Encoder-Decoder pair is used for dimensionality reduction.
        @param encoder_output_size: Size of the last dimension of the encoder outputs.
        @param decoder_hidden_size: Hidden size of the decoder LSTM.
        @param attention: Attention model to be used for the decoder.
        @param input_size: Size of the input vector that is decoded in every step.
        @param context_vector_size: This is just accepted to assert the correct size of the decoder hidden state.
        @param dropout_p: Probability of the dropout layer. The layer is created here but it
        is not applied in `forward`.
        """
        super(AttentionDecoder, self).__init__()
        # The context vector is what is used as the first hidden state of the decoder,
        # so both sizes have to agree.
        if context_vector_size is not None:
            assert context_vector_size == decoder_hidden_size, "context_vector size not equal to the decoder_hidden size."

        self.output_size = output_size
        self.encoder_output_size = encoder_output_size
        self.decoder_hidden_size = decoder_hidden_size
        self.input_size = input_size
        self.dropout_p = dropout_p

        self.expected_vector_from_attention = ExpectedContextVectorAfterAttention(attention)

        # Decoder rnn. LSTM for now, but this could be changed to any RNN.
        self.rnn = nn.LSTM(input_size=self.input_size + self.encoder_output_size,
                           hidden_size=self.decoder_hidden_size,
                           batch_first=True)
        # This layer transforms the concatenated [decoder_rnn_output, input(target), expected_encoder_output].
        self.output = nn.Linear(self.encoder_output_size + self.decoder_hidden_size + self.input_size, self.output_size)
        self.dropout = nn.Dropout(p=self.dropout_p)

    def forward(self, input_vector, encoder_outputs, previous_decoder_hidden_state):
        """
        @param input_vector(batch_size, 1, input_size): The input vector that needs to be translated.
        @param encoder_outputs(batch_size, seq_len, encoder_output_size): The sequence of Encoder outputs.
        @param previous_decoder_hidden_state(batch_size, decoder_hidden_size): Hidden state of the Decoder at t-1.
        In the first pass this will be the last hidden state of the Encoder (context vector).

        @return: Tuple of the predicted target (batch_size, 1, output_size), which can be used to
        minimize the reconstruction loss, and the new decoder hidden state
        (batch_size, decoder_hidden_size).
        """
        expected_encoder_output = self.expected_vector_from_attention(encoder_outputs, previous_decoder_hidden_state)
        rnn_input = torch.cat((input_vector, expected_encoder_output), dim=2)
        # rnn_input = [batch_size, 1, input_size + encoder_output_size]

        # NOTE(review): the LSTM is called without an initial state, so the previous decoder
        # hidden state influences this step only through the attention -- confirm this is intended.
        rnn_output, (hidden, cell) = self.rnn(rnn_input)
        # rnn_output = [batch_size, 1, decoder_hidden_size], hidden = [1, batch_size, decoder_hidden_size]

        # `hidden` keeps the layer dimension first even with batch_first=True, so both tensors
        # are brought to [batch_size, decoder_hidden_size] before they are compared. Comparing
        # them directly only lined up for a batch size of one.
        last_hidden = hidden[-1]
        assert torch.equal(last_hidden, rnn_output[:, -1, :]), \
            "Since, n directions and num layers is always 1 the hidden state and output should be the same."

        output = self.output(torch.cat((rnn_output, input_vector, expected_encoder_output), dim=2))
        # output = [batch_size, 1, output_dim]

        return output, last_hidden
class AutoEncoderClassifier(nn.Module):
    def __init__(self,
                 users: list,
                 autoencoder_input_size,
                 autoencoder_bottleneck_feature_size,
                 autoencoder_num_layers,
                 shared_hidden_layer_size,
                 user_dense_layer_hidden_size,
                 num_classes,
                 num_covariates=0,
                 shared_layer_dropout_prob=0,
                 user_head_dropout_prob=0,
                 ordinal_regression_head=False,
                 bidirectional=False):
        """
        LSTM autoencoder whose bottleneck features feed a dense classification head.

        NOTE(review): a single `user_head` is built and shared by everybody; `users` is only
        stored on the instance.

        @param users: List of students (their ids) that are going to be used for training.
        @param autoencoder_input_size: Input size of the time series portion of the model.
        @param autoencoder_bottleneck_feature_size: Encoded input size of the autoencoder.
        @param autoencoder_num_layers: Num layers in autoencoder LSTM model.
        @param shared_hidden_layer_size: Output size of the first shared dense layer. The second
        shared dense layer halves it.
        @param user_dense_layer_hidden_size: Dense head hidden size.
        @param num_classes: Number of classes in classification.
        @param num_covariates: Number of covariates to be concatenated to the bottleneck
        features before the shared dense layers.
        @param shared_layer_dropout_prob: Probability of the shared dropout layer. The layer is
        created but currently not applied in `forward`.
        @param user_head_dropout_prob: Stored on the instance only.
        @param ordinal_regression_head: Stored on the instance only.
        @param bidirectional: Whether the encoder LSTM of the autoencoder is bidirectional.
        """
        super(AutoEncoderClassifier, self).__init__()
        self.is_cuda_avail = torch.cuda.device_count() > 0
        self.users = users
        self.autoencoder_input_size = autoencoder_input_size
        self.autoencoder_bottleneck_feature_size = autoencoder_bottleneck_feature_size
        self.autoencoder_num_layers = autoencoder_num_layers
        self.shared_hidden_layer_size = shared_hidden_layer_size
        self.user_dense_layer_hidden_size = user_dense_layer_hidden_size
        self.num_classes = num_classes
        self.num_covariates = num_covariates
        self.shared_layer_dropout_prob = shared_layer_dropout_prob
        self.user_head_dropout_prob = user_head_dropout_prob
        self.ordinal_regression_head = ordinal_regression_head
        self.bidirectional = bidirectional

        # Layer initialization.
        self.autoencoder = autoencoder.LSTMAE(self.autoencoder_input_size,
                                              self.autoencoder_bottleneck_feature_size,
                                              self.autoencoder_num_layers,
                                              self.is_cuda_avail,
                                              self.bidirectional)

        self.shared_linear = nn.Linear(self.autoencoder_bottleneck_feature_size + self.num_covariates,
                                       self.shared_hidden_layer_size)

        self.shared_activation = nn.ReLU()
        self.shared_layer_dropout = nn.Dropout(p=self.shared_layer_dropout_prob)
        self.shared_linear_1 = nn.Linear(self.shared_hidden_layer_size, self.shared_hidden_layer_size // 2)
        self.shared_activation_1 = nn.ReLU()

        self.user_head = nn.Sequential(nn.Linear(self.shared_hidden_layer_size // 2, self.user_dense_layer_hidden_size),
                                       nn.ReLU(),
                                       nn.Linear(self.user_dense_layer_hidden_size, self.num_classes))

    def forward(self, user, input_seq, covariate_data=None):
        """
        The autoencoder part returns the decoded output, which needs to be trained using
        MAE or MSE. The head returns one score per class, which needs to be trained using
        cross entropy.

        @param user: The student for which the model is being trained. It is accepted for
        interface compatibility and not used by this forward pass.
        @param input_seq: The input sequence that is fed to the autoencoder.
        @param covariate_data: The covariates which will be concatenated with the bottleneck
        features before being used for classification.
        @return: Output of the autoencoder and the class scores produced by the head.
        """
        validations.validate_integrity_of_covariates(self.num_covariates, covariate_data)

        # Encode once and decode from that result. Calling the autoencoder and then
        # get_bottleneck_features() ran the same encoder twice on the same input.
        encoded_seq = self.autoencoder.get_bottleneck_features(input_seq)
        autoencoder_out = self.autoencoder.decoder(encoded_seq)

        # Only the encoding of the last time step is used for classification.
        bottle_neck = encoded_seq[:, -1, :]

        if covariate_data is not None:
            # NOTE(review): unsqueeze(0) presumably assumes one un-batched covariate vector
            # (batch size of one) -- confirm against the callers.
            bottle_neck = torch.cat((bottle_neck, covariate_data.unsqueeze(0)), dim=1)

        shared_hidden_state = self.shared_linear(bottle_neck)
        shared_hidden_state = self.shared_activation(shared_hidden_state)
        shared_hidden_state_1 = self.shared_linear_1(shared_hidden_state)
        shared_hidden_state_1 = self.shared_activation_1(shared_hidden_state_1)

        y_out = self.user_head(shared_hidden_state_1)

        return autoencoder_out, y_out
dropout > 0 and num_layers == 1: + warnings.warn("dropout option adds dropout after all but last " + "recurrent layer, so non-zero dropout expects " + "num_layers greater than 1, but got dropout={} and " + "num_layers={}".format(dropout, num_layers)) + + ################################ + gate_size = 1 # not used + ################################ + + self._all_weights = [] + + ''' + w_ih = Parameter(torch.Tensor(gate_size, layer_input_size)) + w_hh = Parameter(torch.Tensor(gate_size, hidden_size)) + b_ih = Parameter(torch.Tensor(gate_size)) + b_hh = Parameter(torch.Tensor(gate_size)) + layer_params = (w_ih, w_hh, b_ih, b_hh) + ''' + # decay rates gamma + w_dg_x = torch.nn.Parameter(torch.Tensor(input_size)) + w_dg_h = torch.nn.Parameter(torch.Tensor(hidden_size)) + + # z + w_xz = torch.nn.Parameter(torch.Tensor(input_size)) + w_hz = torch.nn.Parameter(torch.Tensor(hidden_size)) + w_mz = torch.nn.Parameter(torch.Tensor(input_size)) + + # r + w_xr = torch.nn.Parameter(torch.Tensor(input_size)) + w_hr = torch.nn.Parameter(torch.Tensor(hidden_size)) + w_mr = torch.nn.Parameter(torch.Tensor(input_size)) + + # h_tilde + w_xh = torch.nn.Parameter(torch.Tensor(input_size)) + w_hh = torch.nn.Parameter(torch.Tensor(hidden_size)) + w_mh = torch.nn.Parameter(torch.Tensor(input_size)) + + # y (output) + w_hy = torch.nn.Parameter(torch.Tensor(output_size, hidden_size)) + + # bias + b_dg_x = torch.nn.Parameter(torch.Tensor(input_size)) + b_dg_h = torch.nn.Parameter(torch.Tensor(hidden_size)) + b_z = torch.nn.Parameter(torch.Tensor(hidden_size)) + b_r = torch.nn.Parameter(torch.Tensor(hidden_size)) + b_h = torch.nn.Parameter(torch.Tensor(hidden_size)) + b_y = torch.nn.Parameter(torch.Tensor(output_size)) + + layer_params = (w_dg_x, w_dg_h, \ + w_xz, w_hz, w_mz, \ + w_xr, w_hr, w_mr, \ + w_xh, w_hh, w_mh, \ + w_hy, \ + b_dg_x, b_dg_h, b_z, b_r, b_h, b_y) + + param_names = ['weight_dg_x', 'weight_dg_h', \ + 'weight_xz', 'weight_hz', 'weight_mz', \ + 'weight_xr', 'weight_hr', 
'weight_mr', \ + 'weight_xh', 'weight_hh', 'weight_mh', \ + 'weight_hy'] + if bias: + param_names += ['bias_dg_x', 'bias_dg_h', \ + 'bias_z', \ + 'bias_r', \ + 'bias_h', \ + 'bias_y'] + + for name, param in zip(param_names, layer_params): + setattr(self, name, param) + self._all_weights.append(param_names) + + self.flatten_parameters() + self.reset_parameters() + + def flatten_parameters(self): + """ + Resets parameter data pointer so that they can use faster code paths. + Right now, this works only if the module is on the GPU and cuDNN is enabled. + Otherwise, it's a no-op. + """ + any_param = next(self.parameters()).data + if not any_param.is_cuda or not torch.backends.cudnn.is_acceptable(any_param): + return + + # If any parameters alias, we fall back to the slower, copying code path. This is + # a sufficient check, because overlapping parameter buffers that don't completely + # alias would break the assumptions of the uniqueness check in + # Module.named_parameters(). + all_weights = self._flat_weights + unique_data_ptrs = set(p.data_ptr() for p in all_weights) + if len(unique_data_ptrs) != len(all_weights): + return + + with torch.cuda.device_of(any_param): + import torch.backends.cudnn.rnn as rnn + + # NB: This is a temporary hack while we still don't have Tensor + # bindings for ATen functions + with torch.no_grad(): + # NB: this is an INPLACE function on all_weights, that's why the + # no_grad() is necessary. 
+ torch._cudnn_rnn_flatten_weight( + all_weights, (4 if self.bias else 2), + self.input_size, rnn.get_cudnn_mode(self.mode), self.hidden_size, self.num_layers, + self.batch_first, bool(self.bidirectional)) + + def _apply(self, fn): + ret = super(GRUD, self)._apply(fn) + self.flatten_parameters() + return ret + + def reset_parameters(self): + stdv = 1.0 / math.sqrt(self.hidden_size) + for weight in self.parameters(): + torch.nn.init.uniform_(weight, -stdv, stdv) + + def check_forward_args(self, input, hidden, batch_sizes): + is_input_packed = batch_sizes is not None + expected_input_dim = 2 if is_input_packed else 3 + if input.dim() != expected_input_dim: + raise RuntimeError( + 'input must have {} dimensions, got {}'.format( + expected_input_dim, input.dim())) + if self.input_size != input.size(-1): + raise RuntimeError( + 'input.size(-1) must be equal to input_size. Expected {}, got {}'.format( + self.input_size, input.size(-1))) + + if is_input_packed: + mini_batch = int(batch_sizes[0]) + else: + mini_batch = input.size(0) if self.batch_first else input.size(1) + + num_directions = 2 if self.bidirectional else 1 + expected_hidden_size = (self.num_layers * num_directions, + mini_batch, self.hidden_size) + + def check_hidden_size(hx, expected_hidden_size, msg='Expected hidden size {}, got {}'): + if tuple(hx.size()) != expected_hidden_size: + raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size()))) + + if self.mode == 'LSTM': + check_hidden_size(hidden[0], expected_hidden_size, + 'Expected hidden[0] size {}, got {}') + check_hidden_size(hidden[1], expected_hidden_size, + 'Expected hidden[1] size {}, got {}') + else: + check_hidden_size(hidden, expected_hidden_size) + + def extra_repr(self): + s = '{input_size}, {hidden_size}' + if self.num_layers != 1: + s += ', num_layers={num_layers}' + if self.bias is not True: + s += ', bias={bias}' + if self.batch_first is not False: + s += ', batch_first={batch_first}' + if self.dropout != 0: + s += ', 
def forward(self, input):
    """
    Run the GRU-D recurrence over one sequence and return the sigmoid output.

    @param input: Indexable with three equally shaped parts - (X, M, Delta).
        1. X - Data of shape (num_features x sequence length).
        The sequence length is meant to be constant, or should be equal to the minimum seq. length.
        2. M - Missing mask corresponding to data X. 0 when data is missing and 1 when present.
        3. Delta - Time delta between the current step and last observed value.
    @return: sigmoid(weight_hy @ h + bias_y) computed from the final hidden state h
        (usually of size num_classes).

    Notes:
        * The loop runs `self.num_layers` iterations and iteration `t` reads column `t` of
          X / M / Delta, so `num_layers` acts as the number of time steps consumed.
        * Every gate weight is applied element-wise (`*`); only `weight_hy` is a matrix product.
        * Dropout (variants 'Moon', 'Gal', 'mloss') is applied only while `self.training`
          is True. The previous code built a fresh `torch.nn.Dropout` each step, which is
          always in training mode and therefore also dropped activations at eval time.
    """
    # Each part is squeezed so that a leading singleton batch axis is removed.
    X = torch.squeeze(input[0])
    Mask = torch.squeeze(input[1])
    Delta = torch.squeeze(input[2])

    # decay rates gamma
    w_dg_x, w_dg_h = self.weight_dg_x, self.weight_dg_h
    # z (update gate)
    w_xz, w_hz, w_mz = self.weight_xz, self.weight_hz, self.weight_mz
    # r (reset gate)
    w_xr, w_hr, w_mr = self.weight_xr, self.weight_hr, self.weight_mr
    # h_tilde (candidate state)
    w_xh, w_hh, w_mh = self.weight_xh, self.weight_hh, self.weight_mh
    # bias
    b_dg_x, b_dg_h = self.bias_dg_x, self.bias_dg_h
    b_z, b_r, b_h = self.bias_z, self.bias_r, self.bias_h

    # Initial hidden state; created next to the parameters so a model moved to
    # another device / dtype does not mix tensors from different places.
    h = torch.zeros(self.input_size, dtype=w_dg_h.dtype, device=w_dg_h.device)

    rate = self.dropout
    # `dropout_type` is only consulted when dropout is actually enabled.
    dropout_type = None if rate == 0 else self.dropout_type

    def drop(tensor):
        return torch.nn.functional.dropout(tensor, p=rate, training=self.training)

    for step in range(self.num_layers):
        x = torch.squeeze(X[:, step:step + 1])
        m = torch.squeeze(Mask[:, step:step + 1])
        d = torch.squeeze(Delta[:, step:step + 1])

        # (4) decay factors for the input and for the hidden state
        gamma_x = torch.exp(-torch.max(self.zeros, (w_dg_x * d + b_dg_x)))
        gamma_h = torch.exp(-torch.max(self.zeros, (w_dg_h * d + b_dg_h)))

        # (5) keep observed values, decay the rest towards the empirical mean
        x = m * x + (1 - m) * (gamma_x * x + (1 - gamma_x) * self.x_mean)

        # (6) GRU update; the dropout variants differ only in where the mask is applied.
        if dropout_type == 'Gal':
            # A Theoretically grounded application of dropout in recurrent neural networks(2015)
            h = drop(h)

        h = gamma_h * h

        z = torch.sigmoid((w_xz * x + w_hz * h + w_mz * m + b_z))
        r = torch.sigmoid((w_xr * x + w_hr * h + w_mr * m + b_r))
        h_tilde = torch.tanh((w_xh * x + w_hh * (r * h) + w_mh * m + b_h))

        if dropout_type == 'mloss':
            # recurrent dropout without memory loss arXiv 1603.05118
            h_tilde = drop(h_tilde)

        h = (1 - z) * h + z * h_tilde

        if dropout_type == 'Moon':
            # RNNDROP: a novel dropout for rnn in asr(2015)
            h = drop(h)

    output = torch.matmul(self.weight_hy, h) + self.bias_y
    return torch.sigmoid(output)
class EmbConvBlock(nn.Module):
    """
    Convolutional embedding: a temporal convolution followed by a spatial one.

    The first Conv2d has a single input channel, so the block expects input of shape
    (N, 1, L, C): N samples, series length L, C input channels. The output has shape
    (N, 1, L, emb_size).

    The final axis swap is done in `forward` rather than by a lambda stored inside the
    `nn.Sequential`; a lambda attribute makes the module impossible to pickle. The
    parameter names under `liner` are unchanged, so existing state dicts still load.
    """

    def __init__(
        self,
        in_channel,  # number of input channels C, e.g. 22
        T_kernel_size,  # temporal kernel length, e.g. 8
        emb_size=64,
        hidden_size=64*4
    ):
        super().__init__()

        self.liner = nn.Sequential(
            # Temporal: slide along L only, 'same' padding keeps L unchanged.
            nn.Conv2d(1, hidden_size, kernel_size=[T_kernel_size, 1], padding='same'),
            nn.BatchNorm2d(hidden_size),
            nn.GELU(),
            # Spatial: one kernel spanning all C channels collapses that axis to size 1.
            nn.Conv2d(hidden_size, emb_size, kernel_size=[1, in_channel], padding='valid'),
            nn.BatchNorm2d(emb_size),
            nn.GELU(),
        )

    def forward(self, x):
        """
        @param x: Tensor of shape (N, 1, L, C).
        @return: Tensor of shape (N, 1, L, emb_size).
        """
        out = self.liner(x)  # (N, emb_size, L, 1)
        # Swap the embedding axis with the collapsed channel axis.
        return out.permute(0, 3, 2, 1)
class Attention(nn.Module):
    """
    Multi-head self-attention with a learned relative position bias.

    Input and output have shape (batch, seq_len, emb_size). A bias looked up from
    `relative_bias_table` by the offset between query and key position is added to the
    softmax-normalised attention weights before they are applied to the values.

    @param emb_size: Embedding size; must be divisible by `num_heads`.
    @param num_heads: Number of attention heads.
    @param seq_len: Sequence length the relative bias is built for; inputs to `forward`
        must have exactly this length.
    @param dropout: Probability for `self.dropout`.
    @param add_norm: If True, `forward` returns LayerNorm(x + out) instead of `out`.
    """

    def __init__(self, emb_size, num_heads, seq_len=22, dropout=0.5, add_norm=True):
        super().__init__()
        if emb_size % num_heads != 0:
            raise ValueError(
                'emb_size ({}) must be divisible by num_heads ({})'.format(emb_size, num_heads))

        self.add_norm = add_norm
        self.seq_len = seq_len
        self.num_heads = num_heads
        self.scale = emb_size ** -0.5
        self.key = nn.Linear(emb_size, emb_size, bias=False)
        self.value = nn.Linear(emb_size, emb_size, bias=False)
        self.query = nn.Linear(emb_size, emb_size, bias=False)

        # One learnable bias per head for every offset i - j in [-(L - 1), L - 1].
        self.relative_bias_table = nn.Parameter(torch.zeros((2 * self.seq_len - 1), num_heads))
        # relative_index[i * L + j] = i - j + (L - 1): the table row for query i and key j.
        positions = torch.arange(self.seq_len)
        relative_index = positions[:, None] - positions[None, :] + (self.seq_len - 1)
        self.register_buffer("relative_index", relative_index.flatten().unsqueeze(1))

        # NOTE: this dropout layer is registered but `forward` does not call it.
        self.dropout = nn.Dropout(p=dropout)
        self.to_out = nn.LayerNorm(emb_size)

        self.LayerNorm = nn.LayerNorm(emb_size, eps=1e-5)

    def forward(self, x):
        """
        @param x: Tensor of shape (batch, seq_len, emb_size) with seq_len == self.seq_len.
        @return: Tensor of shape (batch, seq_len, emb_size).
        """
        batch_size, seq_len, _ = x.shape
        if seq_len != self.seq_len:
            raise ValueError(
                'Expected sequence length {}, got {}'.format(self.seq_len, seq_len))

        # compute key, query, value vectors
        k = self.key(x).reshape(batch_size, seq_len, self.num_heads, -1).permute(0, 2, 3, 1)  # (B, H, D, L)
        v = self.value(x).reshape(batch_size, seq_len, self.num_heads, -1).transpose(1, 2)  # (B, H, L, D)
        q = self.query(x).reshape(batch_size, seq_len, self.num_heads, -1).transpose(1, 2)  # (B, H, L, D)

        # compute attention
        attn = torch.matmul(q, k) * self.scale  # (B, H, L, L)
        attn = nn.functional.softmax(attn, dim=-1)

        # add bias
        # Use "gather" for more efficiency on GPUs. The index is repeated once per head
        # (it used to be repeated a fixed 8 times, which only worked for num_heads == 8).
        relative_bias = self.relative_bias_table.gather(
            0, self.relative_index.repeat(1, self.num_heads))  # (L * L, H)
        relative_bias = relative_bias.reshape(seq_len, seq_len, self.num_heads)
        relative_bias = relative_bias.permute(2, 0, 1).unsqueeze(0)  # (1, H, L, L)
        attn = attn + relative_bias

        # final out
        out = torch.matmul(attn, v)  # (B, H, L, D)
        out = out.transpose(1, 2)
        out = out.reshape(batch_size, seq_len, -1)
        out = self.to_out(out)

        # Add & Norm
        if self.add_norm:
            return self.LayerNorm(x + out)
        return out
b/src/models/model_config_loader.py @@ -0,0 +1,61 @@ +import torch +from src.utils import read_utils +from src import definitions +from src.bin import validations +from src.bin import statistics +from src.utils import data_conversion_utils as conversions + + +def load_static_configs_for_lstm_n_multitask_models(model): + model_config = read_utils.read_yaml(definitions.MODEL_CONFIG_FILE_PATH)[ + 'lstm_n_multitask'] + validations.validate_config_key(model, config=model_config) + + # Global configs which are common to every model. + use_histogram = model_config['use_histogram'] + autoencoder_bottle_neck_feature_size = model_config['autoencoder_bottle_neck_feature_size'] + autoencoder_num_layers = model_config['autoencoder_num_layers'] + shared_hidden_layer_size = model_config['shared_hidden_layer_size'] + user_dense_layer_hidden_size = model_config['user_dense_layer_hidden_size'] + num_classes = model_config['num_classes'] + decay = model_config['decay'] + shared_layer_dropout_prob = model_config['shared_layer_dropout_prob'] + user_head_dropout_prob = model_config['user_head_dropout_prob'] + + # Specific configs that vary across models. 
def load_derived_configs_for_lstm_n_multitask_models(use_histogram, data):
    """
    Derive the configuration values that depend on the loaded data and on the machine.

    @param use_histogram: If truthy, the feature count is read from index 4 of a sample,
        otherwise from index 0.
    @param data: Dictionary whose 'data' entry maps keys to samples. Only the first
        sample is inspected for the feature and covariate counts.
    @return: Tuple (num_features, num_covariates, device, class_weights, cuda_enabled,
        student_list).
    """
    samples = data['data']
    first_key = next(iter(samples.keys()))
    first_sample = samples[first_key]

    # Derived Configs
    feature_idx = 4 if use_histogram else 0
    num_features = len(first_sample[feature_idx][0])
    num_covariates = len(first_sample[definitions.COVARIATE_DATA_IDX])

    # `torch.cuda.is_available` has to be *called*; the bare function object is always
    # truthy, which previously selected the 'cuda' device even on CPU-only machines.
    cuda_enabled = torch.cuda.is_available()
    device = torch.device('cuda') if cuda_enabled else torch.device('cpu')

    class_weights = torch.tensor(statistics.get_class_weights_in_inverse_proportion(data))
    student_list = conversions.extract_distinct_student_idsfrom_keys(samples.keys())

    return num_features, num_covariates, device, class_weights, cuda_enabled, student_list
dict, + num_branches, + autoencoder_input_size, + autoencoder_bottleneck_feature_size, + autoencoder_num_layers, + shared_hidden_layer_size, + user_dense_layer_hidden_size, + num_classes, + num_covariates=0, + shared_layer_dropout_prob=0, + user_head_dropout_prob=0, + ordinal_regression_head=False, + train_only_with_covariates=False, + bidirectional=False): + """ + This model has a dense layer for each student. This is used for MultiTask learning. + + @param users: List of students (their ids) that are going to be used for trained. + @param autoencoder_input_size: Input size of the time series portion on the model. + @param autoencoder_bottleneck_feature_size: Encoded input size of autoecoder. + @param autoencoder_num_layers: Num layers in autoencoder LSTM model. + @param user_dense_layer_hidden_size: dense head hidden size. + @param num_classes: Number of classes in classification. + @param num_covariates: Number of covariates to be concatenated to the dense layer before + generating class probabilities. + """ + super(MultiTaskAutoEncoderLearner, self).__init__() + + if train_only_with_covariates: + assert num_covariates > 0, "The model has to be provided either input sequence or covariates." + + self.is_cuda_avail = True if torch.cuda.device_count() > 0 else False + self.users = users + self.groups = groups + self.num_branches = num_branches + self.autoencoder_input_size = autoencoder_input_size + # Ignore the autoencoder input feature if you are just training on sequences. 
+ self.autoencoder_bottleneck_feature_size = autoencoder_bottleneck_feature_size if not train_only_with_covariates else 0 + self.autoencoder_num_layers = autoencoder_num_layers + self.shared_hidden_layer_size = shared_hidden_layer_size + self.user_dense_layer_hidden_size = user_dense_layer_hidden_size + self.num_classes = num_classes + self.num_covariates = num_covariates + self.shared_layer_dropout_prob = shared_layer_dropout_prob + self.user_head_dropout_prob = user_head_dropout_prob + self.ordinal_regression_head = ordinal_regression_head + self.train_only_with_covariates = train_only_with_covariates + self.bidirectional = bidirectional + + # Layer initialization. + if not train_only_with_covariates: + self.autoencoder = autoencoder.LSTMAE(self.autoencoder_input_size, + self.autoencoder_bottleneck_feature_size, + self.autoencoder_num_layers, + self.is_cuda_avail, + self.bidirectional) + + self.shared_linear = nn.Linear(self.autoencoder_bottleneck_feature_size + self.num_covariates, + self.shared_hidden_layer_size) + self.shared_activation = nn.ReLU() + self.shared_layer_dropout = nn.Dropout(p=self.shared_layer_dropout_prob) + self.shared_linear_1 = nn.Linear(self.shared_hidden_layer_size, self.shared_hidden_layer_size // 2) + self.shared_activation_1 = nn.ReLU() + + # # Original SOTA module + # self.user_heads = user_dense_heads.UserDenseHead(self.users, + # self.shared_hidden_layer_size // 2, + # self.user_dense_layer_hidden_size, + # self.num_classes, + # self.user_head_dropout_prob, + # self.ordinal_regression_head) + + # Tried module in Spring 2020 (by yunfeiluo) + self.user_heads = user_dense_heads.GroupDenseHead(self.groups, + self.shared_hidden_layer_size // 2, + self.user_dense_layer_hidden_size, + self.num_classes, + self.user_head_dropout_prob, + self.ordinal_regression_head) + + # # New Try in Fall 2020 (by yunfeiluo), with students' head + # self.user_heads = user_dense_heads.BranchingUserDenseHead( + # self.users, + # self.shared_hidden_layer_size 
def forward(self, user, branch_id, input_seq, covariate_data=None, shared_out=None, has_shared=False):
    """
    Forward pass through the autoencoder, the shared dense layers and the user head.

    The autoencoder part returns the decoded output, which needs to be trained using
    MAE or MSE. The user head returns the class scores, which need to be trained using
    cross entropy.

    @param user: The student for which the model is being trained; passed to
        `self.user_heads` to select the head.
    @param branch_id: Not used by this implementation.
    @param input_seq: Input sequence fed to the autoencoder. Ignored when the model was
        built with `train_only_with_covariates`.
    @param covariate_data: Optional covariates, concatenated (after `unsqueeze(0)`) to
        the bottleneck features along dim 1 before the shared layers.
    @param shared_out: Not used by this implementation (the value is overwritten).
    @param has_shared: Not used by this implementation.
    @return: Tuple (autoencoder output or None when training only with covariates,
        output of the user head).
    """
    validations.validate_integrity_of_covariates(self.num_covariates, covariate_data)

    decoded = None
    if self.train_only_with_covariates:
        # No sequence branch: start from an empty tensor and rely on the covariates.
        features = object_generator.get_tensor_on_correct_device([])
    else:
        decoded = self.autoencoder(input_seq)
        encoded = self.autoencoder.get_bottleneck_features(input_seq)
        # Keep only the last position along dim 1 of the bottleneck output.
        features = encoded[:, -1, :]

    if covariate_data is not None:
        features = torch.cat((features, covariate_data.unsqueeze(0)), dim=1)

    # with shared dense layer
    hidden = self.shared_layer_dropout(self.shared_activation(self.shared_linear(features)))
    shared_out = self.shared_activation_1(self.shared_linear_1(hidden))

    y_out = self.user_heads(user, shared_out)
    return decoded, y_out
+ + self.is_cuda_avail = True if torch.cuda.device_count() > 0 else False + self.users = users + self.groups = groups + self.num_branches = num_branches + self.autoencoder_input_size = autoencoder_input_size + # Ignore the autoencoder input feature if you are just training on sequences. + self.autoencoder_bottleneck_feature_size = autoencoder_bottleneck_feature_size if not train_only_with_covariates else 0 + self.autoencoder_num_layers = autoencoder_num_layers + self.shared_hidden_layer_size = shared_hidden_layer_size + self.user_dense_layer_hidden_size = user_dense_layer_hidden_size + self.num_classes = num_classes + self.num_covariates = num_covariates + self.shared_layer_dropout_prob = shared_layer_dropout_prob + self.user_head_dropout_prob = user_head_dropout_prob + self.ordinal_regression_head = ordinal_regression_head + self.train_only_with_covariates = train_only_with_covariates + self.bidirectional = bidirectional + + # Layer initialization. + if not train_only_with_covariates: + self.autoencoder = autoencoder.LSTMAE(self.autoencoder_input_size, + self.autoencoder_bottleneck_feature_size, + self.autoencoder_num_layers, + self.is_cuda_avail, + self.bidirectional) + + # self.shared_linear = nn.Linear(self.autoencoder_bottleneck_feature_size + self.num_covariates, + # self.shared_hidden_layer_size) + # self.shared_activation = nn.ReLU() + # self.shared_layer_dropout = nn.Dropout(p=self.shared_layer_dropout_prob) + # self.shared_linear_1 = nn.Linear(self.shared_hidden_layer_size, self.shared_hidden_layer_size // 2) + # self.shared_activation_1 = nn.ReLU() + + # # Original SOTA module + # self.user_heads = user_dense_heads.UserDenseHead(self.users, + # self.shared_hidden_layer_size // 2, + # self.user_dense_layer_hidden_size, + # self.num_classes, + # self.user_head_dropout_prob, + # self.ordinal_regression_head) + + # # Tried module in Spring 2020 (by yunfeiluo) + # self.user_heads = user_dense_heads.GroupDenseHead(self.groups, + # self.shared_hidden_layer_size 
// 2, + # self.user_dense_layer_hidden_size, + # self.num_classes, + # self.user_head_dropout_prob, + # self.ordinal_regression_head) + + # # New Try in Fall 2020 (by yunfeiluo), with students' head + # self.user_heads = user_dense_heads.BranchingUserDenseHead( + # self.users, + # self.shared_hidden_layer_size // 2, + # self.user_dense_layer_hidden_size, + # self.user_dense_layer_hidden_size, + # self.num_classes, + # self.num_branches, + # self.user_head_dropout_prob, + # self.ordinal_regression_head + # ) + + # # without students' head, end after branching layers + # self.user_heads = user_dense_heads.BranchingDenseHead( + # self.users, + # self.shared_hidden_layer_size // 2, + # self.user_dense_layer_hidden_size, + # self.num_classes, + # self.num_branches, + # self.user_head_dropout_prob, + # self.ordinal_regression_head + # ) + + # # without shared dense layer, but with user heads + # self.user_heads = user_dense_heads.BranchingUserDenseHead( + # self.users, + # self.autoencoder_bottleneck_feature_size + self.num_covariates, + # self.shared_hidden_layer_size, + # self.user_dense_layer_hidden_size, + # self.num_classes, + # self.num_branches, + # self.user_head_dropout_prob, + # self.ordinal_regression_head + # ) + + # curr best on 5-fold + # without shared dense layer, but with user heads + self.user_heads = user_dense_heads.BranchingUserBlock( + self.users, + self.autoencoder_bottleneck_feature_size + self.num_covariates, + self.shared_hidden_layer_size, + self.user_dense_layer_hidden_size, + self.num_classes, + self.num_branches, + self.user_head_dropout_prob, + self.ordinal_regression_head + ) + + # # with shared dense layer, with user heads + # self.user_heads = user_dense_heads.BranchingUserBlock( + # self.users, + # self.shared_hidden_layer_size // 2, + # self.shared_hidden_layer_size // 2, + # self.user_dense_layer_hidden_size, + # self.num_classes, + # self.num_branches, + # self.user_head_dropout_prob, + # self.ordinal_regression_head + # ) + + def 
forward(self, user, branch_id, input_seq, covariate_data=None, shared_out=None, has_shared=False): + """ + Slightly complex forward pass. The autoencoder part return the decoded output + which needs to be trained using MAE or MSE. The user head returns a vector of + class probability distributions which need to be trained using cross entropy. + + @param user: The student for which the model is being trained. All the students + contribute towards the loss of the auto encoder, but each have a separate linear + head. + @param input_seq: Must contain the input sequence that will be used to train the + autoencoder. + @param covariate_data: The covariates which will be concatenated with the output + of the autoencoders before being used for classification. + @return: output of the autoencoder and the probability distribution of each class + for the student. + """ + + # return if shared_out is provided + # if has_shared: + # return self.user_heads(user, branch_id, shared_out) + + validations.validate_integrity_of_covariates(self.num_covariates, covariate_data) + # If not training on sequences, do not put the sequences through he auto encoder. 
+ if not self.train_only_with_covariates: + autoencoder_out = self.autoencoder(input_seq) + bottle_neck = self.autoencoder.get_bottleneck_features(input_seq) + bottle_neck = bottle_neck[:, -1, :] + else: + bottle_neck = object_generator.get_tensor_on_correct_device([]) + + if covariate_data is not None: + bottle_neck = torch.cat((bottle_neck, covariate_data.unsqueeze(0)), dim=1) + + # # with shared dense layer + # shared_hidden_state = self.shared_linear(bottle_neck) + # shared_hidden_state = self.shared_activation(shared_hidden_state) + # shared_hidden_state = self.shared_layer_dropout(shared_hidden_state) + # shared_hidden_state_1 = self.shared_linear_1(shared_hidden_state) + # shared_hidden_state_1 = self.shared_activation_1(shared_hidden_state_1) + + # shared_out = shared_hidden_state_1 + shared_out = bottle_neck + + # y_out = self.user_heads(user, branch_id, shared_out) + y_out = self.user_heads(user, shared_out) + + # return autoencoder_out if not self.train_only_with_covariates else None, y_out, shared_out + return autoencoder_out if not self.train_only_with_covariates else None, y_out diff --git a/src/models/multitask_learning/multitask_biLSTM_attention.py b/src/models/multitask_learning/multitask_biLSTM_attention.py new file mode 100644 index 0000000..2f923bd --- /dev/null +++ b/src/models/multitask_learning/multitask_biLSTM_attention.py @@ -0,0 +1,149 @@ +import torch +import torch.nn as nn + +from src.models.attention import attention, align_functions +from src.models import user_dense_heads +from src.bin import validations + +ALIGNMENT_TYPE_FUNCTION_MAP = { + 'additive': align_functions.AdditiveAlignment, + 'general': align_functions.GeneralAlignment, + 'dot': align_functions.DotAlignment +} + + +class MultiTaskBiLSTMAttention(nn.Module): + def __init__(self, + users: list, + lstm_input_size, + lstm_hidden_size, + lstm_num_layers, + covariate_hidden_size, + shared_hidden_layer_size, + user_dense_layer_hidden_size, + num_classes, + dropout=0, + 
num_covariates=0, + alignment_type="additive", + forward_pass_type="covariate_concat"): + """ + This model has a dense layer for each student. This is used for MultiTask learning. + + @param users: List of students (their ids) that are going to be used for trained. + @param lstm_input_size: Input size of the time series portion on the model. + @param lstm_hidden_size: Hidden size of the LSTM. + @param lstm_num_layers: Num layers in LSTM. + @param user_dense_layer_hidden_size: dense head hidden size. + @param num_classes: Number of classes in classification. + @param num_covariates: Number of covariates to be concatenated to the dense layer before + generating class probabilities. + """ + super(MultiTaskBiLSTMAttention, self).__init__() + validations.check_if_element_in_list(forward_pass_type, ALLOWED_FORWARD_PASSES) + validations.check_if_key_present_in_dict(alignment_type, ALIGNMENT_TYPE_FUNCTION_MAP) + + # Check if cuda available. If so set this flag true. + self.is_cuda_avail = True if torch.cuda.device_count() > 0 else False + self.users = users + self.lstm_input_size = lstm_input_size + self.lstm_num_layers = lstm_num_layers + self.dropout = dropout + self.shared_hidden_layer_size = shared_hidden_layer_size + self.user_dense_layer_hidden_size = user_dense_layer_hidden_size + self.num_classes = num_classes + self.num_covariates = num_covariates + self.covariate_hidden_size = covariate_hidden_size + + # Layer initialization. 
+ self.biLSTM_attention_encoder = attention.AttentionEncoder(self.lstm_input_size, + self.lstm_hidden_size, + self.lstm_num_layers, + self.is_cuda_avail, + self.dropout) + self.encoder_outputs_size, self.context_vector_size = self.biLSTM_attention_encoder.get_encoder_dimensions() + + self.alignment_function = ALIGNMENT_TYPE_FUNCTION_MAP[alignment_type](self.encoder_outputs_size, + self.context_vector_size) + self.attention = attention.Attention(self.alignment_function) + + # This is the layer to be used to calculate the expected vector, the attention layers are just initializations. + # Return the fixed vector that we want to send through the shared layer. + self.expected_vector_from_attention = attention.ExpectedContextVectorAfterAttention(self.attention) + + # This layer is used to return a vector of encoder output size. + self.covariate_linear = nn.Linear(self.num_covariates, self.covariate_hidden_size) + + self.shared_linear = nn.Linear(self.lstm_hidden_size + self.num_covariates, + self.shared_hidden_layer_size) + + self.shared_activation = nn.ReLU() + + self.user_heads = user_dense_heads.UserDenseHead(self.users, + self.shared_hidden_layer_size, + self.user_dense_layer_hidden_size, + self.num_classes) + + FORWARD_PASS_TYPE_FUNCTION_MAPS = { + 'covariate_concat': self.covariate_concat_forward, + 'covariate_hidden': self.covariate_hidden_forward + } + + self.sequence_forward_pass = FORWARD_PASS_TYPE_FUNCTION_MAPS[forward_pass_type] + + def forward(self, user, input_seq, covariate_data=None): + """ + The input sequence is inputed to the LSTM and the last hidden state of the LSTM + is passed to the shared layer of the MultiTask Learner. + + @param user: The student for which the model is being trained. All the students + contribute towards the loss of the auto encoder, but each have a separate linear + head. + @param input_seq: Must contain the input sequence that will be used to train the + autoencoder. 
+ @param covariate_data(batch_size, num_covariates): The covariates which will be concatenated with the output + of the autoencoders before being used for classification. + @return: output of the autoencoder and the probability distribution of each class + for the student. + """ + validations.validate_integrity_of_covariates(self.num_covariates, covariate_data) + + encoder_outputs, context_vector = self.biLSTM_attention_encoder(input_seq) + # encoder_outputs = [batch_size, seq_len, encoder_output_size] + # context_vector = [batch_size, seq_len, context_vector_size] + + expected_vector = self.sequence_forward_pass(encoder_outputs, context_vector, covariate_data) + + return self.shared_layer_forward(expected_vector, user) + + def shared_layer_forward(self, expected_vector, user): + shared_hidden_state = self.shared_linear(expected_vector) + shared_hidden_state = self.shared_activation(shared_hidden_state) + + y_out = self.user_heads(user, shared_hidden_state) + + return y_out + + def covariate_concat_forward(self, encoder_outputs, context_vector, covariate_data): + if covariate_data is not None: + covariate_hidden = self.covariate_linear(covariate_data) + # covariate_hidden = [batch_size, encoder_output_size] + encoder_outputs = torch.cat((encoder_outputs, covariate_hidden), dim=1) + # covariate_hidden = [batch_size, seq_len + 1, encoder_output_size] + + expected_vector = self.expected_vector_from_attention(encoder_outputs, context_vector) + # expected_vector = [batch_size, 1, encoder_output_size] + + return expected_vector + + def covariate_hidden_forward(self, encoder_outputs, context_vector, covariate_data): + """ + @brief: Uses the covariate_hidden layer as the hidden state to calculate additive attention. + Essentially, the hidden layer of the covariate Linear layer will be used to calculate the score with + the encoder outputs. + """ + assert covariate_data is not None, "Covariates cannot be None for this forward pass." 
class MultiTaskLSTM(nn.Module):
    def __init__(self,
                 users: list,
                 lstm_input_size,
                 lstm_hidden_size,
                 lstm_num_layers,
                 lstm_bidirectional,
                 multitask_input_size,
                 multitask_hidden_size,
                 multitask_num_classes,
                 dropout=0,
                 num_covariates=0,
                 **shared_layer_params):
        """
        A shared LSTM followed by a UserClassifier (shared dense layers plus one head per student).
        This is used for MultiTask learning.

        @param users: List of students (their ids) that are going to be used for training.
        @param lstm_input_size: Input size of the time series portion of the model.
        @param lstm_hidden_size: Total hidden size of the LSTM output. When the LSTM is bidirectional
               each direction gets half of it.
        @param lstm_num_layers: Num layers in LSTM.
        @param lstm_bidirectional: LSTM is bidirectional if set True.
        @param multitask_input_size: Input size of the per-user heads (num_covariates is added to it
               before it is handed to UserClassifier).
        @param multitask_hidden_size: Hidden size of the per-user heads.
        @param multitask_num_classes: Number of classes in classification.
        @param dropout: Dropout used by the LSTM and forwarded to UserClassifier.
        @param num_covariates: Number of covariates to be concatenated to the LSTM embedding before
               classification.
        @param shared_layer_params: Keyword arguments forwarded untouched to UserClassifier.
        """
        super(MultiTaskLSTM, self).__init__()
        self.is_cuda_avail = torch.cuda.device_count() > 0
        self.users = users
        self.lstm_bidirectional = lstm_bidirectional
        self.lstm_input_size = lstm_input_size

        # Both directions are concatenated, so halve the per-direction size.
        if self.lstm_bidirectional:
            self.lstm_hidden_size = lstm_hidden_size // 2
        else:
            self.lstm_hidden_size = lstm_hidden_size

        self.lstm_num_layers = lstm_num_layers
        self.dropout = dropout
        self.num_covariates = num_covariates

        # Layer initialization.
        # num_layers was previously stored but never handed to the LSTM.
        self.lstm = nn.LSTM(input_size=self.lstm_input_size,
                            hidden_size=self.lstm_hidden_size,
                            num_layers=self.lstm_num_layers,
                            batch_first=True,
                            bidirectional=self.lstm_bidirectional,
                            dropout=self.dropout)

        self.relu = nn.ReLU()

        self.user_classifier = UserClassifier(users,
                                              multitask_input_size + self.num_covariates,
                                              multitask_hidden_size,
                                              multitask_num_classes,
                                              dropout,
                                              **shared_layer_params)

    def forward(self, user, input_seq, covariate_data=None):
        """
        The input sequence is sent to the LSTM and the output of the last time step
        is passed to the multi classifier.

        @param user: The student for which the model is being trained. Selects the linear head.
        @param input_seq: The input sequence fed to the LSTM (batch first).
        @param covariate_data: The covariates which will be concatenated with the output
        of the LSTM before being used for classification. May be None.
        @return: Tuple (decoded_sequence, y_out). decoded_sequence is always None for this model;
        y_out is the output of the user classifier.
        """
        validations.validate_integrity_of_covariates(self.num_covariates, covariate_data)

        # nn.LSTM returns (output, (h_n, c_n)); only the output sequence is needed.
        lstm_out, _ = self.lstm(input_seq)
        lstm_last_hidden_state = self.relu(lstm_out[:, -1, :])

        # Without covariates the LSTM embedding is classified as is.
        embedding = lstm_last_hidden_state
        if covariate_data is not None:
            embedding = torch.cat((lstm_last_hidden_state, covariate_data.unsqueeze(0)), dim=1)

        y_out = self.user_classifier(user, embedding)

        decoded_sequence = None
        return decoded_sequence, y_out
+ """ + super(MultiTaskMultiLSTMLearner, self).__init__() + self.is_cuda_avail = True if torch.cuda.device_count() > 0 else False + self.users = users + self.lstm_hidden_size = lstm_hidden_size + self.lstm_bidirectional = lstm_bidirectional + self.lstm_input_size = lstm_input_size + self.lstm_dropout_prob = lstm_dropout_prob + self.shared_layer_dropout_prob = shared_layer_dropout_prob + self.user_head_dropout_prob = user_head_dropout_prob + self.lstm_num_layers = lstm_num_layers + self.shared_hidden_layer_size = shared_hidden_layer_size + self.user_dense_layer_hidden_size = user_dense_layer_hidden_size + self.num_classes = num_classes + self.num_covariates = num_covariates + + # Layer initialization. + self.user_lstm = user_dense_heads.UserLSTM(input_size=self.lstm_input_size, + lstm_hidden_size=self.lstm_hidden_size, + num_layers=self.lstm_num_layers, + bidirectional=self.lstm_bidirectional, + dropout=self.lstm_dropout_prob) + + self.shared_linear = nn.Linear(self.lstm_hidden_size + self.num_covariates, + self.shared_hidden_layer_size) + + self.shared_activation = nn.ReLU() + self.shared_layer_dropout = nn.Dropout(p=self.shared_layer_dropout_prob) + + self.user_heads = user_dense_heads.UserDenseHead(self.users, + self.shared_hidden_layer_size, + self.user_dense_layer_hidden_size, + self.num_classes) + + def forward(self, user, input_seq, covariate_data=None): + """ + The input sequence is inputed to the LSTM user head and the last hidden state of the LSTM + is passed to the shared layer of the MultiTask Learner. + + @param user: The student for which the model is being trained. All the students + contribute towards the loss of the auto encoder, but each have a separate linear + head. + @param input_seq: Must contain the input sequence that will be used to train the + autoencoder. + @param covariate_data: The covariates which will be concatenated with the output + of the LSTM before being used for classification. 
class SimpleLSTM(nn.Module):
    """
    Single LSTM whose last time step is fed to a dense layer to classify a time series.
    Optional covariates are appended to the LSTM features before the dense layer.
    """

    def __init__(self, num_features,
                 num_classes=3,
                 hidden_size=64,
                 dropout=0,
                 bidirectional=False,
                 covariates=0):
        super().__init__()
        self.covariates = covariates
        self.lstm = nn.LSTM(num_features,
                            hidden_size,
                            batch_first=True,
                            bidirectional=bidirectional,
                            dropout=dropout)

        # The dense layer sees both directions (if any) plus the covariates (if any).
        directions = 2 if bidirectional else 1
        extra_inputs = covariates if covariates > 0 else 0
        self.linear = nn.Linear(hidden_size * directions + extra_inputs, num_classes)

    def forward(self, tensor_data, covariates=None):
        covariates_given = covariates is not None
        expects_covariates = self.covariates > 0

        assert (covariates_given and expects_covariates) or (not covariates_given and self.covariates == 0), \
            "If training for covariates, initialize correctly."

        if covariates_given:
            assert covariates.shape[0] == self.covariates, "Expected covariate size and input mismatch."

        # Pull the actual sequence out of the data tuple and add a batch dimension.
        input_sequence = tensor_data[definitions.ACTUAL_DATA_IDX].unsqueeze(0)
        validations.validate_no_nans_in_tensor(input_sequence)

        lstm_out, _ = self.lstm(input_sequence)
        features = lstm_out[:, -1].unsqueeze(0)

        if expects_covariates and covariates_given:
            # Two dummy dimensions so the covariates line up with the LSTM features.
            features = torch.cat((features, covariates.unsqueeze(0).unsqueeze(0)), dim=2)

        return self.linear(features).squeeze(0).squeeze(0)
class UserClassifier(nn.Module):
    def __init__(self, users: list,
                 multitask_input_size,
                 multitask_hidden_size,
                 multitask_num_classes,
                 multitask_dropout=0,
                 **shared_layer_params):
        """
        This wraps multiple fully connected layers followed by the multitask layer for users.

        @param users: List of students (their ids) that are going to be used for training.
               The student ids must be strings.
        @param multitask_input_size: Input size of each dense layer.
        @param multitask_hidden_size: Hidden size of the dense layer.
        @param multitask_num_classes: Number of classes that the multitask layer will output.
        @param multitask_dropout: dropout for multitask head.
        @param shared_layer_params: Sizes and dropout probabilities of the two shared layers, read from
               the keys sl_input_size, sl_hidden_size, sl_dropout_prob, sl_1_input_size,
               sl_1_hidden_size and sl_1_dropout_prob. Missing dropout probabilities default to 0.
        """
        super(UserClassifier, self).__init__()
        self.is_cuda_avail = torch.cuda.device_count() > 0

        # Extracting params for the shared layer.
        self.sl_input_size = shared_layer_params.get("sl_input_size")
        self.sl_hidden_size = shared_layer_params.get("sl_hidden_size")
        self.sl_dropout_prob = shared_layer_params.get("sl_dropout_prob", 0)
        self.sl_1_input_size = shared_layer_params.get("sl_1_input_size")
        self.sl_1_hidden_size = shared_layer_params.get("sl_1_hidden_size")
        self.sl_1_dropout_prob = shared_layer_params.get("sl_1_dropout_prob", 0)

        self.users = users
        self.multitask_input_size = multitask_input_size
        self.multitask_hidden_size = multitask_hidden_size
        self.multitask_num_classes = multitask_num_classes
        self.multitask_dropout = multitask_dropout

        # Size consistency between consecutive layers is delegated to the validations module.
        validations.validate_sequential_model_size_parameters(self.sl_input_size,
                                                              self.sl_hidden_size,
                                                              self.sl_1_input_size,
                                                              self.sl_1_hidden_size,
                                                              self.multitask_input_size,
                                                              self.multitask_hidden_size)

        self.sl = nn.Linear(self.sl_input_size, self.sl_hidden_size)
        self.sl_activation = nn.ReLU()
        # Previously read the non-existent attribute `self.sl_dropout`.
        self.sl_dropout = nn.Dropout(p=self.sl_dropout_prob)

        self.sl_1 = nn.Linear(self.sl_1_input_size, self.sl_1_hidden_size)
        self.sl_1_activation = nn.ReLU()
        self.sl_1_dropout = nn.Dropout(p=self.sl_1_dropout_prob)

        self.user_dense_head = UserDenseHead(self.users,
                                             self.multitask_input_size,
                                             self.multitask_hidden_size,
                                             self.multitask_num_classes,
                                             self.multitask_dropout)

    def forward(self, user, input_data):
        """
        Runs the input through both shared layers and then through the head of the given user.

        @param user: Student id selecting the dense head.
        @param input_data: Tensor fed to the first shared layer.
        @return: Output of the dense head of `user`.
        """
        sl_out = self.sl(input_data)
        sl_out = self.sl_activation(sl_out)
        sl_out = self.sl_dropout(sl_out)

        sl_1_out = self.sl_1(sl_out)
        sl_1_out = self.sl_1_activation(sl_1_out)
        sl_1_out = self.sl_1_dropout(sl_1_out)

        # The per-user heads live inside `user_dense_head`; this module has no `student_dense_layer`.
        return self.user_dense_head(user, sl_1_out)
class softmax_select(nn.Module):
    """
    Learnable selector that picks one of `num_branches` branches.

    Holds one positive weight per branch. In training mode uniform noise is added to the
    log-weights before the arg-max is taken, and the softmax weight of the winner is returned
    next to its index. In eval mode the arg-max of the log-weights is returned with weight 1.0.
    """

    def __init__(self, num_branches):
        """
        @param num_branches: Number of branches to choose from.
        """
        super(softmax_select, self).__init__()
        self.num_branches = num_branches
        # Use the GPU when there is one, but do not crash on CPU-only hosts.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.prob = nn.Parameter(torch.exp(torch.ones(self.num_branches, device=device)), requires_grad=True)
        self.T = 10.0

    def forward(self):
        """
        @return: Tuple (weight, branch_index). branch_index is the index of the selected branch as a
        string; weight is a tensor in training mode and the float 1.0 otherwise.
        """
        # Draw the noise on whatever device the parameter currently lives on.
        eps = torch.rand(self.num_branches, device=self.prob.device)
        log_prob = torch.log(self.prob)
        if self.training:
            log_prob = (log_prob + eps) / self.T
            selected = torch.argmax(log_prob)
            return torch.exp(log_prob[selected]) / torch.exp(log_prob).sum(), str(selected.tolist())
        return 1.0, str(torch.argmax(log_prob).tolist())
class BranchingDenseHead(nn.Module):
    def __init__(self, users: list, input_size, hidden_size, num_classes, num_branches, dropout=0, ordinal_regression_head=False):
        """
        This model has one dense head per branch (not per student). This is used for MultiTask learning.

        @param users: List of students (their ids). Not used to build any layer.
        @param input_size: Input size of each dense head.
        @param hidden_size: Hidden size of each dense head.
        @param num_classes: Output size of each dense head.
        @param num_branches: Number of heads; they are keyed "0" ... str(num_branches - 1).
        @param dropout: Stored on the module, no dropout layer is created from it.
        @param ordinal_regression_head: Appends a sigmoid to every head when True.
        """
        super().__init__()
        self.is_cuda_avail = torch.cuda.device_count() > 0
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.num_branches = num_branches
        self.dropout = dropout

        # Layer initialization.
        if self.input_size > self.hidden_size:
            warnings.warn(LOW_MODEL_CAPACITY_WARNING)

        def build_branch():
            # One head: linear -> relu -> linear, optionally squashed by a sigmoid.
            branch = nn.Sequential(
                nn.Linear(self.input_size, self.hidden_size),
                nn.ReLU(),
                nn.Linear(self.hidden_size, self.num_classes)
            )
            if ordinal_regression_head:
                branch.add_module("sigmoid", nn.Sigmoid())
            return branch

        # Heads are created in ascending branch order.
        self.branching_layer = nn.ModuleDict(
            {str(branch_no): build_branch() for branch_no in range(self.num_branches)}
        )

    def forward(self, user, branch_id, input_data):
        # `user` is ignored; it only keeps the call signature in line with the per-user heads.
        selected_branch = self.branching_layer[branch_id]
        return selected_branch(input_data)
class GroupDenseHead(nn.Module):
    def __init__(self, groups: dict, input_size, hidden_size, num_classes, dropout=0, ordinal_regression_head=False):
        """
        This model has a dense layer for each group of students. This is used for MultiTask learning.

        @param groups: dictionary of groups of student, map: student_ids -> group_ids
               The ids of group and student must be strings.
        @param input_size: Input size of each dense layer.
        @param hidden_size: Hidden size of the dense layer.
        @param num_classes: Output size of each dense layer.
        @param dropout: Dropout probability applied between the two linear layers.
        @param ordinal_regression_head: Appends a sigmoid to every group head when True, like the
               other dense heads in this module do.
        """
        super(GroupDenseHead, self).__init__()
        self.is_cuda_avail = torch.cuda.device_count() > 0
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.dropout = dropout
        self.groups = groups  # map: student -> group

        # Sort the distinct group ids: set iteration order of strings changes between interpreter
        # runs, which would make the layer creation (and thus seeded initialisation) order unstable.
        group_nodes = sorted(set(groups.values()))

        # Layer initialization.
        if self.input_size > self.hidden_size:
            warnings.warn(LOW_MODEL_CAPACITY_WARNING)
        dense_layer = dict()

        # make a dense layer for each group
        for group in group_nodes:
            sequential_liner = nn.Sequential(
                nn.Linear(self.input_size, self.hidden_size),
                nn.ReLU(),
                nn.Dropout(p=self.dropout),
                nn.Linear(self.hidden_size, self.num_classes))

            # This flag used to be accepted and silently ignored.
            if ordinal_regression_head:
                sequential_liner.add_module("sigmoid", nn.Sigmoid())

            dense_layer[group] = sequential_liner

        self.student_dense_layer = nn.ModuleDict(dense_layer)

    def forward(self, user, input_data):
        """
        @param user: Student id; it is mapped to its group to select the head.
        @param input_data: Tensor fed to the head of the student's group.
        @return: Output of the group head.
        """
        return self.student_dense_layer[self.groups[user]](input_data)
class UserLSTM(nn.Module):
    def __init__(self, users: list,
                 input_size,
                 lstm_hidden_size,
                 num_layers=1,
                 bidirectional=False,
                 dropout=0):
        """
        This model has a separate LSTM for each student.
        This is used for MultiTask learning.

        @param users: List of students (their ids) that are going to be used for training.
               The student ids must be strings.
        @param input_size: Input size of each LSTM.
        @param lstm_hidden_size: Hidden size of the LSTM. Halved per direction when bidirectional.
        @param num_layers: Number of stacked layers in each LSTM.
        @param bidirectional: Each LSTM is bidirectional if set True.
        @param dropout: Dropout handed to each LSTM.
        """
        # Initialise this class; naming UserDenseHead here raised a TypeError on construction.
        super().__init__()
        self.is_cuda_avail = torch.cuda.device_count() > 0
        self.input_size = input_size
        self.lstm_hidden_size = lstm_hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout = dropout

        if self.bidirectional:
            self.lstm_hidden_size = self.lstm_hidden_size // 2

        # Layer initialization.
        if self.input_size > self.lstm_hidden_size:
            warnings.warn(LOW_MODEL_CAPACITY_WARNING)
        lstm_layer = {}
        for user in users:
            # todo(abhinavshaw): Make this configurable to any model of the users choice. can take those layers as a list.
            lstm_layer[user] = nn.LSTM(input_size=input_size,
                                       hidden_size=self.lstm_hidden_size,
                                       batch_first=True,
                                       num_layers=self.num_layers,
                                       bidirectional=self.bidirectional,
                                       dropout=dropout)

        # The attribute name is kept as is so existing state dict keys stay valid.
        self.student_dense_layer = nn.ModuleDict(lstm_layer)

    def forward(self, user, input_data):
        """
        @param user: Student id selecting the LSTM.
        @param input_data: Batch-first input sequence for that LSTM.
        @return: Whatever nn.LSTM returns, i.e. the tuple (output, (h_n, c_n)).
        """
        return self.student_dense_layer[user](input_data)
'src/experiments/clustering/student_groups/' + clusters + '.pkl' +student_groups = read_pickle(groups_file_path) # student groups + +# check how students are distributed +if not is_avg_stress: + print("student distribution: ") + group_ids = list() + student_ids = list() + rev_groups = dict() + for student in student_groups: + if rev_groups.get(student_groups[student]) != None: + rev_groups[student_groups[student]].append(student) + else: + rev_groups[student_groups[student]] = [student] + group_ids.append(student_groups[student]) + student_ids.append(student.split('_')[1]) + group_ids = set(group_ids) + for group in rev_groups: + print(group + ': ' + str(rev_groups[group])) +##################################################################### + +######## Split data ################################################# +# k-fold cross validation +stratification_type = None +try: + stratification_type = sys.argv[2] +except: + stratification_type = "students" +#splits = cross_val.get_k_fod_cross_val_splits_stratified_by_students(data=data, groups = student_groups, n_splits=5, stratification_type=stratification_type) + +# leave one subject out +# num_subject = 5 +# ids = random.sample(student_ids, num_subject) +batch = dict() +batch['0'] = ["4", "7", "8"] +batch['1'] = ["10", "14", "16", "17", "19"] +batch['2'] = ["22", "23", "24", "32", "33"] +batch['3'] = ["35", "36", "43", "44", "49"] +batch['4'] = ["51", "52", "53", "57", "58"] +# missing = [8, 10, 14, 22, 23, 36, 43, 49, 51, 57, 58] + +#ids = student_ids +try: + ids = batch[sys.argv[3]] +except: + print("invalid batch num! 
") + exit() +print('Choosen student: ', ids) +splits = cross_val.leave_one_subject_out_split(data=data, groups=student_groups, ids=ids, subject='students') + +print("Splits: ", len(splits)) +##################################################################### + +################################## Init ############################# +use_historgram = True +autoencoder_bottle_neck_feature_size = 128 +autoencoder_num_layers = 1 +alpha , beta = 0.0001, 1 +decay = 0.0001 +first_key = next(iter(data['data'].keys())) +if use_historgram: + num_features = len(data['data'][first_key][4][0]) +else: + num_features = len(data['data'][first_key][0][0]) +num_covariates = len(data['data'][first_key][definitions.COVARIATE_DATA_IDX]) +shared_hidden_layer_size = 256 +user_dense_layer_hidden_size = 64 +num_classes = 3 +learning_rate = 0.000001 +n_epochs = 1 # 500 +shared_layer_dropout_prob=0.00 +user_head_dropout_prob=0.00 + +device = torch.device('cuda') if torch.cuda.is_available else torch.device('cpu') + +print("Num Features:", num_features) +print("Device: ", device) +print("Num_covariates:", num_covariates) + +cuda_enabled = torch.cuda.is_available() +tensorified_data = tensorify.tensorify_data_gru_d(deepcopy(data), cuda_enabled) +student_list = conversions.extract_distinct_student_idsfrom_keys(data['data'].keys()) + +##################################################################### + +# ################## For inspecting score of each group ################################### +# # store the score of each groups +# # to see the generalization ability of each group +# # only when split by groups +# group_val_score = dict() # map: group -> list(val_scores) +# for group in group_ids: +# group_val_score[group] = list() +# group_label_pred = dict() # map: group -> map: labels -> list, map: preds -> list +# for group in group_val_score: +# group_label_pred[group] = dict() +# group_label_pred[group]['labels'] = list() +# group_label_pred[group]['preds'] = list() + +# key_group = 
dict() # map: key -> group +# for key in data['data']: +# key_group[key] = student_groups['student_' + key.split('_')[0]] +# ######################################################################################## + +split_val_scores = list() +best_score_epoch_log = list() +best_models = list() + +split_roc_macro = list() +split_roc_micro = list() +split_roc_weighted = list() + +confusion_matrices = dict() # map: str(id) -> nparray(confusion matrix) + +for split_no, split in enumerate(splits): + print("Split No: ", split_no) + + best_split_score = -1 + epoch_at_best_score = 0 + best_model = None + + best_val_roc_macro = -1 + best_val_roc_micro = -1 + best_val_roc_weighted = -1 + + tensorified_data['train_ids'] = split['train_ids'] + data['train_ids'] = split['train_ids'] + + tensorified_data['val_ids'] = split['val_ids'] + data['val_ids'] = split['val_ids'] + + tensorified_data['test_ids'] = list() + + leaved_student = data['val_ids'][0].split('_')[0] + print('############# val student!!! 
{} ################'.format(leaved_student)) + + validation_user_statistics_over_epochs = list() + + class_weights = torch.tensor(statistics.get_class_weights_in_inverse_proportion(data)) + class_weights = torch.tensor([0.6456, 0.5635, 1.0000]) + print("Class Weights: ", class_weights) + + #### fetch groups #### + groups = student_groups + if is_avg_stress: + groups = student_groups[leaved_student] + + model = multitask_autoencoder.MultiTaskAutoEncoderLearner( + conversions.prepend_ids_with_string(student_list, "student_"), + groups, + num_features, + autoencoder_bottle_neck_feature_size, + autoencoder_num_layers, + shared_hidden_layer_size, + user_dense_layer_hidden_size, + num_classes, + num_covariates, + shared_layer_dropout_prob, + user_head_dropout_prob) + if cuda_enabled: + model.cuda() + class_weights = class_weights.cuda() + + reconstruction_criterion = torch.nn.L1Loss(reduction="sum") + classification_criterion = torch.nn.CrossEntropyLoss(weight=class_weights) + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=decay) + + for epoch in tqdm.tqdm(range(n_epochs)): + + (train_total_loss, train_total_reconstruction_loss, train_total_classification_loss, + train_labels, train_preds, train_users) = trainer.evaluate_multitask_learner(tensorified_data, + 'train_ids', + num_classes, + model, + reconstruction_criterion, + classification_criterion, + device, + optimizer=optimizer, + alpha=alpha, + beta=beta, + use_histogram=use_historgram) + + (val_total_loss, val_total_reconstruction_loss, val_total_classification_loss, + val_labels, val_preds, val_users) = trainer.evaluate_multitask_learner(tensorified_data, + 'val_ids', + num_classes, + model, + reconstruction_criterion, + classification_criterion, + device, + alpha=alpha, + beta=beta, + use_histogram=use_historgram) + ############ For inspecting score of each group ################################### + # group_label_pred_this_split = deepcopy(group_label_pred) + # (val_total_loss, 
val_total_reconstruction_loss, val_total_classification_loss, + # val_labels, val_preds, val_users) = trainer.evaluate_multitask_learner(tensorified_data, + # 'val_ids', + # num_classes, + # model, + # reconstruction_criterion, + # classification_criterion, + # device, + # alpha=alpha, + # beta=beta, + # use_histogram=use_historgram, + # key_group=key_group, + # group_label_pred=group_label_pred_this_split) + ################################################################################ + + ######## Appending Metrics ######## + train_label_list = conversions.tensor_list_to_int_list(train_labels) + train_pred_list = conversions.tensor_list_to_int_list(train_preds) + val_label_list = conversions.tensor_list_to_int_list(val_labels) + val_pred_list = conversions.tensor_list_to_int_list(val_preds) + + train_scores = metrics.precision_recall_fscore_support(train_label_list, train_pred_list, average='weighted') + val_scores = metrics.precision_recall_fscore_support(val_label_list, val_pred_list, average='weighted') + + # Compute AUC sorces + mlb = MultiLabelBinarizer() + mlb.fit([val_label_list]) + print('val classes', [i for i in set(val_label_list)]) + print('pred classes', [i for i in set(val_pred_list)]) + y_true = mlb.transform([[i] for i in val_label_list]) + y_pred = mlb.transform([[i] for i in val_pred_list]) + print('y_true shape', y_true.shape) + print('y_pred shape', y_pred.shape) + print("confusion matrix: ") + con_matrix = metrics.confusion_matrix(val_label_list, val_pred_list, labels=[0, 1, 2]) + print(con_matrix) + + val_roc_macro = metrics.roc_auc_score(y_true, y_pred, average='macro') + val_roc_micro = metrics.roc_auc_score(y_true, y_pred, average='micro') + val_roc_weighted = metrics.roc_auc_score(y_true, y_pred, average='weighted') + + if val_roc_macro > best_val_roc_macro: + best_val_roc_macro = val_roc_macro + if val_roc_micro > best_val_roc_micro: + best_val_roc_micro = val_roc_micro + if val_roc_weighted > best_val_roc_weighted: + 
best_val_roc_weighted = val_roc_weighted + print("AUC (macro) this Epoch: {} Best Score: {}".format(val_roc_macro, best_val_roc_macro)) + print("AUC (micro) this Epoch: {} Best Score: {}".format(val_roc_micro, best_val_roc_micro)) + print("AUC (weighted) this Epoch: {} Best Score: {}".format(val_roc_weighted, best_val_roc_weighted)) + + # validation_user_statistics_over_epochs.append(statistics.generate_training_statistics_for_user(val_labels, + # val_preds, + # val_users)) + + if val_scores[2] > best_split_score: + best_split_score = val_scores[2] + epoch_at_best_score = epoch + best_model = deepcopy(model) + + confusion_matrices[data['val_ids'][0].split('_')[0]] = con_matrix + + print("Split: {} Score This Epoch: {} Best Score: {}".format(split_no, val_scores[2], best_split_score)) + + ############ For inspecting score of each group ################################### + # for group in group_val_score: + # val_score = metrics.precision_recall_fscore_support(group_label_pred_this_split[group]['labels'], group_label_pred_this_split[group]['preds'], average='weighted') + # try: + # group_val_score[group][int(split_no)] = max(group_val_score[group][int(split_no)], val_score[2]) + # except: + # group_val_score[group].append(val_score[2]) + #print(group + ': ' + str(group_val_score[group])) + ################################################################################ + + split_val_scores.append(best_split_score) + best_score_epoch_log.append(epoch_at_best_score) + best_models.append(deepcopy(best_model)) + + split_roc_macro.append(best_val_roc_macro) + split_roc_micro.append(best_val_roc_micro) + split_roc_weighted.append(best_val_roc_weighted) + +print('F1 split scores: ' + str(split_val_scores)) +#print("alpha: {} Beta: {}".format(alpha, beta)) +print("Avg F1 Cross Val Score: {}".format(list_mean(split_val_scores))) +max_idx = split_val_scores.index(max(split_val_scores)) + +print('AUC (macro) scores: ' + str(split_roc_macro)) +print("Avg AUC macro Val Score: 
{}".format(list_mean(split_roc_macro))) +print('AUC (micro) scores: ' + str(split_roc_micro)) +print("Avg AUC micro Val Score: {}".format(list_mean(split_roc_micro))) +print('AUC (weighted) scores: ' + str(split_roc_weighted)) +print("Avg AUC weighted Val Score: {}".format(list_mean(split_roc_weighted))) + +############## For inspecting score of each group ################################# +# for group in group_val_score: +# group_val_score[group] = sum(group_val_score[group]) / len(group_val_score[group]) +# print('Avg Cross Val Score of ' + group + ': ' + str(group_val_score[group])) +################################################################################ + +scores_and_epochs = (split_val_scores, epoch_at_best_score) +scores_and_epochs_file_name = os.path.join('data', "cross_val_scores/multitask_autoencoder_" + clusters + '_' + sys.argv[3] + ".pkl") +write_utils.data_structure_to_pickle(scores_and_epochs, scores_and_epochs_file_name) + +AUC_scores = (split_roc_macro, split_roc_micro, split_roc_weighted) +AUC_scores_file_name = os.path.join('data', "cross_val_scores/auc_multitask_autoencoder_" + clusters + '_' + sys.argv[3] + ".pkl") +write_utils.data_structure_to_pickle(AUC_scores, AUC_scores_file_name) + +model_file_name = "saved_models/multitask_lstm-ae_{}_{}.model".format(clusters, sys.argv[3]) +model_file_name = os.path.join('data', model_file_name) +checkpointing.save_checkpoint(best_models[max_idx].state_dict(), model_file_name) + +confusion_matrices_file_name = os.path.join('data', "check/confusion_matrices_" + clusters + '_' + sys.argv[3] + ".pkl") +write_utils.data_structure_to_pickle(confusion_matrices, confusion_matrices_file_name) \ No newline at end of file diff --git a/src/utils/.DS_Store b/src/utils/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/src/utils/.DS_Store differ diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/src/utils/__pycache__/__init__.cpython-38.pyc b/src/utils/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..4622013 Binary files /dev/null and b/src/utils/__pycache__/__init__.cpython-38.pyc differ diff --git a/src/utils/__pycache__/__init__.cpython-39.pyc b/src/utils/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..6f261cd Binary files /dev/null and b/src/utils/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/utils/__pycache__/cross_val.cpython-38.pyc b/src/utils/__pycache__/cross_val.cpython-38.pyc new file mode 100644 index 0000000..623ed52 Binary files /dev/null and b/src/utils/__pycache__/cross_val.cpython-38.pyc differ diff --git a/src/utils/__pycache__/cross_val.cpython-39.pyc b/src/utils/__pycache__/cross_val.cpython-39.pyc new file mode 100644 index 0000000..6ec0203 Binary files /dev/null and b/src/utils/__pycache__/cross_val.cpython-39.pyc differ diff --git a/src/utils/__pycache__/data_conversion_utils.cpython-38.pyc b/src/utils/__pycache__/data_conversion_utils.cpython-38.pyc new file mode 100644 index 0000000..5ca6205 Binary files /dev/null and b/src/utils/__pycache__/data_conversion_utils.cpython-38.pyc differ diff --git a/src/utils/__pycache__/data_conversion_utils.cpython-39.pyc b/src/utils/__pycache__/data_conversion_utils.cpython-39.pyc new file mode 100644 index 0000000..d0d3e09 Binary files /dev/null and b/src/utils/__pycache__/data_conversion_utils.cpython-39.pyc differ diff --git a/src/utils/__pycache__/read_utils.cpython-39.pyc b/src/utils/__pycache__/read_utils.cpython-39.pyc new file mode 100644 index 0000000..4738831 Binary files /dev/null and b/src/utils/__pycache__/read_utils.cpython-39.pyc differ diff --git a/src/utils/__pycache__/student_life_var_binned_data_manager.cpython-39.pyc b/src/utils/__pycache__/student_life_var_binned_data_manager.cpython-39.pyc new file mode 100644 index 0000000..d2cb5c1 Binary files /dev/null and 
def get_first_n_data(keys, n):
    """
    Split one student's keys into those within the first `n` days and the rest.

    Keys are expected to look like `<student>_<month>_<day>_...`; only the month and
    day fields are read. The first key defines day zero, so `keys` should be in
    chronological order (NOTE(review): not enforced here). Dates are compared as
    day-of-year in a non-leap year, so a sequence crossing New Year is not supported.

    @param keys: Iterable of data keys belonging to a single student.
    @param n: Number of days, counted from the first key, that stay in the first list.
    @return: Tuple (key_list, val_keys): keys less than `n` days after the first key,
             and keys `n` or more days after it. Every input key lands in exactly one list.
    """
    # Index 0 is a zero pad so that summing entries 0..month-1 gives the number of
    # days that precede `month`.
    month_days = (0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)

    start_day = None
    key_list = list()
    val_keys = list()
    for key in keys:
        fields = key.split('_')
        month, day = int(fields[1]), int(fields[2])
        # Days in the months *before* this one plus the day of the month. Including the
        # current month (the previous behaviour) made Feb 1 sort before Jan 31.
        curr_day = sum(month_days[i] for i in range(month)) + day
        if start_day is None:
            start_day = curr_day
        # The first key is classified too (offset 0) instead of being dropped.
        if curr_day - start_day >= n:
            val_keys.append(key)
        else:
            key_list.append(key)
    return key_list, val_keys
def leave_one_subject_out_split(data: dict, days_include=0):
    """
    Build one train/validation split per student, holding that student out.

    @param data: Dictionary whose 'data' entry maps keys of the form `<student>_...` to samples.
    @param days_include: Number of days of the held-out student's data that are moved
                         into the training ids (0 keeps all of them for validation).
    @return: List of dictionaries with 'train_ids' and 'val_ids', one per student, in the
             order the students first appear in data['data'].
    """
    # Group keys by their student-id prefix; dict insertion order keeps first-seen order.
    student_key = dict()  # map: id -> keys
    for key in data['data']:
        student_key.setdefault(key.split('_')[0], []).append(key)

    splits = list()
    for student in student_key:
        splitting_dict = {'train_ids': list()}
        for rest_student, rest_keys in student_key.items():
            if rest_student != student:
                splitting_dict['train_ids'] += rest_keys
            elif days_include == 0:
                # Copy so callers editing a split cannot alter the grouping used for later splits.
                splitting_dict['val_ids'] = list(rest_keys)
            else:
                loo_train_keys, loo_val_keys = get_first_n_data(rest_keys, days_include)
                splitting_dict['train_ids'] += loo_train_keys
                splitting_dict['val_ids'] = loo_val_keys
        splits.append(splitting_dict)

    return splits
plot number of samples + # plt.bar(num_data.keys(), num_data.values()) + # plt.savefig("num_samples.png") + # for k in num_data: + # print(k, num_data[k]) + # exit() + + return splits + +def get_k_fod_cross_val_splits_stratified_by_students(data: dict, groups:dict, n_splits=5, + stratification_type="students"): + """ + @param data: data for which the splits are needed to be generated. + @param groups: map: student_ids -> groups ids + @param n_splits: number of split + @param stratification_type: deterimine the criteria for stratified split + @return: Return list of dictionary (map: train_ids, val_ids -> data_keys) + """ + + print('########## k_fold stratification split, stratified by: ' + stratification_type + '############') + print('split n: ' + str(n_splits)) + splits = list() + + data_keys = data['data'].keys() + + # determine values in stratified column + stratification_column = list() + pos = 0 if stratification_type == "students" else -1 if stratification_type == 'labels' else None + if pos != None: + for key in data_keys: + stratification_column.append(int(key.split('_')[pos])) + elif stratification_type == 'student_label': + keys, labels = conversions.extract_keys_and_labels_from_dict(data) + student_ids = conversions.extract_student_ids_from_keys(keys) + for i in range(len(student_ids)): + stratification_column.append(str(student_ids[i]) + "_" + str(labels[i])) + else: + print('No such kind of criteria for splitting!!!') + exit() + + # splitting + data_keys = np.array(list(data_keys)) + stratification_column = np.array(list(stratification_column)) + # splitter = StratifiedKFold(n_splits=n_splits, random_state=SPLITTER_RANDOM_STATE) + splitter = StratifiedKFold(n_splits=n_splits) # for local run + for train_index, val_index in splitter.split(X=data_keys, y=stratification_column): + + splitting_dict = dict() + splitting_dict['train_ids'] = data_keys[train_index].tolist() + splitting_dict['val_ids'] = data_keys[val_index].tolist() + 
def get_mean_for_series(series, mask):
    """
    Mean of the entries of `series` selected by `mask`.

    This is the single definition of the function: it was previously declared twice and the
    second declaration silently discarded the length check of the first.

    @param series: Array supporting boolean-array indexing (e.g. np.ndarray).
    @param mask: Array of the same length as `series`; entries that are truthy after
                 `astype(bool)` select the values that are averaged.
    @return: np.mean of the selected values.
    """
    assert len(series) == len(mask), "Length mismatch of series: {} and mask: {}".format(
        len(series),
        len(mask))
    return np.mean(series[mask.astype(bool)])
def get_filtered_keys_for_these_students(*student_id, keys):
    """
    Keep only the keys that belong to one of the given students.

    @param student_id: Student ids to keep. Ints and strings are both accepted; they are
                       compared as strings because the id is read from the key's text prefix.
    @param keys: Iterable of keys of the form `<student>_...`.
    @return: List of the matching keys, in their original order.
    """
    # Normalise to str (as extract_keys_of_student_from_data does) so that passing
    # integer ids no longer yields an empty result; a set gives O(1) membership tests.
    wanted = {str(sid) for sid in student_id}
    return [key for key in keys if key.split("_")[0] in wanted]
def convert_to_int_if_str(value):
    """
    Convert a digit-only string to int; hand every other value back untouched.

    Mirrors convert_to_string_if_int: values that do not qualify for conversion are returned
    as they are, rather than being replaced by None (strings such as "abc" or "-3") or
    raising AttributeError (non-string values).

    @param value: Any value.
    @return: int(value) when `value` is a str for which str.isdigit() is true, else `value`.
    """
    if isinstance(value, str) and value.isdigit():
        return int(value)
    return value
def read_yaml(file_path):
    """
    Util to read Yaml File.

    @param file_path: Path of the YAML file.
    @return: The parsed document as produced by yaml.safe_load.
    """
    # safe_load instead of yaml.load(stream): calling load() without a Loader is rejected by
    # current PyYAML releases and, on older ones, may construct arbitrary Python objects.
    with open(file_path, "r") as ymlfile:
        return yaml.safe_load(ymlfile)


def read_pickle(file_path):
    """
    Load a pickled object from disk.

    @param file_path: Path of the pickle file.
    @return: The unpickled object.
    @raise FileNotFoundError: If `file_path` does not exist.
    """
    if not os.path.exists(file_path):
        # Raising a plain string is itself a TypeError in Python 3; raise a real exception.
        raise FileNotFoundError("File as {} does not exist.".format(file_path))

    # NOTE(review): pickle.load can execute code embedded in the file; only read trusted files.
    with open(file_path, "rb") as file:
        return pickle.load(file)
def get_data_based_on_labels_and_splitting_strategy(training_values, covariate_values,
                                                    missing_values, time_delta,
                                                    y_labels, splitting_strategy,
                                                    flatten_sequence_to_cols, normalize=False):
    """
    Build one (key, data) pair per labelled timestamp using the chosen splitting strategy.

    @param training_values: Feature values handed to the splitting function.
    @param covariate_values: Covariate values handed to the splitting function.
    @param missing_values: Missing-value flags handed to the splitting function.
    @param time_delta: Time deltas handed to the splitting function.
    @param y_labels: Labels indexed by timestamp. Rows whose 'stress_level_mode' is null are dropped.
    @param splitting_strategy: Key of SPLITTING_STRATEGY_FUNCTION_MAP selecting how the data
           belonging to each label is extracted.
    @param flatten_sequence_to_cols: If true, every extracted sample goes through
           conversions.flatten_data.
    @param normalize: If true, the finished list goes through normalizer.normalize_data_list.
    @return: List of ("<month>_<day>_<hour>", data) tuples, normalized when requested.
    """
    validations.validate_data_integrity_for_len(training_values, missing_values, time_delta, y_labels)
    assert splitting_strategy in SPLITTING_STRATEGY_FUNCTION_MAP.keys(), \
        "Invalid splitting strategy must be one of: {}".format(SPLITTING_STRATEGY_FUNCTION_MAP.keys())

    # todo(abhinavshaw): make it general for all the labels.
    y_labels = y_labels[y_labels['stress_level_mode'].notnull()]

    keyed_samples = []
    # todo(abihnavshaw): Process on whole data once fixed issue with last label.
    # The slice stops before the final label, which is deliberately ignored.
    for label_time in y_labels.index[:-1]:
        sample = SPLITTING_STRATEGY_FUNCTION_MAP[splitting_strategy](training_values,
                                                                    covariate_values,
                                                                    missing_values,
                                                                    time_delta,
                                                                    y_labels,
                                                                    label_time)
        # Labels for which the strategy produced nothing are left out of the result.
        if not sample:
            continue

        month_day_hour_key = "{}_{}_{}".format(label_time.month, label_time.day, label_time.hour)
        if flatten_sequence_to_cols:
            sample = conversions.flatten_data(sample)
        keyed_samples.append((month_day_hour_key, sample))

    if normalize:
        return normalizer.normalize_data_list(keyed_samples, normalize_strat=NORMALIZE_STRAT)
    return keyed_samples
+ validations.validate_student_id_in_data(*raw_data) + validations.validate_data_integrity_for_len(*raw_data) + + student_data, missing_data, time_delta = conversions.extract_actual_missing_and_time_delta_from_raw_data_for_student( + raw_data, student_id=student_id) + + validations.validate_all_columns_present_in_data_frame(student_data, missing_data, time_delta, columns=FEATURE_LIST) + validations.validate_all_columns_present_in_data_frame(student_data, columns=LABEL_LIST) + + training_values = student_data.loc[:, FEATURE_LIST] + + covariate_values = student_data.loc[:, COVARIATE_LIST] + covariate_values = covariates.exam_period(covariate_values) + + missing_values = missing_data.loc[:, FEATURE_LIST] + time_deltas = time_delta.loc[:, FEATURE_LIST] + y_labels = student_data.loc[:, LABEL_LIST] + + # Additional flags for data processing. + if ADJUST_LABELS_WRT_MEDIAN: + y_labels['stress_level_mode'] = y_labels['stress_level_mode'].map(conversions.adjust_classes_wrt_median, + na_action='ignore') + if 'previous_stress_label' in COVARIATE_LIST: + covariate_values['previous_stress_label'] = covariate_values['previous_stress_label'].map( + conversions.adjust_classes_wrt_median, + na_action='ignore') + + # Filling missing Values + if fill_na: + training_values.fillna(value=-1, inplace=True) + + data_list = get_data_based_on_labels_and_splitting_strategy(training_values, + covariate_values, + missing_values, + time_deltas, + y_labels, + splitting_strategy, + flatten_sequence, + normalize) + + if split_type == 'percentage': + train_set, val_set, test_set = splitter.get_data_split_by_percentage(data_list) + else: + train_set, val_set, test_set = splitter.get_data_split_by_date(data_list) + + return data_list, train_set, val_set, test_set + + +def get_data_for_training_in_dict_format(*student_ids, + splitting_strategy=DEFAULT_SPLITTING_STRATEGY, + normalize=False, + fill_na=True, + flatten_sequence=False, + split_type='percentage'): + """ + + @attention: If no student_ids 
def get_data_and_label_tensor(data: dict, key, cuda_enabled):
    """
    Turn the record stored under `key` into the four tensors used for training.

    @param data: Data dict; the record is read from data['data'][key].
    @param key: Key of the record inside data['data'].
    @param cuda_enabled: If true, every returned tensor is moved with .cuda().
    @return: Tuple (tensor_data, covariate_data, histogram_data, train_label). The first three
             are float tensors; train_label is a one-element long tensor.
    """
    record = data['data'][key]

    # Everything before the covariate slot is treated as the main data.
    tensor_data = torch.tensor(list(record[:definitions.COVARIATE_DATA_IDX]), dtype=torch.float)
    covariate_data = torch.tensor(list(record[definitions.COVARIATE_DATA_IDX]), dtype=torch.float)
    histogram_data = torch.tensor(list(record[definitions.HISTOGRAM_IDX]), dtype=torch.float)

    # # for train on two labels
    # label_value = int(min(1, label_value))
    label_value = int(torch.tensor(record[definitions.LABELS_IDX]).item())
    train_label = torch.tensor([label_value], dtype=torch.long)

    tensors = (tensor_data, covariate_data, histogram_data, train_label)
    if cuda_enabled:
        tensors = tuple(tensor.cuda() for tensor in tensors)

    return tensors
def get_mini_batchs(batch_size, inds, shuffle=True):
    """
    Cut `inds` into consecutive mini-batches of at most `batch_size` items.

    @param batch_size: Positive number of indices per batch; the last batch may be shorter.
    @param inds: Mutable sequence of indices. When `shuffle` is true it is shuffled IN PLACE
                 with np.random.shuffle, so the caller's object is reordered as well.
    @param shuffle: If true, shuffle `inds` before cutting it into batches.
    @return: List of slices of `inds`; empty when `inds` is empty.
    @raise ValueError: If `batch_size` is not positive (the previous loop never terminated then).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got {}".format(batch_size))
    if shuffle:
        np.random.shuffle(inds)
    return [inds[start:start + batch_size] for start in range(0, len(inds), batch_size)]
def _format_split(data, keys, training_params):
    """Gather the samples listed in ``keys`` into the per-split dictionary."""
    device = training_params['device']
    samples = list()
    ids = list()
    # Each list starts with the empty tensor the result is built from, so an
    # empty split still yields an empty tensor and dtype promotion is the same
    # as when concatenating one sample at a time.
    covariate_rows = [torch.tensor([]).to(device)]
    label_rows = [torch.tensor([]).type(torch.LongTensor).to(device)]

    for key in keys:
        actual_data, covariate_data, histogram_data, train_label = data['data'][key]
        if training_params['use_histogram']:
            # The histogram replaces the sequence data as the model input.
            actual_data = histogram_data.unsqueeze(0)

        samples.append(actual_data.to(device))
        label_rows.append(train_label.to(device))
        covariate_rows.append(covariate_data.unsqueeze(0).to(device))
        # Keys look like "<id>_<suffix>"; only the part before the first "_" is kept.
        ids.append(key.split('_')[0])

    return {
        'samples': samples,
        # One concatenation per split instead of one per sample.
        'covariate_data': torch.cat(covariate_rows, dim=0),
        'labels': torch.cat(label_rows, dim=0),
        'ids': np.array(ids),
        'inds': list(range(len(samples))),
    }


def formatting_train_val_data(data, training_params):
    """
    Arrange the train and validation samples of ``data`` for training.

    @param data: Dictionary with 'train_ids' and 'val_ids' (iterables of sample keys)
                 and 'data', mapping each key to a tuple
                 ``(actual_data, covariate_data, histogram_data, label)`` of tensors.
    @param training_params: Dictionary providing 'device' (target device of every
                            tensor) and 'use_histogram' (if true, the unsqueezed
                            histogram is used as the sample instead of ``actual_data``).
    @return: ``(train_data, val_data)``. Each is a dictionary with 'samples' (list of
             tensors), 'covariate_data' (covariates stacked along dim 0), 'labels'
             (labels concatenated along dim 0), 'ids' (numpy array with the key
             prefix before the first '_') and 'inds' (``list(range(n_samples))``).
    """
    train_data = _format_split(data, data['train_ids'], training_params)
    val_data = _format_split(data, data['val_ids'], training_params)
    return train_data, val_data


#### evaluation metrics ####
def eval_accuracy(y_pred, y_true):
    """
    Return the fraction of positions where ``y_pred`` equals ``y_true``.

    @param y_pred: Predicted labels (numpy array or any array-like).
    @param y_true: Ground-truth labels with the same shape.
    @return: Mean of the element-wise equality.
    """
    # np.asarray lets plain lists through, which have no ``.mean()`` after ``==``.
    return (np.asarray(y_pred) == np.asarray(y_true)).mean()


def eval_f1_score(y_pred, y_true, avg_type):
    """
    Return the F1 score computed by ``metrics.f1_score``.

    @param y_pred: Predicted labels.
    @param y_true: Ground-truth labels.
    @param avg_type: Averaging mode forwarded as ``average``.
    @return: The value returned by ``metrics.f1_score``.
    """
    return metrics.f1_score(y_true, y_pred, average=avg_type)
def eval_auc_score(y_pred, y_true, labels, avg_type, b=False):
    """
    Compute the ROC-AUC of the predictions, falling back to 0.0 when it is undefined.

    @param y_pred: Scores handed to ``metrics.roc_auc_score`` unchanged.
    @param y_true: Ground-truth labels. Unless ``b`` is true, every label is turned
                   into an indicator row by a ``MultiLabelBinarizer`` fitted on ``labels``.
    @param labels: Iterable of single-label lists used to fit the binarizer.
    @param avg_type: Averaging mode forwarded as ``average``.
    @param b: If true, ``y_true`` is used as given (no binarization).
    @return: The ROC-AUC score, or 0.0 when scikit-learn rejects the input with a
             ``ValueError``.
    """
    if not b:
        mlb = MultiLabelBinarizer()
        mlb.fit(labels)
        y_true = mlb.transform([[i] for i in y_true])

    try:
        return metrics.roc_auc_score(y_true, y_pred, average=avg_type)
    except ValueError:
        # Presumably the "only one class present in y_true" case; anything else
        # (including KeyboardInterrupt) is no longer swallowed.
        return 0.0
################################################################


def _forward_pass(model, samples, inds, ids, covariate_data):
    """Run the model and normalise its output to ``(final_out, AE_out, generic_out)``."""
    outputs = model(x=samples, inds=inds, ids=ids, covariate_data=covariate_data)
    if model.with_generic_head:
        final_out, AE_out, generic_out = outputs
        return final_out, AE_out, generic_out
    final_out, AE_out = outputs
    return final_out, AE_out, None


def _summed_reconstruction_loss(criterion, samples, inds, AE_out):
    """Sum ``criterion(sample, reconstruction)`` over the samples selected by ``inds``."""
    loss = 0
    for idx, reconstruction in zip(inds, AE_out):
        loss = loss + criterion(samples[idx], reconstruction)
    return loss


def _record_validation_metrics(records, logits, y_true, labels):
    """Append outputs, confusion matrix, AUC and F1 of one epoch to ``records``."""
    records['outputs'].append(logits)
    y_pred = np.argmax(logits, axis=1)
    records['confmats'].append(
        metrics.confusion_matrix(y_true, y_pred, labels=[i[0] for i in labels])
    )
    for avg_type in ['micro', 'macro', 'weighted']:
        records['val_auc'][avg_type].append(eval_auc_score(logits, y_true, labels, avg_type))
        records['val_f1'][avg_type].append(eval_f1_score(y_pred, y_true, avg_type))


def train_ae_then_freeze(model, optimizer, reconstruction_criterion, train_data,
                         training_params, n_epochs=200):
    """
    Train the model on the reconstruction loss only, then freeze its autoencoder.

    @param model: Model called as ``model(x=, inds=, ids=, covariate_data=)`` whose
                  second output holds one reconstruction per batch element; it must
                  expose ``autoencoder``.
    @param optimizer: Optimizer stepped after every mini batch.
    @param reconstruction_criterion: Called as ``criterion(sample, reconstruction)``.
    @param train_data: Dictionary with 'samples', 'inds', 'ids' and 'covariate_data'.
    @param training_params: Dictionary providing 'batch_size' and ``['loss_weight']['alpha']``.
    @param n_epochs: Number of passes over the training data (previously fixed to 200).
    @return: None. Afterwards every parameter of ``model.autoencoder`` has
             ``requires_grad = False``.
    """
    # Training Autoencoder (AE)
    for _ in range(n_epochs):
        model.train()
        for batch in get_mini_batchs(training_params['batch_size'], train_data['inds']):
            # forward
            outputs = model(
                x=train_data['samples'],
                inds=batch,
                ids=train_data['ids'][batch],
                covariate_data=train_data['covariate_data'][batch]
            )
            # The model returns two or three values depending on its generic
            # head; the reconstructions are always the second one.
            AE_out = outputs[1]

            total_loss = _summed_reconstruction_loss(
                reconstruction_criterion, train_data['samples'], batch, AE_out
            ) * training_params['loss_weight']['alpha']

            # backpropagation
            model.zero_grad()
            total_loss.backward()
            optimizer.step()

    # freeze AE
    for p in model.autoencoder.parameters():
        p.requires_grad = False


#### main train function ####
def train_and_val(
    data,
    model_params,
    training_params,
    pre_record=None,
    leaved_student=None,
    up_weight_k=None,
):
    """
    Train a ``MultitaskAutoencoder`` and evaluate it on the validation split every epoch.

    @param data: Data dictionary accepted by ``formatting_train_val_data``.
    @param model_params: Passed to ``MultitaskAutoencoder`` when a new model is built.
    @param training_params: Dictionary providing 'device', 'use_covariates',
                            'class_weights', 'branching_lr', 'global_lr', 'weight_decay',
                            'epochs', 'batch_size', 'use_decoder' and 'loss_weight'
                            ('beta' always; 'alpha' with the decoder; 'theta' with the
                            generic head).
    @param pre_record: Optional dictionary whose 'model' entry is trained further
                       instead of creating a new model.
    @param leaved_student: Optional id; the outputs of training samples whose id equals
                           it are multiplied by ``up_weight_k``.
    @param up_weight_k: Factor applied to the outputs of ``leaved_student``'s samples.
    @return: Dictionary of records: per-epoch 'train_losses', 'val_losses', 'outputs',
             'confmats', 'val_f1' and 'val_auc', the same metrics for the generic head
             under 'generic_records', and under 'model' a CPU copy of the model taken
             whenever the weighted validation F1 equals its best value so far.
    """
    # prepare data
    train_data, val_data = formatting_train_val_data(data, training_params)
    device = training_params['device']
    loss_weight = training_params['loss_weight']

    # declare results for save
    saved_records = {
        'model': None,
        'train_losses': list(),
        'val_losses': list(),
        'outputs': list(),
        'generic_outputs': list(),
        'confmats': list(),
        'val_f1': {
            'micro': list(),
            'macro': list(),
            'weighted': list()
        },
        'val_auc': {
            'micro': list(),
            'macro': list(),
            'weighted': list()
        },
        'labels': list(),
        'generic_records': {
            'outputs': list(),
            'confmats': list(),
            'val_f1': {
                'micro': list(),
                'macro': list(),
                'weighted': list()
            },
            'val_auc': {
                'micro': list(),
                'macro': list(),
                'weighted': list()
            },
        }
    }

    # construct model
    print('Initializing...')
    if pre_record is None:
        model = MultitaskAutoencoder(model_params, training_params['use_covariates']).to(device)
    else:
        model = pre_record['model'].to(device)
    reconstruction_criterion = torch.nn.L1Loss(reduction="sum")
    classification_criterion = torch.nn.CrossEntropyLoss(
        weight=torch.tensor(training_params['class_weights'], device=device)
    )

    # construct optimizer; the branching parts use their own learning rate
    optimizer = torch.optim.Adam(
        [
            {'params': model.autoencoder.parameters()},
            {'params': model.out_heads.parameters()},
            {'params': model.branching.parameters(), 'lr': training_params['branching_lr']},
            {'params': model.branch_layer.parameters(), 'lr': training_params['branching_lr']},
        ],
        lr=training_params['global_lr'],
        weight_decay=training_params['weight_decay'],
    )

    # start training
    print('Training...')
    for epoch in tqdm.tqdm(range(training_params['epochs'])):
        # training
        model.train()
        train_loss = 0
        for batch in get_mini_batchs(training_params['batch_size'], train_data['inds']):
            batch_ids = train_data['ids'][batch]

            # forward
            final_out, AE_out, generic_out = _forward_pass(
                model,
                train_data['samples'],
                batch,
                batch_ids,
                train_data['covariate_data'][batch],
            )

            # up-weighting: one factor per sample, up_weight_k for the left-out
            # student and 1 for everybody else
            weight_vec = torch.Tensor(
                [[up_weight_k] if sample_id == leaved_student else [1] for sample_id in batch_ids]
            ).to(device)
            # NOTE(review): the factor scales the outputs, not the per-sample loss.
            final_out *= weight_vec

            # compute loss
            classification_loss = (
                classification_criterion(final_out, train_data['labels'][batch]) * loss_weight['beta']
            )
            total_loss = classification_loss

            if training_params['use_decoder']:
                reconstruction_loss = _summed_reconstruction_loss(
                    reconstruction_criterion, train_data['samples'], batch, AE_out
                ) * loss_weight['alpha']
                total_loss = reconstruction_loss + classification_loss

            if model.with_generic_head:
                total_loss = total_loss + (
                    classification_criterion(generic_out, train_data['labels'][batch]) * loss_weight['theta']
                )

            # backpropagation
            model.zero_grad()
            total_loss.backward()
            optimizer.step()
            train_loss += total_loss.cpu().detach().item()

        # validation
        model.eval()
        val_inds = list(range(len(val_data['samples'])))
        # No gradients are needed for evaluation, so no autograd graph is built.
        with torch.no_grad():
            final_out, AE_out, generic_out = _forward_pass(
                model,
                val_data['samples'],
                val_inds,
                val_data['ids'],
                val_data['covariate_data'],
            )

            # compute loss
            classification_loss = classification_criterion(final_out, val_data['labels']) * loss_weight['beta']
            val_loss = classification_loss

            if training_params['use_decoder']:
                reconstruction_loss = _summed_reconstruction_loss(
                    reconstruction_criterion, val_data['samples'], val_inds, AE_out
                ) * loss_weight['alpha']
                val_loss = reconstruction_loss + classification_loss

        # evaluate and update global variables
        saved_records['train_losses'].append(train_loss)
        # NOTE(review): unlike the training loss, the recorded validation loss
        # does not contain the generic-head term.
        saved_records['val_losses'].append(val_loss.cpu().detach().item())

        # save validation information
        y_true = val_data['labels'].cpu().detach().numpy()
        labels = [[0], [1], [2]]
        _record_validation_metrics(saved_records, final_out.cpu().detach().numpy(), y_true, labels)

        # keep a copy of the model whenever this epoch ties or beats the best weighted F1
        if saved_records['val_f1']['weighted'][-1] == max(saved_records['val_f1']['weighted']):
            saved_records['model'] = copy.deepcopy(model).cpu()

        # if the model has the generic head
        if model.with_generic_head:
            _record_validation_metrics(
                saved_records['generic_records'],
                generic_out.cpu().detach().numpy(),
                y_true,
                labels,
            )

        print("F1 Score This Epoch: {} Best Score: {}".format(saved_records['val_f1']['weighted'][-1], max(saved_records['val_f1']['weighted'])))

    # final return
    return saved_records