Merge pull request #39 from datakind/pdp-add-new-features

[pdp] Add some new features
datakind · Dec 5, 2024 · 9169ba3
2 parents 5c4d585 + 83d95a6
commit 9169ba3
Show file tree

Hide file tree

Showing 7 changed files with 142 additions and 28 deletions.
diff --git a/src/student_success_tool/analysis/pdp/features/cumulative.py b/src/student_success_tool/analysis/pdp/features/cumulative.py
@@ -33,6 +33,7 @@ def add_features(
                 ("term_in_peak_covid", "sum"),
                 ("term_is_fall_spring", "sum"),
                 ("term_is_while_student_enrolled_at_other_inst", "sum"),
+                ("term_is_pre_cohort", "sum"),
                 ("course_level_mean", ["mean", "min", "std"]),
                 ("course_grade_numeric_mean", ["mean", "min", "std"]),
                 ("num_courses", ["sum", "mean", "min"]),

diff --git a/src/student_success_tool/analysis/pdp/features/student_term.py b/src/student_success_tool/analysis/pdp/features/student_term.py
@@ -148,12 +148,21 @@ def add_features(
     feature_name_funcs = (
         {
             "year_of_enrollment_at_cohort_inst": year_of_enrollment_at_cohort_inst,
+            "term_is_pre_cohort": term_is_pre_cohort,
             "term_is_while_student_enrolled_at_other_inst": term_is_while_student_enrolled_at_other_inst,
             "frac_credits_earned": shared.frac_credits_earned,
             "student_term_enrollment_intensity": ft.partial(
                 student_term_enrollment_intensity,
                 min_num_credits_full_time=min_num_credits_full_time,
             ),
+            "num_courses_in_program_of_study_area_term_1": ft.partial(
+                num_courses_in_study_area,
+                study_area_col="student_program_of_study_area_term_1",
+            ),
+            "num_courses_in_program_of_study_area_year_1": ft.partial(
+                num_courses_in_study_area,
+                study_area_col="student_program_of_study_area_year_1",
+            ),
         }
         | {
             fc_col: ft.partial(compute_frac_courses, numer_col=nc_col)
@@ -195,13 +204,37 @@ def year_of_enrollment_at_cohort_inst(
     return pd.Series(np.ceil((dts_diff + 1) / 365.25), dtype="Int8")
 
 
+def term_is_pre_cohort(
+    df: pd.DataFrame,
+    *,
+    cohort_start_dt_col: str = "cohort_start_dt",
+    term_start_dt_col: str = "term_start_dt",
+) -> pd.Series:
+    """
+    Flag rows whose term start date is strictly earlier than the cohort start date.
+
+    Args:
+        df
+        cohort_start_dt_col: Column in ``df`` holding each row's cohort start date.
+        term_start_dt_col: Column in ``df`` holding each row's term start date.
+
+    Returns:
+        Series of nullable "boolean" dtype, one value per row of ``df``;
+        a term starting on the cohort start date itself is not flagged.
+    """
+    term_starts = df[term_start_dt_col]
+    cohort_starts = df[cohort_start_dt_col]
+    started_before_cohort = term_starts.lt(cohort_starts)
+    return started_before_cohort.astype("boolean")
+
+
 # TODO: we could probably compute this directly, w/o an intermediate feature?
 def term_is_while_student_enrolled_at_other_inst(
     df: pd.DataFrame, *, col: str = "num_courses_enrolled_at_other_institution_s_Y"
 ) -> pd.Series:
     return df[col].gt(0)
 
 
+def num_courses_in_study_area(
+    df: pd.DataFrame,
+    *,
+    study_area_col: str,
+    course_subject_areas_col: str = "course_subject_areas",
+    fill_value: str = "-1",
+) -> pd.Series:
+    """
+    Count, per row, the courses whose subject area equals the row's study area.
+
+    Args:
+        df
+        study_area_col: Column in ``df`` holding the single study area
+            that each row's courses are compared against.
+        course_subject_areas_col: Column in ``df`` whose values are list-likes
+            of course subject areas, one (possibly empty) list per row.
+        fill_value: Stand-in substituted for null study areas before comparing,
+            so that a null study area is compared as this value instead.
+
+    Returns:
+        Series of "Int8" dtype with the number of matching courses per row,
+        carrying the same index as ``df``.
+    """
+    course_subject_areas = pd.DataFrame(
+        df[course_subject_areas_col].tolist(),
+        # keep the caller's row labels: the comparison below aligns on index,
+        # so a default RangeIndex would mis-align (and resize) the result
+        # whenever df has been filtered, sorted, or otherwise re-indexed
+        index=df.index,
+        dtype="string",
+    )
+    return (
+        course_subject_areas.eq(df[study_area_col].fillna(fill_value), axis="index")
+        .sum(axis="columns")
+        .astype("Int8")
+    )
+
+
 def compute_frac_courses(
     df: pd.DataFrame, *, numer_col: str, denom_col: str = "num_courses"
 ) -> pd.Series:

diff --git a/src/student_success_tool/analysis/pdp/targets/failure_to_retain.py b/src/student_success_tool/analysis/pdp/targets/failure_to_retain.py
@@ -13,9 +13,11 @@ def make_labeled_dataset(
     df: pd.DataFrame,
     *,
     student_criteria: dict[str, object | Collection[object]],
+    exclude_pre_cohort_terms: bool = True,
     student_id_cols: str | list[str] = "student_guid",
     cohort_id_col: str = "cohort_id",
     term_id_col: str = "term_id",
+    term_is_pre_cohort_col: str = "term_is_pre_cohort",
     term_rank_col: str = "term_rank",
     retention_col: str = "retention",
 ) -> pd.DataFrame:
@@ -39,15 +41,21 @@ def make_labeled_dataset(
     df_eligible_student_terms = pd.merge(
         df, df_eligible_students, on=student_id_cols, how="inner"
     )
-    df_features = shared.get_first_student_terms_within_cohort(
-        df_eligible_student_terms,
-        student_id_cols=student_id_cols,
-        cohort_id_col=cohort_id_col,
-        term_id_col=term_id_col,
-        term_rank_col=term_rank_col,
-        sort_cols=term_rank_col,
-        include_cols=None,
-    )
+    if exclude_pre_cohort_terms is True:
+        df_features = shared.get_first_student_terms_within_cohort(
+            df_eligible_student_terms,
+            student_id_cols=student_id_cols,
+            term_is_pre_cohort_col=term_is_pre_cohort_col,
+            sort_cols=term_rank_col,
+            include_cols=None,
+        )
+    else:
+        df_features = shared.get_first_student_terms(
+            df_eligible_student_terms,
+            student_id_cols=student_id_cols,
+            sort_cols=term_rank_col,
+            include_cols=None,
+        )
     df_targets = compute_target_variable(
         df_eligible_student_terms,
         student_id_cols=student_id_cols,

diff --git a/src/student_success_tool/analysis/pdp/targets/shared.py b/src/student_success_tool/analysis/pdp/targets/shared.py
@@ -240,9 +240,7 @@ def get_first_student_terms_within_cohort(
     df: pd.DataFrame,
     *,
     student_id_cols: str | list[str] = "student_guid",
-    cohort_id_col: str = "cohort_id",
-    term_id_col: str = "term_id",
-    term_rank_col: str = "term_rank",
+    term_is_pre_cohort_col: str = "term_is_pre_cohort",
     sort_cols: str | list[str] = "term_rank",
     include_cols: t.Optional[list[str]] = None,
 ) -> pd.DataFrame:
@@ -260,19 +258,9 @@ def get_first_student_terms_within_cohort(
         sort_cols
         include_cols
     """
-    student_id_cols = utils.to_list(student_id_cols)
-    # TODO: handle students w/o any courses in their cohort term?
-    student_cohort_term_ranks = (
-        df.loc[df[cohort_id_col].eq(df[term_id_col]), student_id_cols + [term_rank_col]]
-        .rename(columns={term_rank_col: "student_cohort_term_rank"})
-    )  # fmt: off
-    df_within_cohort = (
-        pd.merge(df, student_cohort_term_ranks, on=student_id_cols)
-        .loc[lambda df: df[term_rank_col].ge(df["student_cohort_term_rank"]), :]
-        .drop(columns="student_cohort_term_rank")
-    )
     return get_first_student_terms(
-        df_within_cohort,
+        # exclude rows that are "pre-cohort", so "first" meets our criteria here
+        df.loc[df[term_is_pre_cohort_col].eq(False), :],
         student_id_cols=student_id_cols,
         sort_cols=sort_cols,
         include_cols=include_cols,

diff --git a/tests/analysis/pdp/features/test_cumulative.py b/tests/analysis/pdp/features/test_cumulative.py
@@ -45,6 +45,7 @@ def df():
             ],
             "term_rank": [1, 3, 4, 5, 9],
             "term_rank_fall_spring": [1, 2, pd.NA, 3, 5],
+            "term_is_pre_cohort": [True, False, False, False, False],
         }
     ).astype({"term_rank": "Int8", "term_rank_fall_spring": "Int8"})
 
@@ -64,12 +65,14 @@ def df_grped(df):
             ["num_courses_course_level_0", "num_courses_course_level_1"],
             [
                 ("term_id", "count"),
+                ("term_is_pre_cohort", "sum"),
                 ("course_grade_num_mean", ["mean", "std"]),
                 ("num_courses", "sum"),
             ],
             pd.DataFrame(
                 {
                     "term_id_cumcount": [1.0, 2.0, 3.0, 4.0, 5.0],
+                    "term_is_pre_cohort_cumsum": [1.0, 1.0, 1.0, 1.0, 1.0],
                     "course_grade_num_mean_cummean": [4.0, 3.25, 2.75, 2.875, 3.0],
                     "course_grade_num_mean_cumstd": [
                         np.nan,

diff --git a/tests/analysis/pdp/features/test_student_term.py b/tests/analysis/pdp/features/test_student_term.py
@@ -218,6 +218,74 @@ def test_year_of_enrollment_at_cohort_inst(df, ccol, tcol, exp):
     assert obs.equals(exp) or obs.compare(exp).empty
 
 
+@pytest.mark.parametrize(
+    ["df", "ccol", "tcol", "exp"],
+    [
+        (
+            # rows, in order: term starts after the cohort start date,
+            # term starts on the cohort start date, term starts before it
+            pd.DataFrame(
+                {
+                    "cohort_start_dt": ["2019-09-01", "2019-09-01", "2021-02-01"],
+                    "term_start_dt": ["2020-02-01", "2019-09-01", "2020-09-01"],
+                },
+                dtype="datetime64[s]",
+            ),
+            "cohort_start_dt",
+            "term_start_dt",
+            # only the last row is expected to be flagged; equal dates are not
+            pd.Series([False, False, True], dtype="boolean"),
+        ),
+    ],
+)
+def test_term_is_pre_cohort(df, ccol, tcol, exp):
+    """Check that only terms starting before the cohort start date are flagged."""
+    obs = student_term.term_is_pre_cohort(
+        df, cohort_start_dt_col=ccol, term_start_dt_col=tcol
+    )
+    assert isinstance(obs, pd.Series) and not obs.empty
+    # compare() is only evaluated when equals() is False
+    assert obs.equals(exp) or obs.compare(exp).empty
+
+
+@pytest.mark.parametrize(
+    ["df", "study_area_col", "exp"],
+    [
+        (
+            pd.DataFrame(
+                {
+                    "study_area_term_1": ["01", "02", None, "03"],
+                    "study_area_year_1": ["01", "03", None, "03"],
+                    "course_subject_areas": [
+                        ["01", "01", "01", "02"],
+                        ["01", "02", "01"],
+                        ["01", "02", "03"],
+                        [],
+                    ],
+                }
+            ).astype({"study_area_term_1": "string", "study_area_year_1": "string"}),
+            "study_area_term_1",
+            # "01" matches 3 of 4 courses; "02" matches 1 of 3;
+            # null study area matches nothing; "03" has no courses to match
+            pd.Series([3, 1, 0, 0], dtype="Int8"),
+        ),
+        (
+            pd.DataFrame(
+                {
+                    "study_area_term_1": ["01", "02", None, "03"],
+                    "study_area_year_1": ["01", "03", None, "03"],
+                    "course_subject_areas": [
+                        ["01", "01", "01", "02"],
+                        ["01", "02", "01"],
+                        ["01", "02", "03"],
+                        [],
+                    ],
+                }
+            ).astype({"study_area_term_1": "string", "study_area_year_1": "string"}),
+            "study_area_year_1",
+            # same courses as above, but the second row's study area is "03",
+            # which none of its courses match
+            pd.Series([3, 0, 0, 0], dtype="Int8"),
+        ),
+    ],
+)
+def test_num_courses_in_study_area(df, study_area_col, exp):
+    """Check per-row counts of courses whose subject area equals the study area."""
+    obs = student_term.num_courses_in_study_area(df, study_area_col=study_area_col)
+    assert isinstance(obs, pd.Series) and not obs.empty
+    # compare() is only evaluated when equals() is False
+    assert obs.equals(exp) or obs.compare(exp).empty
+
+
 @pytest.mark.parametrize(
     ["df", "numer_col", "denom_col", "exp"],
     [

diff --git a/tests/analysis/pdp/targets/test_shared.py b/tests/analysis/pdp/targets/test_shared.py
@@ -61,6 +61,16 @@ def test_df():
             ],
             "num_credits_earned": [25.0, 30.0, 35.0, 25.0, 35.0, 20.0, 45.0, 10.0],
             "term_rank": [3, 4, 5, 5, 6, 2, 4, 8],
+            "term_is_pre_cohort": [
+                True,
+                False,
+                False,
+                False,
+                False,
+                False,
+                False,
+                False,
+            ],
         },
     ).astype(
         {
@@ -253,6 +263,7 @@ def test_select_students_by_next_year_course_data(test_df, exp):
                     ],
                     "num_credits_earned": [20.0, 25.0, 45.0, 25.0, 10.0],
                     "term_rank": [2, 3, 4, 5, 8],
+                    "term_is_pre_cohort": [False, True, False, False, False],
                 }
             ).astype(
                 {
@@ -328,24 +339,26 @@ def test_get_first_student_terms_at_num_credits_earned(
             [],
             pd.DataFrame(
                 {
-                    "student_id": ["01", "04", "02", "05"],
-                    "term_rank": [4, 4, 5, 8],
+                    "student_id": ["03", "01", "04", "02", "05"],
+                    "term_rank": [2, 4, 4, 5, 8],
                 }
             ),
         ),
         (
             ["cohort_id", "term_id"],
             pd.DataFrame(
                 {
-                    "student_id": ["01", "04", "02", "05"],
-                    "term_rank": [4, 4, 5, 8],
+                    "student_id": ["03", "01", "04", "02", "05"],
+                    "term_rank": [2, 4, 4, 5, 8],
                     "cohort_id": [
+                        "2019-20 FALL",
                         "2020-21 SPRING",
                         "2020-21 SPRING",
                         "2021-22 FALL",
                         "2022-23 FALL",
                     ],
                     "term_id": [
+                        "2019-20 SPRING",
                         "2020-21 SPRING",
                         "2020-21 SPRING",
                         "2021-22 FALL",