Skip to content

Commit

Permalink
Merge pull request #39 from datakind/pdp-add-new-features
Browse files Browse the repository at this point in the history
[pdp] Add some new features
  • Loading branch information
bdewilde authored Dec 5, 2024
2 parents 5c4d585 + 83d95a6 commit 9169ba3
Show file tree
Hide file tree
Showing 7 changed files with 142 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def add_features(
("term_in_peak_covid", "sum"),
("term_is_fall_spring", "sum"),
("term_is_while_student_enrolled_at_other_inst", "sum"),
("term_is_pre_cohort", "sum"),
("course_level_mean", ["mean", "min", "std"]),
("course_grade_numeric_mean", ["mean", "min", "std"]),
("num_courses", ["sum", "mean", "min"]),
Expand Down
33 changes: 33 additions & 0 deletions src/student_success_tool/analysis/pdp/features/student_term.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,12 +148,21 @@ def add_features(
feature_name_funcs = (
{
"year_of_enrollment_at_cohort_inst": year_of_enrollment_at_cohort_inst,
"term_is_pre_cohort": term_is_pre_cohort,
"term_is_while_student_enrolled_at_other_inst": term_is_while_student_enrolled_at_other_inst,
"frac_credits_earned": shared.frac_credits_earned,
"student_term_enrollment_intensity": ft.partial(
student_term_enrollment_intensity,
min_num_credits_full_time=min_num_credits_full_time,
),
"num_courses_in_program_of_study_area_term_1": ft.partial(
num_courses_in_study_area,
study_area_col="student_program_of_study_area_term_1",
),
"num_courses_in_program_of_study_area_year_1": ft.partial(
num_courses_in_study_area,
study_area_col="student_program_of_study_area_year_1",
),
}
| {
fc_col: ft.partial(compute_frac_courses, numer_col=nc_col)
Expand Down Expand Up @@ -195,13 +204,37 @@ def year_of_enrollment_at_cohort_inst(
return pd.Series(np.ceil((dts_diff + 1) / 365.25), dtype="Int8")


def term_is_pre_cohort(
    df: pd.DataFrame,
    *,
    cohort_start_dt_col: str = "cohort_start_dt",
    term_start_dt_col: str = "term_start_dt",
) -> pd.Series:
    """
    Flag rows whose term start date is strictly earlier than the cohort start date.

    Args:
        df: Data containing both date columns.
        cohort_start_dt_col: Name of the column holding cohort start dates.
        term_start_dt_col: Name of the column holding term start dates.

    Returns:
        Series of nullable "boolean" dtype; True where term start < cohort start.
    """
    term_starts = df[term_start_dt_col]
    cohort_starts = df[cohort_start_dt_col]
    # strict comparison: a term starting exactly on the cohort start date
    # does not count as "pre-cohort"
    started_before_cohort = term_starts.lt(cohort_starts)
    return started_before_cohort.astype("boolean")


# TODO: we could probably compute this directly, w/o an intermediate feature?
def term_is_while_student_enrolled_at_other_inst(
    df: pd.DataFrame, *, col: str = "num_courses_enrolled_at_other_institution_s_Y"
) -> pd.Series:
    """
    Flag rows where the value in ``col`` is greater than zero.

    Args:
        df: Data containing ``col``.
        col: Name of the numeric column to test against zero.

    Returns:
        Series that is True wherever ``df[col]`` is strictly positive.
    """
    values = df[col]
    return values > 0


def num_courses_in_study_area(
    df: pd.DataFrame,
    *,
    study_area_col: str,
    course_subject_areas_col: str = "course_subject_areas",
    fill_value: str = "-1",
) -> pd.Series:
    """
    Count, per row, how many entries of the row's course subject areas list
    are equal to the row's study area value.

    Args:
        df: Data containing both ``study_area_col`` and ``course_subject_areas_col``.
        study_area_col: Name of the column holding a single study area per row.
        course_subject_areas_col: Name of the column holding, per row,
            a list of course subject areas.
        fill_value: Replacement for null study areas before comparison, so that
            rows without a study area compare unequal to their subject areas
            (provided ``fill_value`` never occurs as a subject area).

    Returns:
        Series of "Int8" dtype, indexed like ``df``, with the per-row match counts.
    """
    # expand the per-row lists into a (ragged, NA-padded) frame: one column per
    # list position. The frame must carry df's index: the comparison below aligns
    # on index labels, so a default RangeIndex here would mis-align (and change
    # the length of) the result whenever df's index isn't 0..n-1
    subject_areas = pd.DataFrame(
        df[course_subject_areas_col].tolist(), index=df.index, dtype="string"
    )
    study_areas = df[study_area_col].fillna(fill_value)
    is_in_study_area = subject_areas.eq(study_areas, axis="index")
    # NA-padded positions are skipped by the sum, so they contribute nothing
    return is_in_study_area.sum(axis="columns").astype("Int8")


def compute_frac_courses(
df: pd.DataFrame, *, numer_col: str, denom_col: str = "num_courses"
) -> pd.Series:
Expand Down
26 changes: 17 additions & 9 deletions src/student_success_tool/analysis/pdp/targets/failure_to_retain.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@ def make_labeled_dataset(
df: pd.DataFrame,
*,
student_criteria: dict[str, object | Collection[object]],
exclude_pre_cohort_terms: bool = True,
student_id_cols: str | list[str] = "student_guid",
cohort_id_col: str = "cohort_id",
term_id_col: str = "term_id",
term_is_pre_cohort_col: str = "term_is_pre_cohort",
term_rank_col: str = "term_rank",
retention_col: str = "retention",
) -> pd.DataFrame:
Expand All @@ -39,15 +41,21 @@ def make_labeled_dataset(
df_eligible_student_terms = pd.merge(
df, df_eligible_students, on=student_id_cols, how="inner"
)
df_features = shared.get_first_student_terms_within_cohort(
df_eligible_student_terms,
student_id_cols=student_id_cols,
cohort_id_col=cohort_id_col,
term_id_col=term_id_col,
term_rank_col=term_rank_col,
sort_cols=term_rank_col,
include_cols=None,
)
if exclude_pre_cohort_terms is True:
df_features = shared.get_first_student_terms_within_cohort(
df_eligible_student_terms,
student_id_cols=student_id_cols,
term_is_pre_cohort_col=term_is_pre_cohort_col,
sort_cols=term_rank_col,
include_cols=None,
)
else:
df_features = shared.get_first_student_terms(
df_eligible_student_terms,
student_id_cols=student_id_cols,
sort_cols=term_rank_col,
include_cols=None,
)
df_targets = compute_target_variable(
df_eligible_student_terms,
student_id_cols=student_id_cols,
Expand Down
18 changes: 3 additions & 15 deletions src/student_success_tool/analysis/pdp/targets/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,9 +240,7 @@ def get_first_student_terms_within_cohort(
df: pd.DataFrame,
*,
student_id_cols: str | list[str] = "student_guid",
cohort_id_col: str = "cohort_id",
term_id_col: str = "term_id",
term_rank_col: str = "term_rank",
term_is_pre_cohort_col: str = "term_is_pre_cohort",
sort_cols: str | list[str] = "term_rank",
include_cols: t.Optional[list[str]] = None,
) -> pd.DataFrame:
Expand All @@ -260,19 +258,9 @@ def get_first_student_terms_within_cohort(
sort_cols
include_cols
"""
student_id_cols = utils.to_list(student_id_cols)
# TODO: handle students w/o any courses in their cohort term?
student_cohort_term_ranks = (
df.loc[df[cohort_id_col].eq(df[term_id_col]), student_id_cols + [term_rank_col]]
.rename(columns={term_rank_col: "student_cohort_term_rank"})
) # fmt: off
df_within_cohort = (
pd.merge(df, student_cohort_term_ranks, on=student_id_cols)
.loc[lambda df: df[term_rank_col].ge(df["student_cohort_term_rank"]), :]
.drop(columns="student_cohort_term_rank")
)
return get_first_student_terms(
df_within_cohort,
# exclude rows that are "pre-cohort", so "first" meets our criteria here
df.loc[df[term_is_pre_cohort_col].eq(False), :],
student_id_cols=student_id_cols,
sort_cols=sort_cols,
include_cols=include_cols,
Expand Down
3 changes: 3 additions & 0 deletions tests/analysis/pdp/features/test_cumulative.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def df():
],
"term_rank": [1, 3, 4, 5, 9],
"term_rank_fall_spring": [1, 2, pd.NA, 3, 5],
"term_is_pre_cohort": [True, False, False, False, False],
}
).astype({"term_rank": "Int8", "term_rank_fall_spring": "Int8"})

Expand All @@ -64,12 +65,14 @@ def df_grped(df):
["num_courses_course_level_0", "num_courses_course_level_1"],
[
("term_id", "count"),
("term_is_pre_cohort", "sum"),
("course_grade_num_mean", ["mean", "std"]),
("num_courses", "sum"),
],
pd.DataFrame(
{
"term_id_cumcount": [1.0, 2.0, 3.0, 4.0, 5.0],
"term_is_pre_cohort_cumsum": [1.0, 1.0, 1.0, 1.0, 1.0],
"course_grade_num_mean_cummean": [4.0, 3.25, 2.75, 2.875, 3.0],
"course_grade_num_mean_cumstd": [
np.nan,
Expand Down
68 changes: 68 additions & 0 deletions tests/analysis/pdp/features/test_student_term.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,74 @@ def test_year_of_enrollment_at_cohort_inst(df, ccol, tcol, exp):
assert obs.equals(exp) or obs.compare(exp).empty


@pytest.mark.parametrize(
    ["df", "ccol", "tcol", "exp"],
    [
        (
            pd.DataFrame(
                {
                    "cohort_start_dt": ["2019-09-01", "2019-09-01", "2021-02-01"],
                    "term_start_dt": ["2020-02-01", "2019-09-01", "2020-09-01"],
                },
                dtype="datetime64[s]",
            ),
            "cohort_start_dt",
            "term_start_dt",
            pd.Series([False, False, True], dtype="boolean"),
        ),
    ],
)
def test_term_is_pre_cohort(df, ccol, tcol, exp):
    """Check the pre-cohort flag for terms after, on, and before cohort start."""
    result = student_term.term_is_pre_cohort(
        df, term_start_dt_col=tcol, cohort_start_dt_col=ccol
    )
    assert isinstance(result, pd.Series)
    assert not result.empty
    # accept either strict equality or an empty element-wise diff
    is_identical = result.equals(exp)
    assert is_identical or result.compare(exp).empty


@pytest.mark.parametrize(
    ["df", "study_area_col", "exp"],
    [
        (
            pd.DataFrame(
                {
                    "study_area_term_1": ["01", "02", None, "03"],
                    "study_area_year_1": ["01", "03", None, "03"],
                    "course_subject_areas": [
                        ["01", "01", "01", "02"],
                        ["01", "02", "01"],
                        ["01", "02", "03"],
                        [],
                    ],
                }
            ).astype({"study_area_term_1": "string", "study_area_year_1": "string"}),
            area_col,
            pd.Series(exp_counts, dtype="Int8"),
        )
        # both cases share the same input data (built fresh per case);
        # only the study area column and the expected counts differ
        for area_col, exp_counts in (
            ("study_area_term_1", [3, 1, 0, 0]),
            ("study_area_year_1", [3, 0, 0, 0]),
        )
    ],
)
def test_num_courses_in_study_area(df, study_area_col, exp):
    """Check per-row counts of courses whose subject area matches the study area."""
    result = student_term.num_courses_in_study_area(
        df, study_area_col=study_area_col
    )
    assert isinstance(result, pd.Series)
    assert not result.empty
    # accept either strict equality or an empty element-wise diff
    is_identical = result.equals(exp)
    assert is_identical or result.compare(exp).empty


@pytest.mark.parametrize(
["df", "numer_col", "denom_col", "exp"],
[
Expand Down
21 changes: 17 additions & 4 deletions tests/analysis/pdp/targets/test_shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,16 @@ def test_df():
],
"num_credits_earned": [25.0, 30.0, 35.0, 25.0, 35.0, 20.0, 45.0, 10.0],
"term_rank": [3, 4, 5, 5, 6, 2, 4, 8],
"term_is_pre_cohort": [
True,
False,
False,
False,
False,
False,
False,
False,
],
},
).astype(
{
Expand Down Expand Up @@ -253,6 +263,7 @@ def test_select_students_by_next_year_course_data(test_df, exp):
],
"num_credits_earned": [20.0, 25.0, 45.0, 25.0, 10.0],
"term_rank": [2, 3, 4, 5, 8],
"term_is_pre_cohort": [False, True, False, False, False],
}
).astype(
{
Expand Down Expand Up @@ -328,24 +339,26 @@ def test_get_first_student_terms_at_num_credits_earned(
[],
pd.DataFrame(
{
"student_id": ["01", "04", "02", "05"],
"term_rank": [4, 4, 5, 8],
"student_id": ["03", "01", "04", "02", "05"],
"term_rank": [2, 4, 4, 5, 8],
}
),
),
(
["cohort_id", "term_id"],
pd.DataFrame(
{
"student_id": ["01", "04", "02", "05"],
"term_rank": [4, 4, 5, 8],
"student_id": ["03", "01", "04", "02", "05"],
"term_rank": [2, 4, 4, 5, 8],
"cohort_id": [
"2019-20 FALL",
"2020-21 SPRING",
"2020-21 SPRING",
"2021-22 FALL",
"2022-23 FALL",
],
"term_id": [
"2019-20 SPRING",
"2020-21 SPRING",
"2020-21 SPRING",
"2021-22 FALL",
Expand Down

0 comments on commit 9169ba3

Please sign in to comment.