Skip to content

Commit

Permalink
Use categorical dtype for categorical features
Browse files Browse the repository at this point in the history
The categorical 'actiontype', 'result', 'bodypart' and 'bodypart_detailed'
features now have the pandas "category" dtype (they are built with
pd.Categorical). This allows machine learning frameworks such as xgboost to
automatically recognize these features as being categorical and handle them
accordingly.

The features are now strings (e.g., "pass") instead of integer ids. Also, the
column name for each of these features was renamed as follows:
type_id -> actiontype
result_id -> result
bodypart_id -> bodypart
  • Loading branch information
probberechts committed Oct 10, 2023
1 parent 81cc37a commit 8b1df4c
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 31 deletions.
88 changes: 57 additions & 31 deletions socceraction/vaep/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pandas as pd # type: ignore
from pandera.typing import DataFrame

import socceraction.spadl.config as spadlconfig
import socceraction.spadl.config as spadlcfg
from socceraction.atomic.spadl import AtomicSPADLSchema
from socceraction.spadl.schema import SPADLSchema

Expand Down Expand Up @@ -117,9 +117,9 @@ def play_left_to_right(gamestates: GameStates, home_team_id: int) -> GameStates:
away_idx = a0.team_id != home_team_id
for actions in gamestates:
for col in ['start_x', 'end_x']:
actions.loc[away_idx, col] = spadlconfig.field_length - actions[away_idx][col].values
actions.loc[away_idx, col] = spadlcfg.field_length - actions[away_idx][col].values
for col in ['start_y', 'end_y']:
actions.loc[away_idx, col] = spadlconfig.field_width - actions[away_idx][col].values
actions.loc[away_idx, col] = spadlcfg.field_width - actions[away_idx][col].values
return gamestates


Expand Down Expand Up @@ -169,7 +169,13 @@ def actiontype(actions: Actions) -> Features:
Features
The type of each action, as a categorical 'actiontype' column.
"""
return actions[['type_id']]
X = pd.DataFrame(index=actions.index)
X["actiontype"] = pd.Categorical(
actions["type_id"].replace(spadlcfg.actiontypes_df().type_name.to_dict()),
categories=spadlcfg.actiontypes,
ordered=False,
)
return X


@simple
Expand All @@ -187,8 +193,8 @@ def actiontype_onehot(actions: SPADLActions) -> Features:
A one-hot encoding of each action's type.
"""
X = {}
for type_id, type_name in enumerate(spadlconfig.actiontypes):
col = 'type_' + type_name
for type_id, type_name in enumerate(spadlcfg.actiontypes):
col = 'actiontype_' + type_name
X[col] = actions['type_id'] == type_id
return pd.DataFrame(X, index=actions.index)

Expand All @@ -207,7 +213,13 @@ def result(actions: SPADLActions) -> Features:
Features
The result of each action, as a categorical 'result' column.
"""
return actions[['result_id']]
X = pd.DataFrame(index=actions.index)
X["result"] = pd.Categorical(
actions["result_id"].replace(spadlcfg.results_df().result_name.to_dict()),
categories=spadlcfg.actiontypes,
ordered=False,
)
return X


@simple
Expand All @@ -225,7 +237,7 @@ def result_onehot(actions: SPADLActions) -> Features:
The one-hot encoding of each action's result.
"""
X = {}
for result_id, result_name in enumerate(spadlconfig.results):
for result_id, result_name in enumerate(spadlcfg.results):
col = 'result_' + result_name
X[col] = actions['result_id'] == result_id
return pd.DataFrame(X, index=actions.index)
Expand Down Expand Up @@ -275,10 +287,18 @@ def bodypart(actions: Actions) -> Features:
bodypart_detailed :
An alternative version that splits between the left and right foot.
"""
foot_id = spadlconfig.bodyparts.index("foot")
left_foot_id = spadlconfig.bodyparts.index("foot_left")
right_foot_id = spadlconfig.bodyparts.index("foot_right")
return actions[['bodypart_id']].replace([left_foot_id, right_foot_id], foot_id)
X = pd.DataFrame(index=actions.index)
foot_id = spadlcfg.bodyparts.index("foot")
left_foot_id = spadlcfg.bodyparts.index("foot_left")
right_foot_id = spadlcfg.bodyparts.index("foot_right")
X["bodypart"] = pd.Categorical(
actions["bodypart_id"]
.replace([left_foot_id, right_foot_id], foot_id)
.replace(spadlcfg.bodyparts_df().bodypart_name.to_dict()),
categories=["foot", "head", "other", "head/other"],
ordered=False,
)
return X


@simple
Expand All @@ -303,7 +323,13 @@ def bodypart_detailed(actions: Actions) -> Features:
bodypart :
An alternative version that does not split between the left and right foot.
"""
return actions[['bodypart_id']]
X = pd.DataFrame(index=actions.index)
X["bodypart"] = pd.Categorical(
actions["bodypart_id"].replace(spadlcfg.bodyparts_df().bodypart_name.to_dict()),
categories=spadlcfg.bodyparts,
ordered=False,
)
return X


@simple
Expand All @@ -328,19 +354,19 @@ def bodypart_onehot(actions: Actions) -> Features:
An alternative version that splits between the left and right foot.
"""
X = {}
for bodypart_id, bodypart_name in enumerate(spadlconfig.bodyparts):
for bodypart_id, bodypart_name in enumerate(spadlcfg.bodyparts):
if bodypart_name in ('foot_left', 'foot_right'):
continue
col = 'bodypart_' + bodypart_name
if bodypart_name == 'foot':
foot_id = spadlconfig.bodyparts.index("foot")
left_foot_id = spadlconfig.bodyparts.index("foot_left")
right_foot_id = spadlconfig.bodyparts.index("foot_right")
foot_id = spadlcfg.bodyparts.index("foot")
left_foot_id = spadlcfg.bodyparts.index("foot_left")
right_foot_id = spadlcfg.bodyparts.index("foot_right")
X[col] = actions['bodypart_id'].isin([foot_id, left_foot_id, right_foot_id])
elif bodypart_name == 'head/other':
head_id = spadlconfig.bodyparts.index("head")
other_id = spadlconfig.bodyparts.index("other")
head_other_id = spadlconfig.bodyparts.index("head/other")
head_id = spadlcfg.bodyparts.index("head")
other_id = spadlcfg.bodyparts.index("other")
head_other_id = spadlcfg.bodyparts.index("head/other")
X[col] = actions['bodypart_id'].isin([head_id, other_id, head_other_id])
else:
X[col] = actions['bodypart_id'] == bodypart_id
Expand Down Expand Up @@ -370,17 +396,17 @@ def bodypart_detailed_onehot(actions: Actions) -> Features:
An alternative version that does not split between the left and right foot.
"""
X = {}
for bodypart_id, bodypart_name in enumerate(spadlconfig.bodyparts):
for bodypart_id, bodypart_name in enumerate(spadlcfg.bodyparts):
col = 'bodypart_' + bodypart_name
if bodypart_name == 'foot':
foot_id = spadlconfig.bodyparts.index("foot")
left_foot_id = spadlconfig.bodyparts.index("foot_left")
right_foot_id = spadlconfig.bodyparts.index("foot_right")
foot_id = spadlcfg.bodyparts.index("foot")
left_foot_id = spadlcfg.bodyparts.index("foot_left")
right_foot_id = spadlcfg.bodyparts.index("foot_right")
X[col] = actions['bodypart_id'].isin([foot_id, left_foot_id, right_foot_id])
elif bodypart_name == 'head/other':
head_id = spadlconfig.bodyparts.index("head")
other_id = spadlconfig.bodyparts.index("other")
head_other_id = spadlconfig.bodyparts.index("head/other")
head_id = spadlcfg.bodyparts.index("head")
other_id = spadlcfg.bodyparts.index("other")
head_other_id = spadlcfg.bodyparts.index("head/other")
X[col] = actions['bodypart_id'].isin([head_id, other_id, head_other_id])
else:
X[col] = actions['bodypart_id'] == bodypart_id
Expand Down Expand Up @@ -450,8 +476,8 @@ def endlocation(actions: SPADLActions) -> Features:
return actions[['end_x', 'end_y']]


_goal_x: float = spadlconfig.field_length
_goal_y: float = spadlconfig.field_width / 2
_goal_x: float = spadlcfg.field_length
_goal_y: float = spadlcfg.field_width / 2


@simple
Expand Down Expand Up @@ -622,10 +648,10 @@ def goalscore(gamestates: GameStates) -> Features:
actions = gamestates[0]
teamA = actions['team_id'].values[0]
goals = actions['type_name'].str.contains('shot') & (
actions['result_id'] == spadlconfig.results.index('success')
actions['result_id'] == spadlcfg.results.index('success')
)
owngoals = actions['type_name'].str.contains('shot') & (
actions['result_id'] == spadlconfig.results.index('owngoal')
actions['result_id'] == spadlcfg.results.index('owngoal')
)
teamisA = actions['team_id'] == teamA
teamisB = ~teamisA
Expand Down
21 changes: 21 additions & 0 deletions tests/vaep/test_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,27 @@ def test_same_index(spadl_actions: DataFrame[SPADLSchema]) -> None:
tm.assert_index_equal(features.index, spadl_actions.index)


def test_actiontype(spadl_actions: DataFrame[SPADLSchema]) -> None:
    """The ``actiontype`` feature has one row per action and three columns."""
    # 782 is passed as the ``home_team_id`` argument of ``play_left_to_right``.
    states = fs.play_left_to_right(fs.gamestates(spadl_actions), 782)
    features = fs.actiontype(states)
    expected_shape = (len(spadl_actions), 3)
    assert features.shape == expected_shape


def test_actiontype_onehot(spadl_actions: DataFrame[SPADLSchema]) -> None:
    """The one-hot action type feature has three columns per action type."""
    # 782 is passed as the ``home_team_id`` argument of ``play_left_to_right``.
    states = fs.play_left_to_right(fs.gamestates(spadl_actions), 782)
    features = fs.actiontype_onehot(states)
    expected_shape = (len(spadl_actions), len(spadl.config.actiontypes) * 3)
    assert features.shape == expected_shape


def test_result(spadl_actions: DataFrame[SPADLSchema]) -> None:
    """The ``result`` feature has one row per action and three columns."""
    # 782 is passed as the ``home_team_id`` argument of ``play_left_to_right``.
    states = fs.play_left_to_right(fs.gamestates(spadl_actions), 782)
    features = fs.result(states)
    expected_shape = (len(spadl_actions), 3)
    assert features.shape == expected_shape


def test_result_onehot(spadl_actions: DataFrame[SPADLSchema]) -> None:
gamestates = fs.gamestates(spadl_actions)
ltr_gamestates = fs.play_left_to_right(gamestates, 782)
Expand All @@ -64,6 +78,13 @@ def test_actiontype_result_onehot(spadl_actions: DataFrame[SPADLSchema]) -> None
)


def test_bodypart(spadl_actions: DataFrame[SPADLSchema]) -> None:
    """The ``bodypart`` feature has one row per action and three columns."""
    # 782 is passed as the ``home_team_id`` argument of ``play_left_to_right``.
    states = fs.play_left_to_right(fs.gamestates(spadl_actions), 782)
    features = fs.bodypart(states)
    expected_shape = (len(spadl_actions), 3)
    assert features.shape == expected_shape


def test_bodypart_onehot(spadl_actions: DataFrame[SPADLSchema]) -> None:
gamestates = fs.gamestates(spadl_actions)
ltr_gamestates = fs.play_left_to_right(gamestates, 782)
Expand Down

0 comments on commit 8b1df4c

Please sign in to comment.