Skip to content

Commit

Permalink
Support StatsPerform MA5 and MA12 feeds
Browse files Browse the repository at this point in the history
  • Loading branch information
probberechts committed Sep 5, 2023
1 parent bdbb783 commit f6d330f
Show file tree
Hide file tree
Showing 6 changed files with 302 additions and 2 deletions.
24 changes: 24 additions & 0 deletions socceraction/data/opta/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
F24XMLParser,
MA1JSONParser,
MA3JSONParser,
MA5JSONParser,
MA12JSONParser,
OptaParser,
WhoScoredParser,
)
Expand All @@ -38,6 +40,8 @@
"f24": F24JSONParser,
"ma1": MA1JSONParser,
"ma3": MA3JSONParser,
"ma5": MA5JSONParser,
"ma12": MA12JSONParser,
}

_xmlparsers = {
Expand Down Expand Up @@ -319,6 +323,26 @@ def _get_parsers_for_feeds(
warnings.warn(f"No parser available for {feed} feeds. This feed is ignored.")
return parsers

def file_names(self, competition_id: int, season_id: int):
    """Return the names of all MA1 feed files of a given season.

    Files are looked up under
    ``<root>/<competition_id>/<season_id>/MA1/<game_id>.json``.

    Parameters
    ----------
    competition_id : int
        The ID of the competition.
    season_id : int
        The ID of the season.

    Returns
    -------
    list of str
        The name of every matching feed file, without its directory and
        without the ``.json`` extension, in sorted order. The list is
        empty when no file matches.
    """
    # Build the pattern from path components so the platform's own
    # separator is used instead of a hard-coded backslash.
    glob_pattern = os.path.join(
        self.root, str(competition_id), str(season_id), "MA1", "*.json"
    )
    # Derive each name from the path itself; slicing at fixed character
    # offsets only works for one particular root directory length.
    return sorted(
        os.path.splitext(os.path.basename(path))[0]
        for path in glob.glob(glob_pattern)
    )

def competitions(self) -> DataFrame[OptaCompetitionSchema]:
"""Return a dataframe with all available competitions and seasons.
Expand Down
4 changes: 4 additions & 0 deletions socceraction/data/opta/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
'F24XMLParser',
'MA1JSONParser',
'MA3JSONParser',
'MA5JSONParser',
'MA12JSONParser',
'WhoScoredParser',
]

Expand All @@ -20,4 +22,6 @@
from .f24_xml import F24XMLParser
from .ma1_json import MA1JSONParser
from .ma3_json import MA3JSONParser
from .ma5_json import MA5JSONParser
from .ma12_json import MA12JSONParser
from .whoscored import WhoScoredParser
168 changes: 168 additions & 0 deletions socceraction/data/opta/parsers/ma12_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
"""JSON parser for Stats Perform MA1 feeds."""
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd

from ...base import MissingDataError
from .base import OptaJSONParser, assertget


class MA12JSONParser(OptaJSONParser):
    """Extract data from a Stats Perform MA12 data stream.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def _get_matches(self) -> List[Dict[str, Any]]:
        """Return the match objects contained in the feed.

        Returns
        -------
        list of dict
            ``[root]`` when the root object has a ``matchInfo`` key,
            otherwise the value stored under the root's ``match`` key.

        Raises
        ------
        MissingDataError
            If the root has neither a ``matchInfo`` nor a ``match`` key.
        """
        if "matchInfo" in self.root:
            return [self.root]
        if "match" in self.root:
            return self.root["match"]
        raise MissingDataError

    def _get_match_info(self) -> Dict[str, Any]:
        """Return the ``matchInfo`` object of the feed.

        Raises
        ------
        MissingDataError
            If the root has no ``matchInfo`` key.
        """
        if "matchInfo" in self.root:
            return self.root["matchInfo"]
        raise MissingDataError

    def _get_live_data(self) -> Dict[str, Any]:
        """Return the ``liveData`` object of the feed.

        Raises
        ------
        MissingDataError
            If the root has no ``liveData`` key.
        """
        if "liveData" in self.root:
            return self.root["liveData"]
        raise MissingDataError

    def _get_name(self, obj: Dict[str, Any]) -> Optional[str]:
        """Return a display name for ``obj``.

        Uses ``name`` when present, otherwise ``firstName`` and ``lastName``
        joined by a space. Returns None when neither ``name`` nor
        ``firstName`` is present.
        """
        if "name" in obj:
            return assertget(obj, "name")
        if "firstName" in obj:
            return f"{assertget(obj, 'firstName')} {assertget(obj, 'lastName')}"
        return None

    @staticmethod
    def _extract_team_id(teams: List[Dict[str, str]], side: str) -> Optional[str]:
        """Return the ``id`` of the first team whose ``position`` equals ``side``.

        Raises
        ------
        MissingDataError
            If no team has the requested position.
        """
        for team in teams:
            if assertget(team, "position") == side:
                return assertget(team, "id")
        raise MissingDataError

    @staticmethod
    def _extract_player_stats(player: Dict[str, Any]) -> Tuple[Any, Any]:
        """Return ``(xG_non_penalty, minutes_played)`` for one line-up entry.

        Values are returned exactly as stored in the feed. A stat that is
        not listed keeps its default (``0.0`` for xG, ``0`` for minutes).
        """
        xg_non_penalty: Any = 0.0
        minutes_played: Any = 0
        # A missing or null "stat" entry means no stats were recorded.
        for stat in player.get("stat") or []:
            stat_type = assertget(stat, "type")
            # Scan the whole list: stopping at the first hit would drop
            # whichever of the two stats happens to be listed later.
            if stat_type == "expectedGoalsNonpenalty":
                xg_non_penalty = assertget(stat, "value")
            elif stat_type == "minsPlayed":
                minutes_played = assertget(stat, "value")
        return xg_non_penalty, minutes_played

    def extract_players(self) -> Dict[Tuple[str, str], Dict[str, Any]]:
        """Return a dictionary with all available players.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each player in the data stream.

        Raises
        ------
        MissingDataError
            If the feed lacks ``matchInfo`` or ``liveData``, or lists fewer
            than two contestants or line-ups.
        """
        match_info = self._get_match_info()
        game_id = assertget(match_info, "id")
        live_data = self._get_live_data()
        line_up = assertget(live_data, "lineUp")
        contestants = assertget(match_info, "contestant")
        if len(contestants) < 2 or len(line_up) < 2:
            raise MissingDataError

        # NOTE(review): teams and line-ups are paired by list position
        # (first entry with first entry, second with second) -- confirm
        # that the feed guarantees both lists share the same order.
        team_ids = [assertget(contestants[0], "id"), assertget(contestants[1], "id")]

        players: Dict[Tuple[str, str], Dict[str, Any]] = {}
        for team_id, team_line_up in zip(team_ids, line_up[:2]):
            for player in assertget(team_line_up, "player"):
                player_id = assertget(player, "playerId")
                player_name = assertget(player, "matchName")
                starting_position = assertget(player, "position")
                jersey_number = assertget(player, "shirtNumber")
                is_starter = starting_position != "Substitute"
                # Only starters are required to carry a position side.
                position_side = assertget(player, "positionSide") if is_starter else ""
                xg_non_penalty, minutes_played = self._extract_player_stats(player)

                players[(game_id, player_id)] = {
                    # Fields required by the base schema
                    "game_id": game_id,
                    "team_id": team_id,
                    "player_id": player_id,
                    "player_name": player_name,
                    "is_starter": is_starter,
                    "minutes_played": minutes_played,
                    "jersey_number": jersey_number,
                    # Fields required by the opta schema
                    "starting_position": starting_position,
                    "position_side": position_side,
                    "xG_non_penalty": xg_non_penalty,
                }
        return players
8 changes: 6 additions & 2 deletions socceraction/data/opta/parsers/ma3_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,20 +86,24 @@ def extract_games(self) -> Dict[str, Dict[str, Any]]:
game_date = assertget(match_info, "date")[0:10]
game_time = assertget(match_info, "time")[0:8]
game_datetime = f"{game_date}T{game_time}"
# game_duration = assertget(match_details, "matchLengthMin"),
game_duration = match_details.get("matchLengthMin")
if game_duration is None:
game_duration = 93
return {
game_id: dict(
# Fields required by the base schema
game_id=game_id,
season_id=assertget(season, "id"),
competition_id=assertget(competition, "id"),
game_day=int(assertget(match_info, "week")),
game_day=int(match_info["week"]) if "week" in match_info else None,
game_date=datetime.strptime(game_datetime, "%Y-%m-%dT%H:%M:%S"),
home_team_id=self._extract_team_id(contestant, "home"),
away_team_id=self._extract_team_id(contestant, "away"),
# Optional fields
home_score=home_score,
away_score=away_score,
duration=assertget(match_details, "matchLengthMin"),
duration=game_duration,
# referee=?
venue=assertget(venue, "shortName"),
# attendance=?
Expand Down
87 changes: 87 additions & 0 deletions socceraction/data/opta/parsers/ma5_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""JSON parser for Stats Perform MA5 feeds."""
from datetime import datetime
from typing import Any, Dict, List

from ...base import MissingDataError
from .base import OptaJSONParser, assertget


class MA5JSONParser(OptaJSONParser):
    """Extract data from a Stats Perform MA5 data stream.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def _get_matches(self) -> List[Dict[str, Any]]:
        """Return the match objects contained in the feed.

        Returns
        -------
        list of dict
            ``[root]`` when the root object has a ``matchInfo`` key,
            otherwise the value stored under the root's ``match`` key.

        Raises
        ------
        MissingDataError
            If the root has neither a ``matchInfo`` nor a ``match`` key.
        """
        if "matchInfo" in self.root:
            return [self.root]
        if "match" in self.root:
            return self.root["match"]
        raise MissingDataError

    def _get_match_info(self) -> Dict[str, Any]:
        """Return the ``matchInfo`` object of the feed.

        Raises
        ------
        MissingDataError
            If the root has no ``matchInfo`` key.
        """
        if "matchInfo" in self.root:
            return self.root["matchInfo"]
        raise MissingDataError

    def _get_live_data(self) -> Dict[str, Any]:
        """Return the ``liveData`` object of the feed.

        Raises
        ------
        MissingDataError
            If the root has no ``liveData`` key.
        """
        if "liveData" in self.root:
            return self.root["liveData"]
        raise MissingDataError

    def extract_games(self) -> Dict[str, Dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.

        Raises
        ------
        MissingDataError
            If the feed lacks ``matchInfo`` or ``liveData``, lists fewer
            than two contestants, or has an empty ``possessionWave`` list.
        """
        match_info = self._get_match_info()
        live_data = self._get_live_data()
        game_id = assertget(match_info, "id")
        season = assertget(match_info, "tournamentCalendar")
        competition = assertget(match_info, "competition")

        contestants = assertget(match_info, "contestant")
        if len(contestants) < 2:
            raise MissingDataError
        # NOTE(review): the first contestant is taken as the home team and
        # the second as the away team -- confirm the feed guarantees it.
        home_team_id = assertget(contestants[0], "id")
        away_team_id = assertget(contestants[1], "id")

        possession = assertget(live_data, "possession")
        possession_waves = assertget(possession, "possessionWave")
        if not possession_waves:
            raise MissingDataError
        # The overall percentages are read from the first wave entry.
        overall = assertget(possession_waves[0], "overall")
        away_possession = assertget(overall, "away")
        home_possession = assertget(overall, "home")

        # Keep only the "YYYY-MM-DD" and "HH:MM:SS" prefixes of the fields.
        game_date = assertget(match_info, "date")[0:10]
        game_time = assertget(match_info, "time")[0:8]
        game_datetime = f"{game_date}T{game_time}"
        return {
            game_id: dict(
                # Fields required by the base schema
                game_id=game_id,
                season_id=assertget(season, "id"),
                competition_id=assertget(competition, "id"),
                game_day=int(match_info["week"]) if "week" in match_info else None,
                game_date=datetime.strptime(game_datetime, "%Y-%m-%dT%H:%M:%S"),
                home_team_id=home_team_id,
                away_team_id=away_team_id,
                # Optional fields
                home_possession=home_possession,
                away_possession=away_possession,
            )
        }
13 changes: 13 additions & 0 deletions socceraction/data/opta/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ class OptaGameSchema(GameSchema):
"""The name of the manager of the home team."""
away_manager: Optional[Series[str]] = pa.Field(nullable=True)
"""The name of the manager of the away team."""
home_possession: Optional[Series[int]]
"""The possession percentage of the home team in the game."""
away_possession: Optional[Series[int]]
"""The possession percentage of the away team in the game."""


class OptaPlayerSchema(PlayerSchema):
Expand All @@ -45,6 +49,15 @@ class OptaPlayerSchema(PlayerSchema):
starting_position: Series[str]
"""The starting position of the player."""

position_side: Optional[Series[str]]
"""The side of the pitch where the player started the game"""

xG_non_penalty: Optional[Series[int]]
"""The xG of the player without taking penalties into account"""

# minutes_played: Optional[Series[int]]
# """The amount of minutes played by the player"""


class OptaTeamSchema(TeamSchema):
"""Definition of a dataframe containing the list of teams of a game."""
Expand Down

0 comments on commit f6d330f

Please sign in to comment.