From f6d330f1b87bc1e9ed99654a676d3f944563a26e Mon Sep 17 00:00:00 2001 From: Pieter Robberechts Date: Tue, 5 Sep 2023 19:57:28 +0200 Subject: [PATCH] Support StatsPerform MA5 and MA12 feeds --- socceraction/data/opta/loader.py | 24 +++ socceraction/data/opta/parsers/__init__.py | 4 + socceraction/data/opta/parsers/ma12_json.py | 168 ++++++++++++++++++++ socceraction/data/opta/parsers/ma3_json.py | 8 +- socceraction/data/opta/parsers/ma5_json.py | 87 ++++++++++ socceraction/data/opta/schema.py | 13 ++ 6 files changed, 302 insertions(+), 2 deletions(-) create mode 100644 socceraction/data/opta/parsers/ma12_json.py create mode 100644 socceraction/data/opta/parsers/ma5_json.py diff --git a/socceraction/data/opta/loader.py b/socceraction/data/opta/loader.py index 309f2750..e13f364d 100644 --- a/socceraction/data/opta/loader.py +++ b/socceraction/data/opta/loader.py @@ -21,6 +21,8 @@ F24XMLParser, MA1JSONParser, MA3JSONParser, + MA5JSONParser, + MA12JSONParser, OptaParser, WhoScoredParser, ) @@ -38,6 +40,8 @@ "f24": F24JSONParser, "ma1": MA1JSONParser, "ma3": MA3JSONParser, + "ma5": MA5JSONParser, + "ma12": MA12JSONParser, } _xmlparsers = { @@ -319,6 +323,26 @@ def _get_parsers_for_feeds( warnings.warn(f"No parser available for {feed} feeds. This feed is ignored.") return parsers + def file_names(self, competition_id: int, season_id: int): + """Return a list of all the file names in the data folder of a given season. + + Parameters + ---------- + competition_id : int + The ID of the competition. + season_id : int + The ID of the season. 
+ """ + # feed = 'ma1' + feed_pattern = '{competition_id}\\{season_id}\\MA1\\{game_id}.json' + glob_pattern = feed_pattern.format( + competition_id=competition_id, season_id=season_id, game_id="*" + ) + feed_files = glob.glob(os.path.join(self.root, glob_pattern)) + for i in range(len(feed_files)): + feed_files[i] = feed_files[i][25:][:-5] + return feed_files + def competitions(self) -> DataFrame[OptaCompetitionSchema]: """Return a dataframe with all available competitions and seasons. diff --git a/socceraction/data/opta/parsers/__init__.py b/socceraction/data/opta/parsers/__init__.py index 89e034ea..e5a954b9 100644 --- a/socceraction/data/opta/parsers/__init__.py +++ b/socceraction/data/opta/parsers/__init__.py @@ -9,6 +9,8 @@ 'F24XMLParser', 'MA1JSONParser', 'MA3JSONParser', + 'MA5JSONParser', + 'MA12JSONParser', 'WhoScoredParser', ] @@ -20,4 +22,6 @@ from .f24_xml import F24XMLParser from .ma1_json import MA1JSONParser from .ma3_json import MA3JSONParser +from .ma5_json import MA5JSONParser +from .ma12_json import MA12JSONParser from .whoscored import WhoScoredParser diff --git a/socceraction/data/opta/parsers/ma12_json.py b/socceraction/data/opta/parsers/ma12_json.py new file mode 100644 index 00000000..8a2c3118 --- /dev/null +++ b/socceraction/data/opta/parsers/ma12_json.py @@ -0,0 +1,168 @@ +"""JSON parser for Stats Perform MA12 feeds.""" +from typing import Any, Dict, List, Optional, Tuple + +import pandas as pd + +from ...base import MissingDataError +from .base import OptaJSONParser, assertget + + +class MA12JSONParser(OptaJSONParser): + """Extract data from a Stats Perform MA12 data stream. + + Parameters + ---------- + path : str + Path of the data file. 
+ """ + + def _get_matches(self) -> List[Dict[str, Any]]: + if 'matchInfo' in self.root: + return [self.root] + if 'match' in self.root: + return self.root['match'] + raise MissingDataError + + def _get_match_info(self) -> Dict[str, Any]: + if "matchInfo" in self.root: + return self.root["matchInfo"] + raise MissingDataError + + def _get_live_data(self) -> Dict[str, Any]: + if "liveData" in self.root: + return self.root["liveData"] + raise MissingDataError + + def _get_name(self, obj: Dict[str, Any]) -> Optional[str]: + if "name" in obj: + return assertget(obj, "name") + if "firstName" in obj: + return f"{assertget(obj, 'firstName')} {assertget(obj, 'lastName')}" + return None + + @staticmethod + def _extract_team_id(teams: List[Dict[str, str]], side: str) -> Optional[str]: + for team in teams: + team_side = assertget(team, "position") + if team_side == side: + team_id = assertget(team, "id") + return team_id + raise MissingDataError + + def extract_players(self) -> Dict[Tuple[str, str], Dict[str, Any]]: # noqa: C901 + """Return a dictionary with all available players. + + Returns + ------- + dict + A mapping between (game ID, player ID) tuples and the information + available about each player in the data stream. 
+ """ + match_info = self._get_match_info() + game_id = assertget(match_info, "id") + live_data = self._get_live_data() + line_up = assertget(live_data, "lineUp") + + players_data: Dict[str, List[Any]] = { + # "game_id": [], + "team_id": [], + "player_id": [], + "player_name": [], + # "is_starter": [], + "starting_position": [], + "minutes_played": [], + "jersey_number": [], + "position_side": [], + "xG_non_penalty": [], + } + + team_info = assertget(match_info, "contestant") + home_team_info = team_info[0] + away_team_info = team_info[1] + + home_team_id = assertget(home_team_info, "id") + away_team_id = assertget(away_team_info, "id") + + team_home = line_up[0] + team_away = line_up[1] + for player in assertget(team_home, "player"): + # players_data["game_id"] += [game_id] + players_data["team_id"] += [home_team_id] + players_data["player_id"] += [assertget(player, "playerId")] + players_data["player_name"] += [assertget(player, "matchName")] + + starting_position = assertget(player, "position") + players_data["starting_position"] += [starting_position] + # players_data["is_starter"] += (starting_position != "Substitute") + + players_data["jersey_number"] += [assertget(player, "shirtNumber")] + + if starting_position != "Substitute": + players_data["position_side"] += [assertget(player, "positionSide")] + else: + players_data["position_side"] += [""] + + xG_non_penalty = 0.0 + minutesPlayed = 0 + if "stat" in player and player["stat"] is not None: + player_stats = assertget(player, "stat") + for stat in player_stats: + stat_type = assertget(stat, "type") + if stat_type == "expectedGoalsNonpenalty": + xG_non_penalty = assertget(stat, "value") + break + elif stat_type == "minsPlayed": + minutesPlayed = assertget(stat, "value") + players_data["xG_non_penalty"] += [xG_non_penalty] + players_data["minutes_played"] += [minutesPlayed] + + for player in assertget(team_away, "player"): + # players_data["game_id"] += game_id + players_data["team_id"] += [away_team_id] + 
players_data["player_id"] += [assertget(player, "playerId")] + players_data["player_name"] += [assertget(player, "matchName")] + + starting_position = assertget(player, "position") + players_data["starting_position"] += [starting_position] + # players_data["is_starter"] += (starting_position != "Substitute") + + players_data["jersey_number"] += [assertget(player, "shirtNumber")] + + if starting_position != "Substitute": + players_data["position_side"] += [assertget(player, "positionSide")] + else: + players_data["position_side"] += [""] + xG_non_penalty = 0.0 + minutesPlayed = 0 + if "stat" in player: + player_stats = assertget(player, "stat") + for stat in player_stats: + stat_type = assertget(stat, "type") + if stat_type == "expectedGoalsNonpenalty": + xG_non_penalty = assertget(stat, "value") + break + elif stat_type == "minsPlayed": + minutesPlayed = assertget(stat, "value") + players_data["xG_non_penalty"] += [xG_non_penalty] + players_data["minutes_played"] += [minutesPlayed] + + df_players_data = pd.DataFrame.from_dict(players_data) + + players = {} + for _, player in df_players_data.iterrows(): + is_starter = player.starting_position != "Substitute" + players[(game_id, player.player_id)] = { + # Fields required by the base schema + "game_id": game_id, + "team_id": player.team_id, + "player_id": player.player_id, + "player_name": player.player_name, + "is_starter": is_starter, + "minutes_played": player.minutes_played, + "jersey_number": player.jersey_number, + # Fields required by the opta schema + "starting_position": player.starting_position, + "position_side": player.position_side, + "xG_non_penalty": player.xG_non_penalty, + } + return players diff --git a/socceraction/data/opta/parsers/ma3_json.py b/socceraction/data/opta/parsers/ma3_json.py index df9be4c6..e52328fa 100644 --- a/socceraction/data/opta/parsers/ma3_json.py +++ b/socceraction/data/opta/parsers/ma3_json.py @@ -86,20 +86,24 @@ def extract_games(self) -> Dict[str, Dict[str, Any]]: game_date 
= assertget(match_info, "date")[0:10] game_time = assertget(match_info, "time")[0:8] game_datetime = f"{game_date}T{game_time}" + # game_duration = assertget(match_details, "matchLengthMin"), + game_duration = match_details.get("matchLengthMin") + if game_duration is None: + game_duration = 93 return { game_id: dict( # Fields required by the base schema game_id=game_id, season_id=assertget(season, "id"), competition_id=assertget(competition, "id"), - game_day=int(assertget(match_info, "week")), + game_day=int(match_info["week"]) if "week" in match_info else None, game_date=datetime.strptime(game_datetime, "%Y-%m-%dT%H:%M:%S"), home_team_id=self._extract_team_id(contestant, "home"), away_team_id=self._extract_team_id(contestant, "away"), # Optional fields home_score=home_score, away_score=away_score, - duration=assertget(match_details, "matchLengthMin"), + duration=game_duration, # referee=? venue=assertget(venue, "shortName"), # attendance=? diff --git a/socceraction/data/opta/parsers/ma5_json.py b/socceraction/data/opta/parsers/ma5_json.py new file mode 100644 index 00000000..fe415b12 --- /dev/null +++ b/socceraction/data/opta/parsers/ma5_json.py @@ -0,0 +1,87 @@ +"""JSON parser for Stats Perform MA5 feeds.""" +from datetime import datetime +from typing import Any, Dict, List + +from ...base import MissingDataError +from .base import OptaJSONParser, assertget + + +class MA5JSONParser(OptaJSONParser): + """Extract data from a Stats Perform MA5 data stream. + + Parameters + ---------- + path : str + Path of the data file. 
+ """ + + def _get_matches(self) -> List[Dict[str, Any]]: + if 'matchInfo' in self.root: + return [self.root] + if 'match' in self.root: + return self.root['match'] + raise MissingDataError + + def _get_match_info(self) -> Dict[str, Any]: + if "matchInfo" in self.root: + return self.root["matchInfo"] + raise MissingDataError + + def _get_live_data(self) -> Dict[str, Any]: + if "liveData" in self.root: + return self.root["liveData"] + raise MissingDataError + + """ + game_id: Series[Object] = pa.Field() + season_id: Series[Object] = pa.Field() + competition_id: Series[Object] = pa.Field() + game_day: Series[pd.Int64Dtype] = pa.Field(nullable=True) + game_date: Series[DateTime] = pa.Field() + home_team_id: Series[Object] = pa.Field() + away_team_id: Series[Object] = pa.Field() + """ + + def extract_games(self) -> Dict[str, Dict[str, Any]]: + """Return a dictionary with all available games. + + Returns + ------- + dict + A mapping between game IDs and the information available about + each game in the data stream. 
+ """ + match_info = self._get_match_info() + live_data = self._get_live_data() + game_id = assertget(match_info, "id") + season = assertget(match_info, "tournamentCalendar") + competition = assertget(match_info, "competition") + + contestants = assertget(match_info, "contestant") + home_team = contestants[0] + away_team = contestants[1] + home_team_id = assertget(home_team, "id") + away_team_id = assertget(away_team, "id") + + possession_data = assertget(live_data, "possession") + possession_wave_data = assertget(possession_data, "possessionWave") + overall_percentage = assertget(possession_wave_data[0], "overall") + away_possession = assertget(overall_percentage, "away") + home_possession = assertget(overall_percentage, "home") + + game_date = assertget(match_info, "date")[0:10] + game_time = assertget(match_info, "time")[0:8] + game_datetime = f"{game_date}T{game_time}" + return { + game_id: dict( + game_id=game_id, + season_id=assertget(season, "id"), + competition_id=assertget(competition, "id"), + game_day=int(match_info["week"]) if "week" in match_info else None, + game_date=datetime.strptime(game_datetime, "%Y-%m-%dT%H:%M:%S"), + home_team_id=home_team_id, + away_team_id=away_team_id, + home_possession=home_possession, + away_possession=away_possession, + ) + } diff --git a/socceraction/data/opta/schema.py b/socceraction/data/opta/schema.py index f1eeff8c..e14278aa 100644 --- a/socceraction/data/opta/schema.py +++ b/socceraction/data/opta/schema.py @@ -37,6 +37,10 @@ class OptaGameSchema(GameSchema): """The name of the manager of the home team.""" away_manager: Optional[Series[str]] = pa.Field(nullable=True) """The name of the manager of the away team.""" + home_possession: Optional[Series[int]] + """The possession percentage of the home team in the game.""" + away_possession: Optional[Series[int]] + """The possession percentage of the away team in the game.""" class OptaPlayerSchema(PlayerSchema): @@ -45,6 +49,15 @@ class OptaPlayerSchema(PlayerSchema): 
starting_position: Series[str] """The starting position of the player.""" + position_side: Optional[Series[str]] + """The side of the pitch where the player started the game.""" + + xG_non_penalty: Optional[Series[int]] + """The xG of the player without taking penalties into account.""" + + # minutes_played: Optional[Series[int]] + # """The number of minutes played by the player.""" + class OptaTeamSchema(TeamSchema): """Definition of a dataframe containing the list of teams of a game."""