Skip to content

Commit

Permalink
polars
Browse files Browse the repository at this point in the history
  • Loading branch information
cthorrez committed Oct 28, 2024
1 parent 74a5013 commit b3eb278
Show file tree
Hide file tree
Showing 39 changed files with 2,155 additions and 924 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ from riix.models.elo import Elo
from riix.utils import MatchupDataset, split_matchup_dataset, generate_matchup_data
from riix.metrics import binary_metrics_suite

df = generate_matchup_data() # replace with your pandas dataframe
df = generate_matchup_data() # replace with your **polars** dataframe
dataset = MatchupDataset(
df,
competitor_cols=['competitor_1', 'competitor_2'],
Expand Down
10 changes: 5 additions & 5 deletions docs/riix.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/core.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/core/base.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/core/test.html

Large diffs are not rendered by default.

236 changes: 119 additions & 117 deletions docs/riix/eval.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/metrics.html

Large diffs are not rendered by default.

9 changes: 5 additions & 4 deletions docs/riix/models.html

Large diffs are not rendered by default.

560 changes: 302 additions & 258 deletions docs/riix/models/autograd_rating_system.html

Large diffs are not rendered by default.

721 changes: 721 additions & 0 deletions docs/riix/models/baselines.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/constant_variance_glicko.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/elo.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/elo_davidson.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/elomentum.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/gen_elo.html

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions docs/riix/models/glicko.html

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions docs/riix/models/glicko2.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/iterative_markov.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/melo.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/online_disc_decomp.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/online_rao_kupper.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/skf.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/template.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/temporal_massey.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/trueskill.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/velo.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/weng_lin.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/models/weng_lin_thurstone_mosteller.html

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions docs/riix/models/yuksel_2024.html

Large diffs are not rendered by default.

9 changes: 5 additions & 4 deletions docs/riix/utils.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/utils/constants.html

Large diffs are not rendered by default.

783 changes: 428 additions & 355 deletions docs/riix/utils/data_utils.html

Large diffs are not rendered by default.

372 changes: 372 additions & 0 deletions docs/riix/utils/date_utils.html

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/riix/utils/math_utils.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/search.js

Large diffs are not rendered by default.

26 changes: 11 additions & 15 deletions examples/elo_example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -13,23 +13,26 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Schema({'competitor_1': String, 'competitor_2': String, 'outcome': Float64, 'date': Datetime(time_unit='ms', time_zone=None)})\n",
"loaded dataset with:\n",
"10000 matchups\n",
"100 unique competitors\n",
"10 rating periods of length 1D\n",
"split into train_dataset of length 8000 and test_dataset of length 2000\n",
"len(train_dataset)=8000, len(test_dataset)=2000\n"
]
}
],
"source": [
"df = generate_matchup_data() # replace with your pandas dataframe\n",
"df = generate_matchup_data() # replace with your **polars** dataframe\n",
"print(df.schema)\n",
"full_dataset = MatchupDataset(\n",
" df=df,\n",
" competitor_cols=['competitor_1', 'competitor_2'],\n",
Expand All @@ -43,14 +46,14 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'accuracy': 0.72975, 'log_loss': 0.5359083106524117, 'brier_score': 0.1793377446861956}\n"
"{'accuracy': np.float64(0.72975), 'accuracy_without_draws': np.float64(0.730095142714071), 'log_loss': np.float64(0.5359083106524117), 'brier_score': np.float64(0.1793377446861956)}\n"
]
}
],
Expand All @@ -59,12 +62,12 @@
"model.fit_dataset(train_dataset)\n",
"test_probs = model.fit_dataset(test_dataset, return_pre_match_probs=True)\n",
"test_metrics = binary_metrics_suite(probs=test_probs, outcomes=test_dataset.outcomes)\n",
"print(test_metrics)\n"
"print(test_metrics)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand All @@ -83,13 +86,6 @@
"source": [
"model.print_leaderboard(num_places=5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand All @@ -108,7 +104,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
"version": "3.12.7"
}
},
"nbformat": 4,
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "riix"
version = "0.0.3"
version = "0.0.4"
description = "vectorized implementations of online rating systems"
readme = "README.md"
license = {file = "LICENSE"}
Expand All @@ -14,7 +14,7 @@ authors = [
dependencies = [
"numpy",
"scipy",
"pandas",
"polars",
"jax",
]
keywords = ["rating system", "paired comparison"]
Expand Down
98 changes: 41 additions & 57 deletions riix/utils/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,66 +4,67 @@
import time
from typing import List
import numpy as np
import pandas as pd

import polars as pl
from riix.utils.date_utils import get_duration

class MatchupDataset:
"""class for loading and iterating over paired comparison data"""

def __init__(
self,
df: pd.DataFrame,
df: pl.DataFrame,
competitor_cols: List[str],
outcome_col: str,
datetime_col: str = None,
timestamp_col: str = None,
time_step_col: str = None,
rating_period: str = '1W',
verbose: bool = True,
):
if len(competitor_cols) != 2:
raise ValueError('must specify exactly 2 competitor columns')
if (bool(datetime_col) + bool(timestamp_col) + bool(time_step_col)) != 1:
if (bool(datetime_col) + bool(time_step_col)) != 1:
raise ValueError('must specify only one of time_step_col, datetime_col, timestamp_col')

if time_step_col:
self.time_steps = df[time_step_col].astype(np.int64)
self.time_steps = df[time_step_col]
else:
if datetime_col:
if df[datetime_col].dtype == 'datetime64[ms]':
epoch_times = df[datetime_col].values.astype(np.int64) // 10**3
else:
epoch_times = pd.to_datetime(df[datetime_col]).values.astype(np.int64) // 10**9
if timestamp_col:
epoch_times = df[timestamp_col].values.astype(np.int64)

first_time = epoch_times[0]
epoch_times = epoch_times - first_time
period_delta = int(pd.Timedelta(rating_period).total_seconds())
self.time_steps = epoch_times // period_delta
self.time_steps = self.time_steps.astype(np.int32)
if df[datetime_col].dtype == pl.Datetime("ms"):
datetime = df[datetime_col]
elif df.schema[datetime_col] == pl.Date:
datetime = df[datetime_col].cast(pl.Datetime)
elif df.schema[datetime_col] == pl.Utf8:
datetime = df[datetime_col].str.strptime(pl.Datetime, '%Y-%m-%d')
else:
raise ValueError('datetime_col must be one of Date, Datetime, or Utf8')
seconds_since_epoch = (datetime.dt.timestamp() // 1_000_000).to_numpy()
rating_period_duration_in_seconds = get_duration(rating_period)
first_time = seconds_since_epoch[0]
seconds_since_first_time = seconds_since_epoch - first_time
self.time_steps = seconds_since_first_time // rating_period_duration_in_seconds

self.time_steps = self.time_steps.astype(np.int32)
self.process_time_steps()

# map competitor names/ids to integers
self.num_matchups = len(df)

str_competitors = pd.concat([
df[competitor_cols[0]].astype(str),
df[competitor_cols[1]].astype(str)
str_competitors = pl.concat([
df[competitor_cols[0]].cast(pl.Utf8).alias('competitor'),
df[competitor_cols[1]].cast(pl.Utf8).alias('competitor')
])
comp_idxs, competitors = pd.factorize(str_competitors, sort=True)
self.competitors = competitors.tolist()
self.competitors = str_competitors.unique().sort().to_list()
self.num_competitors = len(self.competitors)
self.competitor_to_idx = {comp: idx for idx, comp in enumerate(self.competitors)}
self.matchups = np.column_stack([comp_idxs[:self.num_matchups], comp_idxs[self.num_matchups:]]).astype(np.int32)
self.outcomes = df[outcome_col].values.astype(np.float64)
comp_idxs_1 = df[competitor_cols[0]].cast(pl.Utf8).map_elements(lambda x: self.competitor_to_idx[x], return_dtype=pl.Int32)
comp_idxs_2 = df[competitor_cols[1]].cast(pl.Utf8).map_elements(lambda x: self.competitor_to_idx[x], return_dtype=pl.Int32)
self.matchups = np.hstack([
comp_idxs_1.to_numpy()[:,None],
comp_idxs_2.to_numpy()[:,None],
])
self.outcomes = df[outcome_col].to_numpy()

if verbose:
print('loaded dataset with:')
print(f'{self.matchups.shape[0]} matchups')
print(f'{len(self.competitors)} unique competitors')
print(f'{self.unique_time_steps.max()} rating periods of length {rating_period}')
print(f'{self.unique_time_steps.max() + 1} rating periods of length {rating_period}')

def process_time_steps(self):
self.unique_time_steps, unique_time_step_indices = np.unique(self.time_steps, return_index=True)
Expand Down Expand Up @@ -127,27 +128,6 @@ def split_matchup_dataset(dataset, test_fraction=0.2):
print(f'split into train_dataset of length {len(train_dataset)} and test_dataset of length {len(test_dataset)}')
return train_dataset, test_dataset

class BasicMatchupDataset:
    """Minimal paired comparison dataset with no time information.

    Competitor names from both competitor columns are mapped to integer
    indices (assigned in sorted order), and the matchups are stored as an
    integer array with one row per matchup.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        competitor_cols: List[str],
        outcome_col: str
    ):
        """
        Parameters:
        -----------
        df : pd.DataFrame
            one row per matchup
        competitor_cols : List[str]
            names of the two columns holding the competitors of each matchup
        outcome_col : str
            name of the column holding the outcome of each matchup
        """
        self.num_matchups = len(df)
        # factorize both columns together so a competitor gets the same index
        # regardless of which column it appears in
        stacked_competitors = pd.concat([df[competitor_cols[0]], df[competitor_cols[1]]])
        comp_idxs, competitors = pd.factorize(stacked_competitors, sort=True)
        self.competitors = competitors.to_list()
        self.num_competitors = len(self.competitors)
        self.competitor_to_idx = {comp: idx for idx, comp in enumerate(self.competitors)}
        # first half of comp_idxs belongs to column 0, second half to column 1
        self.matchups = np.column_stack([comp_idxs[:self.num_matchups], comp_idxs[self.num_matchups:]])
        self.outcomes = df[outcome_col].values.astype(np.float64)

    def __iter__(self):
        """Yield one (matchup, outcome) pair per matchup, in row order."""
        # iterate over the matchups themselves; indexing by the number of
        # competitors would truncate the data or run past its end
        yield from zip(self.matchups, self.outcomes)

def generate_matchup_data(
num_matchups: int = 10000,
num_competitors: int = 100,
Expand Down Expand Up @@ -184,14 +164,18 @@ def generate_matchup_data(
outcomes = rng.multinomial(n=1, pvals=probs)
outcomes = np.argmax(outcomes, axis=1) / 2.0 # map 0->0, 1->0.5, 2->1.0

data = {
df = pl.DataFrame({
'timestamp': timestamps,
'competitor_1': matchups[:, 0],
'competitor_2': matchups[:, 1],
'outcome': outcomes,
}
df = pd.DataFrame(data)
df['date'] = pd.to_datetime(df['timestamp'], unit='s')
df['competitor_1'] = 'competitor_' + df['competitor_1'].astype(str)
df['competitor_2'] = 'competitor_' + df['competitor_2'].astype(str)
})
df = (
df.with_columns([
(pl.col('timestamp') * 1000).cast(pl.Datetime('ms')).alias('date'),
pl.concat_str([pl.lit('competitor_'), pl.col('competitor_1').cast(pl.Utf8)]).alias('competitor_1'),
pl.concat_str([pl.lit('competitor_'), pl.col('competitor_2').cast(pl.Utf8)]).alias('competitor_2')
])
.drop('timestamp')
)
return df
37 changes: 37 additions & 0 deletions riix/utils/date_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import re

# a duration is a non-negative integer immediately followed by a single unit letter
PROG = re.compile(r'(\d+)([WwDdHhMmSs])')
SECONDS_PER_UNIT = {
    'W': 7 * 24 * 60 * 60,  # weeks to seconds
    'D': 24 * 60 * 60,  # days to seconds
    'H': 60 * 60,  # hours to seconds
    'M': 60,  # minutes to seconds
    'S': 1,  # seconds
}


def get_duration(duration_str):
    """
    Parse duration strings like '7D', '1W', '24H' etc. into the length of the duration in seconds as an int
    Parameters:
    -----------
    duration_str : str
        String in format numberLetter where Letter is one of:
        W/w - weeks
        D/d - days
        H/h - hours
        M/m - minutes
        S/s - seconds
        Surrounding whitespace is ignored.
    Returns:
    --------
    duration : int
        length of the duration in seconds
    Raises:
    -------
    TypeError
        if duration_str is not a str
    ValueError
        if duration_str is not exactly a number followed by one unit letter
    """
    if not isinstance(duration_str, str):
        raise TypeError(f"duration must be a str, got {type(duration_str).__name__}")
    # fullmatch so trailing junk such as '1Week' or '7Dxyz' is rejected
    # instead of being silently read as '1W' / '7D'
    match = PROG.fullmatch(duration_str.strip())
    if not match:
        raise ValueError(f"Invalid duration format: {duration_str}")
    number = int(match.group(1))
    unit = match.group(2).upper()
    return number * SECONDS_PER_UNIT[unit]

0 comments on commit b3eb278

Please sign in to comment.