-
Notifications
You must be signed in to change notification settings - Fork 0
/
R7_functions.py
91 lines (78 loc) · 3.7 KB
/
R7_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# -*- coding: utf-8 -*-
from typing import Union
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import (
explained_variance_score,
mean_squared_error,
mean_absolute_error,
max_error,
)
from IPython.display import display
def plot_residuals(data_dict, dataset_key="validation"):
    """
    Plot the residuals distribution (histogram and boxplot) for a given dataset key.

    The 'residuals' column is divided by 1000 and every axis/legend label is
    expressed in k€ (the column is therefore assumed to be in € - confirm
    against the code that fills it). Missing values are ignored.

    The upper panel shows a histogram with a KDE curve plus the mean and the
    mean +/- 1 standard deviation. The lower panel shows a boxplot overlaid
    with the individual points and the 1.5 * IQR fences; the regions beyond a
    fence are shaded only when at least one residual actually falls there.

    Parameters:
    - data_dict (dict): Dictionary containing datasets (e.g., "train", "validation").
      Each value must expose a 'residuals' column (pandas DataFrame).
    - dataset_key (str): Key for the dataset to plot (e.g., "train" or "validation").

    Returns:
    - None. The figure is displayed through ``plt.show()``.

    Raises:
    - ValueError: if ``dataset_key`` is not in ``data_dict``, if the dataset has
      no 'residuals' column, or if it holds no non-missing residual.
    """
    # --- Input validation -------------------------------------------------
    if dataset_key not in data_dict:
        raise ValueError(f"Dataset '{dataset_key}' not found in the dictionary.")

    df = data_dict[dataset_key]
    if "residuals" not in df.columns:
        raise ValueError(f"Dataset '{dataset_key}' does not contain 'residuals' column.")

    # Convert residuals to k€ and drop missing values so that the statistics
    # and the plots are all based on exactly the same observations.
    residuals = (df["residuals"] / 1000).dropna()
    if residuals.empty:
        # Without data every statistic is NaN and the figure is meaningless.
        raise ValueError(f"Dataset '{dataset_key}' has no residual values to plot.")

    # --- Summary statistics -----------------------------------------------
    # Keep full precision here; values are rounded only when formatted into
    # labels, so rounding errors never leak into the derived bounds.
    summary = residuals.describe()
    mean = summary["mean"]
    std_dev = summary["std"]
    lower_ci = mean - std_dev  # Mean - 1 Std
    upper_ci = mean + std_dev  # Mean + 1 Std

    # Tukey fences: points beyond 1.5 * IQR from the quartiles are outliers.
    q1 = summary["25%"]
    q3 = summary["75%"]
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # --- Figure -----------------------------------------------------------
    _, (hist_ax, box_ax) = plt.subplots(
        2, 1, figsize=(10, 8), gridspec_kw={"height_ratios": [3, 1]}
    )

    # 1. Histogram of residuals
    sns.histplot(residuals, kde=True, color="skyblue", ax=hist_ax)
    mean_line = hist_ax.axvline(mean, color="red", linestyle="--")
    sd_line = hist_ax.axvline(upper_ci, color="orange", linestyle="--")
    hist_ax.axvline(lower_ci, color="orange", linestyle="--")
    # Invisible artist used only to carry the "Std Dev" text in the legend.
    (std_proxy,) = hist_ax.plot([], [], linestyle="none")
    hist_ax.set_title(f"Residuals Distribution (Histogram) - {dataset_key.capitalize()} Data")
    hist_ax.set_xlabel("Residuals (k€)")
    hist_ax.set_ylabel("Count")
    # Handles are passed explicitly: giving labels alone would pair them with
    # the first artists of the axes (the histogram bars), not with the lines.
    hist_ax.legend(
        handles=[mean_line, std_proxy, sd_line],
        labels=[
            f"Mean: {mean:.1f} k€",
            f"Std Dev: {std_dev:.1f} k€",
            f"Mean ± 1 SD: [{lower_ci:.1f}, {upper_ci:.1f}] k€",
        ],
        title="Statistics",
        loc="upper right",
    )

    # 2. Boxplot of residuals (fliers hidden: the strip plot shows every point)
    sns.boxplot(
        x=residuals,
        ax=box_ax,
        color="white",
        boxprops=dict(alpha=0.7),
        showfliers=False,
    )
    sns.stripplot(x=residuals, ax=box_ax, color="skyblue", alpha=0.6, jitter=True)

    # Highlight the outlier regions, but only where outliers really exist;
    # otherwise the shaded band would cover an empty, non-outlier range.
    smallest = residuals.min()
    largest = residuals.max()
    if smallest < lower_bound:
        box_ax.axvspan(
            smallest,
            lower_bound,
            color="orange",
            alpha=0.2,
            label=f"Outliers (Low < {lower_bound:.1f} k€)",
        )
    if largest > upper_bound:
        box_ax.axvspan(
            upper_bound,
            largest,
            color="orange",
            alpha=0.2,
            label=f"Outliers (High > {upper_bound:.1f} k€)",
        )

    # Add lines for the IQR bounds
    box_ax.axvline(
        lower_bound, color="orange", linestyle="--", label=f"Lower Bound: {lower_bound:.1f} k€"
    )
    box_ax.axvline(
        upper_bound, color="orange", linestyle="--", label=f"Upper Bound: {upper_bound:.1f} k€"
    )

    # Configure boxplot aesthetics
    box_ax.set_title(f"Residuals Distribution (Boxplot) - {dataset_key.capitalize()} Data")
    box_ax.set_xlabel("Residuals (k€)")
    box_ax.legend()

    # Tight layout to avoid overlap
    plt.tight_layout()
    plt.show()