From fc224b9adc27aebc45bd7d37c99eeaa0897dbbca Mon Sep 17 00:00:00 2001 From: threnjen Date: Tue, 17 Dec 2024 10:50:36 -0800 Subject: [PATCH] hotfix to utilize all samples for small data sets --- modules/rag_description_generation/rag_functions.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/rag_description_generation/rag_functions.py b/modules/rag_description_generation/rag_functions.py index 81e4b9f..c701c13 100644 --- a/modules/rag_description_generation/rag_functions.py +++ b/modules/rag_description_generation/rag_functions.py @@ -31,6 +31,7 @@ def get_single_game_entries( # get the ratings sample distribution by taking 10% of the total ratings df["rounded_rating"] = df["rating"].round(0).astype(int) sample_size = int(len(df) * sample_pct) # Desired total sample size + group_sizes = round( df["rounded_rating"].value_counts(normalize=True) * sample_size, 0 ).astype(int) @@ -50,7 +51,11 @@ def get_single_game_entries( f"Total quality reviews: {len(df)}. {removed_reviews} reviews removed due to quality threshold" ) - if len(df) < sample_size: + sample_size = sample_size if sample_size >= 250 else len(df) + + if sample_size == len(df): + print("Using all quality reviews") + elif len(df) < sample_size: print("Not enough quality reviews to sample from; using all reviews") else: print(f"Stratified sampling to {sample_size} reviews")