-
Notifications
You must be signed in to change notification settings - Fork 0
/
add-sample-datasets.py
131 lines (112 loc) · 3.98 KB
/
add-sample-datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from pathlib import Path
from tkinter import (
Button,
Frame,
Label,
StringVar,
Tk,
filedialog,
messagebox,
simpledialog,
)
import numpy as np
import pandas as pd
# Sample size used when the sample-size dialog yields no usable value.
DEFAULT_SAMPLE_SIZE = 1000
# Columns loaded from each reviews file; dropped again once the derived
# "text" and "sentiment" columns have been built.
FEATURES = ["star_rating", "review_body", "review_headline"]
# Passed as random_state to every DataFrame.sample() call.
SEED = 7654 # for reproducibility
class SamplingApp(Frame):
    """Tkinter front end for sampling Amazon Customer Reviews datasets.

    The frame shows a short introduction and a "Select Files" button.
    Clicking the button opens a file picker, asks for a sample size and
    hands both to ``process_files``.
    """

    def __init__(self, master=None):
        """Configure the parent window and build the frame's widgets.

        Args:
            master: Parent window. When omitted, ``Frame.__init__`` falls
                back to tkinter's default root window.
        """
        # Frame.__init__ stores the resolved parent in ``self.master``.
        # Re-assigning the raw argument afterwards would replace that with
        # ``None`` whenever the default is used and crash on ``title()``.
        super().__init__(master)
        self.master.title("Dataset Sampling App")
        self.master.geometry("400x240")
        self.master.resizable(False, False)  # Fix window size
        # Paths chosen in the file dialog; empty until the user picks some.
        self.files = ()
        self.current_action = StringVar(master=self.master)
        self._create_content()
        self.pack()

    def _create_content(self):
        """Prepare and display a brief introduction."""
        self.intro_title = Label(
            self,
            text="Amazon Reviews Data Sampler",
            font=("Courier", 21, "bold"),
            justify="center",
            wraplength=320,
        )
        self.intro_title.pack(pady=(30, 10))

        self.intro_body = Label(
            self,
            text=(
                "Extract sentiment analysis samples from the Amazon Customer"
                " Reviews datasets."
            ),
            font=("Times", 14, "italic"),
            wraplength=360,
        )
        self.intro_body.pack()

        # NOTE(review): for a text button Tk measures width/height in text
        # units (characters/lines), not pixels -- confirm 90x25 is intended.
        self.button = Button(
            self,
            text="Select Files",
            bg="#cf5",
            font=("Courier", 12),
            width=90,
            height=25,
            command=self._fetch_files,
        )
        self.button.pack(padx=115, pady=(10, 50))

    def _fetch_files(self):
        """Open a filedialog to select data files to process."""
        self.current_action.set("Fetching files...")
        self.files = filedialog.askopenfilenames(
            initialdir=Path.home() / "Downloads",
            filetypes=[("compressed tsv", "*.tsv.gz")],
        )
        self._get_samples()

    def _get_samples(self):
        """Extract and save samples of the selected files.

        Closes the application when no files were selected. Otherwise asks
        for a sample size (falling back to ``DEFAULT_SAMPLE_SIZE`` when the
        dialog is cancelled), runs ``process_files`` and reports the outcome
        in a message box.
        """
        self.current_action.set("Extracting samples...")
        if not self.files:
            # Destroy the top-level window, not just this frame; destroying
            # only the frame leaves an empty window with the loop running.
            self.master.destroy()
            return

        # minvalue keeps zero/negative sizes from reaching DataFrame.sample.
        sample_size = simpledialog.askinteger(
            title="Sample size",
            prompt="Enter sample size: ",
            minvalue=1,
        )
        try:
            process_files(
                files=self.files,
                sample_size=sample_size or DEFAULT_SAMPLE_SIZE,
            )
        except (OSError, ValueError) as error:
            # Unreadable files, missing columns and over-sized samples all
            # surface as OSError/ValueError; show them instead of leaving
            # only a traceback on stderr.
            messagebox.showerror(
                message=f"Could not create samples: {error}"
            )
            return
        messagebox.showinfo(
            message="Done! Results saved to current folder."
        )
def _sample_name(filename: str) -> str:
    """Derive the output-name prefix from a dataset file name."""
    suffix = ".tsv.gz"
    # str.rstrip(".tsv.gz") strips any trailing run of the characters
    # ".tsvgz" rather than the suffix, so "..._Toys.tsv.gz" lost its final
    # "s". Remove the exact suffix instead.
    stem = filename[: -len(suffix)] if filename.endswith(suffix) else filename
    parts = stem.split("_")
    # Presumably names look like "amazon_reviews_<market>_<Category>_v1_00";
    # drop the first three parts. Fall back to the whole stem for shorter
    # names so the prefix is never empty (which would make every such file
    # write to the same output path).
    return "-".join(parts[3:] or parts).lower()


def process_files(*, files: tuple, sample_size: int) -> None:
    """Prepare sentiment analysis samples of the specified size from the
    supplied files.

    The files should be one of the Amazon Customer Reviews datasets available
    at https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt.

    Ratings less than 3 are considered negative, while ratings of 3 to 5 are
    assumed to be positive.

    For every input file, one ``<name>-reviews-sample.csv.xz`` file with the
    columns ``text`` and ``sentiment`` is written to the current working
    directory.

    Args:
        files (tuple): Paths to files.
        sample_size (int): Desired sample size.

    Raises:
        ValueError: Propagated from pandas, e.g. when either sentiment class
            has fewer than ``sample_size // 2 + 1`` rows.
    """
    # Draw slightly more than half from each class so the combined pool is
    # always at least ``sample_size`` rows, for odd and even sizes alike.
    per_class = sample_size // 2 + 1

    for file in files:
        filepath = Path(file)
        data = pd.read_csv(filepath, sep="\t", usecols=FEATURES).dropna()
        data["text"] = data["review_headline"] + " " + data["review_body"]

        # Set [1, 2] as 0 (-ve) and [3, 4, 5] as 1 (+ve)
        data["sentiment"] = np.where(data["star_rating"] < 3, 0, 1)
        data.drop(columns=FEATURES, inplace=True)

        positive_sample = data.query("sentiment == 1").sample(
            per_class, random_state=SEED
        )
        negative_sample = data.query("sentiment == 0").sample(
            per_class, random_state=SEED
        )

        name = _sample_name(filepath.name)
        # The final draw trims the pool to the requested size and shuffles
        # positive and negative rows together.
        pd.concat([positive_sample, negative_sample]).sample(
            sample_size, random_state=SEED
        ).to_csv(f"{name}-reviews-sample.csv.xz", index=False)
if __name__ == "__main__":
    # Script entry point: create the root window, attach the sampling
    # frame to it and hand control to the Tk event loop.
    root = Tk()
    SamplingApp(master=root).mainloop()