-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
71 lines (56 loc) · 2.14 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from pathlib import Path
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
DATA_DIR = Path("datasets")
class Dataset:
"""Load, partition and vectorize a file from the sample `datasets`
directory.
There are currently 2 files available:
- software-reviews-sample.csv.xz
- video-reviews-sample.csv.xz
To add more, please follow the instructions in the README file of the
`datasets` directory.
Once loaded, the file's contents are split into a training and a
validation set, then vectorized using the scikit-learn TfidfVectorizer.
Args:
file_name (str): A file having text data and sentiment labels.
test_size (float, optional): The proportion of the data to use for
model validation. Defaults to 0.2.
"""
def __init__(self, file_name: str, test_size: float = 0.2) -> None:
self.file_name = file_name
self.source = file_name.removesuffix("-sample.csv.xz")
self.TEST_SIZE = test_size
self._split()
self._vectorize_text()
def _load_file(self) -> pd.DataFrame:
"""Get data from a text file as a pandas dataframe.
Returns:
pandas.DataFrame: The file's contents.
"""
return pd.read_csv(
DATA_DIR / self.file_name,
dtype={0: "string", 1: "int8"},
)
def _split(self) -> None:
"""Partition the data into training and validation sets."""
data = self._load_file()
(
self.X_train,
self.X_test,
self.y_train,
self.y_test,
) = train_test_split(
data["text"],
data["sentiment"],
test_size=self.TEST_SIZE,
stratify=data["sentiment"],
)
def _vectorize_text(self) -> None:
"""Vectorize features using the TfidfVectorizer."""
self.vectorizer = TfidfVectorizer(
ngram_range=(1, 2), max_df=0.8, stop_words="english"
)
self.X_train = self.vectorizer.fit_transform(self.X_train)
self.X_test = self.vectorizer.transform(self.X_test)