-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnegative_sample_extention.py
31 lines (25 loc) · 1.16 KB
/
negative_sample_extention.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
"""Extend a labelled sentence-pair CSV with randomly sampled negative pairs.

The script reads ``./output_result/<filename>.csv`` (columns ``sentence1``,
``sentence2`` and ``label``), copies every row, and for each row whose label
is 1 adds ``negative_sample_k`` extra rows that pair the same ``sentence1``
with a ``sentence2`` drawn at random from the rows labelled 0. The combined
rows are shuffled, truncated to the original row count and written to
``./output_result/<filename>_negative_samples_1_<k+1>.csv``.
"""
import random

import pandas as pd
from sklearn.utils import shuffle

filename = "Synthetic_xfm_t5wtense_logical_equivalence_train_v4"
dataframe = pd.read_csv(f"./output_result/{filename}.csv")

# Rows labelled 0 are the pool from which replacement `sentence2` values are
# drawn; the index is reset so the pool is addressed positionally.
negative_dataframe = dataframe.loc[dataframe['label'] == 0].reset_index(drop=True)
negative_sentences = negative_dataframe["sentence2"].tolist()

# The output is cut back to the number of rows in the input file.
cut_line = dataframe.shape[0]
# Number of sampled negative pairs added per label-1 row; the output file
# name carries `negative_sample_k + 1` as its suffix.
negative_sample_k = 2

# Fail early with a clear message instead of an opaque error from the sampler.
if (negative_sample_k > 0 and not negative_sentences
        and (dataframe['label'] == 1).any()):
    raise ValueError(
        "cannot sample negative sentences: the input has no rows with label 0")

# Collect rows in a plain list and build the frame once. `DataFrame.append`
# was removed in pandas 2.0, and calling it per row copied the whole frame
# on every iteration.
rows = []
for sentence1, sentence2, label in zip(
        dataframe["sentence1"], dataframe["sentence2"], dataframe["label"]):
    rows.append(
        {'sentence1': sentence1, 'sentence2': sentence2, 'label': label})
    if label == 1:
        for _ in range(negative_sample_k):
            rows.append(
                {'sentence1': sentence1,
                 'sentence2': random.choice(negative_sentences),
                 'label': 0})

df = pd.DataFrame(rows, columns=['sentence1', 'sentence2', 'label'])

# Shuffle before truncating so the kept rows are a random subset of the
# original and sampled rows together.
df = shuffle(df)
df_final = df.head(cut_line)
df_final.to_csv(
    f"./output_result/{filename}_negative_samples_1_{negative_sample_k + 1}.csv",
    index=False, encoding='utf8')