-
Notifications
You must be signed in to change notification settings - Fork 1
/
logical_equivalence_synthetic_dataset.py
154 lines (138 loc) · 8.22 KB
/
logical_equivalence_synthetic_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import os
import random
import string

import pandas as pd
# Build synthetic "<subject> <verb> [not] <adjective>." sentences, combine them
# into "If ..., then ...", "... and ..." and "... or ..." sentences, and write
# the resulting tables as CSV files under ./output_result/.
#
# Rows are collected in plain lists and each DataFrame is built once:
# DataFrame.append no longer exists in pandas >= 2.0, and appending row by row
# copied the whole frame on every call.

punctuation_string = string.punctuation
# Translation table that deletes every punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation_string)
_COLUMNS = ['Sentences', 'Logic-words']

# Empty seed lists; nothing fills them. Kept so the module still exposes
# these names.
data = []
data2 = []

flag = "text2text" ### Generate sentences with more entities to train a T5/GPT-2.
## We have two sets with same number of samples.
## One is only constructed by contraposition law.
## The other is constructed by the four logical equivalence laws.


def _strip_punctuation(sentence):
    """Return *sentence* with every ``string.punctuation`` character removed."""
    return sentence.translate(_PUNCTUATION_TABLE)


def _row(sentence, logic_words):
    """Return one output record in the two-column layout used by every CSV."""
    return {'Sentences': sentence, 'Logic-words': logic_words}


def _frame(rows):
    """Build a DataFrame with the standard two columns from a list of records."""
    return pd.DataFrame(rows, columns=_COLUMNS)


def _pick_other_index(index, count):
    """Return a random integer in ``[0, count - 1]`` that differs from *index*.

    Raises:
        ValueError: if ``count < 2``, because no other index exists (the
            retry loop below would otherwise never terminate).
    """
    if count < 2:
        raise ValueError("need at least two sentences to pick a different one")
    other = random.randint(0, count - 1)
    while other == index:
        other = random.randint(0, count - 1)
    return other


def _simple_sentence_rows(subjects, verbs, adjectives):
    """Return ``(plain_rows, labelled_rows)`` for every subject/verb/adjective.

    ``plain_rows`` holds each affirmative sentence labelled ``'None'``.
    ``labelled_rows`` holds the affirmative sentence labelled ``'positive'``
    immediately followed by its "not" variant labelled ``'negative'``.
    """
    plain_rows = []
    labelled_rows = []
    for subject in subjects:
        for verb in verbs:
            for adjective in adjectives:
                affirmative = subject + " " + verb + " " + adjective + "."
                negated = subject + " " + verb + " not " + adjective + "."
                plain_rows.append(_row(affirmative, 'None'))
                labelled_rows.append(_row(affirmative, 'positive'))
                labelled_rows.append(_row(negated, 'negative'))
    return plain_rows, labelled_rows


def _if_then_rows(sentences):
    """Return one "If <s1>, then <s2>" record per sentence in *sentences*.

    ``s1`` is the sentence with punctuation removed; ``s2`` is a randomly
    chosen sentence at a different position, kept with its final full stop.
    """
    rows = []
    total = len(sentences)
    for index, sentence in enumerate(sentences):
        antecedent = _strip_punctuation(sentence)
        consequent = sentences[_pick_other_index(index, total)]
        rows.append(_row("If " + antecedent + ", then " + consequent, 'if,then'))
    return rows


def _and_or_rows(sentences):
    """Return an "and" record followed by an "or" record for each sentence.

    Both records pair the punctuation-stripped sentence with the same randomly
    chosen sentence at a different position.
    """
    rows = []
    total = len(sentences)
    for index, sentence in enumerate(sentences):
        first = _strip_punctuation(sentence)
        second = sentences[_pick_other_index(index, total)]
        rows.append(_row(first + " and " + second, 'and'))
        if "not" in first and "not" not in second:
            # The un-negated sentence goes first.
            # NOTE(review): `second` still carries its final ".", so this
            # branch yields "<second>. or <first>" with the full stop in the
            # middle and none at the end. Kept as-is so the generated data is
            # unchanged; confirm whether downstream code relies on it.
            disjunction = second + " or " + first
        else:
            disjunction = first + " or " + second
        rows.append(_row(disjunction, 'or'))
    return rows


def _write_csv(frame, path):
    """Write *frame* to *path* without the index, creating the directory first."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    frame.to_csv(path, index=False, encoding='utf8')


if flag == "text2text":
    subject_list = ['the bald eagle', 'the tiger', 'the bear', 'the lion', 'the wolf', 'the crocodile', 'the dinosaur', 'the snake', 'the leopard',
                    'the cat', 'the dog', 'the mouse', 'the rabbit', 'the squirrel', 'the duck', 'the goat', 'the goose', 'the donkey', 'the cow',
                    'Anne', 'Alan', 'Bob', 'Charlie', 'Dave', 'Erin', 'Harry', 'Gary', 'Fiona', 'James',
                    'Robert', 'John', 'Michael', 'David', 'William', 'Richard', 'Anthony', 'Paul', 'Andrew']
    verb_list = ['is']
    # NOTE(review): 'dull', 'rough' and 'awful' are listed twice, so their
    # sentences are generated twice.
    adjective_list = ['kind', 'quiet', 'round', 'nice', 'smart', 'clever', 'cautious', 'careful', 'brainy',
                      'dull', 'rough', 'lazy', 'slow', 'sleepy', 'boring', 'bored','tired', 'reckless',
                      'furry', 'small', 'cute', 'lovely', 'beautiful', 'funny', 'adorable',
                      'big', 'strong', 'awful', 'fierce', 'heavy', 'horrible', 'powerful', 'angry',
                      'tall', 'huge', 'high', 'aggressive', 'anxious', 'dizzy', 'depressed', 'disturbed',
                      'short', 'thin', 'little', 'tiny',
                      'wealthy', 'poor', 'dull', 'rough', 'bad', 'sad', 'awful']
    # Only this configuration also writes the if-then sentences on their own.
    _if_then_csv = "./output_result/synthetic_sentences_for_text2text_contraposition_law.csv"
    _combined_csv = "./output_result/synthetic_sentences_for_text2text.csv"
    _plain_csv = "./output_result/synthetic_single_no_logic_words_sentences_for_text2text.csv"
else:
    subject_list = ['the bald eagle', 'the tiger', 'the bear', 'the lion', 'the wolf', 'the crocodile', 'the dinosaur', 'the snake', 'the leopard',
                    'the cat', 'the dog', 'the mouse', 'the rabbit', 'the squirrel',
                    'Anne', 'Alan', 'Bob', 'Charlie', 'Dave', 'Erin', 'Harry', 'Gary', 'Fiona']
    verb_list = ['is']
    # NOTE(review): 'dull' and 'rough' are listed twice, so their sentences
    # are generated twice.
    adjective_list = ['kind', 'quiet', 'round', 'nice', 'smart', 'clever',
                      'dull', 'rough', 'lazy', 'slow', 'sleepy', 'boring', 'tired', 'reckless',
                      'furry', 'small', 'cute', 'lovely', 'beautiful', 'funny',
                      'big', 'strong', 'awful', 'fierce', 'heavy', 'horrible', 'powerful', 'angry',
                      'high', 'huge',
                      'short', 'thin', 'little', 'tiny',
                      'wealthy', 'poor', 'dull', 'rough', 'bad', 'sad']
    _if_then_csv = None
    _combined_csv = "./output_result/synthetic_sentences.csv"
    _plain_csv = "./output_result/synthetic_single_no_logic_words_sentences.csv"

_plain_rows, _labelled_rows = _simple_sentence_rows(subject_list, verb_list, adjective_list)
_base_sentences = [record['Sentences'] for record in _labelled_rows]

# Affirmative sentences only, all labelled 'None'.
df_output_2 = _frame(_plain_rows)
# If-then rows are built before and/or rows, so random draws happen in the
# same order as in the original loops.
df_output_if_then = _frame(_if_then_rows(_base_sentences))
df_output_and_or = _frame(_and_or_rows(_base_sentences))
# Simple sentences, then if-then, then and/or. As with the former chained
# append calls, each part keeps its own 0-based index; the CSV omits it.
df_output = pd.concat([_frame(_labelled_rows), df_output_if_then, df_output_and_or])

if _if_then_csv is not None:
    _write_csv(df_output_if_then, _if_then_csv)
_write_csv(df_output, _combined_csv)
_write_csv(df_output_2, _plain_csv)