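"""Add a Weisfeiler-Lehman / Pyramid Match graph-kernel similarity feature.

For every (author1, author2) pair in the train and test CSV files, the script
builds a graph-of-words from each author's CORD-19 papers (fetched from a local
Neo4j database), fits a Weisfeiler-Lehman kernel with a Pyramid Match base
kernel on author1's graph, evaluates it on author2's graph, writes the value to
the requested column, and finally appends a min-max normalized copy of that
column (scaled over the combined train and test values).
"""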
import json
import argparse

from grakel import Graph
from grakel.kernels import PyramidMatch, WeisfeilerLehman
from sklearn.preprocessing import MinMaxScaler

import utils
from CORD19_GraphOfDocs.neo4j_wrapper import Neo4jDatabase
from CORD19_GraphOfDocs.utils import read_file, generate_words


def create_text(file_text, fieldname='abstract'):
    """Concatenate the text of every entry under `fieldname` in a CORD-19 JSON file."""
    obj = json.loads(file_text)
    text = ''.join(item['text'] for item in obj[fieldname])
    return text
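
# Assumed input shape, inferred from the parsing above (hypothetical example,
# not taken from the dataset): a CORD-19 JSON file looks like
# {"abstract": [{"text": "First paragraph."}, {"text": "Second paragraph."}], ...},
# so create_text(file_text) would return "First paragraph.Second paragraph.".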


def create_author_graph_of_words(docs, window_size=4):
    """Build a single graph-of-words from all of an author's documents.

    Words that co-occur within a sliding window of `window_size` terms are
    connected by an undirected edge whose weight counts the co-occurrences.
    """
    edges = {}
    unique_words = set()
    for doc in docs:
        for i in range(len(doc)):
            unique_words.add(doc[i])
            # Connect the current word with the next window_size - 1 words.
            for j in range(i + 1, i + window_size):
                if j < len(doc):
                    unique_words.add(doc[j])
                    edge_tuple1 = (doc[i], doc[j])
                    edge_tuple2 = (doc[j], doc[i])
                    # Treat the edge as undirected: increment whichever
                    # orientation has already been recorded.
                    if edge_tuple1 in edges:
                        edges[edge_tuple1] += 1
                    elif edge_tuple2 in edges:
                        edges[edge_tuple2] += 1
                    else:
                        edges[edge_tuple1] = 1
    node_labels = {word: word for word in unique_words}
    g = Graph(edges, node_labels=node_labels)
    return g
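
# Example (a quick sanity check, not part of the original pipeline): with
# window_size=4 each word is connected to the next three words, so
#
#     create_author_graph_of_words([['graph', 'of', 'docs', 'model']], window_size=4)
#
# produces the weighted edges (graph, of), (graph, docs), (graph, model),
# (of, docs), (of, model) and (docs, model), each with weight 1.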


def get_author_filenames(database, author_name):
    """Return the papers written by the given author; callers read the filenames as result[0][1]."""
    query = (
        'MATCH (a:Author)-[:writes]->(p:Paper) '
        f'WHERE a.name="{author_name}" RETURN a.name, collect(p.filename)'
    )
    return database.execute(query, 'r')


def get_graph_kernel(database, author_name):
    """Build author_name's graph-of-words and fit a graph kernel on it.

    Returns the fitted kernel, or None when the author's graph has no vertices.
    """
    filenames = get_author_filenames(database, author_name)
    docs = [read_file('data/CORD-19-research-challenge/dataset', fname + '.json') for fname in filenames[0][1]]
    docs = [generate_words(create_text(doc)) for doc in docs]
    g = create_author_graph_of_words(docs, 4)
    if len(g.vertices) == 0:
        gk = None
    else:
        # Weisfeiler-Lehman kernel with a Pyramid Match base kernel.
        gk = WeisfeilerLehman(normalize=False, n_iter=5, base_graph_kernel=(PyramidMatch, {}))
        gk.fit([g])
    return gk


def get_graph(database, author_name):
    """Build author_name's graph-of-words, or None when the graph has no vertices."""
    filenames = get_author_filenames(database, author_name)
    docs = [read_file('data/CORD-19-research-challenge/dataset', fname + '.json') for fname in filenames[0][1]]
    docs = [generate_words(create_text(doc)) for doc in docs]
    g = create_author_graph_of_words(docs, 4)
    if len(g.vertices) == 0:
        g = None
    return g


def calculate_similarity_feature(author1_graph_kernel, author2_graph):
    """Kernel similarity between two authors, or 0 when either author's graph is missing."""
    gk = author1_graph_kernel
    g = author2_graph
    if gk is None or g is None:
        similarity = 0
    else:
        # transform() returns a (1, 1) kernel matrix of author2's graph against the fitted kernel.
        similarity = gk.transform([g])[0][0]
    return similarity
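
# Illustrative sketch (a hypothetical helper, not used by main()): how the kernel
# above behaves on two small hand-made graphs. The Graph construction and the
# fit/transform calls mirror get_graph_kernel and calculate_similarity_feature.
def _demo_kernel_similarity():
    # Two toy graphs-of-words with identity node labels, as built by create_author_graph_of_words.
    g1 = Graph({('a', 'b'): 1, ('b', 'c'): 2}, node_labels={'a': 'a', 'b': 'b', 'c': 'c'})
    g2 = Graph({('a', 'b'): 1, ('b', 'd'): 1}, node_labels={'a': 'a', 'b': 'b', 'd': 'd'})
    gk = WeisfeilerLehman(normalize=False, n_iter=5, base_graph_kernel=(PyramidMatch, {}))
    gk.fit([g1])
    # A (1, 1) matrix: the similarity of g2 against the graph the kernel was fitted on.
    return gk.transform([g2])[0][0]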


def main(args):
    database = Neo4jDatabase('bolt://localhost:7687', 'neo4j', '1234')
    column_name = args.column_name
    column_name_normalized = f'{column_name}_normalized'
    train_dataset = utils.read_from_csv_file(args.train_filepath)
    test_dataset = utils.read_from_csv_file(args.test_filepath)

    # Compute the raw kernel similarity for every author pair in the train set.
    print('train###')
    for i, sample in enumerate(train_dataset):
        author1 = sample['author1']
        author2 = sample['author2']
        print(i, author1, author2)
        author1_graph_kernel = get_graph_kernel(database, author1)
        author2_graph = get_graph(database, author2)
        sample[column_name] = calculate_similarity_feature(author1_graph_kernel, author2_graph)
        train_dataset[i] = sample

    # Compute the raw kernel similarity for every author pair in the test set.
    print('test###')
    for i, sample in enumerate(test_dataset):
        author1 = sample['author1']
        author2 = sample['author2']
        print(i, author1, author2)
        author1_graph_kernel = get_graph_kernel(database, author1)
        author2_graph = get_graph(database, author2)
        sample[column_name] = calculate_similarity_feature(author1_graph_kernel, author2_graph)
        test_dataset[i] = sample
    utils.write_list_of_dicts_to_csv_file(args.train_filepath, train_dataset)
    utils.write_list_of_dicts_to_csv_file(args.test_filepath, test_dataset)

    # Re-read the datasets and append a min-max normalized copy of the new column.
    train_dataset = utils.read_from_csv_file(args.train_filepath)
    test_dataset = utils.read_from_csv_file(args.test_filepath)
    train_values = []
    for sample in train_dataset:
        train_values.append([sample[column_name]])
    test_values = []
    for sample in test_dataset:
        test_values.append([sample[column_name]])
    # Fit the scaler on the combined train and test values so both share the same scale.
    scaler = MinMaxScaler()
    scaler.fit(train_values + test_values)
    normalized_train_values = scaler.transform(train_values)
    for i, sample in enumerate(train_dataset):
        sample[column_name_normalized] = normalized_train_values[i, 0]
        train_dataset[i] = sample
    normalized_test_values = scaler.transform(test_values)
    for i, sample in enumerate(test_dataset):
        sample[column_name_normalized] = normalized_test_values[i, 0]
        test_dataset[i] = sample
    utils.write_list_of_dicts_to_csv_file(args.train_filepath, train_dataset)
    utils.write_list_of_dicts_to_csv_file(args.test_filepath, test_dataset)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        epilog=(
            "Example: python add_weisfeiler_pyramid_match_similarity.py "
            "--train-filepath TRAIN --test-filepath TEST --column-name COLUMN"
        )
    )
    parser.add_argument(
        "--train-filepath",
        help="The path for the train dataset",
        dest="train_filepath",
        type=str,
        required=True
    )
    parser.add_argument(
        "--test-filepath",
        help="The path for the test dataset",
        dest="test_filepath",
        type=str,
        required=True
    )
    parser.add_argument(
        "--column-name",
        help="The name of the column in which to store the computed similarity",
        dest="column_name",
        type=str,
        required=True
    )
    args = parser.parse_args()
    main(args)