fpgrowth.py
import pandas as pd
import time
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules
# data = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
'''
python fpgrowth.py
'''
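# Outputs (written at the end of the script): two CSV files,
#   ./result_co_authors_<min_sup>_<min_con>.csv
#   ./result_teams_<min_sup>_<min_con>.csv
# each with the columns: year, authors, papers, active.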
split_str = ' ||| '  # field separator used in the input text files
min_sup = 5          # minimum support count; divided by the per-year paper count below
min_con = 0.5        # minimum confidence for the association rules
alpha = 1            # base weight added to each group's activity score
beta = 10            # weight shared across the group (beta / group size)
def loadData():
    """Read the encoded co-authorship transactions, grouped by year."""
    dataset = {i: [] for i in range(2017, 2023)}
    with open('./authors_encoded.txt', 'r') as encode:
        for line in encode:
            words = line.split(split_str)
            length = len(words) - 1
            # words[0] is the year; words[1] and the trailing field are skipped
            dataLine = [int(words[i]) for i in range(2, length)]
            dataset[int(words[0])].append(dataLine)
    return dataset
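# A hypothetical input line, to illustrate the parsing above (the meaning of
# the fields other than the year and the author ids is an assumption):
#   '2020 ||| 9876 ||| 17 ||| 42 ||| 8 ||| tail'
# words[0] ('2020') selects the year bucket, words[1] and the final field are
# skipped, and [17, 42, 8] is appended as one transaction for 2020.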
def readAuthorIndex():
    """Map each encoded author id to its [name, count] fields."""
    author_dict = {}
    with open('./authors_index.txt', 'r') as authors_index:
        for name in authors_index:
            name = name.strip().split(split_str)
            if len(name) != 3:
                continue
            author_dict[int(name[0])] = [name[1], name[2]]
    return author_dict
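# A hypothetical authors_index.txt line (field meanings inferred from how the
# values are used later in the script):
#   '17 ||| Alice Smith ||| 12'
# gives author_dict[17] = ['Alice Smith', '12']; the name is reported in the
# output CSVs and 1/12 is added to the group's activity score.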
if __name__ == '__main__':
    print('loading the dataset...')
    dataset = loadData()
    author_dict = readAuthorIndex()
    print('converting transactions to one-hot format...')
    df = {}        # per-year one-hot DataFrame
    te_len = {}    # per-year number of transactions (papers)
    co_authors = {}
    teams = {}
    for year in range(2017, 2023):
        data = dataset[year]
        te = TransactionEncoder()
        te_ary = te.fit(data).transform(data)
        df[year] = pd.DataFrame(te_ary, columns=te.columns_)
        te_len[year] = te_ary.shape[0]
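    # For example, the first two sample transactions in the comment at the top,
    # [1, 3, 4] and [2, 3, 5], would be encoded as a boolean DataFrame:
    #        1      2      3      4      5
    #     True  False   True   True  False
    #    False   True   True  False   True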
    print('get co-authors & teams...')
    df_co_authors = pd.DataFrame(columns=['year', 'authors', 'papers', 'active'])
    df_teams = pd.DataFrame(columns=['year', 'authors', 'papers', 'active'])
    for year in range(2017, 2023):
        # get frequent itemsets
        frequent_itemsets = fpgrowth(df[year], min_support=min_sup/te_len[year], use_colnames=True)
        # get rules
        rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_con)
        # itemsets of size 2 are co-author pairs, larger itemsets are teams
        co_authors = frequent_itemsets[frequent_itemsets.itemsets.apply(len) == 2]
        teams = frequent_itemsets[frequent_itemsets.itemsets.apply(len) > 2]
        co_authors_set = set(tuple(fs) for fs in co_authors['itemsets'])
        teams_set = set(tuple(fs) for fs in teams['itemsets'])
        # keep antecedents/consequents as frozensets so the comparisons below
        # do not depend on iteration order
        rules_set = set(zip(rules['antecedents'], rules['consequents']))
        # remove relationships that are not confident: every member must be
        # implied by the rest of the group with confidence >= min_con
        for rel in co_authors_set:
            stay = True
            for i in range(2):
                ok = False
                for rule in rules_set:
                    if frozenset(rel[:i] + rel[i+1:]) == rule[0] and frozenset((rel[i],)) == rule[1]:
                        ok = True
                        break
                if not ok:
                    stay = False
                    break
            if not stay:
                co_authors = co_authors[co_authors.itemsets.apply(lambda x: tuple(x) != rel)]
        for rel in teams_set:
            stay = True
            for i in range(len(rel)):
                ok = False
                for rule in rules_set:
                    if frozenset(rel[:i] + rel[i+1:]) == rule[0] and frozenset((rel[i],)) == rule[1]:
                        ok = True
                        break
                if not ok:
                    stay = False
                    break
            if not stay:
                teams = teams[teams.itemsets.apply(lambda x: tuple(x) != rel)]
        # write the confident co-author pairs into the result DataFrame
        for encode_lists in co_authors['itemsets']:
            authors = []
            active = 0.0
            # get author names and the per-author activity contribution
            for encode in encode_lists:
                authors.append(author_dict[encode][0])
                active += 1 / int(author_dict[encode][1])
            active += alpha + beta / len(authors)
            authors = tuple(authors)
            # get group_papers: support * number of papers in this year
            for index, rows in co_authors.iterrows():
                if encode_lists == rows['itemsets']:
                    papers = int(float(rows['support']) * te_len[year])
                    active *= papers
                    break
            df_co_authors.loc[len(df_co_authors)] = [year, authors, papers, active]
        # write the confident teams into the result DataFrame
        for encode_lists in teams['itemsets']:
            authors = []
            active = 0.0
            # get author names and the per-author activity contribution
            for encode in encode_lists:
                authors.append(author_dict[encode][0])
                active += 1 / int(author_dict[encode][1])
            active += alpha + beta / len(authors)
            authors = tuple(authors)
            # get group_papers: support * number of papers in this year
            for index, rows in teams.iterrows():
                if encode_lists == rows['itemsets']:
                    papers = int(float(rows['support']) * te_len[year])
                    active *= papers
                    break
            df_teams.loc[len(df_teams)] = [year, authors, papers, active]
    print('writing results to csv...')
    df_co_authors.to_csv(
        './result_co_authors_' + str(min_sup) + '_' + str(min_con) + '.csv',
        index=False
    )
    df_teams.to_csv(
        './result_teams_' + str(min_sup) + '_' + str(min_con) + '.csv',
        index=False
    )
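# A sketch of how the results can be inspected afterwards (the year 2022 and
# the parameter values in the file name are just an illustration):
#   results = pd.read_csv('./result_co_authors_5_0.5.csv')
#   print(results[results.year == 2022].sort_values('active', ascending=False).head())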