-
Notifications
You must be signed in to change notification settings - Fork 0
/
database.py
119 lines (82 loc) · 3.73 KB
/
database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Implementation of a basic database wrapper for CS 425 Web-Scale Data Course Project
# The aim is to abstract data reads and writes.
# You can implement your own module which does not depend on a database.
import pymysql
from pymysql import IntegrityError
import config
source_table = 'sub_authors'
minhash_output_table = 'common_users'
simrank_output_table = 'simrank_sims'
post_table = 'post'
comment_table = 'comment'
author_postfix = '0000'
def connect_db():
return pymysql.connect(host=config.DB_HOST,
user=config.DB_USER,
password=config.DB_PASSWORD,
db=config.DB_NAME,
connect_timeout=6000)
# SimRank and MinHash functions
def get_sub_ids(connection):
cursor = connection.cursor()
sql = f"SELECT DISTINCT(sub_id) FROM {source_table}"
cursor.execute(sql)
return [str(item[0]) for item in cursor.fetchall()]
def get_user_ids(connection):
cursor = connection.cursor()
sql = f"SELECT DISTINCT(sub_id) FROM {source_table}"
cursor.execute(sql)
return [str(item[0]) + author_postfix for item in cursor.fetchall()]
def get_sub_users(connection, sub_id):
cursor = connection.cursor()
sql = f"SELECT author_id FROM {source_table} WHERE sub_id = %s"
cursor.execute(sql, sub_id)
return [str(item[0]) for item in cursor.fetchall()]
def get_sub_in_links(connection, sub_id):
cursor = connection.cursor()
sql = f"SELECT author_id FROM {source_table} WHERE sub_id = %s"
cursor.execute(sql, int(sub_id))
return [str(item[0]) + author_postfix for item in cursor.fetchall()]
def get_sub_in_links_v2(connection, sub_id):
cursor = connection.cursor()
sql = f'SELECT sub2, sim FROM {minhash_output_table} WHERE sub1 = %s'
cursor.execute(sql, int(sub_id))
return [(str(item[0]), int(item[1])) for item in cursor.fetchall()]
def get_user_in_links(connection, user_id):
cursor = connection.cursor()
sql = f"SELECT sub_id FROM {source_table} WHERE author_id = %s"
cursor.execute(sql, int(user_id))
return [str(item[0]) for item in cursor.fetchall()]
def insert_common_user_nums(connection, sub_i, user_nums):
cursor = connection.cursor()
sql = f"INSERT INTO {minhash_output_table} (sub1, sub2, sim) VALUES (%s, %s, %s)"
try:
cursor.executemany(sql, [(sub_i, sub_j, user_nums[sub_j]) for sub_j in user_nums])
connection.commit()
except IntegrityError:
pass
def insert_sub_similarities(connection, base_id, sims, limit=0.01):
cursor = connection.cursor()
sql = f"INSERT INTO {simrank_output_table} (sub1, sub2, sim) VALUES (%s, %s, %s)"
cursor.executemany(sql, [(base_id, sub_id, sims[sub_id]) for sub_id in sims
if base_id != sub_id and sims[sub_id] > limit])
connection.commit()
# Data crawler functions
def check_post(connection, post_id):
cursor = connection.cursor()
sql = f"SELECT * FROM {post_table} WHERE id= %s"
return cursor.execute(sql, post_id)
def insert_post(connection, post_id, username, sub_name, title, score, nsfw):
cursor = connection.cursor()
sql = f"INSERT INTO {post_table} (id, username, sub_name, title, score, nsfw) VALUES (%s, %s, %s, %s, %s, %s)"
try:
cursor.execute(sql, (post_id, username, sub_name, title, score, nsfw))
except IntegrityError:
print(f'Error! Could not insert post {post_id}')
def insert_comment(connection, comment_id, post_id, username):
cursor = connection.cursor()
sql = f"INSERT INTO {comment_table} (id, post_id, username) VALUES (%s, %s, %s)"
try:
cursor.execute(sql, (comment_id, post_id, username))
except IntegrityError:
print(f'Error! Could not insert comment {comment_id}')