-
Notifications
You must be signed in to change notification settings - Fork 8
/
05-Create-Open-Doors-Tables.py
executable file
·191 lines (171 loc) · 6.34 KB
/
05-Create-Open-Doors-Tables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# encoding: utf-8
import os
from shared_python.Args import Args
from shared_python.FinalTables import FinalTables
from shared_python.Sql import Sql
def _clean_email(author):
"""
Tidy up author emails that might be missing after earlier stages
:param author: row from the authors table
:return:
"""
email = author["email"]
if email is None or email == "":
email = (
"{0}{1}[email protected]".format(author["name"], args.archive_name)
.replace(" ", "")
.replace("'", "")
)
if email.startswith("mailto:"):
email = author["email"].replace("mailto:", "")
return email
def _build_exclusion_filter(ids_file, item_label, log):
    """
    Build a SQL exclusion clause from a file of Do Not Import (DNI) ids.

    :param ids_file: path to a file containing a comma-separated list of ids;
        may be None, empty or point to a missing file, in which case no
        filtering is applied
    :param item_label: plural noun for the log message, e.g. "stories"
    :param log: logger
    :return: "WHERE id NOT IN (<ids>)" or "" when there is nothing to exclude

    NOTE: if the file contains several lines, only the last line ends up in
    the clause (matches the original behaviour); the file is expected to
    hold a single comma-separated line of ids.
    """
    exclusion_filter = ""
    # Truthiness guard first: os.path.exists(None) raises TypeError, and the
    # original code only guarded the bookmarks path this way.
    if ids_file and os.path.exists(ids_file):
        with open(ids_file, "rt") as f:
            # Count ids by counting commas; N commas => N + 1 ids.
            log.info(
                "Removing {0} Do Not Import {1}...".format(
                    sum(line.count(",") for line in f) + 1, item_label
                )
            )
            f.seek(0)
            for line in f:
                exclusion_filter = "WHERE id NOT IN " + "(" + line + ")"
    return exclusion_filter


def main(args, log):
    """
    Create the final Open Doors tables and populate them from the working
    database: copy stories, bookmarks, authors and chapters, excluding Do
    Not Import items, cleaning author emails and dropping authors with no
    remaining works.

    :param args: parsed arguments for step 05 (from Args.args_for_05)
    :param log: logger
    """
    sql = Sql(args, log)
    final = FinalTables(args, sql, log)

    log.info("Creating final destination tables in {0}".format(args.output_database))
    table_names = {
        "authors": "authors",
        "stories": "stories",
        "chapters": "chapters",
        "story_links": "story_links",
    }
    codepath = os.path.dirname(os.path.realpath(__file__))
    sql.run_script_from_file(
        codepath + "/shared_python/create-open-doors-tables.sql",
        database=args.output_database,
    )

    # Filter out DNI stories and bookmarks - each ids file must be a
    # comma-separated list of DNI ids.
    story_exclusion_filter = _build_exclusion_filter(
        args.story_ids_to_remove, "stories", log
    )
    bookmark_exclusion_filter = _build_exclusion_filter(
        args.bookmark_ids_to_remove, "bookmarks", log
    )

    # Load filtered tables into variables
    stories_without_tags = final.original_table(
        table_names["stories"], story_exclusion_filter
    )
    log.info(
        "Stories without tags after removing DNI: {0}".format(len(stories_without_tags))
    )
    bookmarks_without_tags = final.original_table(
        table_names["story_links"], bookmark_exclusion_filter
    )
    if bookmarks_without_tags:
        log.info(
            "Bookmarks without tags after removing DNI: {0}".format(
                len(bookmarks_without_tags)
            )
        )
    else:
        log.info("No bookmarks to remove")

    # STORIES
    log.info(
        "Copying stories to final table {0}.stories...".format(args.output_database)
    )
    final_stories = []
    for story in stories_without_tags:
        story_authors = final.original_table(
            "item_authors", f"WHERE item_id={story['id']} and item_type='story'"
        )
        # Add additional story processing here
        if len(story_authors) > 0:
            final_stories.append(
                final.story_to_final_without_tags(story, story_authors)
            )
        else:
            # Stories with no author rows cannot be attributed on import.
            log.warning(
                f"Story with id {story['id']} has no authors, and will not be imported"
            )
    final.insert_into_final("stories", final_stories)

    # BOOKMARKS
    # BUG FIX: initialise unconditionally - the AUTHORS step below iterates
    # over final_bookmarks, which previously raised NameError whenever
    # bookmarks_without_tags was None.
    final_bookmarks = []
    if bookmarks_without_tags is not None:
        log.info(
            "Copying bookmarks to final table {0}.story_links...".format(
                args.output_database
            )
        )
        for bookmark in bookmarks_without_tags:
            # Add additional bookmark processing here
            bookmark_authors = final.original_table(
                "item_authors",
                f"WHERE item_id={bookmark['id']} and item_type='story_link'",
            )
            final_bookmarks.append(
                final.story_to_final_without_tags(bookmark, bookmark_authors, False)
            )
        if final_bookmarks:
            final.insert_into_final("story_links", final_bookmarks)

    # AUTHORS
    log.info(
        "Copying authors to final table {0}.authors, cleaning emails and removing authors with no works...".format(
            args.output_database
        )
    )
    final_authors = []
    authors = final.original_table(table_names["authors"])
    for final_author in authors:
        # Keep only authors credited (as author or coauthor) on at least one
        # imported story or bookmark.
        if any(
            story["author_id"] == final_author["id"]
            or story["coauthor_id"] == final_author["id"]
            for story in final_stories
        ) or any(
            bookmark["author_id"] == final_author["id"] for bookmark in final_bookmarks
        ):
            final_author["email"] = _clean_email(final_author)
            final_authors.append(final_author)
    final.insert_into_final("authors", final_authors)

    # CHAPTERS
    chapters = final.original_table(table_names["chapters"], "")
    if chapters:
        # Source chapters exist: copy the table wholesale into the output
        # database, re-adding keys and the id auto-increment.
        dest_chapter_table = f"{args.output_database}.{table_names['chapters']}"
        log.info(
            "Copying chapters table {0} from source chapters table...".format(
                dest_chapter_table
            )
        )
        sql.execute("drop table if exists {0}".format(dest_chapter_table))
        truncate_and_insert = (
            "create table {0} (unique(id), key(story_id)) select * from {1}.{2}".format(
                dest_chapter_table, args.temp_db_database, table_names["chapters"]
            )
        )
        sql.execute(truncate_and_insert)
        add_auto_increment = (
            "alter table {0} modify id int not null auto_increment".format(
                dest_chapter_table
            )
        )
        sql.execute(add_auto_increment)
    else:
        # No source chapters: synthesise one placeholder chapter per story.
        log.info(
            "Creating chapters table {0}.chapters from source stories table...".format(
                args.output_database
            )
        )
        final_chapters = final.dummy_chapters(final_stories)
        final.insert_into_final("chapters", final_chapters)
if __name__ == "__main__":
args_obj = Args()
args = args_obj.args_for_05()
log = args_obj.logger_with_filename()
main(args, log)