-
Notifications
You must be signed in to change notification settings - Fork 46
/
content_to_db.py
574 lines (505 loc) · 23.3 KB
/
content_to_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
import sqlite3
import os
import re
import shutil
import logging
from PIL import Image, UnidentifiedImageError
# Create the database and table
conn = sqlite3.connect('tsh.db')
cursor = conn.cursor()
# Create the topics table
cursor.execute('''
CREATE TABLE IF NOT EXISTS topics (
id INTEGER PRIMARY KEY,
title TEXT,
level INTEGER,
parent INTEGER,
path TEXT,
draft TEXT
)
''')
# Create the articles table
cursor.execute('''
CREATE TABLE IF NOT EXISTS articles (
id INTEGER PRIMARY KEY,
type TEXT,
title TEXT,
parent INTEGER,
description TEXT,
path TEXT,
keywords TEXT,
date TEXT,
date_modified TEXT,
draft TEXT,
weight INTEGER,
author TEXT,
content TEXT
)
''')
# Create the contributors table
cursor.execute('''
CREATE TABLE IF NOT EXISTS contributors (
id INTEGER PRIMARY KEY,
name TEXT,
description_short TEXT,
description_long TEXT,
skills TEXT,
linkedin TEXT,
facebook TEXT,
twitter TEXT,
email TEXT,
image TEXT,
status TEXT,
path TEXT,
content TEXT
)
''')
# Create the blogs table
cursor.execute('''
CREATE TABLE IF NOT EXISTS blogs (
id INTEGER PRIMARY KEY,
title TEXT,
description TEXT,
path TEXT,
date TEXT,
date_modified TEXT,
draft TEXT,
content TEXT
)
''')
# Commit the creation of tables
conn.commit()
# Define the content directory
script_directory = os.path.dirname(os.path.realpath(__file__))
content_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "content")
topic_folder = os.path.join(content_directory, 'topics')
# Insert or update a topic in the database
# Parameters:
# - cursor: SQLite cursor object
# - title (str): The title of the topic
# - level (int): The level of the topic in the hierarchy
# - parent (int): The parent ID of the topic
# - path (str): The path of the topic
# - draft (str): The draft status of the topic
# Returns:
# - int: The ID of the inserted or updated topic
def insert_topic_into_db(cursor, title, level, parent, path, draft):
try:
cursor.execute("SELECT 1 FROM topics WHERE path = ?", (path,))
exists = cursor.fetchone()
if exists:
cursor.execute("UPDATE topics SET title = ?, level = ?, parent = ?, draft = ? WHERE path = ?",
(title, level, parent, draft, path))
else:
cursor.execute("INSERT INTO topics (title, level, parent, path, draft) VALUES (?, ?, ?, ?, ?)",
(title, level, parent, path, draft))
conn.commit()
cursor.execute("SELECT id FROM topics WHERE path = ?", (path,))
topic_id = cursor.fetchone()[0]
logging.info(f"Inserted/Updated topic, ID: {topic_id}")
except Exception as e:
logging.error(f"Database error: {e}")
raise
return topic_id
# Insert or update an article in the database
# Parameters:
# - cursor: SQLite cursor object
# - type (str): The type of the article
# - title (str): The title of the article
# - parent (int): The parent ID of the article
# - description (str): The description of the article
# - path (str): The path of the article
# - keywords (str): The keywords associated with the article
# - date (str): The publication date of the article
# - date_modified (str): The modification date of the article
# - draft (str): The draft status of the article
# - weight (int): The weight of the article for ordering
# - author (str): The author of the article
# - content (str): The content of the article
# Returns:
# - None -> Insert/update article into db
def insert_article_into_db(cursor, type, title, parent, description, path, keywords, date, date_modified, draft, weight, author, content):
cursor.execute("SELECT 1 FROM articles WHERE path = ?", (path,))
exists = cursor.fetchone()
if exists:
cursor.execute("""
UPDATE articles
SET type = ?, title = ?, parent = ?, description = ?, keywords = ?, date = ?,
date_modified = ?, draft = ?, weight = ?, author = ?, content = ?
WHERE path = ?
""", (type, title, parent, description, keywords, date, date_modified, draft, weight, author, content, path))
print("Artikel bijgewerkt.")
else:
cursor.execute("""
INSERT INTO articles (type, title, parent, description, path, keywords, date, date_modified, draft, weight, author, content)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (type, title, parent, description, path, keywords, date, date_modified, draft, weight, author, content))
# Insert or update a contributor in the database
# Parameters:
# - cursor: SQLite cursor object
# - name (str): The name of the contributor
# - description_short (str): Short description of the contributor
# - description_long (str): Long description of the contributor
# - skills (str): Skills of the contributor
# - linkedin (str): LinkedIn profile URL of the contributor
# - facebook (str): Facebook profile URL of the contributor
# - twitter (str): Twitter profile URL of the contributor
# - email (str): Email of the contributor
# - image (str): Image URL of the contributor
# - status (str): Status of the contributor
# - path (str): Path of the contributor
# - content (str): Content of the contributor
# Returns:
# - None -> Insert/update contributor in db
def insert_or_update_contributor(cursor, name, description_short, description_long, skills, linkedin, facebook, twitter, email, image, status, path, content):
name = name if name else None
description_short = description_short if description_short else None
description_long = description_long if description_long else None
skills = skills if skills else None
linkedin = linkedin if linkedin else None
facebook = facebook if facebook else None
twitter = twitter if twitter else None
email = email if email else None
image = image if image else None
status = status if status else None
path = path if path else None
content = content if content else None
cursor.execute("SELECT 1 FROM contributors WHERE path = ?", (path,))
exists = cursor.fetchone()
if exists:
cursor.execute('''
UPDATE contributors SET name = ?, description_short = ?, description_long = ?, skills = ?, linkedin = ?, facebook = ?, twitter = ?, email = ?, image = ?, status = ?, content = ?
WHERE path = ?
''', (name, description_short, description_long, skills, linkedin, facebook, twitter, email, image, status, content, path))
else:
cursor.execute('''
INSERT INTO contributors (name, description_short, description_long, skills, linkedin, facebook, twitter, email, image, status, path, content)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
''', (name, description_short, description_long, skills, linkedin, facebook, twitter, email, image, status, path, content))
# Insert or update a blog in the database
# Parameters:
# - cursor: SQLite cursor object
# - title (str): The title of the blog
# - description (str): The description of the blog
# - path (str): The path of the blog
# - date (str): The publication date of the blog
# - date_modified (str): The modification date of the blog
# - draft (str): The draft status of the blog
# - content (str): The content of the blog
# Returns:
# - None -> Insert/update blog into db
def insert_or_update_blog(cursor, title, description, path, date, date_modified, draft, content):
title = title if title else None
description = description if description else None
path = path if path else None
date = date if date else None
date_modified = date_modified if date_modified else None
draft = draft if draft else None
content = content if content else None
cursor.execute("SELECT 1 FROM blogs WHERE path = ?", (path,))
exists = cursor.fetchone()
if exists:
cursor.execute('''
UPDATE blogs SET title = ?, description = ?, date = ?, date_modified = ?, draft = ?, content = ?
WHERE path = ?
''', (title, description, date, date_modified, draft, content, path))
else:
cursor.execute('''
INSERT INTO blogs (title, description, path, date, date_modified, draft, content)
VALUES (?, ?, ?, ?, ?, ?, ?)
''', (title, description, path, date, date_modified, draft, content))
# Parse a Markdown file to extract metadata
# Parameters:
# - file_path (str): The path to the Markdown file
# Returns:
# - tuple: A tuple containing the title, description, and draft status
def parse_md_file(file_path):
with open(file_path, 'r') as file:
content = file.read()
title = re.search(r'title: "(.*?)"', content)
description = re.search(r'description: "(.*?)"', content)
draft = re.search(r'draft: (.*?)\n', content)
# Extract values, return None if not found
title_value = title.group(1) if title else None
description_value = description.group(1) if description else None
draft_value = draft.group(1) if draft else None
return title_value, description_value, draft_value
# Process an article from a Markdown file and insert it into the database
# Parameters:
# - md_file_path (str): The path to the Markdown file
# - parent_id (int): The parent ID of the article
# Returns:
# - None -> Insert article into db
def process_article(md_file_path, parent_id):
with open(md_file_path, 'r', encoding='utf-8') as md_file:
content = md_file.read()
title = re.search(r'title: "(.*?)"', content)
description = re.search(r'description: "(.*?)"', content)
draft = re.search(r'draft: (.*?)\n', content)
keywords = re.search(r'keywords: "(.*?)"', content)
date = re.search(r'date: (\d{4}-\d{2}-\d{2})', content)
date_modified = re.search(r'date_modified: (\d{4}-\d{2}-\d{2})', content)
weight = re.search(r'weight: (\d+)', content)
author = re.search(r'author: "(.*?)"', content)
# Extract article content
match = re.search(r'---(.*?)---(.*)', content, re.DOTALL)
if match:
file_content = match.group(2).strip()
else:
file_content = None
# Insert data into articles table
insert_article_into_db(cursor, 'topic', title.group(1) if title else None, parent_id,
description.group(1) if description else None, os.path.basename(md_file_path).replace('.md', ''),
keywords.group(1) if keywords else None, date.group(1) if date else None,
date_modified.group(1) if date_modified else None, draft.group(1) if draft else None,
int(weight.group(1)) if weight else None, author.group(1) if author else None, file_content)
import sqlite3
import os
import re
import shutil
import logging
# Loop through topics and fill the database
# Parameters:
# - root_path (str): The root directory path to start the traversal
# Returns:
# - None
def fill_database(root_path):
exclude = {'img', 'images', 'data'}
path_to_id = {}
for path, dirs, files in os.walk(root_path):
dirs[:] = [d for d in dirs if d not in exclude]
level = path.count(os.sep) - root_path.count(os.sep)
parent_path = os.path.dirname(path)
parent_id = path_to_id.get(parent_path, None)
folder_name = os.path.basename(path)
index_file = os.path.join(path, '_index.md')
if os.path.exists(index_file):
title, description, draft = parse_md_file(index_file)
folder_id = insert_topic_into_db(cursor, title, level, parent_id, folder_name, draft)
path_to_id[path] = folder_id
for file in os.listdir(path):
if file != '_index.md' and file.endswith('.md'):
md_file_path = os.path.join(path, file)
process_article(md_file_path, folder_id)
# Check if the file is an image
# Parameters:
# - filename (str): The name of the file to check
# Returns:
# - bool: True if the file is an image, False otherwise
def is_image(filename):
_, ext = os.path.splitext(filename)
return ext.lower() in image_extensions
# Check if the file is not an image or Markdown
# Parameters:
# - filename (str): The name of the file to check
# Returns:
# - bool: True if the file is not an image or Markdown, False otherwise
def is_not_image_or_md(filename):
_, ext = os.path.splitext(filename)
return not (ext.lower() in image_extensions or ext.lower() == '.md')
# Execute loop through topics and fill the database
fill_database(topic_folder)
# Fetch and process examples
examples_root_folder = os.path.join(content_directory, 'examples')
for md_file_name in os.listdir(examples_root_folder):
if md_file_name != '_index.md' and md_file_name.endswith('.md'):
# Construct the full path of the Markdown file
md_file_path = os.path.join(examples_root_folder, md_file_name)
path = md_file_name.replace('.md', '').lower()
# Initialize variables
description = None
title = None
keywords = None
date = None
date_modified = None
draft = None
author = None
content = None
# Read the contents of the Markdown file
with open(md_file_path, 'r', encoding='utf-8') as md_file:
# YAML
for line in md_file:
if line.startswith('description:'):
description = line.strip().replace('description:', '', 1).replace('"','').strip()
elif line.startswith('title:'):
title = line.strip().replace('title:', '', 1).replace('"','').strip()
elif line.startswith('keywords:'):
keywords = line.strip().replace('keywords:', '', 1).replace('"','').strip()
elif line.startswith('date:'):
date = line.strip().replace('date:', '', 1).strip()
elif line.startswith('date_modified:'):
date_modified = line.strip().replace('date_modified:', '', 1).strip()
elif line.startswith('draft:'):
draft = line.strip().replace('draft:', '', 1).strip()
elif line.startswith('author:'):
author = line.strip().replace('author:', '', 1).replace('"','').strip()
elif line.startswith('weight:'):
weight = line.strip().replace('weight:', '', 1).replace('"','').strip()
with open(md_file_path, 'r', encoding='utf-8') as md_file:
# Fetch content
md_file_content = md_file.read()
match = re.match(r'---(.*?)---(.*)', md_file_content, re.DOTALL)
if match:
file_content = match.group(2)
content = file_content
else:
content = None
# Insert data into the articles table
insert_article_into_db(cursor, 'examples', title if title else None, None,
description if description else None, os.path.basename(md_file_path).replace('.md', ''),
keywords if keywords else None, date if date else None,
date_modified if date_modified else None, draft if draft else None,
int(weight) if weight else None, author if author else None, content if content else None)
# Fetch and process contributors
contributors_root_folder = os.path.join(content_directory, 'contributors')
for md_file_name in os.listdir(contributors_root_folder):
if md_file_name != '_index.md' and md_file_name.endswith('.md'):
# Construct the full path of the Markdown file
md_file_path = os.path.join(contributors_root_folder, md_file_name)
# Initialize variables
name = None
description_short = None
description_long = None
skills_list = []
skills = None
skills_started = False
linkedin = None
facebook = None
twitter = None
email = None
image = None
status = None
content = None
path = None
# Read the contents of the Markdown file
with open(md_file_path, 'r', encoding='utf-8') as md_file:
# YAML
for line in md_file:
if line.startswith('name:'):
name = line.strip().replace('name:', '', 1).replace('"','').strip()
path = name.replace(' ','-').lower()
elif line.startswith('description_short:'):
description_short = line.strip().replace('description_short:', '', 1).replace('"','').strip()
elif line.startswith('description_long:'):
description_long = line.strip().replace('description_long:', '', 1).replace('"','').strip()
elif 'linkedin' in line.strip().lower() and line.strip().startswith('link:'):
linkedin = line.strip().replace('link:', '', 1).strip()
elif 'facebook' in line.strip().lower() and line.strip().startswith('link:'):
facebook = line.strip().replace('link:', '', 1).strip()
elif 'twitter' in line.strip().lower() and line.strip().startswith('link:'):
twitter = line.strip().replace('link:', '', 1).strip()
elif line.startswith('email:'):
email = line.strip().replace('email:', '', 1).replace('"','').strip()
elif line.startswith('image:'):
image = line.strip().replace('image:', '', 1).replace('"','').strip()
elif line.startswith('status:'):
status = line.strip().replace('status:', '', 1).replace('"','').strip()
elif line.strip().startswith('skills:'):
skills_started = True
elif skills_started and line.strip().startswith('-'):
skill = line.strip().lstrip('-').strip()
skills_list.append(skill)
elif skills_started and not line.strip().startswith('-') and not line.strip().startswith('skills:'):
skills_started = False
skills = ', '.join(skills_list)
with open(md_file_path, 'r', encoding='utf-8') as md_file:
# Fetch content
md_file_content = md_file.read()
match = re.match(r'---(.*?)---(.*)', md_file_content, re.DOTALL)
if match:
content = match.group(2)
else:
content = None
# Insert or update contributor data in the contributors table
insert_or_update_contributor(cursor, name, description_short, description_long, skills, linkedin, facebook, twitter, email, image, status, path, content)
# Fetch and process blogs
blog_root_folder = os.path.join(content_directory, 'blog')
for md_file_name in os.listdir(blog_root_folder):
if md_file_name != '_index.md' and md_file_name.endswith('.md'):
# Construct the full path of the Markdown file
md_file_path = os.path.join(blog_root_folder, md_file_name)
path = md_file_name.replace('.md', '').lower()
# Initialize variables
description = None
title = None
date = None
date_modified = None
draft = None
content = None
# Read the contents of the Markdown file
with open(md_file_path, 'r', encoding='utf-8') as md_file:
# YAML
for line in md_file:
if line.startswith('description:'):
description = line.strip().replace('description:', '', 1).replace('"','').strip()
elif line.startswith('title:'):
title = line.strip().replace('title:', '', 1).replace('"','').strip()
elif line.startswith('date:'):
date = line.strip().replace('date:', '', 1).strip()
elif line.startswith('date_modified:'):
date_modified = line.strip().replace('date_modified:', '', 1).strip()
elif line.startswith('draft:'):
draft = line.strip().replace('draft:', '', 1).strip()
with open(md_file_path, 'r', encoding='utf-8') as md_file:
# Fetch content
md_file_content = md_file.read()
match = re.match(r'---(.*?)---(.*)', md_file_content, re.DOTALL)
if match:
file_content = match.group(2)
content = file_content
else:
content = None
# Insert or update blog data in the blogs table
insert_or_update_blog(cursor, title, description, path, date, date_modified, draft, content)
# Commit the changes to the database and close the connection
conn.commit()
conn.close()
# Define the image directory
img_directory = os.path.join(script_directory, "static/img")
# Define valid image extensions
image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.mov']
# Create image directory if it does not exist
if not os.path.exists(img_directory):
os.makedirs(img_directory)
# Loop through the content directory and convert/copy images to the image directory
for root, _, files in os.walk(content_directory):
for filename in files:
src_filepath = os.path.join(root, filename)
if is_image(filename):
try:
# Define the destination filepath with .webp extension
dst_filename = os.path.splitext(filename)[0] + '.webp'
dst_filepath = os.path.join(img_directory, dst_filename)
# Open and convert the image to webp
with Image.open(src_filepath) as img:
img.save(dst_filepath, 'webp')
print(f"Converted {src_filepath} to {dst_filepath}")
except UnidentifiedImageError:
print(f"UnidentifiedImageError: Cannot identify image file {src_filepath}")
except Exception as e:
print(f"Error processing {src_filepath}: {e}")
# Define the directory for other files
files_directory = os.path.join(script_directory, "static/files")
# Create files directory if it does not exist
if not os.path.exists(files_directory):
os.makedirs(files_directory)
# Initialize a set to keep track of unique file extensions
unique_extensions = set()
# Loop through the content directory and copy non-image, non-Markdown files
for root, _, files in os.walk(content_directory):
for filename in files:
if is_not_image_or_md(filename):
src_filepath = os.path.join(root, filename)
dst_filepath = os.path.join(files_directory, filename)
os.makedirs(os.path.dirname(dst_filepath), exist_ok=True)
# Check if the destination file already exists
if os.path.exists(dst_filepath):
continue
else:
shutil.copy(src_filepath, dst_filepath)
print(f"Bestand gekopieerd: {dst_filepath}")
# Add the file extension to the set of unique extensions
unique_extensions.add(os.path.splitext(filename)[1].lower())
# Convert the set of unique extensions to a list for output
unique_extensions_list = list(unique_extensions)
print(f"Unieke bestandsextensies van gekopieerde bestanden: {unique_extensions_list}")