-
Notifications
You must be signed in to change notification settings - Fork 46
/
Copy pathN2Omodule.py
350 lines (254 loc) · 10.9 KB
/
N2Omodule.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 25 14:16:18 2020
@author: books
"""
from io import TextIOWrapper
from os import path
from re import compile, search
from csv import DictReader
from pathlib import Path
def str_slash_char_remove(string):
    """Return *string* with every forward slash ('/') removed.

    Args:
        string: Text to clean, e.g. a link title.

    Returns:
        A copy of *string* that contains no '/' characters.
    """
    # A literal single-character removal needs no regex; the previous
    # pattern "\/" was also an invalid escape sequence in a non-raw string.
    return string.replace("/", "")
def str_forbid_char_remove(string):
    """Return *string* without the characters ``* ? : " < > |``.

    Forward slashes and backslashes are left untouched, so the function can
    be applied to relative paths without flattening them.

    Args:
        string: Text (title or relative path) to clean.

    Returns:
        A copy of *string* with the characters listed above removed.
    """
    # NOTE: the former regex '[\\*?:"<>|]' looked as if it removed
    # backslashes, but the doubled backslash only escaped the '*'. The
    # translation table below spells out the exact same character set.
    forbidden = str.maketrans("", "", '*?:"<>|')
    return string.translate(forbidden)
# convert %20 to ' '
def str_space_utf8_replace(string):
    """Return *string* with every percent-encoded space (``%20``) decoded.

    Only the literal sequence ``%20`` is handled; other percent-escapes are
    left exactly as they are.

    Args:
        string: Text that may contain ``%20``.

    Returns:
        A copy of *string* where each ``%20`` is a single space.
    """
    # Fixed-substring replacement: str.replace is the direct tool for this.
    return string.replace("%20", " ")
def str_notion_uid_remove(string):
    """Strip Notion's page identifier suffix from a percent-encoded name.

    The identifier is matched as ``%20`` followed by exactly 32 word
    characters (``\\w``); every such occurrence is removed.

    Args:
        string: Percent-encoded title or path, e.g. ``"Page%20<32 chars>.md"``.

    Returns:
        *string* with every ``%20`` + 32-word-character run removed.
    """
    # Raw string: "\w" in a normal literal is an invalid escape sequence.
    regexUID = compile(r"%20\w{32}")
    return regexUID.sub("", string)
def ObsIndex(contents):
    """Classify the entries of an export listing by kind.

    Args:
        contents: Sequence of ``pathlib.Path`` objects (each entry must
            provide ``.suffix`` and be accepted by ``os.path``).

    Returns:
        Tuple ``(mdIndex, csvIndex, othersIndex, folderIndex, folderTree)``:

        * ``mdIndex`` -- positions in *contents* whose suffix is ``.md``.
        * ``csvIndex`` -- positions whose suffix is ``.csv``.
        * ``othersIndex`` -- positions that are neither a folder, ``.md``
          nor ``.csv``.
        * ``folderIndex`` -- positions that are existing directories on disk.
        * ``folderTree`` -- the directory entries themselves; when the
          listing contains no explicit directory, the distinct parent
          directories of all entries (as ``Path`` objects) instead.

        All index lists are in ascending order and the implicit
        ``folderTree`` is sorted, so the result is deterministic.
    """
    mdIndex = []
    csvIndex = []
    othersIndex = []
    folderIndex = []
    folderTree = []

    for index, entry in enumerate(contents):
        is_folder = path.isdir(entry)
        if is_folder:
            folderIndex.append(index)
            folderTree.append(entry)

        if entry.suffix == ".md":
            mdIndex.append(index)
        elif entry.suffix == ".csv":
            csvIndex.append(index)
        elif not is_folder:
            # "Others" is everything not already claimed by another index.
            othersIndex.append(index)

    # Case: directories are implicit (no entry is a directory on disk), so
    # derive the tree from the parent directory of every entry.
    if not folderIndex:
        parents = sorted({path.dirname(entry) for entry in contents})
        folderTree.extend(Path(parent) for parent in parents)

    return mdIndex, csvIndex, othersIndex, folderIndex, folderTree
def N2Ocsv(csvFile):
    """Turn the first column of a CSV export into Obsidian wiki links.

    Each value of the first column is cleaned (URL scheme prefixes and the
    symbols ``< > ? : / | * "`` become spaces, whitespace runs collapse to a
    single space, leading/trailing whitespace is dropped) and wrapped as
    ``"[[title]] "`` (with a trailing space).

    Args:
        csvFile: Binary file-like object with the CSV data. It is decoded as
            UTF-8 and a leading byte-order mark is skipped ("utf-8-sig").

    Returns:
        List of link strings in file order. Rows whose cleaned title is
        empty are skipped. An empty or header-only CSV yields ``[]``.
    """
    # newline="" lets the csv module do its own line-ending handling, as the
    # csv documentation requires for file objects.
    reader = DictReader(
        TextIOWrapper(csvFile, "utf-8-sig", newline=""),
        delimiter=",",
        quotechar='"',
    )

    # Only the first column header is of interest. fieldnames is None for an
    # empty file, which previously surfaced as an IndexError.
    fieldnames = reader.fieldnames
    if not fieldnames:
        return []
    first_column = fieldnames[0]

    regexURLid = compile(r"(?:https?|ftp)://")
    # NOTE(review): backslash is NOT part of this class (it never was, the
    # old "\|" merely escaped the pipe) -- confirm whether it should be.
    regexSymbols = compile(r'[<>?:/|*"]')
    regexSpaces = compile(r"\s+")

    mdTitle = []
    for row in reader:
        title = row.get(first_column) or ""
        # 1. Replace URL identifiers and symbols with a space
        title = regexURLid.sub(" ", title)
        title = regexSymbols.sub(" ", title)
        # 2. Collapse duplicate whitespace, then trim both ends.
        #    Titles are intentionally not truncated here.
        title = regexSpaces.sub(" ", title).strip()
        if title:
            # 3. Convert the title to an [[internal link]]
            mdTitle.append("[[" + title + "]] ")
    return mdTitle
def convertBlankLink(line):
    """Convert Notion ``about:blank`` links into Obsidian pretty links.

    A link of the form ``[Title](about:blank#anchor)`` becomes
    ``"[[Title]] "`` (with a trailing space), where the title has every
    character that is neither a word character nor whitespace replaced by a
    space, whitespace runs collapsed, and is limited to 50 characters.

    Every link on the line is converted using its *own* title; previously
    all links on a line were overwritten with the title of the first one.

    Args:
        line: One line of markdown text.

    Returns:
        Tuple ``(line, num_matchs)``: the converted line and the number of
        links that were replaced (0 when nothing matched).
    """
    regexSymbols = compile(r"[^\w\s]")
    regexSpaces = compile(r"\s+")
    # about:blank links (lost or missing links within Notion)
    ## Group1: Pretty Link Title
    regexBlankLink = compile(
        r"\[(.[^\[\]\(\)]*)\]\(about:blank#.[^\[\]\(\)]*\)"
    )

    def _pretty_link(matchBlank):
        # Replace symbols with a space, then collapse duplicate spaces
        InternalLink = regexSymbols.sub(" ", matchBlank.group(1))
        InternalLink = regexSpaces.sub(" ", InternalLink)
        # Trim the start, cut the title at 50 characters, trim the end
        InternalLink = InternalLink.lstrip()[0:50].rstrip()
        return "[[" + InternalLink + "]] "

    # A callable replacement is evaluated once per match, so each link keeps
    # its own title and the result is never parsed as a regex template.
    return regexBlankLink.subn(_pretty_link, line)
def embedded_link_convert(line):
    """Convert markdown embeds ``![title](path)`` into ``[[path]]`` links.

    For every embed on the line the target path is cleaned up:

    1. the Notion identifier (``%20`` followed by 32 word characters) is
       removed,
    2. percent-escapes are decoded as UTF-8,
    3. the characters handled by ``str_forbid_char_remove`` are stripped.

    The embed (including its leading ``!`` and its title) is then replaced
    by ``[[cleaned path]]``. Each embed is converted from its own path.

    Percent-decoding uses ``urllib.parse.unquote`` so that UTF-8 sequences
    of any length are handled. The previous hand-written decoder only
    understood two-byte sequences and three-byte sequences whose lead byte
    was E1-E6, and failed on everything else.

    Args:
        line: One line of markdown text.

    Returns:
        Tuple ``(line, num_matchs)``: the converted line and the number of
        embeds that were replaced (0 when nothing matched).
    """
    from urllib.parse import unquote

    regexPath = compile(r"!\[(.*?)\]\((.*?)\)")
    regexUID = compile(r"%20\w{32}")

    def _to_wikilink(pathMatch):
        # The UID pattern relies on the encoded "%20", so strip it first.
        relativePath = regexUID.sub("", pathMatch.group(2))
        try:
            relativePath = unquote(
                relativePath, encoding="utf-8", errors="strict"
            )
        except UnicodeDecodeError:
            # Best effort: keep the undecodable escapes, only fix spaces.
            print("ERROR: convert unicode failed")
            print(f" {relativePath} in - {line}")
            relativePath = str_space_utf8_replace(relativePath)
        relativePath = str_forbid_char_remove(relativePath)
        return "[[" + relativePath + "]]"

    # A callable replacement keeps backslashes in the path literal instead
    # of having them interpreted as regex template escapes.
    return regexPath.subn(_to_wikilink, line)
def internal_link_convert(line):
    """Convert markdown links to pages/databases into Obsidian wiki links.

    A link ``[title](target)`` is converted to ``[[title]]`` when its target
    contains ``.md`` or ``.csv`` or points at ``https://www.notion.so``.
    The title is sanitised on the way: the Notion identifier is removed,
    ``%20`` becomes a space, and forbidden characters and slashes are
    stripped. All other links are left exactly as they are.

    The decision and the sanitising are made per link. Previously the first
    link on the line decided for every link, and every wiki link on the
    line was overwritten with the first one's title.

    Args:
        line: One line of markdown text.

    Returns:
        Tuple ``(line, num_matchs)``: the converted line and the number of
        links that were converted (0 when nothing was converted).
    """
    regexPath = compile(r"\[(.*?)\]\((.*?)\)")
    regexRelativePathNotion = compile(r"https://www\.notion\.so")
    regexRelativePathMdCsv = compile(r"(?:\.md|\.csv)")
    num_matchs = 0
    title_changed = False

    def _to_wikilink(pathMatch):
        nonlocal num_matchs, title_changed
        relativePath = pathMatch.group(2)
        is_md_or_csv = regexRelativePathMdCsv.search(relativePath)
        notionMatch = regexRelativePathNotion.search(relativePath)
        if not (is_md_or_csv or notionMatch):
            # Not a page/database link: keep the original text untouched.
            return pathMatch.group(0)

        num_matchs += 1
        rawTitle = pathMatch.group(1)
        title = str_notion_uid_remove(rawTitle)
        title = str_space_utf8_replace(title)
        title = str_forbid_char_remove(title)
        title = str_slash_char_remove(title)
        if title != rawTitle:
            title_changed = True
        return "[[" + title + "]]"

    # A callable replacement keeps backslashes in titles literal instead of
    # having them interpreted as regex template escapes.
    newLine = regexPath.sub(_to_wikilink, line)

    if title_changed:
        # Diagnostic output: show the line before and after sanitising.
        print(line)
        print(f" remove forbid {newLine}\n")
    return newLine, num_matchs
def feature_tags_convert(line):
    """Prefix the comma-separated tags of a ``Tags:`` line with ``#``.

    A line that starts with ``Tags:`` followed by one whitespace character
    and at least one more character is rewritten as
    ``"Tags: #a, #b, ..."``. Surrounding whitespace of each tag is dropped
    and empty entries (``"a,,b"``, trailing commas) are skipped instead of
    producing a bare ``#``.

    Args:
        line: One line of markdown text.

    Returns:
        Tuple ``(line, num_tag)``: the rewritten line and the number of tags
        written. Lines that do not match, or that contain no non-empty tag,
        are returned unchanged with a count of 0.
    """
    tagMatch = search(r"^Tags:\s(.+)", line)
    if not tagMatch:
        return line, 0

    # NOTE(review): a tag containing inner spaces is emitted as-is
    # ("#my tag"); confirm whether such tags need further normalising.
    stripped = (tag.strip() for tag in tagMatch.group(1).split(","))
    Otags = ["#" + tag for tag in stripped if tag]
    if not Otags:
        return line, 0
    return "Tags: " + ", ".join(Otags), len(Otags)
def N2Omd(mdFile):
    """Run every line of a markdown file through the link/tag converters.

    Each line is decoded as UTF-8, stripped of trailing whitespace and then
    passed, in this order, through ``embedded_link_convert``,
    ``internal_link_convert``, ``convertBlankLink`` and
    ``feature_tags_convert``.

    Args:
        mdFile: Iterable yielding the lines of the file as ``bytes``.

    Returns:
        Tuple ``(newLines, counts)`` where ``newLines`` is the list of
        converted lines and ``counts`` is
        ``[internal links, embedded links, blank links, tags]``.
    """
    tally = {"internal": 0, "embedded": 0, "blank": 0, "tags": 0}
    converted = []

    for raw in mdFile:
        text = raw.decode("utf-8").rstrip()

        # Embedded links go first so the generic link pass never sees them.
        text, hits = embedded_link_convert(text)
        tally["embedded"] += hits

        text, hits = internal_link_convert(text)
        tally["internal"] += hits

        text, hits = convertBlankLink(text)
        tally["blank"] += hits

        text, hits = feature_tags_convert(text)
        tally["tags"] += hits

        converted.append(text)

    counts = [
        tally["internal"],
        tally["embedded"],
        tally["blank"],
        tally["tags"],
    ]
    return converted, counts