-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_manager.py
92 lines (78 loc) · 2.91 KB
/
text_manager.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""Code made to read texts of the corpus
This code uses texts from http://heimskringla.no
"""
import os
import codecs
import json
from bs4 import BeautifulSoup
# Author(s) of this module.
__author__ = ["Clément Besnier", ]
# Presumably the file formats the corpus tools work with.
# NOTE(review): not referenced anywhere in the visible code - confirm whether other modules use it.
accepted_formats = ["html", "json", "txt"]
def text_extractor(orig_format, dest_format, folder, orig_filenames, dest_filenames, extraction_method,
                   mode="r", encoding="utf8"):
    """
    Convert the source files of a text folder into another format.

    Each file ``<folder>/<orig_format>_files/<orig_filename>`` is read, its
    content is passed to ``extraction_method`` and the returned string is
    written to ``<folder>/<dest_format>_files/<dest_filename>``. The
    destination folder is created when it does not exist yet.

    :param orig_format: format of the source files, only "html" is accepted
    :param dest_format: format of the produced files, "txt" or "json"
    :param folder: folder containing the ``<format>_files`` sub-folders
    :param orig_filenames: names of the files to read
    :param dest_filenames: names of the files to write, paired by position
        with ``orig_filenames`` (pairing uses ``zip``, so surplus names in the
        longer list are ignored)
    :param extraction_method: callable receiving the content of a source file
        and returning the text to write
    :param mode: mode used to open the source files
    :param encoding: encoding used both for reading and for writing
    :return: None
    :raises AssertionError: if ``orig_format`` or ``dest_format`` is not accepted
    """
    assert orig_format in ["html"], "orig_format must be 'html'"
    assert dest_format in ["txt", "json"], "dest_format must be 'txt' or 'json'"

    orig_dir = os.path.join(folder, orig_format + "_files")
    dest_dir = os.path.join(folder, dest_format + "_files")
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(dest_dir, exist_ok=True)

    for orig_filename, dest_filename in zip(orig_filenames, dest_filenames):
        with codecs.open(os.path.join(orig_dir, orig_filename), mode, encoding) as f_orig:
            extracted = extraction_method(f_orig.read())
        # The destination is opened only once the extraction has succeeded, so
        # a failing extraction_method never truncates an existing output file.
        with codecs.open(os.path.join(dest_dir, dest_filename), "w", encoding) as f_dest:
            f_dest.write(extracted)
def extract_text(data):
    """
    Strip the markup from an HTML document with BeautifulSoup.

    :param data: HTML content handed to the 'html.parser' parser
    :return: the result of ``get_text()`` on the parsed document
    """
    parsed_document = BeautifulSoup(data, 'html.parser')
    plain_text = parsed_document.get_text()
    return plain_text
class TextLoader:
    """
    Loader for the "complete" file of one text of the corpus.

    The file is looked up at ``<name>/<extension>_files/complete.<extension>``.
    """

    def __init__(self, name, extension):
        """
        :param name: path of the folder of the text
        :param extension: format to load; ``load`` handles "txt", "html" and "json"
        """
        self.name = name
        self.extension = extension

    @staticmethod
    def get_available_names():
        """
        Get the available texts (folders) of the current working directory.

        :return: names of the directories of "." whose name contains no dot
        """
        # The isdir check keeps extensionless files (e.g. LICENSE) out of the
        # result; the dot filter alone cannot tell them apart from folders.
        return [name for name in os.listdir(".") if "." not in name and os.path.isdir(name)]

    def load(self):
        """
        Get the selected text.

        :return: the content of the file as a string for "txt" and "html",
            the decoded JSON document for "json", or None when the extension
            is not one of those three or the file cannot be opened
        """
        if self.extension not in ("txt", "html", "json"):
            return None
        path = os.path.join(self.name, self.extension + "_files", "complete." + self.extension)
        try:
            if self.extension == "json":
                with open(path, "r", encoding="utf8") as f:
                    return json.load(f)
            with codecs.open(path, "r", encoding="utf8") as f:
                return f.read()
        except IOError:
            print("Impossible to load the wished text")
            return None
if __name__ == "__main__":
    # Regenerate the plain-text version of each poem from its HTML source.
    for poem in ("Atlakviða", "Hávamál"):
        text_extractor("html", "txt", os.path.join("Sæmundar-Edda", poem), ["complete.html"], ["complete.txt"],
                       extract_text)
    loader = TextLoader(os.path.join("Sæmundar-Edda", "Atlakviða"), "txt")
    print(loader.get_available_names())
    text = loader.load()
    # load() returns None (after printing its own message) when the file
    # cannot be opened; slicing None would raise a TypeError.
    if text is not None:
        print(text[:100])