-
Notifications
You must be signed in to change notification settings - Fork 0
/
batch_pdf_file_to_text_file.py
104 lines (92 loc) · 3.71 KB
/
batch_pdf_file_to_text_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
import argparse
# Command-line interface: the extraction method number, the folder holding the
# PDFs to convert, and the folder that receives the generated text files.
# All three options are mandatory.
parser = argparse.ArgumentParser(description="Batch pdf to text")
parser.add_argument(
    '-m', '--inputMethodNumber',
    metavar='selectionMethod',
    required=True,
    # Reject anything other than the four supported method numbers at parse
    # time, with a regular argparse usage error.
    choices=['1', '2', '3', '4'],
    help='Input pdf to text method number, there are 1, 2, 3, 4 optional, can only set one of them',
)
parser.add_argument(
    '-i', '--inputFolderPath',
    metavar='targetPdfLiterature',
    required=True,
    help='Enter the path to the folder that contains only pdf literatures',
)
parser.add_argument(
    '-o', '--outputFolderPath',
    metavar='outputFolder',
    required=True,
    help='Output folder path',
)
args = parser.parse_args()

# Input side: the raw folder argument, the names of its entries, and its
# absolute path (used later to build the full path of each entry).
folder1 = args.inputFolderPath
dirList1 = os.listdir(folder1)
dirPath1 = os.path.abspath(folder1)

# Output side: the raw folder argument as given on the command line.
folder2 = args.outputFolderPath
def mkdir(path):
    """Create the directory ``path``, including missing parents, if needed.

    Calling this on a directory that already exists is a no-op.
    ``exist_ok=True`` replaces a separate existence check, so there is no
    window between checking and creating in which another process could
    create the directory and make the call fail.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
        OSError: If the directory cannot be created for another reason.
    """
    os.makedirs(path, exist_ok=True)
# Make sure the output folder exists, then remember its absolute path for
# building the result file names.
mkdir(folder2)
dirPath2 = os.path.abspath(folder2)

# Import only the PDF library that belongs to the method number chosen on the
# command line, so the other three do not need to be installed.
if args.inputMethodNumber == '1':
    import PyPDF2
elif args.inputMethodNumber == '2':
    import pdfplumber
elif args.inputMethodNumber == '3':
    import fitz
elif args.inputMethodNumber == '4':
    # Method 4 drives pdfminer's lower-level converter/interpreter API and
    # collects the output in an in-memory text buffer.
    import io
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
def pdfToTextMethod1(filePath):
    """Extract the text of the PDF at ``filePath`` with PyPDF2.

    The file is opened in binary mode and the text of every page is
    concatenated in page order, without any separator.

    Args:
        filePath: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages.
    """
    with open(filePath, 'rb') as pdfStream:
        pages = PyPDF2.PdfReader(pdfStream).pages
        # Extraction happens while the file is still open.
        return ''.join(page.extract_text() for page in pages)
def pdfToTextMethod2(filePath):
    """Extract the text of the PDF at ``filePath`` with pdfplumber.

    The text of every page is concatenated in page order, without any
    separator. A page for which no text is extracted contributes an empty
    string instead of aborting the whole document.

    Args:
        filePath: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages.
    """
    with pdfplumber.open(filePath) as pdf:
        # extract_text() can yield None for a page without any text (seen in
        # some pdfplumber releases); concatenating None would raise TypeError
        # and lose the text of every other page, so coalesce it to ''.
        return ''.join(page.extract_text() or '' for page in pdf.pages)
def pdfToTextMethod3(filePath):
    """Extract the text of the PDF at ``filePath`` with fitz (PyMuPDF).

    The text of every page is concatenated in page order, without any
    separator.

    Args:
        filePath: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages.
    """
    document = fitz.open(filePath)
    try:
        # The extracted text must be returned: falling off the end of the
        # function would hand None to the caller, which cannot be written to
        # the output file.
        return ''.join(page.get_text() for page in document)
    finally:
        # Release the document even when extraction of a page fails.
        document.close()
def pdfToTextMethod4(filePath):
    """Extract the text of the PDF at ``filePath`` with pdfminer.

    Every page is run through a ``PDFPageInterpreter`` whose
    ``TextConverter`` writes into an in-memory text buffer; the buffer's
    contents are returned once all pages have been processed.

    Args:
        filePath: Path of the PDF file to read.

    Returns:
        The text collected from all pages.
    """
    # Layout-analysis settings handed to the text converter.
    layoutOptions = LAParams(
        line_overlap=0.5,
        char_margin=2.0,
        line_margin=0.5,
        word_margin=0.1,
        boxes_flow=0.5,
        detect_vertical=False,
        all_texts=False,
    )
    manager = PDFResourceManager()
    textBuffer = io.StringIO()
    device = TextConverter(manager, textBuffer, laparams=layoutOptions)
    interpreter = PDFPageInterpreter(manager, device)

    with open(filePath, 'rb') as pdfStream:
        pages = PDFPage.get_pages(pdfStream, caching=True, check_extractable=True)
        for page in pages:
            interpreter.process_page(page)

    # Read the buffer before closing the converter and the buffer itself.
    extracted = textBuffer.getvalue()
    device.close()
    textBuffer.close()
    return extracted
# Resolve the extraction function once, before the loop: the method number
# does not change between files.
extractors = {
    '1': pdfToTextMethod1,
    '2': pdfToTextMethod2,
    '3': pdfToTextMethod3,
    '4': pdfToTextMethod4,
}
if args.inputMethodNumber not in extractors:
    # Stop with one clear message rather than failing on every single file.
    raise SystemExit('unknown pdf to text method number: ' + repr(args.inputMethodNumber))
extractPdfText = extractors[args.inputMethodNumber]

# Convert every entry of the input folder; the result is written to the output
# folder under the entry's name with '.txt' appended.
for fileName1 in dirList1:
    filePath1 = os.path.join(dirPath1, fileName1)
    textResultPathName = os.path.join(dirPath2, fileName1 + '.txt')
    try:
        pdfText = extractPdfText(filePath1)
        # The output file is opened only after extraction succeeded, and the
        # context manager closes it even if the write fails.
        with open(textResultPathName, 'w', encoding='utf-8') as outFile:
            outFile.write(pdfText)
    except Exception as e:
        # Report the failing file and carry on with the next one. Catching
        # Exception (not BaseException) lets Ctrl-C and SystemExit stop the
        # whole batch instead of merely skipping one file.
        print('failed:', fileName1, '|', e)
# Created by Huilong Chen.