-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert.py
30 lines (25 loc) · 1012 Bytes
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import PyPDF2
import argparse
import os
def extract_text_from_pdf(pdf_path, pdf_dir="pdf"):
    """Return the text of every page of a PDF as one newline-free string.

    Args:
        pdf_path: Path of the PDF file, relative to ``pdf_dir``.
        pdf_dir: Directory the PDF is looked up in. Defaults to ``"pdf"``,
            the location that was previously hard-coded.

    Returns:
        The text of all pages concatenated in page order, with every
        newline character removed.

    Raises:
        OSError: If the file cannot be opened (for example
            ``FileNotFoundError`` when it does not exist).
    """
    full_path = os.path.join(pdf_dir, pdf_path)
    with open(full_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Pages are extracted inside the ``with`` block so the file is still
        # open, and joined once instead of growing a string with ``+=``.
        original_text = "".join(
            page.extract_text() for page in pdf_reader.pages
        )
    # Newlines are dropped outright (not replaced by spaces), so text from
    # adjacent lines is joined without a separator.
    return original_text.replace("\n", "")
def main():
    """Command-line entry point: convert one PDF into a text file.

    Reads the positional ``pdf_path`` argument, extracts the PDF's text with
    ``extract_text_from_pdf`` and writes it, UTF-8 encoded, to a file with
    the same name but a ``.txt`` extension inside the ``txt`` directory.
    The ``txt`` directory (and any sub-directory implied by ``pdf_path``) is
    created when missing.
    """
    parser = argparse.ArgumentParser(description="Extract text from PDF files.")
    parser.add_argument("pdf_path", type=str, help="Path of the PDF file to be extracted")
    args = parser.parse_args()

    extracted_text = extract_text_from_pdf(args.pdf_path)

    # splitext only strips an extension from the final path component, so a
    # dot in a directory name cannot truncate the output name.
    output_filename = os.path.splitext(args.pdf_path)[0] + ".txt"
    output_path = os.path.join("txt", output_filename)

    # Write through an explicit path instead of chdir-ing into 'txt' and
    # back: the working directory is never changed, so a failed write cannot
    # leave the process in the wrong directory.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(extracted_text)

    print(f"Text saved in '{output_filename}'")
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()