From 3e110b6f499d6a3cba80be251c46f6d908a53444 Mon Sep 17 00:00:00 2001
From: Azhar Mithani
Date: Sun, 7 Oct 2018 20:43:38 +0530
Subject: [PATCH] Create pdf2txt.py

---
 src/pdf2txt.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 src/pdf2txt.py

diff --git a/src/pdf2txt.py b/src/pdf2txt.py
new file mode 100644
index 0000000..a7327ab
--- /dev/null
+++ b/src/pdf2txt.py
@@ -0,0 +1,39 @@
+import PyPDF2
+import textract
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+
+# To process many files, wrap the steps below in a for-loop over
+# filenames (a sketch follows this patch).
+filename = 'enter the name of the file here'
+# open() gives us a binary file object for the PDF
+pdfFileObj = open(filename, 'rb')
+# pdfReader is a readable object that PyPDF2 parses page by page
+pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
+# Knowing the number of pages lets us iterate over all of them
+num_pages = pdfReader.numPages
+count = 0
+text = ""
+# Read each page and append its text
+while count < num_pages:
+    pageObj = pdfReader.getPage(count)
+    count += 1
+    text += pageObj.extractText()
+# PyPDF2 cannot read scanned (image-based) PDFs. If it returned no
+# words, fall back to textract, which runs tesseract OCR on the file.
+# textract returns bytes, so decode them to a string.
+if text == "":
+    text = textract.process(filename, method='tesseract', language='eng').decode('utf-8')
+pdfFileObj.close()
+# The text variable now holds all the text extracted from the PDF.
+# Try print(text) to see what it contains -- likely lots of spaces and
+# junk such as '\n'. Next, clean it and reduce it to a list of keywords.
+
+# word_tokenize() breaks the text into individual words
+tokens = word_tokenize(text)
+# Punctuation we wish to strip from the token list
+punctuations = ['(', ')', ';', ':', '[', ']', ',']
+# Stopwords are words like "the", "I", "and" that hold little value as keywords
+stop_words = stopwords.words('english')
+# Keep only tokens that are neither stopwords nor punctuation
+keywords = [word for word in tokens if word not in stop_words and word not in punctuations]
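
The first comment in pdf2txt.py promises a for-loop over many files. Below is a minimal sketch of what that could look like, assuming the extraction logic above is pulled into a helper; the extract_text() name and the pdfs/ directory are illustrative, not part of this patch.

import glob

import PyPDF2
import textract

def extract_text(filename):
    """Same strategy as pdf2txt.py: try PyPDF2 first, then OCR."""
    with open(filename, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        text = ""
        for page_num in range(pdfReader.numPages):
            text += pdfReader.getPage(page_num).extractText()
    if text == "":
        # Scanned PDF: textract runs tesseract OCR and returns bytes
        text = textract.process(filename, method='tesseract', language='eng').decode('utf-8')
    return text

# Process every PDF in a (hypothetical) pdfs/ directory
for filename in glob.glob('pdfs/*.pdf'):
    text = extract_text(filename)
    print(filename, len(text), 'characters extracted')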
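
One setup note, in case it trips anyone up: word_tokenize() and stopwords.words() rely on NLTK data packages that are not installed with the library itself. If either call raises a LookupError, a one-time download fixes it:

import nltk
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # the stopword corpus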