"""
Created on Wed May 4 17:03:32 2022
@author: LMbogo
"""
#Load packages
import os
import re
#import sys
#sys.path.insert(0, python_root)
#import data_util
import pickle
import csv
import pandas as pd
#text mining related packages
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#to save files to excel
import xlsxwriter
#data locations
#shared folder 'C:\Users\LMbogo\OneDrive - International Monetary Fund (PRD)\All_Raw_Data_By_Source'
#individual folders:
# =============================================================================
# AIV_2000-2021\txt
# COM_Flagships\txt
# Ereview\txt
# Program_2021\txt
# =============================================================================
class read_files():
    def __init__(self, path):
        self.path = path
    #Read a single text file and return its content as one string
    def __read_txt_file(self, file_path):
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            opened_file = f.read()
        return opened_file
    #def __parse_txt():
    #iterate through all .txt files in the folder
    def iterate_through_files(self):
        opened_text_file = dict()
        for file in os.listdir(self.path):
            if file.endswith(".txt"):
                file_path = os.path.join(self.path, file)
                #call the read text file function
                opened_text_file[file] = self.__read_txt_file(file_path)
        return opened_text_file
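#Example usage (hypothetical folder and file names, shown for illustration only):
#   reader = read_files(r'C:\some\folder\txt')
#   contents = reader.iterate_through_files()
#   contents['report.txt']  #-> the full text of report.txt as one string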
class find_keywords():
    def __init__(self, keywords, content):
        self.keywords = keywords
        self.content = content
    ##Needs to have individual scenarios looked at and handled separately
    def regex_content(self):
        #normalize in place: strip non-letters and split into a list of words
        content = self.content.replace('U.S.', 'US')
        self.content = re.sub(r'[^a-zA-Z]', ' ', content)
        self.content = self.content.split()
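    #Example (made-up input): 'U.S. growth rose 3%!' becomes
    #['US', 'growth', 'rose'] -- digits and punctuation are dropped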
    def iterate_through_content(self, key, keywords):
        iterator = 0
        ind_word_dict = dict({'_Word_Count': 0, '_key': key})
        for item in self.content:
            ##Where the code from the comment above goes
            if len(item) > 1:
                #case-insensitive check: is this word part of any keyword?
                if any(item.lower() in word.lower() for word in keywords):
                    #filter the keywords down to only those the word appears in
                    x = filter(lambda k: item.lower() in k.lower(), keywords)
                    for occurrences in list(x):  #iterate through the filtered list
                        iter_1 = iterator + 1  #index of the next word in the split content
                        content_str = item
                        #grow the candidate phrase while it is still part of the keyword
                        while iter_1 < len(self.content) and content_str.lower() in occurrences.lower():
                            if content_str.lower() == occurrences.lower():  #exact match: count it
                                if occurrences in ind_word_dict:
                                    ind_word_dict[occurrences] += 1
                                else:
                                    ind_word_dict[occurrences] = 1
                            #append the next word in the array to the candidate,
                            #i.e. 'bank' becomes 'bank profitability'
                            content_str = content_str + ' ' + self.content[iter_1]
                            iter_1 += 1  #move to the next content word
                ind_word_dict['_Word_Count'] += 1  #running word count
            iterator += 1  #keeps count of where you are in the array
        return ind_word_dict
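    #Illustration (made-up data): with keywords ['bank', 'bank profitability']
    #and content ['bank', 'profitability', 'rose'], the scan first counts
    #'bank', then extends the phrase to 'bank profitability' and counts that
    #keyword as well, yielding {'bank': 1, 'bank profitability': 1, ...}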
class retrieve_file_keywords():
    def __init__(self, basepath, keywords, filepaths):
        self.base_path = basepath
        self.keywords = keywords
        self.filepaths = filepaths
        self.words_counted = []
        self.content = dict()
    def __insert_dataframe(self):  #to make sure the code only returns a dataframe
        v = self.keywords + ['_Word_Count']
        #one row per file (file names as the index), one column per keyword
        return pd.DataFrame(data=self.words_counted, index=list(self.content), columns=v)
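    #Example shape (illustrative): with keywords ['governance', 'corruption'],
    #the frame has one row per .txt file and columns
    #['governance', 'corruption', '_Word_Count']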
    def run(self):
        keyword_occurrences = dict()
        for filepath in self.filepaths:
            x = read_files(os.path.join(self.base_path, filepath))
            self.content = x.iterate_through_files()
            self.words_counted = []
            for i in self.content:
                find_kwds = find_keywords(self.keywords, str(self.content[i]))
                find_kwds.regex_content()  #mutates the content in place; no return value
                iterate_through = find_kwds.iterate_through_content(i, self.keywords)
                self.words_counted.append(iterate_through)
            #keyword_occurrences is a dictionary with the folder names as primary keys
            keyword_occurrences[filepath.split('\\')[0]] = self.__insert_dataframe()
            print('done')
        self.__write_to_excel(keyword_occurrences)
    def __write_to_excel(self, keyword_occurrences):
        #you can change the path below to where you want the results saved on your computer
        out_path = r'C:\Users\LMbogo\OneDrive - International Monetary Fund (PRD)\SPR\SPRAI PROJECTS\0.2 Text Mining\Text Mining Results\textmining.xlsx'
        #the context manager closes the writer and outputs the Excel file on exit
        with pd.ExcelWriter(out_path, engine='xlsxwriter') as writer:
            for df_key in self.filepaths:
                keyword_occurrences[df_key.split('\\')[0]].to_excel(writer, sheet_name=df_key.split('\\')[0])
        print('finished')
##NEEDS: more refinement on what is accepted as a word, i.e. 'industries' should
## also count toward 'industry'; e.g. key on a stem such as x[:5] and store
## dict_name[x[:5]] = {'words': ['industry', 'industries', 'industrialization'], 'count': 3}
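#A minimal sketch of one such refinement (an assumption, not part of the
#pipeline above): lemmatize tokens and keywords before matching, using the
#WordNetLemmatizer imported above (requires the NLTK 'wordnet' corpus to be
#downloaded). Lemmatizing both sides would let 'industries' match 'industry'.
def normalize_word(word, lemmatizer=WordNetLemmatizer()):
    """Lower-case and lemmatize one token, e.g. 'Industries' -> 'industry'."""
    return lemmatizer.lemmatize(word.lower())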
if __name__ == "__main__":
    base_path = "C:\\Users\\LMbogo\\OneDrive - International Monetary Fund (PRD)\\All_Raw_Data_By_Source\\"
    filepaths = ['AIV_2000-2021\\txt', 'COM_Flagships\\txt', 'Ereview\\txt', 'Program_2021\\txt']
    keywords = ['governance', 'corruption', 'bribery']
    get_file_keywords = retrieve_file_keywords(base_path, keywords, filepaths)
    get_file_keywords.run()