-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_brand_patterns.py
175 lines (149 loc) · 6.1 KB
/
generate_brand_patterns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/env python
# coding: utf8
"""
Mon, Oct 21, 2019
Stacy Bridges
This simplified code transforms an input set of brands into brand patterns
that can be used for string matching by the NERS EntityRuler.
"""
# import library components ---------------------------------------------------
import json
import os
import pathlib
import shutil
import sys
import unicodedata  # use to normalize international characters
from pathlib import Path

import numpy as np
import pandas as pd
from pandas import ExcelFile
from pandas import ExcelWriter

# import py files -------------------------------------------------------------
import menu
# entity label written into every pattern line
_BRAND_LABEL = 'BRND'

# characters that end the current word and are stored as tokens of their own
_SPECIAL_CHARS = (' ', '/', '\\', '+', '-', '.', '&', ',', '(', ')')


def _tokenize_brand(brand):
    """Split one brand value into the lowercase tokens used in its pattern.

    The value is converted to ``str``, decomposed with NFKD, stripped of any
    character that has no ASCII form, and lowercased. Runs of ordinary
    characters become one token each; every character in ``_SPECIAL_CHARS``
    becomes its own token, except that a space is stored as an empty string.

    Example: ``'AT&T Inc.'`` -> ``['at', '&', 't', '', 'inc', '.']``

    Args:
        brand: the raw cell value (any type; it is passed through ``str``).

    Returns:
        A list of token strings, possibly empty.
    """
    text = unicodedata.normalize('NFKD', str(brand))
    text = text.encode('ascii', 'ignore').decode('ascii').lower()

    tokens = []
    word = ''
    for char in text:
        if char in _SPECIAL_CHARS:
            # a special char closes the word built from the preceding chars
            if word:
                tokens.append(word)
                word = ''
            # NOTE(review): a space is stored as an empty-string token, as in
            # the original code -- confirm the downstream matcher expects it
            tokens.append('' if char == ' ' else char)
        else:
            word += char
    # flush the last word; guarded so that a brand ending in a special char
    # does not gain an extra empty token
    if word:
        tokens.append(word)
    return tokens


def _build_pattern(tokens, label=_BRAND_LABEL):
    """Return one JSONL pattern line (without newline) for a token list.

    The layout is ``{"label":"<label>", "pattern":[{"lower":"<tok>"},...]}``.
    Token and label text go through ``json.dumps`` so that backslashes,
    double quotes and control characters are escaped and the line is always
    valid JSON.

    Args:
        tokens: token strings, in order.
        label: the value written to the ``"label"`` key.

    Returns:
        The pattern line as a string.
    """
    token_specs = ','.join(
        '{"lower":' + json.dumps(tok) + '}' for tok in tokens
    )
    return '{"label":' + json.dumps(label) + ', "pattern":[' + token_specs + ']}'


def main():
    """Prompt for a brand workbook and write its brand patterns to a JSONL file.

    Steps:
      1. Walk the folder that contains this script and list every file whose
         name contains '.xlsx' and 'brand' but not 'extract'.
      2. Print a numbered menu and read the user's choice from stdin until it
         is valid. Choosing 'm' calls ``menu.main()`` and then exits.
      3. Read the 'BRAND' column of 'Sheet1' from the chosen workbook.
      4. Build one pattern line per brand and write the lines to
         ``ners_<name minus its last 5 chars>_patterns.jsonl`` in the script's
         folder, echoing each line to the console.

    Raises:
        SystemExit: after ``menu.main()`` returns when the user chose 'm'.
    """
    # print menu options to console -------------------------------------------
    folder_path = os.path.dirname(os.path.abspath(__file__))

    file_choices = []  # file names shown in the menu
    file_paths = []    # full path of each menu entry, same order
    for root, _dirs, files in os.walk(folder_path):
        for name in files:
            if '.xlsx' in name and 'brand' in name and 'extract' not in name:
                file_choices.append(name)
                # os.walk also descends into subfolders, so remember where the
                # file really lives instead of assuming the top folder
                file_paths.append(os.path.join(root, name))

    print('\n-----------------------------------------')
    print(' Brand Input Files')
    print('-----------------------------------------')
    spacer = ' '
    print('{}{}{}'.format('m', spacer, 'Show Main Menu'))
    menu_choices = ['m']
    for number, name in enumerate(file_choices, start=1):
        print('{} {}'.format(number, name))
        menu_choices.append(str(number))

    # get and validate user input
    print('\nSelect an input file (or \'m\' for Main Menu)')
    gold_choice = input()
    while gold_choice not in menu_choices:
        print('Invalid choice! Select an input file (or \'m\' for Main Menu)')
        gold_choice = input()

    if gold_choice == 'm':
        menu.main()
        # control comes back here when menu.main() returns, so exit explicitly
        # to stop all py execution in the terminal
        sys.exit()

    # identify i/o -------------------------------------------------------------
    chosen = int(gold_choice) - 1
    brands_name = file_choices[chosen]
    brands_file = file_paths[chosen]
    # drop the last 5 characters ('.xlsx') to form the output name
    outfile_name = 'ners_' + brands_name[:-5] + '_patterns.jsonl'
    outfile_path = os.path.join(folder_path, outfile_name)

    brands_data = pd.read_excel(brands_file, sheet_name='Sheet1')
    # drop missing values (NaN) so that str() does not turn them into a
    # literal 'nan' brand
    brands = brands_data['BRAND'].dropna()

    # build brand patterns -----------------------------------------------------
    print('\nBuilding brand patterns...\n')
    patterns = [_build_pattern(_tokenize_brand(brand)) for brand in brands]

    # write brand patterns to file ---------------------------------------------
    # one pattern per line, so the file can be loaded as JSONL
    with open(outfile_path, 'w', encoding='utf-8') as outfile:
        for line in patterns:
            outfile.write(line)
            outfile.write('\n')
            print(line)
    brand_count = len(patterns)

    # end program
    print('\n')
    print('Done.')
    print('JSONL file created.')
    print('{} brand patterns written to:'.format(brand_count))
    print('{}'.format(outfile_path))
# run the generator only when this file is executed as a script
if __name__ == '__main__':
    main()