forked from vamsikrishna1902/IntentPredictionEval
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathQueryParser.py
315 lines (296 loc) · 16.2 KB
/
QueryParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
import sys
import os
import time, argparse
import ParseConfigFile as parseConfig
from ParseConfigFile import getConfig
import QueryExecution as QExec
import re, gc
def setLimFlagTrue(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel):
projFlag[parseLevel] = False
selFlag[parseLevel] = False
grpFlag[parseLevel] = False
havFlag[parseLevel] = False
tabFlag[parseLevel] = False
orderFlag[parseLevel] = False
limFlag[parseLevel] = True
def setOrderFlagTrue(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel):
projFlag[parseLevel] = False
selFlag[parseLevel] = False
grpFlag[parseLevel] = False
havFlag[parseLevel] = False
tabFlag[parseLevel] = False
orderFlag[parseLevel] = True
limFlag[parseLevel] = False
def setTabFlagTrue(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel):
projFlag[parseLevel] = False
selFlag[parseLevel] = False
grpFlag[parseLevel] = False
havFlag[parseLevel] = False
tabFlag[parseLevel] = True
orderFlag[parseLevel] = False
limFlag[parseLevel] = False
def setHavFlagTrue(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel):
projFlag[parseLevel] = False
selFlag[parseLevel] = False
grpFlag[parseLevel] = False
havFlag[parseLevel] = True
tabFlag[parseLevel] = False
orderFlag[parseLevel] = False
limFlag[parseLevel] = False
def setGrpFlagTrue(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel):
projFlag[parseLevel] = False
selFlag[parseLevel] = False
grpFlag[parseLevel] = True
havFlag[parseLevel] = False
tabFlag[parseLevel] = False
orderFlag[parseLevel] = False
limFlag[parseLevel] = False
def setSelFlagTrue(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel):
projFlag[parseLevel] = False
selFlag[parseLevel] = True
grpFlag[parseLevel] = False
havFlag[parseLevel] = False
tabFlag[parseLevel] = False
orderFlag[parseLevel] = False
limFlag[parseLevel] = False
def setProjFlagTrue(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel):
projFlag[parseLevel] = True
selFlag[parseLevel] = False
grpFlag[parseLevel] = False
havFlag[parseLevel] = False
tabFlag[parseLevel] = False
orderFlag[parseLevel] = False
limFlag[parseLevel] = False
def setAllFlagsFalse(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel):
projFlag[parseLevel] = False
selFlag[parseLevel] = False
grpFlag[parseLevel] = False
havFlag[parseLevel] = False
tabFlag[parseLevel] = False
orderFlag[parseLevel] = False
limFlag[parseLevel] = False
def checkFlagAndSetList(parseLevel, selectList, projectList, groupByList, havingList, tableList, orderByList, limitList, selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, token):
if projFlag[parseLevel]:
if parseLevel not in projectList:
projectList[parseLevel] = token
else:
projectList[parseLevel] = projectList[parseLevel] + " " + token
elif selFlag[parseLevel]:
if parseLevel not in selectList:
selectList[parseLevel] = token
else:
selectList[parseLevel] = selectList[parseLevel] + " " + token
elif grpFlag[parseLevel]:
if parseLevel not in groupByList:
groupByList[parseLevel] = token
else:
groupByList[parseLevel] = groupByList[parseLevel] + " " + token
elif havFlag[parseLevel]:
if parseLevel not in havingList:
havingList[parseLevel] = token
else:
havingList[parseLevel] = havingList[parseLevel] + " " + token
elif tabFlag[parseLevel]:
if parseLevel not in tableList:
tableList[parseLevel] = token
else:
tableList[parseLevel] = tableList[parseLevel] + " " + token
elif orderFlag[parseLevel]:
if parseLevel not in orderByList:
orderByList[parseLevel] = token
else:
orderByList[parseLevel] = orderByList[parseLevel] + " " + token
elif limFlag[parseLevel]:
if parseLevel not in limitList:
limitList[parseLevel] = token
else:
limitList[parseLevel] = limitList[parseLevel] + " " + token
return selectList, projectList, groupByList, havingList, tableList, orderByList, limitList
def parseQueryLevelWise(sessQuery, parseLevel, selectList, projectList, groupByList, havingList, tableList, orderByList, limitList, selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, stackParenth, extractFieldNum):
tokens = sessQuery.split() #accommodates one or more spaces
stackParenth[parseLevel] = 0 # 0 opening (
extractFieldNum[parseLevel] = 0 # 0 EXTRACTS so far
for token in tokens:
if "EXTRACT" in token:
extractFieldNum[parseLevel] = extractFieldNum[parseLevel] + 1
if "(" in token:
stackParenth[parseLevel] = stackParenth[parseLevel] + token.count("(")
if ")" in token:
stackParenth[parseLevel] = stackParenth[parseLevel] - token.count(")")
if stackParenth[parseLevel] < 0:
parseLevel = parseLevel - 1 # finished a subquery
if "SELECT" in token:
parseLevel =parseLevel + 1
setProjFlagTrue(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel)
stackParenth[parseLevel] = 0 # 0 opening (
extractFieldNum[parseLevel] = 0 # 0 Extracts so far
elif "FROM" in token:
if extractFieldNum[parseLevel] > 0:
extractFieldNum[parseLevel] = extractFieldNum[parseLevel] - 1 # this FROM is attached to EXTRACT and not to the table
selectList, projectList, groupByList, havingList, tableList, orderByList, limitList = checkFlagAndSetList(parseLevel,
selectList,
projectList,
groupByList,
havingList, tableList, orderByList, limitList,
selFlag, projFlag,
grpFlag, havFlag,
tabFlag, orderFlag, limFlag, token)
else:
setTabFlagTrue(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel) # this is FROM followed by tablename(s)
elif "WHERE" in token:
setSelFlagTrue(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel)
elif "GROUP" in token:
setGrpFlagTrue(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel)
elif "HAVING" in token:
setHavFlagTrue(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel)
elif "ORDER" in token:
setOrderFlagTrue(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel)
elif "LIMIT" in token:
setLimFlagTrue(selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, parseLevel)
else:
selectList, projectList, groupByList, havingList, tableList, orderByList, limitList = checkFlagAndSetList(parseLevel, selectList, projectList, groupByList, havingList, tableList, orderByList, limitList, selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, token)
return selectList, projectList, groupByList, havingList, tableList, orderByList, limitList
def parseNYCQuery(sessQuery):
selectList = {}
projectList = {}
groupByList = {}
havingList = {}
tableList = {}
orderByList = {}
limitList = {}
selFlag = {}
projFlag = {}
grpFlag = {}
havFlag = {}
tabFlag = {}
orderFlag = {}
limFlag = {}
parseLevel = -1
stackParenth = {}
extractFieldNum = {}
(selectList, projectList, groupByList, havingList, tableList, orderByList, limitList) = parseQueryLevelWise(sessQuery, parseLevel, selectList, projectList, groupByList, havingList, tableList, orderByList, limitList, selFlag, projFlag, grpFlag, havFlag, tabFlag, orderFlag, limFlag, stackParenth, extractFieldNum)
return (sessQuery, selectList, projectList, groupByList, havingList, tableList, orderByList, limitList)
def rewriteHavingGroupByComplexQuery(sessQuery, selectList, projectList, groupByList, havingList, tableList):
parseLevel = 0
colAliasList = []
cols = projectList[parseLevel].split(",")
actualColList = []
for col in cols:
lenCol = len(col.split(" AS "))
actualCol = ""
for i in range(lenCol-1):
actualCol += col.split(" AS ")[i]
if i<lenCol-2:
actualCol += " AS "
actualColList.append(actualCol)
colAlias = col.split(" AS ")[lenCol-1]
colAliasList.append(colAlias)
if projectList[parseLevel] not in sessQuery:
print("Incorrect sessQuery with extra spaces !!")
exit(0)
tempQuery = sessQuery.replace(projectList[parseLevel], projectList[parseLevel] + " INTO TEMPTABLE")
#tempQuery = "CREATE TABLE TEMPTABLE AS "+sessQuery
tableAlias = tableList[parseLevel].split()[1] # this gives the alias
#tempProjectList = projectList[parseLevel].replace(tableAlias, "TEMPTABLE")
joinSelList = ""
assert len(colAliasList) == len(actualColList)
for colID in range(len(colAliasList)):
#tempProjectList = tempProjectList.replace("AS "+colAlias, "AS TEMPTABLE_"+colAlias)
#projectList[parseLevel] = projectList[parseLevel].replace(" "+colAlias, " "+tableAlias+"_"+colAlias)
colAlias = colAliasList[colID]
if colID == 0:
joinSelList = joinSelList + "WHERE "
else:
joinSelList = joinSelList+" AND "
joinSelList = joinSelList+actualColList[colID]+"=TEMPTABLE."+colAlias
joinProjectList = projectList[parseLevel]+", "+tableAlias+".id" #ignored for now
#joinProjectList = "DISTINCT "+tableAlias+".id"
newQuery = tempQuery+";"+"SELECT "+joinProjectList+" FROM "+tableList[parseLevel]+", TEMPTABLE "+joinSelList
return newQuery
def rewriteQuery(sessQuery, selectList, projectList, groupByList, havingList, tableList):
parseLevel = 0
#newQuery = None
if "WHERE" not in sessQuery and ("HAVING" not in sessQuery or "HAVING (COUNT(1) > 0)" in sessQuery):
return None # either one of HAVING or WHERE must be in the sessQuery, else every tuple ends up being a witness
elif "WHERE" in sessQuery and "GROUP BY" not in sessQuery: #HAVING may or may not be in the query, does not change anything
if projectList[parseLevel] not in sessQuery:
print("Incorrect sessQuery with extra spaces !!")
exit(0)
newQuery = sessQuery.replace(projectList[parseLevel], projectList[parseLevel] + ", id") # we are projecting id as well
#projectList[parseLevel] = projectList[parseLevel] + ", id"
#numAttrsProjected = projectList[parseLevel].count(",") + 1
if "HAVING (COUNT(1) > 0)" in newQuery:
newQuery = newQuery.replace("HAVING (COUNT(1) > 0)", "GROUP BY id HAVING (COUNT(1) > 0)")
#newQuery = sessQuery.replace(projectList[parseLevel], "DISTINCT id") # we are projecting id as well
elif "WHERE" in sessQuery and "GROUP BY" in sessQuery and "HAVING" not in sessQuery:
if projectList[parseLevel] not in sessQuery:
print("Incorrect sessQuery with extra spaces !!")
exit(0)
newQuery = sessQuery.replace(projectList[parseLevel], projectList[parseLevel] + ", id") # we are projecting id as well
projectList[parseLevel] = projectList[parseLevel] + ", id"
#numAttrsProjected = projectList[parseLevel].count(",")+1
if groupByList[parseLevel] not in newQuery:
print("Incorrect newQuery without groupBy list !!")
exit(0)
#newQuery = newQuery.replace(groupByList[parseLevel], groupByList[parseLevel] + ", " + str(numAttrsProjected)) # we are grouping by id as well, via its index ID in the projections
#groupByList[parseLevel] = groupByList[parseLevel] + ", " + str(numAttrsProjected)
newQuery = newQuery.replace(groupByList[parseLevel], groupByList[parseLevel] + ", id") # we are grouping by id as well, via its index ID in the projections
groupByList[parseLevel] = groupByList[parseLevel] + ", id"
elif "GROUP BY" in sessQuery and "HAVING" in sessQuery:
# if MIN/MAX/AVG/SUM in sessQuery, every tuple is a witness
aggrKeywords = ["MIN", "MAX", "AVG", "COUNT", "SUM"]
if any(aggrKeyword in projectList[parseLevel] for aggrKeyword in aggrKeywords):
return None
newQuery = rewriteHavingGroupByComplexQuery(sessQuery, selectList, projectList, groupByList, havingList, tableList)
return newQuery
def rewriteQueryForProvenance(sessQuery, configDict):
# write grammar to parse query and then rewrite
#sessQuery = ' '.join(sessQuery.split('\t'))
#while ' ' in sessQuery:
#sessQuery = sessQuery.replace(' ', ' ')
sessQuery = ' '.join(sessQuery.split())
if configDict['DATASET']=='NYCTaxiTrips':
(sessQuery, selectList, projectList, groupByList, havingList, tableList, orderByList, limitList) = parseNYCQuery(sessQuery)
newQuery = rewriteQuery(sessQuery, selectList, projectList, groupByList, havingList, tableList) # no need to pass order by and limit as they do not make a difference
return newQuery
# the whole point of this parsing is to check if in the results we have a column called rowID. Else, we want to get rowID.
# Several possible cases:
#
def fetchRowIDs(sessQuery, configDict):
rowIDs = []
newQuery = rewriteQueryForProvenance(sessQuery, configDict)
if newQuery is None:
return (newQuery, None)
if ";" in newQuery: # happens for combined provenance queries generated for HAVING, GROUP BY combination
tempQuery = newQuery.split(";")[0]
QExec.executeQuery("drop table TEMPTABLE", configDict)
QExec.executeQuery(tempQuery, configDict)
print("successfully created temptable")
#QExec.executeQuery("select * from temptable", configDict)
newQuery = newQuery.split(";")[1]
cur = QExec.executeQuery(newQuery, configDict) # without intent
rowIDs = QExec.getRowIDs(cur)
return (newQuery,rowIDs)
# rowIDs = []
# for row in rows:
# rowIDs.append(row['id'])
# del rows
# gc.collect()
if __name__ == "__main__":
configDict = parseConfig.parseConfigFile("configFile.txt")
with open(getConfig(configDict['QUERYSESSIONS'])) as f:
for line in f:
sessQueries = line.split(";")
sessName = sessQueries[0]
for i in range(1,len(sessQueries)-1): # we need to ignore the empty query coming from the end of line semicolon ;
sessQuery = sessQueries[i].split("~")[0]
#sessQuery = "SELECT nyc_yellow_tripdata_2016_06_sample_1_percent.dropoff_latitude AS dropoff_latitude, nyc_yellow_tripdata_2016_06_sample_1_percent.dropoff_longitude AS dropoff_longitude, nyc_yellow_tripdata_2016_06_sample_1_percent.fare_amount AS fare_amount FROM public.nyc_yellow_tripdata_2016_06_sample_1_percent nyc_yellow_tripdata_2016_06_sample_1_percent GROUP BY 1, 2, 3 HAVING ((CAST(MIN(nyc_yellow_tripdata_2016_06_sample_1_percent.fare_amount) AS DOUBLE PRECISION) >= 11.999999999999879) AND (CAST(MIN(nyc_yellow_tripdata_2016_06_sample_1_percent.fare_amount) AS DOUBLE PRECISION) <= 14.00000000000014))"
sessQuery = ' '.join(sessQuery.split())
#(newQuery, rowIDs) = fetchRowIDs(sessQuery, configDict)
newQuery = rewriteQueryForProvenance(sessQuery, configDict)
print(sessName+", Query "+str(i)+": \n")
print("OrigQuery: "+sessQuery+"\n")
if newQuery is not None:
print("Provenance Query: "+newQuery+"\n")
else:
print("Provenance Query: None\n")