Change the string append logic to use StringIO
[email protected] committed May 31, 2023
1 parent 4e5fe0c commit 5062043
Showing 4 changed files with 26 additions and 23 deletions.
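For context, SimpleReplay's Log objects accumulate query text in an io.StringIO buffer; this commit adds a get_text_value() accessor and routes every caller through it instead of touching log.text.getvalue() directly. A rough sketch of why a StringIO buffer is preferred over repeated str concatenation for many small appends (an illustrative benchmark, not part of the commit):

import io
import timeit

def build_with_str(parts):
    # Each += may copy the accumulated text, so cost can grow quadratically
    text = ""
    for p in parts:
        text += p
    return text

def build_with_stringio(parts):
    # StringIO writes into an internal buffer; one copy at getvalue()
    buf = io.StringIO()
    for p in parts:
        buf.write(p)
    return buf.getvalue()

parts = ["select 1;\n"] * 50_000
print(timeit.timeit(lambda: build_with_str(parts), number=20))
print(timeit.timeit(lambda: build_with_stringio(parts), number=20))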
9 changes: 6 additions & 3 deletions src/SimpleReplay/audit_logs_parsing.py
@@ -27,6 +27,9 @@ def clear_and_set_text(self, new_value):
     def append_text(self, value):
         self.text.write(value)
 
+    def get_text_value(self):
+        return self.text.getvalue()
+
     def get_filename(self):
         base_name = (
             self.database_name
@@ -53,7 +56,7 @@ def __str__(self):
                 self.database_name,
                 self.pid,
                 self.xid,
-                self.text.getvalue(),
+                self.get_text_value(),
             )
         )
 
@@ -67,11 +70,11 @@ def __eq__(self, other):
             and self.database_name == other.database_name
             and self.pid == other.pid
             and self.xid == other.xid
-            and self.text.getvalue() == other.text.getvalue()
+            and self.get_text_value() == other.get_text_value()
         )
 
     def __hash__(self):
-        return hash((str(self.pid), str(self.xid), self.text.getvalue().strip("\n")))
+        return hash((str(self.pid), str(self.xid), self.get_text_value().strip("\n")))
 
 
 class ConnectionLog:
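Taken together, the hunks above give the Log class a complete read/write API around its buffer. A simplified, runnable sketch (the real class in audit_logs_parsing.py carries more fields, and the body of clear_and_set_text is assumed; only the method names come from the diff):

import io

class Log:
    # Simplified sketch of the Log object in audit_logs_parsing.py
    def __init__(self):
        self.text = io.StringIO()  # query text accumulates here
        self.pid = ""
        self.xid = ""

    def clear_and_set_text(self, new_value):
        # Assumed implementation: start a fresh buffer holding new_value
        self.text = io.StringIO()
        self.text.write(new_value)

    def append_text(self, value):
        self.text.write(value)

    def get_text_value(self):
        # The accessor added by this commit: the one sanctioned way to read
        return self.text.getvalue()

log = Log()
log.append_text("select ")
log.append_text("1;")
assert log.get_text_value() == "select 1;"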
10 changes: 5 additions & 5 deletions src/SimpleReplay/extract/extractor/extract_parser.py
@@ -64,11 +64,11 @@ def _parse_user_activity_log(file, logs, databases, start_time, end_time):
         if filename in logs:
             # Check if duplicate. This happens with JDBC connections.
             prev_query = logs[filename][-1]
-            if not is_duplicate(prev_query.text.getvalue(), user_activity_log.text.getvalue()):
+            if not is_duplicate(prev_query.get_text_value(), user_activity_log.get_text_value()):
                 if fetch_pattern.search(
-                    prev_query.text.getvalue()
-                ) and fetch_pattern.search(user_activity_log.text.getvalue()):
-                    user_activity_log.clear_and_set_text(f"--{user_activity_log.text.getvalue()}")
+                    prev_query.get_text_value()
+                ) and fetch_pattern.search(user_activity_log.get_text_value()):
+                    user_activity_log.clear_and_set_text(f"--{user_activity_log.get_text_value()}")
                     logs[filename].append(user_activity_log)
                 else:
                     logs[filename].append(user_activity_log)
@@ -107,7 +107,7 @@ def _parse_start_node_log(file, logs, databases, start_time, end_time):
         if filename in logs:
             # Check if duplicate. This happens with JDBC connections.
             prev_query = logs[filename][-1]
-            if not is_duplicate(prev_query.text.getvalue(), start_node_log.text.getvalue()):
+            if not is_duplicate(prev_query.get_text_value(), start_node_log.get_text_value()):
                 logs[filename].append(start_node_log)
         else:
             logs[filename] = [start_node_log]
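These two hunks are the JDBC duplicate guard: consecutive identical statements are dropped, except that back-to-back FETCHes are kept but commented out. A standalone sketch of that flow, reusing the Log sketch above, with assumed stand-ins for fetch_pattern and is_duplicate (their real definitions live elsewhere in SimpleReplay):

import re

fetch_pattern = re.compile(r"\bfetch\b", re.IGNORECASE)  # assumed shape

def is_duplicate(prev_text, curr_text):
    return prev_text == curr_text  # placeholder for the real comparison

def record_log(logs, filename, log):
    if filename in logs:
        prev_query = logs[filename][-1]
        if not is_duplicate(prev_query.get_text_value(), log.get_text_value()):
            if fetch_pattern.search(prev_query.get_text_value()) and fetch_pattern.search(
                log.get_text_value()
            ):
                # Repeated FETCH: keep it, but neutered behind a SQL comment
                log.clear_and_set_text(f"--{log.get_text_value()}")
            logs[filename].append(log)
    else:
        logs[filename] = [log]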
20 changes: 10 additions & 10 deletions src/SimpleReplay/extract/extractor/extractor.py
@@ -200,33 +200,33 @@ def get_sql_connections_replacements(self, last_connections, log_items):
                 )
                 continue
 
-            query.clear_and_set_text(remove_line_comments(query.text.getvalue()).strip())
+            query.clear_and_set_text(remove_line_comments(query.get_text_value()).strip())
 
-            if "copy " in query.text.getvalue().lower() and "from 's3:" in query.text.getvalue().lower():
+            if "copy " in query.get_text_value().lower() and "from 's3:" in query.get_text_value().lower():
                 bucket = re.search(
-                    r"from 's3:\/\/[^']*", query.text.getvalue(), re.IGNORECASE
+                    r"from 's3:\/\/[^']*", query.get_text_value(), re.IGNORECASE
                 ).group()[6:]
                 replacements.add(bucket)
                 query.clear_and_set_text(re.sub(
                     r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'",
                     f" IAM_ROLE ''",
-                    query.text.getvalue(),
+                    query.get_text_value(),
                     flags=re.IGNORECASE,
                 ))
-            if "unload" in query.text.getvalue().lower() and "to 's3:" in query.text.getvalue().lower():
+            if "unload" in query.get_text_value().lower() and "to 's3:" in query.get_text_value().lower():
                 query.clear_and_set_text(re.sub(
                     r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'",
                     f" IAM_ROLE ''",
-                    query.text.getvalue(),
+                    query.get_text_value(),
                     flags=re.IGNORECASE,
                 ))
 
-            query.clear_and_set_text(f"{query.text.getvalue().strip()}")
-            if not len(query.text.getvalue()) == 0:
-                if not query.text.getvalue().endswith(";"):
+            query.clear_and_set_text(f"{query.get_text_value().strip()}")
+            if not len(query.get_text_value()) == 0:
+                if not query.get_text_value().endswith(";"):
                     query.append_text(";")
 
-            query_info["text"] = query.text.getvalue()
+            query_info["text"] = query.get_text_value()
             sql_json["transactions"][query.xid]["queries"].append(query_info)
 
             if not hash((query.database_name, query.username, query.pid)) in last_connections:
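A worked example of the two regexes in this hunk, using a made-up COPY statement: the first recovers the S3 path that gets recorded as a replacement, the second blanks out the account-specific IAM role ARN so extracted SQL carries no credentials.

import re

copy_sql = (
    "copy schema.tbl from 's3://my-bucket/prefix/part' "
    "IAM_ROLE 'arn:aws:iam::123456789012:role/MyRedshiftRole' csv;"
)

# "from 's3://..." is matched, then "from '" (6 characters) is sliced off
bucket = re.search(r"from 's3:\/\/[^']*", copy_sql, re.IGNORECASE).group()[6:]
print(bucket)  # s3://my-bucket/prefix/part

# The role ARN is account-specific, so it is replaced with an empty IAM_ROLE
scrubbed = re.sub(
    r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'",
    " IAM_ROLE ''",
    copy_sql,
    flags=re.IGNORECASE,
)
print(scrubbed)  # copy schema.tbl from 's3://my-bucket/prefix/part'  IAM_ROLE '' csv;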
10 changes: 5 additions & 5 deletions src/SimpleReplay/log_validation.py
@@ -44,18 +44,18 @@ def is_valid_log(log, start_time, end_time):
     if end_time and log.record_time > end_time:
         return False
 
-    if any(word in log.text.getvalue() for word in problem_keywords):
+    if any(word in log.get_text_value() for word in problem_keywords):
         return False
 
-    if any(word in log.text.getvalue() for word in potential_problem_keywords) and not any(word in log.text.getvalue() for word in not_problem_keywords):
+    if any(word in log.get_text_value() for word in potential_problem_keywords) and not any(word in log.get_text_value() for word in not_problem_keywords):
         return False
 
     # filter internal statement rewrites with parameter markers
-    if re.search('\$\d',log.text.getvalue()):
+    if re.search('\$\d',log.get_text_value()):
         # remove \$\d in string literals ( select '$1' ) or comment blocks ( */ $1 */ )
-        text_without_valid_parameter_markers = re.sub("""'.*\\$\\d.*'|\\/\\*.*\\$\\d.*\\*\\/""",'',log.text.getvalue(),flags=re.DOTALL)
+        text_without_valid_parameter_markers = re.sub("""'.*\\$\\d.*'|\\/\\*.*\\$\\d.*\\*\\/""",'',log.get_text_value(),flags=re.DOTALL)
         # remove \$\d in single line quotes ( -- $1 )
-        if '--' in log.text.getvalue():
+        if '--' in log.get_text_value():
             text_without_valid_parameter_markers = re.sub('^\s*--.*\$\d','',text_without_valid_parameter_markers)
         # if there are still parameter markers remaining after removing them from valid cases, the query text is invalid
         if re.search('\$\d',text_without_valid_parameter_markers):
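Condensed into a standalone helper (the function name is mine; the regexes and control flow are lifted from the hunk), the check flags bare $1-style markers, which indicate internal statement rewrites, while tolerating markers inside string literals or comment blocks:

import re

def has_internal_parameter_markers(text):
    if not re.search(r"\$\d", text):
        return False
    # Drop markers inside string literals ( select '$1' ) or /* ... */ blocks
    stripped = re.sub(r"'.*\$\d.*'|/\*.*\$\d.*\*/", "", text, flags=re.DOTALL)
    # Drop markers behind single-line comments ( -- $1 )
    if "--" in text:
        stripped = re.sub(r"^\s*--.*\$\d", "", stripped)
    # Anything left is a real parameter marker, so the query text is invalid
    return bool(re.search(r"\$\d", stripped))

print(has_internal_parameter_markers("select '$1' as tag;"))           # False
print(has_internal_parameter_markers("select * from t where c = $1"))  # True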
