SimpleReplay: Change the string append logic to use StringIO for faster performance #688

Open · wants to merge 1 commit into master
src/SimpleReplay/audit_logs_parsing.py (22 changes: 17 additions & 5 deletions)
@@ -3,7 +3,7 @@
 
 This module parses various auditlogs
 """
-
+from io import StringIO
 logger = None
 
 
@@ -16,7 +16,19 @@ def __init__(self):
         self.database_name = ""
         self.pid = ""
         self.xid = ""
-        self.text = ""
+        self.text = StringIO()
+
+    def clear_and_set_text(self, new_value):
+        # Creating a new StringIO instance is faster than truncating and seeking the existing one
+        self.text.close()
+        self.text = StringIO()
+        self.text.write(new_value)
+
+    def append_text(self, value):
+        self.text.write(value)
+
+    def get_text_value(self):
+        return self.text.getvalue()
 
     def get_filename(self):
         base_name = (
@@ -44,7 +56,7 @@ def __str__(self):
             self.database_name,
             self.pid,
             self.xid,
-            self.text,
+            self.get_text_value(),
         )
     )
 
@@ -58,11 +70,11 @@ def __eq__(self, other):
             and self.database_name == other.database_name
             and self.pid == other.pid
             and self.xid == other.xid
-            and self.text == other.text
+            and self.get_text_value() == other.get_text_value()
         )
 
     def __hash__(self):
-        return hash((str(self.pid), str(self.xid), self.text.strip("\n")))
+        return hash((str(self.pid), str(self.xid), self.get_text_value().strip("\n")))
 
 
 class ConnectionLog:
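
Note on the performance claim in the title: `text += chunk` can rebuild the accumulated string on every append, so gathering n chunks costs up to O(n²) copying, whereas `StringIO.write` appends into a growable buffer and `getvalue()` materializes the string once. A minimal micro-benchmark sketch (hypothetical, not part of this PR; CPython sometimes optimizes `+=` in place, so actual gains depend on the workload):

```python
# Hypothetical micro-benchmark: repeated str concatenation vs. StringIO.
import timeit
from io import StringIO

CHUNKS = [f"line {i}\n" for i in range(10_000)]

def concat_str():
    text = ""
    for chunk in CHUNKS:
        text += chunk  # may copy the accumulated string on each append
    return text

def concat_stringio():
    buf = StringIO()
    for chunk in CHUNKS:
        buf.write(chunk)  # appends into an internal buffer
    return buf.getvalue()

print("str +=   :", timeit.timeit(concat_str, number=100))
print("StringIO :", timeit.timeit(concat_stringio, number=100))
```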
src/SimpleReplay/extract/extractor/extract_parser.py (20 changes: 10 additions & 10 deletions)
@@ -64,11 +64,11 @@ def _parse_user_activity_log(file, logs, databases, start_time, end_time):
             if filename in logs:
                 # Check if duplicate. This happens with JDBC connections.
                 prev_query = logs[filename][-1]
-                if not is_duplicate(prev_query.text, user_activity_log.text):
+                if not is_duplicate(prev_query.get_text_value(), user_activity_log.get_text_value()):
                     if fetch_pattern.search(
-                        prev_query.text
-                    ) and fetch_pattern.search(user_activity_log.text):
-                        user_activity_log.text = f"--{user_activity_log.text}"
+                        prev_query.get_text_value()
+                    ) and fetch_pattern.search(user_activity_log.get_text_value()):
+                        user_activity_log.clear_and_set_text(f"--{user_activity_log.get_text_value()}")
                     logs[filename].append(user_activity_log)
             else:
                 logs[filename].append(user_activity_log)
@@ -87,9 +87,9 @@ def _parse_user_activity_log(file, logs, databases, start_time, end_time):
             user_activity_log.database_name = query_information[3][3:]
             user_activity_log.pid = query_information[5][4:]
             user_activity_log.xid = query_information[7][4:]
-            user_activity_log.text = line_split[1]
+            user_activity_log.clear_and_set_text(line_split[1])
         else:
-            user_activity_log.text += line
+            user_activity_log.append_text(line)
 
 
 def _parse_start_node_log(file, logs, databases, start_time, end_time):
@@ -107,7 +107,7 @@ def _parse_start_node_log(file, logs, databases, start_time, end_time):
             if filename in logs:
                 # Check if duplicate. This happens with JDBC connections.
                 prev_query = logs[filename][-1]
-                if not is_duplicate(prev_query.text, start_node_log.text):
+                if not is_duplicate(prev_query.get_text_value(), start_node_log.get_text_value()):
                     logs[filename].append(start_node_log)
             else:
                 logs[filename] = [start_node_log]
@@ -132,14 +132,14 @@ def _parse_start_node_log(file, logs, databases, start_time, end_time):
             start_node_log.username = query_information[4][3:].split(":")[0]
             start_node_log.pid = query_information[5][4:]
             start_node_log.xid = query_information[7][4:]
-            start_node_log.text = line_split[1].strip()
+            start_node_log.clear_and_set_text(line_split[1].strip())
         else:
-            start_node_log.text += line
+            start_node_log.append_text(line)
 
 
 def _parse_connection_log(file, connections, last_connections, start_time, end_time):
     for line in file.readlines():
 
         line = line.decode("utf-8")
 
         connection_information = line.split("|")
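
For reviewers, the accumulate-then-read flow introduced above, as a self-contained sketch. The `Log` class here is a hypothetical stand-in that mirrors the helpers added in audit_logs_parsing.py; the parser calls `clear_and_set_text` on a record's header line and `append_text` on continuation lines:

```python
# Sketch of the accumulate-then-read pattern the parser now uses.
from io import StringIO

class Log:
    def __init__(self):
        self.text = StringIO()

    def clear_and_set_text(self, new_value):
        self.text.close()       # release the old buffer
        self.text = StringIO()  # fresh buffer instead of truncate + seek
        self.text.write(new_value)

    def append_text(self, value):
        self.text.write(value)

    def get_text_value(self):
        return self.text.getvalue()

log = Log()
log.clear_and_set_text("SELECT *\n")  # header line starts a new record
log.append_text("FROM my_table;\n")   # continuation lines are appended
assert log.get_text_value() == "SELECT *\nFROM my_table;\n"
```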
src/SimpleReplay/extract/extractor/extractor.py (30 changes: 15 additions & 15 deletions)
@@ -200,33 +200,33 @@ def get_sql_connections_replacements(self, last_connections, log_items):
                 )
                 continue
 
-            query.text = remove_line_comments(query.text).strip()
+            query.clear_and_set_text(remove_line_comments(query.get_text_value()).strip())
 
-            if "copy " in query.text.lower() and "from 's3:" in query.text.lower():
+            if "copy " in query.get_text_value().lower() and "from 's3:" in query.get_text_value().lower():
                 bucket = re.search(
-                    r"from 's3:\/\/[^']*", query.text, re.IGNORECASE
+                    r"from 's3:\/\/[^']*", query.get_text_value(), re.IGNORECASE
                 ).group()[6:]
                 replacements.add(bucket)
-                query.text = re.sub(
+                query.clear_and_set_text(re.sub(
                     r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'",
                     f" IAM_ROLE ''",
-                    query.text,
+                    query.get_text_value(),
                     flags=re.IGNORECASE,
-                )
-            if "unload" in query.text.lower() and "to 's3:" in query.text.lower():
-                query.text = re.sub(
+                ))
+            if "unload" in query.get_text_value().lower() and "to 's3:" in query.get_text_value().lower():
+                query.clear_and_set_text(re.sub(
                     r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'",
                     f" IAM_ROLE ''",
-                    query.text,
+                    query.get_text_value(),
                     flags=re.IGNORECASE,
-                )
+                ))
 
-            query.text = f"{query.text.strip()}"
-            if not len(query.text) == 0:
-                if not query.text.endswith(";"):
-                    query.text += ";"
+            query.clear_and_set_text(f"{query.get_text_value().strip()}")
+            if not len(query.get_text_value()) == 0:
+                if not query.get_text_value().endswith(";"):
+                    query.append_text(";")
 
-            query_info["text"] = query.text
+            query_info["text"] = query.get_text_value()
             sql_json["transactions"][query.xid]["queries"].append(query_info)
 
             if not hash((query.database_name, query.username, query.pid)) in last_connections:
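
As a sanity check on the scrubbing logic above, a standalone sketch (hypothetical values, not part of this PR) showing the IAM_ROLE pattern applied to a COPY statement:

```python
# Standalone check of the IAM_ROLE scrubbing pattern used above,
# applied to a hypothetical COPY statement.
import re

sql = "copy t from 's3://my-bucket/data' IAM_ROLE 'arn:aws:iam::123456789012:role/MyRole'"
scrubbed = re.sub(
    r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'",
    " IAM_ROLE ''",
    sql,
    flags=re.IGNORECASE,
)
print(scrubbed)  # copy t from 's3://my-bucket/data'  IAM_ROLE ''
```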
src/SimpleReplay/log_validation.py (10 changes: 5 additions & 5 deletions)
@@ -44,18 +44,18 @@ def is_valid_log(log, start_time, end_time):
     if end_time and log.record_time > end_time:
         return False
 
-    if any(word in log.text for word in problem_keywords):
+    if any(word in log.get_text_value() for word in problem_keywords):
         return False
 
-    if any(word in log.text for word in potential_problem_keywords) and not any(word in log.text for word in not_problem_keywords):
+    if any(word in log.get_text_value() for word in potential_problem_keywords) and not any(word in log.get_text_value() for word in not_problem_keywords):
         return False
 
     # filter internal statement rewrites with parameter markers
-    if re.search('\$\d',log.text):
+    if re.search('\$\d',log.get_text_value()):
         # remove \$\d in string literals ( select '$1' ) or comment blocks ( */ $1 */ )
-        text_without_valid_parameter_markers = re.sub("""'.*\\$\\d.*'|\\/\\*.*\\$\\d.*\\*\\/""",'',log.text,flags=re.DOTALL)
+        text_without_valid_parameter_markers = re.sub("""'.*\\$\\d.*'|\\/\\*.*\\$\\d.*\\*\\/""",'',log.get_text_value(),flags=re.DOTALL)
         # remove \$\d in single line quotes ( -- $1 )
-        if '--' in log.text:
+        if '--' in log.get_text_value():
             text_without_valid_parameter_markers = re.sub('^\s*--.*\$\d','',text_without_valid_parameter_markers)
         # if there are still parameter markers remaining after removing them from valid cases, the query text is invalid
         if re.search('\$\d',text_without_valid_parameter_markers):
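
The parameter-marker filtering above, condensed into a standalone sketch (hypothetical helper and sample queries; the real code also strips markers in `--` line comments) showing which texts get rejected as internal statement rewrites:

```python
# Illustration of the parameter-marker filter: $1-style markers outside
# string literals and /* ... */ comment blocks mark a log record as an
# internal statement rewrite, so it is skipped during extraction.
import re

def has_bare_marker(text):
    if not re.search(r'\$\d', text):
        return False
    # drop markers inside string literals or /* ... */ comment blocks
    stripped = re.sub(r"'.*\$\d.*'|/\*.*\$\d.*\*/", '', text, flags=re.DOTALL)
    return bool(re.search(r'\$\d', stripped))

print(has_bare_marker("SELECT * FROM t WHERE id = $1"))  # True  -> rejected
print(has_bare_marker("SELECT '$1' FROM t"))             # False -> kept
```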