diff --git a/src/SimpleReplay/audit_logs_parsing.py b/src/SimpleReplay/audit_logs_parsing.py
index cfbc3f2e..1122b4a0 100644
--- a/src/SimpleReplay/audit_logs_parsing.py
+++ b/src/SimpleReplay/audit_logs_parsing.py
@@ -3,7 +3,7 @@
 This module parses various auditlogs
 """
 
-
+from io import StringIO
 
 logger = None
 
@@ -16,7 +16,19 @@ def __init__(self):
         self.database_name = ""
         self.pid = ""
         self.xid = ""
-        self.text = ""
+        self.text = StringIO()
+
+    def clear_and_set_text(self, new_value):
+        # Better to create a new instance, rather than truncate and seek - because it's faster
+        self.text.close()
+        self.text = StringIO()
+        self.text.write(new_value)
+
+    def append_text(self, value):
+        self.text.write(value)
+
+    def get_text_value(self):
+        return self.text.getvalue()
 
     def get_filename(self):
         base_name = (
@@ -44,7 +56,7 @@ def __str__(self):
                 self.database_name,
                 self.pid,
                 self.xid,
-                self.text,
+                self.get_text_value(),
             )
         )
 
@@ -58,11 +70,11 @@ def __eq__(self, other):
             and self.database_name == other.database_name
             and self.pid == other.pid
             and self.xid == other.xid
-            and self.text == other.text
+            and self.get_text_value() == other.get_text_value()
         )
 
     def __hash__(self):
-        return hash((str(self.pid), str(self.xid), self.text.strip("\n")))
+        return hash((str(self.pid), str(self.xid), self.get_text_value().strip("\n")))
 
 
 class ConnectionLog:
diff --git a/src/SimpleReplay/extract/extractor/extract_parser.py b/src/SimpleReplay/extract/extractor/extract_parser.py
index e245738c..9a95483e 100644
--- a/src/SimpleReplay/extract/extractor/extract_parser.py
+++ b/src/SimpleReplay/extract/extractor/extract_parser.py
@@ -64,11 +64,11 @@ def _parse_user_activity_log(file, logs, databases, start_time, end_time):
         if filename in logs:
             # Check if duplicate. This happens with JDBC connections.
             prev_query = logs[filename][-1]
-            if not is_duplicate(prev_query.text, user_activity_log.text):
+            if not is_duplicate(prev_query.get_text_value(), user_activity_log.get_text_value()):
                 if fetch_pattern.search(
-                    prev_query.text
-                ) and fetch_pattern.search(user_activity_log.text):
-                    user_activity_log.text = f"--{user_activity_log.text}"
+                    prev_query.get_text_value()
+                ) and fetch_pattern.search(user_activity_log.get_text_value()):
+                    user_activity_log.clear_and_set_text(f"--{user_activity_log.get_text_value()}")
                     logs[filename].append(user_activity_log)
                 else:
                     logs[filename].append(user_activity_log)
@@ -87,9 +87,9 @@ def _parse_user_activity_log(file, logs, databases, start_time, end_time):
             user_activity_log.database_name = query_information[3][3:]
             user_activity_log.pid = query_information[5][4:]
             user_activity_log.xid = query_information[7][4:]
-            user_activity_log.text = line_split[1]
+            user_activity_log.clear_and_set_text(line_split[1])
         else:
-            user_activity_log.text += line
+            user_activity_log.append_text(line)
 
 
 def _parse_start_node_log(file, logs, databases, start_time, end_time):
@@ -107,7 +107,7 @@
         if filename in logs:
             # Check if duplicate. This happens with JDBC connections.
             prev_query = logs[filename][-1]
-            if not is_duplicate(prev_query.text, start_node_log.text):
+            if not is_duplicate(prev_query.get_text_value(), start_node_log.get_text_value()):
                 logs[filename].append(start_node_log)
         else:
             logs[filename] = [start_node_log]
@@ -132,14 +132,14 @@
             start_node_log.username = query_information[4][3:].split(":")[0]
             start_node_log.pid = query_information[5][4:]
             start_node_log.xid = query_information[7][4:]
-            start_node_log.text = line_split[1].strip()
+            start_node_log.clear_and_set_text(line_split[1].strip())
         else:
-            start_node_log.text += line
+            start_node_log.append_text(line)
 
 
 def _parse_connection_log(file, connections, last_connections, start_time, end_time):
     for line in file.readlines():
-
+        line = line.decode("utf-8")
 
         connection_information = line.split("|")
diff --git a/src/SimpleReplay/extract/extractor/extractor.py b/src/SimpleReplay/extract/extractor/extractor.py
index ed112342..1fcc2823 100755
--- a/src/SimpleReplay/extract/extractor/extractor.py
+++ b/src/SimpleReplay/extract/extractor/extractor.py
@@ -200,33 +200,33 @@ def get_sql_connections_replacements(self, last_connections, log_items):
                 )
                 continue
 
-            query.text = remove_line_comments(query.text).strip()
+            query.clear_and_set_text(remove_line_comments(query.get_text_value()).strip())
 
-            if "copy " in query.text.lower() and "from 's3:" in query.text.lower():
+            if "copy " in query.get_text_value().lower() and "from 's3:" in query.get_text_value().lower():
                 bucket = re.search(
-                    r"from 's3:\/\/[^']*", query.text, re.IGNORECASE
+                    r"from 's3:\/\/[^']*", query.get_text_value(), re.IGNORECASE
                 ).group()[6:]
                 replacements.add(bucket)
-                query.text = re.sub(
+                query.clear_and_set_text(re.sub(
                     r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'",
                     f" IAM_ROLE ''",
-                    query.text,
+                    query.get_text_value(),
                     flags=re.IGNORECASE,
-                )
-            if "unload" in query.text.lower() and "to 's3:" in query.text.lower():
-                query.text = re.sub(
+                ))
+            if "unload" in query.get_text_value().lower() and "to 's3:" in query.get_text_value().lower():
+                query.clear_and_set_text(re.sub(
                     r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'",
                     f" IAM_ROLE ''",
-                    query.text,
+                    query.get_text_value(),
                     flags=re.IGNORECASE,
-                )
+                ))
 
-            query.text = f"{query.text.strip()}"
-            if not len(query.text) == 0:
-                if not query.text.endswith(";"):
-                    query.text += ";"
+            query.clear_and_set_text(f"{query.get_text_value().strip()}")
+            if not len(query.get_text_value()) == 0:
+                if not query.get_text_value().endswith(";"):
+                    query.append_text(";")
 
-            query_info["text"] = query.text
+            query_info["text"] = query.get_text_value()
             sql_json["transactions"][query.xid]["queries"].append(query_info)
 
             if not hash((query.database_name, query.username, query.pid)) in last_connections:
diff --git a/src/SimpleReplay/log_validation.py b/src/SimpleReplay/log_validation.py
index c3cff040..4a10f873 100644
--- a/src/SimpleReplay/log_validation.py
+++ b/src/SimpleReplay/log_validation.py
@@ -44,18 +44,18 @@ def is_valid_log(log, start_time, end_time):
     if end_time and log.record_time > end_time:
         return False
 
-    if any(word in log.text for word in problem_keywords):
+    if any(word in log.get_text_value() for word in problem_keywords):
         return False
 
-    if any(word in log.text for word in potential_problem_keywords) and not any(word in log.text for word in not_problem_keywords):
+    if any(word in log.get_text_value() for word in potential_problem_keywords) and not any(word in log.get_text_value() for word in not_problem_keywords):
         return False
 
     # filter internal statement rewrites with parameter markers
-    if re.search('\$\d',log.text):
+    if re.search('\$\d',log.get_text_value()):
         # remove \$\d in string literals ( select '$1' ) or comment blocks ( */ $1 */ )
-        text_without_valid_parameter_markers = re.sub("""'.*\\$\\d.*'|\\/\\*.*\\$\\d.*\\*\\/""",'',log.text,flags=re.DOTALL)
+        text_without_valid_parameter_markers = re.sub("""'.*\\$\\d.*'|\\/\\*.*\\$\\d.*\\*\\/""",'',log.get_text_value(),flags=re.DOTALL)
         # remove \$\d in single line quotes ( -- $1 )
-        if '--' in log.text:
+        if '--' in log.get_text_value():
             text_without_valid_parameter_markers = re.sub('^\s*--.*\$\d','',text_without_valid_parameter_markers)
         # if there are still parameter markers remaining after removing them from valid cases, the query text is invalid
         if re.search('\$\d',text_without_valid_parameter_markers):
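Note on the text-handling API introduced above: the Log class now accumulates query text in an io.StringIO buffer instead of concatenating onto a str, so multi-line statements are appended with write() rather than rebuilding the string on every line. Below is a minimal, self-contained sketch of that pattern; only the text methods are reproduced, the other Log fields and the parser wiring are omitted, and the usage under __main__ is illustrative rather than taken from the repository.

from io import StringIO


class Log:
    """Sketch of the StringIO-backed text accumulator from audit_logs_parsing.py."""

    def __init__(self):
        self.text = StringIO()

    def clear_and_set_text(self, new_value):
        # Swapping in a fresh buffer is simpler than truncate()/seek(0); the
        # patch comment reports it is also faster for this workload.
        self.text.close()
        self.text = StringIO()
        self.text.write(new_value)

    def append_text(self, value):
        # write() appends to the in-memory buffer without copying the text
        # accumulated so far, unlike `self.text += value` on a plain str.
        self.text.write(value)

    def get_text_value(self):
        return self.text.getvalue()


if __name__ == "__main__":
    # Typical parser flow: the first line of a statement sets the text,
    # continuation lines are appended, and the full text is read once at the end.
    log = Log()
    log.clear_and_set_text("select *\n")
    log.append_text("from stl_query\n")
    log.append_text("limit 10;")
    print(log.get_text_value())

A consequence of this change is that callers can no longer read or assign query.text directly, which is why every .text access in extract_parser.py, extractor.py, and log_validation.py is rewritten to go through get_text_value(), clear_and_set_text(), or append_text().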
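For the log_validation.py hunk, the parameter-marker filter itself is unchanged; only the text access moves to get_text_value(). A rough standalone sketch of how that filter behaves follows; the function name and the sample strings are illustrative and not part of the module, and the keyword checks and is_valid_log wiring are left out.

import re


def has_unhandled_parameter_markers(query_text):
    # No $N markers at all: nothing to filter.
    if not re.search(r"\$\d", query_text):
        return False
    # Drop markers that sit inside '...' string literals or /* ... */ comment blocks.
    stripped = re.sub(r"'.*\$\d.*'|/\*.*\$\d.*\*/", "", query_text, flags=re.DOTALL)
    # Drop markers that only appear in a leading line comment such as `-- $1`.
    if "--" in query_text:
        stripped = re.sub(r"^\s*--.*\$\d", "", stripped)
    # Anything left is treated as an internal statement rewrite, i.e. an invalid log.
    return bool(re.search(r"\$\d", stripped))


print(has_unhandled_parameter_markers("select '$1' from t"))             # False
print(has_unhandled_parameter_markers("select * from t where id = $1"))  # True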