From 5062043d8ef34af118e99373e6fdd1948497bca8 Mon Sep 17 00:00:00 2001 From: "jaya.anathram@ft.com" Date: Wed, 31 May 2023 22:57:02 +0300 Subject: [PATCH] Change the string append logic to use StringIO --- src/SimpleReplay/audit_logs_parsing.py | 9 ++++++--- .../extract/extractor/extract_parser.py | 10 +++++----- .../extract/extractor/extractor.py | 20 +++++++++---------- src/SimpleReplay/log_validation.py | 10 +++++----- 4 files changed, 26 insertions(+), 23 deletions(-) diff --git a/src/SimpleReplay/audit_logs_parsing.py b/src/SimpleReplay/audit_logs_parsing.py index a1ec9957..1122b4a0 100644 --- a/src/SimpleReplay/audit_logs_parsing.py +++ b/src/SimpleReplay/audit_logs_parsing.py @@ -27,6 +27,9 @@ def clear_and_set_text(self, new_value): def append_text(self, value): self.text.write(value) + def get_text_value(self): + return self.text.getvalue() + def get_filename(self): base_name = ( self.database_name @@ -53,7 +56,7 @@ def __str__(self): self.database_name, self.pid, self.xid, - self.text.getvalue(), + self.get_text_value(), ) ) @@ -67,11 +70,11 @@ def __eq__(self, other): and self.database_name == other.database_name and self.pid == other.pid and self.xid == other.xid - and self.text.getvalue() == other.text.getvalue() + and self.get_text_value() == other.get_text_value() ) def __hash__(self): - return hash((str(self.pid), str(self.xid), self.text.getvalue().strip("\n"))) + return hash((str(self.pid), str(self.xid), self.get_text_value().strip("\n"))) class ConnectionLog: diff --git a/src/SimpleReplay/extract/extractor/extract_parser.py b/src/SimpleReplay/extract/extractor/extract_parser.py index bbaf9d60..9a95483e 100644 --- a/src/SimpleReplay/extract/extractor/extract_parser.py +++ b/src/SimpleReplay/extract/extractor/extract_parser.py @@ -64,11 +64,11 @@ def _parse_user_activity_log(file, logs, databases, start_time, end_time): if filename in logs: # Check if duplicate. This happens with JDBC connections. prev_query = logs[filename][-1] - if not is_duplicate(prev_query.text.getvalue(), user_activity_log.text.getvalue()): + if not is_duplicate(prev_query.get_text_value(), user_activity_log.get_text_value()): if fetch_pattern.search( - prev_query.text.getvalue() - ) and fetch_pattern.search(user_activity_log.text.getvalue()): - user_activity_log.clear_and_set_text(f"--{user_activity_log.text.getvalue()}") + prev_query.get_text_value() + ) and fetch_pattern.search(user_activity_log.get_text_value()): + user_activity_log.clear_and_set_text(f"--{user_activity_log.get_text_value()}") logs[filename].append(user_activity_log) else: logs[filename].append(user_activity_log) @@ -107,7 +107,7 @@ def _parse_start_node_log(file, logs, databases, start_time, end_time): if filename in logs: # Check if duplicate. This happens with JDBC connections. prev_query = logs[filename][-1] - if not is_duplicate(prev_query.text.getvalue(), start_node_log.text.getvalue()): + if not is_duplicate(prev_query.get_text_value(), start_node_log.get_text_value()): logs[filename].append(start_node_log) else: logs[filename] = [start_node_log] diff --git a/src/SimpleReplay/extract/extractor/extractor.py b/src/SimpleReplay/extract/extractor/extractor.py index 8b77805f..1fcc2823 100755 --- a/src/SimpleReplay/extract/extractor/extractor.py +++ b/src/SimpleReplay/extract/extractor/extractor.py @@ -200,33 +200,33 @@ def get_sql_connections_replacements(self, last_connections, log_items): ) continue - query.clear_and_set_text(remove_line_comments(query.text.getvalue()).strip()) + query.clear_and_set_text(remove_line_comments(query.get_text_value()).strip()) - if "copy " in query.text.getvalue().lower() and "from 's3:" in query.text.getvalue().lower(): + if "copy " in query.get_text_value().lower() and "from 's3:" in query.get_text_value().lower(): bucket = re.search( - r"from 's3:\/\/[^']*", query.text.getvalue(), re.IGNORECASE + r"from 's3:\/\/[^']*", query.get_text_value(), re.IGNORECASE ).group()[6:] replacements.add(bucket) query.clear_and_set_text(re.sub( r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'", f" IAM_ROLE ''", - query.text.getvalue(), + query.get_text_value(), flags=re.IGNORECASE, )) - if "unload" in query.text.getvalue().lower() and "to 's3:" in query.text.getvalue().lower(): + if "unload" in query.get_text_value().lower() and "to 's3:" in query.get_text_value().lower(): query.clear_and_set_text(re.sub( r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'", f" IAM_ROLE ''", - query.text.getvalue(), + query.get_text_value(), flags=re.IGNORECASE, )) - query.clear_and_set_text(f"{query.text.getvalue().strip()}") - if not len(query.text.getvalue()) == 0: - if not query.text.getvalue().endswith(";"): + query.clear_and_set_text(f"{query.get_text_value().strip()}") + if not len(query.get_text_value()) == 0: + if not query.get_text_value().endswith(";"): query.append_text(";") - query_info["text"] = query.text.getvalue() + query_info["text"] = query.get_text_value() sql_json["transactions"][query.xid]["queries"].append(query_info) if not hash((query.database_name, query.username, query.pid)) in last_connections: diff --git a/src/SimpleReplay/log_validation.py b/src/SimpleReplay/log_validation.py index 03d24282..4a10f873 100644 --- a/src/SimpleReplay/log_validation.py +++ b/src/SimpleReplay/log_validation.py @@ -44,18 +44,18 @@ def is_valid_log(log, start_time, end_time): if end_time and log.record_time > end_time: return False - if any(word in log.text.getvalue() for word in problem_keywords): + if any(word in log.get_text_value() for word in problem_keywords): return False - if any(word in log.text.getvalue() for word in potential_problem_keywords) and not any(word in log.text.getvalue() for word in not_problem_keywords): + if any(word in log.get_text_value() for word in potential_problem_keywords) and not any(word in log.get_text_value() for word in not_problem_keywords): return False # filter internal statement rewrites with parameter markers - if re.search('\$\d',log.text.getvalue()): + if re.search('\$\d',log.get_text_value()): # remove \$\d in string literals ( select '$1' ) or comment blocks ( */ $1 */ ) - text_without_valid_parameter_markers = re.sub("""'.*\\$\\d.*'|\\/\\*.*\\$\\d.*\\*\\/""",'',log.text.getvalue(),flags=re.DOTALL) + text_without_valid_parameter_markers = re.sub("""'.*\\$\\d.*'|\\/\\*.*\\$\\d.*\\*\\/""",'',log.get_text_value(),flags=re.DOTALL) # remove \$\d in single line quotes ( -- $1 ) - if '--' in log.text.getvalue(): + if '--' in log.get_text_value(): text_without_valid_parameter_markers = re.sub('^\s*--.*\$\d','',text_without_valid_parameter_markers) # if there are still parameter markers remaining after removing them from valid cases, the query text is invalid if re.search('\$\d',text_without_valid_parameter_markers):