import re

# Patterns that flag inline encoded payloads. Compiled once at import time
# so the per-file call only pays for the scan itself.
_BASE64_REGEX = re.compile(r"[a-zA-Z0-9+/\n=]{64,}")
_HEX_REGEX = re.compile(r"(?:\b(?:0x|\\x)?[0-9a-fA-F]{2}(?:,|\b\s*)){8,}")
_UNICODE_REGEX = re.compile(r"(?:\\u[0-9a-fA-F]{4}){8,}")
_ENCODED_DATA_REGEXES = (_BASE64_REGEX, _HEX_REGEX, _UNICODE_REGEX)


def contains_encoded_data(file_content, max_match_length=1024, max_fraction=0.5):
    r"""
    Check whether a file contains inline encoded data.

    The content is scanned with three regular expressions:
      - Base64 strings: [a-zA-Z0-9+/\n=]{64,} (at least 64 characters long)
      - Hexadecimal sequences: (?:\b(?:0x|\\x)?[0-9a-fA-F]{2}(?:,|\b\s*)){8,}
        (at least 8 consecutive hex pairs)
      - Unicode strings: (?:\\u[0-9a-fA-F]{4}){8,}
        (at least 8 consecutive \uXXXX escapes)

    The file is flagged when any single match is longer than
    ``max_match_length`` characters, or when the summed length of all matches
    exceeds ``max_fraction`` of the file length. Each pattern is scanned
    independently, so a span matched by more than one pattern contributes to
    the sum once per pattern.

    :param file_content: text of the file; ``None`` or an empty string is
        treated as containing no encoded data
    :param max_match_length: a single match longer than this flags the file
    :param max_fraction: fraction of the file length that the summed match
        lengths must exceed to flag the file
    :return: dictionary ``{"contains_encoded_data": bool}``
    """
    # A null or empty cell cannot hold encoded data; returning early also
    # avoids the TypeError that re raises when asked to scan None.
    if not file_content:
        return {"contains_encoded_data": False}

    total_matched_length = 0
    for regex in _ENCODED_DATA_REGEXES:
        for match in regex.finditer(file_content):
            match_length = len(match.group())
            # One oversized match is decisive, so there is no need to finish
            # the scan or to remember the individual matches.
            if match_length > max_match_length:
                return {"contains_encoded_data": True}
            total_matched_length += match_length

    exceeds_fraction = total_matched_length > max_fraction * len(file_content)
    return {"contains_encoded_data": exceeds_fraction}
content=is_config_or_test_values) table = TransformUtils.add_column(table=table, name="has_no_keywords", content=has_no_keywords_values) table = TransformUtils.add_column(table=table, name="has_few_assignments", content=has_few_assignments_values)