From 7018a2ede3b9bd5c5460d0eb844a4b8730956565 Mon Sep 17 00:00:00 2001 From: Berislav Lopac Date: Tue, 13 Dec 2022 21:10:30 +0000 Subject: [PATCH] improve support for hashing field values --- docs/sensitive.md | 111 ++++++ mkdocs.yml | 3 + tests/logger/processors/test_clean_data.py | 370 ------------------ tests/logger/processors/test_clean_fields.py | 200 ++++++++++ tests/logger/processors/test_clean_values.py | 175 +++++++++ ...{test_hash_data.py => test_hash_fields.py} | 11 +- unclogger/processors/clean_data.py | 56 ++- 7 files changed, 536 insertions(+), 390 deletions(-) create mode 100644 docs/sensitive.md delete mode 100755 tests/logger/processors/test_clean_data.py create mode 100755 tests/logger/processors/test_clean_fields.py create mode 100644 tests/logger/processors/test_clean_values.py rename tests/logger/processors/{test_hash_data.py => test_hash_fields.py} (80%) diff --git a/docs/sensitive.md b/docs/sensitive.md new file mode 100644 index 0000000..77e625f --- /dev/null +++ b/docs/sensitive.md @@ -0,0 +1,111 @@ +# Handling Sensitive Data + +`unclogger` will automatically mask sensitive information such as PII, login credentials and the like. By default, the masked data is replaced by a generic string, which can be configured to use a hashing function instead. + + +## Sensitive Fields + +If the name of any field in the structured log message matches one of the listed sensitive names, the value of that field is (recursively) replaced with a safe value: + + >>> from unclogger import get_logger + >>> logger = get_logger("test logger") + >>> logger.info("clean password", password="blabla", foo={"Email": "test@example.xcom"}) + { + "password": "********", + "foo": {"Email": "********"}, + "event": "clean password", + "logger": "test logger", + "level": "info", + "timestamp": "2022-02-02T10:53:52.245833Z" + } + +A basic list of sensitive field names is included in `unclogger`: + + >>> from unclogger.processors.clean_data import SENSITIVE_FIELDS + >>> SENSITIVE_FIELDS + ['password', 'email', 'email_1', 'firstname', 'lastname', 'currentpassword', 'newpassword', 'tmppassword', 'authentication', 'refresh', 'auth', 'http_refresh', 'http_x_forwarded_authorization', 'http_x_endpoint_api_userinfo', 'http_authorization', 'idtoken', 'oauthidtoken', 'publickey', 'privatekey'] + +!!! Note + + Note that the list is case-insensitive; `unclogger` normalizes all field names to lowercase, so e.g. `email` and `Email` are treated equally. + +This list can be configured with an iterable of custom field names: + + >>> from unclogger import get_logger + >>> logger = get_logger("test logger") + >>> logger.config.sensitive_keys = {"foo", "bar"} + >>> logger.config.sensitive_keys.add("foobar") + >>> payload = {"foo": "1234", "bar": "5678", "fooBar": "9876"} + >>> logger.info("clean sensitive values)", payload=payload) + { + "payload": { + "foo": "********", + "bar": "********", + "fooBar": "********" + }, + "event": "clean sensitive values", + "logger": "test logger", + "level": "info", + "timestamp": + "2022-02-02T11:08:01.260019Z" + } + +### Configurable Replacement Value + +A custom string can be used instead of the default replacement value: + + >>> from unclogger import get_logger + >>> logger = get_logger("test logger") + >>> logger.config.replacement = "blablabla" + >>> logger.info("clean password", password="blabla", foo={"Email": "test@example.xcom"}) + { + "password": "blablabla", + "foo": {"Email": "blablabla"}, + "event": "clean password", + "logger": "test logger", + "level": "info", + "timestamp": "2022-12-13T20:02:38.520599Z" + } + +### Hashing Sensitive Data + +Instead of a replacement string, `config.replacement` can define a Python callable: + + >>> from unclogger import get_logger + >>> logger = get_logger("test logger") + >>> logger.config.replacement = hashlib.sha256 + >>> logger.info("clean password", password="blabla", foo={"Email": "test@example.xcom"}) + { + "password": "ccadd99b16cd3d200c22d6db45d8b6630ef3d936767127347ec8a76ab992c2ea", + "foo": {"Email": "77b6427267ac7638fd0cd49f2f64fd619ade2ab21d4a3891234293671c1d14b3"}, + "event": "clean password", + "logger": "test logger", + "level": "info", + "timestamp": "2022-12-13T20:06:37.542212Z" + } + +This can be used so that the data can still be identified (e.g. an email address will always have the same has value) without sending the actual data to the log. + +!!! Warning + + This functionality is intended to work out of the box with the functions present in the `hashlib` standard library. Any custom hash function has to accept a bytestring value and return a hash object as described in the [documentation](https://docs.python.org/3.10/library/hashlib.html). For typing purposes, `unclogger` provides a `Protocol` class for hash objects: + + ```python + from unclogger.processors.clean_data import HashObjectProtocol + + def custom_hash_function(data: bytes) -> HashObjectProtocol: + ... + ``` + +## Sensitive Text Values + >>> from unclogger import get_logger + >>> logger = get_logger("test logger") + >>> logger.info("'Authentication': 1234") + { + "event": "#### WARNING: Log message replaced due to sensitive keyword: 'Authentication':", + "logger": "test logger", + "level": "info", + "timestamp": "2022-02-02T11:22:21.997204Z" + } + +*[PII]: Personally Identifiable Information \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 5f67ecc..e8236ac 100755 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,6 +10,8 @@ plugins: - search - mkapi markdown_extensions: + - abbr + - attr_list - pymdownx.highlight: anchor_linenums: true - pymdownx.inlinehilite @@ -21,4 +23,5 @@ markdown_extensions: guess_lang: no nav: - index.md + - sensitive.md - reference.md diff --git a/tests/logger/processors/test_clean_data.py b/tests/logger/processors/test_clean_data.py deleted file mode 100755 index f1de217..0000000 --- a/tests/logger/processors/test_clean_data.py +++ /dev/null @@ -1,370 +0,0 @@ -import json -from decimal import Decimal - -from unclogger import get_logger -from unclogger.processors.clean_data import clean_sensitive_data, REPLACEMENT_TEXT - - -def test_example_log_message_from_auth_is_cleaned_correctly(caplog): - caplog.set_level("INFO") - - message = """{ - "textPayload": "Response Headers: {'Authentication': 'Bearer sensitive_info_auth_token', - 'Refresh': 'sensitive_info_refresh_token'}", - }""" - logger_name = "test logger" - - logger = get_logger(logger_name) - logger.info(message) - - record = json.loads(caplog.messages[0]) - assert len(record) == 4 - assert ( - record["event"] - == "#### WARNING: Log message replaced due to sensitive keyword: 'Authentication':" - ) - assert record["level"] == "info" - assert record["logger"] == logger_name - - -def test_message_with_Bearer_in_text_is_cleaned_correctly(caplog): - caplog.set_level("INFO") - - message = """some random text 'Bearer sensitive_info_auth_token'""" - logger_name = "test logger" - - logger = get_logger(logger_name) - logger.info(message) - - record = json.loads(caplog.messages[0]) - assert len(record) == 4 - assert ( - record["event"] == "#### WARNING: Log message replaced due to sensitive keyword: 'Bearer " - ) - assert record["level"] == "info" - assert record["logger"] == logger_name - - -def test_message_with_Refresh_in_text_is_cleaned_correctly(caplog): - caplog.set_level("INFO") - - message = """{'Refresh': 'sensitive_info_refresh_token'}""" - logger_name = "test logger" - - logger = get_logger(logger_name) - logger.info(message) - - record = json.loads(caplog.messages[0]) - assert len(record) == 4 - assert ( - record["event"] == "#### WARNING: Log message replaced due to sensitive keyword: 'Refresh':" - ) - assert record["level"] == "info" - assert record["logger"] == logger_name - - -def test_message_with_password_and_email_is_cleaned_correctly(caplog): - caplog.set_level("INFO") - - message = "Test with request password" - logger_name = "test logger" - - logger = get_logger(logger_name) - request = { - "email": "user@domain.xyz", - "password": "this is a sensitive value", - } - - logger.info(message, request=request) - - record = json.loads(caplog.messages[0]) - assert len(record) == 5 - assert record["event"] == "Test with request password" - assert record["level"] == "info" - assert record["logger"] == logger_name - assert record["request"]["password"] == REPLACEMENT_TEXT - assert record["request"]["email"] == REPLACEMENT_TEXT - - -def test_message_with_password_and_email_in_event_is_cleaned_correctly(caplog): - caplog.set_level("INFO") - - logger_name = "test logger" - message = { - "email": "user@domain.xyz", - "password": "this is a sensitive value", - } - - logger = get_logger(logger_name) - logger.info(message) - - record = json.loads(caplog.messages[0]) - assert len(record) == 4 - assert record["level"] == "info" - assert record["logger"] == logger_name - assert record["event"]["password"] == REPLACEMENT_TEXT - assert record["event"]["email"] == REPLACEMENT_TEXT - - -def test_complex_log_object_is_cleaned_correctly(): - event_dict = { - "context": {"email": "sensitive@email.address"}, - "context_id": "cxt_fe76c000000000000000000_0000000000000", - "parameters": { - "context": {"callerUserId": None}, - "body": {"email": "sensitive@email.address", "password": "sensitive_password"}, - }, - "response": { - "status": "OK", - "auth": "eyJra...", - "refresh": "eyJjd...", - "email_1": "user@domain", - }, - "Authentication": "eyJra...", - "request_type": "http", - "email": "sensitive@email.address", - "password": "sensitive_value", - "calling_user_type": "service", - "event": "Request", - "logger": "service_function.common.cloud_function", - "level": "info", - "timestamp": "2021-01-01T00:00:00.000000Z", - "as_list": [ - { - "firstName": "Sensitive First Name", - "lastName": "Sensitive Last Name", - "callerUserId": "000000000000", - }, - { - "firstName": "Sensitive First Name", - "lastName": "Sensitive Last Name", - "callerUserId": "000000000000", - }, - ], - "another_list": ["1", "2"], - } - - cleaned_event_dict = clean_sensitive_data(None, "", event_dict) - - assert cleaned_event_dict is not None - assert cleaned_event_dict["email"] is REPLACEMENT_TEXT - assert cleaned_event_dict["password"] is REPLACEMENT_TEXT - assert cleaned_event_dict["parameters"]["body"]["email"] is REPLACEMENT_TEXT - assert cleaned_event_dict["parameters"]["body"]["password"] is REPLACEMENT_TEXT - assert cleaned_event_dict["as_list"][0]["firstName"] is REPLACEMENT_TEXT - assert cleaned_event_dict["as_list"][0]["lastName"] is REPLACEMENT_TEXT - assert cleaned_event_dict["as_list"][1]["firstName"] is REPLACEMENT_TEXT - assert cleaned_event_dict["as_list"][1]["lastName"] is REPLACEMENT_TEXT - assert cleaned_event_dict["context"]["email"] is REPLACEMENT_TEXT - assert cleaned_event_dict["response"]["auth"] is REPLACEMENT_TEXT - assert cleaned_event_dict["response"]["refresh"] is REPLACEMENT_TEXT - assert cleaned_event_dict["response"]["email_1"] is REPLACEMENT_TEXT - assert cleaned_event_dict["Authentication"] is REPLACEMENT_TEXT - - assert cleaned_event_dict != event_dict - - -def test_log_object_in_event_is_cleaned_correctly(): - event_dict = { - "context_id": "cxt_fe76c000000000000000000_0000000000000", - "request_type": "http", - "event": "some random text 'Bearer sensitive_info_auth_token'", - } - - cleaned_event_dict = clean_sensitive_data(None, "", event_dict) - - assert cleaned_event_dict["event"] == ( - "#### WARNING: Log message replaced due to sensitive keyword: 'Bearer " - ) - assert event_dict != cleaned_event_dict - - -def test_textual_value_is_cleaned_correctly(): - event_dict = { - "context_id": "cxt_fe76c000000000000000000_0000000000000", - "str_field": "Bearer sensitive_info_auth_token", - "event": "some random text", - } - - cleaned_event_dict = clean_sensitive_data(None, "", event_dict) - - assert cleaned_event_dict["str_field"] == ( - "#### WARNING: Log message replaced due to sensitive keyword: Bearer " - ) - - -def test_list_value_is_cleaned_correctly(): - event_dict = { - "context_id": "cxt_fe76c000000000000000000_0000000000000", - "list_field": ("http", "Bearer sensitive_info_auth_token", "blabla"), - "event": "some random text", - } - - cleaned_event_dict = clean_sensitive_data(None, "", event_dict) - - assert cleaned_event_dict["list_field"] == [ - "http", - "#### WARNING: Log message replaced due to sensitive keyword: Bearer ", - "blabla", - ] - - -def test_dict_value_is_cleaned_correctly(): - event_dict = { - "context_id": "cxt_fe76c000000000000000000_0000000000000", - "dict_field": {"a": "http", "b": "Bearer sensitive_info_auth_token", "c": "blabla"}, - "event": "some random text", - } - - cleaned_event_dict = clean_sensitive_data(None, "", event_dict) - - assert cleaned_event_dict["dict_field"] == { - "a": "http", - "b": "#### WARNING: Log message replaced due to sensitive keyword: Bearer ", - "c": "blabla", - } - - -def test_tuple_value_is_cleaned_correctly(): - event_dict = { - "context_id": "cxt_fe76c000000000000000000_0000000000000", - "tuple_field": ("http", "Bearer sensitive_info_auth_token", "blabla"), - "event": "some random text", - } - - cleaned_event_dict = clean_sensitive_data(None, "", event_dict) - - assert cleaned_event_dict["tuple_field"] == [ - "http", - "#### WARNING: Log message replaced due to sensitive keyword: Bearer ", - "blabla", - ] - - -def test_set_value_is_cleaned_correctly(): - event_dict = { - "context_id": "cxt_fe76c000000000000000000_0000000000000", - "set_field": {"http", "Bearer sensitive_info_auth_token", "blabla"}, - "event": "some random text", - } - - cleaned_event_dict = clean_sensitive_data(None, "", event_dict) - - assert sorted(cleaned_event_dict["set_field"]) == [ - "#### WARNING: Log message replaced due to sensitive keyword: Bearer ", - "blabla", - "http", - ] - - -def test_numeric_values_are_cleaned_correctly(): - event_dict = { - "context_id": "cxt_fe76c000000000000000000_0000000000000", - "int_field": 123, - "float_field": 123.45, - "event": "some random text", - } - - cleaned_event_dict = clean_sensitive_data(None, "", event_dict) - - assert cleaned_event_dict["int_field"] == 123 - assert cleaned_event_dict["float_field"] == 123.45 - - -def test_decimal_value_is_cleaned_correctly(): - event_dict = { - "context_id": "cxt_fe76c000000000000000000_0000000000000", - "decimal_field": Decimal("123.45"), - "event": "some random text", - } - - cleaned_event_dict = clean_sensitive_data(None, "", event_dict) - - assert cleaned_event_dict["decimal_field"] == 123.45 - - -def test_generic_object_with_custom_text_is_cleaned_correctly(): - class Foo: - def __str__(self): - return "Bearer sensitive_info_auth_token" - - event_dict = { - "context_id": "cxt_fe76c000000000000000000_0000000000000", - "object_field": Foo(), - "event": "some random text", - } - - cleaned_event_dict = clean_sensitive_data(None, "", event_dict) - - assert isinstance(cleaned_event_dict["object_field"], str) - assert cleaned_event_dict["object_field"] == ( - "#### WARNING: Log message replaced due to sensitive keyword: Bearer " - ) - - -def test_generic_object_is_cleaned_correctly(): - class Foo: - foo = "Bearer sensitive_info_auth_token" - - event_dict = { - "context_id": "cxt_fe76c000000000000000000_0000000000000", - "object_field": Foo(), - "event": "some random text", - } - - cleaned_event_dict = clean_sensitive_data(None, "", event_dict) - - assert isinstance(cleaned_event_dict["object_field"], str) - assert "Bearer " not in cleaned_event_dict["object_field"] - assert ".Foo object at " in cleaned_event_dict["object_field"] - - -def test_adding_custom_sensitive_keywords_on_runtime_cleans_output_correctly(caplog): - logger = get_logger("test logger") - logger.config.sensitive_keys = {"illegalkey", "foo"} - - logger.info( - "clean sensitive keys with dict", - context_id="cxt_fe76c000000000000000000_0000000000000", - illegalKey="blabla", - ) - - record = json.loads(caplog.messages[0]) - assert record["illegalKey"] == REPLACEMENT_TEXT - assert record["context_id"] == "cxt_fe76c000000000000000000_0000000000000" - - -def test_setting_custom_sensitive_keywords_on_runtime_cleans_output_correctly(caplog): - logger = get_logger("test logger") - logger.config.sensitive_keys = {"illegalkey", "foo"} - - logger.info( - "clean sensitive keys with dict", - context_id="cxt_fe76c000000000000000000_0000000000000", - illegalKey="blabla", - ) - - record = json.loads(caplog.messages[0]) - assert record["illegalKey"] == REPLACEMENT_TEXT - assert record["context_id"] == "cxt_fe76c000000000000000000_0000000000000" - - -def test_changing_custom_sensitive_keywords_on_runtime_cleans_output_correctly(caplog): - logger = get_logger("test logger") - payload = {"foo": "1234", "bar": "5678", "fooBar": "9876"} - - logger.config.sensitive_keys = {"FOO", "bar"} - logger.info("clean sensitive keys with dict", payload=payload) - record = json.loads(caplog.messages[0]) - - assert record["payload"]["foo"] == REPLACEMENT_TEXT - assert record["payload"]["bar"] == REPLACEMENT_TEXT - assert record["payload"]["fooBar"] != REPLACEMENT_TEXT - - logger.config.sensitive_keys.add("fooBar") - logger.info("clean sensitive keys with dict", payload=payload) - fully_cleaned_record = json.loads(caplog.messages[1]) - - assert fully_cleaned_record["payload"]["foo"] == REPLACEMENT_TEXT - assert fully_cleaned_record["payload"]["bar"] == REPLACEMENT_TEXT - assert fully_cleaned_record["payload"]["fooBar"] == REPLACEMENT_TEXT diff --git a/tests/logger/processors/test_clean_fields.py b/tests/logger/processors/test_clean_fields.py new file mode 100755 index 0000000..e5acecf --- /dev/null +++ b/tests/logger/processors/test_clean_fields.py @@ -0,0 +1,200 @@ +import json +from decimal import Decimal + +from unclogger import get_logger +from unclogger.processors.clean_data import clean_sensitive_data, DEFAULT_REPLACEMENT + + +def test_message_with_password_and_email_is_cleaned_correctly(caplog): + caplog.set_level("INFO") + + message = "Test with request password" + logger_name = "test logger" + + logger = get_logger(logger_name) + request = { + "Email": "user@domain.xyz", + "password": "this is a sensitive value", + } + + logger.info(message, request=request) + + record = json.loads(caplog.messages[0]) + assert len(record) == 5 + assert record["event"] == "Test with request password" + assert record["level"] == "info" + assert record["logger"] == logger_name + assert record["request"]["password"] == DEFAULT_REPLACEMENT + assert record["request"]["Email"] == DEFAULT_REPLACEMENT + + +def test_message_with_password_and_email_in_event_is_cleaned_correctly(caplog): + caplog.set_level("INFO") + + logger_name = "test logger" + message = { + "email": "user@domain.xyz", + "password": "this is a sensitive value", + } + + logger = get_logger(logger_name) + logger.info(message) + + record = json.loads(caplog.messages[0]) + assert len(record) == 4 + assert record["level"] == "info" + assert record["logger"] == logger_name + assert record["event"]["password"] == DEFAULT_REPLACEMENT + assert record["event"]["email"] == DEFAULT_REPLACEMENT + + +def test_complex_log_object_is_cleaned_correctly(): + event_dict = { + "context": {"email": "sensitive@email.address"}, + "context_id": "cxt_fe76c000000000000000000_0000000000000", + "parameters": { + "context": {"callerUserId": None}, + "body": {"email": "sensitive@email.address", "password": "sensitive_password"}, + }, + "response": { + "status": "OK", + "auth": "eyJra...", + "refresh": "eyJjd...", + "email_1": "user@domain", + }, + "Authentication": "eyJra...", + "request_type": "http", + "email": "sensitive@email.address", + "password": "sensitive_value", + "calling_user_type": "service", + "event": "Request", + "logger": "service_function.common.cloud_function", + "level": "info", + "timestamp": "2021-01-01T00:00:00.000000Z", + "as_list": [ + { + "firstName": "Sensitive First Name", + "lastName": "Sensitive Last Name", + "callerUserId": "000000000000", + }, + { + "firstName": "Sensitive First Name", + "lastName": "Sensitive Last Name", + "callerUserId": "000000000000", + }, + ], + "another_list": ["1", "2"], + } + + cleaned_event_dict = clean_sensitive_data(None, "", event_dict) + + assert cleaned_event_dict is not None + assert cleaned_event_dict["email"] is DEFAULT_REPLACEMENT + assert cleaned_event_dict["password"] is DEFAULT_REPLACEMENT + assert cleaned_event_dict["parameters"]["body"]["email"] is DEFAULT_REPLACEMENT + assert cleaned_event_dict["parameters"]["body"]["password"] is DEFAULT_REPLACEMENT + assert cleaned_event_dict["as_list"][0]["firstName"] is DEFAULT_REPLACEMENT + assert cleaned_event_dict["as_list"][0]["lastName"] is DEFAULT_REPLACEMENT + assert cleaned_event_dict["as_list"][1]["firstName"] is DEFAULT_REPLACEMENT + assert cleaned_event_dict["as_list"][1]["lastName"] is DEFAULT_REPLACEMENT + assert cleaned_event_dict["context"]["email"] is DEFAULT_REPLACEMENT + assert cleaned_event_dict["response"]["auth"] is DEFAULT_REPLACEMENT + assert cleaned_event_dict["response"]["refresh"] is DEFAULT_REPLACEMENT + assert cleaned_event_dict["response"]["email_1"] is DEFAULT_REPLACEMENT + assert cleaned_event_dict["Authentication"] is DEFAULT_REPLACEMENT + + assert cleaned_event_dict != event_dict + + +def test_numeric_values_are_cleaned_correctly(): + event_dict = { + "context_id": "cxt_fe76c000000000000000000_0000000000000", + "int_field": 123, + "float_field": 123.45, + "event": "some random text", + } + + cleaned_event_dict = clean_sensitive_data(None, "", event_dict) + + assert cleaned_event_dict["int_field"] == 123 + assert cleaned_event_dict["float_field"] == 123.45 + + +def test_decimal_value_is_cleaned_correctly(): + event_dict = { + "context_id": "cxt_fe76c000000000000000000_0000000000000", + "decimal_field": Decimal("123.45"), + "event": "some random text", + } + + cleaned_event_dict = clean_sensitive_data(None, "", event_dict) + + assert cleaned_event_dict["decimal_field"] == 123.45 + + +def test_generic_object_is_cleaned_correctly(): + class Foo: + foo = "Bearer sensitive_info_auth_token" + + event_dict = { + "context_id": "cxt_fe76c000000000000000000_0000000000000", + "object_field": Foo(), + "event": "some random text", + } + + cleaned_event_dict = clean_sensitive_data(None, "", event_dict) + + assert isinstance(cleaned_event_dict["object_field"], str) + assert "Bearer " not in cleaned_event_dict["object_field"] + assert ".Foo object at " in cleaned_event_dict["object_field"] + + +def test_adding_custom_sensitive_keywords_on_runtime_cleans_output_correctly(caplog): + logger = get_logger("test logger") + logger.config.sensitive_keys = {"illegalkey", "foo"} + + logger.info( + "clean sensitive keys with dict", + context_id="cxt_fe76c000000000000000000_0000000000000", + illegalKey="blabla", + ) + + record = json.loads(caplog.messages[0]) + assert record["illegalKey"] == DEFAULT_REPLACEMENT + assert record["context_id"] == "cxt_fe76c000000000000000000_0000000000000" + + +def test_setting_custom_sensitive_keywords_on_runtime_cleans_output_correctly(caplog): + logger = get_logger("test logger") + logger.config.sensitive_keys = {"illegalkey", "foo"} + + logger.info( + "clean sensitive keys with dict", + context_id="cxt_fe76c000000000000000000_0000000000000", + illegalKey="blabla", + ) + + record = json.loads(caplog.messages[0]) + assert record["illegalKey"] == DEFAULT_REPLACEMENT + assert record["context_id"] == "cxt_fe76c000000000000000000_0000000000000" + + +def test_changing_custom_sensitive_keywords_on_runtime_cleans_output_correctly(caplog): + logger = get_logger("test logger") + payload = {"foo": "1234", "bar": "5678", "fooBar": "9876"} + + logger.config.sensitive_keys = {"FOO", "bar"} + logger.info("clean sensitive keys with dict", payload=payload) + record = json.loads(caplog.messages[0]) + + assert record["payload"]["foo"] == DEFAULT_REPLACEMENT + assert record["payload"]["bar"] == DEFAULT_REPLACEMENT + assert record["payload"]["fooBar"] != DEFAULT_REPLACEMENT + + logger.config.sensitive_keys.add("fooBar") + logger.info("clean sensitive keys with dict", payload=payload) + fully_cleaned_record = json.loads(caplog.messages[1]) + + assert fully_cleaned_record["payload"]["foo"] == DEFAULT_REPLACEMENT + assert fully_cleaned_record["payload"]["bar"] == DEFAULT_REPLACEMENT + assert fully_cleaned_record["payload"]["fooBar"] == DEFAULT_REPLACEMENT diff --git a/tests/logger/processors/test_clean_values.py b/tests/logger/processors/test_clean_values.py new file mode 100644 index 0000000..ed4f801 --- /dev/null +++ b/tests/logger/processors/test_clean_values.py @@ -0,0 +1,175 @@ +import json +from decimal import Decimal + +from unclogger import get_logger +from unclogger.processors.clean_data import clean_sensitive_data, DEFAULT_REPLACEMENT + + +def test_example_log_message_from_auth_is_cleaned_correctly(caplog): + caplog.set_level("INFO") + + message = """{ + "textPayload": "Response Headers: {'Authentication': 'Bearer sensitive_info_auth_token', + 'Refresh': 'sensitive_info_refresh_token'}", + }""" + logger_name = "test logger" + + logger = get_logger(logger_name) + logger.info(message) + + record = json.loads(caplog.messages[0]) + assert len(record) == 4 + assert ( + record["event"] + == "#### WARNING: Log message replaced due to sensitive keyword: 'Authentication':" + ) + assert record["level"] == "info" + assert record["logger"] == logger_name + + +def test_message_with_Bearer_in_text_is_cleaned_correctly(caplog): + caplog.set_level("INFO") + + message = """some random text 'Bearer sensitive_info_auth_token'""" + logger_name = "test logger" + + logger = get_logger(logger_name) + logger.info(message) + + record = json.loads(caplog.messages[0]) + assert len(record) == 4 + assert ( + record["event"] == "#### WARNING: Log message replaced due to sensitive keyword: 'Bearer " + ) + assert record["level"] == "info" + assert record["logger"] == logger_name + + +def test_message_with_Refresh_in_text_is_cleaned_correctly(caplog): + caplog.set_level("INFO") + + message = """{'Refresh': 'sensitive_info_refresh_token'}""" + logger_name = "test logger" + + logger = get_logger(logger_name) + logger.info(message) + + record = json.loads(caplog.messages[0]) + assert len(record) == 4 + assert ( + record["event"] == "#### WARNING: Log message replaced due to sensitive keyword: 'Refresh':" + ) + assert record["level"] == "info" + assert record["logger"] == logger_name + + +def test_log_object_in_event_is_cleaned_correctly(): + event_dict = { + "context_id": "cxt_fe76c000000000000000000_0000000000000", + "request_type": "http", + "event": "some random text 'Bearer sensitive_info_auth_token'", + } + + cleaned_event_dict = clean_sensitive_data(None, "", event_dict) + + assert cleaned_event_dict["event"] == ( + "#### WARNING: Log message replaced due to sensitive keyword: 'Bearer " + ) + assert event_dict != cleaned_event_dict + + +def test_textual_value_is_cleaned_correctly(): + event_dict = { + "context_id": "cxt_fe76c000000000000000000_0000000000000", + "str_field": "Bearer sensitive_info_auth_token", + "event": "some random text", + } + + cleaned_event_dict = clean_sensitive_data(None, "", event_dict) + + assert cleaned_event_dict["str_field"] == ( + "#### WARNING: Log message replaced due to sensitive keyword: Bearer " + ) + + +def test_list_value_is_cleaned_correctly(): + event_dict = { + "context_id": "cxt_fe76c000000000000000000_0000000000000", + "list_field": ("http", "Bearer sensitive_info_auth_token", "blabla"), + "event": "some random text", + } + + cleaned_event_dict = clean_sensitive_data(None, "", event_dict) + + assert cleaned_event_dict["list_field"] == [ + "http", + "#### WARNING: Log message replaced due to sensitive keyword: Bearer ", + "blabla", + ] + + +def test_dict_value_is_cleaned_correctly(): + event_dict = { + "context_id": "cxt_fe76c000000000000000000_0000000000000", + "dict_field": {"a": "http", "b": "Bearer sensitive_info_auth_token", "c": "blabla"}, + "event": "some random text", + } + + cleaned_event_dict = clean_sensitive_data(None, "", event_dict) + + assert cleaned_event_dict["dict_field"] == { + "a": "http", + "b": "#### WARNING: Log message replaced due to sensitive keyword: Bearer ", + "c": "blabla", + } + + +def test_tuple_value_is_cleaned_correctly(): + event_dict = { + "context_id": "cxt_fe76c000000000000000000_0000000000000", + "tuple_field": ("http", "Bearer sensitive_info_auth_token", "blabla"), + "event": "some random text", + } + + cleaned_event_dict = clean_sensitive_data(None, "", event_dict) + + assert cleaned_event_dict["tuple_field"] == [ + "http", + "#### WARNING: Log message replaced due to sensitive keyword: Bearer ", + "blabla", + ] + + +def test_set_value_is_cleaned_correctly(): + event_dict = { + "context_id": "cxt_fe76c000000000000000000_0000000000000", + "set_field": {"http", "Bearer sensitive_info_auth_token", "blabla"}, + "event": "some random text", + } + + cleaned_event_dict = clean_sensitive_data(None, "", event_dict) + + assert sorted(cleaned_event_dict["set_field"]) == [ + "#### WARNING: Log message replaced due to sensitive keyword: Bearer ", + "blabla", + "http", + ] + + +def test_generic_object_with_custom_text_is_cleaned_correctly(): + class Foo: + def __str__(self): + return "Bearer sensitive_info_auth_token" + + event_dict = { + "context_id": "cxt_fe76c000000000000000000_0000000000000", + "object_field": Foo(), + "event": "some random text", + } + + cleaned_event_dict = clean_sensitive_data(None, "", event_dict) + + assert isinstance(cleaned_event_dict["object_field"], str) + assert cleaned_event_dict["object_field"] == ( + "#### WARNING: Log message replaced due to sensitive keyword: Bearer " + ) diff --git a/tests/logger/processors/test_hash_data.py b/tests/logger/processors/test_hash_fields.py similarity index 80% rename from tests/logger/processors/test_hash_data.py rename to tests/logger/processors/test_hash_fields.py index 07ca4ec..cc5c61c 100644 --- a/tests/logger/processors/test_hash_data.py +++ b/tests/logger/processors/test_hash_fields.py @@ -4,31 +4,32 @@ import pytest from unclogger import get_logger -from unclogger.processors import clean_data def _expected(value, hash_algo): hash_function = getattr(hashlib, hash_algo) hashed = hash_function(value.encode()) - return hashed.hexdigest(256) if hash_algo.startswith("shake_") else hashed.hexdigest() + try: + return hashed.hexdigest() + except TypeError: + return hashed.hexdigest(256) @pytest.mark.parametrize("hash_algo", hashlib.algorithms_guaranteed) def test_message_with_password_and_email_is_hashed_correctly(caplog, monkeypatch, hash_algo): - monkeypatch.setattr(clean_data, "REPLACEMENT_TEXT", hash_algo) - monkeypatch.setattr(clean_data, "REPLACEMENT", getattr(hashlib, hash_algo)) caplog.set_level("INFO") message = "Test with request password" logger_name = "test logger" logger = get_logger(logger_name) + logger.config.replacement = getattr(hashlib, hash_algo) + request = { "email": "user@domain.xyz", "password": "this is a sensitive value", "safe_value": "this is not sensitive", } - logger.info(message, request=request) record = json.loads(caplog.messages[0]) diff --git a/unclogger/processors/clean_data.py b/unclogger/processors/clean_data.py index 9900d9b..79fb0e6 100644 --- a/unclogger/processors/clean_data.py +++ b/unclogger/processors/clean_data.py @@ -1,18 +1,40 @@ """Custom processor for cleaning sensitive data.""" # pylint: disable=unused-argument - -import hashlib +from __future__ import annotations import json import os from collections import ChainMap from decimal import Decimal from functools import singledispatch -from typing import Any +from typing import Any, Callable, Iterable, Union, Protocol, Optional from structlog.types import EventDict, WrappedLogger -# TODO: enable setting field names ad keywords in external configuration files -SENSITIVE_FIELD_NAMES = [ + +class HashObjectProtocol(Protocol): + """ Protocol for objects returned by the `hashlib` library functions. """ + + @property + def block_size(self) -> int: ... + + @property + def digest_size(self) -> int: ... + + @property + def name(self) -> str: ... + + def copy(self) -> HashObjectProtocol: ... + + def digest(self, length: Optional[int] = None) -> str: ... + + def hexdigest(self, length: Optional[int] = None) -> str: ... + + def update(self, obj: bytes, /) -> None: ... + + +ReplacementType = Union[str, Callable[[bytes], HashObjectProtocol]] + +SENSITIVE_FIELDS = [ "password", "email", "email_1", @@ -43,8 +65,7 @@ "Bearer ", ] -REPLACEMENT_TEXT = os.getenv("UNCLOGGER_REPLACEMENT", default="********") -REPLACEMENT = getattr(hashlib, REPLACEMENT_TEXT, REPLACEMENT_TEXT) +DEFAULT_REPLACEMENT: str = os.getenv("UNCLOGGER_REPLACEMENT", default="********") REPLACEMENT_MESSAGE = "#### WARNING: Log message replaced due to sensitive keyword: " @@ -106,22 +127,27 @@ def _clean_up_sequence(data, logger): @_clean_up.register def _clean_up_dict(data: dict, logger): - sensitive_keys = set() + sensitive_keys: Iterable[str] = () if hasattr(logger, "config") and hasattr(logger.config, "sensitive_keys"): sensitive_keys = logger.config.sensitive_keys - sensitive_fields = {field.lower() for field in [*SENSITIVE_FIELD_NAMES, *sensitive_keys]} + sensitive_fields = {field.lower() for field in [*SENSITIVE_FIELDS, *sensitive_keys]} cleaned_data = ChainMap({}, data) + replacement: ReplacementType = DEFAULT_REPLACEMENT + if hasattr(logger, "config") and hasattr(logger.config, "replacement"): + replacement = logger.config.replacement for key, value in cleaned_data.items(): cleaned_data[key] = ( - _replace(value) if key.lower() in sensitive_fields else _clean_up(value, logger) + _replace(value, replacement) + if key.lower() in sensitive_fields + else _clean_up(value, logger) ) return dict(cleaned_data) -def _replace(value: Any): - if callable(REPLACEMENT): - replaced = REPLACEMENT(str(value).encode()) - if REPLACEMENT_TEXT.startswith("shake_"): +def _replace(value: Any, replacement: ReplacementType): + if callable(replacement): + replaced = replacement(str(value).encode()) + if "shake_" in replacement.__name__: return replaced.hexdigest(256) return replaced.hexdigest() - return REPLACEMENT + return replacement