From 0e774b08a0114f083e52641c73ef96b24ffa1826 Mon Sep 17 00:00:00 2001 From: Mikayla Thompson Date: Fri, 18 Aug 2023 16:56:46 -0600 Subject: [PATCH 1/4] Add script to transform tuples to human readable format Signed-off-by: Mikayla Thompson --- .../main/docker/migrationConsole/Dockerfile | 4 +- .../migrationConsole/humanReadableLogs.py | 104 ++++++++++++++++++ 2 files changed, 106 insertions(+), 2 deletions(-) create mode 100755 TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py diff --git a/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/Dockerfile b/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/Dockerfile index aa2ef84e8..fea968cff 100644 --- a/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/Dockerfile +++ b/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/Dockerfile @@ -3,8 +3,8 @@ FROM ubuntu:focal ENV DEBIAN_FRONTEND noninteractive RUN apt-get update && \ - apt-get install -y --no-install-recommends python3.9 python3-pip python3-dev gcc libc-dev git curl && \ - pip3 install opensearch-benchmark + apt-get install -y --no-install-recommends python3.11.4 python3-pip python3-dev gcc libc-dev git curl && \ + pip3 install urllib3==1.25.11 opensearch-benchmark==1.1.0 COPY runTestBenchmarks.sh /root/ RUN chmod ugo+x /root/runTestBenchmarks.sh diff --git a/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py b/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py new file mode 100755 index 000000000..3afade882 --- /dev/null +++ b/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 + +# Find log files in /shared_replayer_output +# User can select one or more files (down the line, specific time ranges?) +# Load file, read each line as json. +# Extract `message`, do the same un-base64 and possibly un-gzip as the comparator + +# ./humanReadableLogs /shared_replayer_output/output_tuple.log /shared_replayer_output/readable_tuples.json + +import argparse +import pathlib +import json +import base64 +import gzip +from typing import Optional + +LOG_JSON_TUPLE_FIELD = "message" +BASE64_ENCODED_TUPLE_PATHS = ["request.body", "primaryResponse.body", "shadowResponse.body"] +# TODO: I'm not positive about the capitalization of the Content-Encoding and Content-Type headers. +# This version worked on my test cases, but not guaranteed to work in all cases. 
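+# Note: HTTP header names are case-insensitive, so a case-insensitive lookup
+# would likely be more robust than the fixed-case paths hard-coded below.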
+CONTENT_ENCODING_PATH = { + BASE64_ENCODED_TUPLE_PATHS[0]: "request.content-encoding", + BASE64_ENCODED_TUPLE_PATHS[1]: "primaryResponse.content-encoding", + BASE64_ENCODED_TUPLE_PATHS[2]: "shadowResponse.content-encoding" +} +CONTENT_TYPE_PATH = { + BASE64_ENCODED_TUPLE_PATHS[0]: "request.content-type", + BASE64_ENCODED_TUPLE_PATHS[1]: "primaryResponse.content-type", + BASE64_ENCODED_TUPLE_PATHS[2]: "shadowResponse.content-type" +} +CONTENT_TYPE_JSON = "application/json" +CONTENT_ENCODING_GZIP = "gzip" +URI_PATH = "request.Request-URI" +BULK_URI_PATH = "_bulk" + + +def get_element(element: str, dict_: dict) -> Optional[any]: + keys = element.split('.') + rv = dict_ + for key in keys: + try: + rv = rv[key] + except KeyError: + return None + return rv + + +def set_element(element: str, dict_: dict, value: any) -> None: + keys = element.split('.') + rv = dict_ + for key in keys[:-1]: + rv = rv[key] + rv[keys[-1]] = value + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("infile", type=pathlib.Path, help="Path to input logged tuple file.") + parser.add_argument("--outfile", type=pathlib.Path, help="Path for output human readable tuple file.") + return parser.parse_args() + + +def parse_body_value(raw_value: str, content_encoding: Optional[str], content_type: Optional[str], is_bulk: bool): + b64decoded = base64.b64decode(raw_value) + is_gzipped = content_encoding is not None and content_encoding == CONTENT_ENCODING_GZIP + is_json = content_type is not None and CONTENT_TYPE_JSON in content_type + if is_gzipped: + unzipped = gzip.decompress(b64decoded) + else: + unzipped = b64decoded + decoded = unzipped.decode("utf-8") + if is_json and len(decoded) > 0: + if is_bulk: + return [json.loads(line) for line in decoded.splitlines()] + return json.loads(decoded) + return decoded + + +def parse_tuple(line): + item = json.loads(line) + message = item[LOG_JSON_TUPLE_FIELD] + tuple = json.loads(message) + for path in BASE64_ENCODED_TUPLE_PATHS: + base64value = get_element(path, tuple) + content_encoding = get_element(CONTENT_ENCODING_PATH[path], tuple) + content_type = get_element(CONTENT_TYPE_PATH[path], tuple) + is_bulk_path = BULK_URI_PATH in get_element(URI_PATH, tuple) + value = parse_body_value(base64value, content_encoding, content_type, is_bulk_path) + set_element(path, tuple, value) + return tuple + + +if __name__ == "__main__": + args = parse_args() + print(args.infile) + if args.outfile: + outfile = args.outfile + else: + outfile = args.infile.parent / f"readable-{args.infile.name}" + print(f"Will output to {outfile}") + with open(args.infile, 'r') as in_f: + with open(outfile, 'w') as out_f: + for line in in_f: + print(parse_tuple(line), file=out_f) From dea732c6ffd3023aec28e21788534756a4c920bf Mon Sep 17 00:00:00 2001 From: Mikayla Thompson Date: Fri, 18 Aug 2023 17:11:43 -0600 Subject: [PATCH 2/4] Add documentation and some cleanup/todos Signed-off-by: Mikayla Thompson --- TrafficCapture/README.md | 37 +++++++++++++++++++ .../migrationConsole/humanReadableLogs.py | 17 +++------ 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/TrafficCapture/README.md b/TrafficCapture/README.md index 596d81a8e..054902caf 100644 --- a/TrafficCapture/README.md +++ b/TrafficCapture/README.md @@ -90,8 +90,45 @@ will send requests to `capture-proxy-domain.com`, using the auth combo `admin`/` Support for Sigv4 signing and other auth options may be a future option. 
+#### Understanding Data from the Replayer
+
+The Migration Console can be used to access and help interpret the data from the replayer.
+
+The data generated from the replayer is stored on an Elastic File System volume shared between the Replayer and Migration Console.
+It is mounted to the Migration Console at the path `/shared_replayer_output`. The Replayer generates files named `output_tuples.log`.
+These files are rolled over as they hit 10 Mb to a series of `output_tuples-%d{yyyy-MM-dd-HH:mm}.log` files.
+
+The data in these files is in the format of JSON lines, each of which is a log message containing a specific request-response-response tuple.
+The body of the messages is sometimes gzipped, which makes it difficult to represent as text in JSON. Therefore, the body field of all requests
+and responses is base64 encoded before it is logged. This makes the files stable, but not human-readable.
+
+We have provided a utility script that can parse these files and output them to a human-readable format: the bodies are
+base64 decoded, un-gzipped if applicable, and parsed as JSON if applicable. They're then saved back to JSON format on disk.
+
+To use this utility from the Migration Console:
+```sh
+$ ./humanReadableLogs.py --help
+usage: humanReadableLogs.py [-h] [--outfile OUTFILE] infile
+
+positional arguments:
+  infile             Path to input logged tuple file.
+
+options:
+  -h, --help         show this help message and exit
+  --outfile OUTFILE  Path for output human readable tuple file.
+
+# By default, the output file is the same path as the input file, but the file name is preface with `readable-`.
+$ ./humanReadableLogs.py /shared_replayer_output/tuples.log
+Input file: /shared_replayer_output/tuples.log; Output file: /shared_replayer_output/readable-tuples.log
+
+# A specific output file can also be specified.
+$ ./humanReadableLogs.py /shared_replayer_output/tuples.log --outfile local-tuples.log
+Input file: /shared_replayer_output/tuples.log; Output file: local-tuples.log
+```
+
 ### Capture Kafka Offloader
 
 The Capture Kafka Offloader will act as a Kafka Producer for offloading captured traffic logs to the configured Kafka cluster.
 
 Learn more about its functionality and setup here: [Capture Kafka Offloader](captureKafkaOffloader/README.md)
+

diff --git a/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py b/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py
index 3afade882..1c35bef79 100755
--- a/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py
+++ b/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py
@@ -1,17 +1,10 @@
 #!/usr/bin/env python3
 
-# Find log files in /shared_replayer_output
-# User can select one or more files (down the line, specific time ranges?)
-# Load file, read each line as json.
-# Extract `message`, do the same un-base64 and possibly un-gzip as the comparator - -# ./humanReadableLogs /shared_replayer_output/output_tuple.log /shared_replayer_output/readable_tuples.json - import argparse -import pathlib -import json import base64 import gzip +import json +import pathlib from typing import Optional LOG_JSON_TUPLE_FIELD = "message" @@ -92,13 +85,15 @@ def parse_tuple(line): if __name__ == "__main__": args = parse_args() - print(args.infile) if args.outfile: outfile = args.outfile else: outfile = args.infile.parent / f"readable-{args.infile.name}" - print(f"Will output to {outfile}") + print(f"Input file: {args.infile}; Output file: {outfile}") with open(args.infile, 'r') as in_f: with open(outfile, 'w') as out_f: for line in in_f: print(parse_tuple(line), file=out_f) + +# TODO: add some try/catching +# TODO: add a progress indicator for large files From 928e55cae490ba84f5857b2a6a97f6ea600773bc Mon Sep 17 00:00:00 2001 From: Mikayla Thompson Date: Fri, 18 Aug 2023 23:23:16 -0600 Subject: [PATCH 3/4] Add error handling, progress bar, cleanup Signed-off-by: Mikayla Thompson --- .../main/docker/migrationConsole/Dockerfile | 6 +- .../migrationConsole/humanReadableLogs.py | 91 ++++++++++++++----- 2 files changed, 73 insertions(+), 24 deletions(-) diff --git a/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/Dockerfile b/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/Dockerfile index fea968cff..cf2857208 100644 --- a/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/Dockerfile +++ b/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/Dockerfile @@ -3,11 +3,13 @@ FROM ubuntu:focal ENV DEBIAN_FRONTEND noninteractive RUN apt-get update && \ - apt-get install -y --no-install-recommends python3.11.4 python3-pip python3-dev gcc libc-dev git curl && \ - pip3 install urllib3==1.25.11 opensearch-benchmark==1.1.0 + apt-get install -y --no-install-recommends python3.9 python3-pip python3-dev gcc libc-dev git curl && \ + pip3 install urllib3==1.25.11 opensearch-benchmark==1.1.0 tqdm COPY runTestBenchmarks.sh /root/ +COPY humanReadableLogs.py /root/ RUN chmod ugo+x /root/runTestBenchmarks.sh +RUN chmod ugo+x /root/humanReadableLogs.py WORKDIR /root CMD tail -f /dev/null \ No newline at end of file diff --git a/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py b/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py index 1c35bef79..2c38a318e 100755 --- a/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py +++ b/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py @@ -6,6 +6,12 @@ import json import pathlib from typing import Optional +import logging + +from tqdm import tqdm +from tqdm.contrib.logging import logging_redirect_tqdm + +logger = logging.getLogger(__name__) LOG_JSON_TUPLE_FIELD = "message" BASE64_ENCODED_TUPLE_PATHS = ["request.body", "primaryResponse.body", "shadowResponse.body"] @@ -27,14 +33,21 @@ BULK_URI_PATH = "_bulk" -def get_element(element: str, dict_: dict) -> Optional[any]: +class DictionaryPathException(Exception): + pass + + +def get_element(element: str, dict_: dict, raise_on_error=False) -> Optional[any]: keys = element.split('.') rv = dict_ for key in keys: try: rv = rv[key] except KeyError: - return None + if raise_on_error: + raise DictionaryPathException(f"Key {key} was not present.") + else: + return None return rv @@ -53,33 +66,67 @@ def parse_args(): return parser.parse_args() 
-def parse_body_value(raw_value: str, content_encoding: Optional[str], content_type: Optional[str], is_bulk: bool):
-    b64decoded = base64.b64decode(raw_value)
+def parse_body_value(raw_value: str, content_encoding: Optional[str],
+                     content_type: Optional[str], is_bulk: bool, line_no: int):
+    try:
+        b64decoded = base64.b64decode(raw_value)
+    except Exception as e:
+        logger.error(f"Body value on line {line_no} could not be decoded: {e}. Skipping parsing body value.")
+        return None
     is_gzipped = content_encoding is not None and content_encoding == CONTENT_ENCODING_GZIP
     is_json = content_type is not None and CONTENT_TYPE_JSON in content_type
     if is_gzipped:
-        unzipped = gzip.decompress(b64decoded)
+        try:
+            unzipped = gzip.decompress(b64decoded)
+        except Exception as e:
+            logger.error(f"Body value on line {line_no} should be gzipped but could not be unzipped: {e}. "
+                         "Skipping parsing body value.")
+            return b64decoded
     else:
         unzipped = b64decoded
-    decoded = unzipped.decode("utf-8")
+    try:
+        decoded = unzipped.decode("utf-8")
+    except Exception as e:
+        logger.error(f"Body value on line {line_no} could not be decoded to utf-8: {e}. "
+                     "Skipping parsing body value.")
+        return unzipped
     if is_json and len(decoded) > 0:
         if is_bulk:
-            return [json.loads(line) for line in decoded.splitlines()]
-        return json.loads(decoded)
+            try:
+                return [json.loads(line) for line in decoded.splitlines()]
+            except Exception as e:
+                logger.error(f"Body value on line {line_no} should be a bulk json (list of json lines) but "
+                             f"could not be parsed: {e}. Skipping parsing body value.")
+                return decoded
+        try:
+            return json.loads(decoded)
+        except Exception as e:
+            logger.error(f"Body value on line {line_no} should be a json but could not be parsed: {e}. "
+                         "Skipping parsing body value.")
+            return decoded
     return decoded
 
 
-def parse_tuple(line):
+def parse_tuple(line: str, line_no: int):
     item = json.loads(line)
     message = item[LOG_JSON_TUPLE_FIELD]
     tuple = json.loads(message)
-    for path in BASE64_ENCODED_TUPLE_PATHS:
-        base64value = get_element(path, tuple)
-        content_encoding = get_element(CONTENT_ENCODING_PATH[path], tuple)
-        content_type = get_element(CONTENT_TYPE_PATH[path], tuple)
-        is_bulk_path = BULK_URI_PATH in get_element(URI_PATH, tuple)
-        value = parse_body_value(base64value, content_encoding, content_type, is_bulk_path)
-        set_element(path, tuple, value)
+    try:
+        is_bulk_path = BULK_URI_PATH in get_element(URI_PATH, tuple, raise_on_error=True)
+    except DictionaryPathException as e:
+        logger.error(f"`{URI_PATH}` on line {line_no} could not be loaded: {e}. "
+                     f"Skipping parsing tuple.")
+        return tuple
+    for body_path in BASE64_ENCODED_TUPLE_PATHS:
+        base64value = get_element(body_path, tuple)
+        if base64value is None:
+            # This component has no body element, which is potentially valid.
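+            # (For example, GET requests and 204 No Content responses carry no body.)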
+            continue
+        content_encoding = get_element(CONTENT_ENCODING_PATH[body_path], tuple)
+        content_type = get_element(CONTENT_TYPE_PATH[body_path], tuple)
+        value = parse_body_value(base64value, content_encoding, content_type, is_bulk_path, line_no)
+        if value:
+            set_element(body_path, tuple, value)
     return tuple
 
 
@@ -90,10 +137,10 @@ def parse_tuple(line):
     if args.outfile:
         outfile = args.outfile
     else:
         outfile = args.infile.parent / f"readable-{args.infile.name}"
     print(f"Input file: {args.infile}; Output file: {outfile}")
-    with open(args.infile, 'r') as in_f:
-        with open(outfile, 'w') as out_f:
-            for line in in_f:
-                print(parse_tuple(line), file=out_f)
-
-# TODO: add some try/catching
-# TODO: add a progress indicator for large files
+    logging.basicConfig(level=logging.INFO)
+    with logging_redirect_tqdm():
+        with open(args.infile, 'r') as in_f:
+            with open(outfile, 'w') as out_f:
+                for i, line in tqdm(enumerate(in_f)):
+                    print(parse_tuple(line, i + 1), file=out_f)

From c6fb8da1ea6dd689812bd775e01d4e24f6bd23a8 Mon Sep 17 00:00:00 2001
From: Mikayla Thompson
Date: Sun, 20 Aug 2023 13:58:55 -0600
Subject: [PATCH 4/4] Address review comments

Signed-off-by: Mikayla Thompson
---
 TrafficCapture/README.md                                  | 4 ++--
 .../src/main/docker/migrationConsole/Dockerfile           | 4 ++--
 .../src/main/docker/migrationConsole/humanReadableLogs.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/TrafficCapture/README.md b/TrafficCapture/README.md
index 054902caf..bc0f19cec 100644
--- a/TrafficCapture/README.md
+++ b/TrafficCapture/README.md
@@ -96,7 +96,7 @@ The Migration Console can be used to access and help interpret the data from the
 
 The data generated from the replayer is stored on an Elastic File System volume shared between the Replayer and Migration Console.
 It is mounted to the Migration Console at the path `/shared_replayer_output`. The Replayer generates files named `output_tuples.log`.
-These files are rolled over as they hit 10 Mb to a series of `output_tuples-%d{yyyy-MM-dd-HH:mm}.log` files.
+These files are rolled over as they hit 10 MB to a series of `output_tuples-%d{yyyy-MM-dd-HH:mm}.log` files.
 
 The data in these files is in the format of JSON lines, each of which is a log message containing a specific request-response-response tuple.
 The body of the messages is sometimes gzipped, which makes it difficult to represent as text in JSON. Therefore, the body field of all requests
@@ -117,7 +117,7 @@ options:
   -h, --help         show this help message and exit
   --outfile OUTFILE  Path for output human readable tuple file.
 
-# By default, the output file is the same path as the input file, but the file name is preface with `readable-`.
+# By default, the output file is the same path as the input file, but the file name is prefixed with `readable-`.
$ ./humanReadableLogs.py /shared_replayer_output/tuples.log Input file: /shared_replayer_output/tuples.log; Output file: /shared_replayer_output/readable-tuples.log diff --git a/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/Dockerfile b/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/Dockerfile index cf2857208..fc344b343 100644 --- a/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/Dockerfile +++ b/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/Dockerfile @@ -8,8 +8,8 @@ RUN apt-get update && \ COPY runTestBenchmarks.sh /root/ COPY humanReadableLogs.py /root/ -RUN chmod ugo+x /root/runTestBenchmarks.sh -RUN chmod ugo+x /root/humanReadableLogs.py +RUN chmod ug+x /root/runTestBenchmarks.sh +RUN chmod ug+x /root/humanReadableLogs.py WORKDIR /root CMD tail -f /dev/null \ No newline at end of file diff --git a/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py b/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py index 2c38a318e..badf521c0 100755 --- a/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py +++ b/TrafficCapture/dockerSolution/src/main/docker/migrationConsole/humanReadableLogs.py @@ -107,7 +107,7 @@ def parse_body_value(raw_value: str, content_encoding: Optional[str], return decoded -def parse_tuple(line: str, line_no: int): +def parse_tuple(line: str, line_no: int) -> dict: item = json.loads(line) message = item[LOG_JSON_TUPLE_FIELD] tuple = json.loads(message) @@ -143,4 +143,4 @@ def parse_tuple(line: str, line_no: int): with open(args.infile, 'r') as in_f: with open(outfile, 'w') as out_f: for i, line in tqdm(enumerate(in_f)): - print(parse_tuple(line, i + 1), file=out_f) + print(json.dumps(parse_tuple(line, i + 1)), file=out_f)
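
As a companion to the patches above, the snippet below is a minimal sketch of the decoding pipeline that `parse_body_value` implements: base64-decode, then gunzip when Content-Encoding says gzip, then parse JSON when Content-Type says application/json. It omits the script's logging and error handling, and the `decode_body` helper and its sample payload are illustrative only, not part of the PR.

```python
#!/usr/bin/env python3
# Minimal sketch (not part of the PR) of the body-decoding steps used by
# humanReadableLogs.py. The decode_body name and sample data are fabricated.
import base64
import gzip
import json


def decode_body(raw_value: str, content_encoding=None, content_type=None):
    data = base64.b64decode(raw_value)      # bodies are logged base64-encoded
    if content_encoding == "gzip":
        data = gzip.decompress(data)        # un-gzip if flagged by the headers
    text = data.decode("utf-8")
    if content_type is not None and "application/json" in content_type and text:
        return json.loads(text)             # JSON bodies become Python objects
    return text


# Fabricated example: a gzipped JSON response body, round-tripped through the pipeline.
body = base64.b64encode(gzip.compress(json.dumps({"took": 3, "errors": False}).encode())).decode()
print(decode_body(body, content_encoding="gzip", content_type="application/json"))
# -> {'took': 3, 'errors': False}
```

For `_bulk` request bodies the script instead parses each line of the decoded text separately, since an NDJSON bulk payload is a sequence of JSON documents rather than a single one.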