Merge pull request #274 from mikaylathompson/human-readable-tuples

Add script to transform tuples to human readable format

sumobrian authored Aug 21, 2023
2 parents dd17251 + c6fb8da, commit e83ffe8
Showing 3 changed files with 187 additions and 2 deletions.
TrafficCapture/README.md (37 additions & 0 deletions)

@@ -90,8 +90,45 @@ will send requests to `capture-proxy-domain.com`, using the auth combo `admin`/`admin`

Support for Sigv4 signing and other auth options may be a future option.

#### Understanding Data from the Replayer

The Migration Console can be used to access and help interpret the data from the Replayer.

The data generated by the Replayer is stored on an Elastic File System volume shared between the Replayer and the Migration Console.
It is mounted into the Migration Console at the path `/shared_replayer_output`. The Replayer writes to a file named `output_tuples.log`,
which is rolled over upon reaching 10 MB to a series of `output_tuples-%d{yyyy-MM-dd-HH:mm}.log` files.

The data in these files is in JSON Lines format; each line is a log message containing a tuple of one request and its two responses (primary and shadow).
The bodies of the requests and responses are sometimes gzipped, which makes them difficult to represent as text in JSON. Therefore, the body field of every request
and response is base64 encoded before it is logged. This makes the files stable, but not human-readable.
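
For orientation, each line has roughly the following shape (abbreviated, with hypothetical values; the field paths match the ones the parsing script below reads):

```python
import json

# A sketch of one logged line: the tuple is itself a JSON string stored under the
# "message" field, and each component's "body" value is base64 encoded.
line = json.dumps({
    "message": json.dumps({
        "request": {"Request-URI": "/my-index/_bulk", "body": "<base64>"},
        "primaryResponse": {"body": "<base64>"},
        "shadowResponse": {"body": "<base64>"},
    })
})
```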

We have provided a utility script that can parse these files and output them in a human-readable format: the bodies are
base64 decoded, un-gzipped if applicable, and parsed as JSON if applicable. The results are then saved back to disk as JSON.
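
As a minimal, self-contained sketch of that per-body transformation (using a hypothetical gzipped JSON body; the real script also handles plain and bulk bodies):

```python
import base64
import gzip
import json

# A hypothetical body, encoded the way the Replayer logs it: gzipped, then base64 encoded.
raw = base64.b64encode(gzip.compress(b'{"took": 3, "errors": false}')).decode()

decoded = base64.b64decode(raw)              # undo the base64 encoding
unzipped = gzip.decompress(decoded)          # un-gzip, when Content-Encoding is gzip
body = json.loads(unzipped.decode("utf-8"))  # parse, when Content-Type is application/json
print(body)                                  # {'took': 3, 'errors': False}
```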

To use this utility from the Migration Console:
```sh
$ ./humanReadableLogs.py --help
usage: humanReadableLogs.py [-h] [--outfile OUTFILE] infile

positional arguments:
infile Path to input logged tuple file.

options:
-h, --help show this help message and exit
--outfile OUTFILE Path for output human readable tuple file.

# By default, the output file is the same path as the input file, but the file name is prefixed with `readable-`.
$ ./humanReadableLogs.py /shared_replayer_output/tuples.log
Input file: /shared_replayer_output/tuples.log; Output file: /shared_replayer_output/readable-tuples.log

# A specific output file can also be specified.
$ ./humanReadableLogs.py /shared_replayer_output/tuples.log --outfile local-tuples.log
Input file: /shared_replayer_output/tuples.log; Output file: local-tuples.log
```

### Capture Kafka Offloader

The Capture Kafka Offloader will act as a Kafka Producer for offloading captured traffic logs to the configured Kafka cluster.

Learn more about its functionality and setup here: [Capture Kafka Offloader](captureKafkaOffloader/README.md)

Migration Console Dockerfile (4 additions & 2 deletions)

@@ -4,10 +4,12 @@ ENV DEBIAN_FRONTEND noninteractive

 RUN apt-get update && \
     apt-get install -y --no-install-recommends python3.9 python3-pip python3-dev gcc libc-dev git curl && \
-    pip3 install opensearch-benchmark
+    pip3 install urllib3==1.25.11 opensearch-benchmark==1.1.0 tqdm

 COPY runTestBenchmarks.sh /root/
-RUN chmod ugo+x /root/runTestBenchmarks.sh
+COPY humanReadableLogs.py /root/
+RUN chmod ug+x /root/runTestBenchmarks.sh
+RUN chmod ug+x /root/humanReadableLogs.py
 WORKDIR /root

 CMD tail -f /dev/null
humanReadableLogs.py (new file, 146 additions)

@@ -0,0 +1,146 @@
#!/usr/bin/env python3

import argparse
import base64
import gzip
import json
import pathlib
from typing import Any, Optional
import logging

from tqdm import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

logger = logging.getLogger(__name__)

LOG_JSON_TUPLE_FIELD = "message"
BASE64_ENCODED_TUPLE_PATHS = ["request.body", "primaryResponse.body", "shadowResponse.body"]
# TODO: I'm not positive about the capitalization of the Content-Encoding and Content-Type headers.
# This version worked on my test cases, but not guaranteed to work in all cases.
CONTENT_ENCODING_PATH = {
    BASE64_ENCODED_TUPLE_PATHS[0]: "request.content-encoding",
    BASE64_ENCODED_TUPLE_PATHS[1]: "primaryResponse.content-encoding",
    BASE64_ENCODED_TUPLE_PATHS[2]: "shadowResponse.content-encoding"
}
CONTENT_TYPE_PATH = {
    BASE64_ENCODED_TUPLE_PATHS[0]: "request.content-type",
    BASE64_ENCODED_TUPLE_PATHS[1]: "primaryResponse.content-type",
    BASE64_ENCODED_TUPLE_PATHS[2]: "shadowResponse.content-type"
}
CONTENT_TYPE_JSON = "application/json"
CONTENT_ENCODING_GZIP = "gzip"
URI_PATH = "request.Request-URI"
BULK_URI_PATH = "_bulk"
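
# These dotted paths are resolved against the nested tuple dict by the get_element
# and set_element helpers below; each names one component's body or header field.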


class DictionaryPathException(Exception):
    pass


def get_element(element: str, dict_: dict, raise_on_error=False) -> Optional[Any]:
    """Look up a dotted path (e.g. "request.body") in a nested dict."""
    keys = element.split('.')
    rv = dict_
    for key in keys:
        try:
            rv = rv[key]
        except KeyError:
            if raise_on_error:
                raise DictionaryPathException(f"Key {key} was not present.")
            else:
                return None
    return rv


def set_element(element: str, dict_: dict, value: Any) -> None:
    """Set the value at a dotted path in a nested dict, in place."""
    keys = element.split('.')
    rv = dict_
    for key in keys[:-1]:
        rv = rv[key]
    rv[keys[-1]] = value
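
# For example (hypothetical values): with d = {"request": {"body": "YQ=="}},
# get_element("request.body", d) returns "YQ==", and
# set_element("request.body", d, "a") replaces that value in place.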


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", type=pathlib.Path, help="Path to input logged tuple file.")
    parser.add_argument("--outfile", type=pathlib.Path, help="Path for output human readable tuple file.")
    return parser.parse_args()


def parse_body_value(raw_value: str, content_encoding: Optional[str],
                     content_type: Optional[str], is_bulk: bool, line_no: int):
    # Base64 decode; on failure, give up on this body entirely.
    try:
        b64decoded = base64.b64decode(raw_value)
    except Exception as e:
        logger.error(f"Body value on line {line_no} could not be decoded: {e}. Skipping parsing body value.")
        return None
    is_gzipped = content_encoding is not None and content_encoding == CONTENT_ENCODING_GZIP
    is_json = content_type is not None and CONTENT_TYPE_JSON in content_type
    if is_gzipped:
        try:
            unzipped = gzip.decompress(b64decoded)
        except Exception as e:
            logger.error(f"Body value on line {line_no} should be gzipped but could not be unzipped: {e}. "
                         "Skipping parsing body value.")
            return b64decoded
    else:
        unzipped = b64decoded
    try:
        decoded = unzipped.decode("utf-8")
    except Exception as e:
        logger.error(f"Body value on line {line_no} could not be decoded to utf-8: {e}. "
                     "Skipping parsing body value.")
        return unzipped
    if is_json and len(decoded) > 0:
        if is_bulk:
            # Bulk bodies are NDJSON: one JSON object per line.
            try:
                return [json.loads(line) for line in decoded.splitlines()]
            except Exception as e:
                logger.error(f"Body value on line {line_no} should be a bulk json (list of json lines) but "
                             f"could not be parsed: {e}. Skipping parsing body value.")
                return decoded
        try:
            return json.loads(decoded)
        except Exception as e:
            logger.error(f"Body value on line {line_no} should be a json but could not be parsed: {e}. "
                         "Skipping parsing body value.")
            return decoded
    return decoded
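
# For example (hypothetical): a gzipped bulk NDJSON body of two lines comes back as a
# two-element list of dicts; a body that fails JSON parsing falls through to the decoded string.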


def parse_tuple(line: str, line_no: int) -> dict:
    item = json.loads(line)
    message = item[LOG_JSON_TUPLE_FIELD]
    tuple = json.loads(message)
    try:
        is_bulk_path = BULK_URI_PATH in get_element(URI_PATH, tuple, raise_on_error=True)
    except DictionaryPathException as e:
        logger.error(f"`{URI_PATH}` on line {line_no} could not be loaded: {e}. "
                     f"Skipping parsing tuple.")
        return tuple
    for body_path in BASE64_ENCODED_TUPLE_PATHS:
        base64value = get_element(body_path, tuple)
        if base64value is None:
            # This component has no body element, which is potentially valid.
            continue
        content_encoding = get_element(CONTENT_ENCODING_PATH[body_path], tuple)
        content_type = get_element(CONTENT_TYPE_PATH[body_path], tuple)
        value = parse_body_value(base64value, content_encoding, content_type, is_bulk_path, line_no)
        if value:
            set_element(body_path, tuple, value)
    return tuple


if __name__ == "__main__":
    args = parse_args()
    if args.outfile:
        outfile = args.outfile
    else:
        # Default: write next to the input file, with a `readable-` prefix.
        outfile = args.infile.parent / f"readable-{args.infile.name}"
    print(f"Input file: {args.infile}; Output file: {outfile}")

    logging.basicConfig(level=logging.INFO)
    with logging_redirect_tqdm():
        with open(args.infile, 'r') as in_f:
            with open(outfile, 'w') as out_f:
                for i, line in tqdm(enumerate(in_f)):
                    print(json.dumps(parse_tuple(line, i + 1)), file=out_f)
