fix: handle exceptions for malformed lines at vector log parser
gmega committed Jan 10, 2025
1 parent fc06302 commit 84bac45
Showing 2 changed files with 55 additions and 2 deletions.
38 changes: 37 additions & 1 deletion README.md
@@ -1,3 +1,39 @@
# bittorrent-benchmarks

Scaffolding and experiments for benchmarking Codex against the Deluge bittorrent client.
This is general enough that it could be extended to benchmark Codex against any content
distribution network, including IPFS.

This experimental harness leans on Kubernetes. It is entirely possible, however, to run
experiments locally using [Minikube](https://minikube.sigs.k8s.io/) (or Kind, or Docker Desktop).
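
For instance, a small local cluster can be brought up with Minikube; the node count below is
just an illustration, not a recommendation from this repo:

```bash
# Bring up a local multi-node cluster (assumes Minikube is installed).
minikube start --nodes 3
```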

## Limits

When running experiments locally on a Linux machine, you will bump into a number of
limitations, which I have documented here. I won't go into how to make these changes
permanent on your system, as that varies significantly across distributions.

**ARP Cache.** The default size for the ARP cache is too small. You should bump it
significantly, e.g.:

```bash
echo 4096 | sudo tee /proc/sys/net/ipv4/neigh/default/gc_thresh1
echo 8192 | sudo tee /proc/sys/net/ipv4/neigh/default/gc_thresh2
echo 16384 | sudo tee /proc/sys/net/ipv4/neigh/default/gc_thresh3
```

**Inotify.** Kubernetes seems to enjoy watching the filesystem, so
you should increase inotify limits across the board:

```bash
sudo sysctl -w fs.inotify.max_user_instances=2099999999
sudo sysctl -w fs.inotify.max_queued_events=2099999999
sudo sysctl -w fs.inotify.max_user_watches=2099999999
```
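
To confirm the new limits are in effect, you can read them back:

```bash
sysctl fs.inotify.max_user_instances fs.inotify.max_queued_events fs.inotify.max_user_watches
```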

**Kernel key retention service.** Kubernetes also places a large number of keys
within the kernel. Make sure you have enough room:

```bash
echo 10000 | sudo tee /proc/sys/kernel/keys/maxkeys
```
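
As with the other knobs, you can read the value back to verify:

```bash
cat /proc/sys/kernel/keys/maxkeys
```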
19 changes: 18 additions & 1 deletion benchmarks/logging/sources.py
@@ -2,9 +2,11 @@
that stores logs. Such infrastructure might be a simple file system, a service like Logstash, or a database."""

import json
import logging
from abc import ABC, abstractmethod
from collections.abc import Iterator
from contextlib import AbstractContextManager
from json import JSONDecodeError
from pathlib import Path
from typing import TextIO, Optional, Tuple, List, Dict, Type

@@ -19,6 +21,8 @@
ExperimentId = str
NodeId = str

logger = logging.getLogger(__name__)


class LogSource(ABC):
""":class:`LogSource` knows how to retrieve logs for experiments within experiment groups. A key assumption is that
@@ -131,14 +135,24 @@ def logs(
if app_label in line and group_label in line:
    if experiment_id is not None and experiment_label not in line:
        continue
    try:
        parsed = json.loads(line)
    except JSONDecodeError as err:
        logger.error(
            f"Failed to parse line from vector source: {line}",
            exc_info=err,
        )
        continue

    k8s = parsed["kubernetes"]
    yield (
        k8s["pod_labels"]["app.kubernetes.io/instance"],
        k8s["pod_name"],
        parsed["message"],
    )

def __str__(self):
    return f"VectorFlatFileSource({self.app_name})"


def split_logs_in_source(
    log_source: LogSource,
@@ -160,9 +174,12 @@ def split_logs_in_source(
    splitters: Dict[str, LogSplitter] = {}
    formats = formats if formats else []

    logger.info(f'Processing logs for group "{group_id}" from source "{log_source}"')

    for experiment_id, node_id, raw_line in log_source.logs(group_id):
        splitter = splitters.get(experiment_id)
        if splitter is None:
            logger.info(f"Found experiment {experiment_id}")
            splitter = LogSplitter(
                lambda event_type, ext: output_manager.open(
                    Path(experiment_id) / f"{event_type}.{ext.value}"
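For context, here is a minimal sketch of the JSON shape the `logs` parser above expects from a
Vector flat file. Only the key structure comes from the code; the instance name, pod name, and
message are hypothetical:

```python
import json

# Hypothetical Vector-shipped log line; only the keys
# (kubernetes.pod_labels, kubernetes.pod_name, message) mirror the parser.
sample_line = json.dumps(
    {
        "kubernetes": {
            "pod_labels": {"app.kubernetes.io/instance": "deluge-nodes"},
            "pod_name": "deluge-nodes-0",
        },
        "message": "piece 42 downloaded",
    }
)

parsed = json.loads(sample_line)
k8s = parsed["kubernetes"]
print(
    k8s["pod_labels"]["app.kubernetes.io/instance"],  # experiment instance
    k8s["pod_name"],
    parsed["message"],
)
```

A malformed line (say, a truncated write) would raise `JSONDecodeError` on `json.loads`, which
is exactly the case this commit now logs and skips instead of crashing the parser.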
