Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

System tracing provider for tracing via native probes (BPF) #1288

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
43 changes: 41 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,43 @@ RUN pip install poetry==$POETRY_VERSION && \
./hack/build-wheels.sh /opt/mlserver/dist && \
poetry export --with all-runtimes \
--without-hashes \
-E everything \
--format constraints.txt \
-o /opt/mlserver/dist/constraints.txt && \
sed -i 's/\[.*\]//g' /opt/mlserver/dist/constraints.txt


# Build native dependencies for tracepoints.
# AlmaLinux is binary-compatible with RHEL UBI images, but its repositories
# also provide additional devel packages (elfutils-libelf-devel is needed here).
adriangonz marked this conversation as resolved.
Show resolved Hide resolved
FROM almalinux/9-minimal AS libstapsdt-builder
SHELL ["/bin/bash", "-c"]

ARG LIBSTAPSDT_VERSION="0.1.1"

# Install libstapsdt dev dependencies
RUN microdnf update -y && \
microdnf install -y \
wget \
tar \
gzip \
gcc \
make \
findutils \
elfutils-libelf-devel

# Get the libstapsdt sources, then compile and install them into a separate tree.
# We also need to patch the resulting library symlink to be relative, so that
# the installed files can be copied directly into a different container.
RUN wget "https://github.com/linux-usdt/libstapsdt/archive/refs/tags/v${LIBSTAPSDT_VERSION}.tar.gz" && \
tar -xzf v${LIBSTAPSDT_VERSION}.tar.gz && \
cd libstapsdt-${LIBSTAPSDT_VERSION} && \
make && \
make install DESTDIR=/libstapsdt-install && \
cd /libstapsdt-install/usr/lib && \
readlink libstapsdt.so | sed s+/libstapsdt-install/usr/lib/++ | xargs -I % ln -fs % libstapsdt.so


FROM registry.access.redhat.com/ubi9/ubi-minimal
SHELL ["/bin/bash", "-c"]

Expand Down Expand Up @@ -53,7 +86,13 @@ RUN microdnf update -y && \
libgomp \
mesa-libGL \
glib2-devel \
shadow-utils
shadow-utils \
elfutils-libelf

# Install libstapsdt
COPY --from=libstapsdt-builder /libstapsdt-install /
# Update symlinks & ldconfig cache
RUN ldconfig

# Install Conda, Python 3.10 and FFmpeg
RUN microdnf install -y wget && \
Expand Down Expand Up @@ -107,7 +146,7 @@ RUN . $CONDA_PATH/etc/profile.d/conda.sh && \
pip install $_wheel --constraint ./dist/constraints.txt; \
done \
fi && \
pip install $(ls "./dist/mlserver-"*.whl) --constraint ./dist/constraints.txt && \
pip install $(ls "./dist/mlserver-"*.whl)[everything] --constraint ./dist/constraints.txt && \
rm -f /opt/conda/lib/python3.10/site-packages/spacy/tests/package/requirements.txt && \
rm -rf /root/.cache/pip

Expand Down
5 changes: 5 additions & 0 deletions licenses/license.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23097,6 +23097,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.


stapsdt
0.1.1
MIT License
UNKNOWN

starlette
0.27.0
BSD License
Expand Down
1 change: 1 addition & 0 deletions licenses/license_info.csv
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@
"spacy-lookups-data","1.0.3","MIT License"
"sqlparse","0.4.4","BSD License"
"srsly","2.4.6","MIT License"
"stapsdt","0.1.1","MIT License"
"starlette","0.27.0","BSD License"
"starlette-exporter","0.16.0","Apache License 2.0"
"sympy","1.12","BSD License"
Expand Down
1 change: 1 addition & 0 deletions licenses/license_info.no_versions.csv
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@
"spacy-lookups-data","MIT License"
"sqlparse","BSD License"
"srsly","MIT License"
"stapsdt","MIT License"
"starlette","BSD License"
"starlette-exporter","Apache License 2.0"
"sympy","BSD License"
Expand Down
4 changes: 4 additions & 0 deletions mlserver/parallel/system_tracing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from ..sys_tracing import SystemTracingProvider

# Name handed to the SystemTracingProvider constructor below.
# NOTE(review): presumably the provider name under which the tracepoints are
# registered — confirm against SystemTracingProvider.
sysTracingProviderWorkerName = "mlserver"
# Module-level tracing provider instance, created once at import time so that
# other modules in this package can import and share it.
sys_tracer: SystemTracingProvider = SystemTracingProvider(sysTracingProviderWorkerName)
7 changes: 7 additions & 0 deletions mlserver/parallel/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from ..registry import MultiModelRegistry
from ..utils import install_uvloop_event_loop, schedule_with_callback
from ..logging import configure_logger
from ..system_tracing import configure_tracepoints
from ..settings import Settings
from ..metrics import configure_metrics, model_context
from ..env import Environment
Expand All @@ -23,6 +24,7 @@
)
from .utils import terminate_queue, END_OF_QUEUE
from .logging import logger
from .system_tracing import sys_tracer
from .errors import WorkerError

IGNORED_SIGNALS = [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT]
Expand Down Expand Up @@ -64,7 +66,11 @@ def run(self):

with ctx:
install_uvloop_event_loop()
configure_tracepoints(sys_tracer, self._settings.tracepoints_enabled)
configure_logger(self._settings)
logger.info(
f"Worker [{self.pid}]: enabled {sys_tracer.tracepoints_count} tracepoints"
)
configure_metrics(self._settings)
self._ignore_signals()
asyncio.run(self.coro_run())
Expand Down Expand Up @@ -196,3 +202,4 @@ async def stop(self):
self._model_updates.close()
self._requests.close()
self._executor.shutdown()
sys_tracer.unload()
9 changes: 9 additions & 0 deletions mlserver/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .errors import ModelNotFound
from .logging import logger
from .settings import ModelSettings
from .system_tracing import sys_tracer

ModelInitialiser = Callable[[ModelSettings], MLModel]
ModelRegistryHook = Callable[[MLModel], Awaitable[MLModel]]
Expand Down Expand Up @@ -155,6 +156,7 @@ async def _load_model(self, model: MLModel):
# appears as a not-ready (i.e. loading) model
self._register(model)

sys_tracer.tp_model_load_begin(model.name, model.version)
adriangonz marked this conversation as resolved.
Show resolved Hide resolved
for callback in self._on_model_load:
# NOTE: Callbacks need to be executed sequentially to ensure that
# they go in the right order
Expand All @@ -164,6 +166,7 @@ async def _load_model(self, model: MLModel):
self._register(model)
model.ready = await model.load()

sys_tracer.tp_model_load_end(model.name, model.version)
logger.info(f"Loaded model '{model.name}' succesfully.")
except Exception:
logger.info(
Expand All @@ -180,7 +183,11 @@ async def _reload_model(self, old_model: MLModel, new_model: MLModel):
# Loading the model before unloading the old one - this will ensure
# that at least one is available (sort of mimicking a rolling
# deployment)
sys_tracer.tp_model_reload_begin(
new_model.name, new_model.version, old_model.version
)
new_model.ready = await new_model.load()
sys_tracer.tp_model_reload_end(new_model.name, new_model.version)
self._register(new_model)

if old_model == self.default:
Expand Down Expand Up @@ -224,6 +231,8 @@ async def _unload_model(self, model: MLModel):

model.ready = not await model.unload()

sys_tracer.tp_model_unload(model.name, model.version)

def _find_model(self, version: Optional[str] = None) -> Optional[MLModel]:
if version:
if version not in self._versions:
Expand Down
5 changes: 5 additions & 0 deletions mlserver/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@
from .metrics import MetricsServer
from .kafka import KafkaServer
from .utils import logger
from .system_tracing import sys_tracer, configure_tracepoints

HANDLED_SIGNALS = [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT]


class MLServer:
def __init__(self, settings: Settings):
configure_tracepoints(sys_tracer, settings.tracepoints_enabled)
self._settings = settings
self._add_signal_handlers()

Expand Down Expand Up @@ -54,6 +56,7 @@ def __init__(self, settings: Settings):
)

self._configure_logger()
logger.info(f"MLServer enabled {sys_tracer.tracepoints_count} tracepoints")
self._create_servers()

def _create_model_registry(self) -> MultiModelRegistry:
Expand Down Expand Up @@ -188,3 +191,5 @@ async def stop(self, sig: Optional[int] = None):

if self._metrics_server:
await self._metrics_server.stop(sig)

sys_tracer.unload()
9 changes: 8 additions & 1 deletion mlserver/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,10 +230,17 @@ class Config:
kafka_topic_input: str = "mlserver-input"
kafka_topic_output: str = "mlserver-output"

# OpenTelemetry Tracing settings
# Tracing settings
# OpenTelemetry
tracing_server: Optional[str] = None
"""Server name used to export OpenTelemetry tracing to collector service."""

# Enable/disable tracepoints for system tracing (BPF, SystemTap)
tracepoints_enabled: Optional[bool] = True
"""
Control the export of static tracepoints for external probing at runtime
"""

# Custom server settings
_custom_rest_server_settings: Optional[dict] = None
_custom_metrics_server_settings: Optional[dict] = None
Expand Down
12 changes: 12 additions & 0 deletions mlserver/sys_tracing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""
System tracing package.

Re-exports the public names of this package: ``SystemTracingProvider`` from
``.provider``, and ``Tracepoint``, ``ArgStatus`` and ``MAX_TRACEPOINT_ARGS``
from ``.tracepoints``.
"""
from .provider import SystemTracingProvider
from .tracepoints import Tracepoint, ArgStatus, MAX_TRACEPOINT_ARGS

# Public API of this package (the names exported by a star-import).
__all__ = [
"SystemTracingProvider",
"Tracepoint",
"ArgStatus",
"MAX_TRACEPOINT_ARGS",
]
Loading