diff --git a/Dockerfile b/Dockerfile
index 029dca39e..949112ee6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -20,10 +20,43 @@ RUN pip install poetry==$POETRY_VERSION && \
     ./hack/build-wheels.sh /opt/mlserver/dist && \
     poetry export --with all-runtimes \
         --without-hashes \
+        -E everything \
         --format constraints.txt \
         -o /opt/mlserver/dist/constraints.txt && \
     sed -i 's/\[.*\]//g' /opt/mlserver/dist/constraints.txt
+
+# Build native dependencies for tracepoints.
+# AlmaLinux is binary-compatible with RHEL UBI images but provides repositories
+# with additional devel packages (elfutils-libelf-devel is needed here)
+FROM almalinux/9-minimal AS libstapsdt-builder
+SHELL ["/bin/bash", "-c"]
+
+ARG LIBSTAPSDT_VERSION="0.1.1"
+
+# Install libstapsdt dev dependencies
+RUN microdnf update -y && \
+    microdnf install -y \
+        wget \
+        tar \
+        gzip \
+        gcc \
+        make \
+        findutils \
+        elfutils-libelf-devel
+
+# Fetch the libstapsdt sources, then compile and install them into a separate tree.
+# We also patch the resulting library symlink to be relative, so that the
+# installed files can be copied directly into a different container
+RUN wget "https://github.com/linux-usdt/libstapsdt/archive/refs/tags/v${LIBSTAPSDT_VERSION}.tar.gz" && \
+    tar -xzf v${LIBSTAPSDT_VERSION}.tar.gz && \
+    cd libstapsdt-${LIBSTAPSDT_VERSION} && \
+    make && \
+    make install DESTDIR=/libstapsdt-install && \
+    cd /libstapsdt-install/usr/lib && \
+    readlink libstapsdt.so | sed s+/libstapsdt-install/usr/lib/++ | xargs -I % ln -fs % libstapsdt.so
+
+
 FROM registry.access.redhat.com/ubi9/ubi-minimal
 SHELL ["/bin/bash", "-c"]
 
@@ -53,7 +86,13 @@ RUN microdnf update -y && \
         libgomp \
         mesa-libGL \
         glib2-devel \
-        shadow-utils
+        shadow-utils \
+        elfutils-libelf
+
+# Install libstapsdt
+COPY --from=libstapsdt-builder /libstapsdt-install /
+# Update symlinks & ldconfig cache
+RUN ldconfig
 
 # Install Conda, Python 3.10 and FFmpeg
 RUN microdnf install -y wget && \
@@ -107,7 +146,7 @@ RUN . $CONDA_PATH/etc/profile.d/conda.sh && \
             pip install $_wheel --constraint ./dist/constraints.txt; \
         done \
     fi && \
-    pip install $(ls "./dist/mlserver-"*.whl) --constraint ./dist/constraints.txt && \
+    pip install $(ls "./dist/mlserver-"*.whl)[everything] --constraint ./dist/constraints.txt && \
     rm -f /opt/conda/lib/python3.10/site-packages/spacy/tests/package/requirements.txt && \
     rm -rf /root/.cache/pip
diff --git a/licenses/license.txt b/licenses/license.txt
index a1b343cf5..90088caee 100644
--- a/licenses/license.txt
+++ b/licenses/license.txt
@@ -23097,6 +23097,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 DEALINGS IN THE SOFTWARE.
 
+stapsdt
+0.1.1
+MIT License
+UNKNOWN
+
 starlette
 0.27.0
 BSD License
diff --git a/licenses/license_info.csv b/licenses/license_info.csv
index ddd982b99..da89a07b6 100644
--- a/licenses/license_info.csv
+++ b/licenses/license_info.csv
@@ -222,6 +222,7 @@
 "spacy-lookups-data","1.0.3","MIT License"
 "sqlparse","0.4.4","BSD License"
 "srsly","2.4.6","MIT License"
+"stapsdt","0.1.1","MIT License"
 "starlette","0.27.0","BSD License"
 "starlette-exporter","0.16.0","Apache License 2.0"
 "sympy","1.12","BSD License"
diff --git a/licenses/license_info.no_versions.csv b/licenses/license_info.no_versions.csv
index f68aafffe..d7ddf7cc6 100644
--- a/licenses/license_info.no_versions.csv
+++ b/licenses/license_info.no_versions.csv
@@ -222,6 +222,7 @@
 "spacy-lookups-data","MIT License"
 "sqlparse","BSD License"
 "srsly","MIT License"
+"stapsdt","MIT License"
 "starlette","BSD License"
 "starlette-exporter","Apache License 2.0"
 "sympy","BSD License"
diff --git a/mlserver/parallel/system_tracing.py b/mlserver/parallel/system_tracing.py
new file mode 100644
index 000000000..f5f4fb060
--- /dev/null
+++ b/mlserver/parallel/system_tracing.py
@@ -0,0 +1,4 @@
+from ..sys_tracing import SystemTracingProvider
+
+sysTracingProviderWorkerName = "mlserver"
+sys_tracer: SystemTracingProvider = SystemTracingProvider(sysTracingProviderWorkerName)
diff --git a/mlserver/parallel/worker.py b/mlserver/parallel/worker.py
index 5456e139d..84a5f0f0e 100644
--- a/mlserver/parallel/worker.py
+++ b/mlserver/parallel/worker.py
@@ -11,6 +11,7 @@
 from ..registry import MultiModelRegistry
 from ..utils import install_uvloop_event_loop, schedule_with_callback
 from ..logging import configure_logger
+from ..system_tracing import configure_tracepoints
 from ..settings import Settings
 from ..metrics import configure_metrics, model_context
 from ..env import Environment
@@ -23,6 +24,7 @@
 )
 from .utils import terminate_queue, END_OF_QUEUE
 from .logging import logger
+from .system_tracing import sys_tracer
 from .errors import WorkerError
 
 IGNORED_SIGNALS = [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT]
@@ -64,7 +66,11 @@ def run(self):
         with ctx:
             install_uvloop_event_loop()
+            configure_tracepoints(sys_tracer, self._settings.tracepoints_enabled)
             configure_logger(self._settings)
+            logger.info(
+                f"Worker [{self.pid}]: enabled {sys_tracer.tracepoints_count} tracepoints"
+            )
             configure_metrics(self._settings)
             self._ignore_signals()
             asyncio.run(self.coro_run())
@@ -196,3 +202,4 @@ async def stop(self):
         self._model_updates.close()
         self._requests.close()
         self._executor.shutdown()
+        sys_tracer.unload()
diff --git a/mlserver/registry.py b/mlserver/registry.py
index 9aa026f19..037178b4f 100644
--- a/mlserver/registry.py
+++ b/mlserver/registry.py
@@ -8,6 +8,7 @@
 from .errors import ModelNotFound
 from .logging import logger
 from .settings import ModelSettings
+from .system_tracing import sys_tracer
 
 ModelInitialiser = Callable[[ModelSettings], MLModel]
 ModelRegistryHook = Callable[[MLModel], Awaitable[MLModel]]
@@ -155,6 +156,7 @@ async def _load_model(self, model: MLModel):
             # appears as a not-ready (i.e. loading) model
             self._register(model)
+            sys_tracer.tp_model_load_begin(model.name, model.version)
 
             for callback in self._on_model_load:
                 # NOTE: Callbacks need to be executed sequentially to ensure that
                 # they go in the right order
@@ -164,6 +166,7 @@ async def _load_model(self, model: MLModel):
             self._register(model)
 
             model.ready = await model.load()
+            sys_tracer.tp_model_load_end(model.name, model.version)
 
             logger.info(f"Loaded model '{model.name}' succesfully.")
         except Exception:
             logger.info(
@@ -180,7 +183,11 @@ async def _reload_model(self, old_model: MLModel, new_model: MLModel):
             # Loading the model before unloading the old one - this will ensure
             # that at least one is available (sort of mimicking a rolling
             # deployment)
+            sys_tracer.tp_model_reload_begin(
+                new_model.name, new_model.version, old_model.version
+            )
             new_model.ready = await new_model.load()
+            sys_tracer.tp_model_reload_end(new_model.name, new_model.version)
             self._register(new_model)
 
             if old_model == self.default:
@@ -224,6 +231,8 @@ async def _unload_model(self, model: MLModel):
 
         model.ready = not await model.unload()
 
+        sys_tracer.tp_model_unload(model.name, model.version)
+
     def _find_model(self, version: Optional[str] = None) -> Optional[MLModel]:
         if version:
             if version not in self._versions:
diff --git a/mlserver/server.py b/mlserver/server.py
index d82c49080..c7400e963 100644
--- a/mlserver/server.py
+++ b/mlserver/server.py
@@ -18,12 +18,14 @@
 from .metrics import MetricsServer
 from .kafka import KafkaServer
 from .utils import logger
+from .system_tracing import sys_tracer, configure_tracepoints
 
 HANDLED_SIGNALS = [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT]
 
 
 class MLServer:
     def __init__(self, settings: Settings):
+        configure_tracepoints(sys_tracer, settings.tracepoints_enabled)
         self._settings = settings
         self._add_signal_handlers()
@@ -54,6 +56,7 @@ def __init__(self, settings: Settings):
         )
 
         self._configure_logger()
+        logger.info(f"MLServer enabled {sys_tracer.tracepoints_count} tracepoints")
         self._create_servers()
 
     def _create_model_registry(self) -> MultiModelRegistry:
@@ -188,3 +191,5 @@ async def stop(self, sig: Optional[int] = None):
 
         if self._metrics_server:
             await self._metrics_server.stop(sig)
+
+        sys_tracer.unload()
diff --git a/mlserver/settings.py b/mlserver/settings.py
index 41fc90fa8..757a953fd 100644
--- a/mlserver/settings.py
+++ b/mlserver/settings.py
@@ -230,10 +230,17 @@ class Config:
     kafka_topic_input: str = "mlserver-input"
     kafka_topic_output: str = "mlserver-output"
 
-    # OpenTelemetry Tracing settings
+    # Tracing settings
+    # OpenTelemetry
     tracing_server: Optional[str] = None
     """Server name used to export OpenTelemetry tracing to collector service."""
 
+    # Enable/disable tracepoints for system tracing (BPF, SystemTap)
+    tracepoints_enabled: Optional[bool] = True
+    """
+    Control the export of static tracepoints for external probing at runtime.
+    """
+
     # Custom server settings
     _custom_rest_server_settings: Optional[dict] = None
     _custom_metrics_server_settings: Optional[dict] = None
diff --git a/mlserver/sys_tracing/__init__.py b/mlserver/sys_tracing/__init__.py
new file mode 100644
index 000000000..dd14463dc
--- /dev/null
+++ b/mlserver/sys_tracing/__init__.py
@@ -0,0 +1,12 @@
+"""
+System tracing: statically-defined tracepoints (SDT) for external probing tools.
+"""
+from .provider import SystemTracingProvider
+from .tracepoints import Tracepoint, ArgStatus, MAX_TRACEPOINT_ARGS
+
+__all__ = [
+    "SystemTracingProvider",
+    "Tracepoint",
+    "ArgStatus",
+    "MAX_TRACEPOINT_ARGS",
+]
diff --git a/mlserver/sys_tracing/provider.py b/mlserver/sys_tracing/provider.py
new file mode 100644
index 000000000..4499623b0
--- /dev/null
+++ b/mlserver/sys_tracing/provider.py
@@ -0,0 +1,224 @@
+"""
+A system tracing provider is the entity responsible for registering and managing
+a set of tracepoints that are then visible to external tracing tools. Such tools
+may register probes (e.g. SystemTap probes, BPF code) to snapshot performance
+counters, debug performance issues, link application activity to kernel actions
+or compute various aggregate metrics.
+
+MLServer fires the tracepoints defined by the tracing provider (one of the tp_*
+functions) whenever application-specific events take place, such as starting to
+process an inference request, loading/unloading a model, etc.:
+
+```
+sys_trace = SystemTracingProvider("mlserver")
+sys_trace.create_native_sdt_tracepoints(settings.tracepoints_enabled)
+...
+# A new model gets loaded
+sys_trace.tp_model_load_begin(...)
+model.ready = await model.load()
+sys_trace.tp_model_load_end(...)
+```
+
+Sometimes, the arguments that we want to pass to a tracepoint are obtained by
+some processing, which might be expensive. We do not want to pay this cost when
+no external probe is attached to that tracepoint. In this case, we can first
+call the `is_active_for` member function, which only returns true if the passed
+tracepoint has an external probe attached:
+
+```
+if sys_trace.is_active_for(Tracepoint.inference_begin):
+    # compute arguments
+    pipeline = parse_headers(...)
+
+    # fire tracepoint
+    sys_trace.tp_inference_begin(..., pipeline, ...)
+```
+
+Each tp_* function calls into a native shared library that is generated on the
+fly and dynamically loaded inside the process when calling
+`sys_trace.create_native_sdt_tracepoints(...)`. In the shared library, a
+tracepoint is just a hook function (a series of nop instructions followed by a
+ret). The code of this hook can be modified at runtime to jump into code
+provided from outside the process (e.g. SystemTap or BPF probes running in
+kernel context).
+
+When no probe is attached, a tracepoint adds almost no overhead (the cost of a
+function call and a branch instruction). When a probe is attached, the process
+will incur a mode switch (~5-10 us) each time the probe fires, plus the
+overhead of the probing code.
+
+At the moment, tracing is only available on x86-64 architectures.
+"""
+from typing import Dict, Union
+from importlib import import_module
+
+from ..logging import logger
+from .tracepoints import Tracepoint, ArgStatus, MAX_TRACEPOINT_ARGS
+from .stapsdt_stub import Probe as NopTracepoint
+
+
+class SystemTracingProvider:
+    def __init__(self, name: str):
+        # Tracing is an optional facility, requiring additional runtime
+        # dependencies. Because of this, we initialize this provider with stub
+        # tracepoints which do nothing.
+        #
+        # When tracing is enabled via MLServer settings, the actual tracepoint
+        # insertion will happen during the `create_native_sdt_tracepoints()`
+        # call.
+        stapsdt = import_module(".stapsdt_stub", "mlserver.sys_tracing")
+        self._name = name
+
+        self._provider = stapsdt.Provider(self._name)
+        self._num_native_tracepoints: int = 0
+        self._tracepoint_definition: Dict[
+            Tracepoint, Union[stapsdt.Probe, NopTracepoint]
+        ] = {}
+
+        for tp in Tracepoint:
+            self._tracepoint_definition[tp] = NopTracepoint()
+
+    def create_native_sdt_tracepoints(self, enable_tracepoints: bool):
+        """
+        Makes the configured tracepoints visible to external tracing programs.
+
+        In the underlying implementation, this:
+          - generates a native shared library with the tracepoints and a
+            .notes ELF section with metadata used for attaching probes
+          - loads the shared library in the current process
+        """
+        if not enable_tracepoints:
+            return
+
+        try:
+            # On import, stapsdt will attempt to dynamically load
+            # libstapsdt.so, which is responsible for dynamically creating a
+            # native ELF library containing the tracepoint functions to which
+            # external probes can be attached.
+            import stapsdt
+        except Exception:
+            logger.warning(
+                "Could not enable tracing. Ensure python-stapsdt and "
+                "libstapsdt.so are installed"
+            )
+            return
+
+        self._provider = stapsdt.Provider(self._name)
+
+        # Register tracepoints with the native stapsdt provider
+        for tp in Tracepoint:
+            arg_types, status = tp.get_arg_types()
+            if arg_types is not None:
+                self._num_native_tracepoints += 1
+                self._tracepoint_definition[tp] = self._provider.add_probe(
+                    tp.name, *arg_types
+                )
+                if status == ArgStatus.TooManyArguments:
+                    logger.warning(
+                        f"The prototype for tracepoint {tp.name} has too many "
+                        f"arguments (max: {MAX_TRACEPOINT_ARGS}): only the "
+                        f"first {MAX_TRACEPOINT_ARGS} will be considered"
+                    )
+            elif status == ArgStatus.NoPrototypeDefined:
+                logger.warning(
+                    f"Could not register tracepoint {tp.name}, as it has "
+                    "no defined prototype."
+                )
+
+        # Generate and load the shared object library containing the native
+        # tracepoints; remove tracepoints if loading fails
+        load_error = self._provider.load()
+        if load_error:
+            self._num_native_tracepoints = 0
+            for tp in Tracepoint:
+                self._tracepoint_definition[tp] = NopTracepoint()
+
+    @property
+    def tracepoints_count(self) -> int:
+        return self._num_native_tracepoints
+
+    @property
+    def name(self) -> str:
+        return self._name
+
+    def is_active_for(self, tracepoint: Tracepoint) -> bool:
+        """
+        Returns true only when there are active external probes attached to
+        the given tracepoint.
+
+        Importantly, this can be used before calling a tracepoint, when it is
+        expensive to compute some of the arguments passed to it and we want to
+        avoid incurring that cost all the time.
+
+        The overall pattern for calling a tracepoint in this case becomes:
+
+        ```
+        if sys_trace.is_active_for(Tracepoint.inference_begin):
+            # compute arguments
+            pipeline = headers.parse(...)
+
+            # fire probe
+            sys_trace.tp_inference_begin(..., pipeline, ...)
+        ```
+
+        In the underlying implementation, one of the nop instructions of the
+        tracepoint hook is replaced by a trap instruction whenever a probe is
+        attached. For non-stub tracepoints, the current function checks for
+        the existence of the trap instruction. Stub tracepoints will always
+        return false.
+        """
+        return self._tracepoint_definition[tracepoint].is_enabled()
+
+    # Explicit function definitions for each tracepoint
+    #
+    # These should be preferred to the generic __call__ below, as they
+    # constrain the number of arguments and explicitly define the tracing
+    # interface for external consumers
+    def tp_model_load_begin(self, model_name, model_version) -> bool:
+        return self._tracepoint_definition[Tracepoint.model_load_begin].fire(
+            model_name, model_version
+        )
+
+    def tp_model_load_end(self, model_name, model_version) -> bool:
+        return self._tracepoint_definition[Tracepoint.model_load_end].fire(
+            model_name, model_version
+        )
+
+    def tp_model_reload_begin(
+        self, model_name, model_version, old_model_version
+    ) -> bool:
+        return self._tracepoint_definition[Tracepoint.model_reload_begin].fire(
+            model_name, model_version, old_model_version
+        )
+
+    def tp_model_reload_end(self, model_name, model_version) -> bool:
+        return self._tracepoint_definition[Tracepoint.model_reload_end].fire(
+            model_name, model_version
+        )
+
+    def tp_model_unload(self, model_name, model_version) -> bool:
+        return self._tracepoint_definition[Tracepoint.model_unload].fire(
+            model_name, model_version
+        )
+
+    def tp_inference_enqueue_req(self, model_name, queue_len) -> bool:
+        return self._tracepoint_definition[Tracepoint.inference_enqueue_req].fire(
+            model_name, queue_len
+        )
+
+    def tp_inference_begin(self, model_name, pipeline, tags, worker_pid) -> bool:
+        return self._tracepoint_definition[Tracepoint.inference_begin].fire(
+            model_name, pipeline, tags, worker_pid
+        )
+
+    def tp_inference_end(self, model_name, pipeline, tags, worker_pid) -> bool:
+        return self._tracepoint_definition[Tracepoint.inference_end].fire(
+            model_name, pipeline, tags, worker_pid
+        )
+
+    # Generic tracepoint event, without constraining the number of arguments.
+    # Mostly here for use during the creation of new tracepoints, for which
+    # specialised tp_* functions do not exist.
+    def __call__(self, tracepoint: Tracepoint, *args) -> bool:
+        return self._tracepoint_definition[tracepoint].fire(*args)
+
+    def unload(self) -> bool:
+        for tp in Tracepoint:
+            self._tracepoint_definition[tp] = NopTracepoint()
+        return self._provider.unload()
diff --git a/mlserver/sys_tracing/stapsdt_stub.py b/mlserver/sys_tracing/stapsdt_stub.py
new file mode 100644
index 000000000..a53cdf2e9
--- /dev/null
+++ b/mlserver/sys_tracing/stapsdt_stub.py
@@ -0,0 +1,31 @@
+class Provider(object):
+    def __init__(self, name):
+        self._name = name
+        self._provider = None
+        self._probes = []
+
+    def __str__(self):
+        return self._name
+
+    def add_probe(self, name, *args):
+        probe = Probe(self, name, *args)
+        self._probes.append(probe)
+        return probe
+
+    def load(self) -> bool:
+        return False
+
+    def unload(self):
+        return True
+
+
+class Probe:
+    def __init__(self, *args):
+        self._args = args
+
+    def fire(self, *args):
+        return False
+
+    def is_enabled(self) -> bool:
+        return False
diff --git a/mlserver/sys_tracing/tracepoints.py b/mlserver/sys_tracing/tracepoints.py
new file mode 100644
index 000000000..37339c36d
--- /dev/null
+++ b/mlserver/sys_tracing/tracepoints.py
@@ -0,0 +1,186 @@
+"""
+A tracepoint is a statically defined marker placed in code to identify
+application-level events.
+
+From the perspective of the Python application, each tracepoint acts like a
+normal function call. When a tracepoint is initialized by the tracing provider,
+the prototype of this function call needs to be specified.
+
+Here, we define the list of all possible tracepoints exposed by MLServer (the
+`Tracepoint` enum) together with the arguments that can be passed to them
+(returned by `Tracepoint.get_arg_types(self)`) when each type of application
+event occurs.
+
+The list of arguments is statically defined for each tracepoint in
+`ArgTypeMap._prototypes`.
+
+The export of tracepoints to external tracing users can be toggled via the
+`tracepoints_enabled` option in the MLServer settings.
+
+See `sys_tracing.provider` for more details about the actual tracepoint
+implementation.
+"""
+from typing import Optional, Tuple, List
+from enum import Enum, IntEnum, auto
+
+# The underlying SDT tracepoint implementation currently only works on x86-64
+# architectures. For simplicity, it supports a maximum of 6 tracepoint
+# arguments, as the x86-64 calling convention passes the first 6 function
+# arguments via registers.
+MAX_TRACEPOINT_ARGS = 6
+
+
+class ArgTypes(IntEnum):
+    """
+    Encoding of Tracepoint argument types, as required by libstapsdt
+    """
+
+    # This matches the ArgTypes enum in stapsdt, so it should not be changed.
+    # It is re-defined here so as to not depend on stapsdt when tracing is
+    # disabled. The values for each argument type are stable and not expected
+    # to change.
+    noarg = 0
+    uint8 = 1
+    int8 = -1
+    uint16 = 2
+    int16 = -2
+    uint32 = 4
+    int32 = -4
+    uint64 = 8
+    int64 = -8
+
+
+class RequestQueueId(IntEnum):
+    BatchQueue = 1
+    WorkerQueue = 2
+
+
+class ArgTypeMap(IntEnum):
+    """
+    Members of this enum give meaningful names to argument types used by
+    MLServer tracepoints
+    """
+
+    model_name_t = ArgTypes.uint64
+    model_version_t = ArgTypes.uint64
+    old_model_version_t = ArgTypes.uint64
+    pipeline_t = ArgTypes.uint64
+    tags_t = ArgTypes.uint64
+    queue_len_t = ArgTypes.uint32
+    worker_pid_t = ArgTypes.int32
+
+
+class ArgStatus(Enum):
+    Ok = auto()
+    TooManyArguments = auto()
+    NoPrototypeDefined = auto()
+
+
+class Tracepoint(Enum):
+    """
+    The available MLServer tracepoints.
+
+    The tracing provider in `sys_tracing.provider` has a number of member
+    functions (`tp_*`) for firing the tracepoints declared here when the
+    corresponding application-level events take place.
+
+    The tracing provider may also be called directly, passing a member of this
+    enum as the first argument, together with the corresponding arguments. The
+    types of the tracepoint arguments are statically defined for each
+    Tracepoint (see `ArgTypeMap._prototypes`)
+
+    Example:
+    ```
+    sys_trace = SystemTracingProvider("mlserver")
+    sys_trace.create_native_sdt_tracepoints(enable_tracepoints=True)
+    ...
+
+    # Fire the model_load_begin tracepoint via its explicit member function:
+    sys_trace.tp_model_load_begin(model_name, model_version)
+
+    # The equivalent generic call, to be used if the tracepoint does not have
+    # an associated tp_* function (typically, during testing and development
+    # of new tracepoints)
+    sys_trace(Tracepoint.model_load_begin, model_name, model_version)
+    ```
+    """
+
+    model_load_begin = auto()
+    model_load_end = auto()
+    model_reload_begin = auto()
+    model_reload_end = auto()
+    model_unload = auto()
+    inference_enqueue_req = auto()
+    inference_begin = auto()
+    inference_end = auto()
+
+    def all():
+        return {t for t in Tracepoint}
+
+    def none():
+        return set()
+
+    def get_arg_types(self) -> Tuple[Optional[List[int]], ArgStatus]:
+        """
+        Returns an (arg_types_list, status) tuple for the current tracepoint.
+        The arg_types_list, when not None, defines the function prototype of
+        the tracepoint.
+
+        The returned arg_types_list is used by the TracingProvider when
+        registering the tracepoints with the underlying native tracepoint
+        implementation (libstapsdt). Based on this, libstapsdt dynamically
+        generates a shared library in ELF format, with one function
+        corresponding to each tracepoint and an ELF note section describing
+        where the tracepoint arguments can be found (in which registers) at
+        call time.
+
+        Tracepoints may have a maximum of MAX_TRACEPOINT_ARGS arguments. If
+        the defined prototype has more, or if the prototype cannot be found,
+        this will be reflected in the returned ArgStatus. Any ArgStatus
+        different from ArgStatus.Ok indicates an error state, although some
+        states such as ArgStatus.TooManyArguments are recoverable (for
+        example, by ignoring the additional arguments).
+
+        The list of arguments used by each tracepoint is currently unstable.
+        """
+        if self in ArgTypeMap._prototypes:
+            if len(ArgTypeMap._prototypes[self]) <= MAX_TRACEPOINT_ARGS:
+                return ArgTypeMap._prototypes[self], ArgStatus.Ok
+            else:
+                return (
+                    ArgTypeMap._prototypes[self][0:MAX_TRACEPOINT_ARGS],
+                    ArgStatus.TooManyArguments,
+                )
+        else:
+            return None, ArgStatus.NoPrototypeDefined
+
+
+ArgTypeMap.model_args = [ArgTypeMap.model_name_t, ArgTypeMap.model_version_t]
+ArgTypeMap.model_args_reload = [
+    ArgTypeMap.model_name_t,
+    ArgTypeMap.model_version_t,
+    ArgTypeMap.old_model_version_t,
+]
+ArgTypeMap.queue_args = [ArgTypeMap.model_name_t, ArgTypeMap.queue_len_t]
+ArgTypeMap.infer_args = [
+    ArgTypeMap.model_name_t,
+    ArgTypeMap.pipeline_t,
+    ArgTypeMap.tags_t,
+    ArgTypeMap.worker_pid_t,
+]
+
+# TODO(lucian): Stabilise the tracepoint argument definitions after writing a
+# number of external applications using them
+#
+# Together with the tracepoint names, the prototypes form the public interface
+# against which external probing code is developed.
+ArgTypeMap._prototypes = {
+    Tracepoint.model_load_begin: ArgTypeMap.model_args,
+    Tracepoint.model_load_end: ArgTypeMap.model_args,
+    Tracepoint.model_reload_begin: ArgTypeMap.model_args_reload,
+    Tracepoint.model_reload_end: ArgTypeMap.model_args,
+    Tracepoint.model_unload: ArgTypeMap.model_args,
+    Tracepoint.inference_enqueue_req: ArgTypeMap.queue_args,
+    Tracepoint.inference_begin: ArgTypeMap.infer_args,
+    Tracepoint.inference_end: ArgTypeMap.infer_args,
+}
diff --git a/mlserver/system_tracing.py b/mlserver/system_tracing.py
new file mode 100644
index 000000000..30bb6d255
--- /dev/null
+++ b/mlserver/system_tracing.py
@@ -0,0 +1,10 @@
+from .sys_tracing import SystemTracingProvider
+
+sysTracingProviderName = "mlserver"
+sys_tracer: SystemTracingProvider = SystemTracingProvider(sysTracingProviderName)
+
+
+def configure_tracepoints(sys_tracer: SystemTracingProvider, enable_tracepoints: bool):
+    if sys_tracer.tracepoints_count == 0:
+        sys_tracer.create_native_sdt_tracepoints(enable_tracepoints)
+    return sys_tracer
diff --git a/poetry.lock b/poetry.lock
index 24cc35f53..de66900db 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3313,7 +3313,7 @@ sqlserver = ["mlflow-dbstore"]
 
 [[package]]
 name = "mlserver-alibi-detect"
-version = "1.4.0.dev2"
+version = "1.4.0.dev3"
 description = "Alibi-Detect runtime for MLServer"
 optional = false
 python-versions = "^3.8.1,<3.12"
@@ -3330,7 +3330,7 @@ url = "runtimes/alibi-detect"
 
 [[package]]
 name = "mlserver-alibi-explain"
-version = "1.4.0.dev2"
+version = "1.4.0.dev3"
 description = "Alibi-Explain runtime for MLServer"
 optional = false
 python-versions = "^3.8.1,<3.12"
@@ -3348,7 +3348,7 @@ url = "runtimes/alibi-explain"
 
 [[package]]
 name = "mlserver-huggingface"
-version = "1.4.0.dev2"
+version = "1.4.0.dev3"
 description = "HuggingFace runtime for MLServer"
 optional = false
 python-versions = "^3.8.1,<3.12"
@@ -3367,7 +3367,7 @@ url = "runtimes/huggingface"
 
 [[package]]
 name = "mlserver-lightgbm"
-version = "1.4.0.dev2"
+version = "1.4.0.dev3"
 description = "LightGBM runtime for MLServer"
 optional = false
 python-versions = "^3.8.1,<3.12"
@@ -3385,7 +3385,7 @@ url = "runtimes/lightgbm"
 
 [[package]]
 name = "mlserver-mlflow"
-version = "1.4.0.dev2"
+version = "1.4.0.dev3"
 description = "MLflow runtime for MLServer"
 optional = false
 python-versions = "^3.8.1,<3.12"
@@ -3402,7 +3402,7 @@ url = "runtimes/mlflow"
 
 [[package]]
 name = "mlserver-sklearn"
-version = "1.4.0.dev2"
+version = "1.4.0.dev3"
 description = "Scikit-Learn runtime for MLServer"
 optional = false
 python-versions = "^3.8.1,<3.12"
@@ -3420,7 +3420,7 @@ url = "runtimes/sklearn"
 
 [[package]]
 name = "mlserver-xgboost"
-version = "1.4.0.dev2"
+version = "1.4.0.dev3"
 description = "XGBoost runtime for MLServer"
 optional = false
 python-versions = "^3.8.1,<3.12"
@@ -6629,6 +6629,16 @@ files = [
 [package.dependencies]
 catalogue = ">=2.0.3,<2.1.0"
 
+[[package]]
+name = "stapsdt"
+version = "0.1.1"
+description = "Create USDT probes and instrument your Python application dynamically"
+optional = true
+python-versions = "*"
+files = [
+    {file = "stapsdt-0.1.1.tar.gz", hash = "sha256:fabc2ca82a7c7f7307bd5326a1a1cace720aa6e1682a53afe09f6cb20218e636"},
+]
+
 [[package]]
 name = "starlette"
 version = "0.27.0"
@@ -8044,7 +8054,10 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"]
 test = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 
+[extras]
+tracepoints = ["stapsdt"]
+
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.8.1,<3.12"
-content-hash = "551a2a87f59f2fc131dee0ee11d90d909ffbb69677f281598683f3af51d9f4b1"
+content-hash = "d51b9f064c500b745d3530d434f454e25ae88df74f4b7129545a5dc1f36ba8ce"
diff --git a/pyproject.toml b/pyproject.toml
index 5f2c0da88..6a053a67b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,6 +59,7 @@ tritonclient = {version = "^2.24", extras = ["http"]}
 aiofiles = "*"
 orjson = "*"
 uvloop = {version = "*", markers = "sys_platform != 'win32' and (sys_platform != 'cygwin' and platform_python_implementation != 'PyPy')"}
+stapsdt = {version = "^0.1.1", optional = true}
 
 ## The importlib-resources backport is required to use some
 ## functionality added in Python 3.10
@@ -69,6 +70,10 @@ opentelemetry-sdk = "^1.18.0"
 opentelemetry-exporter-otlp-proto-grpc = "^1.18.0"
 opentelemetry-instrumentation-grpc = "^0.39b0"
 
+[tool.poetry.extras]
+tracepoints = ["stapsdt"]
+everything = ["stapsdt"]
+
 [tool.poetry.group.dev.dependencies]
 datamodel-code-generator = "0.21.0"
 grpcio-tools = "1.48.1"
diff --git a/tox.ini b/tox.ini
index 7375dcb6f..e3d20c625 100644
--- a/tox.ini
+++ b/tox.ini
@@ -36,7 +36,8 @@ commands =
 commands_pre =
     poetry install --sync --no-root \
         --with all-runtimes \
-        --with all-runtimes-dev
+        --with all-runtimes-dev \
+        --all-extras
 commands =
     pip-licenses \
         --from=mixed \