diff --git a/docs/source/guides/snippets/metrics/metric_defs.py b/docs/source/guides/snippets/metrics/metric_defs.py
index ca48dce9ddb..e885cc09354 100644
--- a/docs/source/guides/snippets/metrics/metric_defs.py
+++ b/docs/source/guides/snippets/metrics/metric_defs.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 
-import bentoml
+import prometheus_client
 
-inference_duration = bentoml.metrics.Histogram(
+inference_duration = prometheus_client.Histogram(
     name="inference_duration",
     documentation="Duration of inference",
     labelnames=["nltk_version", "sentiment_cls"],
@@ -25,7 +25,7 @@
     ),
 )
 
-polarity_counter = bentoml.metrics.Counter(
+polarity_counter = prometheus_client.Counter(
     name="polarity_total",
     documentation="Count total number of analysis by polarity scores",
     labelnames=["polarity"],
diff --git a/src/_bentoml_impl/server/app.py b/src/_bentoml_impl/server/app.py
index 884e3382c92..5ef728ae39b 100644
--- a/src/_bentoml_impl/server/app.py
+++ b/src/_bentoml_impl/server/app.py
@@ -29,6 +29,7 @@
 
 if t.TYPE_CHECKING:
     from opentelemetry.sdk.trace import Span
+    from prometheus_client import Histogram
     from starlette.applications import Starlette
     from starlette.requests import Request
     from starlette.routing import BaseRoute
@@ -36,7 +37,6 @@
     from bentoml._internal import external_typing as ext
     from bentoml._internal.context import ServiceContext
     from bentoml._internal.types import LifecycleHook
-    from bentoml.metrics import Histogram
 
 R = t.TypeVar("R")
 
diff --git a/src/_bentoml_impl/worker/service.py b/src/_bentoml_impl/worker/service.py
index 103563df5e0..b579e05e124 100644
--- a/src/_bentoml_impl/worker/service.py
+++ b/src/_bentoml_impl/worker/service.py
@@ -174,6 +174,9 @@ def main(
 
     if prometheus_dir is not None:
         BentoMLContainer.prometheus_multiproc_dir.set(prometheus_dir)
+    os.environ["PROMETHEUS_MULTIPROC_DIR"] = (
+        BentoMLContainer.prometheus_multiproc_dir.get()
+    )
     server_context.service_name = service.name
     if service.bento is None:
         server_context.bento_name = service.name
diff --git a/src/bentoml/_internal/server/metrics/prometheus.py b/src/bentoml/_internal/server/metrics/prometheus.py
index 0e58fbbf6cb..72081ad5f7b 100644
--- a/src/bentoml/_internal/server/metrics/prometheus.py
+++ b/src/bentoml/_internal/server/metrics/prometheus.py
@@ -2,12 +2,13 @@
 
 import logging
 import os
-import sys
 import typing as t
 from functools import partial
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
+    from prometheus_client import Metric
+
     from ... import external_typing as ext
 
 logger = logging.getLogger(__name__)
@@ -45,20 +46,6 @@ def __init__(
 
     @property
     def prometheus_client(self):
-        if self.multiproc and not self._imported:
-            # step 1: check environment
-            assert (
-                "prometheus_client" not in sys.modules
-            ), "prometheus_client is already imported, multiprocessing will not work properly"
-
-            assert (
-                self.multiproc_dir
-            ), f"Invalid prometheus multiproc directory: {self.multiproc_dir}"
-            assert os.path.isdir(self.multiproc_dir)
-
-            os.environ["PROMETHEUS_MULTIPROC_DIR"] = self.multiproc_dir
-
-        # step 2:
         import prometheus_client
         import prometheus_client.exposition
         import prometheus_client.metrics
@@ -173,6 +160,10 @@ def Metric(self):
         """
         A Metric family and its samples.
 
-        This is a base class to be used by instrumentation client. Custom collectors should use ``bentoml.metrics.metrics_core.GaugeMetricFamily``, ``bentoml.metrics.metrics_core.CounterMetricFamily``, ``bentoml.metrics.metrics_core.SummaryMetricFamily`` instead.
+        This is a base class to be used by the instrumentation client.
+        Custom collectors should use
+        ``prometheus_client.metrics_core.GaugeMetricFamily``,
+        ``prometheus_client.metrics_core.CounterMetricFamily``, or
+        ``prometheus_client.metrics_core.SummaryMetricFamily`` instead.
         """
         return partial(self.prometheus_client.Metric, registry=self.registry)
diff --git a/src/bentoml/metrics.py b/src/bentoml/metrics.py
index 1640c66ac1a..30580ca630f 100644
--- a/src/bentoml/metrics.py
+++ b/src/bentoml/metrics.py
@@ -1,434 +1,13 @@
-from __future__ import annotations
+import sys
+import warnings
 
-import logging
-import typing as t
-from typing import TYPE_CHECKING
+import prometheus_client
 
-from simple_di import Provide
-from simple_di import inject
+warnings.warn(
+    "bentoml.metrics module is deprecated and will be removed in the future. "
+    "Please use prometheus_client directly for metrics reporting.",
+    DeprecationWarning,
+    stacklevel=2,  # attribute the warning to the importing module, not this file
+)
 
-from ._internal.configuration.containers import BentoMLContainer
-
-if TYPE_CHECKING:
-    from ._internal.server.metrics.prometheus import PrometheusClient
-
-logger = logging.getLogger(__name__)
-
-# NOTE: We have to set our docstring here due to the fact that
-# we are lazy loading the metrics. This means that the docstring
-# won't be discovered until the metrics is initialized.
-# this won't work with help() or doocstring on Sphinx.
-# While this is less than optimal, we will do this since 'bentoml.metrics'
-# is a public API.
-_MAKE_WSGI_APP_DOCSTRING = """\
-Create a WSGI app which serves the metrics from a registry.
-
-Returns:
-    WSGIApp: A WSGI app which serves the metrics from a registry.
-"""
-_GENERATE_LATEST_DOCSTRING = """\
-Returns metrics from the registry in latest text format as a string.
-
-This function ensures that multiprocess is setup correctly.
-
-Returns:
-    str: Metrics in latest text format. Refer to `Exposition format <https://prometheus.io/docs/instrumenting/exposition_formats/#exposition-formats>`_ for details.
-"""
-_TEXT_STRING_TO_METRIC_DOCSTRING = """
-Parse Prometheus text format from a unicode string.
-
-Returns:
-    Metric: A generator that yields `Metric <https://prometheus.io/docs/concepts/metric_types/>`_ objects.
-"""
-_HISTOGRAM_DOCSTRING = """\
-A Histogram tracks the size and number of events in a given bucket.
-
-Histograms are often used to aggregatable calculation of quantiles.
-Some notable examples include measuring response latency, request size.
-
-A quick example of a Histogram:
-
-.. code-block:: python
-
-    from bentoml.metrics import Histogram
-
-    h = Histogram('request_size_bytes', 'Request size (bytes)')
-
-    @svc.api(input=JSON(), output=JSON())
-    def predict(input_data: dict[str, str]):
-        h.observe(512)  # Observe 512 (bytes)
-        ...
-
-``observe()`` will observe for given amount of time.
-Usually, this value are positive or zero. Negative values are accepted but will
-prevent current versions of Prometheus from properly detecting counter resets in the `sum of observations <https://prometheus.io/docs/practices/histograms/#count-and-sum-of-observations>`_.
-
-Histograms also provide ``time()``, which times a block of code or function, and observe for a given duration amount.
-This function can also be used as a context manager.
-
-.. tab-set::
-
-    .. tab-item:: Example
-
-        .. code-block:: python
-
-            from bentoml.metrics import Histogram
-
-            REQUEST_TIME = Histogram('response_latency_seconds', 'Response latency (seconds)')
-
-            @REQUEST_TIME.time()
-            def create_response(request):
-                body = await request.json()
-                return Response(body)
-
-    .. tab-item:: Context Manager
-
-        .. code-block:: python
-
-            from bentoml.metrics import Histogram
-
-            REQUEST_TIME = Histogram('response_latency_seconds', 'Response latency (seconds)')
-
-            def create_response(request):
-                body = await request.json()
-                with REQUEST_TIME.time():
-                    ...
-
-The default buckets are intended to cover a typical web/rpc request from milliseconds to seconds.
-See :ref:`configuration guides <guides/configuration:Configuration>` to see how to customize the buckets.
-
-Args:
-    name (str): The name of the metric.
-    documentation (str): A documentation string.
-    labelnames (tuple[str]): A tuple of strings specifying the label names for the metric. Defaults to ``()``.
-    namespace (str): The namespace of the metric. Defaults to an empty string.
-    subsystem (str): The subsystem of the metric. Defaults to an empty string.
-    unit (str): The unit of the metric. Defaults to an empty string.
-    buckets (list[float]): A list of float representing a bucket. Defaults to ``(.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0, 7.5, 10.0, INF)``.
-"""
-_COUNTER_DOCSTRING = """
-A Counter tracks counts of events or running totals.
-
-.. epigraph::
-
-    It is a cumulative metric that represents a single `monotonically increasing counter <https://prometheus.io/docs/concepts/metric_types/#counter>`_ whose value can only increase or be reset to zero on restart.
-
-Some notable examples include counting the number of requests served, tasks completed, or errors.
-
-If you need to go down, uses :func:`bentoml.metrics.Gauge` instead.
-
-A quick example of a Counter:
-
-.. code-block:: python
-
-    from bentoml.metrics import Counter
-
-    c = Counter('failures', 'Total number of failures requests')
-
-    @svc.api(input=JSON(), output=JSON())
-    def predict(input_data: dict[str, str]):
-        if input_data['fail']:
-            c.inc()  # increment by 1 by default
-
-``inc()`` can optionally pass in a ``exemplar``, which is a dictionary of keys and values, defined :github:`here <OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#exemplars>`.
-
-``inc()`` can also increment by any given amount:
-
-.. code-block:: python
-
-    c.inc(2.1)
-
-``count_exceptions()`` can be used as both a decorator and context manager to count exceptions raised.
-
-.. tab-set::
-
-    .. tab-item:: Decorator
-
-        .. code-block:: python
-
-            from bentoml.metrics import Counter
-
-            c = Counter('failures', 'Total number of failures requests')
-
-            @c.count_exceptions()
-            @svc.api(input=JSON(), output=JSON())
-            def predict(input_data: dict[str, str]):
-                if input_data['acc'] < 0.5:
-                    raise ValueError("Given data is not accurate.")
-
-    .. tab-item:: Context Manager
-
-        .. code-block:: python
-
-            from bentoml.metrics import Histogram
-
-            c = Counter('failures', 'Total number of failures requests')
-
-            @svc.api(input=JSON(), output=JSON())
-            def predict(input_data: dict[str, str]):
-                with c.count_exceptions():
-                    if input_data['acc'] < 0.5:
-                        raise ValueError("Given data is not accurate.")
-                with c.count_exceptions(RuntimeError):
-                    if input_data['output'] is None:
-                        raise RuntimeError("Given pre-processing logic is invalid")
-
-``count_exceptions()`` will optionally take in an exception to only track specific exceptions.
-
-.. code-block:: python
-
-    ...
-    with c.count_exceptions(RuntimeError):
-        if input_data['output'] is None:
-            raise RuntimeError("Given pre-processing logic is invalid")
-
-Args:
-    name (str): The name of the metric.
-    documentation (str): A documentation string.
-    labelnames (tuple[str]): A tuple of strings specifying the label names for the metric. Defaults to ``()``.
-    namespace (str): The namespace of the metric. Defaults to an empty string.
-    subsystem (str): The subsystem of the metric. Defaults to an empty string.
-    unit (str): The unit of the metric. Defaults to an empty string.
-"""
-_SUMMARY_DOCSTRING = """
-A Summary tracks the size and `samples observations (usually things like request durations and response sizes).`.
-
-While it also provides a total count of observations and a sum of all observed values,
-it calculates configurable quantiles over a sliding time window.
-
-Notable examples include request latency and response size.
-
-A quick example of a Summary:
-
-.. code-block:: python
-
-    from bentoml.metrics import Summary
-
-    s = Summary('request_size_bytes', 'Request size (bytes)')
-
-    @svc.api(input=JSON(), output=JSON())
-    def predict(input_data: dict[str, str]):
-        s.observe(512)  # Observe 512 (bytes)
-        ...
-
-``observe()`` will observe for given amount of time.
-Usually, this value are positive or zero. Negative values are accepted but will
-prevent current versions of Prometheus from properly detecting counter resets in the `sum of observations <https://prometheus.io/docs/practices/histograms/#count-and-sum-of-observations>`_.
-
-Similar to :meth:`bentoml.metrics.Histogram`, ``time()`` can also be used as a decorator or context manager.
-
-.. tab-set::
-
-    .. tab-item:: Example
-
-        .. code-block:: python
-
-            from bentoml.metrics import Histogram
-
-            s = Summary('response_latency_seconds', 'Response latency (seconds)')
-
-            @s.time()
-            def create_response(request):
-                body = await request.json()
-                return Response(body)
-
-    .. tab-item:: Context Manager
-
-        .. code-block:: python
-
-            from bentoml.metrics import Histogram
-
-            s = Summary('response_latency_seconds', 'Response latency (seconds)')
-
-            def create_response(request):
-                body = await request.json()
-                with s.time():
-                    ...
-
-Args:
-    name (str): The name of the metric.
-    documentation (str): A documentation string.
-    labelnames (tuple[str]): A tuple of strings specifying the label names for the metric. Defaults to ``()``.
-    namespace (str): The namespace of the metric. Defaults to an empty string.
-    subsystem (str): The subsystem of the metric. Defaults to an empty string.
-    unit (str): The unit of the metric. Defaults to an empty string.
-"""
-_GAUGE_DOCSTRING = """
-A Gauge represents a single numerical value that can arbitrarily go up and down.
-
-Gauges are typically used to for report instantaneous values like temperatures or current memory usage.
-One can think of Gauge as a :meth:`bentoml.metrics.Counter` that can go up and down.
-
-Notable examples include in-progress requests, number of item in a queue, and free memory.
-
-A quick example of a Gauge:
-
-.. code-block:: python
-
-    from bentoml.metrics import Gauge
-
-    g = Gauge('inprogress_request', 'Request inprogress')
-
-    @svc.api(input=JSON(), output=JSON())
-    def predict(input_data: dict[str, str]):
-        g.inc()  # increment by 1 by default
-        g.dec(10) # decrement by any given value
-        g.set(0)  # set to a given value
-        ...
-
-.. note::
-
-    By default, ``inc()`` and ``dec()`` will increment and decrement by 1 respectively.
-
-Gauge also provide ``track_inprogress()``, to track inprogress object.
-This function can also be used as either a context manager or a decorator.
-
-.. tab-set::
-
-    .. tab-item:: Example
-
-        .. code-block:: python
-
-            from bentoml.metrics import Gauge
-
-            g = Gauge('inprogress_request', 'Request inprogress')
-
-            @svc.api(input=JSON(), output=JSON())
-            @g.track_inprogress()
-            def predict(input_data: dict[str, str]):
-                ...
-
-    .. tab-item:: Context Manager
-
-        .. code-block:: python
-
-            from bentoml.metrics import Gauge
-
-            g = Gauge('inprogress_request', 'Request inprogress')
-
-            @svc.api(input=JSON(), output=JSON())
-            def predict(input_data: dict[str, str]):
-                with g.track_inprogress():
-                    ...
-
-        The gauge will increment when the context is entered and decrement when the context is exited.
-
-Args:
-    name (str): The name of the metric.
-    documentation (str): A documentation string.
-    labelnames (tuple[str]): A tuple of strings specifying the label names for the metric. Defaults to ``()``.
-    namespace (str): The namespace of the metric. Defaults to an empty string.
-    subsystem (str): The subsystem of the metric. Defaults to an empty string.
-    unit (str): The unit of the metric. Defaults to an empty string.
-    multiprocess_mode (str): The multiprocess mode of the metric. Defaults to ``all``. Available options
-                             are (``all``, ``min``, ``max``, ``livesum``, ``liveall``)
-"""
-
-# This sets of functions are implemented in the PrometheusClient class
-_INTERNAL_FN_IMPL = {
-    "make_wsgi_app": _MAKE_WSGI_APP_DOCSTRING,
-    "generate_latest": _GENERATE_LATEST_DOCSTRING,
-    "text_string_to_metric_families": _TEXT_STRING_TO_METRIC_DOCSTRING,
-}
-_NOT_IMPLEMENTED = [
-    "delete_from_gateway",
-    "instance_ip_grouping_key",
-    "push_to_gateway",
-    "pushadd_to_gateway",
-]
-_NOT_SUPPORTED = [
-    "GC_COLLECTOR",
-    "GCCollector",
-    "PLATFORM_COLLECTOR",
-    "PlatformCollector",
-    "PROCESS_COLLECTOR",
-    "ProcessCollector",
-    "REGISTRY",
-    "CONTENT_TYPE_LATEST",
-    "start_http_server",
-    "start_wsgi_server",
-    "make_asgi_app",
-    "write_to_textfile",
-] + _NOT_IMPLEMENTED
-_docstring = {
-    "Counter": _COUNTER_DOCSTRING,
-    "Histogram": _HISTOGRAM_DOCSTRING,
-    "Summary": _SUMMARY_DOCSTRING,
-    "Gauge": _GAUGE_DOCSTRING,
-}
-_docstring.update(_INTERNAL_FN_IMPL)
-
-
-def __dir__() -> list[str]:
-    # This is for IPython and IDE autocompletion.
-    metrics_client = BentoMLContainer.metrics_client.get()
-    return list(set(dir(metrics_client.prometheus_client)) - set(_NOT_SUPPORTED))
-
-
-def __getattr__(item: t.Any):
-    if item in _NOT_SUPPORTED:
-        raise NotImplementedError(
-            f"{item} is not supported when using '{__name__}'. See https://docs.bentoml.com/en/latest/reference/metrics.html."
-        )
-    # This is the entrypoint for all bentoml.metrics.*
-    return _LazyMetric(item, docstring=_docstring.get(item))
-
-
-class _LazyMetric:
-    __slots__ = ("_attr", "_proxy", "_initialized", "_args", "_kwargs", "__doc__")
-
-    def __init__(self, attr: str, docstring: str | None = None):
-        self._attr = attr
-        self.__doc__ = docstring
-        self._proxy = None
-        self._initialized = False
-        self._args: tuple[t.Any, ...] = ()
-        self._kwargs: dict[str, t.Any] = {}
-
-    def __call__(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
-        """
-        Lazily initialize the metrics object.
-
-        Args:
-            *args: Arguments to pass to the metrics object.
-            **kwargs: Keyword arguments to pass to the metrics object.
-        """
-        if "registry" in kwargs:
-            raise ValueError(
-                f"'registry' should not be passed when using '{__name__}.{self._attr}'. See https://docs.bentoml.com/en/latest/reference/metrics.html."
-            )
-        self._args = args
-        self._kwargs = kwargs
-        if self._attr in _INTERNAL_FN_IMPL:
-            # first-class function implementation from BentoML Prometheus client.
-            # In this case, the function will be called directly.
-            return self._load_proxy()
-        return self
-
-    def __getattr__(self, item: t.Any) -> t.Any:
-        if item in self.__slots__:
-            raise AttributeError(f"Attribute {item} is private to {self}.")
-        if self._proxy is None:
-            self._proxy = self._load_proxy()
-        assert self._initialized and self._proxy is not None
-        return getattr(self._proxy, item)
-
-    def __dir__(self) -> list[str]:
-        if self._proxy is None:
-            self._proxy = self._load_proxy()
-        assert self._initialized and self._proxy is not None
-        return dir(self._proxy)
-
-    @inject
-    def _load_proxy(
-        self,
-        metrics_client: PrometheusClient = Provide[BentoMLContainer.metrics_client],
-    ) -> None:
-        client_impl = (
-            metrics_client
-            if self._attr in dir(metrics_client)
-            else metrics_client.prometheus_client
-        )
-        proxy = getattr(client_impl, self._attr)(*self._args, **self._kwargs)
-        self._initialized = True
-        return proxy
+sys.modules[__name__] = prometheus_client
diff --git a/src/bentoml_cli/worker/grpc_api_server.py b/src/bentoml_cli/worker/grpc_api_server.py
index 1610fc594f1..142c5491d3c 100644
--- a/src/bentoml_cli/worker/grpc_api_server.py
+++ b/src/bentoml_cli/worker/grpc_api_server.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import json
+import os
 import typing as t
 
 import click
@@ -125,6 +126,9 @@ def main(
     BentoMLContainer.development_mode.set(development_mode)
     if prometheus_dir is not None:
         BentoMLContainer.prometheus_multiproc_dir.set(prometheus_dir)
+    os.environ["PROMETHEUS_MULTIPROC_DIR"] = (
+        BentoMLContainer.prometheus_multiproc_dir.get()
+    )
     if runner_map is not None:
         BentoMLContainer.remote_runner_mapping.set(json.loads(runner_map))
 
diff --git a/src/bentoml_cli/worker/grpc_prometheus_server.py b/src/bentoml_cli/worker/grpc_prometheus_server.py
index 9c1a358aec2..6737662782c 100644
--- a/src/bentoml_cli/worker/grpc_prometheus_server.py
+++ b/src/bentoml_cli/worker/grpc_prometheus_server.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 import typing as t
 from typing import TYPE_CHECKING
 
@@ -68,7 +69,9 @@ def main(fd: int, backlog: int, prometheus_dir: str | None):
     metrics_client = BentoMLContainer.metrics_client.get()
     if prometheus_dir is not None:
         BentoMLContainer.prometheus_multiproc_dir.set(prometheus_dir)
-
+    os.environ["PROMETHEUS_MULTIPROC_DIR"] = (
+        BentoMLContainer.prometheus_multiproc_dir.get()
+    )
     # create a ASGI app that wraps around the default HTTP prometheus server.
     prom_app = Starlette(
         debug=get_debug_mode(), middleware=[Middleware(GenerateLatestMiddleware)]
diff --git a/src/bentoml_cli/worker/http_api_server.py b/src/bentoml_cli/worker/http_api_server.py
index 31ddc54d578..a391d03c33e 100644
--- a/src/bentoml_cli/worker/http_api_server.py
+++ b/src/bentoml_cli/worker/http_api_server.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import json
+import os
 import socket
 import typing as t
 
@@ -152,7 +153,9 @@ def main(
     BentoMLContainer.development_mode.set(development_mode)
     if prometheus_dir is not None:
         BentoMLContainer.prometheus_multiproc_dir.set(prometheus_dir)
-
+    os.environ["PROMETHEUS_MULTIPROC_DIR"] = (
+        BentoMLContainer.prometheus_multiproc_dir.get()
+    )
     if runner_map is not None:
         BentoMLContainer.remote_runner_mapping.set(json.loads(runner_map))
     if timeout is not None:
diff --git a/src/bentoml_cli/worker/runner.py b/src/bentoml_cli/worker/runner.py
index 56147cfa156..cae3ab724fd 100644
--- a/src/bentoml_cli/worker/runner.py
+++ b/src/bentoml_cli/worker/runner.py
@@ -106,7 +106,9 @@ def main(
 
     if prometheus_dir is not None:
         BentoMLContainer.prometheus_multiproc_dir.set(prometheus_dir)
-
+    os.environ["PROMETHEUS_MULTIPROC_DIR"] = (
+        BentoMLContainer.prometheus_multiproc_dir.get()
+    )
     if no_access_log:
         access_log_config = BentoMLContainer.runners_config.logging.access
         access_log_config.enabled.set(False)
diff --git a/tests/e2e/bento_server_grpc/service.py b/tests/e2e/bento_server_grpc/service.py
index 587f4a39810..5c265bd2e78 100644
--- a/tests/e2e/bento_server_grpc/service.py
+++ b/tests/e2e/bento_server_grpc/service.py
@@ -4,6 +4,7 @@
 import typing as t
 from typing import TYPE_CHECKING
 
+import prometheus_client
 from context_server_interceptor import AsyncContextInterceptor
 from pydantic import BaseModel
 
@@ -171,7 +172,7 @@ async def echo_image(f: PIL.Image.Image) -> NDArray[t.Any]:
     return np.array(f)
 
 
-histogram = bentoml.metrics.Histogram(
+histogram = prometheus_client.Histogram(
     name="inference_latency",
     documentation="Inference latency in seconds",
     labelnames=["model_name", "model_version"],
@@ -199,9 +200,12 @@ async def predict_multi_images(original: Image, compared: Image):
 
 @svc.api(input=bentoml.io.Text(), output=bentoml.io.Text())
 def ensure_metrics_are_registered(_: str) -> None:
+    from prometheus_client import generate_latest
+    from prometheus_client.parser import text_string_to_metric_families
+
     histograms = [
         m.name
-        for m in bentoml.metrics.text_string_to_metric_families()
+        for m in text_string_to_metric_families(generate_latest().decode())
         if m.type == "histogram"
     ]
     assert "inference_latency" in histograms
diff --git a/tests/e2e/bento_server_http/service.py b/tests/e2e/bento_server_http/service.py
index 534a1c4ed7e..c9a1e65bd46 100644
--- a/tests/e2e/bento_server_http/service.py
+++ b/tests/e2e/bento_server_http/service.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 import pandas as pd
+import prometheus_client
 import pydantic
 from fastapi import FastAPI
 from PIL.Image import Image as PILImage
@@ -59,7 +60,7 @@ async def count_text_stream(self, input_text: str) -> t.AsyncGenerator[str, None
 TEST_DIR = os.getenv("BENTOML_TEST_DATA")
 
 
-metric_test = bentoml.metrics.Counter(
+metric_test = prometheus_client.Counter(
     name="test_metrics", documentation="Counter test metric"
 )
 
@@ -78,9 +79,12 @@ async def echo_delay(data: dict[str, t.Any]) -> JSONSerializable:
 
 @svc.api(input=bentoml.io.Text(), output=bentoml.io.Text())
 def ensure_metrics_are_registered(data: str) -> str:  # pylint: disable=unused-argument
+    from prometheus_client import generate_latest
+    from prometheus_client.parser import text_string_to_metric_families
+
     counters = [
         m.name
-        for m in bentoml.metrics.text_string_to_metric_families()
+        for m in text_string_to_metric_families(generate_latest().decode())
         if m.type == "counter"
     ]
     assert "test_metrics" in counters
diff --git a/tests/unit/test_metrics.py b/tests/unit/test_metrics.py
deleted file mode 100644
index d243671e2bd..00000000000
--- a/tests/unit/test_metrics.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from __future__ import annotations
-
-import bentoml
-
-
-def test_metrics_initialization():
-    o = bentoml.metrics.Gauge(name="test_metrics", documentation="test")
-    assert isinstance(o, bentoml.metrics._LazyMetric)
-    assert o._proxy is None
-    o = bentoml.metrics.Histogram(name="test_metrics", documentation="test")
-    assert isinstance(o, bentoml.metrics._LazyMetric)
-    assert o._proxy is None
-    o = bentoml.metrics.Counter(name="test_metrics", documentation="test")
-    assert isinstance(o, bentoml.metrics._LazyMetric)
-    assert o._proxy is None
-    o = bentoml.metrics.Summary(name="test_metrics", documentation="test")
-    assert isinstance(o, bentoml.metrics._LazyMetric)
-    assert o._proxy is None