From 56fa3fa37f4ca0128a596288a3944c58608d9832 Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Sun, 14 Jul 2024 19:15:21 +0200 Subject: [PATCH 01/16] Extract retry mechanic from OTLPExporterMixin This is the first change in a chain of commits to rework the retry mechanic. It is based on the work of #3764 and basically trying to land the changes proposed by this monolithic commit step by step. The plan is roughly to proceed in these steps: * Extract retry mechanic from GRPC exporters * Consolidate HTTP with GRPC exporter retry implementation * Pipe timeout through RetryingExporter * Make exporter lock protect the whole export instead of just a single iteration * Make timeout float instead of int * Add back-off with jitter It's pretty likely that the plan will change along the way. --- .../exporter/otlp/proto/common/exporter.py | 80 ++++++++ .../otlp/proto/grpc/_log_exporter/__init__.py | 2 +- .../exporter/otlp/proto/grpc/exporter.py | 172 +++++++----------- .../proto/grpc/metric_exporter/__init__.py | 6 +- .../proto/grpc/trace_exporter/__init__.py | 27 ++- .../tests/logs/test_otlp_logs_exporter.py | 8 +- .../tests/test_otlp_exporter_mixin.py | 34 ++-- .../tests/test_otlp_metrics_exporter.py | 18 +- .../tests/test_otlp_trace_exporter.py | 12 +- 9 files changed, 204 insertions(+), 155 deletions(-) create mode 100644 exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py new file mode 100644 index 00000000000..cc05fdf7dc6 --- /dev/null +++ b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py @@ -0,0 +1,80 @@ +# Copyright The OpenTelemetry Authors +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import threading +from logging import getLogger +from time import sleep +from typing import Callable, Generic, Optional, Type, TypeVar + +from ._internal import _create_exp_backoff_generator + +ExportResultT = TypeVar("ExportResultT", covariant=True) +ExportPayloadT = TypeVar("ExportPayloadT", covariant=True) + +_logger = getLogger(__name__) + + +class RetryableExportError(Exception): + def __init__(self, retry_delay_sec: Optional[int]): + super().__init__() + self.retry_delay_sec = retry_delay_sec + + +class RetryingExporter(Generic[ExportResultT]): + def __init__( + self, + export_function: Callable[[ExportPayloadT], ExportResultT], + result: Type[ExportResultT], + ): + self._export_function = export_function + self._result = result + + self._shutdown = False + self._export_lock = threading.Lock() + + def shutdown(self, timeout_millis: float = 30_000) -> None: + # wait for the last export if any + self._export_lock.acquire( # pylint: disable=consider-using-with + timeout=timeout_millis / 1e3 + ) + self._shutdown = True + self._export_lock.release() + + def export_with_retry(self, payload: ExportPayloadT) -> ExportResultT: + # After the call to shutdown, subsequent calls to Export are + # not allowed and should return a Failure result. + if self._shutdown: + _logger.warning("Exporter already shutdown, ignoring batch") + return self._result.FAILURE + + max_value = 64 + # expo returns a generator that yields delay values which grow + # exponentially. Once delay is greater than max_value, the yielded + # value will remain constant. + for delay in _create_exp_backoff_generator(max_value=max_value): + if delay == max_value or self._shutdown: + return self._result.FAILURE + + with self._export_lock: + try: + return self._export_function(payload) + except RetryableExportError as exc: + delay_sec = ( + exc.retry_delay_sec + if exc.retry_delay_sec is not None + else delay + ) + _logger.warning("Retrying in %ss", delay_sec) + sleep(delay_sec) + + return self._result.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py index d8f2ba2efb6..4c02b1346ff 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py @@ -110,7 +110,7 @@ def _translate_data( return encode_logs(data) def export(self, batch: Sequence[LogData]) -> LogExportResult: - return self._export(batch) + return self._exporter.export_with_retry(batch) def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: OTLPExporterMixin.shutdown(self, timeout_millis=timeout_millis) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index b07d24e0d0e..6d48201e877 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -14,32 +14,15 @@ """OTLP Exporter""" -import threading from abc import ABC, abstractmethod -from collections.abc import Sequence # noqa: F401 from logging import getLogger from os import environ -from time import sleep -from typing import ( # noqa: F401 - Any, - Callable, - Dict, - Generic, - List, - Optional, - Tuple, - Union, -) +from typing import Any, Callable, Dict, Generic, List, Optional # noqa: F401 from typing import Sequence as TypingSequence -from typing import TypeVar +from typing import Tuple, TypeVar, Union # noqa: F401 from urllib.parse import urlparse from deprecated import deprecated - -from opentelemetry.exporter.otlp.proto.common._internal import ( - _get_resource_data, - _create_exp_backoff_generator, -) from google.rpc.error_details_pb2 import RetryInfo from grpc import ( ChannelCredentials, @@ -51,9 +34,14 @@ ssl_channel_credentials, ) -from opentelemetry.exporter.otlp.proto.grpc import ( - _OTLP_GRPC_HEADERS, +from opentelemetry.exporter.otlp.proto.common._internal import ( + _get_resource_data, +) +from opentelemetry.exporter.otlp.proto.common.exporter import ( + RetryableExportError, + RetryingExporter, ) +from opentelemetry.exporter.otlp.proto.grpc import _OTLP_GRPC_HEADERS from opentelemetry.proto.common.v1.common_pb2 import ( # noqa: F401 AnyValue, ArrayValue, @@ -260,8 +248,8 @@ def __init__( ) ) - self._export_lock = threading.Lock() self._shutdown = False + self._exporter = RetryingExporter(self._export, self._result) @abstractmethod def _translate_data( @@ -272,98 +260,72 @@ def _translate_data( def _export( self, data: Union[TypingSequence[ReadableSpan], MetricsData] ) -> ExportResultT: - # After the call to shutdown, subsequent calls to Export are - # not allowed and should return a Failure result. - if self._shutdown: - logger.warning("Exporter already shutdown, ignoring batch") - return self._result.FAILURE + try: + self._client.Export( + request=self._translate_data(data), + metadata=self._headers, + timeout=self._timeout, + ) - # FIXME remove this check if the export type for traces - # gets updated to a class that represents the proto - # TracesData and use the code below instead. - # logger.warning( - # "Transient error %s encountered while exporting %s, retrying in %ss.", - # error.code(), - # data.__class__.__name__, - # delay, - # ) - max_value = 64 - # expo returns a generator that yields delay values which grow - # exponentially. Once delay is greater than max_value, the yielded - # value will remain constant. - for delay in _create_exp_backoff_generator(max_value=max_value): - if delay == max_value or self._shutdown: - return self._result.FAILURE - - with self._export_lock: - try: - self._client.Export( - request=self._translate_data(data), - metadata=self._headers, - timeout=self._timeout, + return self._result.SUCCESS + + except RpcError as error: + if error.code() in [ + StatusCode.CANCELLED, + StatusCode.DEADLINE_EXCEEDED, + StatusCode.RESOURCE_EXHAUSTED, + StatusCode.ABORTED, + StatusCode.OUT_OF_RANGE, + StatusCode.UNAVAILABLE, + StatusCode.DATA_LOSS, + ]: + retry_info_bin = dict(error.trailing_metadata()).get( + "google.rpc.retryinfo-bin" + ) + if retry_info_bin is not None: + retry_info = RetryInfo() + retry_info.ParseFromString(retry_info_bin) + delay = ( + retry_info.retry_delay.seconds + + retry_info.retry_delay.nanos / 1.0e9 ) + else: + delay = None + + logger.warning( + ( + "Transient error %s encountered while exporting " + "%s to %s" + ), + error.code(), + self._exporting, + self._endpoint, + ) + raise RetryableExportError(delay) + + logger.error( + "Failed to export %s to %s, error code: %s", + self._exporting, + self._endpoint, + error.code(), + exc_info=error.code() == StatusCode.UNKNOWN, + ) - return self._result.SUCCESS - - except RpcError as error: - - if error.code() in [ - StatusCode.CANCELLED, - StatusCode.DEADLINE_EXCEEDED, - StatusCode.RESOURCE_EXHAUSTED, - StatusCode.ABORTED, - StatusCode.OUT_OF_RANGE, - StatusCode.UNAVAILABLE, - StatusCode.DATA_LOSS, - ]: - - retry_info_bin = dict(error.trailing_metadata()).get( - "google.rpc.retryinfo-bin" - ) - if retry_info_bin is not None: - retry_info = RetryInfo() - retry_info.ParseFromString(retry_info_bin) - delay = ( - retry_info.retry_delay.seconds - + retry_info.retry_delay.nanos / 1.0e9 - ) - - logger.warning( - ( - "Transient error %s encountered while exporting " - "%s to %s, retrying in %ss." - ), - error.code(), - self._exporting, - self._endpoint, - delay, - ) - sleep(delay) - continue - else: - logger.error( - "Failed to export %s to %s, error code: %s", - self._exporting, - self._endpoint, - error.code(), - exc_info=error.code() == StatusCode.UNKNOWN, - ) - - if error.code() == StatusCode.OK: - return self._result.SUCCESS - - return self._result.FAILURE - - return self._result.FAILURE + if error.code() == StatusCode.OK: + return self._result.SUCCESS + + return self._result.FAILURE + + @abstractmethod + def export(self, data) -> ExportResultT: + pass def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: if self._shutdown: logger.warning("Exporter already shutdown, ignoring call") return - # wait for the last export if any - self._export_lock.acquire(timeout=timeout_millis / 1e3) + self._exporter.shutdown(timeout_millis=timeout_millis) self._shutdown = True - self._export_lock.release() @property @abstractmethod diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py index 645885b6f28..a6220fb8594 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py @@ -162,12 +162,14 @@ def export( ) -> MetricExportResult: # TODO(#2663): OTLPExporterMixin should pass timeout to gRPC if self._max_export_batch_size is None: - return self._export(data=metrics_data) + return self._exporter.export_with_retry(metrics_data) export_result = MetricExportResult.SUCCESS for split_metrics_data in self._split_metrics_data(metrics_data): - split_export_result = self._export(data=split_metrics_data) + split_export_result = self._exporter.export_with_retry( + split_metrics_data + ) if split_export_result is MetricExportResult.FAILURE: export_result = MetricExportResult.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py index ce8dcaabdf7..41ca51cf27b 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py @@ -15,21 +15,17 @@ import logging from os import environ -from typing import Dict, Optional, Sequence, Tuple, Union -from typing import Sequence as TypingSequence - +from typing import Dict, Optional +from typing import Sequence +from typing import Tuple, Union from grpc import ChannelCredentials, Compression -from opentelemetry.exporter.otlp.proto.common.trace_encoder import ( - encode_spans, -) -from opentelemetry.exporter.otlp.proto.grpc.exporter import ( +from opentelemetry.exporter.otlp.proto.common.trace_encoder import encode_spans +from opentelemetry.exporter.otlp.proto.grpc.exporter import ( # noqa: F401 OTLPExporterMixin, _get_credentials, environ_to_compression, -) -from opentelemetry.exporter.otlp.proto.grpc.exporter import ( # noqa: F401 get_resource_data, ) from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import ( @@ -41,12 +37,14 @@ from opentelemetry.proto.common.v1.common_pb2 import ( # noqa: F401 InstrumentationScope, ) -from opentelemetry.proto.trace.v1.trace_pb2 import ( # noqa: F401 - ScopeSpans, +from opentelemetry.proto.trace.v1.trace_pb2 import Status # noqa: F401 +from opentelemetry.proto.trace.v1.trace_pb2 import ( # noqa: F40 ResourceSpans, + ScopeSpans, +) +from opentelemetry.proto.trace.v1.trace_pb2 import ( # noqa: F401 Span as CollectorSpan, ) -from opentelemetry.proto.trace.v1.trace_pb2 import Status # noqa: F401 from opentelemetry.sdk.environment_variables import ( OTEL_EXPORTER_OTLP_TRACES_CLIENT_CERTIFICATE, OTEL_EXPORTER_OTLP_TRACES_CLIENT_KEY, @@ -91,12 +89,11 @@ def __init__( insecure: Optional[bool] = None, credentials: Optional[ChannelCredentials] = None, headers: Optional[ - Union[TypingSequence[Tuple[str, str]], Dict[str, str], str] + Union[Sequence[Tuple[str, str]], Dict[str, str], str] ] = None, timeout: Optional[int] = None, compression: Optional[Compression] = None, ): - if insecure is None: insecure = environ.get(OTEL_EXPORTER_OTLP_TRACES_INSECURE) if insecure is not None: @@ -143,7 +140,7 @@ def _translate_data( return encode_spans(data) def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: - return self._export(spans) + return self._exporter.export_with_retry(spans) def shutdown(self) -> None: OTLPExporterMixin.shutdown(self) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py index fc2211c5aeb..8800f23096a 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py @@ -389,9 +389,9 @@ def test_otlp_headers_from_env(self): ) @patch( - "opentelemetry.exporter.otlp.proto.grpc.exporter._create_exp_backoff_generator" + "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) - @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.sleep") + @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") def test_unavailable(self, mock_sleep, mock_expo): mock_expo.configure_mock(**{"return_value": [0.01]}) @@ -405,9 +405,9 @@ def test_unavailable(self, mock_sleep, mock_expo): mock_sleep.assert_called_with(0.01) @patch( - "opentelemetry.exporter.otlp.proto.grpc.exporter._create_exp_backoff_generator" + "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) - @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.sleep") + @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") def test_unavailable_delay(self, mock_sleep, mock_expo): mock_expo.configure_mock(**{"return_value": [1]}) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index f7bbdabb11f..9a8a3598d9f 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -63,7 +63,7 @@ def test_environ_to_compression(self): environ_to_compression("test_invalid") @patch( - "opentelemetry.exporter.otlp.proto.grpc.exporter._create_exp_backoff_generator" + "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) def test_export_warning(self, mock_expo): mock_expo.configure_mock(**{"return_value": [0]}) @@ -86,6 +86,9 @@ def _translate_data( ) -> ExportServiceRequestT: pass + def export(self, data): + return self._exporter.export_with_retry(data) + @property def _exporting(self) -> str: return "mock" @@ -93,8 +96,7 @@ def _exporting(self) -> str: otlp_mock_exporter = OTLPMockExporter() with self.assertLogs(level=WARNING) as warning: - # pylint: disable=protected-access - otlp_mock_exporter._export(Mock()) + otlp_mock_exporter.export(Mock()) self.assertEqual( warning.records[0].message, "Failed to export mock to localhost:4317, error code: None", @@ -110,15 +112,15 @@ def trailing_metadata(self): rpc_error.trailing_metadata = MethodType(trailing_metadata, rpc_error) with self.assertLogs(level=WARNING) as warning: - # pylint: disable=protected-access - otlp_mock_exporter._export([]) + otlp_mock_exporter.export([]) self.assertEqual( warning.records[0].message, ( "Transient error StatusCode.CANCELLED encountered " - "while exporting mock to localhost:4317, retrying in 0s." + "while exporting mock to localhost:4317" ), ) + self.assertEqual(warning.records[1].message, "Retrying in 0s") def test_shutdown(self): result_mock = Mock() @@ -132,6 +134,9 @@ def _translate_data( ) -> ExportServiceRequestT: pass + def export(self, data): + return self._exporter.export_with_retry(data) + @property def _exporting(self) -> str: return "mock" @@ -139,14 +144,12 @@ def _exporting(self) -> str: otlp_mock_exporter = OTLPMockExporter() with self.assertLogs(level=WARNING) as warning: - # pylint: disable=protected-access self.assertEqual( - otlp_mock_exporter._export(data={}), result_mock.SUCCESS + otlp_mock_exporter.export(data={}), result_mock.SUCCESS ) otlp_mock_exporter.shutdown() - # pylint: disable=protected-access self.assertEqual( - otlp_mock_exporter._export(data={}), result_mock.FAILURE + otlp_mock_exporter.export(data={}), result_mock.FAILURE ) self.assertEqual( warning.records[0].message, @@ -181,6 +184,9 @@ def _translate_data( ) -> ExportServiceRequestT: pass + def export(self, data): + return self._exporter.export_with_retry(data) + @property def _exporting(self) -> str: return "mock" @@ -189,12 +195,12 @@ def _exporting(self) -> str: # pylint: disable=protected-access export_thread = threading.Thread( - target=otlp_mock_exporter._export, args=({},) + target=otlp_mock_exporter.export, args=({},) ) export_thread.start() try: # pylint: disable=protected-access - self.assertTrue(otlp_mock_exporter._export_lock.locked()) + self.assertTrue(otlp_mock_exporter._exporter._export_lock.locked()) # delay is 1 second while the default shutdown timeout is 30_000 milliseconds start_time = time_ns() otlp_mock_exporter.shutdown() @@ -203,6 +209,8 @@ def _exporting(self) -> str: # pylint: disable=protected-access self.assertTrue(otlp_mock_exporter._shutdown) # pylint: disable=protected-access - self.assertFalse(otlp_mock_exporter._export_lock.locked()) + self.assertFalse( + otlp_mock_exporter._exporter._export_lock.locked() + ) finally: export_thread.join() diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py index f9f9427b776..fbfd49e48ed 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py @@ -443,7 +443,7 @@ def test_otlp_exporter_endpoint(self, mock_secure, mock_insecure): # pylint: disable=no-self-use @patch( - "opentelemetry.exporter.otlp.proto.grpc.exporter._create_exp_backoff_generator" + "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.insecure_channel") @patch.dict("os.environ", {OTEL_EXPORTER_OTLP_COMPRESSION: "gzip"}) @@ -481,9 +481,9 @@ def test_otlp_exporter_otlp_compression_unspecified( ) @patch( - "opentelemetry.exporter.otlp.proto.grpc.exporter._create_exp_backoff_generator" + "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) - @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.sleep") + @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") def test_unavailable(self, mock_sleep, mock_expo): mock_expo.configure_mock(**{"return_value": [0.01]}) @@ -498,9 +498,9 @@ def test_unavailable(self, mock_sleep, mock_expo): mock_sleep.assert_called_with(0.01) @patch( - "opentelemetry.exporter.otlp.proto.grpc.exporter._create_exp_backoff_generator" + "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) - @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.sleep") + @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") def test_unavailable_delay(self, mock_sleep, mock_expo): mock_expo.configure_mock(**{"return_value": [1]}) @@ -515,9 +515,9 @@ def test_unavailable_delay(self, mock_sleep, mock_expo): mock_sleep.assert_called_with(0.01) @patch( - "opentelemetry.exporter.otlp.proto.grpc.exporter._create_exp_backoff_generator" + "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) - @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.sleep") + @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.logger.error") def test_unknown_logs(self, mock_logger_error, mock_sleep, mock_expo): @@ -863,7 +863,7 @@ def test_shutdown_wait_last_export(self): export_thread.start() try: # pylint: disable=protected-access - self.assertTrue(self.exporter._export_lock.locked()) + self.assertTrue(self.exporter._exporter._export_lock.locked()) # delay is 4 seconds while the default shutdown timeout is 30_000 milliseconds start_time = time_ns() self.exporter.shutdown() @@ -872,7 +872,7 @@ def test_shutdown_wait_last_export(self): # pylint: disable=protected-access self.assertTrue(self.exporter._shutdown) # pylint: disable=protected-access - self.assertFalse(self.exporter._export_lock.locked()) + self.assertFalse(self.exporter._exporter._export_lock.locked()) finally: export_thread.join() diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py index d618ffb13a3..101567d6ca1 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py @@ -522,9 +522,9 @@ def test_otlp_headers(self, mock_ssl_channel, mock_secure): ) @patch( - "opentelemetry.exporter.otlp.proto.grpc.exporter._create_exp_backoff_generator" + "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) - @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.sleep") + @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") def test_unavailable(self, mock_sleep, mock_expo): mock_expo.configure_mock(**{"return_value": [0.01]}) @@ -537,9 +537,9 @@ def test_unavailable(self, mock_sleep, mock_expo): mock_sleep.assert_called_with(0.01) @patch( - "opentelemetry.exporter.otlp.proto.grpc.exporter._create_exp_backoff_generator" + "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) - @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.sleep") + @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") def test_unavailable_delay(self, mock_sleep, mock_expo): mock_expo.configure_mock(**{"return_value": [1]}) @@ -999,7 +999,7 @@ def test_shutdown_wait_last_export(self): export_thread.start() try: # pylint: disable=protected-access - self.assertTrue(self.exporter._export_lock.locked()) + self.assertTrue(self.exporter._exporter._export_lock.locked()) # delay is 4 seconds while the default shutdown timeout is 30_000 milliseconds start_time = time_ns() self.exporter.shutdown() @@ -1008,7 +1008,7 @@ def test_shutdown_wait_last_export(self): # pylint: disable=protected-access self.assertTrue(self.exporter._shutdown) # pylint: disable=protected-access - self.assertFalse(self.exporter._export_lock.locked()) + self.assertFalse(self.exporter._exporter._export_lock.locked()) finally: export_thread.join() From 5596c072d6fa0e620b629a128525cf546e65ee68 Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Sun, 14 Jul 2024 20:22:32 +0200 Subject: [PATCH 02/16] Use RetryingExporter in HTTP log exporter This unifies the implementation of the OTLP exporters and the HTTP log exporter. Next step is to consolidate the remaining HTTP exporters. Fixes #4043. --- .../otlp/proto/http/_log_exporter/__init__.py | 55 ++++++++----------- .../tests/test_proto_log_exporter.py | 10 ++-- 2 files changed, 29 insertions(+), 36 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index 597b012a49a..d36588130d3 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -18,12 +18,12 @@ from io import BytesIO from os import environ from typing import Dict, Optional, Sequence -from time import sleep import requests -from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, +from opentelemetry.exporter.otlp.proto.common.exporter import ( + RetryableExportError, + RetryingExporter, ) from opentelemetry.exporter.otlp.proto.common._log_encoder import encode_logs from opentelemetry.sdk.environment_variables import ( @@ -124,6 +124,7 @@ def __init__( {"Content-Encoding": self._compression.value} ) self._shutdown = False + self._exporter = RetryingExporter(self._export, LogExportResult) def _export(self, serialized_data: bytes): data = serialized_data @@ -135,7 +136,7 @@ def _export(self, serialized_data: bytes): elif self._compression == Compression.Deflate: data = zlib.compress(serialized_data) - return self._session.post( + resp = self._session.post( url=self._endpoint, data=data, verify=self._certificate_file, @@ -143,6 +144,23 @@ def _export(self, serialized_data: bytes): cert=self._client_cert, ) + if resp.ok: + return LogExportResult.SUCCESS + + if self._retryable(resp): + _logger.warning( + "Transient error %s encountered while exporting logs batch.", + resp.reason, + ) + raise RetryableExportError(None) + + _logger.error( + "Failed to export logs batch code: %s, reason: %s", + resp.status_code, + resp.text, + ) + return LogExportResult.FAILURE + @staticmethod def _retryable(resp: requests.Response) -> bool: if resp.status_code == 408: @@ -159,34 +177,7 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult: return LogExportResult.FAILURE serialized_data = encode_logs(batch).SerializeToString() - - for delay in _create_exp_backoff_generator( - max_value=self._MAX_RETRY_TIMEOUT - ): - - if delay == self._MAX_RETRY_TIMEOUT: - return LogExportResult.FAILURE - - resp = self._export(serialized_data) - # pylint: disable=no-else-return - if resp.ok: - return LogExportResult.SUCCESS - elif self._retryable(resp): - _logger.warning( - "Transient error %s encountered while exporting logs batch, retrying in %ss.", - resp.reason, - delay, - ) - sleep(delay) - continue - else: - _logger.error( - "Failed to export logs batch code: %s, reason: %s", - resp.status_code, - resp.text, - ) - return LogExportResult.FAILURE - return LogExportResult.FAILURE + return self._exporter.export_with_retry(serialized_data) def force_flush(self, timeout_millis: float = 10_000) -> bool: """Nothing is buffered in this exporter, so this method does nothing.""" diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py index f5606794620..d2e443580aa 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py @@ -270,7 +270,7 @@ def test_exported_log_without_span_id(self): self.fail("No log records found") @responses.activate - @patch("opentelemetry.exporter.otlp.proto.http._log_exporter.sleep") + @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") def test_exponential_backoff(self, mock_sleep): # return a retryable error responses.add( @@ -358,12 +358,14 @@ def _get_sdk_log_data() -> List[LogData]: return [log1, log2, log3, log4] - @patch.object(OTLPLogExporter, "_export", return_value=Mock(ok=True)) - def test_2xx_status_code(self, mock_otlp_metric_exporter): + def test_2xx_status_code(self): """ Test that any HTTP 2XX code returns a successful result """ self.assertEqual( - OTLPLogExporter().export(MagicMock()), LogExportResult.SUCCESS + OTLPLogExporter( + session=Mock(**{"post.return_value": Mock(ok=True)}) + ).export(MagicMock()), + LogExportResult.SUCCESS, ) From bf18b85adc9a897213a43cba2c377045a225c5b8 Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Mon, 15 Jul 2024 07:54:29 +0200 Subject: [PATCH 03/16] Use RetryingExporter in HTTP metric exporter This unifies the implementation of the OTLP exporters and the HTTP metric exporter. --- .../proto/http/metric_exporter/__init__.py | 58 +++++++++---------- .../metrics/test_otlp_metrics_exporter.py | 9 +-- 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 16ac042dd89..52aef1105ab 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -19,12 +19,14 @@ from typing import Sequence, Mapping # noqa: F401 from io import BytesIO -from time import sleep from deprecated import deprecated from opentelemetry.exporter.otlp.proto.common._internal import ( _get_resource_data, - _create_exp_backoff_generator, +) +from opentelemetry.exporter.otlp.proto.common.exporter import ( + RetryableExportError, + RetryingExporter, ) from opentelemetry.exporter.otlp.proto.common._internal.metrics_encoder import ( OTLPMetricExporterMixin, @@ -160,6 +162,8 @@ def __init__( preferred_temporality, preferred_aggregation ) + self._exporter = RetryingExporter(self._export, MetricExportResult) + def _export(self, serialized_data: bytes): data = serialized_data if self._compression == Compression.Gzip: @@ -170,7 +174,7 @@ def _export(self, serialized_data: bytes): elif self._compression == Compression.Deflate: data = zlib.compress(serialized_data) - return self._session.post( + resp = self._session.post( url=self._endpoint, data=data, verify=self._certificate_file, @@ -178,6 +182,24 @@ def _export(self, serialized_data: bytes): cert=self._client_cert, ) + if resp.ok: + return MetricExportResult.SUCCESS + + if self._retryable(resp): + _logger.warning( + "Transient error %s encountered while exporting metric batch", + resp.reason, + ) + + raise RetryableExportError(None) + + _logger.error( + "Failed to export batch code: %s, reason: %s", + resp.status_code, + resp.text, + ) + return MetricExportResult.FAILURE + @staticmethod def _retryable(resp: requests.Response) -> bool: if resp.status_code == 408: @@ -193,33 +215,9 @@ def export( **kwargs, ) -> MetricExportResult: serialized_data = encode_metrics(metrics_data) - for delay in _create_exp_backoff_generator( - max_value=self._MAX_RETRY_TIMEOUT - ): - - if delay == self._MAX_RETRY_TIMEOUT: - return MetricExportResult.FAILURE - - resp = self._export(serialized_data.SerializeToString()) - # pylint: disable=no-else-return - if resp.ok: - return MetricExportResult.SUCCESS - elif self._retryable(resp): - _logger.warning( - "Transient error %s encountered while exporting metric batch, retrying in %ss.", - resp.reason, - delay, - ) - sleep(delay) - continue - else: - _logger.error( - "Failed to export batch code: %s, reason: %s", - resp.status_code, - resp.text, - ) - return MetricExportResult.FAILURE - return MetricExportResult.FAILURE + return self._exporter.export_with_retry( + serialized_data.SerializeToString() + ) def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: pass diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py index ac039d4ff81..836ae447530 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py @@ -329,7 +329,7 @@ def test_serialization(self, mock_post): ) @activate - @patch("opentelemetry.exporter.otlp.proto.http.metric_exporter.sleep") + @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") def test_exponential_backoff(self, mock_sleep): # return a retryable error add( @@ -501,14 +501,15 @@ def test_exponential_explicit_bucket_histogram(self): ExplicitBucketHistogramAggregation, ) - @patch.object(OTLPMetricExporter, "_export", return_value=Mock(ok=True)) - def test_2xx_status_code(self, mock_otlp_metric_exporter): + def test_2xx_status_code(self): """ Test that any HTTP 2XX code returns a successful result """ self.assertEqual( - OTLPMetricExporter().export(MagicMock()), + OTLPMetricExporter( + session=Mock(**{"post.return_value": Mock(ok=True)}) + ).export(MagicMock()), MetricExportResult.SUCCESS, ) From 5c90194f15b7cf8a9891cbd71c2569d540f85aa0 Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Mon, 15 Jul 2024 07:58:41 +0200 Subject: [PATCH 04/16] Add missing shutdown mechanic to HTTP metric exporter --- .../otlp/proto/http/metric_exporter/__init__.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 52aef1105ab..97d1d3cfe36 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -162,6 +162,7 @@ def __init__( preferred_temporality, preferred_aggregation ) + self._shutdown = False self._exporter = RetryingExporter(self._export, MetricExportResult) def _export(self, serialized_data: bytes): @@ -214,13 +215,22 @@ def export( timeout_millis: float = 10_000, **kwargs, ) -> MetricExportResult: + if self._shutdown: + _logger.warning("Exporter already shutdown, ignoring batch") + return MetricExportResult.FAILURE + serialized_data = encode_metrics(metrics_data) return self._exporter.export_with_retry( serialized_data.SerializeToString() ) def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: - pass + if self._shutdown: + _logger.warning("Exporter already shutdown, ignoring call") + return + self._exporter.shutdown(timeout_millis=timeout_millis) + self._session.close() + self._shutdown = True @property def _exporting(self) -> str: From 0c3831a416b934177441a00b3847876fae14cb08 Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Mon, 15 Jul 2024 08:04:31 +0200 Subject: [PATCH 05/16] Use RetryingExporter in HTTP trace exporter This unifies the implementation of the OTLP exporters and the HTTP trace exporter. --- .../proto/http/trace_exporter/__init__.py | 57 ++++++++----------- .../tests/test_proto_span_exporter.py | 10 ++-- 2 files changed, 29 insertions(+), 38 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index ac69b6acde3..6cfb486e8fb 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -18,12 +18,12 @@ from io import BytesIO from os import environ from typing import Dict, Optional -from time import sleep import requests -from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, +from opentelemetry.exporter.otlp.proto.common.exporter import ( + RetryableExportError, + RetryingExporter, ) from opentelemetry.exporter.otlp.proto.common.trace_encoder import ( encode_spans, @@ -121,6 +121,7 @@ def __init__( {"Content-Encoding": self._compression.value} ) self._shutdown = False + self._exporter = RetryingExporter(self._export, SpanExportResult) def _export(self, serialized_data: bytes): data = serialized_data @@ -132,7 +133,7 @@ def _export(self, serialized_data: bytes): elif self._compression == Compression.Deflate: data = zlib.compress(serialized_data) - return self._session.post( + resp = self._session.post( url=self._endpoint, data=data, verify=self._certificate_file, @@ -140,6 +141,23 @@ def _export(self, serialized_data: bytes): cert=self._client_cert, ) + if resp.ok: + return SpanExportResult.SUCCESS + + if self._retryable(resp): + _logger.warning( + "Transient error %s encountered while exporting span batch", + resp.reason, + ) + raise RetryableExportError(None) + + _logger.error( + "Failed to export batch code: %s, reason: %s", + resp.status_code, + resp.text, + ) + return SpanExportResult.FAILURE + @staticmethod def _retryable(resp: requests.Response) -> bool: if resp.status_code == 408: @@ -151,34 +169,6 @@ def _retryable(resp: requests.Response) -> bool: def _serialize_spans(self, spans): return encode_spans(spans).SerializePartialToString() - def _export_serialized_spans(self, serialized_data): - for delay in _create_exp_backoff_generator( - max_value=self._MAX_RETRY_TIMEOUT - ): - if delay == self._MAX_RETRY_TIMEOUT: - return SpanExportResult.FAILURE - - resp = self._export(serialized_data) - # pylint: disable=no-else-return - if resp.ok: - return SpanExportResult.SUCCESS - elif self._retryable(resp): - _logger.warning( - "Transient error %s encountered while exporting span batch, retrying in %ss.", - resp.reason, - delay, - ) - sleep(delay) - continue - else: - _logger.error( - "Failed to export batch code: %s, reason: %s", - resp.status_code, - resp.text, - ) - return SpanExportResult.FAILURE - return SpanExportResult.FAILURE - def export(self, spans) -> SpanExportResult: # After the call to Shutdown subsequent calls to Export are # not allowed and should return a Failure result. @@ -187,8 +177,7 @@ def export(self, spans) -> SpanExportResult: return SpanExportResult.FAILURE serialized_data = self._serialize_spans(spans) - - return self._export_serialized_spans(serialized_data) + return self._exporter.export_with_retry(serialized_data) def shutdown(self): if self._shutdown: diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py index 9d57c62bae8..1f66e3903d7 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py @@ -234,7 +234,7 @@ def test_headers_parse_from_env(self): # pylint: disable=no-self-use @responses.activate - @patch("opentelemetry.exporter.otlp.proto.http.trace_exporter.sleep") + @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") def test_exponential_backoff(self, mock_sleep): # return a retryable error responses.add( @@ -263,12 +263,14 @@ def test_exponential_backoff(self, mock_sleep): [call(1), call(2), call(4), call(8), call(16), call(32)] ) - @patch.object(OTLPSpanExporter, "_export", return_value=Mock(ok=True)) - def test_2xx_status_code(self, mock_otlp_metric_exporter): + def test_2xx_status_code(self): """ Test that any HTTP 2XX code returns a successful result """ self.assertEqual( - OTLPSpanExporter().export(MagicMock()), SpanExportResult.SUCCESS + OTLPSpanExporter( + session=Mock(**{"post.return_value": Mock(ok=True)}) + ).export(MagicMock()), + SpanExportResult.SUCCESS, ) From 178dc040419baa7c77f4983070ee5276ae248550 Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Mon, 26 Aug 2024 08:27:14 +0200 Subject: [PATCH 06/16] Pipe timeout through RetryingExporter Propagate the given timeout value from all exporters to the RetryingExporter, which will hand it over to the individual `_export` function call. Currently the same unaltered value is handed over. In following commits the Retrying exporter will update the timeout handed over to `_export` depending on the remaining time. --- .../exporter/otlp/proto/common/exporter.py | 6 ++++-- .../otlp/proto/grpc/_log_exporter/__init__.py | 4 ++-- .../exporter/otlp/proto/grpc/exporter.py | 18 +++++++++++------- .../proto/grpc/metric_exporter/__init__.py | 4 ++-- .../otlp/proto/grpc/trace_exporter/__init__.py | 4 ++-- .../otlp/proto/http/_log_exporter/__init__.py | 14 +++++++++----- .../proto/http/metric_exporter/__init__.py | 14 +++++++++----- .../otlp/proto/http/trace_exporter/__init__.py | 14 +++++++++----- .../tests/test_proto_log_exporter.py | 8 ++++---- .../tests/test_proto_span_exporter.py | 8 ++++---- 10 files changed, 56 insertions(+), 38 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py index cc05fdf7dc6..cb11dddbb88 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py @@ -33,11 +33,13 @@ def __init__(self, retry_delay_sec: Optional[int]): class RetryingExporter(Generic[ExportResultT]): def __init__( self, - export_function: Callable[[ExportPayloadT], ExportResultT], + export_function: Callable[[ExportPayloadT, float], ExportResultT], result: Type[ExportResultT], + timeout_sec: float, ): self._export_function = export_function self._result = result + self._timeout_sec = timeout_sec self._shutdown = False self._export_lock = threading.Lock() @@ -67,7 +69,7 @@ def export_with_retry(self, payload: ExportPayloadT) -> ExportResultT: with self._export_lock: try: - return self._export_function(payload) + return self._export_function(payload, self._timeout_sec) except RetryableExportError as exc: delay_sec = ( exc.retry_delay_sec diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py index 4c02b1346ff..1dc590afec7 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py @@ -60,7 +60,7 @@ def __init__( headers: Optional[ Union[TypingSequence[Tuple[str, str]], Dict[str, str], str] ] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, ): if insecure is None: @@ -81,7 +81,7 @@ def __init__( environ_timeout = environ.get(OTEL_EXPORTER_OTLP_LOGS_TIMEOUT) environ_timeout = ( - int(environ_timeout) if environ_timeout is not None else None + float(environ_timeout) if environ_timeout is not None else None ) compression = ( diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index 6d48201e877..7729c592e49 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -164,7 +164,7 @@ def _get_credentials( class OTLPExporterMixin( ABC, Generic[SDKDataT, ExportServiceRequestT, ExportResultT] ): - """OTLP span exporter + """OTLP exporter Args: endpoint: OpenTelemetry Collector receiver endpoint @@ -183,7 +183,7 @@ def __init__( headers: Optional[ Union[TypingSequence[Tuple[str, str]], Dict[str, str], str] ] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, ): super().__init__() @@ -220,8 +220,8 @@ def __init__( else: self._headers = tuple(self._headers) + tuple(_OTLP_GRPC_HEADERS) - self._timeout = timeout or int( - environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, 10) + timeout_sec = timeout or float( + environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, 10.0) ) self._collector_kwargs = None @@ -249,7 +249,9 @@ def __init__( ) self._shutdown = False - self._exporter = RetryingExporter(self._export, self._result) + self._exporter = RetryingExporter( + self._export, self._result, timeout_sec + ) @abstractmethod def _translate_data( @@ -258,13 +260,15 @@ def _translate_data( pass def _export( - self, data: Union[TypingSequence[ReadableSpan], MetricsData] + self, + data: Union[TypingSequence[ReadableSpan], MetricsData], + timeout_sec: float, ) -> ExportResultT: try: self._client.Export( request=self._translate_data(data), metadata=self._headers, - timeout=self._timeout, + timeout=timeout_sec, ) return self._result.SUCCESS diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py index a6220fb8594..d50262c0576 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py @@ -98,7 +98,7 @@ def __init__( headers: Optional[ Union[TypingSequence[Tuple[str, str]], Dict[str, str], str] ] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, preferred_temporality: Dict[type, AggregationTemporality] = None, preferred_aggregation: Dict[type, Aggregation] = None, @@ -123,7 +123,7 @@ def __init__( environ_timeout = environ.get(OTEL_EXPORTER_OTLP_METRICS_TIMEOUT) environ_timeout = ( - int(environ_timeout) if environ_timeout is not None else None + float(environ_timeout) if environ_timeout is not None else None ) compression = ( diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py index 41ca51cf27b..a93e5f99a70 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py @@ -91,7 +91,7 @@ def __init__( headers: Optional[ Union[Sequence[Tuple[str, str]], Dict[str, str], str] ] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, ): if insecure is None: @@ -112,7 +112,7 @@ def __init__( environ_timeout = environ.get(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT) environ_timeout = ( - int(environ_timeout) if environ_timeout is not None else None + float(environ_timeout) if environ_timeout is not None else None ) compression = ( diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index d36588130d3..17d710fc465 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -74,7 +74,7 @@ def __init__( client_key_file: Optional[str] = None, client_certificate_file: Optional[str] = None, headers: Optional[Dict[str, str]] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, session: Optional[requests.Session] = None, ): @@ -109,7 +109,7 @@ def __init__( self._headers = headers or parse_env_headers( headers_string, liberal=True ) - self._timeout = timeout or int( + timeout_sec = timeout or float( environ.get( OTEL_EXPORTER_OTLP_LOGS_TIMEOUT, environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, DEFAULT_TIMEOUT), @@ -124,9 +124,13 @@ def __init__( {"Content-Encoding": self._compression.value} ) self._shutdown = False - self._exporter = RetryingExporter(self._export, LogExportResult) + self._exporter = RetryingExporter( + self._export, LogExportResult, timeout_sec + ) - def _export(self, serialized_data: bytes): + def _export( + self, serialized_data: bytes, timeout_sec: float + ) -> LogExportResult: data = serialized_data if self._compression == Compression.Gzip: gzip_data = BytesIO() @@ -140,7 +144,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 97d1d3cfe36..9f7cee5eff2 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -105,7 +105,7 @@ def __init__( client_key_file: Optional[str] = None, client_certificate_file: Optional[str] = None, headers: Optional[Dict[str, str]] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, session: Optional[requests.Session] = None, preferred_temporality: Dict[type, AggregationTemporality] = None, @@ -141,7 +141,7 @@ def __init__( self._headers = headers or parse_env_headers( headers_string, liberal=True ) - self._timeout = timeout or int( + self._timeout = timeout or float( environ.get( OTEL_EXPORTER_OTLP_METRICS_TIMEOUT, environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, DEFAULT_TIMEOUT), @@ -163,9 +163,13 @@ def __init__( ) self._shutdown = False - self._exporter = RetryingExporter(self._export, MetricExportResult) + self._exporter = RetryingExporter( + self._export, MetricExportResult, self._timeout + ) - def _export(self, serialized_data: bytes): + def _export( + self, serialized_data: bytes, timeout_sec: float + ) -> MetricExportResult: data = serialized_data if self._compression == Compression.Gzip: gzip_data = BytesIO() @@ -179,7 +183,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 6cfb486e8fb..c3584604681 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -72,7 +72,7 @@ def __init__( client_key_file: Optional[str] = None, client_certificate_file: Optional[str] = None, headers: Optional[Dict[str, str]] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, session: Optional[requests.Session] = None, ): @@ -106,7 +106,7 @@ def __init__( self._headers = headers or parse_env_headers( headers_string, liberal=True ) - self._timeout = timeout or int( + timeout_sec = timeout or float( environ.get( OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, DEFAULT_TIMEOUT), @@ -121,9 +121,13 @@ def __init__( {"Content-Encoding": self._compression.value} ) self._shutdown = False - self._exporter = RetryingExporter(self._export, SpanExportResult) + self._exporter = RetryingExporter( + self._export, SpanExportResult, timeout_sec + ) - def _export(self, serialized_data: bytes): + def _export( + self, serialized_data: bytes, timeout_sec: float + ) -> SpanExportResult: data = serialized_data if self._compression == Compression.Gzip: gzip_data = BytesIO() @@ -137,7 +141,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py index d2e443580aa..1391629b313 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py @@ -77,7 +77,7 @@ def test_constructor_default(self): self.assertEqual(exporter._certificate_file, True) self.assertEqual(exporter._client_certificate_file, None) self.assertEqual(exporter._client_key_file, None) - self.assertEqual(exporter._timeout, DEFAULT_TIMEOUT) + self.assertEqual(exporter._exporter._timeout_sec, DEFAULT_TIMEOUT) self.assertIs(exporter._compression, DEFAULT_COMPRESSION) self.assertEqual(exporter._headers, {}) self.assertIsInstance(exporter._session, requests.Session) @@ -119,7 +119,7 @@ def test_exporter_metrics_env_take_priority(self): exporter._client_certificate_file, "logs/client-cert.pem" ) self.assertEqual(exporter._client_key_file, "logs/client-key.pem") - self.assertEqual(exporter._timeout, 40) + self.assertEqual(exporter._exporter._timeout_sec, 40) self.assertIs(exporter._compression, Compression.Deflate) self.assertEqual( exporter._headers, @@ -160,7 +160,7 @@ def test_exporter_constructor_take_priority(self): self.assertEqual(exporter._certificate_file, "/hello.crt") self.assertEqual(exporter._client_certificate_file, "/client-cert.pem") self.assertEqual(exporter._client_key_file, "/client-key.pem") - self.assertEqual(exporter._timeout, 70) + self.assertEqual(exporter._exporter._timeout_sec, 70) self.assertIs(exporter._compression, Compression.NoCompression) self.assertEqual( exporter._headers, @@ -192,7 +192,7 @@ def test_exporter_env(self): exporter._client_certificate_file, ENV_CLIENT_CERTIFICATE ) self.assertEqual(exporter._client_key_file, ENV_CLIENT_KEY) - self.assertEqual(exporter._timeout, int(ENV_TIMEOUT)) + self.assertEqual(exporter._exporter._timeout_sec, int(ENV_TIMEOUT)) self.assertIs(exporter._compression, Compression.Gzip) self.assertEqual( exporter._headers, {"envheader1": "val1", "envheader2": "val2"} diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py index 1f66e3903d7..c0290559683 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py @@ -66,7 +66,7 @@ def test_constructor_default(self): self.assertEqual(exporter._certificate_file, True) self.assertEqual(exporter._client_certificate_file, None) self.assertEqual(exporter._client_key_file, None) - self.assertEqual(exporter._timeout, DEFAULT_TIMEOUT) + self.assertEqual(exporter._exporter._timeout_sec, DEFAULT_TIMEOUT) self.assertIs(exporter._compression, DEFAULT_COMPRESSION) self.assertEqual(exporter._headers, {}) self.assertIsInstance(exporter._session, requests.Session) @@ -108,7 +108,7 @@ def test_exporter_traces_env_take_priority(self): exporter._client_certificate_file, "traces/client-cert.pem" ) self.assertEqual(exporter._client_key_file, "traces/client-key.pem") - self.assertEqual(exporter._timeout, 40) + self.assertEqual(exporter._exporter._timeout_sec, 40) self.assertIs(exporter._compression, Compression.Deflate) self.assertEqual( exporter._headers, @@ -151,7 +151,7 @@ def test_exporter_constructor_take_priority(self): exporter._client_certificate_file, "path/to/client-cert.pem" ) self.assertEqual(exporter._client_key_file, "path/to/client-key.pem") - self.assertEqual(exporter._timeout, 20) + self.assertEqual(exporter._exporter._timeout_sec, 20) self.assertIs(exporter._compression, Compression.NoCompression) self.assertEqual( exporter._headers, @@ -179,7 +179,7 @@ def test_exporter_env(self): exporter._client_certificate_file, OS_ENV_CLIENT_CERTIFICATE ) self.assertEqual(exporter._client_key_file, OS_ENV_CLIENT_KEY) - self.assertEqual(exporter._timeout, int(OS_ENV_TIMEOUT)) + self.assertEqual(exporter._exporter._timeout_sec, int(OS_ENV_TIMEOUT)) self.assertIs(exporter._compression, Compression.Gzip) self.assertEqual( exporter._headers, {"envheader1": "val1", "envheader2": "val2"} From e7a716fffcde597f08b210c79a23ba13ad498eca Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Wed, 10 Jul 2024 21:36:59 +0200 Subject: [PATCH 07/16] Try to understand and fix test_shutdown_wait_last_export TL;DR The time assert in the test was wrong. Looking at the original commit (c84ba9) of this test, the delay of the mocked backoff was 4 seconds fixed with 6 iterations. So the time until shutdown returned was around 24 seconds. The default timeout of the lock wait in shutdown did not hit before the retry of the exporter was over. The line `self.assertGreaterEqual(now, (start_time + 30 / 1000))` seemed to be wrong from the beginning. It just ensured that the shutdown call took at least 30ms. I assume it was supposed to do something else. But to me it's not clear what exactly it should do. One explantion would be that it was intended to do `self.assertGreaterEqual(now, (start_time + 30 * 1000))` instead. But that would mean: Shutdown should need at least 30 seconds to pass, which does not make sense imho. Another option would be that it was intended to do `self.assertLessEqual(now, (start_time + 30 * 1000))` instead. That would mean: Shutdown should need at maximum 30 seconds to pass, which actually could make more sense. It would ensure that the call of `self.exporter.shutdown` returns after 30 seconds. My biggest issue here is, that it tests the happy path where the retry is over before the timeout while there is no mechanism to ensure that the exporter thread is stopped in a sensible time in case it's current exponential backoff is in a longer sleep already while the shutdown flag is set. If I understand it correctly, it can hit the beginning of a 32 second sleep, which could mean that a process needs a) 30 seconds to signal the shutdown and additional 32 seconds to finish the exporter thread. But that's something for another change. After all these words, looking at the current HEAD, the situation of the test case is basically the same as originally. The retry interval has been changed recently to be way shorter. The default lock timeout stayed the same at 30 seconds, but that does not matter too much since the retry mechanism is finished long before that. And the aforementioned assert is still wrong, with some additional problem from #4014. --- .../tests/test_otlp_exporter_mixin.py | 22 ++++++++++++++----- .../tests/test_otlp_metrics_exporter.py | 22 ++++++++++++++----- .../tests/test_otlp_trace_exporter.py | 22 ++++++++++++++----- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index 9a8a3598d9f..e4b042811d2 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -13,6 +13,7 @@ # limitations under the License. import threading +import time from logging import WARNING from time import time_ns from types import MethodType @@ -156,7 +157,9 @@ def _exporting(self) -> str: "Exporter already shutdown, ignoring batch", ) - def test_shutdown_wait_last_export(self): + def test_shutdown_wait_for_last_export_finishing_within_shutdown_timeout( + self, + ): result_mock = Mock() rpc_error = RpcError() @@ -200,15 +203,24 @@ def _exporting(self) -> str: export_thread.start() try: # pylint: disable=protected-access + + # Wait for the export thread to hold the lock. Since the main thread is not synchronized + # with the export thread, the thread may not be ready yet. + for _ in range(5): + if otlp_mock_exporter._exporter._export_lock.locked(): + break + time.sleep(0.01) self.assertTrue(otlp_mock_exporter._exporter._export_lock.locked()) - # delay is 1 second while the default shutdown timeout is 30_000 milliseconds + + # 6 retries with a fixed retry delay of 10ms (see TraceServiceServicerUNAVAILABLEDelay) + # The default shutdown timeout is 30 seconds + # This means the retries will finish long before the shutdown flag will be set start_time = time_ns() otlp_mock_exporter.shutdown() now = time_ns() - self.assertGreaterEqual(now, (start_time + 30 / 1000)) - # pylint: disable=protected-access + # Verify that the shutdown method finished within the shutdown timeout + self.assertLessEqual(now, (start_time + 3e10)) self.assertTrue(otlp_mock_exporter._shutdown) - # pylint: disable=protected-access self.assertFalse( otlp_mock_exporter._exporter._export_lock.locked() ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py index fbfd49e48ed..20298d2bbc2 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py @@ -15,6 +15,7 @@ # pylint: disable=too-many-lines import threading +import time from concurrent.futures import ThreadPoolExecutor # pylint: disable=too-many-lines @@ -852,7 +853,9 @@ def test_shutdown(self): ) self.exporter = OTLPMetricExporter() - def test_shutdown_wait_last_export(self): + def test_shutdown_wait_for_last_export_finishing_within_shutdown_timeout( + self, + ): add_MetricsServiceServicer_to_server( MetricsServiceServicerUNAVAILABLEDelay(), self.server ) @@ -863,15 +866,24 @@ def test_shutdown_wait_last_export(self): export_thread.start() try: # pylint: disable=protected-access + + # Wait for the export thread to hold the lock. Since the main thread is not synchronized + # with the export thread, the thread may not be ready yet. + for _ in range(5): + if self.exporter._exporter._export_lock.locked(): + break + time.sleep(0.01) self.assertTrue(self.exporter._exporter._export_lock.locked()) - # delay is 4 seconds while the default shutdown timeout is 30_000 milliseconds + + # 6 retries with a fixed retry delay of 10ms (see TraceServiceServicerUNAVAILABLEDelay) + # The default shutdown timeout is 30 seconds + # This means the retries will finish long before the shutdown flag will be set start_time = time_ns() self.exporter.shutdown() now = time_ns() - self.assertGreaterEqual(now, (start_time + 30 / 1000)) - # pylint: disable=protected-access + # Verify that the shutdown method finished within the shutdown timeout + self.assertLessEqual(now, (start_time + 3e10)) self.assertTrue(self.exporter._shutdown) - # pylint: disable=protected-access self.assertFalse(self.exporter._exporter._export_lock.locked()) finally: export_thread.join() diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py index 101567d6ca1..a4ddcd92ebe 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py @@ -16,6 +16,7 @@ import os import threading +import time from concurrent.futures import ThreadPoolExecutor from logging import WARNING from time import time_ns @@ -988,7 +989,9 @@ def test_shutdown(self): "Exporter already shutdown, ignoring batch", ) - def test_shutdown_wait_last_export(self): + def test_shutdown_wait_for_last_export_finishing_within_shutdown_timeout( + self, + ): add_TraceServiceServicer_to_server( TraceServiceServicerUNAVAILABLEDelay(), self.server ) @@ -999,15 +1002,24 @@ def test_shutdown_wait_last_export(self): export_thread.start() try: # pylint: disable=protected-access + + # Wait for the export thread to hold the lock. Since the main thread is not synchronized + # with the export thread, the thread may not be ready yet. + for _ in range(5): + if self.exporter._exporter._export_lock.locked(): + break + time.sleep(0.01) self.assertTrue(self.exporter._exporter._export_lock.locked()) - # delay is 4 seconds while the default shutdown timeout is 30_000 milliseconds + + # 6 retries with a fixed retry delay of 10ms (see TraceServiceServicerUNAVAILABLEDelay) + # The default shutdown timeout is 30 seconds + # This means the retries will finish long before the shutdown flag will be set start_time = time_ns() self.exporter.shutdown() now = time_ns() - self.assertGreaterEqual(now, (start_time + 30 / 1000)) - # pylint: disable=protected-access + # Verify that the shutdown method finished within the shutdown timeout + self.assertLessEqual(now, (start_time + 3e10)) self.assertTrue(self.exporter._shutdown) - # pylint: disable=protected-access self.assertFalse(self.exporter._exporter._export_lock.locked()) finally: export_thread.join() From 08a0dd9c62ee204de0b70543c1f58cc11259c70e Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Sat, 14 Sep 2024 17:54:12 +0200 Subject: [PATCH 08/16] Change export lock to cover the whole export procedure Previously the export lock was released by the export between retries, which gave a shutdown the opportunity to interrupt the export depending on which thread is first to get the lock again. This behavior is not desired as it heavily relies on timing and makes things less deterministic. The retry loop still checks whether the shutdown flag is set and will break in case it was told to do so. --- .../exporter/otlp/proto/common/exporter.py | 44 ++++++++---- .../tests/test_otlp_exporter_mixin.py | 68 +++++++++++++++++++ 2 files changed, 98 insertions(+), 14 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py index cb11dddbb88..b9ddf77a10e 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py @@ -12,9 +12,10 @@ # limitations under the License. import threading +from contextlib import contextmanager from logging import getLogger from time import sleep -from typing import Callable, Generic, Optional, Type, TypeVar +from typing import Callable, Generic, Iterator, Optional, Type, TypeVar from ._internal import _create_exp_backoff_generator @@ -46,11 +47,8 @@ def __init__( def shutdown(self, timeout_millis: float = 30_000) -> None: # wait for the last export if any - self._export_lock.acquire( # pylint: disable=consider-using-with - timeout=timeout_millis / 1e3 - ) - self._shutdown = True - self._export_lock.release() + with acquire_timeout(self._export_lock, timeout_millis / 1e3): + self._shutdown = True def export_with_retry(self, payload: ExportPayloadT) -> ExportResultT: # After the call to shutdown, subsequent calls to Export are @@ -59,15 +57,23 @@ def export_with_retry(self, payload: ExportPayloadT) -> ExportResultT: _logger.warning("Exporter already shutdown, ignoring batch") return self._result.FAILURE - max_value = 64 - # expo returns a generator that yields delay values which grow - # exponentially. Once delay is greater than max_value, the yielded - # value will remain constant. - for delay in _create_exp_backoff_generator(max_value=max_value): - if delay == max_value or self._shutdown: + with acquire_timeout( + self._export_lock, self._timeout_sec + ) as is_locked: + if not is_locked: + _logger.warning( + "Exporter failed to acquire lock before timeout" + ) return self._result.FAILURE - with self._export_lock: + max_value = 64 + # expo returns a generator that yields delay values which grow + # exponentially. Once delay is greater than max_value, the yielded + # value will remain constant. + for delay in _create_exp_backoff_generator(max_value=max_value): + if delay == max_value or self._shutdown: + return self._result.FAILURE + try: return self._export_function(payload, self._timeout_sec) except RetryableExportError as exc: @@ -79,4 +85,14 @@ def export_with_retry(self, payload: ExportPayloadT) -> ExportResultT: _logger.warning("Retrying in %ss", delay_sec) sleep(delay_sec) - return self._result.FAILURE + return self._result.FAILURE + + +@contextmanager +def acquire_timeout(lock: threading.Lock, timeout: float) -> Iterator[bool]: + result = lock.acquire(timeout=timeout) + try: + yield result + finally: + if result: + lock.release() diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index e4b042811d2..affad1d252b 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -226,3 +226,71 @@ def _exporting(self) -> str: ) finally: export_thread.join() + + def test_shutdown_wait_for_last_export_not_finishing_within_shutdown_timeout( + self, + ): + result_mock = Mock() + rpc_error = RpcError() + + def code(self): + return StatusCode.UNAVAILABLE + + def trailing_metadata(self): + return { + "google.rpc.retryinfo-bin": RetryInfo( + retry_delay=Duration(nanos=int(1e7)) + ).SerializeToString() + } + + rpc_error.code = MethodType(code, rpc_error) + rpc_error.trailing_metadata = MethodType(trailing_metadata, rpc_error) + + class OTLPMockExporter(OTLPExporterMixin): + _result = result_mock + _stub = Mock( + **{"return_value": Mock(**{"Export.side_effect": rpc_error})} + ) + + def _translate_data( + self, data: Sequence[SDKDataT] + ) -> ExportServiceRequestT: + pass + + def export(self, data): + return self._exporter.export_with_retry(data) + + @property + def _exporting(self) -> str: + return "mock" + + otlp_mock_exporter = OTLPMockExporter() + + # pylint: disable=protected-access + export_thread = threading.Thread( + target=otlp_mock_exporter.export, args=({},) + ) + export_thread.start() + try: + # pylint: disable=protected-access + + # Wait for the export thread to hold the lock. Since the main thread is not synchronized + # with the export thread, the thread may not be ready yet. + for _ in range(5): + if otlp_mock_exporter._exporter._export_lock.locked(): + break + time.sleep(0.01) + self.assertTrue(otlp_mock_exporter._exporter._export_lock.locked()) + + # 6 retries with a fixed retry delay of 10ms (see TraceServiceServicerUNAVAILABLEDelay) + # The default shutdown timeout is 30 seconds + # This means the retries will finish long before the shutdown flag will be set + start_time = time_ns() + otlp_mock_exporter.shutdown(0.001) + now = time_ns() + # Verify that the shutdown method finished within the shutdown timeout + self.assertLessEqual(now, (start_time + 3e10)) + self.assertTrue(otlp_mock_exporter._shutdown) + finally: + export_thread.join() + self.assertFalse(otlp_mock_exporter._exporter._export_lock.locked()) From c7b03b769a99f5417ccd427ab9bf1b8a639975b0 Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Mon, 15 Jul 2024 13:55:28 +0200 Subject: [PATCH 09/16] Add test cases for RetryableExporter A few tests are still commented out and will be enabled later, once the implementation is updated to implement the desired behavior. The tests are, like other existing tests, timing sensitive which makes them prone to fail on slow systems. It would certainly be better to get rid of this issue. I haven't found another approach in the existing test cases so I sticked to the current pattern. --- .../tests/test_retryable_exporter.py | 305 ++++++++++++++++++ 1 file changed, 305 insertions(+) create mode 100644 exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py new file mode 100644 index 00000000000..d5761ab2c34 --- /dev/null +++ b/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py @@ -0,0 +1,305 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import threading +import time +import unittest +from itertools import repeat +from logging import WARNING +from typing import Type +from unittest.mock import Mock, patch + +from opentelemetry.exporter.otlp.proto.common.exporter import ( + RetryableExportError, + RetryingExporter, +) +from opentelemetry.exporter.otlp.proto.common.exporter import ( + logger as exporter_logger, +) + +result_type: Type = Mock() + + +class TestRetryableExporter(unittest.TestCase): + def test_export_no_retry(self): + export_func = Mock() + exporter = RetryingExporter(export_func, result_type, timeout_sec=10.0) + with self.subTest("Export success"): + export_func.reset_mock() + export_func.configure_mock(return_value=result_type.SUCCESS) + with self.assertRaises(AssertionError): + with self.assertLogs(level=WARNING): + result = exporter.export_with_retry("payload") + self.assertIs(result, result_type.SUCCESS) + export_func.assert_called_once_with("payload", 10.0) + + with self.subTest("Export Fail"): + export_func.reset_mock() + export_func.configure_mock(return_value=result_type.FAILURE) + with self.assertRaises(AssertionError): + with self.assertLogs(exporter_logger, level=WARNING): + result = exporter.export_with_retry("payload") + self.assertIs(result, result_type.FAILURE) + export_func.assert_called_once_with("payload", 10.0) + + @patch( + "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator", + return_value=repeat(0), + ) + def test_export_retry(self, mock_backoff): + """ + Test we retry until success/failure. + """ + side_effect = [ + RetryableExportError(None), + RetryableExportError(None), + result_type.SUCCESS, + ] + export_func = Mock(side_effect=side_effect) + exporter = RetryingExporter(export_func, result_type, timeout_sec=10.0) + + with self.subTest("Retry until success"): + with self.assertLogs(level=WARNING): + result = exporter.export_with_retry("") + self.assertEqual(export_func.call_count, len(side_effect)) + self.assertIs(result, result_type.SUCCESS) + + with self.subTest("Retry until failure"): + export_func.reset_mock() + side_effect.insert(0, RetryableExportError(None)) + side_effect[-1] = result_type.FAILURE + export_func.configure_mock(side_effect=side_effect) + with self.assertLogs(level=WARNING): + result = exporter.export_with_retry("") + self.assertEqual(export_func.call_count, len(side_effect)) + self.assertIs(result, result_type.FAILURE) + + @patch( + "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator", + return_value=repeat(0.25), + ) + def test_export_uses_retry_delay(self, mock_backoff): + """ + Test we retry using the delay specified in the RPC error as a lower bound. + """ + side_effects = [ + RetryableExportError(0.0), + RetryableExportError(0.25), + RetryableExportError(0.75), + RetryableExportError(1.0), + result_type.SUCCESS, + ] + exporter = RetryingExporter( + Mock(side_effect=side_effects), result_type, timeout_sec=10.0 + ) + + with patch( + "opentelemetry.exporter.otlp.proto.common.exporter.sleep" + ) as wait_mock: + with self.assertLogs(level=WARNING): + result = exporter.export_with_retry("payload") + self.assertIs(result, result_type.SUCCESS) + self.assertEqual(wait_mock.call_count, len(side_effects) - 1) + self.assertEqual(wait_mock.call_args_list[0].args, (0.0,)) + self.assertEqual(wait_mock.call_args_list[1].args, (0.25,)) + self.assertEqual(wait_mock.call_args_list[2].args, (0.75,)) + self.assertEqual(wait_mock.call_args_list[3].args, (1.00,)) + + # Does not work like this yet. Will be enabled soon. + # @patch( + # "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator", + # return_value=repeat(0.1), + # ) + # def test_retry_delay_exceeds_timeout(self, mock_backoff): + # """ + # Test we timeout if we can't respect retry_delay. + # """ + # side_effects = [ + # RetryableExportError(0.25), + # RetryableExportError(1.0), # should timeout here + # result_type.SUCCESS, + # ] + + # mock_export_func = Mock(side_effect=side_effects) + # exporter = RetryingExporter( + # mock_export_func, + # result_type, + # timeout_sec=0.5, + # ) + + # with self.assertLogs(level=WARNING): + # self.assertEqual( + # exporter.export_with_retry("payload"), result_type.FAILURE + # ) + # self.assertEqual(mock_export_func.call_count, 2) + + def test_shutdown(self): + """Test we refuse to export if shut down.""" + mock_export_func = Mock(return_value=result_type.SUCCESS) + exporter = RetryingExporter( + mock_export_func, + result_type, + timeout_sec=10.0, + ) + + self.assertEqual( + exporter.export_with_retry("payload"), result_type.SUCCESS + ) + mock_export_func.assert_called_once() + exporter.shutdown() + with self.assertLogs(level=WARNING) as warning: + self.assertEqual( + exporter.export_with_retry("payload"), result_type.FAILURE + ) + self.assertEqual( + warning.records[0].message, + "Exporter already shutdown, ignoring batch", + ) + mock_export_func.assert_called_once() + + @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") + def test_shutdown_wait_last_export(self, mock_sleep): + """Test that shutdown waits for ongoing export to complete.""" + + timeout_sec = 10.0 + + class ExportFunc: + is_exporting = threading.Event() + ready_to_continue = threading.Event() + side_effect = [ + RetryableExportError(None), + RetryableExportError(None), + result_type.SUCCESS, + ] + mock_export_func = Mock(side_effect=side_effect) + + def __call__(self, *args, **kwargs): + self.is_exporting.set() + self.ready_to_continue.wait() + return self.mock_export_func(*args, **kwargs) + + export_func = ExportFunc() + + exporter = RetryingExporter( + export_func, result_type, timeout_sec=timeout_sec + ) + + class ExportWrap: + def __init__(self) -> None: + self.result = None + + def __call__(self, *args, **kwargs): + self.result = exporter.export_with_retry("payload") + return self.result + + export_wrapped = ExportWrap() + + export_thread = threading.Thread( + name="export_thread", target=export_wrapped + ) + with self.assertLogs(level=WARNING): + try: + # Simulate shutdown occurring during retry process + # Intended execution flow + # + # main thread: + # - start export_thread + # - wait for is_exporting + # export_thread: + # - call export_func + # - set is_exporting + # - wait for ready_to_continue + # main thread: + # - start shutdown thread + # - sleep to yield to shutdown thread + # shutdown_thread: + # - block at acquiring lock held by export_thread + # - shutdown is now pending timeout/export completion + # main thread: + # - set ready_to_continue + # - join all threads + export_thread.start() + export_func.is_exporting.wait() + start_time = time.time() + shutdown_thread = threading.Thread( + name="shutdown_thread", target=exporter.shutdown + ) + shutdown_thread.start() + time.sleep(0.025) + export_func.ready_to_continue.set() + finally: + export_thread.join() + shutdown_thread.join() + + duration = time.time() - start_time + self.assertLessEqual(duration, timeout_sec) + # pylint: disable=protected-access + self.assertTrue(exporter._shutdown) + self.assertIs(export_wrapped.result, result_type.SUCCESS) + + # Does not work like this yet. Will be enabled soon. + # def test_shutdown_timeout_cancels_export_retries(self): + # """Test that shutdown timing out cancels ongoing retries.""" + + # class ExportFunc: + # is_exporting = threading.Event() + # ready_to_continue = threading.Event() + # mock_export_func = Mock(side_effect=RetryableExportError(None)) + + # def __call__(self, *args, **kwargs): + # self.is_exporting.set() + # self.ready_to_continue.wait() + # return self.mock_export_func(*args, **kwargs) + + # export_func = ExportFunc() + + # exporter = RetryingExporter(export_func, result_type, timeout_sec=30.0) + + # class ExportWrap: + # def __init__(self) -> None: + # self.result = None + + # def __call__(self, *args, **kwargs): + # self.result = exporter.export_with_retry("payload") + # return self.result + + # export_wrapped = ExportWrap() + + # shutdown_timeout = 1 + + # export_thread = threading.Thread(target=export_wrapped) + # with self.assertLogs(level=WARNING) as warning: + # try: + # export_thread.start() + # export_func.is_exporting.wait() + # start_time = time.time() + # shutdown_thread = threading.Thread( + # target=exporter.shutdown, args=[shutdown_timeout * 1e3] + # ) + # shutdown_thread.start() + # time.sleep(0) + # export_func.ready_to_continue.set() + # finally: + # export_thread.join() + # shutdown_thread.join() + # duration = time.time() - start_time + # self.assertAlmostEqual(duration, shutdown_timeout, places=1) + # # pylint: disable=protected-access + # self.assertTrue(exporter._shutdown) + # self.assertIs(export_wrapped.result, result_type.FAILURE) + # print(warning.records) + # self.assertEqual( + # warning.records[-1].message, + # "Export cancelled due to shutdown timing out", + # ) From 8696bf7764b04385ebf83dd240cf5b90c112aa4f Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Sat, 14 Sep 2024 17:54:57 +0200 Subject: [PATCH 10/16] Enable exporter shutdown timeout to abort ongoing export Replace the RetryableExporter shutdown flag and sleep with a `threading.Event` to make the waiting time for the next retry interruptable. This enables us to change the semantic of the shutdown timeout to work in a less surprising way. Previously the timeout worked roughly like this: 1. Wait for the current export iteration to finish 2. Try to get the lock faster than the export thread 3. If the lock was acquired, the shutdown flag was set 4. If the lock was not acquired, the shutdown flag was set 5. The export thread would have picked this flag up after finishing the current retry cycle. This lead to hanging applications on shutdown, giving the applications no choice to influence the shutdown behavior. Uses could not decide how much time shall be spent trying to prevent the loss of data. This change allows shutdown to interrupt the current export waiting call so that the exporter can be shut down with a user provided deadline. --- .../exporter/otlp/proto/common/exporter.py | 23 ++- .../tests/test_retryable_exporter.py | 139 +++++++++--------- .../tests/logs/test_otlp_logs_exporter.py | 32 ++-- .../tests/test_otlp_metrics_exporter.py | 52 ++++--- .../tests/test_otlp_trace_exporter.py | 26 ++-- .../metrics/test_otlp_metrics_exporter.py | 11 +- .../tests/test_proto_log_exporter.py | 11 +- .../tests/test_proto_span_exporter.py | 11 +- 8 files changed, 175 insertions(+), 130 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py index b9ddf77a10e..1d712ca9e30 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py @@ -14,7 +14,6 @@ import threading from contextlib import contextmanager from logging import getLogger -from time import sleep from typing import Callable, Generic, Iterator, Optional, Type, TypeVar from ._internal import _create_exp_backoff_generator @@ -42,18 +41,22 @@ def __init__( self._result = result self._timeout_sec = timeout_sec - self._shutdown = False + self._shutdown = threading.Event() self._export_lock = threading.Lock() def shutdown(self, timeout_millis: float = 30_000) -> None: - # wait for the last export if any + """Shutdown the retrying exporter + + Waits for the current export to finish up to `timeout_millis`. In case the timeout is + reached, the export will be interrupted to to prevent application hanging after completion. + """ with acquire_timeout(self._export_lock, timeout_millis / 1e3): - self._shutdown = True + self._shutdown.set() def export_with_retry(self, payload: ExportPayloadT) -> ExportResultT: # After the call to shutdown, subsequent calls to Export are # not allowed and should return a Failure result. - if self._shutdown: + if self._shutdown.is_set(): _logger.warning("Exporter already shutdown, ignoring batch") return self._result.FAILURE @@ -71,7 +74,13 @@ def export_with_retry(self, payload: ExportPayloadT) -> ExportResultT: # exponentially. Once delay is greater than max_value, the yielded # value will remain constant. for delay in _create_exp_backoff_generator(max_value=max_value): - if delay == max_value or self._shutdown: + if delay == max_value: + return self._result.FAILURE + + if self._shutdown.is_set(): + _logger.warning( + "Export cancelled due to shutdown timing out" + ) return self._result.FAILURE try: @@ -83,7 +92,7 @@ def export_with_retry(self, payload: ExportPayloadT) -> ExportResultT: else delay ) _logger.warning("Retrying in %ss", delay_sec) - sleep(delay_sec) + self._shutdown.wait(delay_sec) return self._result.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py index d5761ab2c34..eec7c93473e 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py @@ -67,11 +67,14 @@ def test_export_retry(self, mock_backoff): result_type.SUCCESS, ] export_func = Mock(side_effect=side_effect) - exporter = RetryingExporter(export_func, result_type, timeout_sec=10.0) + exporter = RetryingExporter(export_func, result_type, timeout_sec=0.1) with self.subTest("Retry until success"): - with self.assertLogs(level=WARNING): + with patch.object( + exporter._shutdown, "wait" # pylint: disable=protected-access + ) as wait_mock, self.assertLogs(level=WARNING): result = exporter.export_with_retry("") + self.assertEqual(wait_mock.call_count, len(side_effect) - 1) self.assertEqual(export_func.call_count, len(side_effect)) self.assertIs(result, result_type.SUCCESS) @@ -104,11 +107,10 @@ def test_export_uses_retry_delay(self, mock_backoff): Mock(side_effect=side_effects), result_type, timeout_sec=10.0 ) - with patch( - "opentelemetry.exporter.otlp.proto.common.exporter.sleep" - ) as wait_mock: - with self.assertLogs(level=WARNING): - result = exporter.export_with_retry("payload") + with patch.object( + exporter._shutdown, "wait" # pylint: disable=protected-access + ) as wait_mock, self.assertLogs(level=WARNING): + result = exporter.export_with_retry("payload") self.assertIs(result, result_type.SUCCESS) self.assertEqual(wait_mock.call_count, len(side_effects) - 1) self.assertEqual(wait_mock.call_args_list[0].args, (0.0,)) @@ -118,8 +120,8 @@ def test_export_uses_retry_delay(self, mock_backoff): # Does not work like this yet. Will be enabled soon. # @patch( - # "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator", - # return_value=repeat(0.1), + # "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator", + # return_value=repeat(0.1), # ) # def test_retry_delay_exceeds_timeout(self, mock_backoff): # """ @@ -168,11 +170,14 @@ def test_shutdown(self): ) mock_export_func.assert_called_once() - @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") - def test_shutdown_wait_last_export(self, mock_sleep): + @patch( + "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator", + return_value=repeat(0.01), + ) + def test_shutdown_wait_last_export(self, mock_backoff): """Test that shutdown waits for ongoing export to complete.""" - timeout_sec = 10.0 + timeout_sec = 0.05 class ExportFunc: is_exporting = threading.Event() @@ -236,7 +241,6 @@ def __call__(self, *args, **kwargs): name="shutdown_thread", target=exporter.shutdown ) shutdown_thread.start() - time.sleep(0.025) export_func.ready_to_continue.set() finally: export_thread.join() @@ -248,58 +252,57 @@ def __call__(self, *args, **kwargs): self.assertTrue(exporter._shutdown) self.assertIs(export_wrapped.result, result_type.SUCCESS) - # Does not work like this yet. Will be enabled soon. - # def test_shutdown_timeout_cancels_export_retries(self): - # """Test that shutdown timing out cancels ongoing retries.""" - - # class ExportFunc: - # is_exporting = threading.Event() - # ready_to_continue = threading.Event() - # mock_export_func = Mock(side_effect=RetryableExportError(None)) - - # def __call__(self, *args, **kwargs): - # self.is_exporting.set() - # self.ready_to_continue.wait() - # return self.mock_export_func(*args, **kwargs) - - # export_func = ExportFunc() - - # exporter = RetryingExporter(export_func, result_type, timeout_sec=30.0) - - # class ExportWrap: - # def __init__(self) -> None: - # self.result = None - - # def __call__(self, *args, **kwargs): - # self.result = exporter.export_with_retry("payload") - # return self.result - - # export_wrapped = ExportWrap() - - # shutdown_timeout = 1 - - # export_thread = threading.Thread(target=export_wrapped) - # with self.assertLogs(level=WARNING) as warning: - # try: - # export_thread.start() - # export_func.is_exporting.wait() - # start_time = time.time() - # shutdown_thread = threading.Thread( - # target=exporter.shutdown, args=[shutdown_timeout * 1e3] - # ) - # shutdown_thread.start() - # time.sleep(0) - # export_func.ready_to_continue.set() - # finally: - # export_thread.join() - # shutdown_thread.join() - # duration = time.time() - start_time - # self.assertAlmostEqual(duration, shutdown_timeout, places=1) - # # pylint: disable=protected-access - # self.assertTrue(exporter._shutdown) - # self.assertIs(export_wrapped.result, result_type.FAILURE) - # print(warning.records) - # self.assertEqual( - # warning.records[-1].message, - # "Export cancelled due to shutdown timing out", - # ) + def test_shutdown_timeout_cancels_export_retries(self): + """Test that shutdown timing out cancels ongoing retries.""" + + class ExportFunc: + is_exporting = threading.Event() + ready_to_continue = threading.Event() + mock_export_func = Mock(side_effect=RetryableExportError(None)) + + def __call__(self, *args, **kwargs): + self.is_exporting.set() + self.ready_to_continue.wait() + return self.mock_export_func(*args, **kwargs) + + export_func = ExportFunc() + + exporter = RetryingExporter(export_func, result_type, timeout_sec=30.0) + + class ExportWrap: + def __init__(self) -> None: + self.result = None + + def __call__(self, *args, **kwargs): + self.result = exporter.export_with_retry("payload") + return self.result + + export_wrapped = ExportWrap() + + shutdown_timeout = 0.02 + + export_thread = threading.Thread(target=export_wrapped) + with self.assertLogs(level=WARNING) as warning: + try: + export_thread.start() + export_func.is_exporting.wait() + start_time = time.time() + shutdown_thread = threading.Thread( + target=exporter.shutdown, args=[shutdown_timeout * 1e3] + ) + shutdown_thread.start() + export_func.ready_to_continue.set() + finally: + export_thread.join() + shutdown_thread.join() + duration = time.time() - start_time + self.assertAlmostEqual(duration, shutdown_timeout, places=1) + # pylint: disable=protected-access + self.assertTrue(exporter._shutdown) + self.assertIs(export_wrapped.result, result_type.FAILURE) + print(warning.records) + self.assertEqual(warning.records[0].message, "Retrying in 1s") + self.assertEqual( + warning.records[-1].message, + "Export cancelled due to shutdown timing out", + ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py index 8800f23096a..3e85fe9e444 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py @@ -391,34 +391,42 @@ def test_otlp_headers_from_env(self): @patch( "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) - @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") - def test_unavailable(self, mock_sleep, mock_expo): + def test_unavailable(self, mock_expo): mock_expo.configure_mock(**{"return_value": [0.01]}) add_LogsServiceServicer_to_server( LogsServiceServicerUNAVAILABLE(), self.server ) - self.assertEqual( - self.exporter.export([self.log_data_1]), LogExportResult.FAILURE - ) - mock_sleep.assert_called_with(0.01) + with patch.object( + self.exporter._exporter._shutdown, # pylint: disable=protected-access + "wait", + ) as wait_mock: + self.assertEqual( + self.exporter.export([self.log_data_1]), + LogExportResult.FAILURE, + ) + wait_mock.assert_called_with(0.01) @patch( "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) - @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") - def test_unavailable_delay(self, mock_sleep, mock_expo): + def test_unavailable_delay(self, mock_expo): mock_expo.configure_mock(**{"return_value": [1]}) add_LogsServiceServicer_to_server( LogsServiceServicerUNAVAILABLEDelay(), self.server ) - self.assertEqual( - self.exporter.export([self.log_data_1]), LogExportResult.FAILURE - ) - mock_sleep.assert_called_with(0.01) + with patch.object( + self.exporter._exporter._shutdown, # pylint: disable=protected-access + "wait", + ) as wait_mock: + self.assertEqual( + self.exporter.export([self.log_data_1]), + LogExportResult.FAILURE, + ) + wait_mock.assert_called_with(0.01) def test_success(self): add_LogsServiceServicer_to_server( diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py index 20298d2bbc2..593df8c141d 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py @@ -484,54 +484,64 @@ def test_otlp_exporter_otlp_compression_unspecified( @patch( "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) - @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") - def test_unavailable(self, mock_sleep, mock_expo): + def test_unavailable(self, mock_expo): mock_expo.configure_mock(**{"return_value": [0.01]}) add_MetricsServiceServicer_to_server( MetricsServiceServicerUNAVAILABLE(), self.server ) - self.assertEqual( - self.exporter.export(self.metrics["sum_int"]), - MetricExportResult.FAILURE, - ) - mock_sleep.assert_called_with(0.01) + + with patch.object( + self.exporter._exporter._shutdown, # pylint: disable=protected-access + "wait", + ) as wait_mock: + self.assertEqual( + self.exporter.export(self.metrics["sum_int"]), + MetricExportResult.FAILURE, + ) + wait_mock.assert_called_with(0.01) @patch( "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) - @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") - def test_unavailable_delay(self, mock_sleep, mock_expo): + def test_unavailable_delay(self, mock_expo): mock_expo.configure_mock(**{"return_value": [1]}) add_MetricsServiceServicer_to_server( MetricsServiceServicerUNAVAILABLEDelay(), self.server ) - self.assertEqual( - self.exporter.export(self.metrics["sum_int"]), - MetricExportResult.FAILURE, - ) - mock_sleep.assert_called_with(0.01) + with patch.object( + self.exporter._exporter._shutdown, # pylint: disable=protected-access + "wait", + ) as wait_mock: + self.assertEqual( + self.exporter.export(self.metrics["sum_int"]), + MetricExportResult.FAILURE, + ) + wait_mock.assert_called_with(0.01) @patch( "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) - @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.logger.error") - def test_unknown_logs(self, mock_logger_error, mock_sleep, mock_expo): + def test_unknown_logs(self, mock_logger_error, mock_expo): mock_expo.configure_mock(**{"return_value": [1]}) add_MetricsServiceServicer_to_server( MetricsServiceServicerUNKNOWN(), self.server ) - self.assertEqual( - self.exporter.export(self.metrics["sum_int"]), - MetricExportResult.FAILURE, - ) - mock_sleep.assert_not_called() + with patch.object( + self.exporter._exporter._shutdown, # pylint: disable=protected-access + "wait", + ) as wait_mock: + self.assertEqual( + self.exporter.export(self.metrics["sum_int"]), + MetricExportResult.FAILURE, + ) + wait_mock.assert_not_called() mock_logger_error.assert_called_with( "Failed to export %s to %s, error code: %s", "metrics", diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py index a4ddcd92ebe..dab77005dfd 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py @@ -525,33 +525,39 @@ def test_otlp_headers(self, mock_ssl_channel, mock_secure): @patch( "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) - @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") - def test_unavailable(self, mock_sleep, mock_expo): + def test_unavailable(self, mock_expo): mock_expo.configure_mock(**{"return_value": [0.01]}) add_TraceServiceServicer_to_server( TraceServiceServicerUNAVAILABLE(), self.server ) - result = self.exporter.export([self.span]) + with patch.object( + self.exporter._exporter._shutdown, # pylint: disable=protected-access + "wait", + ) as wait_mock: + result = self.exporter.export([self.span]) self.assertEqual(result, SpanExportResult.FAILURE) - mock_sleep.assert_called_with(0.01) + wait_mock.assert_called_with(0.01) @patch( "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) - @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") - def test_unavailable_delay(self, mock_sleep, mock_expo): + def test_unavailable_delay(self, mock_expo): mock_expo.configure_mock(**{"return_value": [1]}) add_TraceServiceServicer_to_server( TraceServiceServicerUNAVAILABLEDelay(), self.server ) - self.assertEqual( - self.exporter.export([self.span]), SpanExportResult.FAILURE - ) - mock_sleep.assert_called_with(0.01) + with patch.object( + self.exporter._exporter._shutdown, # pylint: disable=protected-access + "wait", + ) as wait_mock: + self.assertEqual( + self.exporter.export([self.span]), SpanExportResult.FAILURE + ) + wait_mock.assert_called_with(0.01) def test_success(self): add_TraceServiceServicer_to_server( diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py index 836ae447530..5bc81a04f73 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py @@ -329,8 +329,7 @@ def test_serialization(self, mock_post): ) @activate - @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") - def test_exponential_backoff(self, mock_sleep): + def test_exponential_backoff(self): # return a retryable error add( POST, @@ -344,8 +343,12 @@ def test_exponential_backoff(self, mock_sleep): ) metrics_data = self.metrics["sum_int"] - exporter.export(metrics_data) - mock_sleep.assert_has_calls( + with patch.object( + exporter._exporter._shutdown, # pylint: disable=protected-access + "wait", + ) as wait_mock: + exporter.export(metrics_data) + wait_mock.assert_has_calls( [call(1), call(2), call(4), call(8), call(16), call(32)] ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py index 1391629b313..8e598c5ab4e 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py @@ -270,8 +270,7 @@ def test_exported_log_without_span_id(self): self.fail("No log records found") @responses.activate - @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") - def test_exponential_backoff(self, mock_sleep): + def test_exponential_backoff(self): # return a retryable error responses.add( responses.POST, @@ -283,8 +282,12 @@ def test_exponential_backoff(self, mock_sleep): exporter = OTLPLogExporter(endpoint="http://logs.example.com/export") logs = self._get_sdk_log_data() - exporter.export(logs) - mock_sleep.assert_has_calls( + with patch.object( + exporter._exporter._shutdown, # pylint: disable=protected-access + "wait", + ) as wait_mock: + exporter.export(logs) + wait_mock.assert_has_calls( [call(1), call(2), call(4), call(8), call(16), call(32)] ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py index c0290559683..f739495c00a 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py @@ -234,8 +234,7 @@ def test_headers_parse_from_env(self): # pylint: disable=no-self-use @responses.activate - @patch("opentelemetry.exporter.otlp.proto.common.exporter.sleep") - def test_exponential_backoff(self, mock_sleep): + def test_exponential_backoff(self): # return a retryable error responses.add( responses.POST, @@ -258,8 +257,12 @@ def test_exponential_backoff(self, mock_sleep): ), ) - exporter.export([span]) - mock_sleep.assert_has_calls( + with patch.object( + exporter._exporter._shutdown, + "wait", # pylint: disable=protected-access + ) as wait_mock: + exporter.export([span]) + wait_mock.assert_has_calls( [call(1), call(2), call(4), call(8), call(16), call(32)] ) From 1686936447c1e09872b96bfc270b7beec60ed14b Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Sat, 14 Sep 2024 17:58:26 +0200 Subject: [PATCH 11/16] RetryableExporter: Reduce timeout during retries Derive a deadline from the initial timeout for each export_with_retry call and try to stick to this time frame as good as possible. Each call to export_function will get the remaining time as timeout parameter and is expected to return as quick in case of an issue so that export_with_retry can care for waiting and retrying. This also removes the surprsing deadline triggered by the max_value of the exponential backoff. --- .../exporter/otlp/proto/common/exporter.py | 78 ++++++++++++++++--- .../tests/test_retryable_exporter.py | 72 +++++++++-------- .../tests/logs/test_otlp_logs_exporter.py | 2 +- .../tests/test_otlp_exporter_mixin.py | 4 +- .../tests/test_otlp_metrics_exporter.py | 4 +- .../tests/test_otlp_trace_exporter.py | 4 +- .../metrics/test_otlp_metrics_exporter.py | 33 ++++---- .../tests/test_proto_log_exporter.py | 24 +++--- .../tests/test_proto_span_exporter.py | 24 +++--- 9 files changed, 160 insertions(+), 85 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py index 1d712ca9e30..210e119030f 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py @@ -14,6 +14,7 @@ import threading from contextlib import contextmanager from logging import getLogger +from time import time from typing import Callable, Generic, Iterator, Optional, Type, TypeVar from ._internal import _create_exp_backoff_generator @@ -31,6 +32,28 @@ def __init__(self, retry_delay_sec: Optional[int]): class RetryingExporter(Generic[ExportResultT]): + """OTLP exporter helper to handle retries and timeouts + + Encapsulates timeout behavior for shutdown and export tasks. + + Accepts a callable `export_function` of the form + + def export_function( + payload: object, + timeout_sec: float, + ) -> result: + .... + + that either returns the appropriate export result, or raises a RetryableExportError exception if + the encountered error should be retried. + + Args: + export_function: A callable handling a single export attempt to be used by + export_with_retry() + result: Enum-like type defining SUCCESS and FAILURE values returned by export. + timeout_sec: Timeout for exports in seconds. + """ + def __init__( self, export_function: Callable[[ExportPayloadT, float], ExportResultT], @@ -53,16 +76,41 @@ def shutdown(self, timeout_millis: float = 30_000) -> None: with acquire_timeout(self._export_lock, timeout_millis / 1e3): self._shutdown.set() - def export_with_retry(self, payload: ExportPayloadT) -> ExportResultT: + def export_with_retry( # pylint: disable=too-many-return-statements + self, payload: ExportPayloadT + ) -> ExportResultT: + """Exports payload with handling of retryable errors + + Calls the export_function provided at initialization with the following signature: + + export_function(payload, timeout_sec=remaining_time) + + where `remaining_time` is updated with each retry. + + Retries will be attempted using exponential backoff. If retry_delay_sec is specified in the + raised error, a retry attempt will not occur before that delay. If a retry after that delay + is not possible, will immediately abort without retrying. + + Will reattempt the export until timeout has passed, at which point the export will be + abandoned and a failure will be returned. A pending shutdown timing out will also cause + retries to time out. + + Note: Can block longer than timeout if export_function is blocking. Ensure export_function + blocks minimally and does not attempt retries. + + Args: + payload: Data to be exported, which is forwarded to the underlying export + """ # After the call to shutdown, subsequent calls to Export are # not allowed and should return a Failure result. if self._shutdown.is_set(): _logger.warning("Exporter already shutdown, ignoring batch") return self._result.FAILURE - with acquire_timeout( - self._export_lock, self._timeout_sec - ) as is_locked: + timeout_sec = self._timeout_sec + deadline_sec = time() + timeout_sec + + with acquire_timeout(self._export_lock, timeout_sec) as is_locked: if not is_locked: _logger.warning( "Exporter failed to acquire lock before timeout" @@ -73,9 +121,12 @@ def export_with_retry(self, payload: ExportPayloadT) -> ExportResultT: # expo returns a generator that yields delay values which grow # exponentially. Once delay is greater than max_value, the yielded # value will remain constant. - for delay in _create_exp_backoff_generator(max_value=max_value): - if delay == max_value: - return self._result.FAILURE + for delay_sec in _create_exp_backoff_generator( + max_value=max_value + ): + remaining_time_sec = deadline_sec - time() + if remaining_time_sec < 1e-09: + return self._result.FAILURE # Timed out if self._shutdown.is_set(): _logger.warning( @@ -84,14 +135,21 @@ def export_with_retry(self, payload: ExportPayloadT) -> ExportResultT: return self._result.FAILURE try: - return self._export_function(payload, self._timeout_sec) + return self._export_function(payload, remaining_time_sec) except RetryableExportError as exc: + time_remaining_sec = deadline_sec - time() + delay_sec = ( exc.retry_delay_sec if exc.retry_delay_sec is not None - else delay + else min(time_remaining_sec, delay_sec) ) - _logger.warning("Retrying in %ss", delay_sec) + + if delay_sec > time_remaining_sec: + # We should not exceed the requested timeout + return self._result.FAILURE + + _logger.warning("Retrying in %0.2fs", delay_sec) self._shutdown.wait(delay_sec) return self._result.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py index eec7c93473e..35d99c6feb1 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py @@ -18,7 +18,7 @@ from itertools import repeat from logging import WARNING from typing import Type -from unittest.mock import Mock, patch +from unittest.mock import ANY, Mock, patch from opentelemetry.exporter.otlp.proto.common.exporter import ( RetryableExportError, @@ -42,7 +42,12 @@ def test_export_no_retry(self): with self.assertLogs(level=WARNING): result = exporter.export_with_retry("payload") self.assertIs(result, result_type.SUCCESS) - export_func.assert_called_once_with("payload", 10.0) + export_func.assert_called_once_with( + "payload", ANY + ) # Timeout checked in the following line + self.assertAlmostEqual( + export_func.call_args_list[0][0][1], 10.0, places=4 + ) with self.subTest("Export Fail"): export_func.reset_mock() @@ -51,7 +56,12 @@ def test_export_no_retry(self): with self.assertLogs(exporter_logger, level=WARNING): result = exporter.export_with_retry("payload") self.assertIs(result, result_type.FAILURE) - export_func.assert_called_once_with("payload", 10.0) + export_func.assert_called_once_with( + "payload", ANY + ) # Timeout checked in the following line + self.assertAlmostEqual( + export_func.call_args_list[0][0][1], 10.0, places=4 + ) @patch( "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator", @@ -113,38 +123,36 @@ def test_export_uses_retry_delay(self, mock_backoff): result = exporter.export_with_retry("payload") self.assertIs(result, result_type.SUCCESS) self.assertEqual(wait_mock.call_count, len(side_effects) - 1) - self.assertEqual(wait_mock.call_args_list[0].args, (0.0,)) self.assertEqual(wait_mock.call_args_list[1].args, (0.25,)) self.assertEqual(wait_mock.call_args_list[2].args, (0.75,)) self.assertEqual(wait_mock.call_args_list[3].args, (1.00,)) - # Does not work like this yet. Will be enabled soon. - # @patch( - # "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator", - # return_value=repeat(0.1), - # ) - # def test_retry_delay_exceeds_timeout(self, mock_backoff): - # """ - # Test we timeout if we can't respect retry_delay. - # """ - # side_effects = [ - # RetryableExportError(0.25), - # RetryableExportError(1.0), # should timeout here - # result_type.SUCCESS, - # ] - - # mock_export_func = Mock(side_effect=side_effects) - # exporter = RetryingExporter( - # mock_export_func, - # result_type, - # timeout_sec=0.5, - # ) - - # with self.assertLogs(level=WARNING): - # self.assertEqual( - # exporter.export_with_retry("payload"), result_type.FAILURE - # ) - # self.assertEqual(mock_export_func.call_count, 2) + @patch( + "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator", + return_value=repeat(0.1), + ) + def test_retry_delay_exceeds_timeout(self, mock_backoff): + """ + Test we timeout if we can't respect retry_delay. + """ + side_effects = [ + RetryableExportError(0.25), + RetryableExportError(1.0), # should timeout here + result_type.SUCCESS, + ] + + mock_export_func = Mock(side_effect=side_effects) + exporter = RetryingExporter( + mock_export_func, + result_type, + timeout_sec=0.5, + ) + + with self.assertLogs(level=WARNING): + self.assertEqual( + exporter.export_with_retry("payload"), result_type.FAILURE + ) + self.assertEqual(mock_export_func.call_count, 2) def test_shutdown(self): """Test we refuse to export if shut down.""" @@ -301,7 +309,7 @@ def __call__(self, *args, **kwargs): self.assertTrue(exporter._shutdown) self.assertIs(export_wrapped.result, result_type.FAILURE) print(warning.records) - self.assertEqual(warning.records[0].message, "Retrying in 1s") + self.assertEqual(warning.records[0].message, "Retrying in 1.00s") self.assertEqual( warning.records[-1].message, "Export cancelled due to shutdown timing out", diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py index 3e85fe9e444..ef5a9f83d76 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py @@ -413,7 +413,7 @@ def test_unavailable(self, mock_expo): ) def test_unavailable_delay(self, mock_expo): - mock_expo.configure_mock(**{"return_value": [1]}) + mock_expo.configure_mock(**{"return_value": [0.01]}) add_LogsServiceServicer_to_server( LogsServiceServicerUNAVAILABLEDelay(), self.server diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index affad1d252b..faec76677d7 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -121,7 +121,7 @@ def trailing_metadata(self): "while exporting mock to localhost:4317" ), ) - self.assertEqual(warning.records[1].message, "Retrying in 0s") + self.assertEqual(warning.records[1].message, "Retrying in 0.00s") def test_shutdown(self): result_mock = Mock() @@ -194,7 +194,7 @@ def export(self, data): def _exporting(self) -> str: return "mock" - otlp_mock_exporter = OTLPMockExporter() + otlp_mock_exporter = OTLPMockExporter(timeout=0.05) # pylint: disable=protected-access export_thread = threading.Thread( diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py index 593df8c141d..46aa4a405ce 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py @@ -148,7 +148,7 @@ class TestOTLPMetricExporter(TestCase): def setUp(self): - self.exporter = OTLPMetricExporter() + self.exporter = OTLPMetricExporter(timeout=0.05) self.server = server(ThreadPoolExecutor(max_workers=10)) @@ -507,7 +507,7 @@ def test_unavailable(self, mock_expo): ) def test_unavailable_delay(self, mock_expo): - mock_expo.configure_mock(**{"return_value": [1]}) + mock_expo.configure_mock(**{"return_value": [0.01]}) add_MetricsServiceServicer_to_server( MetricsServiceServicerUNAVAILABLEDelay(), self.server diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py index dab77005dfd..e76a6f9aa31 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py @@ -134,7 +134,7 @@ class TestOTLPSpanExporter(TestCase): def setUp(self): tracer_provider = TracerProvider() - self.exporter = OTLPSpanExporter(insecure=True) + self.exporter = OTLPSpanExporter(insecure=True, timeout=0.05) tracer_provider.add_span_processor(SimpleSpanProcessor(self.exporter)) self.tracer = tracer_provider.get_tracer(__name__) @@ -545,7 +545,7 @@ def test_unavailable(self, mock_expo): ) def test_unavailable_delay(self, mock_expo): - mock_expo.configure_mock(**{"return_value": [1]}) + mock_expo.configure_mock(**{"return_value": [0.01]}) add_TraceServiceServicer_to_server( TraceServiceServicerUNAVAILABLEDelay(), self.server diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py index 5bc81a04f73..3b409159cd5 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py @@ -15,11 +15,12 @@ from logging import WARNING from os import environ from unittest import TestCase -from unittest.mock import MagicMock, Mock, call, patch +from unittest.mock import ANY, MagicMock, Mock, call, patch +import responses from requests import Session from requests.models import Response -from responses import POST, activate, add +from responses.registries import OrderedRegistry from opentelemetry.exporter.otlp.proto.common.metrics_encoder import ( encode_metrics, @@ -324,19 +325,25 @@ def test_serialization(self, mock_post): url=exporter._endpoint, data=serialized_data.SerializeToString(), verify=exporter._certificate_file, - timeout=exporter._timeout, + timeout=ANY, # Is checked in the following line cert=exporter._client_cert, ) + self.assertAlmostEqual( + mock_post.call_args_list[0][1]["timeout"], 10.0, places=4 + ) - @activate + # Pylint is wrong about this + @responses.activate( # pylint: disable=unexpected-keyword-arg,no-value-for-parameter + registry=OrderedRegistry + ) def test_exponential_backoff(self): - # return a retryable error - add( - POST, - "http://metrics.example.com/export", - json={"error": "something exploded"}, - status=500, - ) + for status in [500, 500, 500, 200]: + responses.add( + responses.POST, + "http://metrics.example.com/export", + json={"error": "something exploded"}, + status=status, + ) exporter = OTLPMetricExporter( endpoint="http://metrics.example.com/export" @@ -348,9 +355,7 @@ def test_exponential_backoff(self): "wait", ) as wait_mock: exporter.export(metrics_data) - wait_mock.assert_has_calls( - [call(1), call(2), call(4), call(8), call(16), call(32)] - ) + wait_mock.assert_has_calls([call(1), call(2), call(4)]) def test_aggregation_temporality(self): diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py index 8e598c5ab4e..fa9150cf58e 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py @@ -21,6 +21,7 @@ import requests import responses from google.protobuf.json_format import MessageToDict +from responses.registries import OrderedRegistry from opentelemetry._logs import SeverityNumber from opentelemetry.exporter.otlp.proto.http import Compression @@ -269,15 +270,18 @@ def test_exported_log_without_span_id(self): else: self.fail("No log records found") - @responses.activate + # Pylint is wrong about this + @responses.activate( # pylint: disable=unexpected-keyword-arg,no-value-for-parameter + registry=OrderedRegistry + ) def test_exponential_backoff(self): - # return a retryable error - responses.add( - responses.POST, - "http://logs.example.com/export", - json={"error": "something exploded"}, - status=500, - ) + for status in [500, 500, 500, 200]: + responses.add( + responses.POST, + "http://logs.example.com/export", + json={"error": "something exploded"}, + status=status, + ) exporter = OTLPLogExporter(endpoint="http://logs.example.com/export") logs = self._get_sdk_log_data() @@ -287,9 +291,7 @@ def test_exponential_backoff(self): "wait", ) as wait_mock: exporter.export(logs) - wait_mock.assert_has_calls( - [call(1), call(2), call(4), call(8), call(16), call(32)] - ) + wait_mock.assert_has_calls([call(1), call(2), call(4)]) @staticmethod def _get_sdk_log_data() -> List[LogData]: diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py index f739495c00a..b40b195c260 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py @@ -17,6 +17,7 @@ import requests import responses +from responses.registries import OrderedRegistry from opentelemetry.exporter.otlp.proto.http import Compression from opentelemetry.exporter.otlp.proto.http.trace_exporter import ( @@ -233,15 +234,18 @@ def test_headers_parse_from_env(self): ) # pylint: disable=no-self-use - @responses.activate + # Pylint is wrong about this + @responses.activate( # pylint: disable=unexpected-keyword-arg,no-value-for-parameter + registry=OrderedRegistry + ) def test_exponential_backoff(self): - # return a retryable error - responses.add( - responses.POST, - "http://traces.example.com/export", - json={"error": "something exploded"}, - status=500, - ) + for status in [500, 500, 500, 200]: + responses.add( + responses.POST, + "http://traces.example.com/export", + json={"error": "something exploded"}, + status=status, + ) exporter = OTLPSpanExporter( endpoint="http://traces.example.com/export" @@ -262,9 +266,7 @@ def test_exponential_backoff(self): "wait", # pylint: disable=protected-access ) as wait_mock: exporter.export([span]) - wait_mock.assert_has_calls( - [call(1), call(2), call(4), call(8), call(16), call(32)] - ) + wait_mock.assert_has_calls([call(1), call(2), call(4)]) def test_2xx_status_code(self): """ From 6c9332696dfc9c9e8be30b4205fdc6e865e142ad Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Tue, 16 Jul 2024 08:03:04 +0200 Subject: [PATCH 12/16] Add optional timeout argument to export_with_retry This change allows the user to call retrying exports with individiual timeouts. This argument will be needed in the next step, where the GRPC metric exporters batch processing is changed to respect the timeout for the whole export. --- .../exporter/otlp/proto/common/exporter.py | 17 +++++++++++++++-- .../tests/test_retryable_exporter.py | 9 +++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py index 210e119030f..7734413079e 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/exporter.py @@ -77,7 +77,9 @@ def shutdown(self, timeout_millis: float = 30_000) -> None: self._shutdown.set() def export_with_retry( # pylint: disable=too-many-return-statements - self, payload: ExportPayloadT + self, + payload: ExportPayloadT, + timeout_sec: Optional[float] = None, ) -> ExportResultT: """Exports payload with handling of retryable errors @@ -91,6 +93,9 @@ def export_with_retry( # pylint: disable=too-many-return-statements raised error, a retry attempt will not occur before that delay. If a retry after that delay is not possible, will immediately abort without retrying. + In case no timeout_sec is not given, the timeout defaults to the timeout given during + initialization. + Will reattempt the export until timeout has passed, at which point the export will be abandoned and a failure will be returned. A pending shutdown timing out will also cause retries to time out. @@ -107,9 +112,17 @@ def export_with_retry( # pylint: disable=too-many-return-statements _logger.warning("Exporter already shutdown, ignoring batch") return self._result.FAILURE - timeout_sec = self._timeout_sec + timeout_sec = ( + timeout_sec if timeout_sec is not None else self._timeout_sec + ) deadline_sec = time() + timeout_sec + # If negative timeout passed (from e.g. external batch deadline - see GRPC metric exporter) + # fail immediately + if timeout_sec <= 0: + _logger.warning("Export deadline passed, ignoring data") + return self._result.FAILURE + with acquire_timeout(self._export_lock, timeout_sec) as is_locked: if not is_locked: _logger.warning( diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py index 35d99c6feb1..65e29bedfda 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_retryable_exporter.py @@ -98,6 +98,15 @@ def test_export_retry(self, mock_backoff): self.assertEqual(export_func.call_count, len(side_effect)) self.assertIs(result, result_type.FAILURE) + def test_export_uses_arg_timout_when_given(self) -> None: + export_func = Mock(side_effect=RetryableExportError(None)) + exporter = RetryingExporter(export_func, result_type, timeout_sec=2) + with self.assertLogs(level=WARNING): + start = time.time() + exporter.export_with_retry("payload", 0.1) + duration = time.time() - start + self.assertAlmostEqual(duration, 0.1, places=1) + @patch( "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator", return_value=repeat(0.25), From cf6a6798cdd7ea17e4544c4126b5c6f0153e9792 Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Tue, 16 Jul 2024 09:13:39 +0200 Subject: [PATCH 13/16] GRPC metric exporter: Use timeout passed to export The timeout passed as argument to the export method is now overriding the default values set during initialization) once given. Additonally also ensure the batching in OTLPMetricExporter.export respects the export timeout. --- .../proto/grpc/metric_exporter/__init__.py | 16 +++- .../tests/test_otlp_metrics_exporter.py | 93 +++++++++++++++++++ 2 files changed, 105 insertions(+), 4 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py index d50262c0576..94a46615ce1 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py @@ -11,6 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from time import time from dataclasses import replace from logging import getLogger from os import environ @@ -157,18 +158,25 @@ def _translate_data( def export( self, metrics_data: MetricsData, - timeout_millis: float = 10_000, + timeout_millis: Optional[float] = None, **kwargs, ) -> MetricExportResult: - # TODO(#2663): OTLPExporterMixin should pass timeout to gRPC + timeout_sec = ( + timeout_millis / 1000 + if timeout_millis is not None + else self._exporter._timeout_sec # pylint: disable=protected-access + ) + if self._max_export_batch_size is None: - return self._exporter.export_with_retry(metrics_data) + return self._exporter.export_with_retry(metrics_data, timeout_sec) export_result = MetricExportResult.SUCCESS + deadline_sec = time() + timeout_sec for split_metrics_data in self._split_metrics_data(metrics_data): + time_remaining_sec = deadline_sec - time() split_export_result = self._exporter.export_with_retry( - split_metrics_data + split_metrics_data, timeout_sec=time_remaining_sec ) if split_export_result is MetricExportResult.FAILURE: diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py index 46aa4a405ce..3758ca4bc55 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py @@ -33,6 +33,9 @@ from google.rpc.error_details_pb2 import RetryInfo from grpc import ChannelCredentials, Compression, StatusCode, server +from opentelemetry.exporter.otlp.proto.common.exporter import ( + RetryableExportError, +) from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import ( OTLPMetricExporter, ) @@ -838,6 +841,96 @@ def test_split_metrics_data_many_resources_scopes_metrics(self): split_metrics_data, ) + @patch( + "opentelemetry.exporter.otlp.proto.grpc.metric_exporter.OTLPMetricExporter._export", + side_effect=RetryableExportError(None), + ) + def test_split_metrics_timeout(self, mock_export): + """ + Test that given a batch that will be split, timeout is respected across + the batch as a whole. + """ + metrics_data = MetricsData( + resource_metrics=[ + _resource_metrics( + index=1, + scope_metrics=[ + _scope_metrics( + index=1, + metrics=[ + _gauge( + index=1, + data_points=[ + _number_data_point(11), + ], + ), + _gauge( + index=2, + data_points=[ + _number_data_point(12), + ], + ), + ], + ), + _scope_metrics( + index=2, + metrics=[ + _gauge( + index=3, + data_points=[ + _number_data_point(13), + ], + ), + ], + ), + ], + ), + _resource_metrics( + index=2, + scope_metrics=[ + _scope_metrics( + index=3, + metrics=[ + _gauge( + index=4, + data_points=[ + _number_data_point(14), + ], + ), + ], + ), + ], + ), + ] + ) + split_metrics_data: List[MetricsData] = list( + # pylint: disable=protected-access + OTLPMetricExporter(max_export_batch_size=2)._split_metrics_data( + metrics_data=metrics_data, + ) + ) + self.assertEqual(len(split_metrics_data), 2) + exporter = OTLPMetricExporter(max_export_batch_size=2) + + timeout_s = 0.5 + # The first export should block the full timeout duration and succeed. + # The subsequent export should fail immediately as the timeout will + # have passed. + with self.assertLogs(level="WARNING") as warning: + self.assertIs( + exporter.export(metrics_data, timeout_s * 1e3), + MetricExportResult.FAILURE, + ) + # There could be multiple calls to export because of the jitter in backoff + self.assertNotIn( + split_metrics_data[1], + [call_args[1] for call_args in mock_export.call_args_list], + ) + self.assertEqual( + warning.records[-1].message, + "Export deadline passed, ignoring data", + ) + @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.secure_channel") def test_insecure_https_endpoint(self, mock_secure_channel): OTLPMetricExporter(endpoint="https://ab.c:123", insecure=True) From caf5aaaedd9889c2ff3726ebd3f976fb18f22fe4 Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Tue, 16 Jul 2024 17:11:28 +0200 Subject: [PATCH 14/16] GRPC log and span exporter: Use timeout passed to export The timeout passed as argument to the export method is now overriding the default values set during initialization if given. This change is made to align the API of all exporters. In the metric exporter this parameter was already existant but was not used. This was changed with the previous commit (2102e3b25b52). --- .../otlp/proto/grpc/_log_exporter/__init__.py | 13 +++++++-- .../proto/grpc/trace_exporter/__init__.py | 13 +++++++-- .../tests/logs/test_otlp_logs_exporter.py | 16 +++++++++++ .../tests/test_otlp_trace_exporter.py | 16 +++++++++++ .../otlp/proto/http/_log_exporter/__init__.py | 13 +++++++-- .../proto/http/trace_exporter/__init__.py | 18 +++++++++--- .../tests/test_proto_log_exporter.py | 19 +++++++++++++ .../tests/test_proto_span_exporter.py | 28 +++++++++++++++++++ 8 files changed, 126 insertions(+), 10 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py index 1dc590afec7..5e3eafa17c3 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py @@ -109,8 +109,17 @@ def _translate_data( ) -> ExportLogsServiceRequest: return encode_logs(data) - def export(self, batch: Sequence[LogData]) -> LogExportResult: - return self._exporter.export_with_retry(batch) + def export( + self, + batch: Sequence[LogData], + timeout_millis: Optional[float] = None, + ) -> LogExportResult: + return self._exporter.export_with_retry( + batch, + timeout_sec=( + timeout_millis / 1000 if timeout_millis is not None else None + ), + ) def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: OTLPExporterMixin.shutdown(self, timeout_millis=timeout_millis) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py index a93e5f99a70..da00502b7d5 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py @@ -139,8 +139,17 @@ def _translate_data( ) -> ExportTraceServiceRequest: return encode_spans(data) - def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: - return self._exporter.export_with_retry(spans) + def export( + self, + spans: Sequence[ReadableSpan], + timeout_millis: Optional[float] = None, + ) -> SpanExportResult: + return self._exporter.export_with_retry( + spans, + timeout_sec=( + timeout_millis / 1000 if timeout_millis is not None else None + ), + ) def shutdown(self) -> None: OTLPExporterMixin.shutdown(self) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py index ef5a9f83d76..9090019fc0b 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/logs/test_otlp_logs_exporter.py @@ -29,6 +29,9 @@ from opentelemetry._logs import SeverityNumber from opentelemetry.exporter.otlp.proto.common._internal import _encode_value +from opentelemetry.exporter.otlp.proto.common.exporter import ( + RetryableExportError, +) from opentelemetry.exporter.otlp.proto.grpc._log_exporter import ( OTLPLogExporter, ) @@ -388,6 +391,19 @@ def test_otlp_headers_from_env(self): (("user-agent", "OTel-OTLP-Exporter-Python/" + __version__),), ) + @patch( + "opentelemetry.exporter.otlp.proto.grpc._log_exporter.OTLPLogExporter._export", + side_effect=RetryableExportError(None), + ) + def test_export_uses_arg_timeout_when_given(self, export_mock) -> None: + exporter = OTLPLogExporter(timeout=20) + + with self.assertLogs(level="WARNING"): + start = time.time() + exporter.export([self.log_data_1], timeout_millis=100.0) + duration = time.time() - start + self.assertAlmostEqual(duration, 0.1, places=1) + @patch( "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py index e76a6f9aa31..0f27eb5ea03 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py @@ -33,6 +33,9 @@ from opentelemetry.exporter.otlp.proto.common._internal import ( _encode_key_value, ) +from opentelemetry.exporter.otlp.proto.common.exporter import ( + RetryableExportError, +) from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( OTLPSpanExporter, ) @@ -522,6 +525,19 @@ def test_otlp_headers(self, mock_ssl_channel, mock_secure): (("user-agent", "OTel-OTLP-Exporter-Python/" + __version__),), ) + @patch( + "opentelemetry.exporter.otlp.proto.grpc.trace_exporter.OTLPSpanExporter._export", + side_effect=RetryableExportError(None), + ) + def test_export_uses_arg_timeout_when_given(self, export_mock) -> None: + exporter = OTLPSpanExporter(timeout=20) + + with self.assertLogs(level="WARNING"): + start = time.time() + exporter.export([self.span], timeout_millis=100.0) + duration = time.time() - start + self.assertAlmostEqual(duration, 0.1, places=1) + @patch( "opentelemetry.exporter.otlp.proto.common.exporter._create_exp_backoff_generator" ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index 17d710fc465..3b772351671 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -173,7 +173,11 @@ def _retryable(resp: requests.Response) -> bool: return True return False - def export(self, batch: Sequence[LogData]) -> LogExportResult: + def export( + self, + batch: Sequence[LogData], + timeout_millis: Optional[float] = None, + ) -> LogExportResult: # After the call to Shutdown subsequent calls to Export are # not allowed and should return a Failure result. if self._shutdown: @@ -181,7 +185,12 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult: return LogExportResult.FAILURE serialized_data = encode_logs(batch).SerializeToString() - return self._exporter.export_with_retry(serialized_data) + return self._exporter.export_with_retry( + serialized_data, + timeout_sec=( + timeout_millis / 1000.0 if timeout_millis is not None else None + ), + ) def force_flush(self, timeout_millis: float = 10_000) -> bool: """Nothing is buffered in this exporter, so this method does nothing.""" diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index c3584604681..65863fe7c21 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -17,7 +17,7 @@ import zlib from io import BytesIO from os import environ -from typing import Dict, Optional +from typing import Dict, Optional, Sequence import requests @@ -44,6 +44,7 @@ OTEL_EXPORTER_OTLP_HEADERS, OTEL_EXPORTER_OTLP_TIMEOUT, ) +from opentelemetry.sdk.trace import ReadableSpan from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult from opentelemetry.exporter.otlp.proto.http import ( _OTLP_HTTP_HEADERS, @@ -170,10 +171,14 @@ def _retryable(resp: requests.Response) -> bool: return True return False - def _serialize_spans(self, spans): + def _serialize_spans(self, spans: Sequence[ReadableSpan]) -> str: return encode_spans(spans).SerializePartialToString() - def export(self, spans) -> SpanExportResult: + def export( + self, + spans: Sequence[ReadableSpan], + timeout_millis: Optional[float] = None, + ) -> SpanExportResult: # After the call to Shutdown subsequent calls to Export are # not allowed and should return a Failure result. if self._shutdown: @@ -181,7 +186,12 @@ def export(self, spans) -> SpanExportResult: return SpanExportResult.FAILURE serialized_data = self._serialize_spans(spans) - return self._exporter.export_with_retry(serialized_data) + return self._exporter.export_with_retry( + serialized_data, + timeout_sec=( + timeout_millis / 1000.0 if timeout_millis is not None else None + ), + ) def shutdown(self): if self._shutdown: diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py index fa9150cf58e..dc698a4c0ad 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py @@ -15,6 +15,7 @@ # pylint: disable=protected-access import unittest +from time import time from typing import List from unittest.mock import MagicMock, Mock, call, patch @@ -24,6 +25,9 @@ from responses.registries import OrderedRegistry from opentelemetry._logs import SeverityNumber +from opentelemetry.exporter.otlp.proto.common.exporter import ( + RetryableExportError, +) from opentelemetry.exporter.otlp.proto.http import Compression from opentelemetry.exporter.otlp.proto.http._log_exporter import ( DEFAULT_COMPRESSION, @@ -200,6 +204,21 @@ def test_exporter_env(self): ) self.assertIsInstance(exporter._session, requests.Session) + @patch( + "opentelemetry.exporter.otlp.proto.http._log_exporter.OTLPLogExporter._export", + side_effect=RetryableExportError(None), + ) + def test_export_uses_arg_timeout_when_given(self, export_mock) -> None: + exporter = OTLPLogExporter( + endpoint="http://traces.example.com/export", timeout=20 + ) + + with self.assertLogs(level="WARNING"): + start = time() + exporter.export(self._get_sdk_log_data(), timeout_millis=100.0) + duration = time() - start + self.assertAlmostEqual(duration, 0.1, places=1) + @staticmethod def export_log_and_deserialize(log): with patch("requests.Session.post") as mock_post: diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py index b40b195c260..1109d5647ae 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py @@ -13,12 +13,16 @@ # limitations under the License. import unittest +from time import time from unittest.mock import MagicMock, Mock, call, patch import requests import responses from responses.registries import OrderedRegistry +from opentelemetry.exporter.otlp.proto.common.exporter import ( + RetryableExportError, +) from opentelemetry.exporter.otlp.proto.http import Compression from opentelemetry.exporter.otlp.proto.http.trace_exporter import ( DEFAULT_COMPRESSION, @@ -233,6 +237,30 @@ def test_headers_parse_from_env(self): ), ) + @patch( + "opentelemetry.exporter.otlp.proto.http.trace_exporter.OTLPSpanExporter._export", + side_effect=RetryableExportError(None), + ) + def test_export_uses_arg_timeout_when_given(self, export_mock) -> None: + exporter = OTLPSpanExporter( + endpoint="http://traces.example.com/export", timeout=20 + ) + + span = _Span( + "abc", + context=Mock( + trace_state={"a": "b", "c": "d"}, + span_id=10217189687419569865, + trace_id=67545097771067222548457157018666467027, + ), + ) + + with self.assertLogs(level="WARNING"): + start = time() + exporter.export([span], timeout_millis=100.0) + duration = time() - start + self.assertAlmostEqual(duration, 0.1, places=1) + # pylint: disable=no-self-use # Pylint is wrong about this @responses.activate( # pylint: disable=unexpected-keyword-arg,no-value-for-parameter From 40fe96e42e6fd260d4af22209bcf96f5b3e9e18b Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Tue, 16 Jul 2024 17:20:21 +0200 Subject: [PATCH 15/16] Align HTTP metric exporter timeout with other exporters It's desirable to have a common API across all exporters. Since the previously existing timeout_millis argument wasn't used anyways, it should not be an issue to change the default value here. The None case will fall back to the object level timeout, which is the same as the previous default argument value. --- .../exporter/otlp/proto/http/metric_exporter/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 9f7cee5eff2..260df9570e6 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -216,7 +216,7 @@ def _retryable(resp: requests.Response) -> bool: def export( self, metrics_data: MetricsData, - timeout_millis: float = 10_000, + timeout_millis: Optional[float] = None, **kwargs, ) -> MetricExportResult: if self._shutdown: @@ -225,7 +225,10 @@ def export( serialized_data = encode_metrics(metrics_data) return self._exporter.export_with_retry( - serialized_data.SerializeToString() + serialized_data.SerializeToString(), + timeout_sec=( + timeout_millis / 1000 if timeout_millis is not None else None + ), ) def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: From 62a9101eed4bd7d38fc351d0d4fedf2deb446b04 Mon Sep 17 00:00:00 2001 From: Lars Michelsen Date: Sat, 14 Sep 2024 18:01:33 +0200 Subject: [PATCH 16/16] Add change log entry --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 516b10c2aa5..5a036b21782 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Remove `_start_time_unix_nano` attribute from `_ViewInstrumentMatch` in favor of using `time_ns()` at the moment when the aggregation object is created ([#4137](https://github.com/open-telemetry/opentelemetry-python/pull/4137)) +- Improve timeout and retry mechanic of exporters + ([#4183](https://github.com/open-telemetry/opentelemetry-python/pull/4183)) ## Version 1.26.0/0.47b0 (2024-07-25)