Move audio util `try_import_audio_packages` from `vllm/multimodal/utils.py` to `vllm/multimodal/audio.py`

Signed-off-by: DarkLight1337 <[email protected]>
vllm-project · Dec 25, 2024 · fdf13b0 · fdf13b0
1 parent b384a4c
commit fdf13b0
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 12 deletions.
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
@@ -1,3 +1,5 @@
+from typing import Any
+
 import numpy as np
 import numpy.typing as npt
 
@@ -26,6 +28,16 @@ def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
             "There is no default maximum multimodal tokens")
 
 
+def try_import_audio_packages() -> tuple[Any, Any]:
+    try:
+        import librosa
+        import soundfile
+    except ImportError as exc:
+        raise ImportError(
+            "Please install vllm[audio] for audio support.") from exc
+    return librosa, soundfile
+
+
 def resample_audio(
     audio: npt.NDArray[np.floating],
     *,

diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
@@ -2,7 +2,7 @@
 import os
 from functools import lru_cache
 from io import BytesIO
-from typing import Any, List, Optional, Tuple, TypeVar, Union
+from typing import List, Optional, Tuple, TypeVar, Union
 
 import numpy as np
 import numpy.typing as npt
@@ -14,6 +14,7 @@
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
 
+from .audio import try_import_audio_packages
 from .inputs import MultiModalDataDict, PlaceholderRange
 from .video import try_import_video_packages
 
@@ -205,16 +206,6 @@ async def async_fetch_video(video_url: str,
     return video
 
 
-def try_import_audio_packages() -> Tuple[Any, Any]:
-    try:
-        import librosa
-        import soundfile
-    except ImportError as exc:
-        raise ImportError(
-            "Please install vllm[audio] for audio support.") from exc
-    return librosa, soundfile
-
-
 def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
     """
     Load audio from a URL.

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
@@ -78,7 +78,7 @@ def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
         return 4096
 
 
-def try_import_video_packages() -> Any:
+def try_import_video_packages() -> tuple[Any, Any]:
     try:
         import cv2
         import decord