Skip to content

Commit

Permalink
refactored the download module to have reusable clients (#817)
Browse files: browse the repository at this point in the history
  • Loading branch information
ethantang-db authored Nov 4, 2024
1 parent a3edf7d commit 06b1d7f
Show file tree
Hide file tree
Showing 9 changed files with 1,002 additions and 669 deletions.
3 changes: 3 additions & 0 deletions streaming/base/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,6 @@

# Time to wait, in seconds.
TICK = 0.007

# Default download timeout, in seconds.
DEFAULT_TIMEOUT = 60.0
4 changes: 2 additions & 2 deletions streaming/base/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from streaming.base.array import Array
from streaming.base.batching import generate_work
from streaming.base.constant import (BARRIER, BARRIER_FILELOCK, CACHE_FILELOCK, CACHE_USAGE,
EPOCH_DATA, EPOCH_SHAPE, NEXT_EPOCH, RESUME,
DEFAULT_TIMEOUT, EPOCH_DATA, EPOCH_SHAPE, NEXT_EPOCH, RESUME,
SHARD_ACCESS_TIMES, SHARD_STATES, TICK)
from streaming.base.distributed import maybe_init_dist
from streaming.base.format import get_index_basename
Expand Down Expand Up @@ -314,7 +314,7 @@ def __init__(self,
local: Optional[str] = None,
split: Optional[str] = None,
download_retry: int = 2,
download_timeout: float = 60,
download_timeout: float = DEFAULT_TIMEOUT,
validate_hash: Optional[str] = None,
keep_zip: bool = False,
epoch_size: Optional[Union[int, str]] = None,
Expand Down
33 changes: 17 additions & 16 deletions streaming/base/storage/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,16 @@

"""Base module for downloading/uploading files from/to cloud storage."""
# isort: off
from streaming.base.storage.download import (
download_file, download_from_alipan, download_from_azure, download_from_azure_datalake,
download_from_databricks_unity_catalog, download_from_dbfs, download_from_gcs,
download_from_hf, download_from_local, download_from_oci, download_from_s3, download_from_sftp)
from streaming.base.storage.download import (CloudDownloader, S3Downloader, SFTPDownloader,
GCSDownloader, OCIDownloader, AzureDownloader,
AzureDataLakeDownloader, HFDownloader,
DatabricksUnityCatalogDownloader, DBFSDownloader,
AlipanDownloader, LocalDownloader)
from streaming.base.storage.upload import (AzureDataLakeUploader, AzureUploader, CloudUploader,
GCSUploader, HFUploader, LocalUploader, OCIUploader,
S3Uploader)

__all__ = [
'download_file',
'CloudUploader',
'S3Uploader',
'GCSUploader',
Expand All @@ -21,15 +21,16 @@
'AzureUploader',
'AzureDataLakeUploader',
'HFUploader',
'download_from_s3',
'download_from_sftp',
'download_from_gcs',
'download_from_oci',
'download_from_azure',
'download_from_azure_datalake',
'download_from_databricks_unity_catalog',
'download_from_dbfs',
'download_from_alipan',
'download_from_local',
'download_from_hf',
'CloudDownloader',
'S3Downloader',
'SFTPDownloader',
'GCSDownloader',
'OCIDownloader',
'AzureDownloader',
'AzureDataLakeDownloader',
'HFDownloader',
'DatabricksUnityCatalogDownloader',
'DBFSDownloader',
'AlipanDownloader',
'LocalDownloader',
]
Loading

0 comments on commit 06b1d7f

Please sign in to comment.