Skip to content

Commit

Permalink
grow cache size limit, nice errors, handle corrupted files
Browse files Browse the repository at this point in the history
  • Loading branch information
Jan Škoda committed Sep 25, 2023
1 parent fe49a86 commit 8ef1840
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 5 deletions.
7 changes: 7 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@
History
=======

0.8.0 (2023-09-18)
------------------

* grow default cache size limit
* nicer error messages when data are missing
* skip corrupted files and print a warning instead of raising an error

0.7.0 (2023-09-18)
------------------

Expand Down
4 changes: 1 addition & 3 deletions lakeapi/_cache.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
'''
Cache downloaded data
You can adjust the module-level `default_bytes_limit`, but only before you use the cache.
'''
from typing import Any, Callable
import joblib

default_bytes_limit: int = 10_000_000_000
default_bytes_limit: int = 1_000_000_000_000
verbose_cache = 0

_store: joblib.Memory = joblib.Memory(
Expand Down
4 changes: 2 additions & 2 deletions lakeapi/_read_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,7 @@ def _read_parquet_file(
path = path,
)
if pq_file is None:
raise exceptions.InvalidFile(f"Invalid Parquet file: {path}")
return pa.Table.from_arrays(arrays=[], names=[])
return pq_file.read(columns=columns, use_threads=False, use_pandas_metadata=False)
except pyarrow.lib.ArrowInvalid:
raise pyarrow.lib.ArrowInvalid(path)
Expand Down Expand Up @@ -760,7 +760,7 @@ def read_parquet(
if path_root is not None and partition_filter is not None:
paths = _apply_partition_filter(path_root=path_root, paths=paths, filter_func=partition_filter)
if len(paths) < 1:
raise exceptions.NoFilesFound(f"No files Found on: {path}.")
raise exceptions.NoFilesFound("No data found for your query")
_logger.debug("paths:\n%s", paths)

args: Dict[str, Any] = {
Expand Down
5 changes: 5 additions & 0 deletions lakeapi/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,11 @@ def partition_filter(partition: Dict[str, str]) -> bool:
continue
else:
raise
except awswrangler.exceptions.NoFilesFound:
if is_anonymous_access:
raise awswrangler.exceptions.NoFilesFound("No data found for your query in the free sample dataset. Please subscribe to access more data.")
else:
raise
else:
# got error 404 both before and after the cache.clear()
raise last_ex
Expand Down

0 comments on commit 8ef1840

Please sign in to comment.