Skip to content

Commit

Permalink
FIX-modin-project#6778: Read parquet files without file extensions us…
Browse files Browse the repository at this point in the history
…ing fastparquet

In supporting fastparquet, modin takes the paths provided, globs them,
and filters them to only look at files with the .parq or .parquet
extension. This commit adds support so that if the path supplied is
explicitly a file, it will be included.

Signed-off-by: Ari Brown <[email protected]>
  • Loading branch information
Ari Brown committed Nov 30, 2023
1 parent 76d741b commit a0f9677
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 9 deletions.
32 changes: 23 additions & 9 deletions modin/core/io/column_stores/parquet_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,19 +285,33 @@ def files(self):
def to_pandas_dataframe(self, columns):
    """
    Read the dataset through ``self.dataset.to_pandas``.

    ``columns`` is forwarded unchanged as the ``columns`` keyword argument,
    and the result of that call is returned as-is (presumably a pandas
    DataFrame -- confirm against the type of ``self.dataset``).
    """
    return self.dataset.to_pandas(columns=columns)

# Karthik Velayutham writes:
#
# fastparquet doesn't have a nice method like PyArrow, so we
# have to copy some of their logic here while we work on getting
# an easier method to get a list of valid files.
# See: https://github.com/dask/fastparquet/issues/795
def _get_fastparquet_files(self):
    """
    List the parquet files that ``self.path`` refers to.

    fastparquet doesn't have a nice method like PyArrow, so some of its
    file-discovery logic is copied here until an easier way to get the list
    of valid files is available.
    See: https://github.com/dask/fastparquet/issues/795

    Returns
    -------
    list
        Paths produced by ``self.fs.glob`` (when the path contains ``*``)
        or by ``self.fs.find`` (otherwise).
    """
    if "*" in self.path:
        return self.fs.glob(self.path)

    found = self.fs.find(self.path)
    # (Resolving issue #6778)
    #
    # Users will pass in a directory to a delta table, which stores parquet
    # files in various directories along with other, non-parquet files. We
    # need to identify those parquet files and not the non-parquet files.
    #
    # However, we also need to support users passing in explicit files that
    # don't necessarily have the `.parq` or `.parquet` extension -- if a user
    # says that a file is parquet, then we should probably give it a shot.
    #
    # The check goes through `self.fs` rather than `os.path` so that it is
    # answered by the same filesystem that `find` and `glob` operate on;
    # `os.path.isfile` only ever inspects the local disk.
    if self.fs.isfile(self.path):
        return found
    return [f for f in found if f.endswith((".parquet", ".parq"))]


Expand Down
15 changes: 15 additions & 0 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -1464,6 +1464,21 @@ def comparator(df1, df2):
comparator=comparator,
)

# Tests issue #6778
def test_read_parquet_no_extension(self, engine, make_parquet_file):
    """Read a parquet file whose name carries no extension (issue #6778)."""
    with ensure_clean(".parquet") as unique_filename:
        # Remove the .parquet extension. `rindex` targets the last
        # occurrence, so a ".parquet" appearing earlier in the temporary
        # path cannot truncate the name in the wrong place.
        no_ext_fname = unique_filename[: unique_filename.rindex(".parquet")]

        make_parquet_file(filename=no_ext_fname)
        eval_io(
            fn_name="read_parquet",
            # read_parquet kwargs
            engine=engine,
            path=no_ext_fname,
        )

@pytest.mark.parametrize(
"filters",
[None, [], [("col1", "==", 5)], [("col1", "<=", 215), ("col2", ">=", 35)]],
Expand Down

0 comments on commit a0f9677

Please sign in to comment.