Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX-#6778: Read parquet files without file extensions using fastparquet #6789

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 23 additions & 9 deletions modin/core/io/column_stores/parquet_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,19 +285,33 @@ def files(self):
def to_pandas_dataframe(self, columns):
    # Delegate to the wrapped dataset object, forwarding ``columns`` as a
    # keyword argument, and hand back whatever its ``to_pandas`` produces.
    frame = self.dataset.to_pandas(columns=columns)
    return frame

# Karthik Velayutham writes:
#
# fastparquet doesn't have a nice method like PyArrow, so we
# have to copy some of their logic here while we work on getting
# an easier method to get a list of valid files.
# See: https://github.com/dask/fastparquet/issues/795
def _get_fastparquet_files(self):  # noqa: GL08
    # fastparquet doesn't have a nice method like PyArrow, so we
    # have to copy some of their logic here while we work on getting
    # an easier method to get a list of valid files.
    # See: https://github.com/dask/fastparquet/issues/795
    #
    # Returns the list of paths under ``self.path`` to read as parquet.
    if "*" in self.path:
        # Wildcard paths are expanded by the filesystem's own glob.
        return self.fs.glob(self.path)

    # (Resolving issue #6778)
    #
    # Users will pass in a directory to a delta table, which stores parquet
    # files in various directories along with other, non-parquet files. We
    # need to identify those parquet files and not the non-parquet files.
    #
    # However, we also need to support users passing in explicit files that
    # don't necessarily have the `.parq` or `.parquet` extension -- if a user
    # says that a file is parquet, then we should probably give it a shot.
    #
    # The check goes through ``self.fs`` rather than ``os.path`` so that it
    # is answered by the same filesystem object that is searched below;
    # ``os.path.isfile`` only looks at the local OS filesystem.
    if self.fs.isfile(self.path):
        return self.fs.find(self.path)

    # Directory case: keep only the files that carry a parquet extension.
    return [
        f for f in self.fs.find(self.path) if f.endswith((".parquet", ".parq"))
    ]


Expand Down
14 changes: 14 additions & 0 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -1464,6 +1464,20 @@ def comparator(df1, df2):
comparator=comparator,
)

# Tests issue #6778
def test_read_parquet_no_extension(self, engine, make_parquet_file):
    # Regression test: ``read_parquet`` should accept an explicitly named
    # file even when the name carries no ``.parquet``/``.parq`` extension.
    suffix = ".parquet"
    with ensure_clean(suffix) as unique_filename:
        # Remove the .parquet extension. ``rindex`` targets the last
        # occurrence, so a ".parquet" appearing earlier in the path (for
        # example in a directory name) cannot cut the name in the wrong place.
        no_ext_fname = unique_filename[: unique_filename.rindex(suffix)]

        make_parquet_file(filename=no_ext_fname)
        eval_io(
            fn_name="read_parquet",
            # read_parquet kwargs
            engine=engine,
            path=no_ext_fname,
        )

@pytest.mark.parametrize(
"filters",
[None, [], [("col1", "==", 5)], [("col1", "<=", 215), ("col2", ">=", 35)]],
Expand Down
Loading