diff --git a/modin/core/io/column_stores/parquet_dispatcher.py b/modin/core/io/column_stores/parquet_dispatcher.py
index 6596ead087d..294245e63f8 100644
--- a/modin/core/io/column_stores/parquet_dispatcher.py
+++ b/modin/core/io/column_stores/parquet_dispatcher.py
@@ -285,19 +285,36 @@ def files(self):
     def to_pandas_dataframe(self, columns):
         return self.dataset.to_pandas(columns=columns)
 
+    # Karthik Velayutham writes:
+    #
+    # fastparquet doesn't have a nice method like PyArrow, so we
+    # have to copy some of their logic here while we work on getting
+    # an easier method to get a list of valid files.
+    # See: https://github.com/dask/fastparquet/issues/795
     def _get_fastparquet_files(self):  # noqa: GL08
-        # fastparquet doesn't have a nice method like PyArrow, so we
-        # have to copy some of their logic here while we work on getting
-        # an easier method to get a list of valid files.
-        # See: https://github.com/dask/fastparquet/issues/795
         if "*" in self.path:
             files = self.fs.glob(self.path)
         else:
-            files = [
-                f
-                for f in self.fs.find(self.path)
-                if f.endswith(".parquet") or f.endswith(".parq")
-            ]
+            # (Resolving issue #6778)
+            #
+            # Users will pass in a directory to a delta table, which stores parquet
+            # files in various directories along with other, non-parquet files. We
+            # need to identify those parquet files and not the non-parquet files.
+            #
+            # However, we also need to support users passing in explicit files that
+            # don't necessarily have the `.parq` or `.parquet` extension -- if a user
+            # says that a file is parquet, then we should probably give it a shot.
+            #
+            # The file check goes through `self.fs`, the same filesystem object
+            # used for `glob`/`find`, so it is answered wherever the path lives.
+            if self.fs.isfile(self.path):
+                files = self.fs.find(self.path)
+            else:
+                files = [
+                    f
+                    for f in self.fs.find(self.path)
+                    if f.endswith(".parquet") or f.endswith(".parq")
+                ]
         return files
 
 
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index 672bd3a8003..ddac2748847 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -1464,6 +1464,20 @@ def comparator(df1, df2):
             comparator=comparator,
         )
 
+    # Tests issue #6778
+    def test_read_parquet_no_extension(self, engine, make_parquet_file):
+        with ensure_clean(".parquet") as unique_filename:
+            # Remove the .parquet extension
+            no_ext_fname = unique_filename[: unique_filename.index(".parquet")]
+
+            make_parquet_file(filename=no_ext_fname)
+            eval_io(
+                fn_name="read_parquet",
+                # read_parquet kwargs
+                engine=engine,
+                path=no_ext_fname,
+            )
+
     @pytest.mark.parametrize(
         "filters",
         [None, [], [("col1", "==", 5)], [("col1", "<=", 215), ("col2", ">=", 35)]],