From 9f186cc7cc932f3c2cdf8a22fda52afdfe039353 Mon Sep 17 00:00:00 2001 From: Ari Brown Date: Thu, 30 Nov 2023 16:40:33 -0500 Subject: [PATCH 1/2] FIX-#6778: Read parquet files without file extensions using fastparquet In supporting fastparquet, modin takes the paths provided, globs them, and filters them to only look at files with the .parq or .parquet extension. This commit adds support so that if the path supplied is explicitly a file, it will be included. Signed-off-by: Ari Brown --- .../io/column_stores/parquet_dispatcher.py | 32 +++++++++++++------ modin/pandas/test/test_io.py | 14 ++++++++ 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/modin/core/io/column_stores/parquet_dispatcher.py b/modin/core/io/column_stores/parquet_dispatcher.py index 6596ead087d..294245e63f8 100644 --- a/modin/core/io/column_stores/parquet_dispatcher.py +++ b/modin/core/io/column_stores/parquet_dispatcher.py @@ -285,19 +285,33 @@ def files(self): def to_pandas_dataframe(self, columns): return self.dataset.to_pandas(columns=columns) + # Karthik Velayutham writes: + # + # fastparquet doesn't have a nice method like PyArrow, so we + # have to copy some of their logic here while we work on getting + # an easier method to get a list of valid files. + # See: https://github.com/dask/fastparquet/issues/795 def _get_fastparquet_files(self): # noqa: GL08 - # fastparquet doesn't have a nice method like PyArrow, so we - # have to copy some of their logic here while we work on getting - # an easier method to get a list of valid files. - # See: https://github.com/dask/fastparquet/issues/795 if "*" in self.path: files = self.fs.glob(self.path) else: - files = [ - f - for f in self.fs.find(self.path) - if f.endswith(".parquet") or f.endswith(".parq") - ] + # (Resolving issue #6778) + # + # Users will pass in a directory to a delta table, which stores parquet + # files in various directories along with other, non-parquet files. 
We + # need to identify those parquet files and not the non-parquet files. + # + # However, we also need to support users passing in explicit files that + # don't necessarily have the `.parq` or `.parquet` extension -- if a user + # says that a file is parquet, then we should probably give it a shot. + if os.path.isfile(self.path): + files = self.fs.find(self.path) + else: + files = [ + f + for f in self.fs.find(self.path) + if f.endswith(".parquet") or f.endswith(".parq") + ] return files diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 672bd3a8003..ddac2748847 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -1464,6 +1464,20 @@ def comparator(df1, df2): comparator=comparator, ) + # Tests issue #6778 + def test_read_parquet_no_extension(self, engine, make_parquet_file): + with ensure_clean(".parquet") as unique_filename: + # Remove the .parquet extension + no_ext_fname = unique_filename[:unique_filename.index(".parquet")] + + make_parquet_file(filename=no_ext_fname) + eval_io( + fn_name="read_parquet", + # read_parquet kwargs + engine=engine, + path=no_ext_fname + ) + @pytest.mark.parametrize( "filters", [None, [], [("col1", "==", 5)], [("col1", "<=", 215), ("col2", ">=", 35)]], From 1e761baee82a2b171142826661729dcc3647db60 Mon Sep 17 00:00:00 2001 From: Ari Brown Date: Thu, 30 Nov 2023 16:57:56 -0500 Subject: [PATCH 2/2] FIX-#6778: Read parquet files without file extensions using fastparquet In supporting fastparquet, modin takes the paths provided, globs them, and filters them to only look at files with the .parq or .parquet extension. This commit adds support so that if the path supplied is explicitly a file, it will be included. This commit also fixes the formatting issues for the `black` linter. 
Signed-off-by: Ari Brown --- modin/pandas/test/test_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index ddac2748847..da12c980a67 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -1468,14 +1468,14 @@ def comparator(df1, df2): def test_read_parquet_no_extension(self, engine, make_parquet_file): with ensure_clean(".parquet") as unique_filename: # Remove the .parquet extension - no_ext_fname = unique_filename[:unique_filename.index(".parquet")] + no_ext_fname = unique_filename[: unique_filename.index(".parquet")] make_parquet_file(filename=no_ext_fname) eval_io( fn_name="read_parquet", # read_parquet kwargs engine=engine, - path=no_ext_fname + path=no_ext_fname, ) @pytest.mark.parametrize(