diff --git a/modin/conftest.py b/modin/conftest.py index d860807ec19..417f6fe70c3 100644 --- a/modin/conftest.py +++ b/modin/conftest.py @@ -306,43 +306,31 @@ def pytest_runtest_call(item): @pytest.fixture(scope="class") -def TestReadCSVFixture(): +def TestReadCSVFixture(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp("TestReadCSVFixture") filenames = [] - files_ids = [ - "test_read_csv_regular", - "test_read_csv_blank_lines", - "test_read_csv_yes_no", - "test_read_csv_nans", - "test_read_csv_bad_lines", - ] # each xdist worker spawned in separate process with separate namespace and dataset - pytest.csvs_names = {file_id: get_unique_filename() for file_id in files_ids} + pytest.csvs_names = {} # test_read_csv_col_handling, test_read_csv_parsing - _make_csv_file(filenames)( - filename=pytest.csvs_names["test_read_csv_regular"], - ) + pytest.csvs_names["test_read_csv_regular"] = _make_csv_file(data_dir=tmp_path)() # test_read_csv_parsing - _make_csv_file(filenames)( - filename=pytest.csvs_names["test_read_csv_yes_no"], + pytest.csvs_names["test_read_csv_yes_no"] = _make_csv_file(data_dir=tmp_path)( additional_col_values=["Yes", "true", "No", "false"], ) # test_read_csv_col_handling - _make_csv_file(filenames)( - filename=pytest.csvs_names["test_read_csv_blank_lines"], + pytest.csvs_names["test_read_csv_blank_lines"] = _make_csv_file(data_dir=tmp_path)( add_blank_lines=True, ) # test_read_csv_nans_handling - _make_csv_file(filenames)( - filename=pytest.csvs_names["test_read_csv_nans"], + pytest.csvs_names["test_read_csv_nans"] = _make_csv_file(data_dir=tmp_path)( add_blank_lines=True, additional_col_values=["", "N/A", "NA", "NULL", "custom_nan", "73"], ) # test_read_csv_error_handling - _make_csv_file(filenames)( - filename=pytest.csvs_names["test_read_csv_bad_lines"], + pytest.csvs_names["test_read_csv_bad_lines"] = _make_csv_file(data_dir=tmp_path)( add_bad_lines=True, ) - + filenames.extend(pytest.csvs_names.values()) yield # Delete csv files that were created teardown_test_files(filenames) @@ -350,19 +338,14 @@ def TestReadCSVFixture(): @pytest.fixture @doc(_doc_pytest_fixture, file_type="csv") -def make_csv_file(): - filenames = [] - - yield _make_csv_file(filenames) - - # Delete csv files that were created - teardown_test_files(filenames) +def make_csv_file(tmp_path): + yield _make_csv_file(data_dir=tmp_path) def create_fixture(file_type): @doc(_doc_pytest_fixture, file_type=file_type) - def fixture(): - func, filenames = make_default_file(file_type=file_type) + def fixture(tmp_path): + func, filenames = make_default_file(file_type=file_type, data_dir=tmp_path) yield func teardown_test_files(filenames) @@ -476,20 +459,18 @@ def _sql_connection(filename, table=""): @pytest.fixture(scope="class") -def TestReadGlobCSVFixture(): - filenames = [] +def TestReadGlobCSVFixture(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp("TestReadGlobCSVFixture") base_name = get_unique_filename(extension="") - pytest.glob_path = "{}_*.csv".format(base_name) - pytest.files = ["{}_{}.csv".format(base_name, i) for i in range(11)] + pytest.glob_path = str(tmp_path / "{}_*.csv".format(base_name)) + pytest.files = [str(tmp_path / "{}_{}.csv".format(base_name, i)) for i in range(11)] for fname in pytest.files: # Glob does not guarantee ordering so we have to remove the randomness in the generated csvs. - _make_csv_file(filenames)(fname, row_size=11, remove_randomness=True) + _make_csv_file(data_dir=tmp_path)(fname, row_size=11, remove_randomness=True) yield - teardown_test_files(filenames) - @pytest.fixture def get_generated_doc_urls(): diff --git a/modin/experimental/pandas/test/test_io_exp.py b/modin/experimental/pandas/test/test_io_exp.py index 4bfd8c9b7ab..dd7da32bd12 100644 --- a/modin/experimental/pandas/test/test_io_exp.py +++ b/modin/experimental/pandas/test/test_io_exp.py @@ -41,7 +41,7 @@ def test_from_sql_distributed(tmp_path, make_sql_connection): filename = "test_from_sql_distributed.db" table = "test_from_sql_distributed" - conn = make_sql_connection(tmp_path / filename, table) + conn = make_sql_connection(str(tmp_path / filename), table) query = "select * from {0}".format(table) pandas_df = pandas.read_sql(query, conn) @@ -73,7 +73,7 @@ def test_from_sql_distributed(tmp_path, make_sql_connection): def test_from_sql_defaults(tmp_path, make_sql_connection): filename = "test_from_sql_distributed.db" table = "test_from_sql_distributed" - conn = make_sql_connection(tmp_path / filename, table) + conn = make_sql_connection(str(tmp_path / filename), table) query = "select * from {0}".format(table) pandas_df = pandas.read_sql(query, conn) @@ -134,7 +134,7 @@ def test_read_csv_without_glob(self): storage_options={"anon": True}, ) - def test_read_csv_glob_4373(self): + def test_read_csv_glob_4373(self, tmp_path): columns, filename = ["col0"], "1x1.csv" df = pd.DataFrame([[1]], columns=columns) with ( @@ -142,7 +142,7 @@ def test_read_csv_glob_4373(self): if Engine.get() == "Dask" else contextlib.nullcontext() ): - df.to_csv(filename) + df.to_csv(str(tmp_path / filename)) kwargs = {"filepath_or_buffer": filename, "usecols": columns} modin_df = pd.read_csv_glob(**kwargs) @@ -203,9 +203,6 @@ def _pandas_read_csv_glob(path, storage_options): ) -test_default_to_pickle_filename = "test_default_to_pickle.pkl" - - @pytest.mark.skipif( Engine.get() not in ("Ray", "Unidist", "Dask"), reason=f"{Engine.get()} does not have experimental API", @@ -245,9 +242,9 @@ def _pandas_read_csv_glob(path, storage_options): ) @pytest.mark.parametrize("compression", [None, "gzip"]) @pytest.mark.parametrize( - "filename", [test_default_to_pickle_filename, "test_to_pickle*.pkl"] + "filename", ["test_default_to_pickle.pkl", "test_to_pickle*.pkl"] ) -def test_distributed_pickling(filename, compression): +def test_distributed_pickling(tmp_path, filename, compression): data = test_data["int_data"] df = pd.DataFrame(data) @@ -257,11 +254,13 @@ def test_distributed_pickling(filename, compression): with ( warns_that_defaulting_to_pandas() - if filename_param == test_default_to_pickle_filename + if filename_param == "test_default_to_pickle.pkl" else contextlib.nullcontext() ): - df.to_pickle_distributed(filename, compression=compression) - pickled_df = pd.read_pickle_distributed(filename, compression=compression) + df.to_pickle_distributed(str(tmp_path / filename), compression=compression) + pickled_df = pd.read_pickle_distributed( + str(tmp_path / filename), compression=compression + ) df_equals(pickled_df, df) pickle_files = glob.glob(filename) diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 6f1aa00c13a..fd801a61d12 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -52,7 +52,6 @@ from modin.utils import to_pandas from .utils import ( - COMP_TO_EXT, check_file_leaks, create_test_dfs, default_to_pandas_ignore_string, @@ -268,42 +267,38 @@ class TestCsv: def test_read_csv_delimiters( self, make_csv_file, sep, delimiter, decimal, thousands ): - with ensure_clean(".csv") as unique_filename: - make_csv_file( - filename=unique_filename, - delimiter=delimiter, - thousands_separator=thousands, - decimal_separator=decimal, - ) - - eval_io( - fn_name="read_csv", - # read_csv kwargs - filepath_or_buffer=unique_filename, - delimiter=delimiter, - sep=sep, - decimal=decimal, - thousands=thousands, - ) + unique_filename = make_csv_file( + delimiter=delimiter, + thousands_separator=thousands, + decimal_separator=decimal, + ) + eval_io( + fn_name="read_csv", + # read_csv kwargs + filepath_or_buffer=unique_filename, + delimiter=delimiter, + sep=sep, + decimal=decimal, + thousands=thousands, + ) @pytest.mark.parametrize( "dtype_backend", [lib.no_default, "numpy_nullable", "pyarrow"] ) def test_read_csv_dtype_backend(self, make_csv_file, dtype_backend): - with ensure_clean(".csv") as unique_filename: - make_csv_file(filename=unique_filename) + unique_filename = make_csv_file() - def comparator(df1, df2): - df_equals(df1, df2) - df_equals(df1.dtypes, df2.dtypes) + def comparator(df1, df2): + df_equals(df1, df2) + df_equals(df1.dtypes, df2.dtypes) - eval_io( - fn_name="read_csv", - # read_csv kwargs - filepath_or_buffer=unique_filename, - dtype_backend=dtype_backend, - comparator=comparator, - ) + eval_io( + fn_name="read_csv", + # read_csv kwargs + filepath_or_buffer=unique_filename, + dtype_backend=dtype_backend, + comparator=comparator, + ) # Column and Index Locations and Names tests @pytest.mark.parametrize("header", ["infer", None, 0]) @@ -421,38 +416,34 @@ def test_read_csv_parsing_2( names, encoding, ): - with ensure_clean(".csv") as unique_filename: - if encoding: - make_csv_file( - filename=unique_filename, - encoding=encoding, - ) - kwargs = { - "filepath_or_buffer": unique_filename - if encoding - else pytest.csvs_names["test_read_csv_regular"], - "header": header, - "skiprows": skiprows, - "nrows": nrows, - "names": names, - "encoding": encoding, - } + if encoding: + unique_filename = make_csv_file( + encoding=encoding, + ) + kwargs = { + "filepath_or_buffer": unique_filename + if encoding + else pytest.csvs_names["test_read_csv_regular"], + "header": header, + "skiprows": skiprows, + "nrows": nrows, + "names": names, + "encoding": encoding, + } - if Engine.get() != "Python": - df = pandas.read_csv(**dict(kwargs, nrows=1)) - # in that case first partition will contain str - if df[df.columns[0]][df.index[0]] in ["c1", "col1", "c3", "col3"]: - pytest.xfail( - "read_csv incorrect output with float data - issue #2634" - ) + if Engine.get() != "Python": + df = pandas.read_csv(**dict(kwargs, nrows=1)) + # in that case first partition will contain str + if df[df.columns[0]][df.index[0]] in ["c1", "col1", "c3", "col3"]: + pytest.xfail("read_csv incorrect output with float data - issue #2634") - eval_io( - fn_name="read_csv", - raising_exceptions=None, - check_kwargs_callable=not callable(skiprows), - # read_csv kwargs - **kwargs, - ) + eval_io( + fn_name="read_csv", + raising_exceptions=None, + check_kwargs_callable=not callable(skiprows), + # read_csv kwargs + **kwargs, + ) @pytest.mark.parametrize("true_values", [["Yes"], ["Yes", "true"], None]) @pytest.mark.parametrize("false_values", [["No"], ["No", "false"], None]) @@ -640,24 +631,16 @@ def test_read_csv_encoding_976(self): @pytest.mark.parametrize("encoding", [None, "latin8", "utf16"]) @pytest.mark.parametrize("engine", [None, "python", "c"]) def test_read_csv_compression(self, make_csv_file, compression, encoding, engine): - with ensure_clean(".csv") as unique_filename: - make_csv_file( - filename=unique_filename, encoding=encoding, compression=compression - ) - compressed_file_path = ( - f"{unique_filename}.{COMP_TO_EXT[compression]}" - if compression != "infer" - else unique_filename - ) + unique_filename = make_csv_file(encoding=encoding, compression=compression) - eval_io( - fn_name="read_csv", - # read_csv kwargs - filepath_or_buffer=compressed_file_path, - compression=compression, - encoding=encoding, - engine=engine, - ) + eval_io( + fn_name="read_csv", + # read_csv kwargs + filepath_or_buffer=unique_filename, + compression=compression, + encoding=encoding, + engine=engine, + ) @pytest.mark.parametrize( "encoding", @@ -685,15 +668,13 @@ def test_read_csv_compression(self, make_csv_file, compression, encoding, engine ], ) def test_read_csv_encoding(self, make_csv_file, encoding): - with ensure_clean(".csv") as unique_filename: - make_csv_file(filename=unique_filename, encoding=encoding) - - eval_io( - fn_name="read_csv", - # read_csv kwargs - filepath_or_buffer=unique_filename, - encoding=encoding, - ) + unique_filename = make_csv_file(encoding=encoding) + eval_io( + fn_name="read_csv", + # read_csv kwargs + filepath_or_buffer=unique_filename, + encoding=encoding, + ) @pytest.mark.parametrize("thousands", [None, ",", "_", " "]) @pytest.mark.parametrize("decimal", [".", "_"]) @@ -709,56 +690,50 @@ def test_read_csv_file_format( escapechar, dialect, ): - with ensure_clean(".csv") as unique_filename: - if dialect: - test_csv_dialect_params = { - "delimiter": "_", - "doublequote": False, - "escapechar": "\\", - "quotechar": "d", - "quoting": csv.QUOTE_ALL, - } - csv.register_dialect(dialect, **test_csv_dialect_params) - if dialect != "use_dialect_name": - # otherwise try with dialect name instead of `_csv.Dialect` object - dialect = csv.get_dialect(dialect) - make_csv_file(filename=unique_filename, **test_csv_dialect_params) - else: - make_csv_file( - filename=unique_filename, - thousands_separator=thousands, - decimal_separator=decimal, - escapechar=escapechar, - lineterminator=lineterminator, - ) - - if ( - (StorageFormat.get() == "Hdk") - and (escapechar is not None) - and (lineterminator is None) - and (thousands is None) - and (decimal == ".") - ): - with open(unique_filename, "r") as f: - if any( - line.find(f',"{escapechar}') != -1 for _, line in enumerate(f) - ): - pytest.xfail( - "Tests with this character sequence fail due to #5649" - ) - - eval_io( - raising_exceptions=None, - fn_name="read_csv", - # read_csv kwargs - filepath_or_buffer=unique_filename, - thousands=thousands, - decimal=decimal, - lineterminator=lineterminator, + if dialect: + test_csv_dialect_params = { + "delimiter": "_", + "doublequote": False, + "escapechar": "\\", + "quotechar": "d", + "quoting": csv.QUOTE_ALL, + } + csv.register_dialect(dialect, **test_csv_dialect_params) + if dialect != "use_dialect_name": + # otherwise try with dialect name instead of `_csv.Dialect` object + dialect = csv.get_dialect(dialect) + unique_filename = make_csv_file(**test_csv_dialect_params) + else: + unique_filename = make_csv_file( + thousands_separator=thousands, + decimal_separator=decimal, escapechar=escapechar, - dialect=dialect, + lineterminator=lineterminator, ) + if ( + (StorageFormat.get() == "Hdk") + and (escapechar is not None) + and (lineterminator is None) + and (thousands is None) + and (decimal == ".") + ): + with open(unique_filename, "r") as f: + if any(line.find(f',"{escapechar}') != -1 for _, line in enumerate(f)): + pytest.xfail("Tests with this character sequence fail due to #5649") + + eval_io( + raising_exceptions=None, + fn_name="read_csv", + # read_csv kwargs + filepath_or_buffer=unique_filename, + thousands=thousands, + decimal=decimal, + lineterminator=lineterminator, + escapechar=escapechar, + dialect=dialect, + ) + @pytest.mark.parametrize( "quoting", [csv.QUOTE_ALL, csv.QUOTE_MINIMAL, csv.QUOTE_NONNUMERIC, csv.QUOTE_NONE], @@ -780,26 +755,24 @@ def test_read_csv_quoting( not doublequote and quotechar != '"' and quoting != csv.QUOTE_NONE ) escapechar = "\\" if use_escapechar else None - with ensure_clean(".csv") as unique_filename: - make_csv_file( - filename=unique_filename, - quoting=quoting, - quotechar=quotechar, - doublequote=doublequote, - escapechar=escapechar, - comment_col_char=comment, - ) + unique_filename = make_csv_file( + quoting=quoting, + quotechar=quotechar, + doublequote=doublequote, + escapechar=escapechar, + comment_col_char=comment, + ) - eval_io( - fn_name="read_csv", - # read_csv kwargs - filepath_or_buffer=unique_filename, - quoting=quoting, - quotechar=quotechar, - doublequote=doublequote, - escapechar=escapechar, - comment=comment, - ) + eval_io( + fn_name="read_csv", + # read_csv kwargs + filepath_or_buffer=unique_filename, + quoting=quoting, + quotechar=quotechar, + doublequote=doublequote, + escapechar=escapechar, + comment=comment, + ) # Error Handling parameters tests @pytest.mark.skip(reason="https://github.com/modin-project/modin/issues/6239") @@ -839,6 +812,7 @@ def test_read_csv_internal( low_memory, memory_map, float_precision, + tmp_path, ): # In this case raised TypeError: cannot use a string pattern on a bytes-like object, # so TypeError should be excluded from raising_exceptions list in order to check, that @@ -866,29 +840,28 @@ def test_read_csv_internal( "float_precision": float_precision, } - with ensure_clean(".csv") as unique_filename: - if use_str_data: - str_delim_whitespaces = ( - "col1 col2 col3 col4\n5 6 7 8\n9 10 11 12\n" - ) - eval_io_from_str( - str_delim_whitespaces, - unique_filename, - raising_exceptions=raising_exceptions, - **kwargs, - ) - else: - make_csv_file( - filename=unique_filename, - delimiter=delimiter, - ) - - eval_io( - filepath_or_buffer=unique_filename, - fn_name="read_csv", - raising_exceptions=raising_exceptions, - **kwargs, - ) + if use_str_data: + str_delim_whitespaces = ( + "col1 col2 col3 col4\n5 6 7 8\n9 10 11 12\n" + ) + unique_filename = get_unique_filename(data_dir=tmp_path) + eval_io_from_str( + str_delim_whitespaces, + unique_filename, + raising_exceptions=raising_exceptions, + **kwargs, + ) + else: + unique_filename = make_csv_file( + delimiter=delimiter, + ) + + eval_io( + filepath_or_buffer=unique_filename, + fn_name="read_csv", + raising_exceptions=raising_exceptions, + **kwargs, + ) # Issue related, specific or corner cases @pytest.mark.parametrize("nrows", [2, None]) @@ -1192,21 +1165,13 @@ def wrapped_read_csv(file, method): def test_read_csv_file_handle( self, read_mode, make_csv_file, buffer_start_pos, set_async_read_mode ): - with ensure_clean() as unique_filename: - make_csv_file(filename=unique_filename) - - with open(unique_filename, mode=read_mode) as buffer: - buffer.seek(buffer_start_pos) - pandas_df = pandas.read_csv(buffer) - buffer.seek(buffer_start_pos) - modin_df = pd.read_csv(buffer) - if AsyncReadMode.get(): - # If read operations are asynchronous, then the dataframes - # check should be inside `ensure_clean` context - # because the file may be deleted before actual reading starts - df_equals(modin_df, pandas_df) - if not AsyncReadMode.get(): - df_equals(modin_df, pandas_df) + unique_filename = make_csv_file() + with open(unique_filename, mode=read_mode) as buffer: + buffer.seek(buffer_start_pos) + pandas_df = pandas.read_csv(buffer) + buffer.seek(buffer_start_pos) + modin_df = pd.read_csv(buffer) + df_equals(modin_df, pandas_df) def test_unnamed_index(self): def get_internal_df(df): @@ -1330,21 +1295,19 @@ def _check_relative_io(fn_name, unique_filename, path_arg, storage_default=()): # TODO(https://github.com/modin-project/modin/issues/3655): Get rid of this # commment once we turn all default to pandas messages into errors. def test_read_csv_relative_to_user_home(make_csv_file): - with ensure_clean(".csv") as unique_filename: - make_csv_file(filename=unique_filename) - _check_relative_io("read_csv", unique_filename, "filepath_or_buffer") + unique_filename = make_csv_file() + _check_relative_io("read_csv", unique_filename, "filepath_or_buffer") @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestTable: def test_read_table(self, make_csv_file): - with ensure_clean() as unique_filename: - make_csv_file(filename=unique_filename, delimiter="\t") - eval_io( - fn_name="read_table", - # read_table kwargs - filepath_or_buffer=unique_filename, - ) + unique_filename = make_csv_file(delimiter="\t") + eval_io( + fn_name="read_table", + # read_table kwargs + filepath_or_buffer=unique_filename, + ) @pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True) def test_read_table_within_decorator(self, make_csv_file, set_async_read_mode): @@ -1356,34 +1319,26 @@ def wrapped_read_table(file, method): if method == "modin": return pd.read_table(file) - with ensure_clean() as unique_filename: - make_csv_file(filename=unique_filename, delimiter="\t") + unique_filename = make_csv_file(delimiter="\t") - pandas_df = wrapped_read_table(unique_filename, method="pandas") - modin_df = wrapped_read_table(unique_filename, method="modin") + pandas_df = wrapped_read_table(unique_filename, method="pandas") + modin_df = wrapped_read_table(unique_filename, method="modin") if StorageFormat.get() == "Hdk": modin_df, pandas_df = align_datetime_dtypes(modin_df, pandas_df) - if AsyncReadMode.get(): - # If read operations are asynchronous, then the dataframes - # check should be inside `ensure_clean` context - # because the file may be deleted before actual reading starts - df_equals(modin_df, pandas_df) - if not AsyncReadMode.get(): - df_equals(modin_df, pandas_df) + df_equals(modin_df, pandas_df) def test_read_table_empty_frame(self, make_csv_file): - with ensure_clean() as unique_filename: - make_csv_file(filename=unique_filename, delimiter="\t") + unique_filename = make_csv_file(delimiter="\t") - eval_io( - fn_name="read_table", - # read_table kwargs - filepath_or_buffer=unique_filename, - usecols=["col1"], - index_col="col1", - ) + eval_io( + fn_name="read_table", + # read_table kwargs + filepath_or_buffer=unique_filename, + usecols=["col1"], + index_col="col1", + ) @pytest.mark.parametrize("engine", ["pyarrow", "fastparquet"]) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index ab1251ccf4f..2c73d8caa33 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -1343,9 +1343,9 @@ def generate_dataframe(row_size=NROWS, additional_col_values=None, idx_name=None return pandas.DataFrame(data, index=index) -def _make_csv_file(filenames): +def _make_csv_file(data_dir): def _csv_file_maker( - filename, + filename=None, row_size=NROWS, force=True, delimiter=",", @@ -1365,8 +1365,10 @@ def _csv_file_maker( escapechar=None, lineterminator=None, ): + if filename is None: + filename = get_unique_filename(data_dir=data_dir) if os.path.exists(filename) and not force: - pass + return None else: df = generate_dataframe(row_size, additional_col_values) if remove_randomness: @@ -1436,8 +1438,7 @@ def _csv_file_maker( encoding=encoding, **csv_reader_writer_params, ) - filenames.append(filename) - return df + return filename return _csv_file_maker @@ -1493,7 +1494,7 @@ def rotate_decimal_digits_or_symbols(value): return tens + ones * 10 -def make_default_file(file_type: str): +def make_default_file(file_type: str, data_dir: str): """Helper function for pytest fixtures.""" filenames = [] @@ -1523,9 +1524,8 @@ def _create_file(filenames, filename, force, nrows, ncols, func: str, func_kw=No } extension = file_type_to_extension.get(file_type, file_type) - def _make_default_file(filename=None, nrows=NROWS, ncols=2, force=True, **kwargs): - if filename is None: - filename = get_unique_filename(extension=extension) + def _make_default_file(nrows=NROWS, ncols=2, force=True, **kwargs): + filename = get_unique_filename(extension=extension, data_dir=data_dir) if file_type == "json": lines = kwargs.get("lines")