diff --git a/.vscode/settings.json b/.vscode/settings.json
index 2875801..1d62061 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,5 +1,7 @@
 {
     "python.linting.flake8Enabled": false,
     "python.linting.mypyEnabled": true,
-    "python.linting.enabled": true
+    "python.linting.enabled": true,
+    "python.analysis.inlayHints.callArgumentNames": "off",
+    "python.analysis.stubPath": "stubs"
 }
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c8e8b27..6c24cc0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,8 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+- Fixes a crash when creating a column from a null or complex type.
+
 ## [0.1.2] - 2023-10-02
 
 ## [0.1.1] - 2023-09-29
diff --git a/src/textual_fastdatatable/backend.py b/src/textual_fastdatatable/backend.py
index 4bcaad3..bb98968 100644
--- a/src/textual_fastdatatable/backend.py
+++ b/src/textual_fastdatatable/backend.py
@@ -6,6 +6,7 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
+import pyarrow.lib as pl
 import pyarrow.parquet as pq
 
 AutoBackendType = Union[
@@ -171,19 +172,17 @@ def column_content_widths(self) -> list[int]:
         if not self._column_content_widths:
             if self._string_data is None:
                 self._string_data = pa.Table.from_arrays(
-                    arrays=[arr.cast(pa.string()) for arr in self.data.columns],
+                    arrays=[
+                        self._safe_cast_arr_to_str(arr) for arr in self.data.columns
+                    ],
                     names=self.data.column_names,
                 )
             self._column_content_widths = [
-                pc.max(pc.utf8_length(arr)).as_py()
+                pc.max(pc.utf8_length(arr).fill_null(0)).as_py()
                 for arr in self._string_data.itercolumns()
             ]
         return self._column_content_widths
 
-    def _reset_content_widths(self) -> None:
-        self._string_data = None
-        self._column_content_widths = []
-
     def get_row_at(self, index: int) -> Sequence[Any]:
         row: Dict[str, Any] = self.data.slice(index, length=1).to_pylist()[0]
         return list(row.values())
@@ -267,3 +266,24 @@ def sort(
         indicated.
         """
         self.data = self.data.sort_by(by)
+
+    def _reset_content_widths(self) -> None:
+        self._string_data = None
+        self._column_content_widths = []
+
+    @staticmethod
+    def _safe_cast_arr_to_str(arr: pa._PandasConvertible) -> pa._PandasConvertible:
+        """
+        Safe here means avoiding type errors casting to str; ironically that means
+        setting PyArrow safe=False. If PyArrow can't do the cast (as with structs
+        and other nested types), we fall back to Python.
+        """
+        try:
+            return arr.cast(
+                pa.string(),
+                safe=False,
+            )
+        except pl.ArrowNotImplementedError:
+            # todo: vectorize this with a pyarrow udf
+            native_list = arr.to_pylist()
+            return pa.array([str(i) for i in native_list], type=pa.string())
diff --git a/src/textual_fastdatatable/data_table.py b/src/textual_fastdatatable/data_table.py
index f517c28..5813a10 100644
--- a/src/textual_fastdatatable/data_table.py
+++ b/src/textual_fastdatatable/data_table.py
@@ -508,7 +508,6 @@ def __init__(
         disabled: bool = False,
     ) -> None:
         super().__init__(name=name, id=id, classes=classes, disabled=disabled)
-        # TODO: HANDLE EMPTY CASE
         try:
             self.backend: DataTableBackend | None = (
                 backend if backend is not None else create_backend(data)  # type: ignore
diff --git a/stubs/pyarrow/__init__.pyi b/stubs/pyarrow/__init__.pyi
index f089f66..5ff89e7 100644
--- a/stubs/pyarrow/__init__.pyi
+++ b/stubs/pyarrow/__init__.pyi
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 from typing import Any, Iterable, Iterator, Literal, Mapping, Type, TypeVar
 
+from .compute import CastOptions
 from .types import DataType as DataType
 from .types import string as string
 
@@ -22,7 +23,7 @@ class _PandasConvertible:
         self: A,
         target_type: DataType | None = None,
         safe: bool = True,
-        options: Any | None = None,
+        options: CastOptions | None = None,
     ) -> A: ...
     def __getitem__(self, index: int) -> Scalar: ...
     def to_pylist(self) -> list[Any]: ...
@@ -31,6 +32,9 @@ class _PandasConvertible:
 class Array(_PandasConvertible): ...
 class ChunkedArray(_PandasConvertible): ...
 
+class StructArray(Array):
+    def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: ...
+
 T = TypeVar("T", bound="_Tabular")
 
 class _Tabular:
@@ -106,3 +110,6 @@ def nulls(
     type: DataType | None = None,  # noqa: A002
     memory_pool: MemoryPool | None = None,
 ) -> Array: ...
+def concat_arrays(
+    arrays: Iterable[Array], memory_pool: MemoryPool | None = None
+) -> Array: ...
diff --git a/stubs/pyarrow/compute.pyi b/stubs/pyarrow/compute.pyi
index 5d7f6ec..7b416c7 100644
--- a/stubs/pyarrow/compute.pyi
+++ b/stubs/pyarrow/compute.pyi
@@ -1,8 +1,23 @@
+from __future__ import annotations
+
 from . import MemoryPool, Scalar, _PandasConvertible
+from .types import DataType
 
 class Expression: ...
 class ScalarAggregateOptions: ...
 
+class CastOptions:
+    def __init__(
+        self,
+        target_type: DataType | None = None,
+        allow_int_overflow: bool | None = None,
+        allow_time_truncate: bool | None = None,
+        allow_time_overflow: bool | None = None,
+        allow_decimal_truncate: bool | None = None,
+        allow_float_truncate: bool | None = None,
+        allow_invalid_utf8: bool | None = None,
+    ) -> None: ...
+
 def max(  # noqa: A001
     array: _PandasConvertible,
     /,
diff --git a/stubs/pyarrow/lib.pyi b/stubs/pyarrow/lib.pyi
new file mode 100644
index 0000000..b7e377c
--- /dev/null
+++ b/stubs/pyarrow/lib.pyi
@@ -0,0 +1 @@
+class ArrowNotImplementedError(Exception): ...
diff --git a/stubs/pyarrow/types.pyi b/stubs/pyarrow/types.pyi
index 6d1e133..3017292 100644
--- a/stubs/pyarrow/types.pyi
+++ b/stubs/pyarrow/types.pyi
@@ -2,3 +2,4 @@ class DataType: ...
 
 def string() -> DataType: ...
 def is_null(t: DataType) -> bool: ...
+def is_struct(t: DataType) -> bool: ...