Skip to content

Commit

Permalink
fix: safely handle empty and complex columns (#7)
Browse files Browse the repository at this point in the history
  • Loading branch information
tconbeer authored Oct 9, 2023
1 parent 250097e commit 257f5d7
Show file tree
Hide file tree
Showing 8 changed files with 56 additions and 9 deletions.
4 changes: 3 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
{
"python.linting.flake8Enabled": false,
"python.linting.mypyEnabled": true,
"python.linting.enabled": true
"python.linting.enabled": true,
"python.analysis.inlayHints.callArgumentNames": "off",
"python.analysis.stubPath": "stubs"
}
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ All notable changes to this project will be documented in this file.

## [Unreleased]

- Fixes a crash when creating a column from a null or complex type.

## [0.1.2] - 2023-10-02

## [0.1.1] - 2023-09-29
Expand Down
32 changes: 26 additions & 6 deletions src/textual_fastdatatable/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.lib as pl
import pyarrow.parquet as pq

AutoBackendType = Union[
Expand Down Expand Up @@ -171,19 +172,17 @@ def column_content_widths(self) -> list[int]:
if not self._column_content_widths:
if self._string_data is None:
self._string_data = pa.Table.from_arrays(
arrays=[arr.cast(pa.string()) for arr in self.data.columns],
arrays=[
self._safe_cast_arr_to_str(arr) for arr in self.data.columns
],
names=self.data.column_names,
)
self._column_content_widths = [
pc.max(pc.utf8_length(arr)).as_py()
pc.max(pc.utf8_length(arr).fill_null(0)).as_py()
for arr in self._string_data.itercolumns()
]
return self._column_content_widths

def _reset_content_widths(self) -> None:
    """Clear the cached string projection and per-column widths.

    Called after the data changes so that ``column_content_widths`` lazily
    recomputes from the new table on its next access.
    """
    self._string_data = None
    self._column_content_widths = []

def get_row_at(self, index: int) -> Sequence[Any]:
    """Return the values of the row at ``index`` as a list, in column order.

    Slices a single row out of the backing table and converts it to native
    Python objects before extracting the values.
    """
    one_row = self.data.slice(index, length=1)
    record: Dict[str, Any] = one_row.to_pylist()[0]
    return list(record.values())
Expand Down Expand Up @@ -267,3 +266,24 @@ def sort(
indicated.
"""
self.data = self.data.sort_by(by)

def _reset_content_widths(self) -> None:
self._string_data = None
self._column_content_widths = []

@staticmethod
def _safe_cast_arr_to_str(arr: pa._PandasConvertible) -> pa._PandasConvertible:
    """
    Cast ``arr`` to a string array without raising on lossy conversions.

    Safe here means avoiding type errors casting to str; ironically that means
    setting PyArrow safe=False. If PyArrow can't do the cast (as with structs
    and other nested types), we fall back to stringifying each element in
    Python.
    """
    try:
        return arr.cast(
            pa.string(),
            safe=False,
        )
    except pl.ArrowNotImplementedError:
        # todo: vectorize this with a pyarrow udf
        # Preserve nulls as nulls (matching the cast path) rather than
        # producing the literal string "None", which would otherwise be
        # counted as 4 characters when measuring column widths.
        native_list = arr.to_pylist()
        return pa.array(
            [None if item is None else str(item) for item in native_list],
            type=pa.string(),
        )
1 change: 0 additions & 1 deletion src/textual_fastdatatable/data_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,6 @@ def __init__(
disabled: bool = False,
) -> None:
super().__init__(name=name, id=id, classes=classes, disabled=disabled)
# TODO: HANDLE EMPTY CASE
try:
self.backend: DataTableBackend | None = (
backend if backend is not None else create_backend(data) # type: ignore
Expand Down
9 changes: 8 additions & 1 deletion stubs/pyarrow/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ from __future__ import annotations

from typing import Any, Iterable, Iterator, Literal, Mapping, Type, TypeVar

from .compute import CastOptions
from .types import DataType as DataType
from .types import string as string

Expand All @@ -22,7 +23,7 @@ class _PandasConvertible:
self: A,
target_type: DataType | None = None,
safe: bool = True,
options: Any | None = None,
options: CastOptions | None = None,
) -> A: ...
def __getitem__(self, index: int) -> Scalar: ...
def to_pylist(self) -> list[Any]: ...
Expand All @@ -31,6 +32,9 @@ class _PandasConvertible:
class Array(_PandasConvertible): ...
class ChunkedArray(_PandasConvertible): ...

class StructArray(Array):
    # Stub for pyarrow.StructArray: flatten() returns one child Array per
    # struct field. Runtime behavior lives in PyArrow itself.
    def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: ...

T = TypeVar("T", bound="_Tabular")

class _Tabular:
Expand Down Expand Up @@ -106,3 +110,6 @@ def nulls(
type: DataType | None = None, # noqa: A002
memory_pool: MemoryPool | None = None,
) -> Array: ...
# Stub for pyarrow.concat_arrays: concatenates several Arrays into one.
def concat_arrays(
    arrays: Iterable[Array], memory_pool: MemoryPool | None = None
) -> Array: ...
15 changes: 15 additions & 0 deletions stubs/pyarrow/compute.pyi
Original file line number Diff line number Diff line change
@@ -1,8 +1,23 @@
from __future__ import annotations

from . import MemoryPool, Scalar, _PandasConvertible
from .types import DataType

class Expression: ...
class ScalarAggregateOptions: ...

class CastOptions:
    """Stub of ``pyarrow.compute.CastOptions``.

    Mirrors the tolerance flags the real constructor accepts for controlling
    which lossy conversions are allowed during a cast; used as the type of
    the ``options`` argument to ``_PandasConvertible.cast``.
    """

    def __init__(
        self,
        target_type: DataType | None = None,
        allow_int_overflow: bool | None = None,
        allow_time_truncate: bool | None = None,
        allow_time_overflow: bool | None = None,
        allow_decimal_truncate: bool | None = None,
        allow_float_truncate: bool | None = None,
        allow_invalid_utf8: bool | None = None,
    ) -> None: ...

def max( # noqa: A001
array: _PandasConvertible,
/,
Expand Down
1 change: 1 addition & 0 deletions stubs/pyarrow/lib.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Stub for pyarrow.lib.ArrowNotImplementedError — caught in backend.py when a
# cast to string is not supported for a column's type (e.g. nested structs).
class ArrowNotImplementedError(Exception): ...
1 change: 1 addition & 0 deletions stubs/pyarrow/types.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ class DataType: ...

# Stubs for pyarrow.types helpers used by the table backend.
def string() -> DataType: ...
# Predicate: true when the DataType is the null type.
def is_null(t: DataType) -> bool: ...
# Predicate: true when the DataType is a struct (nested) type.
def is_struct(t: DataType) -> bool: ...

0 comments on commit 257f5d7

Please sign in to comment.