Skip to content

Commit

Permalink
fix: safely handle empty and complex columns (#7)
Browse files Browse the repository at this point in the history
  • Loading branch information
tconbeer authored Oct 9, 2023
1 parent 250097e commit 257f5d7
Show file tree
Hide file tree
Showing 8 changed files with 56 additions and 9 deletions.
4 changes: 3 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
{
"python.linting.flake8Enabled": false,
"python.linting.mypyEnabled": true,
"python.linting.enabled": true
"python.linting.enabled": true,
"python.analysis.inlayHints.callArgumentNames": "off",
"python.analysis.stubPath": "stubs"
}
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ All notable changes to this project will be documented in this file.

## [Unreleased]

- Fixes a crash when creating a column from a null or complex type.

## [0.1.2] - 2023-10-02

## [0.1.1] - 2023-09-29
Expand Down
32 changes: 26 additions & 6 deletions src/textual_fastdatatable/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.lib as pl
import pyarrow.parquet as pq

AutoBackendType = Union[
Expand Down Expand Up @@ -171,19 +172,17 @@ def column_content_widths(self) -> list[int]:
if not self._column_content_widths:
if self._string_data is None:
self._string_data = pa.Table.from_arrays(
arrays=[arr.cast(pa.string()) for arr in self.data.columns],
arrays=[
self._safe_cast_arr_to_str(arr) for arr in self.data.columns
],
names=self.data.column_names,
)
self._column_content_widths = [
pc.max(pc.utf8_length(arr)).as_py()
pc.max(pc.utf8_length(arr).fill_null(0)).as_py()
for arr in self._string_data.itercolumns()
]
return self._column_content_widths

def _reset_content_widths(self) -> None:
    """Clear the cached string projection and per-column widths.

    Called after the data changes so that ``column_content_widths`` lazily
    recomputes from the new table on its next access.
    """
    self._string_data = None
    self._column_content_widths = []

def get_row_at(self, index: int) -> Sequence[Any]:
    """Return the values of the row at ``index`` as a list, in column order.

    Slices a single row out of the backing table and converts it to native
    Python objects before extracting the values.
    """
    one_row = self.data.slice(index, length=1)
    record: Dict[str, Any] = one_row.to_pylist()[0]
    return list(record.values())
Expand Down Expand Up @@ -267,3 +266,24 @@ def sort(
indicated.
"""
self.data = self.data.sort_by(by)

def _reset_content_widths(self) -> None:
self._string_data = None
self._column_content_widths = []

@staticmethod
def _safe_cast_arr_to_str(arr: pa._PandasConvertible) -> pa._PandasConvertible:
    """
    Cast ``arr`` to a string array without raising on lossy conversions.

    Safe here means avoiding type errors casting to str; ironically that means
    setting PyArrow safe=False. If PyArrow can't do the cast (as with structs
    and other nested types), we fall back to stringifying each element in
    Python.
    """
    try:
        return arr.cast(
            pa.string(),
            safe=False,
        )
    except pl.ArrowNotImplementedError:
        # todo: vectorize this with a pyarrow udf
        # Preserve nulls as nulls (matching the cast path) rather than
        # producing the literal string "None", which would otherwise be
        # counted as 4 characters when measuring column widths.
        native_list = arr.to_pylist()
        return pa.array(
            [None if item is None else str(item) for item in native_list],
            type=pa.string(),
        )
1 change: 0 additions & 1 deletion src/textual_fastdatatable/data_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,6 @@ def __init__(
disabled: bool = False,
) -> None:
super().__init__(name=name, id=id, classes=classes, disabled=disabled)
# TODO: HANDLE EMPTY CASE
try:
self.backend: DataTableBackend | None = (
backend if backend is not None else create_backend(data) # type: ignore
Expand Down
9 changes: 8 additions & 1 deletion stubs/pyarrow/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ from __future__ import annotations

from typing import Any, Iterable, Iterator, Literal, Mapping, Type, TypeVar

from .compute import CastOptions
from .types import DataType as DataType
from .types import string as string

Expand All @@ -22,7 +23,7 @@ class _PandasConvertible:
self: A,
target_type: DataType | None = None,
safe: bool = True,
options: Any | None = None,
options: CastOptions | None = None,
) -> A: ...
def __getitem__(self, index: int) -> Scalar: ...
def to_pylist(self) -> list[Any]: ...
Expand All @@ -31,6 +32,9 @@ class _PandasConvertible:
class Array(_PandasConvertible): ...
class ChunkedArray(_PandasConvertible): ...

class StructArray(Array):
    # Stub for pyarrow.StructArray: flatten() returns one child Array per
    # struct field. Runtime behavior lives in PyArrow itself.
    def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: ...

T = TypeVar("T", bound="_Tabular")

class _Tabular:
Expand Down Expand Up @@ -106,3 +110,6 @@ def nulls(
type: DataType | None = None, # noqa: A002
memory_pool: MemoryPool | None = None,
) -> Array: ...
# Stub for pyarrow.concat_arrays: concatenates several Arrays into one.
def concat_arrays(
    arrays: Iterable[Array], memory_pool: MemoryPool | None = None
) -> Array: ...
15 changes: 15 additions & 0 deletions stubs/pyarrow/compute.pyi
Original file line number Diff line number Diff line change
@@ -1,8 +1,23 @@
from __future__ import annotations

from . import MemoryPool, Scalar, _PandasConvertible
from .types import DataType

class Expression: ...
class ScalarAggregateOptions: ...

class CastOptions:
    """Stub of ``pyarrow.compute.CastOptions``.

    Mirrors the tolerance flags the real constructor accepts for controlling
    which lossy conversions are allowed during a cast; used as the type of
    the ``options`` argument to ``_PandasConvertible.cast``.
    """

    def __init__(
        self,
        target_type: DataType | None = None,
        allow_int_overflow: bool | None = None,
        allow_time_truncate: bool | None = None,
        allow_time_overflow: bool | None = None,
        allow_decimal_truncate: bool | None = None,
        allow_float_truncate: bool | None = None,
        allow_invalid_utf8: bool | None = None,
    ) -> None: ...

def max( # noqa: A001
array: _PandasConvertible,
/,
Expand Down
1 change: 1 addition & 0 deletions stubs/pyarrow/lib.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Stub for pyarrow.lib.ArrowNotImplementedError — caught in backend.py when a
# cast to string is not supported for a column's type (e.g. nested structs).
class ArrowNotImplementedError(Exception): ...
1 change: 1 addition & 0 deletions stubs/pyarrow/types.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ class DataType: ...

# Stubs for pyarrow.types helpers used by the table backend.
def string() -> DataType: ...
# Predicate: true when the DataType is the null type.
def is_null(t: DataType) -> bool: ...
# Predicate: true when the DataType is a struct (nested) type.
def is_struct(t: DataType) -> bool: ...

0 comments on commit 257f5d7

Please sign in to comment.