Skip to content

Commit

Permalink
Merge pull request #6289 from JakaKokosar/dask-test-owselectrows
Browse files Browse the repository at this point in the history
Dask: test Select Rows
  • Loading branch information
markotoplak committed Feb 1, 2023
2 parents fbaff72 + e7578b1 commit 3a1a379
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 69 deletions.
80 changes: 52 additions & 28 deletions Orange/data/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,50 @@ class DaskTable(Table):

_array_interface = da

def __new__(cls, *args, **kwargs):
    """
    Create a DaskTable, dispatching on the supplied arguments.

    - No arguments at all: return a bare, uninitialised instance. The
      alternate constructors rely on this (``from_arrays`` starts with
      ``cls()``).
    - First positional argument is a ``DaskTable``: build the new table
      with ``cls.from_table`` using that table's domain.
    - Anything else: forward all arguments to ``cls.from_arrays``.

    :raises TypeError: if a ``DaskTable`` is passed together with further
        positional arguments
    """
    if not args and not kwargs:
        return super().__new__(cls)

    # ``args`` may be empty when only keyword arguments were given
    # (e.g. ``DaskTable(domain=..., X=...)``); check it before indexing so
    # such calls reach ``from_arrays`` instead of raising IndexError.
    if args and isinstance(args[0], DaskTable):
        if len(args) > 1:
            raise TypeError("DaskTable(table: DaskTable) expects just one argument")
        return cls.from_table(args[0].domain, args[0])

    return cls.from_arrays(*args, **kwargs)

@classmethod
def from_arrays(cls, domain, X=None, Y=None, metas=None):
    """
    Build a table from a domain and already constructed arrays.

    The number of rows is taken from the first of ``X``, ``Y`` and
    ``metas`` that is not None; at least one of them has to be given.
    A missing ``X`` or ``Y`` is replaced by a dask array with that many
    rows and zero columns, a missing ``metas`` by a numpy array of the
    same shape.

    :param domain: stored as the table's ``domain``
    :param X: dask array or None
    :param Y: dask array or None
    :param metas: numpy array or None
    :return: the new table
    """
    table = cls()

    # the first array that was actually supplied determines the row count
    n_rows = next(
        (len(part) for part in (X, Y, metas) if part is not None),
        None,
    )
    assert n_rows is not None

    X = da.zeros((n_rows, 0), chunks=(n_rows, 0)) if X is None else X
    Y = da.zeros((n_rows, 0), chunks=(n_rows, 0)) if Y is None else Y
    metas = np.zeros((n_rows, 0)) if metas is None else metas

    assert isinstance(X, da.Array)
    assert isinstance(Y, da.Array)
    assert isinstance(metas, np.ndarray)

    table.domain = domain
    table._X, table._Y, table._metas = X, Y, metas
    table._W = np.ones((len(X), 0))  # weights are unsupported
    table._init_ids(table)

    return table

@classmethod
def from_file(cls, filename, sheet=None):
"""
Expand All @@ -42,47 +86,27 @@ def from_file(cls, filename, sheet=None):
:return: a new data table
:rtype: Orange.data.Table
"""
self = cls()

self.__h5file = f = h5py.File(filename, "r")
h5file = f = h5py.File(filename, "r")

def read_format_da(name):
# dask's automatic chunking has problems with 0-dimension arrays
if name in f and 0 not in f[name].shape:
return da.from_array(f[name])
return None

self._X = read_format_da("X")
self._Y = read_format_da("Y")
X = read_format_da("X")
Y = read_format_da("Y")

# metas are in memory
if "metas" in f:
self._metas = pickle.loads(np.array(f['metas']).tobytes())
metas = pickle.loads(np.array(f['metas']).tobytes())
else:
self._metas = None

size = None
# get size from X, Y, or metas
for el in ("_X", "_Y", "_metas"):
array = getattr(self, el)
if array is not None:
size = len(array)
break

if self._X is None:
self._X = da.zeros((size, 0), chunks=(size, 0))

if self._Y is None:
self._Y = da.zeros((size, 0), chunks=(size, 0))

if self._metas is None:
self._metas = np.zeros((size, 0))

self._W = np.ones((size, 0)) # weights are unsupported
metas = None

self.domain = pickle.loads(np.array(f['domain']).tobytes())
domain = pickle.loads(np.array(f['domain']).tobytes())

cls._init_ids(self)
self = DaskTable(domain, X, Y, metas)
self.__h5file = h5file

return self

Expand Down
14 changes: 10 additions & 4 deletions Orange/widgets/data/owselectrows.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from datetime import datetime, timezone, timedelta

import numpy as np
import dask.array as da

from AnyQt.QtWidgets import (
QWidget, QTableWidget, QHeaderView, QComboBox, QLineEdit, QToolButton,
Expand Down Expand Up @@ -883,10 +884,15 @@ def convert_timestamp(timestamp):
return datetime(1970, 1, 1, tzinfo=timezone.utc) + \
timedelta(seconds=int(timestamp))

min_datetime = convert_timestamp(
np.nanmin(column)).strftime(convert_format)
max_datetime = convert_timestamp(
np.nanmax(column)).strftime(convert_format)
min_timestamp = np.nanmin(column)
max_timestamp = np.nanmax(column)

if isinstance(column, da.Array):
min_timestamp, max_timestamp = da.compute(min_timestamp, max_timestamp)

min_datetime = convert_timestamp(min_timestamp).strftime(convert_format)
max_datetime = convert_timestamp(max_timestamp).strftime(convert_format)

return min_datetime, max_datetime


Expand Down
Loading

0 comments on commit 3a1a379

Please sign in to comment.