Skip to content

Commit

Permalink
Merge pull request #312 from MC-kit/devel
Browse files Browse the repository at this point in the history
Stop using xarray
  • Loading branch information
dvp2015 authored Jan 31, 2024
2 parents 0d0e422 + cb11aac commit b14721f
Show file tree
Hide file tree
Showing 27 changed files with 408 additions and 1,318 deletions.
59 changes: 52 additions & 7 deletions README.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
==============================================================================
*xpypact*: FISPACT output to datasets converter
*xpypact*: FISPACT output to Polars or DuckDB converter
==============================================================================


Expand All @@ -16,22 +16,26 @@
Description
-----------

The module loads FISPACT JSON output as xarray dataset.
The module loads FISPACT JSON output files and converts to Polars dataframes
with minor data normalization.
This allows efficient data extraction and aggregation.
Multiple JSON files can be combined using simple additional identification for different
FISPACT runs. So far we use just two-dimensional identification by material
and case. The case usually identifies certain neutron flux.


Implemented functionality
-------------------------

- export to DuckDB
- export to parquet files

.. configures and runs FISPACT, converts FISPACT output to xarray datasets.
.. note::

Currently available FISPACT v.5 API uses rather old python version (3.6).
That prevents direct use of their API in our package (>=3.8).
That prevents direct use of their API in our package (>=3.10).
Check if own python integration with FISPACT is reasonable and feasible.
Or provide own FISPACT python binding.


Installation
Expand Down Expand Up @@ -61,9 +65,50 @@ From source
Examples
--------

.. note::
.. code-block::
from xpypact import FullDataCollector, Inventory
def get_material_id(p: Path) -> int:
...
def get_case_id(p: Path) -> int:
...
jsons = [path1, path2, ...]
material_ids = {p: get_material_id(p) for p in jsons}
case_ids = {p: get_case_id(p) for p in jsons}
collector = FullDataCollector()
for json in jsons:
inventory = Inventory.from_json(json)
collector.append(inventory, material_id=material_ids[json], case_id=case_ids[json])
collected = collector.get_result()
# save to parquet files
collected.save_to_parquets(Path.cwd() / "parquets")
# or use DuckDB database
from xpypact.dao import save
import duckdb as db
con = db.connect()
save(con, collected)
gamma_from_db = con.sql(
"""
select
g, rate
from timestep_gamma
where material_id = 1 and case_id = 54 and time_step_number = 7
order by g
""",
).fetchall()
Add examples
Contributing
------------
Expand Down
1 change: 1 addition & 0 deletions adhoc/demo_duckdb_multithreading.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
From: https://duckdb.org/docs/guides/python/multiple_threads.html
"""

from __future__ import annotations

import random
Expand Down
1 change: 1 addition & 0 deletions adhoc/demo_duckdb_parquet_access.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""TODO..."""

from __future__ import annotations

from pathlib import Path
Expand Down
1 change: 1 addition & 0 deletions benchmarks/test_inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
See https://pytest-benchmark.readthedocs.io/en/latest/index.html
"""

from __future__ import annotations

from typing import TYPE_CHECKING
Expand Down
1 change: 1 addition & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Nox sessions."""

from __future__ import annotations

from typing import TYPE_CHECKING, Final
Expand Down
613 changes: 256 additions & 357 deletions poetry.lock

Large diffs are not rendered by default.

11 changes: 3 additions & 8 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "xpypact"
version = "0.10.0"
version = "0.11.0"
description = "\"Python workflow framework for FISPACT.\""
authors = ["dvp <[email protected]>"]
license = "MIT"
Expand Down Expand Up @@ -48,21 +48,16 @@ Changelog = "https://github.com/MC-kit/xpypact/releases"


[tool.poetry.dependencies]
# msgspec-0.18.5 doesn't work on 3.9 - uses | without importing annotations
# msgspec-0.18.5 doesn't work on 3.9 - uses `|` without importing annotations from __future__
# duckdb-0.9.2, has no wheels for 3.12 and fails to build from source
python = ">=3.10,<3.13"
python = ">=3.9,<3.13"
duckdb = ">=0.8.0"
h5netcdf = ">=0.13.1"
# mckit-nuclides = {version = ">=0.2.5", allow-prereleases = true}
numpy = ">=1.26.0"
openpyxl = ">=3.0.9"
pandas = ">=2.0.0"
xarray = ">=2022.3.0"
multipledispatch = ">=0.6.0"
msgspec = ">=0.18.5"
rich = ">=13.7.0"
polars = {version = "^0.20.3", extras = ["all"]}
pyarrow = "^14.0.2"
mckit-nuclides = "^0.3.0"

[tool.poetry.group.dev.dependencies]
Expand Down
37 changes: 29 additions & 8 deletions src/xpypact/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,23 @@
Wraps FISPACT workflow. Transforms FISPACT output to xarray datasets.
"""

from __future__ import annotations

from importlib import metadata as _meta
from importlib.metadata import PackageNotFoundError, version

from .data_arrays import from_json, scale_by_flux, scale_by_mass
from .collector import (
FullDataCollector,
GammaSchema,
NuclideSchema,
RunDataSchema,
TimeStepNuclideSchema,
TimeStepSchema,
)
from .inventory import Inventory, RunDataCorrected
from .nuclide import Nuclide, NuclideInfo
from .time_step import DoseRate, GammaSpectrum, TimeStep

try:
__version__ = version(__name__)
Expand All @@ -23,15 +34,25 @@
__copyright__ = f"Copyright 2021 {__author__}"

__all__ = [
"__version__",
"__distribution__",
"__meta_data__",
"DoseRate",
"FullDataCollector",
"GammaSchema",
"GammaSpectrum",
"Inventory",
"Nuclide",
"NuclideInfo",
"NuclideSchema",
"RunDataCorrected",
"RunDataSchema",
"TimeStep",
"TimeStepNuclideSchema",
"TimeStepSchema",
"__author__",
"__author_email__",
"__copyright__",
"__distribution__",
"__license__",
"__meta_data__",
"__summary__",
"__copyright__",
"from_json",
"scale_by_flux",
"scale_by_mass",
"__version__",
]
11 changes: 6 additions & 5 deletions src/xpypact/collector.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""Collect data from multiple inventories to Polars tables."""

from __future__ import annotations

from typing import TYPE_CHECKING

import datetime
import datetime as dt
import sys
import threading

Expand All @@ -25,9 +26,9 @@


if sys.version_info >= (3, 11): # pragma: no cover
UTC = datetime.UTC
UTC = dt.UTC
else:
UTC = datetime.timezone.utc # pragma: no cover
UTC = dt.timezone.utc # pragma: no cover

RunDataSchema = OrderedDict(
material_id=pl.UInt32,
Expand Down Expand Up @@ -146,14 +147,14 @@ def append(self, inventory: Inventory, material_id: int, case_id: int) -> FullDa
def _append_rundata(self, inventory, material_id, case_id):
rundata = inventory.meta_info
st = strptime(rundata.timestamp, "%H:%M:%S %d %B %Y")
ts = datetime.datetime(
ts = dt.datetime( # noqa: DTZ001 - no tzinfo is available from the FISPACT output
year=st.tm_year,
month=st.tm_mon,
day=st.tm_mday,
hour=st.tm_hour,
minute=st.tm_min,
second=st.tm_sec,
tzinfo=UTC,
tzinfo=None,
)
rundata_df = pl.DataFrame(
[
Expand Down
1 change: 1 addition & 0 deletions src/xpypact/dao/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Interface and implementations of data access objects (DAO)."""

from __future__ import annotations

from .api import DataAccessInterface
Expand Down
1 change: 1 addition & 0 deletions src/xpypact/dao/api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Interface to data access facilities."""

from __future__ import annotations

from typing import TYPE_CHECKING
Expand Down
1 change: 1 addition & 0 deletions src/xpypact/dao/duckdb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""DAO implementation for DuckDB."""

from __future__ import annotations

from .implementation import DuckDBDAO, create_indices, save
Expand Down
12 changes: 5 additions & 7 deletions src/xpypact/dao/duckdb/implementation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Code to implement DuckDB DAO."""

from __future__ import annotations

from typing import TYPE_CHECKING
Expand All @@ -9,7 +10,6 @@

if TYPE_CHECKING:
import duckdb as db
import pandas as pd

from xpypact.collector import FullDataCollector

Expand All @@ -29,9 +29,9 @@ class DuckDBDAO(ms.Struct):

con: db.DuckDBPyConnection

def get_tables_info(self) -> pd.DataFrame:
def get_tables_info(self) -> db.DuckDBPyRelation:
"""Get information on tables in schema."""
return self.con.execute("select * from information_schema.tables").df()
return self.con.sql("select * from information_schema.tables")

def tables(self) -> tuple[str, str, str, str, str]:
"""List tables being used by xpypact dao.
Expand All @@ -43,13 +43,11 @@ def tables(self) -> tuple[str, str, str, str, str]:

def has_schema(self) -> bool:
"""Check if the schema is available in a database."""
db_tables = self.get_tables_info()
table_names = self.get_tables_info().select("table_name").fetchnumpy()["table_name"]

if len(db_tables) < len(self.tables()):
if len(table_names) < len(self.tables()):
return False

table_names = db_tables["table_name"].to_numpy()

return all(name in table_names for name in self.tables())

def create_schema(self) -> None:
Expand Down
Loading

0 comments on commit b14721f

Please sign in to comment.