Skip to content

Commit

Permalink
Merge pull request #312 from MC-kit/devel
Browse files Browse the repository at this point in the history
Stop using xarray
  • Loading branch information
dvp2015 authored Jan 31, 2024
2 parents 0d0e422 + cb11aac commit b14721f
Show file tree
Hide file tree
Showing 27 changed files with 408 additions and 1,318 deletions.
59 changes: 52 additions & 7 deletions README.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
==============================================================================
*xpypact*: FISPACT output to datasets converter
*xpypact*: FISPACT output to Polars or DuckDB converter
==============================================================================


Expand All @@ -16,22 +16,26 @@
Description
-----------

The module loads FISPACT JSON output as xarray dataset.
The module loads FISPACT JSON output files and converts to Polars dataframes
with minor data normalization.
This allows efficient data extraction and aggregation.
Multiple JSON files can be combined using simple additional identification for different
FISPACT runs. So far we use just two-dimensional identification by material
and case. The case usually identifies certain neutron flux.


Implemented functionality
-------------------------

- export to DuckDB
- export to parquet files

.. configures and runs FISPACT, converts FISPACT output to xarray datasets.
.. note::

Currently available FISPACT v.5 API uses rather old python version (3.6).
That prevents direct use of their API in our package (>=3.8).
That prevents direct use of their API in our package (>=3.10).
Check if own python integration with FISPACT is reasonable and feasible.
Or provide own FISPACT python binding.


Installation
Expand Down Expand Up @@ -61,9 +65,50 @@ From source
Examples
--------

.. note::
.. code-block::
from xpypact import FullDataCollector, Inventory
def get_material_id(p: Path) -> int:
...
def get_case_id(p: Path) -> int:
...
jsons = [path1, path2, ...]
material_ids = {p: get_material_id(p) for p in jsons}
case_ids = {p: get_case_id(p) for p in jsons}
collector = FullDataCollector()
for json in jsons:
inventory = Inventory.from_json(json)
collector.append(inventory, material_id=material_ids[json], case_id=case_ids[json])
collected = collector.get_result()
# save to parquet files
collected.save_to_parquets(Path.cwd() / "parquets")
# or use DuckDB database
from xpypact.dao import save
import duckdb as db
con = db.connect()
save(con, collected)
gamma_from_db = con.sql(
"""
select
g, rate
from timestep_gamma
where material_id = 1 and case_id = 54 and time_step_number = 7
order by g
""",
).fetchall()
Add examples
Contributing
------------
Expand Down
1 change: 1 addition & 0 deletions adhoc/demo_duckdb_multithreading.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
From: https://duckdb.org/docs/guides/python/multiple_threads.html
"""

from __future__ import annotations

import random
Expand Down
1 change: 1 addition & 0 deletions adhoc/demo_duckdb_parquet_access.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""TODO..."""

from __future__ import annotations

from pathlib import Path
Expand Down
1 change: 1 addition & 0 deletions benchmarks/test_inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
See https://pytest-benchmark.readthedocs.io/en/latest/index.html
"""

from __future__ import annotations

from typing import TYPE_CHECKING
Expand Down
1 change: 1 addition & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Nox sessions."""

from __future__ import annotations

from typing import TYPE_CHECKING, Final
Expand Down
613 changes: 256 additions & 357 deletions poetry.lock

Large diffs are not rendered by default.

11 changes: 3 additions & 8 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "xpypact"
version = "0.10.0"
version = "0.11.0"
description = "\"Python workflow framework for FISPACT.\""
authors = ["dvp <[email protected]>"]
license = "MIT"
Expand Down Expand Up @@ -48,21 +48,16 @@ Changelog = "https://github.com/MC-kit/xpypact/releases"


[tool.poetry.dependencies]
# msgspec-0.18.5 doesn't work on 3.9 - uses | without importing annotations
# msgspec-0.18.5 doesn't work on 3.9 - uses `|` without importing annotations from __future__
# duckdb-0.9.2, has no wheels for 3.12 and fails to build from source
python = ">=3.10,<3.13"
python = ">=3.9,<3.13"
duckdb = ">=0.8.0"
h5netcdf = ">=0.13.1"
# mckit-nuclides = {version = ">=0.2.5", allow-prereleases = true}
numpy = ">=1.26.0"
openpyxl = ">=3.0.9"
pandas = ">=2.0.0"
xarray = ">=2022.3.0"
multipledispatch = ">=0.6.0"
msgspec = ">=0.18.5"
rich = ">=13.7.0"
polars = {version = "^0.20.3", extras = ["all"]}
pyarrow = "^14.0.2"
mckit-nuclides = "^0.3.0"

[tool.poetry.group.dev.dependencies]
Expand Down
37 changes: 29 additions & 8 deletions src/xpypact/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,23 @@
Wraps FISPACT workflow. Transforms FISPACT output to xarray datasets.
"""

from __future__ import annotations

from importlib import metadata as _meta
from importlib.metadata import PackageNotFoundError, version

from .data_arrays import from_json, scale_by_flux, scale_by_mass
from .collector import (
FullDataCollector,
GammaSchema,
NuclideSchema,
RunDataSchema,
TimeStepNuclideSchema,
TimeStepSchema,
)
from .inventory import Inventory, RunDataCorrected
from .nuclide import Nuclide, NuclideInfo
from .time_step import DoseRate, GammaSpectrum, TimeStep

try:
__version__ = version(__name__)
Expand All @@ -23,15 +34,25 @@
__copyright__ = f"Copyright 2021 {__author__}"

__all__ = [
"__version__",
"__distribution__",
"__meta_data__",
"DoseRate",
"FullDataCollector",
"GammaSchema",
"GammaSpectrum",
"Inventory",
"Nuclide",
"NuclideInfo",
"NuclideSchema",
"RunDataCorrected",
"RunDataSchema",
"TimeStep",
"TimeStepNuclideSchema",
"TimeStepSchema",
"__author__",
"__author_email__",
"__copyright__",
"__distribution__",
"__license__",
"__meta_data__",
"__summary__",
"__copyright__",
"from_json",
"scale_by_flux",
"scale_by_mass",
"__version__",
]
11 changes: 6 additions & 5 deletions src/xpypact/collector.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""Collect data from multiple inventories to Polars tables."""

from __future__ import annotations

from typing import TYPE_CHECKING

import datetime
import datetime as dt
import sys
import threading

Expand All @@ -25,9 +26,9 @@


if sys.version_info >= (3, 11): # pragma: no cover
UTC = datetime.UTC
UTC = dt.UTC
else:
UTC = datetime.timezone.utc # pragma: no cover
UTC = dt.timezone.utc # pragma: no cover

RunDataSchema = OrderedDict(
material_id=pl.UInt32,
Expand Down Expand Up @@ -146,14 +147,14 @@ def append(self, inventory: Inventory, material_id: int, case_id: int) -> FullDa
def _append_rundata(self, inventory, material_id, case_id):
rundata = inventory.meta_info
st = strptime(rundata.timestamp, "%H:%M:%S %d %B %Y")
ts = datetime.datetime(
ts = dt.datetime( # noqa: DTZ001 - no tzinfo is available from the FISPACT output
year=st.tm_year,
month=st.tm_mon,
day=st.tm_mday,
hour=st.tm_hour,
minute=st.tm_min,
second=st.tm_sec,
tzinfo=UTC,
tzinfo=None,
)
rundata_df = pl.DataFrame(
[
Expand Down
1 change: 1 addition & 0 deletions src/xpypact/dao/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Interface and implementations of data access objects (DAO)."""

from __future__ import annotations

from .api import DataAccessInterface
Expand Down
1 change: 1 addition & 0 deletions src/xpypact/dao/api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Interface to data access facilities."""

from __future__ import annotations

from typing import TYPE_CHECKING
Expand Down
1 change: 1 addition & 0 deletions src/xpypact/dao/duckdb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""DAO implementation for DuckDB."""

from __future__ import annotations

from .implementation import DuckDBDAO, create_indices, save
Expand Down
12 changes: 5 additions & 7 deletions src/xpypact/dao/duckdb/implementation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Code to implement DuckDB DAO."""

from __future__ import annotations

from typing import TYPE_CHECKING
Expand All @@ -9,7 +10,6 @@

if TYPE_CHECKING:
import duckdb as db
import pandas as pd

from xpypact.collector import FullDataCollector

Expand All @@ -29,9 +29,9 @@ class DuckDBDAO(ms.Struct):

con: db.DuckDBPyConnection

def get_tables_info(self) -> pd.DataFrame:
def get_tables_info(self) -> db.DuckDBPyRelation:
"""Get information on tables in schema."""
return self.con.execute("select * from information_schema.tables").df()
return self.con.sql("select * from information_schema.tables")

def tables(self) -> tuple[str, str, str, str, str]:
"""List tables being used by xpypact dao.
Expand All @@ -43,13 +43,11 @@ def tables(self) -> tuple[str, str, str, str, str]:

def has_schema(self) -> bool:
"""Check if the schema is available in a database."""
db_tables = self.get_tables_info()
table_names = self.get_tables_info().select("table_name").fetchnumpy()["table_name"]

if len(db_tables) < len(self.tables()):
if len(table_names) < len(self.tables()):
return False

table_names = db_tables["table_name"].to_numpy()

return all(name in table_names for name in self.tables())

def create_schema(self) -> None:
Expand Down
Loading

0 comments on commit b14721f

Please sign in to comment.