Skip to content

Commit

Permalink
Implement DataSchema-6.0 (#48)
Browse files Browse the repository at this point in the history
* Initial Dataschema 6.0

* Fix tests.

* Add/fix tests.

* Fix readme.
  • Loading branch information
PeterKraus authored Nov 14, 2024
1 parent 0314c04 commit d8a3483
Show file tree
Hide file tree
Showing 15 changed files with 539 additions and 4 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@

Currently implemented schemas are:
- `Recipe` validator for **dgpost**, versions `{2.1, 1.0}`
- `DataSchema` validator for **yadg**, versions `{5.1, 5.0, 4.2, 4.1, 4.0}`
- `Payload` validator for **tomato**, at version `{0.2, 0.1}`
- `DataSchema` validator for **yadg**, versions `{6.0, 5.1, 5.0, 4.2, 4.1, 4.0}`
- `Payload` validator for **tomato**, versions `{1.0, 0.2, 0.1}`

2 changes: 2 additions & 0 deletions src/dgbowl_schemas/yadg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from . import dataschema
from pydantic import ValidationError
from pydantic.v1 import ValidationError as ValidationError_v1
from .dataschema_6_0 import DataSchema as DataSchema_6_0
from .dataschema_5_1 import DataSchema as DataSchema_5_1
from .dataschema_5_0 import DataSchema as DataSchema_5_0, Metadata as Metadata_5_0
from .dataschema_4_2 import DataSchema as DataSchema_4_2, Metadata as Metadata_4_2
Expand All @@ -11,6 +12,7 @@
logger = logging.getLogger(__name__)

models = {
"6.0": (DataSchema_6_0, None),
"5.1": (DataSchema_5_1, None),
"5.0": (DataSchema_5_0, Metadata_5_0),
"4.2": (DataSchema_4_2, Metadata_4_2),
Expand Down
2 changes: 1 addition & 1 deletion src/dgbowl_schemas/yadg/dataschema/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from ..dataschema_5_1 import (
from ..dataschema_6_0 import (
DataSchema,
StepDefaults,
FileType,
Expand Down
7 changes: 7 additions & 0 deletions src/dgbowl_schemas/yadg/dataschema_5_1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
FileType as FileType,
FileTypes as FileTypes,
)
from ..dataschema_6_0 import DataSchema as NewDataSchema


class DataSchema(BaseModel, extra="forbid"):
Expand All @@ -26,3 +27,9 @@ class DataSchema(BaseModel, extra="forbid"):
steps: Sequence[Step]
"""Input commands for :mod:`yadg`'s extractors, organised as a :class:`Sequence`
of :class:`Steps`."""

def update(self):
    """
    Convert this schema into the next schema version.

    The model is dumped without ``None``-valued and default-valued fields, the
    ``version`` entry is overwritten with ``"6.0"``, and the resulting mapping is
    passed to ``NewDataSchema`` for validation.
    """
    payload = {
        **self.model_dump(exclude_none=True, exclude_defaults=True),
        "version": "6.0",
    }
    return NewDataSchema(**payload)
28 changes: 28 additions & 0 deletions src/dgbowl_schemas/yadg/dataschema_6_0/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from pydantic import BaseModel, Field
from typing import Sequence, Optional, Mapping, Any, Literal
from .step import Step
from .stepdefaults import StepDefaults
from .filetype import ( # noqa: F401
ExtractorFactory as ExtractorFactory,
FileType as FileType,
FileTypes as FileTypes,
)


class DataSchema(BaseModel, extra="forbid"):
    """
    A :class:`pydantic.BaseModel` implementing ``DataSchema-6.0`` model
    introduced in ``yadg-6.0``.
    """

    version: Literal["6.0"]

    # In pydantic v2 an ``Optional[...]`` annotation without a default is still a
    # *required* field that merely accepts ``None``. The explicit ``None`` default
    # makes the field truly optional, so payloads in which a ``None`` metadata was
    # dropped (e.g. a dump made with ``exclude_none=True``) still validate.
    metadata: Optional[Mapping[str, Any]] = None
    """Input metadata for :mod:`yadg`. May be omitted or set to ``None``."""

    # ``default_factory`` alone is sufficient; combining it with the ``...``
    # "required" marker was redundant and misleading.
    step_defaults: StepDefaults = Field(default_factory=StepDefaults)
    """Default values for configuration of each :class:`Step`."""

    steps: Sequence[Step]
    """Input commands for :mod:`yadg`'s extractors, organised as a :class:`Sequence`
    of :class:`Step` objects."""
59 changes: 59 additions & 0 deletions src/dgbowl_schemas/yadg/dataschema_6_0/externaldate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from pydantic import BaseModel
from typing import Literal, Optional, Union


class ExternalDateFile(BaseModel):
    """Read external date information from file."""

    model_config = {"extra": "forbid"}

    class Content(BaseModel):
        model_config = {"extra": "forbid"}

        path: str
        """Location of the file holding the external date information."""

        type: str
        """Type of the file holding the external date information."""

        match: Union[str, None] = None
        """Optional string to be matched within the file."""

    file: Content


class ExternalDateFilename(BaseModel):
    """Read external date information from the file name."""

    model_config = {"extra": "forbid"}

    class Content(BaseModel):
        model_config = {"extra": "forbid"}

        format: str
        """``strptime``-like format string used to process the date."""

        len: int
        """How many characters, counted from the start of the filename, to parse."""

    filename: Content


class ExternalDateISOString(BaseModel):
    """Read a constant external date using an ISO-formatted string."""

    model_config = {"extra": "forbid"}

    # The only accepted key; unknown keys are rejected by the config above.
    isostring: str


class ExternalDateUTSOffset(BaseModel):
    """Read a constant external date using a Unix timestamp offset."""

    model_config = {"extra": "forbid"}

    # The only accepted key; unknown keys are rejected by the config above.
    utsoffset: float


class ExternalDate(BaseModel):
    """Supply timestamping information that are external to the processed file."""

    model_config = {"extra": "forbid"}

    using: Union[
        ExternalDateFile,
        ExternalDateFilename,
        ExternalDateISOString,
        ExternalDateUTSOffset,
    ]
    """Which external date source to use, given as one of the four
    ``ExternalDate*`` specifications."""

    mode: Literal["add", "replace"] = "add"
    """Either ``"add"`` the external timestamps to the parsed data, or
    ``"replace"`` the parsed data with them. Defaults to ``"add"``."""
220 changes: 220 additions & 0 deletions src/dgbowl_schemas/yadg/dataschema_6_0/filetype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
import sys
import inspect
from pydantic import BaseModel, Field, field_validator
from abc import ABC
from typing import Optional, Literal, Mapping, Any, TypeVar
import tzlocal
from babel import Locale
import logging

from .stepdefaults import StepDefaults
from .parameters import Timestamps, Timestamp

logger = logging.getLogger(__name__)


class FileType(BaseModel, ABC):
    """Template abstract base class for parser classes."""

    model_config = {"extra": "forbid"}

    filetype: Optional[str] = None
    timezone: Optional[str] = None
    locale: Optional[str] = None
    encoding: Optional[str] = None
    parameters: Optional[Any] = None

    @field_validator("timezone")
    @classmethod
    def timezone_resolve_localtime(cls, v):
        # The literal "localtime" is swapped for the name reported by tzlocal;
        # any other value (including None) passes through untouched.
        return tzlocal.get_localzone_name() if v == "localtime" else v

    @field_validator("locale")
    @classmethod
    def locale_validate_default(cls, v):
        # None means "not set" and is kept as-is; everything else is parsed by
        # babel and stored as the string form of the resulting Locale.
        if v is None:
            return v
        return str(Locale.parse(v))


class Example(FileType):
    # Selected by ``filetype: "example"``. Its parameters model declares no
    # fields of its own but accepts arbitrary extra keys.
    class Parameters(BaseModel):
        model_config = {"extra": "allow"}

    parameters: Parameters = Field(default_factory=Parameters)
    filetype: Literal["example"]


# Selected by ``filetype: "agilent.ch"``; all other fields are inherited
# unchanged from FileType.
class Agilent_ch(FileType):
    filetype: Literal["agilent.ch"]


# Selected by ``filetype: "agilent.dx"``; all other fields are inherited
# unchanged from FileType.
class Agilent_dx(FileType):
    filetype: Literal["agilent.dx"]


# Selected by ``filetype: "agilent.csv"``; all other fields are inherited
# unchanged from FileType.
class Agilent_csv(FileType):
    filetype: Literal["agilent.csv"]


class Basic_csv(FileType):
    # Selected by ``filetype: "basic.csv"``. Unlike most file types it carries
    # a strict (no extra keys) parameters model, declared below.
    class Parameters(BaseModel):
        model_config = {"extra": "forbid"}

        sep: str = ","
        """Column separator of the table; a comma unless overridden."""

        strip: Optional[str] = None
        """Characters, given as a :class:`str`, to strip from headers & data."""

        units: Optional[Mapping[str, str]] = None
        """Mapping of ``column: unit`` keypairs."""

        timestamp: Optional[Timestamps] = None
        """Timestamp specification from which the Unix timestamp of each table
        row can be calculated."""

    parameters: Parameters = Field(default_factory=Parameters)
    filetype: Literal["basic.csv"]


# Selected by ``filetype: "drycal.csv"``; all other fields are inherited
# unchanged from FileType.
class Drycal_csv(FileType):
    filetype: Literal["drycal.csv"]


# Selected by ``filetype: "drycal.rtf"``; all other fields are inherited
# unchanged from FileType.
class Drycal_rtf(FileType):
    filetype: Literal["drycal.rtf"]


# Selected by ``filetype: "drycal.txt"``; all other fields are inherited
# unchanged from FileType.
class Drycal_txt(FileType):
    filetype: Literal["drycal.txt"]


# Selected by ``filetype: "eclab.mpr"``; all other fields are inherited
# unchanged from FileType.
class EClab_mpr(FileType):
    filetype: Literal["eclab.mpr"]


class EClab_mpt(FileType):
    # Selected by ``filetype: "eclab.mpt"``; overrides the inherited encoding
    # default with "windows-1252".
    filetype: Literal["eclab.mpt"]
    encoding: Optional[str] = "windows-1252"

    @field_validator("encoding")
    @classmethod
    def set_encoding(cls, encoding):
        # A falsy value supplied by the user (None or an empty string) is
        # replaced by the class default; anything else is kept.
        if encoding:
            return encoding
        return "windows-1252"


# Selected by ``filetype: "empalc.csv"``; all other fields are inherited
# unchanged from FileType.
class EmpaLC_csv(FileType):
    filetype: Literal["empalc.csv"]


# Selected by ``filetype: "empalc.xlsx"``; all other fields are inherited
# unchanged from FileType.
class EmpaLC_xlsx(FileType):
    filetype: Literal["empalc.xlsx"]


# Selected by ``filetype: "ezchrom.dat"``; all other fields are inherited
# unchanged from FileType.
class EZChrom_dat(FileType):
    filetype: Literal["ezchrom.dat"]


class EZChrom_asc(FileType):
    # Selected by ``filetype: "ezchrom.asc"``; overrides the inherited encoding
    # default with "windows-1252".
    filetype: Literal["ezchrom.asc"]
    encoding: Optional[str] = "windows-1252"

    @field_validator("encoding")
    @classmethod
    def set_encoding(cls, encoding):
        # A falsy value supplied by the user (None or an empty string) is
        # replaced by the class default; anything else is kept.
        if encoding:
            return encoding
        return "windows-1252"


class FHI_csv(FileType):
    # Selected by ``filetype: "fhimcpt.csv"``. Its parameters default to a
    # timestamp read from column index 0 with the format given below.
    class Parameters(BaseModel):
        model_config = {"extra": "forbid"}

        timestamp: Timestamps = Field(
            default=Timestamp(
                timestamp={"index": 0, "format": "%Y-%m-%d-%H-%M-%S"},
            ),
        )

    parameters: Parameters = Field(default_factory=Parameters)
    filetype: Literal["fhimcpt.csv"]


# Selected by ``filetype: "fhimcpt.vna"``; all other fields are inherited
# unchanged from FileType.
class FHI_vna(FileType):
    filetype: Literal["fhimcpt.vna"]


# Selected by ``filetype: "fusion.json"``; all other fields are inherited
# unchanged from FileType.
class Fusion_json(FileType):
    filetype: Literal["fusion.json"]


# Selected by ``filetype: "fusion.zip"``; all other fields are inherited
# unchanged from FileType.
class Fusion_zip(FileType):
    filetype: Literal["fusion.zip"]


# Selected by ``filetype: "fusion.csv"``; all other fields are inherited
# unchanged from FileType.
class Fusion_csv(FileType):
    filetype: Literal["fusion.csv"]


# Selected by ``filetype: "panalytical.xy"``; all other fields are inherited
# unchanged from FileType.
class Panalytical_xy(FileType):
    filetype: Literal["panalytical.xy"]


# Selected by ``filetype: "panalytical.csv"``; all other fields are inherited
# unchanged from FileType.
class Panalytical_csv(FileType):
    filetype: Literal["panalytical.csv"]


# Selected by ``filetype: "picolog.tc08"``; all other fields are inherited
# unchanged from FileType.
class PicoLog_tc08(FileType):
    filetype: Literal["picolog.tc08"]


# Selected by ``filetype: "panalytical.xrdml"``; all other fields are inherited
# unchanged from FileType.
class Panalytical_xrdml(FileType):
    filetype: Literal["panalytical.xrdml"]


# Selected by ``filetype: "phi.spe"``; all other fields are inherited
# unchanged from FileType.
class Phi_spe(FileType):
    filetype: Literal["phi.spe"]


# Selected by ``filetype: "quadstar.sac"``; all other fields are inherited
# unchanged from FileType.
class Quadstar_sac(FileType):
    filetype: Literal["quadstar.sac"]


# Selected by ``filetype: "tomato.json"``; all other fields are inherited
# unchanged from FileType.
class Tomato_json(FileType):
    filetype: Literal["tomato.json"]


# Selected by ``filetype: "touchstone.snp"``; all other fields are inherited
# unchanged from FileType.
class Touchstone_snp(FileType):
    filetype: Literal["touchstone.snp"]


# Collect every concrete FileType subclass defined in (or imported into) this
# module, in the name-sorted order that inspect.getmembers yields, and expose
# them together as the constrained TypeVar ``FileTypes``.
classlist = []
for name, obj in inspect.getmembers(sys.modules[__name__]):
    # Skip anything that is not a class, as well as the abstract base itself.
    if not inspect.isclass(obj) or obj is FileType:
        continue
    if issubclass(obj, FileType):
        classlist.append(obj)
FileTypes = TypeVar("FileTypes", *classlist)


class ExtractorFactory(BaseModel):
    """
    Extractor factory class.

    Given an ``extractor=dict(filetype=k, ...)`` argument, attempts to determine the
    correct :class:`FileType`, parses any additionally supplied parameters for that
    :class:`FileType`, and back-fills defaults such as ``timezone``, ``locale``, and
    ``encoding``.

    The following is the current usage pattern in :mod:`yadg`:

    .. code-block::

        ftype = ExtractorFactory(extractor={"filetype": k}).extractor

    """

    extractor: FileTypes = Field(..., discriminator="filetype")

    @field_validator("extractor")
    @classmethod
    def extractor_set_defaults(cls, v):
        # Any of the three settings still left at None on the validated
        # extractor is filled in from a freshly constructed StepDefaults.
        fallback = StepDefaults()
        for attr in ("timezone", "locale", "encoding"):
            if getattr(v, attr) is None:
                setattr(v, attr, getattr(fallback, attr))
        return v
Loading

0 comments on commit d8a3483

Please sign in to comment.