diff --git a/README.md b/README.md
index cadaf21..69babc2 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,6 @@
 
 Currently implemented schemas are:
 - `Recipe` validator for **dgpost**, versions `{2.1, 1.0}`
-- `DataSchema` validator for **yadg**, versions `{5.1, 5.0, 4.2, 4.1, 4.0}`
-- `Payload` validator for **tomato**, at version `{0.2, 0.1}`
+- `DataSchema` validator for **yadg**, versions `{6.0, 5.1, 5.0, 4.2, 4.1, 4.0}`
+- `Payload` validator for **tomato**, versions `{1.0, 0.2, 0.1}`
 
diff --git a/src/dgbowl_schemas/yadg/__init__.py b/src/dgbowl_schemas/yadg/__init__.py
index 8c0c49c..77ce8f4 100644
--- a/src/dgbowl_schemas/yadg/__init__.py
+++ b/src/dgbowl_schemas/yadg/__init__.py
@@ -2,6 +2,7 @@
 from . import dataschema
 from pydantic import ValidationError
 from pydantic.v1 import ValidationError as ValidationError_v1
+from .dataschema_6_0 import DataSchema as DataSchema_6_0
 from .dataschema_5_1 import DataSchema as DataSchema_5_1
 from .dataschema_5_0 import DataSchema as DataSchema_5_0, Metadata as Metadata_5_0
 from .dataschema_4_2 import DataSchema as DataSchema_4_2, Metadata as Metadata_4_2
@@ -11,6 +12,7 @@
 logger = logging.getLogger(__name__)
 
 models = {
+    "6.0": (DataSchema_6_0, None),
     "5.1": (DataSchema_5_1, None),
     "5.0": (DataSchema_5_0, Metadata_5_0),
     "4.2": (DataSchema_4_2, Metadata_4_2),
diff --git a/src/dgbowl_schemas/yadg/dataschema/__init__.py b/src/dgbowl_schemas/yadg/dataschema/__init__.py
index ecf6f1b..c311141 100644
--- a/src/dgbowl_schemas/yadg/dataschema/__init__.py
+++ b/src/dgbowl_schemas/yadg/dataschema/__init__.py
@@ -1,4 +1,4 @@
-from ..dataschema_5_1 import (
+from ..dataschema_6_0 import (
     DataSchema,
     StepDefaults,
     FileType,
diff --git a/src/dgbowl_schemas/yadg/dataschema_5_1/__init__.py b/src/dgbowl_schemas/yadg/dataschema_5_1/__init__.py
index 1591d3b..898bc73 100644
--- a/src/dgbowl_schemas/yadg/dataschema_5_1/__init__.py
+++ b/src/dgbowl_schemas/yadg/dataschema_5_1/__init__.py
@@ -7,6 +7,7 @@
     FileType as FileType,
     FileTypes as FileTypes,
 )
+from ..dataschema_6_0 import DataSchema as NewDataSchema
 
 
 class DataSchema(BaseModel, extra="forbid"):
@@ -26,3 +27,10 @@ class DataSchema(BaseModel, extra="forbid"):
     steps: Sequence[Step]
     """Input commands for :mod:`yadg`'s extractors, organised as a :class:`Sequence`
     of :class:`Steps`."""
+
+    def update(self):
+        """Return this schema re-validated as a ``DataSchema-6.0`` object."""
+        nsch = self.model_dump(exclude_none=True, exclude_defaults=True)
+
+        nsch["version"] = "6.0"
+        return NewDataSchema(**nsch)
diff --git a/src/dgbowl_schemas/yadg/dataschema_6_0/__init__.py b/src/dgbowl_schemas/yadg/dataschema_6_0/__init__.py
new file mode 100644
index 0000000..231e21b
--- /dev/null
+++ b/src/dgbowl_schemas/yadg/dataschema_6_0/__init__.py
@@ -0,0 +1,28 @@
+from pydantic import BaseModel, Field
+from typing import Sequence, Optional, Mapping, Any, Literal
+from .step import Step
+from .stepdefaults import StepDefaults
+from .filetype import (  # noqa: F401
+    ExtractorFactory as ExtractorFactory,
+    FileType as FileType,
+    FileTypes as FileTypes,
+)
+
+
+class DataSchema(BaseModel, extra="forbid"):
+    """
+    A :class:`pydantic.BaseModel` implementing ``DataSchema-6.0`` model
+    introduced in ``yadg-6.0``.
+    """
+
+    version: Literal["6.0"]
+
+    metadata: Optional[Mapping[str, Any]]
+    """Input metadata for :mod:`yadg`."""
+
+    step_defaults: StepDefaults = Field(default_factory=StepDefaults)
+    """Default values for configuration of each :class:`Step`."""
+
+    steps: Sequence[Step]
+    """Input commands for :mod:`yadg`'s extractors, organised as a :class:`Sequence`
+    of :class:`Step` objects."""
diff --git a/src/dgbowl_schemas/yadg/dataschema_6_0/externaldate.py b/src/dgbowl_schemas/yadg/dataschema_6_0/externaldate.py
new file mode 100644
index 0000000..3b552ed
--- /dev/null
+++ b/src/dgbowl_schemas/yadg/dataschema_6_0/externaldate.py
@@ -0,0 +1,59 @@
+from pydantic import BaseModel
+from typing import Literal, Optional, Union
+
+
+class ExternalDateFile(BaseModel, extra="forbid"):
+    """Read external date information from file."""
+
+    class Content(BaseModel, extra="forbid"):
+        path: str
+        """Path to the external date information file."""
+
+        type: str
+        """Type of the external date information file."""
+
+        match: Optional[str] = None
+        """String to be matched within the file."""
+
+    file: Content
+
+
+class ExternalDateFilename(BaseModel, extra="forbid"):
+    """Read external date information from the file name."""
+
+    class Content(BaseModel, extra="forbid"):
+        format: str
+        """``strptime``-like format string for processing the date."""
+
+        len: int
+        """Number of characters from the start of the filename to parse."""
+
+    filename: Content
+
+
+class ExternalDateISOString(BaseModel, extra="forbid"):
+    """Read a constant external date using an ISO-formatted string."""
+
+    isostring: str
+
+
+class ExternalDateUTSOffset(BaseModel, extra="forbid"):
+    """Read a constant external date using a Unix timestamp offset."""
+
+    utsoffset: float
+
+
+class ExternalDate(BaseModel, extra="forbid"):
+    """Supply timestamping information that is external to the processed file."""
+
+    using: Union[
+        ExternalDateFile,
+        ExternalDateFilename,
+        ExternalDateISOString,
+        ExternalDateUTSOffset,
+    ]
+    """Specification of the external date format."""
+
+    mode: Literal["add", "replace"] = "add"
+    """Whether the external timestamps should be added to or should replace the
+    parsed data."""
diff --git a/src/dgbowl_schemas/yadg/dataschema_6_0/filetype.py b/src/dgbowl_schemas/yadg/dataschema_6_0/filetype.py
new file mode 100644
index 0000000..958cfd4
--- /dev/null
+++ b/src/dgbowl_schemas/yadg/dataschema_6_0/filetype.py
@@ -0,0 +1,225 @@
+import sys
+import inspect
+from pydantic import BaseModel, Field, field_validator
+from abc import ABC
+from typing import Optional, Literal, Mapping, Any, TypeVar
+import tzlocal
+from babel import Locale
+import logging
+
+from .stepdefaults import StepDefaults
+from .parameters import Timestamps, Timestamp
+
+logger = logging.getLogger(__name__)
+
+
+class FileType(BaseModel, ABC, extra="forbid"):
+    """Template abstract base class for parser classes."""
+
+    filetype: Optional[str] = None
+    timezone: Optional[str] = None
+    locale: Optional[str] = None
+    encoding: Optional[str] = None
+    parameters: Optional[Any] = None
+
+    @field_validator("timezone")
+    @classmethod
+    def timezone_resolve_localtime(cls, v):
+        """Replace the ``localtime`` keyword with the name of the local timezone."""
+        if v == "localtime":
+            v = tzlocal.get_localzone_name()
+        return v
+
+    @field_validator("locale")
+    @classmethod
+    def locale_validate_default(cls, v):
+        """Normalise a supplied locale string using :class:`babel.Locale`."""
+        if v is not None:
+            v = str(Locale.parse(v))
+        return v
+
+
+class Example(FileType):
+    class Parameters(BaseModel, extra="allow"):
+        pass
+
+    parameters: Parameters = Field(default_factory=Parameters)
+    filetype: Literal["example"]
+
+
+class Agilent_ch(FileType):
+    filetype: Literal["agilent.ch"]
+
+
+class Agilent_dx(FileType):
+    filetype: Literal["agilent.dx"]
+
+
+class Agilent_csv(FileType):
+    filetype: Literal["agilent.csv"]
+
+
+class Basic_csv(FileType):
+    class Parameters(BaseModel, extra="forbid"):
+        sep: str = ","
+        """Separator of table columns."""
+
+        strip: Optional[str] = None
+        """A :class:`str` of characters to strip from headers & data."""
+
+        units: Optional[Mapping[str, str]] = None
+        """A :class:`dict` containing ``column: unit`` keypairs."""
+
+        timestamp: Optional[Timestamps] = None
+        """Timestamp specification allowing calculation of Unix timestamp for
+        each table row."""
+
+    parameters: Parameters = Field(default_factory=Parameters)
+    filetype: Literal["basic.csv"]
+
+
+class Drycal_csv(FileType):
+    filetype: Literal["drycal.csv"]
+
+
+class Drycal_rtf(FileType):
+    filetype: Literal["drycal.rtf"]
+
+
+class Drycal_txt(FileType):
+    filetype: Literal["drycal.txt"]
+
+
+class EClab_mpr(FileType):
+    filetype: Literal["eclab.mpr"]
+
+
+class EClab_mpt(FileType):
+    filetype: Literal["eclab.mpt"]
+    encoding: Optional[str] = "windows-1252"
+
+    @field_validator("encoding")
+    @classmethod
+    def set_encoding(cls, encoding):
+        """Fall back to ``windows-1252`` when no encoding is given."""
+        return encoding or "windows-1252"
+
+
+class EmpaLC_csv(FileType):
+    filetype: Literal["empalc.csv"]
+
+
+class EmpaLC_xlsx(FileType):
+    filetype: Literal["empalc.xlsx"]
+
+
+class EZChrom_dat(FileType):
+    filetype: Literal["ezchrom.dat"]
+
+
+class EZChrom_asc(FileType):
+    filetype: Literal["ezchrom.asc"]
+    encoding: Optional[str] = "windows-1252"
+
+    @field_validator("encoding")
+    @classmethod
+    def set_encoding(cls, encoding):
+        """Fall back to ``windows-1252`` when no encoding is given."""
+        return encoding or "windows-1252"
+
+
+class FHI_csv(FileType):
+    class Parameters(BaseModel, extra="forbid"):
+        timestamp: Timestamps = Field(
+            Timestamp(timestamp={"index": 0, "format": "%Y-%m-%d-%H-%M-%S"})
+        )
+
+    parameters: Parameters = Field(default_factory=Parameters)
+    filetype: Literal["fhimcpt.csv"]
+
+
+class FHI_vna(FileType):
+    filetype: Literal["fhimcpt.vna"]
+
+
+class Fusion_json(FileType):
+    filetype: Literal["fusion.json"]
+
+
+class Fusion_zip(FileType):
+    filetype: Literal["fusion.zip"]
+
+
+class Fusion_csv(FileType):
+    filetype: Literal["fusion.csv"]
+
+
+class Panalytical_xy(FileType):
+    filetype: Literal["panalytical.xy"]
+
+
+class Panalytical_csv(FileType):
+    filetype: Literal["panalytical.csv"]
+
+
+class PicoLog_tc08(FileType):
+    filetype: Literal["picolog.tc08"]
+
+
+class Panalytical_xrdml(FileType):
+    filetype: Literal["panalytical.xrdml"]
+
+
+class Phi_spe(FileType):
+    filetype: Literal["phi.spe"]
+
+
+class Quadstar_sac(FileType):
+    filetype: Literal["quadstar.sac"]
+
+
+class Tomato_json(FileType):
+    filetype: Literal["tomato.json"]
+
+
+class Touchstone_snp(FileType):
+    filetype: Literal["touchstone.snp"]
+
+
+classlist = []
+for name, obj in inspect.getmembers(sys.modules[__name__]):
+    if inspect.isclass(obj) and issubclass(obj, FileType) and obj is not FileType:
+        classlist.append(obj)
+FileTypes = TypeVar("FileTypes", *classlist)
+
+
+class ExtractorFactory(BaseModel):
+    """
+    Extractor factory class.
+
+    Given an ``extractor=dict(filetype=k, ...)`` argument, attempts to determine the
+    correct :class:`FileType`, parses any additionally supplied parameters for that
+    :class:`FileType`, and back-fills defaults such as ``timezone``, ``locale``, and
+    ``encoding``.
+
+    The following is the current usage pattern in :mod:`yadg`:
+
+    .. code-block::
+
+        ftype = ExtractorFactory(extractor={"filetype": k}).extractor
+    """
+
+    extractor: FileTypes = Field(..., discriminator="filetype")
+
+    @field_validator("extractor")
+    @classmethod
+    def extractor_set_defaults(cls, v):
+        """Fill unset timezone, locale and encoding from :class:`StepDefaults`."""
+        defaults = StepDefaults()
+        if v.timezone is None:
+            v.timezone = defaults.timezone
+        if v.locale is None:
+            v.locale = defaults.locale
+        if v.encoding is None:
+            v.encoding = defaults.encoding
+        return v
diff --git a/src/dgbowl_schemas/yadg/dataschema_6_0/input.py b/src/dgbowl_schemas/yadg/dataschema_6_0/input.py
new file mode 100644
index 0000000..b5f9a5b
--- /dev/null
+++ b/src/dgbowl_schemas/yadg/dataschema_6_0/input.py
@@ -0,0 +1,45 @@
+from pydantic import BaseModel, Field
+from typing import Optional, Sequence, List
+import os
+
+
+class Input(BaseModel, extra="forbid", populate_by_name=True):
+    """Specification of input files/folders to be processed by the :class:`Step`."""
+
+    files: Sequence[str] = Field(alias="folders")
+    """Files, or folders to be searched for matching files."""
+
+    prefix: Optional[str] = None
+    """Prefix of the filenames to be matched."""
+
+    suffix: Optional[str] = None
+    """Suffix of the filenames to be matched."""
+
+    contains: Optional[str] = None
+    """A string the matched filenames must contain."""
+
+    exclude: Optional[str] = None
+    """A string the matched filenames must not contain."""
+
+    def paths(self) -> List[str]:
+        """Returns a list of files to be processed by the :class:`Step`."""
+        ret = []
+        for item in sorted(self.files):
+            if os.path.isdir(item):
+                paths = [os.path.join(item, fn) for fn in sorted(os.listdir(item))]
+            else:
+                paths = [item]
+            for path in paths:
+                tail = os.path.basename(path)
+                inc = True
+                if self.prefix is not None and not tail.startswith(self.prefix):
+                    inc = False
+                if self.suffix is not None and not tail.endswith(self.suffix):
+                    inc = False
+                if self.contains is not None and self.contains not in tail:
+                    inc = False
+                if self.exclude is not None and self.exclude in tail:
+                    inc = False
+                if inc:
+                    ret.append(path)
+        return ret
diff --git a/src/dgbowl_schemas/yadg/dataschema_6_0/parameters.py b/src/dgbowl_schemas/yadg/dataschema_6_0/parameters.py
new file mode 100644
index 0000000..b08d1f0
--- /dev/null
+++ b/src/dgbowl_schemas/yadg/dataschema_6_0/parameters.py
@@ -0,0 +1,13 @@
+from pydantic import BaseModel
+from typing import Union
+
+from .timestamp import Timestamp, TimeDate, UTS
+
+
+class Parameters(BaseModel, extra="forbid"):
+    """Empty parameters specification with no extras allowed."""
+
+    pass
+
+
+Timestamps = Union[Timestamp, TimeDate, UTS]
diff --git a/src/dgbowl_schemas/yadg/dataschema_6_0/step.py b/src/dgbowl_schemas/yadg/dataschema_6_0/step.py
new file mode 100644
index 0000000..bb56cb7
--- /dev/null
+++ b/src/dgbowl_schemas/yadg/dataschema_6_0/step.py
@@ -0,0 +1,14 @@
+from pydantic import BaseModel
+from typing import Optional
+from .externaldate import ExternalDate
+from .input import Input
+from .filetype import FileTypes
+
+
+class Step(BaseModel, extra="forbid"):
+    """A single step: an extractor, its input, and optional tag and external date."""
+
+    extractor: FileTypes
+    input: Input
+    tag: Optional[str] = None
+    externaldate: Optional[ExternalDate] = None
diff --git a/src/dgbowl_schemas/yadg/dataschema_6_0/stepdefaults.py b/src/dgbowl_schemas/yadg/dataschema_6_0/stepdefaults.py
new file mode 100644
index 0000000..a71cce8
--- /dev/null
+++ b/src/dgbowl_schemas/yadg/dataschema_6_0/stepdefaults.py
@@ -0,0 +1,55 @@
+from pydantic import BaseModel, Field, field_validator
+from typing import Optional
+import locale
+from babel import Locale, UnknownLocaleError
+import tzlocal
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class StepDefaults(BaseModel, extra="forbid"):
+    """Configuration of defaults applicable for all steps."""
+
+    timezone: str = Field("localtime", validate_default=True)
+    """Global timezone specification.
+
+    .. note::
+
+        This should be set to the timezone where the measurements have been
+        performed, as opposed to the timezone where :mod:`yadg` is being executed.
+        Otherwise timezone offsets may not be accounted for correctly.
+
+    """
+
+    locale: Optional[str] = Field(None, validate_default=True)
+    """Global locale specification. Will default to current locale."""
+
+    encoding: Optional[str] = "utf-8"
+    """Global filetype encoding. Will default to ``utf-8``."""
+
+    @field_validator("timezone")
+    @classmethod
+    def timezone_resolve_localtime(cls, v):
+        """Replace the ``localtime`` keyword with the name of the local timezone."""
+        if v == "localtime":
+            v = tzlocal.get_localzone_name()
+        return v
+
+    @field_validator("locale")
+    @classmethod
+    def locale_set_default(cls, v):
+        """Normalise the given locale, else use the system one, else ``en_GB``."""
+        if v is not None:
+            v = str(Locale.parse(v))
+        else:
+            for loc in (locale.getlocale(locale.LC_NUMERIC)[0], locale.getlocale()[0]):
+                try:
+                    v = str(Locale.parse(loc))
+                    break
+                except (TypeError, UnknownLocaleError, ValueError) as e:
+                    logger.debug("Could not process locale '%s': %s", loc, e)
+            else:
+                logger.debug("No valid locale string provided. Defaulting to 'en_GB'.")
+                v = "en_GB"
+        return v
diff --git a/src/dgbowl_schemas/yadg/dataschema_6_0/timestamp.py b/src/dgbowl_schemas/yadg/dataschema_6_0/timestamp.py
new file mode 100644
index 0000000..18c205e
--- /dev/null
+++ b/src/dgbowl_schemas/yadg/dataschema_6_0/timestamp.py
@@ -0,0 +1,28 @@
+from pydantic import BaseModel
+from typing import Optional
+
+
+class TimestampSpec(BaseModel, extra="forbid"):
+    """Specification of the column index and string format of the timestamp."""
+
+    index: Optional[int] = None
+    format: Optional[str] = None
+
+
+class Timestamp(BaseModel, extra="forbid"):
+    """Timestamp from a column containing a single timestamp string."""
+
+    timestamp: TimestampSpec
+
+
+class UTS(BaseModel, extra="forbid"):
+    """Timestamp from a column containing a Unix timestamp."""
+
+    uts: TimestampSpec
+
+
+class TimeDate(BaseModel, extra="forbid"):
+    """Timestamp from a separate date and/or time column."""
+
+    date: Optional[TimestampSpec] = None
+    time: Optional[TimestampSpec] = None
diff --git a/tests/test_dataschema.py b/tests/test_dataschema.py
index d1ec74f..c6fbdb7 100644
--- a/tests/test_dataschema.py
+++ b/tests/test_dataschema.py
@@ -56,6 +56,7 @@ def test_dataschema_metadata_json(inpath, success, datadir):
         ("ts14_basic_csv.json"),  # 5.1
         ("ts15_example.json"),  # 5.1
         ("ts16_locales.json"),  # 5.1
+        ("ts17_basic_csv.json"),  # 6.0
     ],
 )
 def test_dataschema_steps_json(inpath, datadir):
@@ -85,6 +86,7 @@ def test_dataschema_steps_json(inpath, datadir):
         ("err8_metadata.json"),  # 5.0
         ("err9_locale_step.json"),  # 5.1, check crash on wrong locale
         ("err10_locale_sd.json"),  # 5.1, check crash on wrong locale in stepdefaults
+        ("err11_deprecated.json"),  # 6.0, check crash with deprecated
     ],
 )
 def test_dataschema_err(inpath, datadir):
@@ -136,7 +138,7 @@ def test_dataschema_update_chain(inpath, datadir):
     ret = to_dataschema(**jsdata)
     while hasattr(ret, "update"):
         ret = ret.update()
-    assert ret.version == "5.1"
+    assert ret.version == "6.0"
 
 
 @pytest.mark.parametrize(
diff --git a/tests/test_dataschema/err11_deprecated.json b/tests/test_dataschema/err11_deprecated.json
new file mode 100644
index 0000000..55ff875
--- /dev/null
+++ b/tests/test_dataschema/err11_deprecated.json
@@ -0,0 +1,19 @@
+{
+    "input": {
+        "version": "6.0",
+        "metadata": {
+            "provenance": {"type": "manual"}
+        },
+        "step_defaults": {},
+        "steps": [
+            {
+                "tag": "deprecated",
+                "input": {"files": ["test"]},
+                "extractor": {
+                    "filetype": "biologic-mpr"
+                }
+            }
+        ]
+    },
+    "exception": "Input should be 'eclab.mpr'"
+}
\ No newline at end of file
diff --git a/tests/test_dataschema/ts17_basic_csv.json b/tests/test_dataschema/ts17_basic_csv.json
new file mode 100644
index 0000000..d975bc7
--- /dev/null
+++ b/tests/test_dataschema/ts17_basic_csv.json
@@ -0,0 +1,47 @@
+{
+    "input": {
+        "version": "6.0",
+        "metadata": {
+            "provenance": {"type": "manual"}
+        },
+        "step_defaults": {
+            "timezone": "UTC",
+            "locale": "en_GB.UTF-8"
+        },
+        "steps": [
+            {
+                "input": {"files": ["measurement.csv"]},
+                "extractor": {
+                    "filetype": "basic.csv",
+                    "locale": "de_DE.UTF-8",
+                    "timezone": "Europe/Zurich"
+                }
+            }
+        ]
+    },
+    "output": [
+        {
+            "input": {
+                "files": ["measurement.csv"],
+                "prefix": null,
+                "suffix": null,
+                "contains": null,
+                "exclude": null
+            },
+            "extractor": {
+                "filetype": "basic.csv",
+                "encoding": null,
+                "locale": "de_DE",
+                "timezone": "Europe/Zurich",
+                "parameters": {
+                    "sep": ",",
+                    "strip": null,
+                    "timestamp": null,
+                    "units": null
+                }
+            },
+            "tag": null,
+            "externaldate": null
+        }
+    ]
+}
\ No newline at end of file