Skip to content

Commit

Permalink
feat: use tablib for data parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
MyPyDavid committed Nov 25, 2023
1 parent 092fe98 commit 744082f
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 43 deletions.
39 changes: 35 additions & 4 deletions src/raman_fitting/indexing/file_parsers.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,48 @@
import logging
from pathlib import Path
import re

from warnings import warn
from typing import List
from typing import List, Sequence, Type, Optional

import numpy as np
import pandas as pd
from tablib import Dataset

logger = logging.getLogger(__name__)


def filter_data_for_numeric(data: Dataset) -> Dataset:
    """Return a new Dataset keeping only the rows of *data* that are fully numeric.

    Each cell of a row is converted with ``float``; any row containing a
    value that cannot be converted (non-numeric text such as header
    remnants, or ``None`` for a missing cell) is silently dropped.

    Parameters
    ----------
    data : Dataset
        The tablib Dataset to filter; its headers are copied unchanged.

    Returns
    -------
    Dataset
        A new Dataset with the same headers and only the numeric rows,
        each stored as a tuple of floats.
    """
    filtered_data = Dataset()
    filtered_data.headers = data.headers

    for row in data:
        try:
            # float() raises ValueError for non-numeric strings and
            # TypeError for None cells — both mark a non-data row.
            numeric_row = tuple(float(cell) for cell in row)
        except (ValueError, TypeError):
            continue
        # No None-check needed: float() can never return None, so a row
        # that converts successfully is fully numeric.
        filtered_data.append(numeric_row)
    return filtered_data


def read_file_with_tablib(filepath, headers_keys: Sequence[str], **kwargs):
    """Load a tabular data file via tablib and return its numeric rows as a DataFrame.

    Parameters
    ----------
    filepath : path-like
        File to read; the format is auto-detected by tablib on load.
    headers_keys : Sequence[str]
        Expected column names. If the headers detected in the file do not
        match this set, the file is re-read as headerless data and these
        names are assigned as the headers.
    **kwargs
        Accepted for parser-registry compatibility; currently unused.

    Returns
    -------
    pandas.DataFrame
        Only the fully numeric rows (see ``filter_data_for_numeric``),
        exported through tablib's "df" format.
    """
    with open(filepath, "r") as fh:
        imported_data = Dataset().load(fh)

    # BUG FIX: the original compared `set(headers) not in set(headers_keys)`,
    # which attempts to hash an (unhashable) set and raises TypeError at
    # runtime. The intent is set *inequality* between detected and expected
    # headers. Also guard against `headers` being None for headerless files.
    detected_headers = imported_data.headers or []
    if headers_keys and set(detected_headers) != set(headers_keys):
        # Headers don't match expectations: re-load treating every row as
        # data, then impose the expected header names.
        with open(filepath, "r") as fh:
            imported_data = Dataset().load(fh, headers=False)
        imported_data.headers = headers_keys

    numeric_data = filter_data_for_numeric(imported_data)
    data_df = numeric_data.export("df")
    return data_df


def read_text(filepath, max_bytes=10**6, encoding="utf-8", errors=None):
"""additional read text method for raw text data inspection"""
_text = "read_text_method"
Expand Down Expand Up @@ -54,9 +87,7 @@ def cast_array_into_spectrum_frame(array, keys: List[str] = None) -> pd.DataFram
)

try:
spectrum_data = pd.DataFrame(
array, columns=keys
)
spectrum_data = pd.DataFrame(array, columns=keys)
return spectrum_data
except Exception as exc:
_msg = f"Can not create DataFrame from array object: {array}\n{exc}"
Expand Down
80 changes: 41 additions & 39 deletions src/raman_fitting/indexing/filedata_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,40 +10,47 @@
import re

from warnings import warn
from typing import List
from typing import List, Sequence

import numpy as np
import pandas as pd

from .file_parsers import load_spectrum_from_txt
from .file_parsers import read_file_with_tablib
from .validators import ValidateSpectrumValues

logger = logging.getLogger(__name__)


SPECTRUM_FILETYPE_PARSERS = {
".txt": {
"method": load_spectrum_from_txt,
"kwargs": {
"usecols": (0, 1),
"keys": ("ramanshift", "intensity"),
},
},
".xlsx": {
"method": pd.read_excel,
"kwargs": {},
},
".csv": {
"method": pd.read_csv,
"kwargs": {},
".txt": {
"method": read_file_with_tablib, # load_spectrum_from_txt,
"_kwargs": {
"usecols": (0, 1),
"keys": ("ramanshift", "intensity"),
},
},
".xlsx": {
"method": read_file_with_tablib, # pd.read_excel,
"kwargs": {},
},
".csv": {
"method": read_file_with_tablib, # pd.read_csv,
"kwargs": {},
},
".json": {
"method": read_file_with_tablib,
},
}

supported_filetypes = [".txt"]
spectrum_data_keys = ("ramanshift", "intensity")

ramanshift_expected_values = ValidateSpectrumValues(spectrum_key="ramanshift", min=-95, max=3600, len=1600)
intensity_expected_values = ValidateSpectrumValues(spectrum_key="intensity", min=0, max=1e4, len=1600)
ramanshift_expected_values = ValidateSpectrumValues(
spectrum_key="ramanshift", min=-95, max=3600, len=1600
)
intensity_expected_values = ValidateSpectrumValues(
spectrum_key="intensity", min=0, max=1e4, len=1600
)


@dataclass
Expand All @@ -55,15 +62,17 @@ class SpectrumReader:
Double checks the values
Sets a hash attribute afterwards
"""

filepath: Path | str
max_bytesize: int = 10**6
spectrum_data_keys: tuple = ("ramanshift", "intensity")
spectrum_keys_expected_values: List[ValidateSpectrumValues] = field(default_factory=list)
spectrum_keys_expected_values: List[ValidateSpectrumValues] = field(
default_factory=list
)
spectrum: pd.DataFrame = field(default_factory=pd.DataFrame)
spectrum_hash: str = ""
spectrum_length: int = 0


def __post_init__(self, **kwargs):
super().__init__()

Expand All @@ -85,9 +94,14 @@ def __post_init__(self, **kwargs):
logger.warning(f"File too large ({filesize})=> skipped")
return

self.spectrum = self.spectrum_parser(self.filepath)
self.spectrum = self.spectrum_parser(self.filepath, self.spectrum_data_keys)
for spectrum_key in self.spectrum.columns:
validators = list(filter(lambda x: x.spectrum_key == spectrum_key, self.spectrum_keys_expected_values))
validators = list(
filter(
lambda x: x.spectrum_key == spectrum_key,
self.spectrum_keys_expected_values,
)
)
for validator in validators:
self.validate_spectrum_keys_expected_values(self.spectrum, validator)

Expand All @@ -102,7 +116,7 @@ def __post_init__(self, **kwargs):
for key in self.spectrum_data_keys:
setattr(self, key, self.spectrum[key].to_numpy())

def spectrum_parser(self, filepath: Path):
def spectrum_parser(self, filepath: Path, header_keys: Sequence[str]):
"""
Reads data from a file and converts into pd.DataFrame object
Expand All @@ -116,21 +130,12 @@ def spectrum_parser(self, filepath: Path):
pd.DataFrame
Contains the data of the spectrum in a DataFrame with the selected spectrum keys as columns
"""

spectrum_data = pd.DataFrame()

suffix = filepath.suffix

if suffix not in SPECTRUM_FILETYPE_PARSERS:
raise ValueError(f"Filetype {suffix} not supported")

parser = SPECTRUM_FILETYPE_PARSERS[suffix]["method"]
kwargs = SPECTRUM_FILETYPE_PARSERS[suffix]["kwargs"]
spectrum_data = parser(filepath, **kwargs)

kwargs = SPECTRUM_FILETYPE_PARSERS[suffix].get("kwargs", {})
spectrum_data = parser(filepath, header_keys, **kwargs)
return spectrum_data


def validate_spectrum_keys_expected_values(
self, spectrum_data: pd.DataFrame, expected_values: ValidateSpectrumValues
):
Expand All @@ -142,15 +147,13 @@ def validate_spectrum_keys_expected_values(
logger.error("Spectrum data is empty")
return

validation = expected_values.validate_spectrum(spectrum_data)
validation = expected_values.validate(spectrum_data)

if not validation:
logger.warning(
f"The {expected_values.spectrum_key} of this spectrum does not match the expected values {expected_values}"
)



def sort_spectrum(
self, spectrum: pd.DataFrame, sort_by="ramanshift", ignore_index=True
):
Expand All @@ -177,4 +180,3 @@ def quickplot(self):
self.spectrum.plot(x="ramanshift", y="intensity")
except TypeError:
logger.warning("No numeric data to plot")

0 comments on commit 744082f

Please sign in to comment.