From e37499133d31e0f5a161906afa1aab13f3842d3c Mon Sep 17 00:00:00 2001 From: Altay Sansal Date: Thu, 29 Feb 2024 18:23:50 -0600 Subject: [PATCH] Fix bugs in SEG-Y file and settings handling and improve type stability Revised the SEG-Y file handling mechanism to support extended text headers better. The binary header now mandated to return a DataFrame, and a NotImplemented error is raised when not using pandas for headers, ensuring superior type stability. In the ibm.py module, np.float32 and np.uint32 are enforced to ensure numeric consistency. --- src/segy/file.py | 51 +++++++++++++++++++++++++++++++----------------- src/segy/ibm.py | 40 +++++++++++++++++++++++-------------- 2 files changed, 58 insertions(+), 33 deletions(-) diff --git a/src/segy/file.py b/src/segy/file.py index 04110ce..f9f075f 100644 --- a/src/segy/file.py +++ b/src/segy/file.py @@ -16,12 +16,12 @@ from segy.schema import SegyStandard from segy.standards.registry import get_spec from segy.standards.rev1 import rev1_binary_file_header +from segy.standards.rev1 import rev1_extended_text_header if TYPE_CHECKING: from typing import Any from fsspec import AbstractFileSystem - from numpy.typing import NDArray from segy.indexing import AbstractIndexer from segy.schema import SegyDescriptor @@ -38,10 +38,9 @@ def __init__( url: str, spec: SegyDescriptor | None = None, settings: SegyFileSettings | None = None, - storage_options: dict[str, Any] = None, + storage_options: dict[str, Any] | None = None, ): - if settings is None: - self.settings = SegyFileSettings() + self.settings = SegyFileSettings() if settings is None else settings if storage_options is None: storage_options = {} @@ -58,15 +57,16 @@ def from_spec( cls: type[SegyFile], url: str, spec: SegyDescriptor, - **kwargs: dict[str, Any], + storage_options: dict[str, Any] | None = None, ) -> SegyFile: """Open a SEG-Y file based on custom spec.""" - return cls(url=url, spec=spec, **kwargs) + return cls(url=url, spec=spec, storage_options=storage_options) @property def file_size(self) -> int: """Return file size in bytes.""" - return self._info["size"] + size: int = self._info["size"] + return size @property def num_traces(self) -> int: @@ -75,21 +75,27 @@ def num_traces(self) -> int: file_bin_hdr_size = self.spec.binary_file_header.itemsize trace_size = self.spec.trace.itemsize - rev0_file = self.spec.segy_standard == SegyStandard.REV0 - if self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE is None and not rev0_file: + num_ext_text = 0 + # All but Rev0 can have extended text headers + if self.spec.segy_standard != SegyStandard.REV0: header_key = self.settings.BINARY.EXTENDED_TEXT_HEADER.KEY num_ext_text = 0 if header_key in self.binary_header: # Use df.at method to get a single value and not return a series num_ext_text = self.binary_header.at[0, header_key] - else: + + # Overriding from settings + elif self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE is not None: num_ext_text = self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE file_metadata_size = file_textual_hdr_size + file_bin_hdr_size - if num_ext_text is not None: - file_metadata_size += num_ext_text * self.spec.extended_text_header.itemsize + if num_ext_text > 0: + self.spec.extended_text_header = rev1_extended_text_header + + ext_text_size = self.spec.extended_text_header.itemsize * int(num_ext_text) + file_metadata_size = file_metadata_size + ext_text_size return (self.file_size - file_metadata_size) // trace_size @@ -127,7 +133,7 @@ def _infer_standard(self) -> SegyDescriptor: return get_spec(standard) @cached_property - def binary_header(self) -> NDArray[Any] | DataFrame: + def binary_header(self) -> DataFrame: """Read binary header from store, based on spec.""" buffer = bytearray( self.fs.read_block( @@ -145,7 +151,11 @@ def binary_header(self) -> NDArray[Any] | DataFrame: if self.settings.USE_PANDAS: return DataFrame(bin_hdr.reshape(1)) - return bin_hdr.squeeze() + # The numpy array breaks downstream logic so for now + # turning it off and raising a not implemented error. + msg = "Not using pandas for headers not implemented yet." + raise NotImplementedError(msg) + # return bin_hdr.squeeze() def _parse_binary_header(self) -> None: """Parse the binary header and apply some rules.""" @@ -165,18 +175,23 @@ def _parse_binary_header(self) -> None: self.spec.trace.data_descriptor.samples = int(samples_per_trace) self.spec.trace.offset = text_hdr_size + bin_hdr_size - rev0_file = self.spec.segy_standard == SegyStandard.REV0 - if self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE is None and not rev0_file: + num_ext_text = 0 + # All but Rev0 can have extended text headers + if self.spec.segy_standard != SegyStandard.REV0: header_key = self.settings.BINARY.EXTENDED_TEXT_HEADER.KEY num_ext_text = 0 if header_key in self.binary_header: # Use df.at method to get a single value and not return a series num_ext_text = self.binary_header.at[0, header_key] - else: + + # Overriding from settings + elif self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE is not None: num_ext_text = self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE - if num_ext_text is not None: + if num_ext_text > 0: + self.spec.extended_text_header = rev1_extended_text_header + ext_text_size = self.spec.extended_text_header.itemsize * int(num_ext_text) self.spec.trace.offset = self.spec.trace.offset + ext_text_size diff --git a/src/segy/ibm.py b/src/segy/ibm.py index 66749e7..71a2599 100644 --- a/src/segy/ibm.py +++ b/src/segy/ibm.py @@ -1,9 +1,17 @@ """Low-level floating point conversion operations.""" + + from __future__ import annotations +from typing import TYPE_CHECKING + import numba as nb import numpy as np +if TYPE_CHECKING: + from segy.typing import NDArrayFloat32 + from segy.typing import NDArrayUint32 + # IEEE to IBM MASKS ETC IEEE32_SIGN = np.uint32(0x80000000) IEEE32_EXPONENT = np.int32(0x7F800000) @@ -15,7 +23,7 @@ IBM32_FRACTION = np.uint32(0xFFFFFF) -@nb.njit( +@nb.njit( # type: ignore "uint32(float32)", cache=True, locals={ @@ -48,11 +56,11 @@ def ieee2ibm_single(ieee: np.float32) -> np.uint32: ieee = np.float32(ieee).view(np.uint32) if ieee in [0, 2147483648]: # 0.0 or np.float32(-0.0).view('uint32') - return 0 + return np.uint32(0) # Get IEEE's sign and exponent - sign = ieee & IEEE32_SIGN - exponent = ((ieee & IEEE32_EXPONENT) >> 23) - 127 + sign = ieee & IEEE32_SIGN # type: ignore + exponent = ((ieee & IEEE32_EXPONENT) >> 23) - 127 # type: ignore # The IBM 7-bit exponent is to the base 16 and the mantissa is presumed to # be entirely to the right of the radix point. In contrast, the IEEE # exponent is to the base 2 and there is an assumed 1-bit to the left of @@ -73,12 +81,12 @@ def ieee2ibm_single(ieee: np.float32) -> np.uint32: # Add the implicit initial 1-bit to the 23-bit IEEE mantissa to get the # 24-bit IBM mantissa. Downshift it by the remainder from the exponent's # division by 4. It is allowed to have up to 3 leading 0s. - ibm_mantissa = ((ieee & IEEE32_FRACTION) | 0x800000) >> downshift + ibm_mantissa = ((ieee & IEEE32_FRACTION) | 0x800000) >> downshift # type: ignore - return sign | exponent | ibm_mantissa + return sign | exponent | ibm_mantissa # type: ignore -@nb.njit( +@nb.njit( # type: ignore "float32(uint32)", cache=True, locals={ @@ -104,7 +112,7 @@ def ibm2ieee_single(ibm: np.uint32) -> np.float32: Value parsed to 32-bit IEEE Float in Little-Endian Format. """ if ibm & IBM32_FRACTION == 0: - return 0.0 + return np.float32(0) sign_bit = ibm >> 31 @@ -117,16 +125,18 @@ def ibm2ieee_single(ibm: np.uint32) -> np.float32: # This 16.0 (instead of just 16) is super important. # If the base is not a float, it won't work for negative # exponents, and fail silently and return zero. - return sign * mantissa * 16.0 ** (exponent - 64) + return sign * mantissa * 16.0 ** (exponent - 64) # type: ignore -@nb.vectorize("uint32(float32)", cache=True) -def ieee2ibm(ieee_array: np.float32) -> np.uint32: # pragma: no cover +@nb.vectorize("uint32(float32)", cache=True) # type: ignore +def ieee2ibm(ieee_array: NDArrayFloat32) -> NDArrayUint32: # pragma: no cover """Wrapper for vectorizing IEEE to IBM conversion to arrays.""" - return ieee2ibm_single(ieee_array) + ibm: NDArrayUint32 = ieee2ibm_single(ieee_array) + return ibm -@nb.vectorize("float32(uint32)", cache=True) -def ibm2ieee(ibm_array: np.uint32) -> np.float32: # pragma: no cover +@nb.vectorize("float32(uint32)", cache=True) # type: ignore +def ibm2ieee(ibm_array: NDArrayUint32) -> NDArrayFloat32: # pragma: no cover """Wrapper for vectorizing IBM to IEEE conversion to arrays.""" - return ibm2ieee_single(ibm_array) + ieee: NDArrayFloat32 = ibm2ieee_single(ibm_array) + return ieee