Skip to content

Commit

Permalink
Fix bugs in SEG-Y file and settings handling and improve type stability
Browse files Browse the repository at this point in the history
Revised the SEG-Y file handling mechanism to support extended text headers better. The binary header now mandated to return a DataFrame, and a NotImplemented error is raised when not using pandas for headers, ensuring superior type stability. In the ibm.py module, np.float32 and np.uint32 are enforced to ensure numeric consistency.
  • Loading branch information
Altay Sansal committed Mar 1, 2024
1 parent 7b90153 commit e374991
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 33 deletions.
51 changes: 33 additions & 18 deletions src/segy/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@
from segy.schema import SegyStandard
from segy.standards.registry import get_spec
from segy.standards.rev1 import rev1_binary_file_header
from segy.standards.rev1 import rev1_extended_text_header

if TYPE_CHECKING:
from typing import Any

from fsspec import AbstractFileSystem
from numpy.typing import NDArray

from segy.indexing import AbstractIndexer
from segy.schema import SegyDescriptor
Expand All @@ -38,10 +38,9 @@ def __init__(
url: str,
spec: SegyDescriptor | None = None,
settings: SegyFileSettings | None = None,
storage_options: dict[str, Any] = None,
storage_options: dict[str, Any] | None = None,
):
if settings is None:
self.settings = SegyFileSettings()
self.settings = SegyFileSettings() if settings is None else settings

if storage_options is None:
storage_options = {}
Expand All @@ -58,15 +57,16 @@ def from_spec(
cls: type[SegyFile],
url: str,
spec: SegyDescriptor,
**kwargs: dict[str, Any],
storage_options: dict[str, Any] | None = None,
) -> SegyFile:
"""Open a SEG-Y file based on custom spec."""
return cls(url=url, spec=spec, **kwargs)
return cls(url=url, spec=spec, storage_options=storage_options)

@property
def file_size(self) -> int:
"""Return file size in bytes."""
return self._info["size"]
size: int = self._info["size"]
return size

@property
def num_traces(self) -> int:
Expand All @@ -75,21 +75,27 @@ def num_traces(self) -> int:
file_bin_hdr_size = self.spec.binary_file_header.itemsize
trace_size = self.spec.trace.itemsize

rev0_file = self.spec.segy_standard == SegyStandard.REV0
if self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE is None and not rev0_file:
num_ext_text = 0
# All but Rev0 can have extended text headers
if self.spec.segy_standard != SegyStandard.REV0:
header_key = self.settings.BINARY.EXTENDED_TEXT_HEADER.KEY

num_ext_text = 0
if header_key in self.binary_header:
# Use df.at method to get a single value and not return a series
num_ext_text = self.binary_header.at[0, header_key]
else:

# Overriding from settings
elif self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE is not None:
num_ext_text = self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE

file_metadata_size = file_textual_hdr_size + file_bin_hdr_size

if num_ext_text is not None:
file_metadata_size += num_ext_text * self.spec.extended_text_header.itemsize
if num_ext_text > 0:
self.spec.extended_text_header = rev1_extended_text_header

ext_text_size = self.spec.extended_text_header.itemsize * int(num_ext_text)
file_metadata_size = file_metadata_size + ext_text_size

return (self.file_size - file_metadata_size) // trace_size

Expand Down Expand Up @@ -127,7 +133,7 @@ def _infer_standard(self) -> SegyDescriptor:
return get_spec(standard)

@cached_property
def binary_header(self) -> NDArray[Any] | DataFrame:
def binary_header(self) -> DataFrame:
"""Read binary header from store, based on spec."""
buffer = bytearray(
self.fs.read_block(
Expand All @@ -145,7 +151,11 @@ def binary_header(self) -> NDArray[Any] | DataFrame:
if self.settings.USE_PANDAS:
return DataFrame(bin_hdr.reshape(1))

return bin_hdr.squeeze()
# The numpy array breaks downstream logic so for now
# turning it off and raising a not implemented error.
msg = "Not using pandas for headers not implemented yet."
raise NotImplementedError(msg)
# return bin_hdr.squeeze()

def _parse_binary_header(self) -> None:
"""Parse the binary header and apply some rules."""
Expand All @@ -165,18 +175,23 @@ def _parse_binary_header(self) -> None:
self.spec.trace.data_descriptor.samples = int(samples_per_trace)
self.spec.trace.offset = text_hdr_size + bin_hdr_size

rev0_file = self.spec.segy_standard == SegyStandard.REV0
if self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE is None and not rev0_file:
num_ext_text = 0
# All but Rev0 can have extended text headers
if self.spec.segy_standard != SegyStandard.REV0:
header_key = self.settings.BINARY.EXTENDED_TEXT_HEADER.KEY

num_ext_text = 0
if header_key in self.binary_header:
# Use df.at method to get a single value and not return a series
num_ext_text = self.binary_header.at[0, header_key]
else:

# Overriding from settings
elif self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE is not None:
num_ext_text = self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE

if num_ext_text is not None:
if num_ext_text > 0:
self.spec.extended_text_header = rev1_extended_text_header

ext_text_size = self.spec.extended_text_header.itemsize * int(num_ext_text)
self.spec.trace.offset = self.spec.trace.offset + ext_text_size

Expand Down
40 changes: 25 additions & 15 deletions src/segy/ibm.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
"""Low-level floating point conversion operations."""


from __future__ import annotations

from typing import TYPE_CHECKING

import numba as nb
import numpy as np

if TYPE_CHECKING:
from segy.typing import NDArrayFloat32
from segy.typing import NDArrayUint32

# IEEE to IBM MASKS ETC
IEEE32_SIGN = np.uint32(0x80000000)
IEEE32_EXPONENT = np.int32(0x7F800000)
Expand All @@ -15,7 +23,7 @@
IBM32_FRACTION = np.uint32(0xFFFFFF)


@nb.njit(
@nb.njit( # type: ignore
"uint32(float32)",
cache=True,
locals={
Expand Down Expand Up @@ -48,11 +56,11 @@ def ieee2ibm_single(ieee: np.float32) -> np.uint32:
ieee = np.float32(ieee).view(np.uint32)

if ieee in [0, 2147483648]: # 0.0 or np.float32(-0.0).view('uint32')
return 0
return np.uint32(0)

# Get IEEE's sign and exponent
sign = ieee & IEEE32_SIGN
exponent = ((ieee & IEEE32_EXPONENT) >> 23) - 127
sign = ieee & IEEE32_SIGN # type: ignore
exponent = ((ieee & IEEE32_EXPONENT) >> 23) - 127 # type: ignore
# The IBM 7-bit exponent is to the base 16 and the mantissa is presumed to
# be entirely to the right of the radix point. In contrast, the IEEE
# exponent is to the base 2 and there is an assumed 1-bit to the left of
Expand All @@ -73,12 +81,12 @@ def ieee2ibm_single(ieee: np.float32) -> np.uint32:
# Add the implicit initial 1-bit to the 23-bit IEEE mantissa to get the
# 24-bit IBM mantissa. Downshift it by the remainder from the exponent's
# division by 4. It is allowed to have up to 3 leading 0s.
ibm_mantissa = ((ieee & IEEE32_FRACTION) | 0x800000) >> downshift
ibm_mantissa = ((ieee & IEEE32_FRACTION) | 0x800000) >> downshift # type: ignore

return sign | exponent | ibm_mantissa
return sign | exponent | ibm_mantissa # type: ignore


@nb.njit(
@nb.njit( # type: ignore
"float32(uint32)",
cache=True,
locals={
Expand All @@ -104,7 +112,7 @@ def ibm2ieee_single(ibm: np.uint32) -> np.float32:
Value parsed to 32-bit IEEE Float in Little-Endian Format.
"""
if ibm & IBM32_FRACTION == 0:
return 0.0
return np.float32(0)

sign_bit = ibm >> 31

Expand All @@ -117,16 +125,18 @@ def ibm2ieee_single(ibm: np.uint32) -> np.float32:
# This 16.0 (instead of just 16) is super important.
# If the base is not a float, it won't work for negative
# exponents, and fail silently and return zero.
return sign * mantissa * 16.0 ** (exponent - 64)
return sign * mantissa * 16.0 ** (exponent - 64) # type: ignore


@nb.vectorize("uint32(float32)", cache=True)
def ieee2ibm(ieee_array: np.float32) -> np.uint32: # pragma: no cover
@nb.vectorize("uint32(float32)", cache=True) # type: ignore
def ieee2ibm(ieee_array: NDArrayFloat32) -> NDArrayUint32: # pragma: no cover
"""Wrapper for vectorizing IEEE to IBM conversion to arrays."""
return ieee2ibm_single(ieee_array)
ibm: NDArrayUint32 = ieee2ibm_single(ieee_array)
return ibm


@nb.vectorize("float32(uint32)", cache=True)
def ibm2ieee(ibm_array: np.uint32) -> np.float32: # pragma: no cover
@nb.vectorize("float32(uint32)", cache=True) # type: ignore
def ibm2ieee(ibm_array: NDArrayUint32) -> NDArrayFloat32: # pragma: no cover
"""Wrapper for vectorizing IBM to IEEE conversion to arrays."""
return ibm2ieee_single(ibm_array)
ieee: NDArrayFloat32 = ibm2ieee_single(ibm_array)
return ieee

0 comments on commit e374991

Please sign in to comment.