From e37499133d31e0f5a161906afa1aab13f3842d3c Mon Sep 17 00:00:00 2001
From: Altay Sansal <altay.sansal@tgs.com>
Date: Thu, 29 Feb 2024 18:23:50 -0600
Subject: [PATCH] Fix bugs in SEG-Y file and settings handling and improve type
 stability

Revised the SEG-Y file handling mechanism to support extended text headers better. The binary header now mandated to return a DataFrame, and a NotImplemented error is raised when not using pandas for headers, ensuring superior type stability. In the ibm.py module, np.float32 and np.uint32 are enforced to ensure numeric consistency.
---
 src/segy/file.py | 51 +++++++++++++++++++++++++++++++-----------------
 src/segy/ibm.py  | 40 +++++++++++++++++++++++--------------
 2 files changed, 58 insertions(+), 33 deletions(-)

diff --git a/src/segy/file.py b/src/segy/file.py
index 04110ce..f9f075f 100644
--- a/src/segy/file.py
+++ b/src/segy/file.py
@@ -16,12 +16,12 @@
 from segy.schema import SegyStandard
 from segy.standards.registry import get_spec
 from segy.standards.rev1 import rev1_binary_file_header
+from segy.standards.rev1 import rev1_extended_text_header
 
 if TYPE_CHECKING:
     from typing import Any
 
     from fsspec import AbstractFileSystem
-    from numpy.typing import NDArray
 
     from segy.indexing import AbstractIndexer
     from segy.schema import SegyDescriptor
@@ -38,10 +38,9 @@ def __init__(
         url: str,
         spec: SegyDescriptor | None = None,
         settings: SegyFileSettings | None = None,
-        storage_options: dict[str, Any] = None,
+        storage_options: dict[str, Any] | None = None,
     ):
-        if settings is None:
-            self.settings = SegyFileSettings()
+        self.settings = SegyFileSettings() if settings is None else settings
 
         if storage_options is None:
             storage_options = {}
@@ -58,15 +57,16 @@ def from_spec(
         cls: type[SegyFile],
         url: str,
         spec: SegyDescriptor,
-        **kwargs: dict[str, Any],
+        storage_options: dict[str, Any] | None = None,
     ) -> SegyFile:
         """Open a SEG-Y file based on custom spec."""
-        return cls(url=url, spec=spec, **kwargs)
+        return cls(url=url, spec=spec, storage_options=storage_options)
 
     @property
     def file_size(self) -> int:
         """Return file size in bytes."""
-        return self._info["size"]
+        size: int = self._info["size"]
+        return size
 
     @property
     def num_traces(self) -> int:
@@ -75,21 +75,27 @@ def num_traces(self) -> int:
         file_bin_hdr_size = self.spec.binary_file_header.itemsize
         trace_size = self.spec.trace.itemsize
 
-        rev0_file = self.spec.segy_standard == SegyStandard.REV0
-        if self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE is None and not rev0_file:
+        num_ext_text = 0
+        # All but Rev0 can have extended text headers
+        if self.spec.segy_standard != SegyStandard.REV0:
             header_key = self.settings.BINARY.EXTENDED_TEXT_HEADER.KEY
 
             num_ext_text = 0
             if header_key in self.binary_header:
                 # Use df.at method to get a single value and not return a series
                 num_ext_text = self.binary_header.at[0, header_key]
-        else:
+
+        # Overriding from settings
+        elif self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE is not None:
             num_ext_text = self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE
 
         file_metadata_size = file_textual_hdr_size + file_bin_hdr_size
 
-        if num_ext_text is not None:
-            file_metadata_size += num_ext_text * self.spec.extended_text_header.itemsize
+        if num_ext_text > 0:
+            self.spec.extended_text_header = rev1_extended_text_header
+
+            ext_text_size = self.spec.extended_text_header.itemsize * int(num_ext_text)
+            file_metadata_size = file_metadata_size + ext_text_size
 
         return (self.file_size - file_metadata_size) // trace_size
 
@@ -127,7 +133,7 @@ def _infer_standard(self) -> SegyDescriptor:
         return get_spec(standard)
 
     @cached_property
-    def binary_header(self) -> NDArray[Any] | DataFrame:
+    def binary_header(self) -> DataFrame:
         """Read binary header from store, based on spec."""
         buffer = bytearray(
             self.fs.read_block(
@@ -145,7 +151,11 @@ def binary_header(self) -> NDArray[Any] | DataFrame:
         if self.settings.USE_PANDAS:
             return DataFrame(bin_hdr.reshape(1))
 
-        return bin_hdr.squeeze()
+        # The numpy array breaks downstream logic so for now
+        # turning it off and raising a not implemented error.
+        msg = "Not using pandas for headers not implemented yet."
+        raise NotImplementedError(msg)
+        # return bin_hdr.squeeze()
 
     def _parse_binary_header(self) -> None:
         """Parse the binary header and apply some rules."""
@@ -165,18 +175,23 @@ def _parse_binary_header(self) -> None:
         self.spec.trace.data_descriptor.samples = int(samples_per_trace)
         self.spec.trace.offset = text_hdr_size + bin_hdr_size
 
-        rev0_file = self.spec.segy_standard == SegyStandard.REV0
-        if self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE is None and not rev0_file:
+        num_ext_text = 0
+        # All but Rev0 can have extended text headers
+        if self.spec.segy_standard != SegyStandard.REV0:
             header_key = self.settings.BINARY.EXTENDED_TEXT_HEADER.KEY
 
             num_ext_text = 0
             if header_key in self.binary_header:
                 # Use df.at method to get a single value and not return a series
                 num_ext_text = self.binary_header.at[0, header_key]
-        else:
+
+        # Overriding from settings
+        elif self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE is not None:
             num_ext_text = self.settings.BINARY.EXTENDED_TEXT_HEADER.VALUE
 
-        if num_ext_text is not None:
+        if num_ext_text > 0:
+            self.spec.extended_text_header = rev1_extended_text_header
+
             ext_text_size = self.spec.extended_text_header.itemsize * int(num_ext_text)
             self.spec.trace.offset = self.spec.trace.offset + ext_text_size
 
diff --git a/src/segy/ibm.py b/src/segy/ibm.py
index 66749e7..71a2599 100644
--- a/src/segy/ibm.py
+++ b/src/segy/ibm.py
@@ -1,9 +1,17 @@
 """Low-level floating point conversion operations."""
+
+
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import numba as nb
 import numpy as np
 
+if TYPE_CHECKING:
+    from segy.typing import NDArrayFloat32
+    from segy.typing import NDArrayUint32
+
 # IEEE to IBM MASKS ETC
 IEEE32_SIGN = np.uint32(0x80000000)
 IEEE32_EXPONENT = np.int32(0x7F800000)
@@ -15,7 +23,7 @@
 IBM32_FRACTION = np.uint32(0xFFFFFF)
 
 
-@nb.njit(
+@nb.njit(  # type: ignore
     "uint32(float32)",
     cache=True,
     locals={
@@ -48,11 +56,11 @@ def ieee2ibm_single(ieee: np.float32) -> np.uint32:
     ieee = np.float32(ieee).view(np.uint32)
 
     if ieee in [0, 2147483648]:  # 0.0 or np.float32(-0.0).view('uint32')
-        return 0
+        return np.uint32(0)
 
     # Get IEEE's sign and exponent
-    sign = ieee & IEEE32_SIGN
-    exponent = ((ieee & IEEE32_EXPONENT) >> 23) - 127
+    sign = ieee & IEEE32_SIGN  # type: ignore
+    exponent = ((ieee & IEEE32_EXPONENT) >> 23) - 127  # type: ignore
     # The IBM 7-bit exponent is to the base 16 and the mantissa is presumed to
     # be entirely to the right of the radix point. In contrast, the IEEE
     # exponent is to the base 2 and there is an assumed 1-bit to the left of
@@ -73,12 +81,12 @@ def ieee2ibm_single(ieee: np.float32) -> np.uint32:
     # Add the implicit initial 1-bit to the 23-bit IEEE mantissa to get the
     # 24-bit IBM mantissa. Downshift it by the remainder from the exponent's
     # division by 4. It is allowed to have up to 3 leading 0s.
-    ibm_mantissa = ((ieee & IEEE32_FRACTION) | 0x800000) >> downshift
+    ibm_mantissa = ((ieee & IEEE32_FRACTION) | 0x800000) >> downshift  # type: ignore
 
-    return sign | exponent | ibm_mantissa
+    return sign | exponent | ibm_mantissa  # type: ignore
 
 
-@nb.njit(
+@nb.njit(  # type: ignore
     "float32(uint32)",
     cache=True,
     locals={
@@ -104,7 +112,7 @@ def ibm2ieee_single(ibm: np.uint32) -> np.float32:
         Value parsed to 32-bit IEEE Float in Little-Endian Format.
     """
     if ibm & IBM32_FRACTION == 0:
-        return 0.0
+        return np.float32(0)
 
     sign_bit = ibm >> 31
 
@@ -117,16 +125,18 @@ def ibm2ieee_single(ibm: np.uint32) -> np.float32:
     # This 16.0 (instead of just 16) is super important.
     # If the base is not a float, it won't work for negative
     # exponents, and fail silently and return zero.
-    return sign * mantissa * 16.0 ** (exponent - 64)
+    return sign * mantissa * 16.0 ** (exponent - 64)  # type: ignore
 
 
-@nb.vectorize("uint32(float32)", cache=True)
-def ieee2ibm(ieee_array: np.float32) -> np.uint32:  # pragma: no cover
+@nb.vectorize("uint32(float32)", cache=True)  # type: ignore
+def ieee2ibm(ieee_array: NDArrayFloat32) -> NDArrayUint32:  # pragma: no cover
     """Wrapper for vectorizing IEEE to IBM conversion to arrays."""
-    return ieee2ibm_single(ieee_array)
+    ibm: NDArrayUint32 = ieee2ibm_single(ieee_array)
+    return ibm
 
 
-@nb.vectorize("float32(uint32)", cache=True)
-def ibm2ieee(ibm_array: np.uint32) -> np.float32:  # pragma: no cover
+@nb.vectorize("float32(uint32)", cache=True)  # type: ignore
+def ibm2ieee(ibm_array: NDArrayUint32) -> NDArrayFloat32:  # pragma: no cover
     """Wrapper for vectorizing IBM to IEEE conversion to arrays."""
-    return ibm2ieee_single(ibm_array)
+    ieee: NDArrayFloat32 = ibm2ieee_single(ibm_array)
+    return ieee