StreetEasy · Casyfill · Aug 13, 2024 · Aug 12, 2024 · Aug 13, 2024 · Aug 13, 2024
diff --git a/.github/workflows/pre-release.yml b/.github/workflows/pre-release.yml
@@ -11,8 +11,8 @@ jobs:
 
     strategy:
       matrix:
-        python-version: [3.7, 3.8, 3.9]
-        poetry-version: [1.2]
+        python-version: [3.8, 3.9, "3.10", 3.11]
+        poetry-version: [1.8]
         os: [ubuntu-latest, macos-latest, windows-latest]
     runs-on: ${{ matrix.os }}
 
@@ -72,14 +72,14 @@ jobs:
         with:
           submodules: recursive
 
-      - name: Set up Python 3.7
+      - name: Set up Python 3.8
         uses: actions/setup-python@v2
         with:
-           python-version: 3.7
-      - name: Set up Poetry 1.2
+           python-version: 3.8
+      - name: Set up Poetry 1.8
         uses: abatilo/[email protected]
         with:
-          poetry-version: 1.2
+          poetry-version: 1.8
 
       - uses: actions/download-artifact@v2
         with:

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -11,8 +11,8 @@ jobs:
     if: "!github.event.release.prerelease"
     strategy:
       matrix:
-        python-version: [3.7, 3.8, 3.9]
-        poetry-version: [1.2]
+        python-version: [3.8, 3.9, "3.10", 3.11]
+        poetry-version: [1.8]
         os: [ubuntu-latest, macos-latest, windows-latest]
     runs-on: ${{ matrix.os }}
 
@@ -72,15 +72,15 @@ jobs:
         with:
           submodules: recursive
 
-      - name: Set up Python 3.7
+      - name: Set up Python 3.8
         uses: actions/setup-python@v2
         with:
-           python-version: 3.7
+           python-version: 3.8
 
-      - name: Set up Poetry 1.2
+      - name: Set up Poetry 1.8
         uses: abatilo/[email protected]
         with:
-          poetry-version: 1.2
+          poetry-version: 1.8
 
       - name: Set up cache
         uses: actions/cache@v1

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -16,8 +16,8 @@ jobs:
     runs-on: macos-latest
     strategy:
       matrix:
-        python-version: [3.7, 3.8, 3.9]
-        poetry-version: [1.2]
+        python-version: [3.8, 3.9, "3.10", 3.11]
+        poetry-version: [1.8]
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:

diff --git a/.github/workflows/typer_build_docs.yaml-deactivated-for-now b/.github/workflows/typer_build_docs.yaml-deactivated-for-now
@@ -23,7 +23,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: "3.7"
+          python-version: "3.8"
       # Allow debugging with tmate
       - name: Setup tmate session
         uses: mxschmitt/action-tmate@v3

diff --git a/changelog.md b/changelog.md
@@ -1,20 +1,31 @@
 # Changelog
 
-v0.0.11:
+### v0.0.12
+
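+Fixed:
+- `na_pct_below` can now be set by its field name (as well as the legacy `na_limit` alias)
+- `from_df` now includes metadata
+
+Changed:
+- bumped minimum Python version to 3.8
+- Pydantic bumped to v2 (models use the `pydantic.v1` compatibility API)
+- Allows use of Pandas v2
+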
+### v0.0.11:
 - Version in metadata
   - adds `dfschema` and `pandas` version in metadata upon generation (Later will worn if Schema is initialized from json, generated by later version)
 - Renamed `na_limit` to `na_pct_below` to make it unambiguous (with backward support)
 - Added `optional=True` flag for columns. If true, does not raise exception if column is not present
 - added `dfschema update {existing_schema} {output_schema}` command to upgrade schemas
 
-v0.0.10:
+### v0.0.10:
 - relaxed Pydantic requirement to `>=1.9`
 
-v0.0.9:
+### v0.0.9:
 - Pydantic bumped to `1.10`
 - Bug Fix: Categorical constraints (`exact_set`, `oneof`, `include`) now can keeo `int` and `float` values. That expands to legacy schemas as well.
 
-v0.0.8:
+### v0.0.8:
 Legacy Schema Aliases (support for legacy schemas):
 - `min_value` now also supports `min` alias
 - `max_value` now also supports `max` alias
@@ -28,25 +39,27 @@ Testing:
 - pre-commit setup was updated
 
 
-v0.0.7:
+### v0.0.7:
 - rename `DfSchema.validate_df` to `DfSchema.validate` (UNDONE: `validate` is reserved by Pydantic object)
 - updated documentation
 
-v0.0.6:
+### v0.0.6:
     - `DfSchema.to_file`, `DfSchema.from_file` proper testing
     - CLI command help texts
     - added pre-commit install to the repo
     - Some benchmarking
     - renamed `dfs.validate_df` to `dfs.validate`
 
-v0.0.5: fix column dtype generation/validation bug
+### v0.0.5
+- fix column dtype generation/validation bug
 
 ## Pre-Publication
-v1.3.0
+
+### v1.3.0
 - renamed strict_column_set to additionalColumns
 - renamed strict_column_order to exactColumnOrder
 
-v1.2.0
+### v1.2.0
 - Metadata SubObject
 - Summary Exception is now collected for specific DfSchema, not via Borg State
 - Supports SubSets

diff --git a/dfschema/__init__.py b/dfschema/__init__.py
@@ -7,7 +7,7 @@
     DataFrameSummaryError,
 )
 
-__version__ = "0.0.11"
+__version__ = "0.0.12"
 
 __all__ = [
     "validate",

diff --git a/dfschema/core/column.py b/dfschema/core/column.py
@@ -4,7 +4,7 @@
 from warnings import warn
 
 import pandas as pd
-from pydantic import BaseModel, Extra, Field  # , validator
+from pydantic.v1 import BaseModel, Extra, Field  # , validator
 
 from .dtype import DtypeAliasPool, DtypeLiteral
 from .exceptions import DataFrameSchemaError, DataFrameValidationError
@@ -52,6 +52,21 @@ def _validate_column_presence(
             raise DataFrameValidationError(text)
 
 
+def _is_string(series: pd.Series, strict: bool = False) -> bool:
+    """Check if series is string-like: string dtype, or every value is null.
+    NOTE: Pandas 2 does not accept object dtype as string; as a workaround,
+    any object-dtype series is also accepted unless `strict` is True.
+    TODO: explicitly check for object dtype and raise warning
+    """
+    result = pd.api.types.is_string_dtype(series) or pd.isnull(series).all()
+    if strict:
+        return result
+
+    object_like = pd.api.types.is_object_dtype(series)
+
+    return result or object_like
+
+
 class ValueLimits(BaseModel):  # type: ignore
     min: Union[float, date, datetime, str, None] = None
     max: Union[float, date, datetime, str, None] = None
@@ -212,6 +227,7 @@ class ColSchema(BaseModel):
         lt=1.0,
         description="limit of missing values. If set to true, will raise if all values are empty. If set to a number, will raise if more than given perecnt of values are empty (Nan)",
         alias="na_limit",
+        alias_priority=2,
     )
 
     value_limits: Optional[ValueLimits] = Field(
@@ -234,6 +250,7 @@ class ColSchema(BaseModel):
     class Config:
         extra = Extra.forbid
         use_enum_values = True
+        allow_population_by_field_name = True
 
     def _map_dtype(
         self, dtype: Optional[str] = None, raise_error: bool = True
@@ -252,7 +269,7 @@ def _map_dtype(
         "numeric": pd.api.types.is_numeric_dtype,
         "int": pd.api.types.is_integer_dtype,
         "float": pd.api.types.is_float_dtype,
-        "string": lambda s: (pd.api.types.is_string_dtype(s) or pd.isnull(s).all()),
+        "string": _is_string,
         "timedelta64[ns]": pd.api.types.is_timedelta64_dtype,
     }
 

diff --git a/dfschema/core/core.py b/dfschema/core/core.py
@@ -1,10 +1,10 @@
 from typing import Callable, Optional, Union, List
 import json
 from pathlib import Path
-
+from datetime import datetime
 
 import pandas as pd
-from pydantic import BaseModel, Extra, Field, PrivateAttr
+from pydantic.v1 import BaseModel, Extra, Field, PrivateAttr
 
 from .column import ColSchema, _validate_column_presence
 from .exceptions import DataFrameSchemaError, DataFrameSummaryError, SubsetSummaryError
@@ -30,7 +30,7 @@ class Config:
         arbitrary_types_allowed = True
 
     metadata: Optional[MetaData] = Field(
-        MetaData(),
+        default_factory=MetaData,  # fresh MetaData instance per schema; a bare `MetaData` would make the class itself the default
         description="optional metadata, including version and protocol version",
     )
 
@@ -197,7 +197,7 @@ def from_file(cls, path: Union[str, Path]) -> "DfSchema":
                 )
             return cls.from_dict(schema)
         except Exception as e:
-            raise DataFrameSchemaError(f"Error loading schema from file {path}") from e
+            raise DataFrameSchemaError(f"Error loading schema from file {path}: {e}")
 
     def to_file(self, path: Union[str, Path]) -> None:
         """write chema to file
@@ -233,7 +233,7 @@ def to_file(self, path: Union[str, Path]) -> None:
                 )
 
         except Exception as e:
-            raise DataFrameSchemaError(f"Error wriging schema to file {path}") from e
+            raise DataFrameSchemaError(f"Error wriging schema to file {path}: {e}")
 
     @classmethod
     def from_dict(
@@ -282,10 +282,14 @@ def from_df(
         """
 
         schema = generate_schema_dict_from_df(df)
+        schema["metadata"] = {
+            "protocol_version": 2.0,
+            "version": datetime.now().isoformat(),
+        }
         subset_schemas = []
         if subset_predicates:
             for predicate in subset_predicates:
-                filtered = SubsetSchema.filter_df(df, predicate)
+                filtered = SubsetSchema._filter(df, predicate)
 
                 subset_schema = generate_schema_dict_from_df(filtered)
                 subset_schema["predicate"] = predicate

diff --git a/dfschema/core/dtype.py b/dfschema/core/dtype.py
@@ -3,6 +3,7 @@
 mostly based on pandas.core.dtypes.dtypes.
 https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes
 """
+
 import sys
 
 if sys.version_info >= (3, 8):

diff --git a/dfschema/core/legacy/__init__.py b/dfschema/core/legacy/__init__.py
@@ -9,4 +9,4 @@ def infer_protocol_version(d: dict) -> float:
     if "metadata" not in d or "protocol_version" not in d["metadata"]:
         logging.info("Missing `protocol_version` in metadata. Assuming PV=1.0")
 
-    return d.get("metadata", {}).get("protocol_version", 1.0)
+    return d.get("metadata", dict()).get("protocol_version", 1.0)
diff --git a/dfschema/core/legacy/v1.py b/dfschema/core/legacy/v1.py
@@ -1,4 +1,4 @@
-from pydantic import BaseModel, Extra, Field, PositiveInt
+from pydantic.v1 import BaseModel, Extra, Field, PositiveInt
 
 # import json
 

diff --git a/dfschema/core/metadata.py b/dfschema/core/metadata.py
@@ -2,7 +2,7 @@
 from datetime import date
 from typing import Optional
 
-from pydantic import BaseModel, Field
+from pydantic.v1 import BaseModel, Field
 
 from .config import CURRENT_PROTOCOL_VERSION
 

diff --git a/dfschema/core/shape.py b/dfschema/core/shape.py
@@ -1,6 +1,6 @@
 from typing import Optional
 import pandas as pd
-from pydantic import BaseModel, Field, PositiveInt, Extra
+from pydantic.v1 import BaseModel, Field, PositiveInt, Extra
 
 from .exceptions import DataFrameValidationError
 from .collector import exception_collector