Skip to content

Commit

Permalink
124 add strain information linked with the species csv (#126)
Browse files Browse the repository at this point in the history
* feat: working on adding species -- bit complicated because of duplicates

* feat: working generator for species

* feat: add MGI strain registries
  • Loading branch information
dbirman authored Jan 24, 2025
1 parent 7b609c0 commit aab568c
Show file tree
Hide file tree
Showing 5 changed files with 114 additions and 17 deletions.
3 changes: 2 additions & 1 deletion src/aind_data_schema_models/_generators/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import argparse
from jinja2 import Environment
import pandas as pd
from aind_data_schema_models.utils import to_class_name, to_class_name_underscored
from aind_data_schema_models.utils import to_class_name, to_class_name_underscored, unique_rows
from pathlib import Path
import subprocess

Expand Down Expand Up @@ -35,6 +35,7 @@ def generate_code(data_type: str, root_path: str, isort: bool = True, black: boo
env = Environment()
env.filters["to_class_name"] = to_class_name
env.filters["to_class_name_underscored"] = to_class_name_underscored
env.filters["unique_rows"] = unique_rows
rendered_template = env.from_string(template)

# Render template with data
Expand Down
13 changes: 7 additions & 6 deletions src/aind_data_schema_models/_generators/models/species.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name,registry_abbreviation,registry_identifier
Callithrix jacchus,NCBI,NCBI:txid9483
Homo sapiens,NCBI,NCBI:txid9606
Macaca mulatta,NCBI,NCBI:txid9544
Mus musculus,NCBI,NCBI:txid10090
Rattus norvegicus,NCBI,NCBI:txid10116
species,species_registry_abbreviation,species_registry_identifier,strain,strain_registry_abbreviation,strain_registry_identifier
Callithrix jacchus,NCBI,NCBI:txid9483,default,,
Homo sapiens,NCBI,NCBI:txid9606,default,,
Macaca mulatta,NCBI,NCBI:txid9544,default,,
Mus musculus,NCBI,NCBI:txid10090,C57BL/6J,MGI,MGI:3028467
Mus musculus,NCBI,NCBI:txid10090,BALB/c,MGI,MGI:2159737
Rattus norvegicus,NCBI,NCBI:txid10116,default,,
51 changes: 42 additions & 9 deletions src/aind_data_schema_models/_generators/templates/species.txt
Original file line number Diff line number Diff line change
@@ -1,30 +1,63 @@
"""Species"""
{% set species_data = data | unique_rows("species") %}
{% set strain_data = data | unique_rows("strain") %}
{% raw -%}
from pydantic import BaseModel, Field, ConfigDict
from typing import Literal, Union
from typing_extensions import Annotated
from aind_data_schema_models.registries import Registry
{% endraw %}

class StrainModel(BaseModel):
"""Base model for a strain"""
model_config = ConfigDict(frozen=True)
name: str
species: str
registry: Registry.ONE_OF
registry_identifier: str

{% for _, row in strain_data.iterrows() %}
{% if row['strain'] != 'default' %}
class {{ row['strain'] | to_class_name_underscored }}(StrainModel):
"""Model {{row['strain']}}"""
name: Literal["{{ row['strain'] }}"] = "{{ row['strain'] }}"
species: Literal["{{ row['species'] }}"] = "{{ row['species'] }}"
registry: Registry.ONE_OF = Registry.{{ row['strain_registry_abbreviation'] | upper }}
registry_identifier: Literal["{{ row['strain_registry_identifier'] }}"] = "{{ row['strain_registry_identifier'] }}"
{% endif %}
{% endfor %}
class Strain:
"""Strain"""
{% for _, row in strain_data.iterrows() %}
{% if row['strain'] != 'default' %}
{{ row['strain'] | to_class_name | upper }} = {{ row['strain'] | to_class_name_underscored }}()
{% endif %}
{%- endfor %}

ALL = tuple(StrainModel.__subclasses__())

ONE_OF = Annotated[Union[tuple(StrainModel.__subclasses__())], Field(discriminator="name")]


class SpeciesModel(BaseModel):
"""Base model for platform"""
"""Base model for species"""
model_config = ConfigDict(frozen=True)
name: str
registry: Registry.ONE_OF
registry_identifier: str

{% for _, row in data.iterrows() %}
class {{ row['name'] | to_class_name_underscored }}(SpeciesModel):
"""Model {{row['name']}}"""
name: Literal["{{ row['name'] }}"] = "{{ row['name'] }}"
registry: Registry.ONE_OF = Registry.{{ row['registry_abbreviation'] | upper }}
registry_identifier: Literal["{{ row['registry_identifier'] }}"] = "{{ row['registry_identifier'] }}"
{% for _, row in species_data.iterrows() %}
class {{ row['species'] | to_class_name_underscored }}(SpeciesModel):
"""Model {{row['species']}}"""
name: Literal["{{ row['species'] }}"] = "{{ row['species'] }}"
registry: Registry.ONE_OF = Registry.{{ row['species_registry_abbreviation'] | upper }}
registry_identifier: Literal["{{ row['species_registry_identifier'] }}"] = "{{ row['species_registry_identifier'] }}"

{% endfor %}
class Species:
"""Species"""
{% for _, row in data.iterrows() %}
{{ row['name'] | to_class_name | upper }} = {{ row['name'] | to_class_name_underscored }}()
{% for _, row in species_data.iterrows() %}
{{ row['species'] | to_class_name | upper }} = {{ row['species'] | to_class_name_underscored }}()
{%- endfor %}

ALL = tuple(SpeciesModel.__subclasses__())
Expand Down
42 changes: 41 additions & 1 deletion src/aind_data_schema_models/species.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,48 @@
from aind_data_schema_models.registries import Registry


# Common schema for a strain record; the concrete strain classes that follow subclass it
# and pin each field to a fixed value.
class StrainModel(BaseModel):
"""Base model for a strain"""

# frozen=True is handed to pydantic's ConfigDict (pydantic documents this as making instances immutable).
model_config = ConfigDict(frozen=True)
# Strain name, e.g. "C57BL/6J"; also the field Strain.ONE_OF discriminates on.
name: str
# Name of the species the strain belongs to, e.g. "Mus musculus".
species: str
# Registry that issued the identifier; typed with the project's Registry.ONE_OF annotation.
registry: Registry.ONE_OF
# Identifier of the strain inside that registry, e.g. "MGI:3028467".
registry_identifier: str


# Concrete strain model: every field is narrowed to a single Literal (or fixed registry)
# and defaulted, so the class can be instantiated with no arguments.
class _C57Bl_6J(StrainModel):
"""Model C57BL/6J"""

# Literal + default of the same value: the only accepted name is "C57BL/6J".
name: Literal["C57BL/6J"] = "C57BL/6J"
species: Literal["Mus musculus"] = "Mus musculus"
# Defaults to the MGI registry entry.
registry: Registry.ONE_OF = Registry.MGI
registry_identifier: Literal["MGI:3028467"] = "MGI:3028467"


# Concrete strain model: every field is narrowed to a single Literal (or fixed registry)
# and defaulted, so the class can be instantiated with no arguments.
class _Balb_C(StrainModel):
"""Model BALB/c"""

# Literal + default of the same value: the only accepted name is "BALB/c".
name: Literal["BALB/c"] = "BALB/c"
species: Literal["Mus musculus"] = "Mus musculus"
# Defaults to the MGI registry entry.
registry: Registry.ONE_OF = Registry.MGI
registry_identifier: Literal["MGI:2159737"] = "MGI:2159737"


# Namespace class: holds one ready-made instance per strain model plus two helpers
# derived from the subclasses of StrainModel.
class Strain:
"""Strain"""

# Pre-built instances of the concrete strain models.
C57BL_6J = _C57Bl_6J()

BALB_C = _Balb_C()

# Tuple of StrainModel's direct subclasses, captured when this class body runs;
# subclasses defined later are not included.
ALL = tuple(StrainModel.__subclasses__())

# Union over the same subclasses, annotated so pydantic discriminates on the "name" field.
ONE_OF = Annotated[Union[tuple(StrainModel.__subclasses__())], Field(discriminator="name")]


class SpeciesModel(BaseModel):
"""Base model for platform"""
"""Base model for species"""

model_config = ConfigDict(frozen=True)
name: str
Expand Down
22 changes: 22 additions & 0 deletions src/aind_data_schema_models/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,37 @@
from pydantic import BaseModel, Field
from typing import Union, List, Type, Any
from typing_extensions import Annotated
import pandas as pd


def unique_rows(value: pd.DataFrame, key: str) -> pd.DataFrame:
    """Return the rows holding the first occurrence of each value in a key column.

    Registered as a Jinja filter by the generator, so the dataframe arrives as
    the first (piped) argument: ``data | unique_rows("species")``.

    Parameters
    ----------
    value : pd.DataFrame
        The data to filter.
    key : str
        The column whose values must be unique in the result.

    Returns
    -------
    pd.DataFrame
        A new dataframe with the original row order, index labels and columns,
        keeping only the first row seen for each distinct ``key`` value.
        Missing values in ``key`` are treated as one value. An empty input
        yields an empty dataframe that still has the input's columns.

    Raises
    ------
    KeyError
        If ``key`` is not a column of ``value``.
    """
    # drop_duplicates keeps dtypes and the column schema even when no rows remain,
    # which rebuilding a frame from an (empty) list of rows does not.
    return value.drop_duplicates(subset=key, keep="first")


def to_class_name_underscored(name: str) -> str:
    """Convert a name to an underscore-prefixed class name.

    The name is coerced to ``str`` and title-cased, every run of non-word
    characters (anything outside letters, digits and ``_``) is collapsed into a
    single underscore, and one underscore is prepended.

    Example: ``"BALB/c"`` becomes ``"_Balb_C"``.
    """
    # \W+ also matches whitespace, so no spaces are left to strip afterwards.
    return "_" + re.sub(r"\W+", "_", str(name).title())


def to_class_name(name: str) -> str:
    """Convert a name to a valid identifier-style class name.

    The name is coerced to ``str`` and title-cased, then each individual
    non-word character (anything outside letters, digits and ``_``) is replaced
    by an underscore; runs are not collapsed. If the result would start with a
    digit, an underscore is inserted in front of it.

    Examples: ``"C57BL/6J"`` becomes ``"C57Bl_6J"`` and ``"6J"`` becomes ``"_6J"``.
    """
    # The ^(?=\d) alternative is a zero-width match that inserts "_" before a leading digit.
    # \W already covers whitespace, so no spaces are left to strip afterwards.
    return re.sub(r"\W|^(?=\d)", "_", str(name).title())


Expand Down

0 comments on commit aab568c

Please sign in to comment.