Skip to content

Commit

Permalink
Merge pull request #172 from databio/dev_deps
Browse files Browse the repository at this point in the history
Release 0.4.2
  • Loading branch information
khoroshevskyi authored Oct 7, 2024
2 parents 2809e8e + f71a279 commit 06e4e21
Show file tree
Hide file tree
Showing 44 changed files with 186 additions and 113 deletions.
5 changes: 1 addition & 4 deletions .github/workflows/run-pytest-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,11 @@ jobs:
python-version: ${{ matrix.python-version }}
cache: 'pip' # caching can speed up the workflow by reusing the installed dependencies

- name: Install dev dependencies
run: if [ -f requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi

- name: Install test dependencies
run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi

- name: Install package
run: python -m pip install .
run: python -m pip install .[ml]

- name: Run pytest tests
run: pytest tests -x -vv --remote-data
9 changes: 3 additions & 6 deletions .github/workflows/run-pytest-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,18 @@ jobs:
os: [ubuntu-latest]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Install dev dependencies
run: if [ -f requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi

- name: Install test dependencies
run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi

- name: Install package
run: python -m pip install .
run: python -m pip install .[ml]

- name: Run pytest tests
run: pytest tests -x -vv --remote-data
26 changes: 25 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,32 @@ Geniml is a python package for building machine learning models of genomic inter

Documentation is hosted at <https://docs.bedbase.org/geniml/>.


## Installation
### To install `geniml`, use these commands:

If no extras are specified, only the default dependencies are installed;
these do NOT include machine learning (ML) or heavy processing libraries.


From PyPI:
```
pip install geniml
```
or install the latest version from the GitHub repository:
```
pip install git+https://github.com/databio/geniml.git
```

### To install the machine learning (ML) dependencies, use this command:

From PyPI:
```
pip install geniml[ml]
```


## Development

Run tests (from `/tests`) with `pytest`. Please read the [contributor guide](https://docs.bedbase.org/geniml/contributing/) to contribute.


2 changes: 1 addition & 1 deletion geniml/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.1"
__version__ = "0.4.2"
2 changes: 1 addition & 1 deletion geniml/atacformer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import List, Tuple

import torch
from genimtools.utils import read_tokens_from_gtok
from gtars.utils import read_tokens_from_gtok
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset

Expand Down
23 changes: 13 additions & 10 deletions geniml/bbclient/bbclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,17 @@
import zarr
from botocore.exceptions import ClientError
from pybiocfilecache import BiocFileCache
from pybiocfilecache._exceptions import RnameExistsError
from contextlib import suppress
from ubiquerg import is_url
from zarr.errors import PathNotFoundError
from zarr import Array
from zarr.errors import PathNotFoundError

from ..exceptions import TokenizedFileNotFoundError, TokenizedFileNotFoundInCacheError
from ..io.io import BedSet, RegionSet
from ..io.utils import is_gzipped
from .const import (
BED_TOKENS_PATTERN,
BEDFILE_URL_PATTERN,
BEDSET_URL_PATTERN,
DEFAULT_BEDBASE_API,
Expand All @@ -30,7 +33,6 @@
DEFAULT_CACHE_FOLDER,
DEFAULT_ZARR_FOLDER,
MODULE_NAME,
BED_TOKENS_PATTERN,
)
from .utils import BedCacheManager, get_abs_path

Expand Down Expand Up @@ -146,11 +148,12 @@ def add_bedset_to_cache(self, bedset: BedSet) -> str:
self.bedset_cache.add(bedset_id, fpath=file_path, action="asis")
return bedset_id

def add_bed_to_cache(self, bedfile: Union[RegionSet, str]) -> str:
def add_bed_to_cache(self, bedfile: Union[RegionSet, str], force: bool = False) -> str:
"""
Add a BED file to the cache
:param bedfile: a RegionSet object or a path or url to the BED file
:param force: whether to overwrite the existing file in cache
:return: the RegionSet identifier
"""

Expand All @@ -163,15 +166,14 @@ def add_bed_to_cache(self, bedfile: Union[RegionSet, str]) -> str:

bedfile_id = bedfile.compute_bed_identifier()
file_path = self._bedfile_path(bedfile_id)
if os.path.exists(file_path):
if os.path.exists(file_path) and not force:
_LOGGER.info(f"{file_path} already exists in cache.")
else:
if bedfile.path is None or is_url(bedfile.path):
# write the regions to .bed.gz file
with gzip.open(file_path, "wt") as f:
for region in bedfile:
f.write(f"{region.chr}\t{region.start}\t{region.end}\n")

compression_opts = dict(method="zip", archive_name=f"{bedfile_id}.bed")
bedfile.to_pandas().to_csv(
file_path, index=False, compression=compression_opts, header=False, sep="\t"
)
else:
# copy the BED file out of cache
if is_gzipped(bedfile.path):
Expand All @@ -181,7 +183,8 @@ def add_bed_to_cache(self, bedfile: Union[RegionSet, str]) -> str:
with open(bedfile.path, "rb") as f_in:
with gzip.open(file_path, "wb") as f_out:
shutil.copyfileobj(f_in, f_out)
self.bedfile_cache.add(bedfile_id, fpath=file_path, action="asis")
with suppress(RnameExistsError):
self.bedfile_cache.add(bedfile_id, fpath=file_path, action="asis")
return bedfile_id

def add_bed_tokens_to_cache(self, bed_id: str, universe_id: str) -> None:
Expand Down
2 changes: 1 addition & 1 deletion geniml/bbclient/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from logging import getLogger

from .const import MODULE_NAME, DEFAULT_CACHE_FOLDER
from .const import DEFAULT_CACHE_FOLDER, MODULE_NAME

_LOGGER = getLogger(MODULE_NAME)

Expand Down
1 change: 0 additions & 1 deletion geniml/bedshift/bedshift.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import logmuse
import numpy as np
import pandas as pd
from genimtools.ailist import AIList, Interval

from geniml.bedshift import BedshiftYAMLHandler, arguments
from geniml.bedshift._version import __version__
Expand Down
6 changes: 3 additions & 3 deletions geniml/region2vec/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .main import Region2Vec, Region2VecExModel
from .main_legacy import region2vec
from .pooling import *
# from .main import Region2Vec, Region2VecExModel
# from .main_legacy import region2vec
#
13 changes: 9 additions & 4 deletions geniml/region2vec/experimental.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
import logging
from typing import List, Tuple

import torch
import torch.nn as nn
from rich.progress import track
from torch.utils.data import Dataset
try:
import torch
import torch.nn as nn
from rich.progress import track
from torch.utils.data import Dataset
except ImportError:
raise ImportError(
"Please install Machine Learning dependencies by running 'pip install geniml[ml]'"
)

from .const import DEFAULT_N_SHUFFLES, DEFAULT_NS_POWER, DEFAULT_WINDOW_SIZE, MODULE_NAME
from .utils import shuffle_documents
Expand Down
14 changes: 10 additions & 4 deletions geniml/region2vec/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,18 @@
from typing import List, Union

import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence

try:
import torch
except ImportError:
raise ImportError(
"Please install Machine Learning dependencies by running 'pip install geniml[ml]'"
)

from gtars.tokenizers import Region as GRegion
from gtars.tokenizers import RegionSet as GRegionSet
from huggingface_hub import hf_hub_download
from rich.progress import track
from genimtools.tokenizers import RegionSet as GRegionSet
from genimtools.tokenizers import Region as GRegion

from ..io import Region, RegionSet
from ..models import ExModel
Expand Down
9 changes: 7 additions & 2 deletions geniml/region2vec/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import torch
import torch.nn as nn
try:
import torch
import torch.nn as nn
except ImportError:
raise ImportError(
"Please install Machine Learning dependencies by running 'pip install geniml[ml]'"
)

from .const import DEFAULT_EMBEDDING_DIM, POOLING_TYPES

Expand Down
1 change: 0 additions & 1 deletion geniml/region2vec/region_shuffling.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import argparse
import datetime
import glob
import os
import pickle
Expand Down
10 changes: 8 additions & 2 deletions geniml/region2vec/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,14 @@
from typing import TYPE_CHECKING, Dict, List, Tuple, Union

import numpy as np
import torch
from genimtools.utils import read_tokens_from_gtok

try:
import torch
except ImportError:
raise ImportError(
"Please install Machine Learning dependencies by running 'pip install geniml[ml]'"
)
from gtars.utils import read_tokens_from_gtok
from yaml import safe_dump, safe_load

if TYPE_CHECKING:
Expand Down
8 changes: 4 additions & 4 deletions geniml/scembed/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .annotation import *
from .const import *
from .main import *
from .utils import *
# from .annotation import *
# from .const import *
# from .main import *
# from .utils import *
2 changes: 1 addition & 1 deletion geniml/scembed/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
load_local_region2vec_model,
train_region2vec_model,
)
from ..tokenization import AnnDataTokenizer, Tokenizer
from ..tokenization.main import AnnDataTokenizer, Tokenizer
from .const import MODULE_NAME

_GENSIM_LOGGER = getLogger("gensim")
Expand Down
2 changes: 1 addition & 1 deletion geniml/scembed/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import scanpy as sc
import torch
from genimtools.utils import read_tokens_from_gtok
from gtars.utils import read_tokens_from_gtok
from rich.progress import track
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
Expand Down
1 change: 1 addition & 0 deletions geniml/search/backends/filebackend.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# )

import numpy as np

from geniml.search.const import (
DEFAULT_DIM,
DEFAULT_EF,
Expand Down
1 change: 1 addition & 0 deletions geniml/search/interfaces/text2bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Dict, List, Union

import numpy as np

from geniml.const import PKG_NAME

from ..backends import HNSWBackend, QdrantBackend
Expand Down
2 changes: 1 addition & 1 deletion geniml/search/query2vec/bed2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from ...const import PKG_NAME
from ...io import RegionSet
from ...region2vec import Region2VecExModel
from ...region2vec.main import Region2VecExModel
from .abstract import Query2Vec

_LOGGER = logging.getLogger(PKG_NAME)
Expand Down
10 changes: 8 additions & 2 deletions geniml/text2bednn/text2bednn.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,15 @@

import matplotlib.pyplot as plt
import numpy as np
import torch

try:
import torch
from torch.nn import CosineEmbeddingLoss, CosineSimilarity, Linear, MSELoss, ReLU, Sequential
except ImportError:
raise ImportError(
"Please install Machine Learning dependencies by running 'pip install geniml[ml]'"
)
from huggingface_hub import hf_hub_download
from torch.nn import CosineEmbeddingLoss, CosineSimilarity, Linear, MSELoss, ReLU, Sequential
from yaml import safe_dump, safe_load

from .const import (
Expand Down
10 changes: 8 additions & 2 deletions geniml/text2bednn/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,14 @@

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset

try:
import torch
from torch.utils.data import DataLoader, TensorDataset
except ImportError:
raise ImportError(
"Please install Machine Learning dependencies by running 'pip install geniml[ml]'"
)

from .const import (
DEFAULT_BATCH_SIZE,
Expand Down
4 changes: 2 additions & 2 deletions geniml/tokenization/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .main import Tokenizer, AnnDataTokenizer, TreeTokenizer
from .main import hard_tokenization_main as hard_tokenization
# from .main import Tokenizer, AnnDataTokenizer, TreeTokenizer
# from .main import hard_tokenization_main as hard_tokenization
8 changes: 3 additions & 5 deletions geniml/tokenization/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@

import numpy as np
import scanpy as sc
from gtars.tokenizers import Region as GRegion
from gtars.tokenizers import TreeTokenizer as GTreeTokenizer
from huggingface_hub import hf_hub_download
from rich.progress import track

from geniml.tokenization.split_file import split_file
from geniml.io import Region, RegionSet
from genimtools.tokenizers import (
TreeTokenizer as GTreeTokenizer,
Region as GRegion,
)
from geniml.tokenization.split_file import split_file

from .hard_tokenization_batch import main as hard_tokenization
from .utils import Timer, time_str
Expand Down
4 changes: 2 additions & 2 deletions geniml/training/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

from ..atacformer.main import AtacformerExModel
from ..nn import GradientReversal
from ..region2vec import Region2VecExModel
from ..scembed import ScEmbed
from ..region2vec.main import Region2VecExModel
from ..scembed.main import ScEmbed
from .const import BATCH_CORRECTION_ADVERSARIAL_TRAINING_MODES

_LOGGER = logging.getLogger(__name__)
Expand Down
3 changes: 1 addition & 2 deletions geniml/training/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@

import scanpy as sc
import torch
from gtars.tokenizers import TreeTokenizer
from rich.progress import track
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset

from genimtools.tokenizers import TreeTokenizer


@contextlib.contextmanager
def tempseed(seed: int):
Expand Down
Loading

0 comments on commit 06e4e21

Please sign in to comment.