Merge pull request #51 from PathwayCommons/major-refactor
Major refactor
JohnGiorgi authored Feb 25, 2021
2 parents 0b396d6 + 472398b commit 6e3c35d
Showing 7 changed files with 166 additions and 130 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -1,6 +1,7 @@
![build](https://github.com/PathwayCommons/semantic-search/workflows/build/badge.svg)
[![codecov](https://codecov.io/gh/PathwayCommons/semantic-search/branch/master/graph/badge.svg)](https://codecov.io/gh/PathwayCommons/semantic-search)
[![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
![GitHub](https://img.shields.io/github/license/PathwayCommons/semantic-search?color=blue)

# Scientific Semantic Search

1 change: 1 addition & 0 deletions semantic_search/__init__.py
@@ -0,0 +1 @@
__version__ = "0.1.0"
96 changes: 96 additions & 0 deletions semantic_search/common/util.py
@@ -0,0 +1,96 @@
from enum import Enum
from typing import Tuple, List, Optional

import torch
import typer
from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PreTrainedTokenizer


class Emoji(Enum):
    # Emojis used in typer.secho calls
# See: https://github.com/carpedm20/emoji/blob/master/emoji/unicode_codes.py
SUCCESS = "\U00002705"
WARNING = "\U000026A0"
FAST = "\U0001F3C3"


def get_device(cuda_device: int = -1) -> torch.device:
"""Return a `torch.cuda` device if `torch.cuda.is_available()` and `cuda_device>=0`.
Otherwise returns a `torch.cpu` device.
"""
if cuda_device != -1 and torch.cuda.is_available():
device = torch.device("cuda")
typer.secho(
f"{Emoji.FAST.value} Using CUDA device {torch.cuda.get_device_name()} with index"
f" {torch.cuda.current_device()}.",
fg=typer.colors.GREEN,
bold=True,
)
else:
device = torch.device("cpu")
typer.secho(
f"{Emoji.WARNING.value} Using CPU. Note that this will be many times slower than a GPU.",
fg=typer.colors.YELLOW,
bold=True,
)
return device


def setup_model_and_tokenizer(
pretrained_model_name_or_path: str, cuda_device: int = -1
) -> Tuple[PreTrainedTokenizer, PreTrainedModel]:
"""Given a HuggingFace Transformer `pretrained_model_name_or_path`, return the corresponding
model and tokenizer. Optionally, places the model on `cuda_device`, if available.
"""
device = get_device(cuda_device)
# Load the Transformers tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
typer.secho(
(
f'{Emoji.SUCCESS.value} Tokenizer "{pretrained_model_name_or_path}" from Transformers'
" loaded successfully."
),
fg=typer.colors.GREEN,
bold=True,
)
# Load the Transformers model
model = AutoModel.from_pretrained(pretrained_model_name_or_path)
model = model.to(device)
model.eval()
typer.secho(
(
f'{Emoji.SUCCESS.value} Model "{pretrained_model_name_or_path}" from Transformers'
" loaded successfully."
),
fg=typer.colors.GREEN,
bold=True,
)

return tokenizer, model


@torch.no_grad()
def encode_with_transformer(
text: List[str],
tokenizer: PreTrainedTokenizer,
model: PreTrainedModel,
max_length: Optional[int] = None,
mean_pool: bool = True,
) -> torch.Tensor:
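    """Return one embedding per string in `text`.

    If `mean_pool` is True, the embedding is a mean over the token embeddings, weighted
    by the attention mask; otherwise the first token's embedding (e.g. `[CLS]`) is used.
    """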

inputs = tokenizer(
text, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
)
for name, tensor in inputs.items():
inputs[name] = tensor.to(model.device)
attention_mask = inputs["attention_mask"]
output = model(**inputs).last_hidden_state

if mean_pool:
embedding = torch.sum(output * attention_mask.unsqueeze(-1), dim=1) / torch.clamp(
torch.sum(attention_mask, dim=1, keepdims=True), min=1e-9
)
else:
embedding = output[:, 0, :]

return embedding
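
For context, a minimal sketch of how these new helpers compose (assumes network access to download the model; the 768 below assumes a base-sized hidden state):

```python
from semantic_search.common.util import encode_with_transformer, setup_model_and_tokenizer

# Downloads "johngiorgi/declutr-sci-base" on first use; -1 keeps everything on CPU.
tokenizer, model = setup_model_and_tokenizer("johngiorgi/declutr-sci-base", cuda_device=-1)
embeddings = encode_with_transformer(
    ["BRCA1 mutations are associated with breast cancer."],
    tokenizer=tokenizer,
    model=model,
    mean_pool=True,  # masked mean over tokens; False takes the first-token embedding instead
)
print(embeddings.shape)  # torch.Size([1, 768]) for this base-sized model
```
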
137 changes: 13 additions & 124 deletions semantic_search/main.py
@@ -1,26 +1,19 @@
from operator import itemgetter
from typing import Callable, Dict, List, Optional, Tuple, cast
from typing import Dict, List, Optional, Tuple, cast

import torch
import typer
from fastapi import FastAPI
from pydantic import BaseModel, BaseSettings, validator
from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PreTrainedTokenizer
from pydantic import BaseSettings

from semantic_search.ncbi import uids_to_docs
from semantic_search.common.util import encode_with_transformer, setup_model_and_tokenizer
from semantic_search.schemas import Model, Query
from semantic_search import __version__

PRETRAINED_MODEL = "johngiorgi/declutr-sci-base"

UID = str

# Emoji's used in typer.secho calls
# See: https://github.com/carpedm20/emoji/blob/master/emoji/unicode_codes.py
SUCCESS = "\U00002705"
WARNING = "\U000026A0"
FAST = "\U0001F3C3"


app = FastAPI()
app = FastAPI(
title="Scientific Semantic Search",
description="A simple semantic search engine for scientific papers.",
version=__version__,
)


class Settings(BaseSettings):
@@ -30,121 +23,17 @@ class Settings(BaseSettings):
`CUDA_DEVICE=0 MAX_LENGTH=384 uvicorn semantic_search.main:app`
"""

pretrained_model_name_or_path: str = PRETRAINED_MODEL
pretrained_model_name_or_path: str = "johngiorgi/declutr-sci-base"
batch_size: int = 64
max_length: Optional[int] = None
mean_pool: bool = True
cuda_device: int = -1

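Since `Settings` extends pydantic's `BaseSettings`, each field above can be overridden from the environment, as the docstring's `uvicorn` example shows. A hypothetical sketch of the same mechanism in plain Python:

```python
import os

# Hypothetical overrides; pydantic matches environment variables to
# field names case-insensitively.
os.environ["BATCH_SIZE"] = "32"
os.environ["MAX_LENGTH"] = "384"

from semantic_search.main import Settings

settings = Settings()  # values resolve from the environment, else the defaults
assert settings.batch_size == 32 and settings.max_length == 384
```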

class Model(BaseModel):
tokenizer: PreTrainedModel = None
model: PreTrainedTokenizer = None
similarity: Callable[..., torch.Tensor] = None # type: ignore

class Config:
arbitrary_types_allowed = True


class Document(BaseModel):
uid: UID
text: str


class Query(BaseModel):
query: Document
documents: List[Document] = []
top_k: Optional[int] = None

@validator("query", "documents", pre=True)
def normalize_document(cls, v, field):
if field.name == "query":
v = [v]

normalized_docs = []
for doc in v:
if isinstance(doc, UID):
normalized_docs.append(Document(**uids_to_docs([doc])[0]))
else:
normalized_docs.append(doc)
return normalized_docs[0] if field.name == "query" else normalized_docs


settings = Settings()
model = Model()


def _get_device(cuda_device):
"""Return a `torch.cuda` device if `torch.cuda.is_available()` and `cuda_device>=0`.
Otherwise returns a `torch.cpu` device.
"""
if cuda_device != -1 and torch.cuda.is_available():
device = torch.device("cuda")
typer.secho(
f"{FAST} Using CUDA device {torch.cuda.get_device_name()} with index {torch.cuda.current_device()}.",
fg=typer.colors.GREEN,
bold=True,
)
else:
device = torch.device("cpu")
typer.secho(
f"{WARNING} Using CPU. Note that this will be many times slower than a GPU.",
fg=typer.colors.YELLOW,
bold=True,
)
return device


def _setup_model_and_tokenizer(
pretrained_model_name_or_path: str, cuda_device: int = -1
) -> Tuple[PreTrainedTokenizer, PreTrainedModel]:
device = _get_device(cuda_device)
# Load the Transformers tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
typer.secho(
f'{SUCCESS} Tokenizer "{pretrained_model_name_or_path}" from Transformers loaded successfully.',
fg=typer.colors.GREEN,
bold=True,
)
# Load the Transformers model
model = AutoModel.from_pretrained(pretrained_model_name_or_path)
model = model.to(device)
model.eval()
typer.secho(
f'{SUCCESS} Model "{pretrained_model_name_or_path}" from Transformers loaded successfully.',
fg=typer.colors.GREEN,
bold=True,
)

return tokenizer, model


@torch.no_grad()
def _encode(
text: List[str],
tokenizer: PreTrainedTokenizer,
model: PreTrainedModel,
mean_pool: bool = True,
) -> torch.Tensor:

inputs = tokenizer(
text, padding=True, truncation=True, max_length=settings.max_length, return_tensors="pt"
)
for name, tensor in inputs.items():
inputs[name] = tensor.to(model.device)
attention_mask = inputs["attention_mask"]
output = model(**inputs).last_hidden_state

if mean_pool:
embedding = torch.sum(output * attention_mask.unsqueeze(-1), dim=1) / torch.clamp(
torch.sum(attention_mask, dim=1, keepdims=True), min=1e-9
)
else:
embedding = output[:, 0, :]

return embedding


def encode(text: List[str]) -> torch.Tensor:
# Sort the inputs by length, maintaining the original indices so we can un-sort
# before returning the embeddings. This speeds up embedding by minimizing the
@@ -160,7 +49,7 @@ def encode(text: List[str]) -> torch.Tensor:

embeddings: torch.Tensor = []
for i in range(0, len(text), settings.batch_size):
embedding = _encode(
embedding = encode_with_transformer(
list(text[i : i + settings.batch_size]),
tokenizer=model.tokenizer,
model=model.model,
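
The sort-then-unsort idiom the comment above refers to keeps similarly sized inputs in the same batch, minimizing padding. A self-contained sketch of the idea (independent of this module's helpers):

```python
from operator import itemgetter

text = ["a much longer input sentence", "short", "medium length"]

# Sort by length, remembering each text's original position.
unsorted_indices, sorted_text = zip(*sorted(enumerate(text), key=lambda pair: len(pair[1])))

# ...embed sorted_text batch by batch; like-length inputs now share batches...
results = [f"embedding({t})" for t in sorted_text]  # stand-in for real embeddings

# Un-sort the results back into the caller's original order.
restored = [r for _, r in sorted(zip(unsorted_indices, results), key=itemgetter(0))]
```
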
@@ -179,7 +68,7 @@ def encode(text: List[str]) -> torch.Tensor:
@app.on_event("startup")
def app_startup():

model.tokenizer, model.model = _setup_model_and_tokenizer(
model.tokenizer, model.model = setup_model_and_tokenizer(
settings.pretrained_model_name_or_path, cuda_device=settings.cuda_device
)
model.similarity = torch.nn.CosineSimilarity(-1)
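
`torch.nn.CosineSimilarity(-1)` compares vectors along their last dimension; presumably the search endpoint (not shown in this diff) uses it to score documents against the query. A minimal sketch with random stand-in embeddings:

```python
import torch

similarity = torch.nn.CosineSimilarity(dim=-1)
query_embedding = torch.randn(1, 768)  # (1, hidden_size)
doc_embeddings = torch.randn(5, 768)   # (num_docs, hidden_size)
scores = similarity(query_embedding, doc_embeddings)  # broadcasts to shape (5,)
top_k = torch.topk(scores, k=3).indices  # indices of the three best matches
```
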
44 changes: 44 additions & 0 deletions semantic_search/schemas.py
@@ -0,0 +1,44 @@
from typing import Callable, List, Optional

import torch
from pydantic import BaseModel, validator
from transformers import PreTrainedModel, PreTrainedTokenizer

from semantic_search.ncbi import uids_to_docs

UID = str

# See: https://fastapi.tiangolo.com/tutorial/body/ for more details on creating a Request Body.


class Model(BaseModel):
    tokenizer: PreTrainedTokenizer = None
    model: PreTrainedModel = None
similarity: Callable[..., torch.Tensor] = None # type: ignore

class Config:
arbitrary_types_allowed = True


class Document(BaseModel):
uid: UID
text: str


class Query(BaseModel):
query: Document
documents: List[Document] = []
top_k: Optional[int] = None

@validator("query", "documents", pre=True)
def normalize_document(cls, v, field):
if field.name == "query":
v = [v]

normalized_docs = []
for doc in v:
if isinstance(doc, UID):
normalized_docs.append(Document(**uids_to_docs([doc])[0]))
else:
normalized_docs.append(doc)
return normalized_docs[0] if field.name == "query" else normalized_docs
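
The `pre=True` validator above lets clients pass either full documents or bare UID strings; strings are resolved to `Document`s through `uids_to_docs` (an NCBI lookup). A sketch using inline documents, which needs no network access:

```python
from semantic_search.schemas import Query

query = Query(
    query={"uid": "1", "text": "BRCA1 mutations are associated with breast cancer."},
    documents=[
        {"uid": "2", "text": "TP53 is a tumour suppressor gene."},
        {"uid": "3", "text": "The mitochondrion is the powerhouse of the cell."},
    ],
    top_k=1,
)
# Passing a bare UID string instead would trigger a uids_to_docs fetch for that UID.
```
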
12 changes: 6 additions & 6 deletions setup.py
@@ -8,7 +8,7 @@
version="0.1.0",
author="John Giorgi",
author_email="[email protected]",
description=("A simple semantic search engine powered by HuggingFace's Transformers library."),
description=("A simple semantic search engine for scientific papers."),
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/PathwayCommons/semantic-search",
@@ -21,18 +21,18 @@
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Typing :: Typed",
],
python_requires=">=3.7.0",
install_requires=[
"fastapi>=0.62.0",
"uvicorn>=0.13.0",
"torch>=1.7.0",
"transformers>=4.0.1,<4.4.0",
"fastapi>=0.63.0",
"uvicorn>=0.13.4",
"torch>=1.7.1",
"transformers>=4.3.3",
"typer>=0.3.2",
"python-dotenv>=0.15.0",
"xmltodict>=0.12.0",
5 changes: 5 additions & 0 deletions tests/test_semantic_search.py
@@ -0,0 +1,5 @@
from semantic_search import __version__


def test_version():
assert __version__ == "0.1.0"
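
A sketch of running the new test module directly (assumes pytest, which is not listed in `install_requires`):

```python
import pytest

# Equivalent to running `pytest tests/test_semantic_search.py -q` from a shell.
pytest.main(["tests/test_semantic_search.py", "-q"])
```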
