Merge pull request #320 from debbiemarkslab/compare_af
Compare predicted 3D models
thomashopf authored Nov 13, 2024
2 parents fd0572e + 745d180 commit 86d55a7
Showing 5 changed files with 544 additions and 43 deletions.
17 changes: 17 additions & 0 deletions config/sample_config_monomer.txt
@@ -284,6 +284,17 @@ compare:
# Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences.
pdb_alignment_method: jackhmmer

# Uncomment to enable comparison to predicted models (e.g. AlphaFoldDB, as specified in databases section below);
# will inherit configuration as passed to compare stage, with any keys specified in the compare_models
# subsection taking precedence
# compare_models:
# max_num_hits: 1
# pdb_alignment_method: jackhmmer
# use_bitscores: True
# domain_threshold: 1.5
# sequence_threshold: 1.5
# iterations: 1

# Settings for Mutation effect predictions
mutate:
# Options: standard
@@ -414,6 +425,12 @@ databases:
sifts_mapping_table: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.o2.csv
sifts_sequence_db: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.o2.fasta

# Mapping onto predicted 3D structure models
# modeldb_type: alphafolddb_v4
# modeldb_sequence_file: /n/groups/marks/databases/alphafolddb/2022-11-14/sequences.fasta
# modeldb_list_file: /n/groups/marks/databases/alphafolddb/2022-11-14/accession_ids.csv
# modeldb_file_dir:

# Paths to external tools used by evcouplings. Please refer to README.md for installation instructions and which tools are required.
tools:
jackhmmer: /n/groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/jackhmmer
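As a usage sketch, the two new sections might look like this once uncommented in a monomer job config (the AlphaFoldDB paths below are placeholders for a local mirror, not shipped defaults):

compare:
    # ... existing compare settings ...
    compare_models:
        max_num_hits: 1
        pdb_alignment_method: jackhmmer
        use_bitscores: True
        domain_threshold: 1.5
        sequence_threshold: 1.5
        iterations: 1

databases:
    # ... existing database entries ...
    modeldb_type: alphafolddb_v4
    modeldb_sequence_file: /path/to/alphafolddb/sequences.fasta
    modeldb_list_file: /path/to/alphafolddb/accession_ids.csv
    modeldb_file_dir: /path/to/alphafolddb/models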
137 changes: 102 additions & 35 deletions evcouplings/compare/pdb.py
@@ -17,6 +17,7 @@
import requests
import msgpack
from Bio.PDB.binary_cif import _decode
from Bio.PDB.MMCIF2Dict import MMCIF2Dict

from evcouplings.utils.config import InvalidParameterError
from evcouplings.utils.constants import AA3_to_AA1
@@ -412,9 +413,9 @@ class PDB:
Holds PDB structure from binaryCIF format; supersedes original PDB class based
on MMTF format (renamed to MmtfPDB, cf. below) due to MMTF retirement in 2024
"""
def __init__(self, filehandle, keep_full_data=False):
def __init__(self, filehandle, binary=True, keep_full_data=False):
"""
Initialize by parsing binaryCIF from open filehandle.
Initialize by parsing binaryCIF/CIF from open filehandle.
Recommended to use from_file() and from_id() class methods to create object.
Column extraction and decoding based on https://github.com/biopython/biopython/blob/master/Bio/PDB/binary_cif.py
@@ -423,18 +424,23 @@ def __init__(self, filehandle, keep_full_data=False):
----------
filehandle: file-like object
Open filehandle (binary) from which to read binaryCIF data
binary: bool (default: True)
Indicates whether the file is binaryCIF (True) or a regular text-based CIF file (False)
keep_full_data: bool (default: False)
Associate raw extracted data with object
"""
# unpack information in bCIF file
raw_data = msgpack.unpack(
filehandle, use_list=True
)
if binary:
# unpack information in bCIF file
raw_data = msgpack.unpack(
filehandle, use_list=True
)

data = {
f"{category['name']}.{column['name']}": column
for block in raw_data["dataBlocks"] for category in block["categories"] for column in category["columns"]
}
data = {
f"{category['name']}.{column['name']}": column
for block in raw_data["dataBlocks"] for category in block["categories"] for column in category["columns"]
}
else:
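# parse text-based CIF with Biopython's MMCIF2Dict, which returns a flat dict
# keyed by "_category.item"; looped categories map to lists of string values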
data = MMCIF2Dict(filehandle)

ATOM_TARGET_COLS = {
"_atom_site.pdbx_PDB_model_num": "model_number",
@@ -498,23 +504,55 @@ def __init__(self, filehandle, keep_full_data=False):
self.data = None

# decode information into dataframe with BioPython helper method
self.atom_table = pd.DataFrame({
name: _decode(data[source_column]) for source_column, name in ATOM_TARGET_COLS.items()
}).assign(
# make sure chain identifiers are strings, in some pathologic cases, these are int rather than str
# (e.g. entry 6swy)
auth_asym_id=lambda df: df.auth_asym_id.astype(str),
label_asym_id=lambda df: df.label_asym_id.astype(str),
)
if binary:
self.atom_table = pd.DataFrame({
name: _decode(data[source_column]) for source_column, name in ATOM_TARGET_COLS.items()
}).assign(
# make sure chain identifiers are strings, in some pathologic cases, these are int rather than str
# (e.g. entry 6swy)
auth_asym_id=lambda df: df.auth_asym_id.astype(str),
label_asym_id=lambda df: df.label_asym_id.astype(str),
)
else:
self.atom_table = pd.DataFrame({
name: data[source_column] for source_column, name in ATOM_TARGET_COLS.items()
}).replace(
# replace with empty values for consistency with bCIF parsing (otherwise could use pd.NA here)
{"?": "", ".": ""}
).assign(
# update column types for float columns
x=lambda df: pd.to_numeric(df.x),
y=lambda df: pd.to_numeric(df.y),
z=lambda df: pd.to_numeric(df.z),
occupancy=lambda df: pd.to_numeric(df.occupancy),
b_factor=lambda df: pd.to_numeric(df.b_factor),
# update data types for int columns
model_number=lambda df: df.model_number.astype("int32"),
id=lambda df: df.id.astype("int32"),
label_entity_id=lambda df: df.label_entity_id.astype("int32"),
# align behaviour with missing value coding in bCIF parser for label_seq_id
label_seq_id=lambda df: df.label_seq_id.replace("", 0).astype("int32"),
auth_seq_id=lambda df: df.auth_seq_id.astype("int32"),
)

# decode information into dataframe with BioPython helper method; note this section may not be
# present if no helices exist in the structure
try:
self.conf_table = pd.DataFrame({
name: _decode(data[source_column]) for source_column, name in CONF_TARGET_COLS.items()
}).query(
# there are a handful of PDB entries that have (probably wrong) secondary structure assignments
# extending over more than one segment (e.g. 2bp7, 2wjv), drop these rather than raising an error
if binary:
self.conf_table = pd.DataFrame({
name: _decode(data[source_column]) for source_column, name in CONF_TARGET_COLS.items()
})
else:
self.conf_table = pd.DataFrame({
name: data[source_column] for source_column, name in CONF_TARGET_COLS.items()
}).assign(
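# align integer dtypes with what the bCIF decoder produces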
beg_label_seq_id=lambda df: df.beg_label_seq_id.astype("int32"),
end_label_seq_id=lambda df: df.end_label_seq_id.astype("int32"),
)

# there are a handful of PDB entries that have (probably wrong) secondary structure assignments
# extending over more than one segment (e.g. 2bp7, 2wjv), drop these rather than raising an error
self.conf_table = self.conf_table.query(
"beg_label_asym_id == end_label_asym_id"
)
except KeyError:
@@ -523,9 +561,19 @@ def __init__(self, filehandle, keep_full_data=False):
# decode information into dataframe with BioPython helper method; note this section may not be
# present if no sheets exist in the structure
try:
self.sheet_table = pd.DataFrame({
name: _decode(data[source_column]) for source_column, name in SHEET_TARGET_COLS.items()
})
if binary:
self.sheet_table = pd.DataFrame({
name: _decode(data[source_column]) for source_column, name in SHEET_TARGET_COLS.items()
})
else:
self.sheet_table = pd.DataFrame({
name: data[source_column] for source_column, name in SHEET_TARGET_COLS.items()
}).assign(
id=lambda df: df.id.astype("int32"),
beg_label_seq_id=lambda df: df.beg_label_seq_id.astype("int32"),
end_label_seq_id=lambda df: df.end_label_seq_id.astype("int32"),
)

except KeyError:
self.sheet_table = None

@@ -593,9 +641,14 @@ def __init__(self, filehandle, keep_full_data=False):
@classmethod
def from_file(cls, filename, keep_full_data=False):
"""
Initialize structure from binaryCIF file
Initialize structure from binaryCIF or CIF file (gzipped or not).
Note that for simplicity, this function determines whether a file is CIF or bCIF, and gzipped
or not, solely from the case-insensitive filename extension (.cif.gz, .cif, .bcif.gz, .bcif)
as supplied by the PDB, rather than inspecting the file contents. If this logic does not hold
in your use case, supply an appropriate filehandle and binary=True/False directly to the
constructor of this class.
inspired by https://github.com/biopython/biopython/blob/master/Bio/PDB/binary_cif.py
Inspired by https://github.com/biopython/biopython/blob/master/Bio/PDB/binary_cif.py
Parameters
----------
@@ -610,11 +663,23 @@ def from_file(cls, filename, keep_full_data=False):
initialized PDB structure
"""
try:
with (
gzip.open(filename, mode="rb")
if filename.lower().endswith(".gz") else open(filename, mode="rb")
) as f:
return cls(f, keep_full_data=keep_full_data)
fnl = filename.lower()

# determine if gzipped or not, use appropriate function to open file
if fnl.endswith(".gz"):
openfunc = gzip.open
else:
openfunc = open

# check if binaryCIF or text-based CIF, adjust file open mode accordingly
binary = fnl.endswith(".bcif") or fnl.endswith(".bcif.gz")
if binary:
mode = "rb"
else:
# use "rt" explicitly: gzip.open treats plain "r" as binary mode,
# and MMCIF2Dict requires a text-mode handle
mode = "rt"

with openfunc(filename, mode=mode) as f:
return cls(f, binary=binary, keep_full_data=keep_full_data)
except IOError as e:
raise ResourceError(
"Could not open file {}".format(filename)
@@ -1280,7 +1345,7 @@ def get_chain(self, chain, model=0):
return Chain(res_df, coord_df)


def load_structures(pdb_ids, structure_dir=None, raise_missing=True):
def load_structures(pdb_ids, structure_dir=None, raise_missing=True, extension=".bcif.gz"):
"""
Load PDB structures from files / web
@@ -1298,6 +1363,8 @@ def load_structures(pdb_ids, structure_dir=None, raise_missing=True):
Raise a ResourceError exception if any of the
PDB IDs cannot be loaded. If False, missing
entries will be ignored.
extension: str, optional (default: ".bcif.gz")
File extension appended to each identifier when loading files from a local structure_dir
Returns
-------
@@ -1320,7 +1387,7 @@ def load_structures(pdb_ids, structure_dir=None, raise_missing=True):

has_file = False
if structure_dir is not None:
structure_file = path.join(structure_dir, pdb_id + ".mmtf")
structure_file = path.join(structure_dir, pdb_id + extension)
has_file = valid_file(structure_file)

try:
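Putting the new pieces together, a minimal usage sketch (the accession, file names, and directory below are hypothetical; chain "A" assumes the single-chain layout of AlphaFoldDB models):

from evcouplings.compare.pdb import PDB, load_structures

# from_file() infers gzip compression and binary vs. text CIF purely from the
# case-insensitive file extension (.bcif, .bcif.gz, .cif, .cif.gz)
model = PDB.from_file("AF-P0A7B8-F1-model_v4.cif.gz")
chain = model.get_chain("A")

# load_structures() now appends a configurable extension to each identifier
# when reading from a local directory (previously hard-coded to ".mmtf")
structures = load_structures(
    ["AF-P0A7B8-F1-model_v4"],
    structure_dir="/path/to/alphafolddb/models",
    extension=".cif.gz",
)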