Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve pdb_merge to include TERs and END lines #150

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 162 additions & 15 deletions pdbtools/pdb_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,18 @@
"""
Merges several PDB files into one.

The contents are not sorted and no lines are deleted (e.g. END, TER
statements) so we recommend piping the results through `pdb_tidy.py`.
Use `pdb_mkensemble` if you wish to make an ensemble of multiple
conformation states of the same protein.

Follows the criteria:

* The merged PDB file will represent a single MODEL.
* Non-coordinate lines in input PDBs will be ignored.
* Atom numbers are restarted from 1.
* CONECT lines are yielded at the end. CONECT numbers are updated to
the new atom numbers.
* TER and END statements are placed accordingly. TER and END lines
present in the input files are discarded and regenerated.

Usage:
python pdb_merge.py <pdb file> <pdb file>
Expand All @@ -41,23 +51,34 @@
__email__ = "[email protected]"


# Python 2.7 compatibility: the name FileNotFoundError is not defined
# there, so evaluating it raises NameError and it is aliased to IOError.
try:
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError


def check_input(args):
"""Checks whether to read from stdin/file and validates user input/options.
"""

# Defaults
fl = [] # file list

if len(args) >= 1:
if len(args) == 1:
sys.stderr.write('ERROR!! Please provide more than one input file.')
sys.stderr.write(__doc__)
sys.exit(1)

elif len(args) >= 1:
for fn in args:
if not os.path.isfile(fn):
emsg = 'ERROR!! File not found or not readable: \'{}\'\n'
emsg = 'ERROR!! File not found or not readable: \'{}\''
sys.stderr.write(emsg.format(fn))
sys.stderr.write(__doc__)
sys.exit(1)

fh = open(fn, 'r')
fl.append(fh)
fl.append(fn)

else: # Whatever ...
sys.stderr.write(__doc__)
Expand All @@ -66,25 +87,151 @@ def check_input(args):
return fl


def run(flist):
# Fixed-column TER record (80 columns plus line separator):
#   cols  1-6   "TER   "
#   cols  7-11  serial number
#   cols 18-20  residue name
#   col  22     chain identifier
#   cols 23-26  residue sequence number
#   col  27     insertion code
# TER     606      LEU A  75
_fmt_TER = "TER   {:>5d}      {:3s} {:1s}{:>4s}{:1s}" + " " * 53 + os.linesep


def make_TER(prev_line, fmt_TER=_fmt_TER):
    """
    Create a TER statement based on the last ATOM/HETATM/ANISOU line.

    Parameters
    ----------
    prev_line : str
        Coordinate line preceding the TER statement. Its serial number
        (columns 7-11) must be an integer.

    fmt_TER : str, optional
        Format string receiving, in this order, the serial number (int),
        residue name, chain identifier, residue number and insertion
        code (all str).

    Returns
    -------
    str
        The TER line, numbered one past the serial of `prev_line`.

    Raises
    ------
    ValueError
        If the serial number field of `prev_line` is not an integer.
    """
    serial = int(prev_line[6:11]) + 1
    rname = prev_line[17:20]
    # Slices instead of single indexes: a line truncated before these
    # columns yields an empty field (padded by the format) rather than
    # raising IndexError.
    chain = prev_line[21:22]
    resid = prev_line[22:26]
    icode = prev_line[26:27]

    return fmt_TER.format(serial, rname, chain, resid, icode)


def _get_lines_from_input(pinput, i=0):
"""Decide wheter input is file or lines."""
try:
return open(pinput, 'r')
except (FileNotFoundError, TypeError):
return pinput


def _update_atom_number(line, number, anisou=('ANISOU',)):
if line.startswith(anisou):
number -= 1
return line[:6] + str(number).rjust(5) + line[11:]


def run(input_list):
    """
    Merge PDB files into a single file.

    Follows the criteria:

        * The merged PDB file represents a single MODEL; MODEL and
          ENDMDL lines found in the inputs are dropped.
        * Only ATOM, HETATM, ANISOU and CONECT lines are read; every
          other line in the inputs is ignored.
        * Atom numbers are restarted from 1. ANISOU lines share the
          number of the atom line preceding them.
        * A TER statement is generated whenever an ATOM/ANISOU line is
          followed by a HETATM line or by a line of a different chain.
        * CONECT lines are yielded at the end. CONECT numbers are
          updated to the new atom numbers; the mapping is restarted
          for each input.
        * A single END statement closes the output.

    Use `pdb_mkensemble` if you wish to make an ensemble of multiple
    conformation states of the same protein.

    Parameters
    ----------
    input_list : iterator of iterators
        `input_list` can be:
            * a list of file paths
            * a list of file handlers
            * a list of lists of lines, the latter representing the
              content of the different input PDB files

        Inputs exposing a `.close()` method are closed once consumed.

    Yields
    ------
    str (line-by-line)
        Lines from the merged PDB files.
    """
    coord_records = ('ATOM', 'HETATM', 'ANISOU')
    atom_anisou = ('ATOM', 'ANISOU')
    atom_hetatm = ('ATOM', 'HETATM')
    hetatm = ('HETATM',)
    anisou = ('ANISOU',)
    conect = ('CONECT',)

    # CONECT logic taken from pdb_preatom
    fmt_CONECT = "CONECT{:>5s}{:>5s}{:>5s}{:>5s}{:>5s}" + " " * 49 + os.linesep
    char_ranges = (
        slice(6, 11),
        slice(11, 16),
        slice(16, 21),
        slice(21, 26),
        slice(26, 31),
    )

    prev_chain = None
    prev_line = ''
    conect_lines = []
    atom_number = 1

    for input_item in input_list:

        lines = _get_lines_from_input(input_item)

        # Old serial -> new serial, used to rewrite CONECT statements.
        # Restarted at each PDB because every input numbers its atoms
        # independently. The empty key keeps blank CONECT fields blank.
        serial_equiv = {'': ''}

        try:
            for line in lines:

                # CONECT lines are handled before any chain/TER logic:
                # columns 22-26 hold a serial there, not a chain id.
                if line.startswith(conect):
                    # 6:11, 11:16, 16:21, 21:26, 26:31
                    serials = (line[cr].strip() for cr in char_ranges)

                    # If not found, keep the original serial
                    new_serials = (
                        str(serial_equiv.get(s, s)) for s in serials
                    )
                    conect_lines.append(fmt_CONECT.format(*new_serials))
                    continue

                # Everything that is not a coordinate line is dropped;
                # this includes MODEL/ENDMDL, as the output is a single
                # model.
                if not line.startswith(coord_records):
                    continue

                chain = line[21:22]

                # A TER is due when leaving an ATOM/ANISOU stretch,
                # either into HETATM lines or into another chain.
                # NOTE(review): no TER is emitted after a trailing
                # ATOM stretch at the very end of the inputs.
                if prev_line.startswith(atom_anisou) and (
                        line.startswith(hetatm) or chain != prev_chain):
                    yield _update_atom_number(
                        make_TER(prev_line), atom_number)
                    atom_number += 1

                # Recorded after a possible TER insertion so the map
                # points at the number this line actually receives.
                if line.startswith(atom_hetatm):
                    serial_equiv[line[6:11].strip()] = atom_number

                yield _update_atom_number(line, atom_number)

                # ANISOU lines are numbered as their preceding atom
                # (see `_update_atom_number`) and must not consume a
                # serial of their own.
                if not line.startswith(anisou):
                    atom_number += 1

                prev_line = line
                prev_chain = chain

        finally:
            # Also runs when the generator is closed before exhaustion.
            close = getattr(lines, 'close', None)
            if close is not None:
                close()

    for line in conect_lines:
        yield line

    yield 'END' + os.linesep


# Alias exposing `run` under the name `concatenate_files`
# (presumably kept for callers of an earlier API; verify before removing).
concatenate_files = run
Expand Down
15 changes: 15 additions & 0 deletions tests/data/dummy_merge_A.pdb
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
HEADER CHAIN A FOR pdb_merge
ATOM 1 N ASN A 1 22.066 40.557 0.420 1.00 0.00 N
ATOM 2 CA BASN A 1 20.000 30.000 0.005 0.60 0.00 C
ATOM 3 CA AASN A 1 21.411 39.311 0.054 0.40 0.00 C
ATOM 4 C ASN A 1 22.143 38.629 -1.102 1.00 0.00 C
ATOM 5 O ASN A 1 21.581 38.297 -2.176 1.00 0.00 O
ATOM 6 N ARG A 2 23.408 38.395 -0.829 1.00 0.00 N
ATOM 7 CA ARG A 2 24.384 37.823 -1.757 1.00 0.00 C
ATOM 8 C ARG A 2 24.061 36.421 -2.189 1.00 0.00 C
ATOM 9 O ARG A 2 24.411 36.106 -3.325 1.00 0.00 O
ATOM 10 N GLU A 3 23.456 35.570 -1.408 1.00 0.00 N
ATOM 11 CA GLU A 3 23.064 34.219 -1.780 1.00 0.00 C
ATOM 12 C GLU A 3 21.682 34.241 -2.380 1.00 0.00 C
ATOM 13 O GLU A 3 21.239 33.190 -2.786 1.00 0.00 O
TER 17 HOH A 6
14 changes: 14 additions & 0 deletions tests/data/dummy_merge_B.pdb
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
HEADER CHAIN B FOR pdb_merge
ATOM 1 N ARG B 1 36.898 42.175 -2.688 1.00 0.00 N
ATOM 2 CA ARG B 1 37.080 43.455 -3.421 1.00 0.00 C
ATOM 3 C ARG B 1 36.102 44.524 -2.998 1.00 0.00 C
ATOM 4 O ARG B 1 36.577 45.677 -2.879 1.00 0.00 O
ATOM 5 N GLU B 2 34.849 44.167 -2.710 1.00 0.00 N
ATOM 6 CA GLU B 2 33.861 45.127 -2.233 1.00 0.00 C
ATOM 7 C GLU B 2 34.180 45.629 -0.820 1.00 0.00 C
ATOM 8 O GLU B 2 33.914 46.775 -0.464 1.00 0.00 O
ATOM 9 N ALA B 3 34.725 44.679 -0.087 1.00 0.00 N
ATOM 10 CA ALA B 3 35.081 45.036 1.305 1.00 0.00 C
ATOM 11 C ALA B 3 36.213 46.067 1.258 1.00 0.00 C
ATOM 12 O ALA B 3 36.287 47.046 2.028 1.00 0.00 O
HETATM 13 O HOH B 4 11.052 -12.419 29.700 1.00 73.70
21 changes: 21 additions & 0 deletions tests/data/dummy_merge_C.pdb
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
ATOM 1 N ARG C 1 36.898 42.175 -2.688 1.00 0.00 N
ATOM 2 CA ARG C 1 37.080 43.455 -3.421 1.00 0.00 C
ATOM 3 C ARG C 1 36.102 44.524 -2.998 1.00 0.00 C
ATOM 4 O ARG C 1 36.577 45.677 -2.879 1.00 0.00 O
ATOM 5 N GLU C 2 34.849 44.167 -2.710 1.00 0.00 N
ATOM 6 CA GLU C 2 33.861 45.127 -2.233 1.00 0.00 C
ATOM 7 C GLU C 2 34.180 45.629 -0.820 1.00 0.00 C
ATOM 8 O GLU C 2 33.914 46.775 -0.464 1.00 0.00 O
ATOM 9 N MET C 3 43.010 -16.998 71.911 1.00 54.34
ATOM 10 CA MET C 3 42.850 -16.494 70.506 1.00 52.98
ATOM 11 C MET C 3 41.752 -17.205 69.684 1.00 52.05
ATOM 12 O MET C 3 41.560 -18.418 69.777 1.00 54.00
TER 13 MET C 3
HETATM 14 O HOH C 4 -8.172 -22.003 57.197 1.00 70.53
HETATM 15 O HOH C 5 36.020 -23.583 73.186 1.00 24.82
HETATM 16 O HOH C 6 41.203 -28.852 57.698 1.00 53.16
HETATM 17 O HOH C 7 -4.491 -9.687 56.752 1.00 55.08
HETATM 18 O HOH C 8 24.561 0.532 70.565 1.00 44.77
CONECT 16 17
CONECT 1 2 4 5

49 changes: 49 additions & 0 deletions tests/data/dummy_merged.pdb
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
ATOM 1 N ASN A 1 22.066 40.557 0.420 1.00 0.00 N
ATOM 2 CA BASN A 1 20.000 30.000 0.005 0.60 0.00 C
ATOM 3 CA AASN A 1 21.411 39.311 0.054 0.40 0.00 C
ATOM 4 C ASN A 1 22.143 38.629 -1.102 1.00 0.00 C
ATOM 5 O ASN A 1 21.581 38.297 -2.176 1.00 0.00 O
ATOM 6 N ARG A 2 23.408 38.395 -0.829 1.00 0.00 N
ATOM 7 CA ARG A 2 24.384 37.823 -1.757 1.00 0.00 C
ATOM 8 C ARG A 2 24.061 36.421 -2.189 1.00 0.00 C
ATOM 9 O ARG A 2 24.411 36.106 -3.325 1.00 0.00 O
ATOM 10 N GLU A 3 23.456 35.570 -1.408 1.00 0.00 N
ATOM 11 CA GLU A 3 23.064 34.219 -1.780 1.00 0.00 C
ATOM 12 C GLU A 3 21.682 34.241 -2.380 1.00 0.00 C
ATOM 13 O GLU A 3 21.239 33.190 -2.786 1.00 0.00 O
TER 14 GLU A 3
ATOM 15 N ARG B 1 36.898 42.175 -2.688 1.00 0.00 N
ATOM 16 CA ARG B 1 37.080 43.455 -3.421 1.00 0.00 C
ATOM 17 C ARG B 1 36.102 44.524 -2.998 1.00 0.00 C
ATOM 18 O ARG B 1 36.577 45.677 -2.879 1.00 0.00 O
ATOM 19 N GLU B 2 34.849 44.167 -2.710 1.00 0.00 N
ATOM 20 CA GLU B 2 33.861 45.127 -2.233 1.00 0.00 C
ATOM 21 C GLU B 2 34.180 45.629 -0.820 1.00 0.00 C
ATOM 22 O GLU B 2 33.914 46.775 -0.464 1.00 0.00 O
ATOM 23 N ALA B 3 34.725 44.679 -0.087 1.00 0.00 N
ATOM 24 CA ALA B 3 35.081 45.036 1.305 1.00 0.00 C
ATOM 25 C ALA B 3 36.213 46.067 1.258 1.00 0.00 C
ATOM 26 O ALA B 3 36.287 47.046 2.028 1.00 0.00 O
TER 27 ALA B 3
HETATM 28 O HOH B 4 11.052 -12.419 29.700 1.00 73.70
ATOM 29 N ARG C 1 36.898 42.175 -2.688 1.00 0.00 N
ATOM 30 CA ARG C 1 37.080 43.455 -3.421 1.00 0.00 C
ATOM 31 C ARG C 1 36.102 44.524 -2.998 1.00 0.00 C
ATOM 32 O ARG C 1 36.577 45.677 -2.879 1.00 0.00 O
ATOM 33 N GLU C 2 34.849 44.167 -2.710 1.00 0.00 N
ATOM 34 CA GLU C 2 33.861 45.127 -2.233 1.00 0.00 C
ATOM 35 C GLU C 2 34.180 45.629 -0.820 1.00 0.00 C
ATOM 36 O GLU C 2 33.914 46.775 -0.464 1.00 0.00 O
ATOM 37 N MET C 3 43.010 -16.998 71.911 1.00 54.34
ATOM 38 CA MET C 3 42.850 -16.494 70.506 1.00 52.98
ATOM 39 C MET C 3 41.752 -17.205 69.684 1.00 52.05
ATOM 40 O MET C 3 41.560 -18.418 69.777 1.00 54.00
TER 41 MET C 3
HETATM 42 O HOH C 4 -8.172 -22.003 57.197 1.00 70.53
HETATM 43 O HOH C 5 36.020 -23.583 73.186 1.00 24.82
HETATM 44 O HOH C 6 41.203 -28.852 57.698 1.00 53.16
HETATM 45 O HOH C 7 -4.491 -9.687 56.752 1.00 55.08
HETATM 46 O HOH C 8 24.561 0.532 70.565 1.00 44.77
CONECT 44 45
CONECT 29 30 32 33
END
Loading