You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Really awesome tool! Thanks so much for offering it!
Is your tool request related to a problem? Please describe.
Many bioassembly cif files have chain IDs like A-2, A-3, B-1 etc.. This will not work for converting to PDB file, since PDB only takes single character.
I edited the code so that it would relabel the chain id, successfully convert the cif file to pdb file, and also output a json file with chain mapping.
def convert_to_pdb(fhandle, output_prefix):
"""Converts a structure in mmCIF format to PDB format and saves chain ID mapping."""
_a = "{:<6s}{:>5d} {:<4s}{:1s}{:<3s} {:1s}{:>4d}{:1s} {:>8.3f}{:>8.3f}{:>8.3f}"
_a += "{:>6.2f}{:>6.2f} {:<4s}{:<2s}{:2s}\n"
in_section, read_atom = False, False
label_pos = 0
labels = {}
empty = set(('.', '?'))
prev_model = None
atom_num = 0
serial = 0 # Do not read serial numbers from mmCIF; they may be incorrect.
# Initialize the chain ID mapping
chain_id_map = OrderedDict() # Original chain ID -> new single-character chain ID
chain_id_counter = 0
allowed_chain_ids = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz')
model_data = [] # Store atom data for multi-model files
for line in fhandle:
if line.startswith('loop_'): # Start of a section
in_section = True
elif line.startswith('#'): # End of a section
in_section = False
read_atom = False
elif in_section and line.startswith('_atom_site.'): # ATOM/HETATM labels
read_atom = True
labels[line.strip()] = label_pos
label_pos += 1
elif read_atom and line.startswith(('ATOM', 'HETATM')): # Process atom lines
fields = re.findall(r'[^"\s]\S*|".+?"', line) # Handle quoted strings
# Handle model number
model_no = fields[labels.get('_atom_site.pdbx_PDB_model_num')]
if prev_model != model_no:
prev_model = model_no
model_data.append([])
serial = 0
record = fields[labels.get('_atom_site.group_PDB')]
serial += 1
# Atom name
fid = labels.get('_atom_site.auth_atom_id') or labels.get('_atom_site.label_atom_id')
atname = fields[fid].strip('"')
element = fields[labels.get('_atom_site.type_symbol')]
if element in empty:
element = ' '
if len(atname.strip()) < 4 and atname[0].isalpha() and len(element.strip()) < 2:
atname = ' ' + atname # Pad atom name
altloc = fields[labels.get('_atom_site.label_alt_id')]
if altloc in empty:
altloc = ' '
# Residue name
fid = labels.get('_atom_site.auth_comp_id') or labels.get('_atom_site.label_comp_id')
resname = fields[fid]
# Original chain ID
fid = labels.get('_atom_site.auth_asym_id') or labels.get('_atom_site.label_asym_id')
original_chainid = fields[fid]
# Map to a single-character chain ID
if original_chainid not in chain_id_map:
if chain_id_counter >= len(allowed_chain_ids):
emsg = 'ERROR!! Too many unique chains; ran out of chain IDs to assign.\n'
sys.stderr.write(emsg)
sys.exit(1)
new_chainid = allowed_chain_ids[chain_id_counter]
chain_id_map[original_chainid] = new_chainid
chain_id_counter += 1
else:
new_chainid = chain_id_map[original_chainid]
chainid = new_chainid
# Residue number
fid = labels.get('_atom_site.auth_seq_id') or labels.get('_atom_site.label_seq_id')
resnum = int(fields[fid])
icode = fields[labels.get('_atom_site.pdbx_PDB_ins_code')]
if icode in empty:
icode = ' '
x = float(fields[labels.get('_atom_site.Cartn_x')])
y = float(fields[labels.get('_atom_site.Cartn_y')])
z = float(fields[labels.get('_atom_site.Cartn_z')])
occ = float(fields[labels.get('_atom_site.occupancy')])
bfactor = float(fields[labels.get('_atom_site.B_iso_or_equiv')])
charge = fields[labels.get('_atom_site.pdbx_formal_charge')]
if charge in empty:
charge = ' '
segid = original_chainid[:4] # Optionally store original chain ID in segID (max 4 chars)
atom_line = _a.format(record, serial, atname, altloc, resname,
chainid, resnum, icode, x, y, z, occ, bfactor,
segid, element, charge)
atom_num += 1
# Check for PDB format limitations
if atom_num > 99999:
emsg = f"ERROR!! Number of atoms exceeds PDB format limit: '{atom_num}'\n"
sys.stderr.write(emsg)
sys.exit(1)
elif resnum > 9999:
emsg = f"ERROR!! Residue number exceeds PDB format limit: '{resnum}'\n"
sys.stderr.write(emsg)
sys.exit(1)
model_data[-1].append(atom_line)
# Write the chain ID mapping to a JSON file
json_filename = f"{output_prefix}_chain_id_map.json"
with open(json_filename, 'w') as json_file:
json.dump(chain_id_map, json_file, indent=4)
# Inform the user
sys.stderr.write(f"Chain ID mapping saved to {json_filename}\n")
# Write the output
is_ensemble = len(model_data) > 1
if is_ensemble:
for model_no, model in enumerate(model_data, start=1):
yield f"MODEL {model_no:>4d}\n"
for line in model:
yield line
yield 'ENDMDL\n'
else:
for line in model_data[0]:
yield line
yield "END\n"
The text was updated successfully, but these errors were encountered:
Really awesome tool! Thanks so much for offering it!
Is your tool request related to a problem? Please describe.
Many bioassembly cif files have chain IDs like A-2, A-3, B-1 etc.. This will not work for converting to PDB file, since PDB only takes single character.
I edited the code so that it would relabel the chain id, successfully convert the cif file to pdb file, and also output a json file with chain mapping.
The text was updated successfully, but these errors were encountered: