Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chain renaming when converting cif to pdb #167

Open
zzhangzzhang opened this issue Nov 8, 2024 · 0 comments
Open

chain renaming when converting cif to pdb #167

zzhangzzhang opened this issue Nov 8, 2024 · 0 comments

Comments

@zzhangzzhang
Copy link

Really awesome tool! Thanks so much for offering it!

Is your tool request related to a problem? Please describe.
Many bioassembly cif files have chain IDs like A-2, A-3, B-1 etc.. This will not work for converting to PDB file, since PDB only takes single character.

I edited the code so that it would relabel the chain id, successfully convert the cif file to pdb file, and also output a json file with chain mapping.

def convert_to_pdb(fhandle, output_prefix):
    """Converts a structure in mmCIF format to PDB format and saves chain ID mapping."""
    
    _a = "{:<6s}{:>5d} {:<4s}{:1s}{:<3s} {:1s}{:>4d}{:1s}   {:>8.3f}{:>8.3f}{:>8.3f}"
    _a += "{:>6.2f}{:>6.2f}      {:<4s}{:<2s}{:2s}\n"

    in_section, read_atom = False, False

    label_pos = 0
    labels = {}
    empty = set(('.', '?'))

    prev_model = None
    atom_num = 0
    serial = 0  # Do not read serial numbers from mmCIF; they may be incorrect.

    # Initialize the chain ID mapping
    chain_id_map = OrderedDict()  # Original chain ID -> new single-character chain ID
    chain_id_counter = 0
    allowed_chain_ids = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz')

    model_data = []  # Store atom data for multi-model files

    for line in fhandle:
        if line.startswith('loop_'):  # Start of a section
            in_section = True

        elif line.startswith('#'):  # End of a section
            in_section = False
            read_atom = False

        elif in_section and line.startswith('_atom_site.'):  # ATOM/HETATM labels
            read_atom = True
            labels[line.strip()] = label_pos
            label_pos += 1

        elif read_atom and line.startswith(('ATOM', 'HETATM')):  # Process atom lines
            fields = re.findall(r'[^"\s]\S*|".+?"', line)  # Handle quoted strings

            # Handle model number
            model_no = fields[labels.get('_atom_site.pdbx_PDB_model_num')]
            if prev_model != model_no:
                prev_model = model_no
                model_data.append([])
                serial = 0

            record = fields[labels.get('_atom_site.group_PDB')]
            serial += 1

            # Atom name
            fid = labels.get('_atom_site.auth_atom_id') or labels.get('_atom_site.label_atom_id')
            atname = fields[fid].strip('"')

            element = fields[labels.get('_atom_site.type_symbol')]
            if element in empty:
                element = ' '

            if len(atname.strip()) < 4 and atname[0].isalpha() and len(element.strip()) < 2:
                atname = ' ' + atname  # Pad atom name

            altloc = fields[labels.get('_atom_site.label_alt_id')]
            if altloc in empty:
                altloc = ' '

            # Residue name
            fid = labels.get('_atom_site.auth_comp_id') or labels.get('_atom_site.label_comp_id')
            resname = fields[fid]

            # Original chain ID
            fid = labels.get('_atom_site.auth_asym_id') or labels.get('_atom_site.label_asym_id')
            original_chainid = fields[fid]

            # Map to a single-character chain ID
            if original_chainid not in chain_id_map:
                if chain_id_counter >= len(allowed_chain_ids):
                    emsg = 'ERROR!! Too many unique chains; ran out of chain IDs to assign.\n'
                    sys.stderr.write(emsg)
                    sys.exit(1)
                new_chainid = allowed_chain_ids[chain_id_counter]
                chain_id_map[original_chainid] = new_chainid
                chain_id_counter += 1
            else:
                new_chainid = chain_id_map[original_chainid]

            chainid = new_chainid

            # Residue number
            fid = labels.get('_atom_site.auth_seq_id') or labels.get('_atom_site.label_seq_id')
            resnum = int(fields[fid])

            icode = fields[labels.get('_atom_site.pdbx_PDB_ins_code')]
            if icode in empty:
                icode = ' '

            x = float(fields[labels.get('_atom_site.Cartn_x')])
            y = float(fields[labels.get('_atom_site.Cartn_y')])
            z = float(fields[labels.get('_atom_site.Cartn_z')])
            occ = float(fields[labels.get('_atom_site.occupancy')])
            bfactor = float(fields[labels.get('_atom_site.B_iso_or_equiv')])

            charge = fields[labels.get('_atom_site.pdbx_formal_charge')]
            if charge in empty:
                charge = '  '

            segid = original_chainid[:4]  # Optionally store original chain ID in segID (max 4 chars)

            atom_line = _a.format(record, serial, atname, altloc, resname,
                                  chainid, resnum, icode, x, y, z, occ, bfactor,
                                  segid, element, charge)

            atom_num += 1

            # Check for PDB format limitations
            if atom_num > 99999:
                emsg = f"ERROR!! Number of atoms exceeds PDB format limit: '{atom_num}'\n"
                sys.stderr.write(emsg)
                sys.exit(1)
            elif resnum > 9999:
                emsg = f"ERROR!! Residue number exceeds PDB format limit: '{resnum}'\n"
                sys.stderr.write(emsg)
                sys.exit(1)

            model_data[-1].append(atom_line)

    # Write the chain ID mapping to a JSON file
    json_filename = f"{output_prefix}_chain_id_map.json"
    with open(json_filename, 'w') as json_file:
        json.dump(chain_id_map, json_file, indent=4)
    # Inform the user
    sys.stderr.write(f"Chain ID mapping saved to {json_filename}\n")

    # Write the output
    is_ensemble = len(model_data) > 1
    if is_ensemble:
        for model_no, model in enumerate(model_data, start=1):
            yield f"MODEL     {model_no:>4d}\n"
            for line in model:
                yield line
            yield 'ENDMDL\n'
    else:
        for line in model_data[0]:
            yield line

    yield "END\n"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

1 participant