mol_featurizer.py

# -*- coding: utf-8 -*-
"""
@Time:Created on 2019/4/30 14:19
@author: LiFan Chen
@Filename: mol_featurizer.py
@Software: PyCharm
"""
import numpy as np
import joblib
from joblib import parallel_backend, Parallel, delayed
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors

num_atom_feat = 34
def one_of_k_encoding(x, allowable_set):
    if x not in allowable_set:
        raise Exception("input {0} not in allowable set{1}:".format(
            x, allowable_set))
    return [x == s for s in allowable_set]


def one_of_k_encoding_unk(x, allowable_set):
    """Maps inputs not in the allowable set to the last element."""
    if x not in allowable_set:
        x = allowable_set[-1]
    return [x == s for s in allowable_set]


def atom_features(atom,explicit_H=False,use_chirality=True):
    """Generate atom features including atom symbol(10),degree(7),formal charge,
    radical electrons,hybridization(6),aromatic(1),Chirality(3)
    """
    symbol = ['C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br', 'I', 'other']  # 10-dim
    degree = [0, 1, 2, 3, 4, 5, 6]  # 7-dim
    hybridizationType = [Chem.rdchem.HybridizationType.SP,
                              Chem.rdchem.HybridizationType.SP2,
                              Chem.rdchem.HybridizationType.SP3,
                              Chem.rdchem.HybridizationType.SP3D,
                              Chem.rdchem.HybridizationType.SP3D2,
                              'other']   # 6-dim
    results = one_of_k_encoding_unk(atom.GetSymbol(),symbol) + \
                  one_of_k_encoding(atom.GetDegree(),degree) + \
                  [atom.GetFormalCharge(), atom.GetNumRadicalElectrons()] + \
                  one_of_k_encoding_unk(atom.GetHybridization(), hybridizationType) + [atom.GetIsAromatic()]  # 10+7+2+6+1=26

    # In case of explicit hydrogen(QM8, QM9), avoid calling `GetTotalNumHs`
    if not explicit_H:
        results = results + one_of_k_encoding_unk(atom.GetTotalNumHs(),
                                                      [0, 1, 2, 3, 4])   # 26+5=31
    if use_chirality:
        try:
            results = results + one_of_k_encoding_unk(
                    atom.GetProp('_CIPCode'),
                    ['R', 'S']) + [atom.HasProp('_ChiralityPossible')]
        except:
            results = results + [False, False] + [atom.HasProp('_ChiralityPossible')]  # 31+3 =34
    return results


def adjacent_matrix(mol):
    adjacency = Chem.GetAdjacencyMatrix(mol)
    return np.array(adjacency,dtype=np.float32)


def mol_features(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
    except:
        raise RuntimeError("SMILES cannot been parsed!")
    #mol = Chem.AddHs(mol)
    atom_feat = np.zeros((mol.GetNumAtoms(), num_atom_feat))
    for atom in mol.GetAtoms():
        atom_feat[atom.GetIdx(), :] = atom_features(atom)
    adj_matrix = adjacent_matrix(mol)
    return atom_feat, adj_matrix

def construct_single_data(data):
    smiles, sequence, interaction = data.strip().split(" ")

    atom_feature, adj = mol_features(smiles)
    protein_embedding = get_protein_embedding(model, seq_to_kmers(sequence))
    label = np.array(interaction,dtype=np.float32)

    atom_feature = torch.FloatTensor(atom_feature)
    adj = torch.FloatTensor(adj)
    protein = torch.FloatTensor(protein_embedding)
    label = torch.FloatTensor(label)

    return atom_feature, adj, protein, label

if __name__ == "__main__":

    from word2vec import seq_to_kmers, get_protein_embedding
    from gensim.models import Word2Vec
    import os
    import torch
    import pickle
    # DATASET = "davis"

    with open('data/input/converted_all_data.csv',"r") as f:          # ../data/kinase_tran.txt
    #with open('data/input/kinase_dropna_clean_400.csv',"r") as f:          # ../data/kinase_tran.txt
        data_list = f.read().strip().split('\n')

    N = len(data_list)

    model = Word2Vec.load("word2vec_30.model")

#    with parallel_backend('threading', n_jobs=joblib.cpu_count()):
#        dataset = Parallel()(delayed(construct_single_data)(data) for no, data in enumerate(tqdm(data_list)))

    compounds, adjacencies,proteins,interactions = [], [], [], []
    for no, data in enumerate(tqdm(data_list)):
        #print('/'.join(map(str, [no + 1, N])))
        smiles, sequence, interaction = data.strip().split(" ")

        try:
            atom_feature, adj = mol_features(smiles)
            protein_embedding = get_protein_embedding(model, seq_to_kmers(sequence))
        except Exception as e:
            print(e)
            continue
        label = np.array(interaction,dtype=np.float32)

        atom_feature = torch.FloatTensor(atom_feature)
        adj = torch.FloatTensor(adj)
        protein = torch.FloatTensor(protein_embedding)
        label = torch.FloatTensor(label)

        compounds.append(atom_feature)
        adjacencies.append(adj)
        proteins.append(protein)
        interactions.append(label)
    dataset = list(zip(compounds, adjacencies, proteins, interactions))
    with open("data/output/kinase_test.txt", "wb") as f:
        joblib.dump(dataset, f)
    # print('The preprocess of ' + DATASET + ' dataset has finished!')