Unverified commit c2d615e8, authored by jinyuan sun and committed by GitHub
Browse files

Merge pull request #29 from ivandon15/ivan

parents 1b4a80b1 43201583
Loading
Loading
Loading
Loading
+186 −0
Original line number Diff line number Diff line
@@ -10,6 +10,118 @@ def translate_to_protein(self, seq: str, pname=None):
    else:
        return f"The protein sequence of {seq} is `>protein\n{protein_seq}`"


def get_smiles_feature(self, smiles):
    """Return basic physico-chemical descriptors for a SMILES string.

    The descriptors come from ``rdkit.Chem.QED.properties``: molecular
    weight, ALogP, hydrogen-bond acceptor/donor counts and polar surface
    area.

    Args:
        self: Unused; kept so the function matches the tool-call signature.
        smiles: SMILES string describing the molecule.

    Returns:
        A single formatted line with the five descriptors, or the string
        ``"Error: Not a valid SMILES string"`` when the input is not a
        non-blank string or cannot be parsed by RDKit.
    """
    invalid = "Error: Not a valid SMILES string"

    # Cheap validation first: non-string input would make RDKit raise, and a
    # blank string parses to an empty molecule with meaningless descriptors.
    if not isinstance(smiles, str) or not smiles.strip():
        return invalid

    from rdkit import Chem
    from rdkit.Chem.QED import properties

    # MolFromSmiles signals a parse failure by returning None (it does not
    # raise), so the result has to be checked explicitly.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return invalid

    p = properties(mol)
    return (
        f"Molecular Weight: {p.MW:.2f}, "
        f"LOGP: {p.ALOGP:.2f}, "
        f"HBA (Hydrogen Bond Acceptors): {p.HBA}, "
        f"HBD (Hydrogen Bond Donors): {p.HBD}, "
        f"PSA (Polar Surface Area): {p.PSA:.2f}"
    )


def capped(self, smiles):
    """Cap one amino acid with an acetyl (ACE) and an N-methylamide (NME) group.

    The carboxyl -OH of the amino acid is replaced by the nitrogen of a
    methylamine fragment, and the backbone amine nitrogen is replaced by the
    nitrogen of an acetamide fragment.

    NOTE(review): only the first ``N-C-C(=O)-O`` match is capped, so this is
    presumably intended for a single free amino acid, not a peptide --
    confirm before using it on longer chains.

    Args:
        self: Unused; kept so the function matches the tool-call signature.
        smiles: SMILES string of the amino acid.

    Returns:
        A message containing the capped SMILES, or a string starting with
        ``"Error:"`` when the input is not a parsable SMILES string or has
        no amino-acid backbone to cap.
    """
    invalid = "Error: Not a valid SMILES string"

    # Cheap validation first: non-string input would make RDKit raise, and a
    # blank string parses to an empty molecule that cannot be capped.
    if not isinstance(smiles, str) or not smiles.strip():
        return invalid

    from rdkit import Chem

    def update_idx(removed, current):
        """Re-base ``current`` after the atom at index ``removed`` is deleted."""
        return current - 1 if current > removed else current

    # MolFromSmiles returns None on a parse failure instead of raising.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return invalid

    acetyl_smiles = "CC(=O)N"
    methyl_amide_smiles = "NC"
    ace_mol = Chem.MolFromSmiles(acetyl_smiles)
    methyl_mol = Chem.MolFromSmiles(methyl_amide_smiles)

    # One disconnected molecule: amino acid atoms first, then the acetamide
    # fragment, then the methylamine fragment.
    combined_mol = Chem.CombineMols(mol, ace_mol)
    combined_mol = Chem.CombineMols(combined_mol, methyl_mol)

    backbone = Chem.MolFromSmarts("NC[C:1](=[O:2])-[OD1]")
    cp_nh2 = Chem.MolFromSmarts(methyl_amide_smiles)
    cp_cooh = Chem.MolFromSmarts("[C:1](=[O:2])-N")

    backbone_hits = combined_mol.GetSubstructMatches(backbone)
    if not backbone_hits:
        return "Error: No amino acid backbone (N-C-C(=O)-O) found in the SMILES"

    first_hit = backbone_hits[0]
    oh_idx = first_hit[-1]       # carboxyl oxygen that will be removed
    carbonyl_idx = first_hit[-3]  # carboxyl carbon that receives the NME cap
    amine_idx = first_hit[0]     # backbone nitrogen replaced by the ACE cap
    # The cap fragments were appended last, so the last match of each
    # pattern is taken to be the one inside the corresponding cap.
    nme_n_idx = combined_mol.GetSubstructMatches(cp_nh2)[-1][0]
    ace_n_idx = combined_mol.GetSubstructMatches(cp_cooh)[-1][-1]

    capped_mol = Chem.EditableMol(combined_mol)

    # Step 1: drop the carboxyl oxygen and attach the methylamine nitrogen.
    # Removing an atom shifts every higher index down by one.
    capped_mol.RemoveAtom(oh_idx)
    carbonyl_idx = update_idx(oh_idx, carbonyl_idx)
    amine_idx = update_idx(oh_idx, amine_idx)
    nme_n_idx = update_idx(oh_idx, nme_n_idx)
    ace_n_idx = update_idx(oh_idx, ace_n_idx)
    capped_mol.AddBond(carbonyl_idx, nme_n_idx, order=Chem.rdchem.BondType.SINGLE)

    # Step 2: record the neighbours of the backbone nitrogen, already
    # expressed in the numbering that will be valid once it is removed.
    snapshot = capped_mol.GetMol()
    connected = []
    for bond in snapshot.GetBonds():
        begin = bond.GetBeginAtom().GetIdx()
        end = bond.GetEndAtom().GetIdx()
        if begin == amine_idx:
            connected.append(update_idx(amine_idx, end))
        elif end == amine_idx:
            connected.append(update_idx(amine_idx, begin))

    # Step 3: swap the backbone nitrogen for the acetamide nitrogen.
    capped_mol.RemoveAtom(amine_idx)
    ace_n_idx = update_idx(amine_idx, ace_n_idx)
    for conn in connected:
        capped_mol.AddBond(conn, ace_n_idx, order=Chem.rdchem.BondType.SINGLE)

    capped_smi = Chem.MolToSmiles(capped_mol.GetMol())
    return f"After capping (adding ace and nme), the smiles is `{capped_smi}`"


def smiles_similarity(self, smiles1, smiles2, types="ECFP"):
    """Compute the Tanimoto similarity between two molecules.

    Supported fingerprint ``types``:

    * ``"ECFP"`` -- Morgan fingerprint, radius 2 (the default).
    * ``"FCFP"`` -- Morgan fingerprint, radius 2, feature invariants and
      chirality enabled.
    * ``"RDK"``  -- RDKit topological fingerprint.
    * ``"MACC"`` -- MACCS keys.

    Args:
        self: Unused; kept so the function matches the tool-call signature.
        smiles1: SMILES string of the first molecule.
        smiles2: SMILES string of the second molecule.
        types: Fingerprint name; ``None`` or an empty string selects ECFP.

    Returns:
        A message with the similarity rounded to two decimals, or a string
        starting with ``"Error:"`` when either SMILES is invalid or the
        fingerprint type is not supported.
    """
    invalid = "Error: Not a valid SMILES string"

    # Cheap validation first: non-string input would make RDKit raise, and a
    # blank string parses to an empty molecule with no meaningful similarity.
    for candidate in (smiles1, smiles2):
        if not isinstance(candidate, str) or not candidate.strip():
            return invalid

    if not types:
        types = "ECFP"
    if types not in ("ECFP", "FCFP", "RDK", "MACC"):
        # Falling back silently would report a similarity under the wrong
        # fingerprint name, so reject unknown names explicitly.
        return (
            f"Error: Unsupported fingerprint type `{types}`, "
            "choose one of ECFP, FCFP, RDK, MACC"
        )

    from rdkit import Chem
    from rdkit.Chem import AllChem
    from rdkit.Chem import MACCSkeys
    from rdkit.DataStructs import TanimotoSimilarity

    def get_fingerprint(molecule):
        """Build only the fingerprint selected by ``types``."""
        if types == "FCFP":
            return AllChem.GetMorganFingerprint(
                molecule, 2,
                useFeatures=True,
                useChirality=True,
            )
        if types == "RDK":
            return AllChem.RDKFingerprint(molecule)
        if types == "MACC":
            return MACCSkeys.GenMACCSKeys(molecule)
        return AllChem.GetMorganFingerprint(molecule, 2)

    # MolFromSmiles returns None on a parse failure instead of raising; the
    # error must be returned here rather than passed on as a "fingerprint".
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)
    if mol1 is None or mol2 is None:
        return invalid

    similarity = TanimotoSimilarity(get_fingerprint(mol1), get_fingerprint(mol2))

    return f"Using {types} fingerprint and Tanimoto, the result is {similarity:.2f}"


function_descriptions = [{  # This is the description of the function
    "type": "function",
    "function": {
@@ -23,6 +135,56 @@ function_descriptions = [{ # This is the description of the function
        },
        "required": ["seq"],
    },
},
{
    "type": "function",
    "function": {
        "name": "get_smiles_feature",
        "description": "Input a smiles, and will return molecule weight, logp, \
                        HBA(Hydrogen Bond Acceptors), HBD(Hydrogen Bond Donors) \
                        and PSA(Polar Surface Area) values",
        "parameters": {
            "type": "object",
            "properties": {
                "smiles": {"type": "string", "description": "The smiles sequence"},
            },
        },
        "required": ["smiles"],
    }
},
{
    "type": "function",
    "function": {
        "name": "capped",
        "description": "Input a smiles, and will return a capped smiles, which \
                        means adding ACE and NME to the smiles to prevent or \
                        block unwanted reactions.",
        "parameters": {
            "type": "object",
            "properties": {
                "smiles": {"type": "string", "description": "The smiles sequence"},
            },
        },
        "required": ["smiles"],
    }
},
{
    "type": "function",
    "function": {
        "name": "smiles_similarity",
        "description": "Input two smiles and a fingerprint method (if not provided, \
                        ECFP - the default morgan fingerprint will be used) and \
                        return the TanimotoSimilarity",
        "parameters": {
            "type": "object",
            "properties": {
                "smiles1": {"type": "string", "description": "The smiles sequence"},
                "smiles2": {"type": "string", "description": "The smiles sequence"},
                "types": {"type": "string", "description": "The fingerprint method"},
            },
        },
        "required": ["smiles1", "smiles2"],
    }
}]

test_data = {
@@ -32,6 +194,30 @@ test_data = {
            "seq": "ATGCGAATTTGGGCCC",
        },
        "output": "The protein sequence of ATGCGAATTTGGGCCC is `>protein\nMRFL`",
    },
    "get_smiles_feature": {
        "input": {
            "self": None,
            "smiles": "C[C@H](N)C(=O)N[C@@H](CS)C(=O)N[C@@H](CS)C(=O)O",
        },
        "output": "Molecular Weight: 295.39, LOGP: -1.75, HBA (Hydrogen Bond Acceptors): \
                   5, HBD (Hydrogen Bond Donors): 6, PSA (Polar Surface Area): 121.52",
    },
    "capped": {
        "input": {
            "self": None,
            "smiles": "N[C@@H](CS)C(=O)O",
        },
        "output": "After capping (adding ace and nme), the smiles is \
                   `CNC(=O)[C@H](CS)NC(C)=O`",
    },
    "smiles_similarity": {
        "input": {
            "self": None,
            "smiles1": "N[C@@H](CS)C(=O)O",
            "smiles2": "N[C@@H](CS)C(=O)O",
        },
        "output": "Using ECFP fingerprint and Tanimoto, the result is 1.00",
    }
}