Commit ca6d9e6f authored by DavidRFB's avatar DavidRFB
Browse files

New comments

parent 0cfe727a
Loading
Loading
Loading
Loading
+0 −1489

File deleted.

Preview size limit exceeded, changes collapsed.

+305 −0
Original line number Diff line number Diff line
@@ -5,77 +5,99 @@ get_ipython().getoutput("/root/miniconda/bin/conda info -e")


get_ipython().getoutput("pip install --pre deepchem")


get_ipython().getoutput("pip install propy3 ")


#imports 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt


data =pd.read_csv("./all_measurementsProtherDB.tsv",delimiter="\t")

import deepchem as dc 
import os 
from deepchem.utils import download_url
data_dir = dc.utils.get_data_dir()
download_url("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/pucci-proteins-appendixtable1.csv",dest_dir=data_dir)
print('Dataset Dowloaded at {}'.format(data_dir))
dataset_file = os.path.join(data_dir, "pucci-proteins-appendixtable1.csv")

data.shape

import pandas as pd 
data = pd.read_csv(dataset_file)
data

df = data[ (data['MEASURE'] == 'DSC') & (data['MUTATION'] == 'wild-type')]

WT_Tm = data[['PDBid','Tmexp [wt]']]
WT_Tm.set_index('PDBid',inplace=True)

df.shape

WT_Tm

no_mutants_dataset = df.drop_duplicates(['PDB_wild'])

dict_WT_TM = {}
for k,v in WT_Tm.itertuples():
    if(k not in dict_WT_TM):
        dict_WT_TM[k]=float(v)

no_mutants_dataset.shape

pdbs = data[data['PDBid'].str.len()<5]
pdbs = pdbs[pdbs['Chain'] == "A"]

df2 = data[ (data['MEASURE'] == 'DSC')]

pdbs[['RESN','RESwt','RESmut']]

df2.shape

alls=[]
for resnum,wt in pdbs[['RESN','RESwt','RESmut','PDBid','ΔTmexp']].iteritems():
    alls.append(wt.values)
d = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
     'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N', 
     'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W', 
     'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'}
resnum=alls[0]
wt=[d[x.strip()] for x in alls[1]] # extract the Wildtype aminoacid with one letter code 
mut=[d[x.strip()] for x in alls[2]] # extract the Mutation aminoacid with one letter code 
codes=alls[3] # PDB code 
tms=alls[4] # Melting temperature

no_wild_type_repeated  =df2.drop_duplicates(['MUTATION','PDB_wild'],keep='first')

print("pdbid {}, WT-AA {}, Resnum {}, MUT-AA {},DeltaTm {}".format(codes[0],wt[0],resnum[0],mut[0],tms[0]))

no_wild_type_repeated.sort_values('UniProt_ID')[['UniProt_ID','PDB_wild','PROTEIN']]


Mutation_list = no_wild_type_repeated[['UniProt_ID','MUTATION']].set_index('UniProt_ID')
from pdbfixer import PDBFixer
from simtk.openmm.app import PDBFile


Mutation_list
get_ipython().getoutput("mkdir PDBs")


import requests, sys
def Seq_From_AccNum(num):
    '''
    perform a http  request to the EBI-API to obtain the sequence based on the Uniprot Accession number
    return a string 
import os 
import time
t0 = time.time()

    '''
    requestURL = "https://www.ebi.ac.uk/proteins/api/features/{}".format(num)
    r = requests.get(requestURL, headers={ "Accept" : "application/json"})
    if not r.ok:
        print("Failure in Uniprot request")
        return None
        r.raise_for_status()
        sys.exit()
    responseBody = r.json()
    return responseBody['sequence']
downloaded = os.listdir("PDBs")
PDBs_ids= set(pdbs['PDBid'])
pdb_list = []
print("Start Download ")
for pdbid in PDBs_ids:
    name=pdbid+".pdb"
    if(name in downloaded):
        continue
    try:
        fixer = PDBFixer(pdbid=pdbid)
        fixer.findMissingResidues()
        fixer.findNonstandardResidues()
        fixer.replaceNonstandardResidues()
        fixer.removeHeterogens(True)
        fixer.findMissingAtoms()
        fixer.addMissingAtoms()
        PDBFile.writeFile(fixer.topology, fixer.positions, open('./PDBs/get_ipython().run_line_magic("s.pdb'", " % (pdbid), 'w'),keepIds=True)")
    except:
        print("Problem with {}".format(pdbid))
print("Total Time {}".format(time.time()-t0))


import re
def MutateSeq(seq,Mutant):
    '''
    mutate a sequence based on a string (Mutant) that has the notation : 
    Mutate a sequence based on a string (Mutant) that has the notation : 
    A###B where A is the wildtype aminoacid ### the position and B the mutation
    
    '''
    aalist = re.findall('([A-Z])([0-9]+)([A-Z])', Mutant)
    
@@ -95,122 +117,89 @@ def MutateSeq(seq,Mutant):
            listseq[pos]=MutAA
            
        else:
            print("WildType AA does not match")
            #print("WildType AA does not match")
            return None
    return("".join(listseq))


from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Polypeptide import PPBuilder
ppb=PPBuilder()
def GetSeqFromPDB(pdbid):
    structure = PDBParser().get_structure(pdbid.split(".")[0], 'PDBs/{}'.format(pdbid))
    seqs=[]
    return ppb.build_peptides(structure)
    
Mutation_list.sort_values('UniProt_ID',inplace=True)


Mutation_list
import warnings; warnings.simplefilter('ignore')
test="1ezm"
print(test)
seq = GetSeqFromPDB("{}.pdb".format(test))[0].get_sequence()
print("Original Sequence")
print(seq)


import time 
t0 = time.time()
Sequences = {}
fail  = {}
AccNum = []
count = 0 
print("Perfoming data curation : ")

for accnumber, row in Mutation_list.iterrows():
    #if(count == 100):
    #    break
    #count += 1
    mutation = row['MUTATION'].split("(")[0].strip()
    print("{} - {}".format(accnumber,mutation), end =" ")
    name ="{}-{}".format(accnumber,mutation)
    if(accnumber=='-'):
        fail[accnumber] = [mutation]
        continue
    if accnumber not in AccNum:
        AccNum.append(accnumber)
        seq = Seq_From_AccNum(accnumber)
        if(seq == None):
            continue
    if(mutation =='wild-type'):
        name ="{}-{}".format(accnumber,"WT")
        Sequences[name]=seq
    else:
        mutseq = MutateSeq(seq,mutation)
        if(mutseq==None):
            if(accnumber not in fail ):
                fail[accnumber] = [mutation]
            else:
                fail[accnumber].append(mutation)
        Sequences[name] = mutseq

print("Total time analyzing all the dataFrame {} s".format(time.time() - t0))
informSeq=GetSeqFromPDB(test+".pdb")[0].__repr__()
print("Seq information",informSeq)
start = re.findall('[0-9]+',informSeq)[0]
print("Reported Mutation {}{}{}".format("R",179,"A"))
numf =179 - int(start) + 1  # fix some cases of negative aminoacid numbers


fail
mutfinal = "R{}A".format(numf)
print("Real Mutation = ",mutfinal)
mutseq = MutateSeq(seq,mutfinal)
print(mutseq)


temp_accnum = no_wild_type_repeated[['UniProt_ID','MUTATION','Tm_(C)','PDB_wild']]
arr_acc_temp = temp_accnum.to_numpy()

temp_dic={}

for i in arr_acc_temp:
    acc = i[0]
    if(i[1] == 'wild-type'):
        mut ='WT'
information = {}
count = 1
failures=[]
for code,tm,numr,wt_val,mut_val in zip(codes,tms,resnum,wt,mut):
    count += 1
    seq = GetSeqFromPDB("{}.pdb".format(code))[0].get_sequence()
    mutfinal="WT"
    if("{}-{}".format(code,mutfinal) not in information):
        informSeq=GetSeqFromPDB(code+".pdb")[0].__repr__()
        start = re.findall('[-0-9]+',informSeq)[0]
    if(int(start)<0):
        numf =numr - int(start) # if start is negative 0 is not used as resnumber
    else:
        mut = i[1].split('(')[0].strip()

    name="{}-{}".format(acc,mut)
    temp_dic[name]=[i[-2],i[-1].upper()]
    
with open('sequences_protherm.csv','w') as file:
    file.write("AccNumber,T_m,PDBID,Sequence \n")
    for k,v in Sequences.items():
        if(v==None):
        numf =numr - int(start) + 1 
    mutfinal = "{}{}{}".format(wt_val,numf,mut_val)
    mutseq = MutateSeq(seq,mutfinal)
    if(mutseq==None):
        failures.append((code,mutfinal))
        continue
        temp = temp_dic[k][0]
        code_pdb = temp_dic[k][1]
    information["{}-{}".format(code,mutfinal)]=[mutseq,dict_WT_TM[code]-float(tm)]


        text = "{},{},{},{} \n".format(k,temp,code_pdb,v)
        file.write(text)


import deepchem as dc
from rdkit import Chem
import pandas as pd 


Final_Data  = pd.read_csv("sequences_protherm.csv")


seq_list = list(Final_Data['Sequence '])


codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
         'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
seq_list=[]
deltaTm=[]
for i in information.values():
    seq_list.append(i[0])
    deltaTm.append(i[1])


max_seq= 0 
for i in seq_list:
    if(len(i)>max_seq):
        max_seq=len(i)
max_seq


codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
         'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
OneHotFeaturizer = dc.feat.OneHotFeaturizer(codes,max_length=max_seq)


features = OneHotFeaturizer.featurize(seq_list)


temp = [float(x.split("(")[0]) for x in list(Final_Data['T_m'])]


dc_dataset = dc.data.NumpyDataset(X=features,y=temp)


dc_dataset.X.shape
dc_dataset = dc.data.NumpyDataset(X=features,y=deltaTm)


from deepchem import splits
@@ -218,17 +207,14 @@ splitter = splits.RandomSplitter()
train, test  = splitter.train_test_split(dc_dataset,seed=42)


dc_dataset.X.shape[1:]


from tensorflow import keras
from tensorflow.keras import layers
model = keras.Sequential([
    layers.Dense(units=32, activation='relu', input_shape=dc_dataset.X.shape[1:]),
    layers.Dropout(0.2),
    layers.Dense(units=32, activation='relu'), 
    layers.Dropout(0.2),
    layers.Dense(units=32, activation='relu'), 
    layers.Dropout(0.4),
    layers.Dense(units=64, activation='relu'), 
    layers.Dropout(0.5),
    layers.Dense(units=128, activation='relu'), 
    layers.Dropout(0.2),
    layers.Dense(units=1),
])
@@ -240,7 +226,7 @@ history = model.fit(
    train.X, train.y,
    validation_data=(test.X,test.y),
    batch_size=24,
    epochs=10,
    epochs=30,
)

## perform a plot of loss vs epochs 
@@ -249,30 +235,28 @@ history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot()


dc_model = dc.models.KerasModel(model,dc.metrics.mae_score)


from deepchem import splits
splitter = splits.RandomSplitter()
train, test  = splitter.train_test_split(dc_dataset,seed=42)

from tensorflow import keras
from tensorflow.keras import layers
model = keras.Sequential([
    layers.Dense(units=32, activation='relu', input_shape=dc_dataset.X.shape[1:]),
    layers.Dropout(0.2),
    layers.Dense(units=32, activation='relu'), 
    layers.Dropout(0.2),
    layers.Dense(units=32, activation='relu'), 
    layers.Dropout(0.2),
    layers.Dense(units=1),
])

train.X.shape
dcmodel = dc.models.KerasModel(model,loss=dc.metrics.mae_score)


from sklearn.ensemble import RandomForestRegressor
from deepchem.utils.evaluate import Evaluator
import pandas as pd
seed = 42 # Set a random seed to get stable results
sklearn_model = RandomForestRegressor(n_estimators=100, max_features='sqrt')
sklearn_model.random_state = seed
model = dc.models.SklearnModel(sklearn_model)
model.fit(train)
dcmodel.fit(dc_dataset)


from propy import PyPro


import numpy as np 
aaComplist = []
CTDList =[]
for seq in seq_list:
@@ -281,8 +265,8 @@ for seq in seq_list:
    CTDList.append(np.array(list(Obj.GetCTD().values())))


dc_dataset_aacomp = dc.data.NumpyDataset(X=aaComplist,y= temp)
dc_dataset_ctd = dc.data.NumpyDataset(X=CTDList,y= temp)
dc_dataset_aacomp = dc.data.NumpyDataset(X=aaComplist,y=deltaTm)
dc_dataset_ctd = dc.data.NumpyDataset(X=CTDList,y=deltaTm)


from deepchem import splits
@@ -302,36 +286,9 @@ train_score = model.evaluate(train, [metric])
test_score = model.evaluate(test, [metric])
print("Train score is : {}".format(train_score))
print("Test score is : {}".format(test_score))
print("SupportVectorMachineRegressor")
from sklearn.svm import SVR
svr_sklearn = SVR(kernel="poly",degree=4)
svr_sklearn.random_state = seed 
model = dc.models.SklearnModel(svr_sklearn)
model.fit(train)
metric = dc.metrics.Metric(dc.metrics.mae_score)
train_score = model.evaluate(train, [metric])
test_score = model.evaluate(test, [metric])
print("Train score is : {}".format(train_score))
print("Test score is : {}".format(test_score))


from deepchem import splits
splitter = splits.RandomSplitter()
train, test  = splitter.train_test_split(dc_dataset_ctd,seed=42)
from sklearn.ensemble import RandomForestRegressor
from deepchem.utils.evaluate import Evaluator
import pandas as pd
print("RandomForestRegressor")
seed = 42 # Set a random seed to get stable results
sklearn_model = RandomForestRegressor(n_estimators=100, max_features='sqrt')
sklearn_model.random_state = seed
model = dc.models.SklearnModel(sklearn_model)
model.fit(train)
metric = dc.metrics.Metric(dc.metrics.mae_score)
train_score = model.evaluate(train, [metric])
test_score = model.evaluate(test, [metric])
print("Train score is : {}".format(train_score))
print("Test score is : {}".format(test_score))

print("SupportVectorMachineRegressor")
from sklearn.svm import SVR
svr_sklearn = SVR(kernel="poly",degree=4)
@@ -343,3 +300,6 @@ train_score = model.evaluate(train, [metric])
test_score = model.evaluate(test, [metric])
print("Train score is : {}".format(train_score))
print("Test score is : {}".format(test_score))


+180 −1192

File changed.

Preview size limit exceeded, changes collapsed.