Commit 03ef391a authored by leswing's avatar leswing
Browse files

No Openbabel in pdbbind benchmark

parent 0906d8b0
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ from deepchem.feat.basic import RDKitDescriptors
from deepchem.feat.coulomb_matrices import CoulombMatrix
from deepchem.feat.coulomb_matrices import CoulombMatrixEig
from deepchem.feat.grid_featurizer import GridFeaturizer
from deepchem.feat.rdkit_grid_featurizer import RdkitGridFeaturizer
from deepchem.feat.nnscore_utils import hydrogenate_and_compute_partial_charges
from deepchem.feat.binding_pocket_features import BindingPocketFeaturizer
from deepchem.feat.one_hot import OneHotFeaturizer
+1191 −0

File added.

Preview size limit exceeded, changes collapsed.

+1 −0
Original line number Diff line number Diff line
# coding=utf-8
"""
Contains an abstract base class that supports data transformations.
"""
+108 −0
Original line number Diff line number Diff line
import logging

import os
import numpy as np
import tempfile
from rdkit import Chem
from rdkit.Chem import AllChem
from pdbfixer import PDBFixer
from simtk.openmm.app import PDBFile


class MoleculeLoadException(Exception):

  def __init__(self, *args, **kwargs):
    Exception.__init__(*args, **kwargs)


def get_xyz_from_mol(mol):
  """
  returns an m x 3 np array of 3d coords
  of given rdkit molecule
  """
  xyz = np.zeros((mol.GetNumAtoms(), 3))
  conf = mol.GetConformer()
  for i in range(conf.GetNumAtoms()):
    position = conf.GetAtomPosition(i)
    xyz[i, 0] = position.x
    xyz[i, 1] = position.y
    xyz[i, 2] = position.z
  return (xyz)


def add_hydrogens_f(mol):
  molecule_file = str(tempfile.NamedTemporaryFile().name)
  Chem.MolToPDBFile(mol, molecule_file)
  try:
    fixer = PDBFixer(filename=molecule_file)
    fixer.addMissingHydrogens(7.4)
    PDBFile.writeFile(fixer.topology, fixer.positions, open(molecule_file, 'w'))
  except ValueError as e:
    print(e)
    raise MoleculeLoadException(e)
  return Chem.MolFromPDBFile(str(molecule_file), sanitize=False, removeHs=False)


def compute_charges(mol):
  try:
    AllChem.ComputeGasteigerCharges(mol)
  except Exception as e:
    logging.exception("Unable to compute charges for mol")
    raise MoleculeLoadException(e)
  return mol


def load_molecule(molecule_file, add_hydrogens=True, calc_charges=True):
  """Converts molecule file to (xyz-coords, obmol object)

  Given molecule_file, returns a tuple of xyz coords of molecule
  and an rdkit object representing that molecule
  """
  if ".mol2" in molecule_file or ".sdf" in molecule_file:
    suppl = Chem.SDMolSupplier(str(molecule_file), sanitize=False)
    my_mol = suppl[0]
  elif ".pdbqt" in molecule_file:
    raise MoleculeLoadException("Don't support pdbqt files yet")
  elif ".pdb" in molecule_file:
    my_mol = Chem.MolFromPDBFile(str(molecule_file), sanitize=False, removeHs=False)
  else:
    raise ValueError("Unrecognized file type")

  if my_mol is None:
    raise ValueError("Unable to read non None Molecule Object")

  if calc_charges:
    my_mol = add_hydrogens_f(my_mol)
    compute_charges(my_mol)
  elif add_hydrogens:
    my_mol = add_hydrogens_f(my_mol)
    compute_charges(my_mol)

  xyz = get_xyz_from_mol(my_mol)

  return xyz, my_mol


def write_molecule(mol, outfile):
  if ".pdbqt" in outfile:
    # TODO (LESWING) create writer for pdbqt which includes charges
    writer = Chem.PDBWriter(outfile)
    writer.write(mol)
    writer.close()
    pass
  elif ".pdb" in outfile:
    writer = Chem.PDBWriter(outfile)
    writer.write(mol)
    writer.close()
  else:
    raise ValueError("Unsupported Format")


def pdbqt_to_pdb(filename):
  base_filename = os.path.splitext(filename)[0]
  pdb_filename = base_filename + ".pdb"
  pdbqt_data = open(filename).readlines()
  with open(pdb_filename, 'w') as fout:
    for line in pdbqt_data:
      fout.write("%s\n" % line[:66])
  return pdb_filename
+49 −52
Original line number Diff line number Diff line
@@ -2,40 +2,26 @@
PDBBind dataset loader.
"""

from __future__ import print_function
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging
import multiprocessing
import os
import numpy as np
import pandas as pd
import shutil
import time
import re
from rdkit import Chem
import time
from multiprocessing.pool import Pool

import deepchem as dc
import numpy as np
import pandas as pd
from deepchem.utils.rdkit_util import MoleculeLoadException


def load_pdbbind_labels(labels_file):
  """Loads pdbbind labels as dataframe"""
  # Some complexes have labels but no PDB files. Filter these manually
  missing_pdbs = [
      "1d2v", "1jou", "1s8j", "3f39", "3i3d", "3i3b", "3dyo", "3t0d", "1cam",
      "3vdb", "3f37", "3f38", "4mlt", "3f36", "4o7d", "3t08", "3f34", "3f35",
      "2wik", "4mlx", "2wij", "1px4", "4wkt", "3f33", "2wig", "3muz", "3t2p",
      "3t2q", "4pji", "2adj", "3t09", "3mv0", "1pts", "3vd9", "3axk", "4q1s",
      "3t0b", "4b82", "3vd7", "3hg1", "3vd4", "3vdc", "3b5y", "4oi6", "3axm",
      "4mdm", "2mlm", "3eql", "4ob0", "3wi6", "4fgt", "4pnc", "4mvn", "4lv3",
      "4lz9", "1pyg", "3h1k", "7gpb", "1e8h", "4wku", "2f2h", "1zyr", "1z9j",
      "3b5d", "3b62", "4q3q", "4mdl", "4no6", "4mdg", "3dxj", "4u0x", "4l6q",
      "4q3r", "1h9s", "4ob1", "4ob2", "4qq5", "4nk3", "3k1j", "4m8t", "4mzo",
      "4nnn", "4q3s", "4nnw", "3cf1", "4u5t", "4wkv", "4ool", "3a2c", "4wm9",
      "4pkb", "4qkx", "4no8", "1ztz", "1nu1", "4kn4", "4mao", "4qqc", "4len",
      "4lv1", "4r02", "4r6v", "4fil", "4q2k", "1hpb", "4oon", "4qbb", "4ruu",
      "4no1", "3w8o", "4kn7", "4r17", "4r18", "5hvp", "1e59", "1sqq", "3n75",
      "4kmu", "4mzs", "1sqb", "1lr8", "4lv2", "4wmc", "1sqp", "3whw", "4cpa",
      "3i8w", "4hrd", "4hrc", "1ntk", "1rbo"
  ]
  contents = []
  with open(labels_file) as f:
    for line in f:
@@ -47,9 +33,6 @@ def load_pdbbind_labels(labels_file):
        p = re.compile('\(([^\)\s]*) ([^\)\s]*)\)')
        line = p.sub('(\\1-\\2)', line)
        elts = line.split()
        # Filter if missing PDB files
        if elts[0] in missing_pdbs:
          continue
        contents.append(elts)
  contents_df = pd.DataFrame(
      contents,
@@ -58,15 +41,6 @@ def load_pdbbind_labels(labels_file):
  return contents_df


def compute_pdbbind_features(grid_featurizer, pdb_subdir, pdb_code):
  """Compute features for a given complex"""
  protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
  ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)
  features = grid_featurizer.featurize_complexes([ligand_file], [protein_file])
  features = np.squeeze(features)
  return features


def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
  """Featurizes pdbbind according to provided featurization"""
  tasks = ["-logKd/Ki"]
@@ -94,7 +68,7 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):

  # Define featurizers
  if feat == "grid":
    featurizer = dc.feat.GridFeaturizer(
    featurizer = dc.feat.RdkitGridFeaturizer(
        voxel_width=16.0,
        feature_types="voxel_combined",
        # TODO(rbharath, enf, leswing): Figure out why pi_stack and cation_pi
@@ -116,30 +90,25 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):

  # Featurize Dataset
  features = []
  feature_len = None
  y_inds = []
  missing_pdbs = []
  time1 = time.time()
  p = Pool(multiprocessing.cpu_count())
  args = []
  for ind, pdb_code in enumerate(ids):
    print("Processing complex %d, %s" % (ind, str(pdb_code)))
    pdb_subdir = os.path.join(pdbbind_dir, pdb_code)
    if not os.path.exists(pdb_subdir):
      print("%s is missing!" % pdb_subdir)
      missing_pdbs.append(pdb_subdir)
    args.append((ind, pdb_code, pdbbind_dir, featurizer))
  results = p.map(compute_single_pdbbind_feature, args)
  feature_len = None
  for result in results:
    if result is None:
      continue
    computed_feature = compute_pdbbind_features(featurizer, pdb_subdir,
                                                pdb_code)
    if feature_len is None:
      feature_len = len(computed_feature)
    if len(computed_feature) != feature_len:
      print("Featurization failed for %s!" % pdb_code)
      feature_len = len(result[1])
    if len(result[1]) != feature_len:
      continue
    y_inds.append(ind)
    features.append(computed_feature)
    y_inds.append(result[0])
    features.append(result[1])
  time2 = time.time()
  print("TIMING: PDBBind Featurization took %0.3f s" % (time2 - time1))
  print("missing_pdbs")
  print(missing_pdbs)
  y = y[y_inds]
  X = np.vstack(features)
  w = np.ones_like(y)
@@ -148,6 +117,31 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
  return dataset, tasks


def compute_single_pdbbind_feature(x):
  ind, pdb_code, pdbbind_dir, featurizer = x[0], x[1], x[2], x[3]
  print("Processing complex %d, %s" % (ind, str(pdb_code)))
  pdb_subdir = os.path.join(pdbbind_dir, pdb_code)
  try:
    computed_feature = compute_pdbbind_features(featurizer, pdb_subdir,
                                                pdb_code)
  except MoleculeLoadException as e:
    logging.warning("Unable to compute features for %s" % x)
    return None
  except Exception as e:
    logging.warning("Unable to compute features for %s" % x)
    return None
  return ind, computed_feature

def compute_pdbbind_features(grid_featurizer, pdb_subdir, pdb_code):
  """Compute features for a given complex"""
  protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
  ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)
  features = grid_featurizer.featurize_complexes([ligand_file],
                                                 [protein_file])
  features = np.squeeze(features)
  return features


def load_pdbbind_grid(split="index", featurizer="grid", subset="full"):
  """Load PDBBind datasets. Does not do train/test split"""
  dataset, tasks = featurize_pdbbind(feat=featurizer, subset=subset)
@@ -168,3 +162,6 @@ def load_pdbbind_grid(split="index", featurizer="grid", subset="full"):
    test = transformer.transform(test)

  return tasks, (train, valid, test), transformers

if __name__ == "__main__":
  load_pdbbind_grid()
Loading