Commit b89b0137 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Featurizing

parent cee8c837
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -245,6 +245,10 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
           system_coords, system_neighbor_list, system_z

  def get_Z_matrix(self, mol, max_atoms):
    """Return the atomic numbers of the atoms in `mol`, padded by `pad_array`.

    Parameters
    ----------
    mol: object
      Must expose `GetAtoms()`, whose items expose `GetAtomicNum()`
      (presumably an RDKit molecule -- confirm against callers).
    max_atoms: int
      Forwarded unchanged to `pad_array` as its second argument.

    Returns
    -------
    The result of `pad_array` applied to the 1-D array of atomic numbers.
    """
    # Leftover debugging prints of the atom count were removed: they wrote to
    # stdout on every call and carried no information callers relied on.
    atomic_numbers = np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()])
    return pad_array(atomic_numbers, max_atoms)

+51 −0
Original line number Diff line number Diff line
@@ -6,11 +6,29 @@ import numpy as np
from rdkit import Chem
from rdkit.Chem import rdGeometry, rdMolTransforms
from deepchem.utils.save import log
import multiprocessing

__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "BSD 3-clause"

def get_ligand_filetype(ligand_filename):
  """Returns the filetype of ligand.

  The type is detected from the file's own name (the final path component),
  so directory names that happen to contain an extension-like fragment
  (e.g. ``/data/set.mol2/lig.sdf``) do not influence the result.

  Parameters
  ----------
  ligand_filename: str
    Path or bare filename of the ligand file.

  Returns
  -------
  str
    One of "mol2", "sdf", "pdbqt" or "pdb".

  Raises
  ------
  ValueError
    If none of the known extensions occurs in the filename.
  """
  # Local import keeps this block self-contained without touching the
  # file-level import section.
  import os

  basename = os.path.basename(ligand_filename)
  # Order matters: "pdbqt" must be tried before "pdb" because ".pdb" is a
  # prefix of ".pdbqt".
  for filetype in ("mol2", "sdf", "pdbqt", "pdb"):
    if "." + filetype in basename:
      return filetype
  raise ValueError("Unrecognized_filename")

def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file):
  """Forward one ligand/protein pair to `featurizer._featurize_complex`.

  Presumably kept at module level so it can be handed to a worker pool
  (the commented-out `featurize_complexes` passes it to `apply_async`)
  -- confirm before relying on that.

  Parameters
  ----------
  featurizer: object
    Any object providing a `_featurize_complex(mol, protein)` method.
  mol_pdb_file:
    First argument passed through to the method.
  protein_pdb_file:
    Second argument passed through to the method.

  Returns
  -------
  Whatever `featurizer._featurize_complex` returns.
  """
  featurize_one = featurizer._featurize_complex
  features = featurize_one(mol_pdb_file, protein_pdb_file)
  return features


class ComplexFeaturizer(object):
  """
  Abstract class for calculating features for mol/protein complexes.
@@ -38,6 +56,39 @@ class ComplexFeaturizer(object):
    features = np.asarray(features)
    return features

  #def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
  #  """
  #  Calculate features for mol/protein complexes.

  #  Parameters
  #  ----------
  #  mol_files: list
  #    List of PDB filenames for molecules.
  #  protein_pdbs: list
  #    List of PDB filenames for proteins.
  #  """
  #  pool = multiprocessing.Pool()
  #  results = []
  #  for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
  #    log_message = "Featurizing %d / %d" % (
  #        i, len(mol_files)) if i % log_every_n == 0 else None
  #    #ligand_ext = get_ligand_filetype(mol_file)
  #    #with open(mol_file) as mol_f:
  #    #  mol_lines = mol_f.readlines()
  #    #with open(protein_pdb) as protein_file:
  #    #  protein_pdb_lines = protein_file.readlines()
  #    results.append(
  #        pool.apply_async(
  #            _featurize_complex,
  #            (self, mol_file, protein_pdb)))
  #            #(self, ligand_ext, mol_lines, protein_pdb_lines, log_message)))
  #  pool.close()
  #  features = []
  #  for result in results:
  #    features += result.get()
  #  features = np.asarray(features)
  #  return features

  def _featurize_complex(self, mol_pdb, complex_pdb):
    """
    Calculate features for single mol/protein complex.
+27 −5
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ import pandas as pd
import logging
import tarfile
from deepchem.feat import rdkit_grid_featurizer as rgf
from deepchem.feat.atomic_coordinates import ComplexNeighborListFragmentAtomicCoordinates

logger = logging.getLogger(__name__)

@@ -50,10 +51,6 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
  return deepchem.data.DiskDataset(dataset_dir), tasks


def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
  """Loads and featurizes PDBBind dataset.

  NOTE(review): this definition has no body beyond its docstring, so calling
  it performs no work and returns None regardless of the arguments.
  """


def load_pdbbind_grid(split="random",
                      featurizer="grid",
                      subset="core",
@@ -138,7 +135,19 @@ def load_pdbbind_grid(split="random",


def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
  """Load and featurize raw PDBBind dataset."""
  """Load and featurize raw PDBBind dataset.
  
  Parameters
  ----------
  data_dir: String, optional
    Specifies the data directory to store the featurized dataset.
  split: Str
    Either "random" or "index"
  featurizer: Str
    Either "grid" or "atomic" for grid and atomic featurizations.
  subset: Str
    Only "core" or "refined" for now.
  """
  pdbbind_tasks = ["-logKd/Ki"]
  data_dir = deepchem.utils.get_data_dir()
  if reload:
@@ -208,6 +217,19 @@ def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
        ecfp_power=ecfp_power,
        splif_power=splif_power,
        flatten=True)
  elif featurizer == "atomic":
    # Pulled from PDB files. For larger datasets with more PDBs, would use
    # max num atoms instead of exact.
    frag1_num_atoms = 60  # for ligand atoms
    frag2_num_atoms = 24000  # for protein atoms
    complex_num_atoms = 24060  # in total
    max_num_neighbors = 4
    # Cutoff in angstroms
    neighbor_cutoff = 4
    featurizer = ComplexNeighborListFragmentAtomicCoordinates(
        frag1_num_atoms, frag2_num_atoms, complex_num_atoms, max_num_neighbors,
        neighbor_cutoff)

  else:
    raise ValueError("Featurizer not supported")
  print("Featurizing Complexes")