Unverified Commit ce4860a5 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1360 from rbharath/pddbind_load

RDKitGridFeaturizer Loader for PDBBind
parents ab3899bf 4b65b2c1
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -16,7 +16,7 @@ from deepchem.molnet.load_function.lipo_datasets import load_lipo
from deepchem.molnet.load_function.muv_datasets import load_muv
from deepchem.molnet.load_function.nci_datasets import load_nci
from deepchem.molnet.load_function.pcba_datasets import load_pcba, load_pcba_146, load_pcba_2475
from deepchem.molnet.load_function.pdbbind_datasets import load_pdbbind_grid
from deepchem.molnet.load_function.pdbbind_datasets import load_pdbbind_grid, load_pdbbind
from deepchem.molnet.load_function.ppb_datasets import load_ppb
from deepchem.molnet.load_function.qm7_datasets import load_qm7
from deepchem.molnet.load_function.qm7_datasets import load_qm7_from_mat, load_qm7b_from_mat
+103 −0
Original line number Diff line number Diff line
@@ -14,6 +14,11 @@ import time
import deepchem
import numpy as np
import pandas as pd
import logging
import tarfile
from deepchem.feat import rdkit_grid_featurizer as rgf

logger = logging.getLogger(__name__)


def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
@@ -45,6 +50,10 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
  return deepchem.data.DiskDataset(dataset_dir), tasks


def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
  """Placeholder PDBBind loader: ignores its arguments and returns None.

  NOTE(review): another ``load_pdbbind`` with the same signature appears
  further down in this file; presumably that later definition is the one
  callers end up with -- confirm and consider dropping this stub.
  """
  return None


def load_pdbbind_grid(split="random",
                      featurizer="grid",
                      subset="core",
@@ -126,3 +135,97 @@ def load_pdbbind_grid(split="random",
                                               transformers)

    return tasks, (train, valid, test), transformers


def _read_pdbbind_codes(index_file):
  """Return the PDB codes listed in a PDBBind name-index file, in file order.

  Comment lines (starting with "#") are skipped. A line contributes a code
  only when its first space-separated field is exactly four characters long.
  """
  codes = []
  with open(index_file, "r") as handle:
    for line in handle:
      if line.startswith("#"):
        continue
      code = line.split(" ")[0]
      if len(code) == 4:
        codes.append(code)
  return codes


def _read_pdbbind_labels(labels_file):
  """Map each PDB code in a PDBBind data-index file to its -logKd/Ki value.

  Data lines have the format
  PDB code, resolution, release year, -logKd/Ki, Kd/Ki, reference, ligand name.
  Comment lines and lines with fewer than four fields are skipped.
  """
  label_by_pdb = {}
  with open(labels_file, "r") as handle:
    for line in handle:
      if line.startswith("#"):
        continue
      fields = line.split()
      if len(fields) < 4:
        # Blank or truncated line: nothing to parse.
        continue
      # Fourth column is the base-10 logarithm, -log Kd/Ki; store it as a
      # number so the dataset labels are numeric rather than strings.
      label_by_pdb[fields[0]] = float(fields[3])
  return label_by_pdb


def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
  """Load and featurize raw PDBBind dataset.

  Downloads and untars the PDBBind v2015 archive if needed, featurizes each
  protein-ligand complex of the requested subset and splits the result into
  train/valid/test datasets.

  Parameters
  ----------
  featurizer: str
    Name of the featurizer. Only "grid" is supported.
  split: str
    Name of the splitter, either "index" or "random".
  subset: str
    PDBBind subset to load, either "core" or "refined".
  reload: bool
    If True, try to load a previously saved dataset from disk first and
    save the freshly featurized dataset to disk afterwards.

  Returns
  -------
  tasks: list of str
    The task names, ``["-logKd/Ki"]``.
  all_dataset: tuple
    The ``(train, valid, test)`` datasets.
  transformers: list
    Transformers applied to the data (always empty when featurizing here).

  Raises
  ------
  ValueError
    If `subset` or `featurizer` is unsupported, or if a complex listed in
    the name index has no entry in the data index.
  KeyError
    If `split` is not a supported splitter name.
  """
  # Reject unsupported arguments before any download or featurization so a
  # typo does not cost a 2GB download or a full featurization run.
  # NOTE(review): the refined index files carry a ".2013" suffix although the
  # archive is v2015 -- confirm these names against the untarred folder.
  index_names = {
      "core": ("INDEX_core_name.2013", "INDEX_core_data.2013"),
      "refined": ("INDEX_refined_name.2013", "INDEX_refined_data.2013"),
  }
  if subset not in index_names:
    raise ValueError("Other subsets not supported")
  if featurizer != "grid":
    raise ValueError("Featurizer not supported")
  if split not in ("index", "random"):
    raise KeyError(split)

  pdbbind_tasks = ["-logKd/Ki"]
  data_dir = deepchem.utils.get_data_dir()
  # The cache location includes the subset so that "core" and "refined"
  # datasets do not overwrite (or get mistaken for) each other.
  save_dir = os.path.join(data_dir, "pdbbind", subset, featurizer, str(split))
  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return pdbbind_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir, "pdbbind_v2015.tar.gz")
  data_folder = os.path.join(data_dir, "v2015")
  if not os.path.exists(dataset_file):
    logger.warning("About to download PDBBind full dataset. Large file, 2GB")
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/' +
        "pdbbind_v2015.tar.gz")
  if os.path.exists(data_folder):
    logger.info("Data directory for %s already exists", subset)
  else:
    logger.info("Untarring full dataset")
    deepchem.utils.untargz_file(dataset_file, dest_dir=data_dir)

  name_index, data_index = index_names[subset]
  index_file = os.path.join(data_folder, name_index)
  labels_file = os.path.join(data_folder, data_index)

  # Extract locations of data
  pdbs = _read_pdbbind_codes(index_file)
  protein_files = [
      os.path.join(data_folder, pdb, "%s_protein.pdb" % pdb) for pdb in pdbs
  ]
  ligand_files = [
      os.path.join(data_folder, pdb, "%s_ligand.sdf" % pdb) for pdb in pdbs
  ]

  # Extract labels. They are looked up by PDB code instead of being paired
  # with the complexes by position, because the name index and the data
  # index are separate files that need not list entries in the same order.
  label_by_pdb = _read_pdbbind_labels(labels_file)
  missing = [pdb for pdb in pdbs if pdb not in label_by_pdb]
  if missing:
    raise ValueError("No label found in %s for PDB codes: %s" %
                     (labels_file, ", ".join(missing)))
  labels = [label_by_pdb[pdb] for pdb in pdbs]

  # Featurize Data
  # TODO: This is not the correct setting. Set hyperparameters correctly
  ecfp_power = 5
  splif_power = 5
  grid_featurizer = rgf.RdkitGridFeaturizer(
      voxel_width=16.0,
      feature_types=['ecfp', 'splif', 'hbond', 'salt_bridge'],
      ecfp_power=ecfp_power,
      splif_power=splif_power,
      flatten=True)
  logger.info("Featurizing Complexes")
  features = grid_featurizer.featurize_complexes(
      ligand_files, protein_files, log_every_n=1)
  dataset = deepchem.data.DiskDataset.from_numpy(features, labels)
  # No transformations of data
  transformers = []
  # TODO(rbharath): This should be modified to contain a cluster split so
  # structures of the same protein aren't in both train/test
  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  all_dataset = (train, valid, test)
  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
  return pdbbind_tasks, all_dataset, transformers