Commit b047a00b authored by Bharath Ramsundar
Browse files

Merge branch 'pddbind_load' of https://github.com/rbharath/deepchem into pddbind_load

parents 62948177 056e7745
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -16,7 +16,7 @@ from deepchem.molnet.load_function.lipo_datasets import load_lipo
from deepchem.molnet.load_function.muv_datasets import load_muv
from deepchem.molnet.load_function.nci_datasets import load_nci
from deepchem.molnet.load_function.pcba_datasets import load_pcba, load_pcba_146, load_pcba_2475
from deepchem.molnet.load_function.pdbbind_datasets import load_pdbbind_grid
from deepchem.molnet.load_function.pdbbind_datasets import load_pdbbind_grid, extract_pdbbind
from deepchem.molnet.load_function.ppb_datasets import load_ppb
from deepchem.molnet.load_function.qm7_datasets import load_qm7
from deepchem.molnet.load_function.qm7_datasets import load_qm7_from_mat, load_qm7b_from_mat
+93 −1
Original line number Diff line number Diff line
@@ -14,6 +14,10 @@ import time
import deepchem
import numpy as np
import pandas as pd
import logging
import tarfile

logger = logging.getLogger(__name__)


def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
@@ -49,7 +53,7 @@ def load_pdbbind_grid(split="random",
                      featurizer="grid",
                      subset="core",
                      reload=True):
  """Load PDBBind datasets. Does not do train/test split"""
  """Load PDBBind datasets."""
  if featurizer == 'grid':
    dataset, tasks = featurize_pdbbind(feat=featurizer, subset=subset)

@@ -126,3 +130,91 @@ def load_pdbbind_grid(split="random",
                                               transformers)

    return tasks, (train, valid, test), transformers


def extract_pdbbind(featurizer, split="random", subset="core", reload=True):
  """Extract raw PDBBind v2015 protein/ligand files for one subset.

  Downloads the ``pdbbind_v2015.tar.gz`` archive into the DeepChem data
  directory if it is not already there, reads the subset's index file to
  collect the PDB codes, and copies every ``<pdb>_protein.pdb`` and
  ``<pdb>_ligand.sdf`` member into ``<data_dir>/pdbbind_<subset>``.

  Parameters
  ----------
  featurizer:
    Currently unused by this function; kept for interface compatibility.
  split: str, optional
    Currently unused by this function.
  subset: str, optional
    Either "core" or "refined".
  reload: bool, optional
    Currently unused by this function.

  Returns
  -------
  list of str
    The four-character PDB codes listed in the subset's index file, in
    file order.

  Raises
  ------
  ValueError
    If ``subset`` is neither "core" nor "refined".
  """
  # TODO(rbharath): This should contain a cluster split so structures of the
  # same protein aren't in both train/test
  index_members = {
      "core": "v2015/INDEX_core_name.2013",
      "refined": "v2015/INDEX_refined_name.2015",
  }
  if subset not in index_members:
    raise ValueError("Unsupported subset %s." % subset)

  data_dir = deepchem.utils.get_data_dir()
  dataset_file = os.path.join(data_dir, "pdbbind_v2015.tar.gz")
  data_folder = os.path.join(data_dir, "pdbbind_" + subset)
  if os.path.exists(data_folder):
    logger.info("Data directory for %s already exists", subset)
  if not os.path.exists(dataset_file):
    logger.warning("About to download PDBBind full dataset. Large file, 2GB")
    # NOTE(review): no destination is passed, so this relies on download_url
    # defaulting to the same data directory as dataset_file -- confirm.
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/' +
        "pdbbind_v2015.tar.gz")

  # The context manager guarantees the archive handle is closed even if a
  # member is missing or a write fails.
  with tarfile.open(dataset_file, "r:gz") as tar:
    pdbs = []
    for raw_line in tar.extractfile(index_members[subset]).readlines():
      # Tar members are read as bytes; decode before parsing. Calling str()
      # on bytes yields "b'1abc ..." on Python 3, which breaks the length
      # filter below.
      fields = raw_line.decode("utf-8").split()
      if not fields or fields[0].startswith("#"):
        continue
      pdb = fields[0]
      # PDB codes are exactly four characters; anything else is header text.
      if len(pdb) == 4:
        pdbs.append(pdb)

    if not os.path.exists(data_folder):
      os.makedirs(data_folder)

    for pdb in pdbs:
      protein_member = "v2015/" + pdb + "/" + pdb + "_protein.pdb"
      ligand_member = "v2015/" + pdb + "/" + pdb + "_ligand.sdf"
      # Read both members before writing so a missing ligand does not leave
      # an orphaned protein file behind.
      protein_bytes = tar.extractfile(protein_member).read()
      ligand_bytes = tar.extractfile(ligand_member).read()
      # Binary mode: the payload is bytes, and this avoids any newline
      # translation of the structure files.
      with open(os.path.join(data_folder, pdb + "_protein.pdb"), "wb") as out:
        out.write(protein_bytes)
      with open(os.path.join(data_folder, pdb + "_ligand.sdf"), "wb") as out:
        out.write(ligand_bytes)

  # Return only after every complex has been extracted (previously this sat
  # inside the loop and stopped after the first one).
  return pdbs
#  for member in tar.getmembers():
#    print("member.name")
#    print(member.name)
#    if member.name == "v2015/INDEX_core_name.2013":
#      f = tar.extractfile(member)
#      contentlines = f.read()
#      print("content")
#      print(content)
#      break