Commit 894b08a6 authored by miaecle's avatar miaecle
Browse files

dataset cache and new models

parent bfb81f38
Loading
Loading
Loading
Loading
+0 −47
Original line number Diff line number Diff line
@@ -132,21 +132,6 @@ def bond_features(bond):
                   bond.GetIsConjugated(),
                   bond.IsInRing()])

def pair_features(mol, canon_adj_list):
  """Compute an atom-pair feature tensor for a molecule.

  Parameters
  ----------
  mol: rdkit.Chem.Mol
    Molecule to featurize.
  canon_adj_list: list of lists of int
    canon_adj_list[i] holds the indices of atoms bonded to atom i
    (both edge directions present).

  Returns
  -------
  np.ndarray of shape (n_atoms, n_atoms, 12)
    For each bonded pair (i, j) the first six channels hold the pair's
    bond features (see `bond_features`); all other entries remain zero.
    NOTE(review): the remaining 6 channels are left unpopulated here --
    presumably reserved for graph-distance/ring membership features;
    confirm against the Weave model's expected pair-feature layout.
  """
  n_atoms = mol.GetNumAtoms()
  features = np.zeros((n_atoms, n_atoms, 12))
  for a1 in mol.GetAtoms():
    a1_id = a1.GetIdx()
    # Only bonded neighbor pairs carry bond features; all other pairs
    # keep their zero initialization.
    for a2_id in canon_adj_list[a1_id]:
      bond = mol.GetBondBetweenAtoms(a1_id, a2_id)
      features[a1_id, a2_id, :6] = bond_features(bond)
  return features

class ConvMolFeaturizer(Featurizer):

  name = ['conv_mol']
@@ -175,35 +160,3 @@ class ConvMolFeaturizer(Featurizer):
      canon_adj_list[edge[1]].append(edge[0])

    return ConvMol(nodes, canon_adj_list)
    

class WeaveFeaturizer(Featurizer):
  """Featurizer building per-atom features plus an adjacency list.

  NOTE(review): `_featurize` also computes pair features but currently
  discards them and returns a plain ConvMol -- this looks like work in
  progress toward a Weave-specific container type; confirm intent.
  """

  name = ['weave_mol']
  def __init__(self):
    # Since ConvMol is an object and not a numpy array, need to set dtype to
    # object.
    self.dtype = object

  def _featurize(self, mol):
    """Encodes mol as a ConvMol object."""
    # Atom features, sorted by atom index so rows match RDKit's ordering.
    idx_nodes = [(a.GetIdx(), atom_features(a)) for a in mol.GetAtoms()]
    idx_nodes.sort()  # Sort by ind to ensure same order as rd_kit
    idx, nodes = list(zip(*idx_nodes))

    # Stack per-atom feature vectors into a (n_atoms, n_features) array.
    nodes = np.vstack(nodes)

    # Get bond lists with reverse edges included
    edge_list = [(b.GetBeginAtomIdx(), b.GetEndAtomIdx()) for b in mol.GetBonds()]

    # Get canonical adjacency list (both directions of every bond).
    canon_adj_list = [[] for mol_id in range(len(nodes))]
    for edge in edge_list:
      canon_adj_list[edge[0]].append(edge[1])
      canon_adj_list[edge[1]].append(edge[0])

    # NOTE(review): computed but unused -- presumably meant to be carried
    # in the returned object once a Weave container exists.
    pairs = pair_features(mol, canon_adj_list)

    return ConvMol(nodes, canon_adj_list)
 No newline at end of file
+36 −1
Original line number Diff line number Diff line
@@ -4,30 +4,39 @@ CheckFeaturizer = {
    ('bace_c', 'tf_robust'): ['ECFP', 1024],
    ('bace_c', 'rf'): ['ECFP', 1024],
    ('bace_c', 'irv'): ['ECFP', 1024],
    ('bace_c', 'xgb'): ['ECFP', 1024],
    ('bace_c', 'graphconv'): ['GraphConv', 75],
    ('bace_c', 'dag'): ['GraphConv', 75],
    ('bbbp', 'logreg'): ['ECFP', 1024],
    ('bbbp', 'tf'): ['ECFP', 1024],
    ('bbbp', 'tf_robust'): ['ECFP', 1024],
    ('bbbp', 'rf'): ['ECFP', 1024],
    ('bbbp', 'irv'): ['ECFP', 1024],
    ('bbbp', 'xgb'): ['ECFP', 1024],
    ('bbbp', 'graphconv'): ['GraphConv', 75],
    ('bbbp', 'dag'): ['GraphConv', 75],
    ('clintox', 'logreg'): ['ECFP', 1024],
    ('clintox', 'tf'): ['ECFP', 1024],
    ('clintox', 'tf_robust'): ['ECFP', 1024],
    ('clintox', 'rf'): ['ECFP', 1024],
    ('clintox', 'irv'): ['ECFP', 1024],
    ('clintox', 'xgb'): ['ECFP', 1024],
    ('clintox', 'graphconv'): ['GraphConv', 75],
    ('clintox', 'dag'): ['GraphConv', 75],
    ('hiv', 'logreg'): ['ECFP', 1024],
    ('hiv', 'tf'): ['ECFP', 1024],
    ('hiv', 'tf_robust'): ['ECFP', 1024],
    ('hiv', 'rf'): ['ECFP', 1024],
    ('hiv', 'irv'): ['ECFP', 1024],
    ('hiv', 'xgb'): ['ECFP', 1024],
    ('hiv', 'graphconv'): ['GraphConv', 75],
    ('hiv', 'dag'): ['GraphConv', 75],
    ('muv', 'logreg'): ['ECFP', 1024],
    ('muv', 'tf'): ['ECFP', 1024],
    ('muv', 'tf_robust'): ['ECFP', 1024],
    ('muv', 'rf'): ['ECFP', 1024],
    ('muv', 'irv'): ['ECFP', 1024],
    ('muv', 'xgb'): ['ECFP', 1024],
    ('muv', 'graphconv'): ['GraphConv', 75],
    ('muv', 'siamese'): ['GraphConv', 75],
    ('muv', 'attn'): ['GraphConv', 75],
@@ -37,13 +46,16 @@ CheckFeaturizer = {
    ('pcba', 'tf_robust'): ['ECFP', 1024],
    ('pcba', 'rf'): ['ECFP', 1024],
    ('pcba', 'irv'): ['ECFP', 1024],
    ('pcba', 'xgb'): ['ECFP', 1024],
    ('pcba', 'graphconv'): ['GraphConv', 75],
    ('sider', 'logreg'): ['ECFP', 1024],
    ('sider', 'tf'): ['ECFP', 1024],
    ('sider', 'tf_robust'): ['ECFP', 1024],
    ('sider', 'rf'): ['ECFP', 1024],
    ('sider', 'irv'): ['ECFP', 1024],
    ('sider', 'xgb'): ['ECFP', 1024],
    ('sider', 'graphconv'): ['GraphConv', 75],
    ('sider', 'dag'): ['GraphConv', 75],
    ('sider', 'siamese'): ['GraphConv', 75],
    ('sider', 'attn'): ['GraphConv', 75],
    ('sider', 'res'): ['GraphConv', 75],
@@ -52,7 +64,9 @@ CheckFeaturizer = {
    ('tox21', 'tf_robust'): ['ECFP', 1024],
    ('tox21', 'rf'): ['ECFP', 1024],
    ('tox21', 'irv'): ['ECFP', 1024],
    ('tox21', 'xgb'): ['ECFP', 1024],
    ('tox21', 'graphconv'): ['GraphConv', 75],
    ('tox21', 'dag'): ['GraphConv', 75],
    ('tox21', 'siamese'): ['GraphConv', 75],
    ('tox21', 'attn'): ['GraphConv', 75],
    ('tox21', 'res'): ['GraphConv', 75],
@@ -61,42 +75,63 @@ CheckFeaturizer = {
    ('toxcast', 'tf_robust'): ['ECFP', 1024],
    ('toxcast', 'rf'): ['ECFP', 1024],
    ('toxcast', 'irv'): ['ECFP', 1024],
    ('toxcast', 'xgb'): ['ECFP', 1024],
    ('toxcast', 'graphconv'): ['GraphConv', 75],
    ('bace_r', 'tf_regression'): ['ECFP', 1024],
    ('bace_r', 'rf_regression'): ['ECFP', 1024],
    ('bace_r', 'xgb_regression'): ['ECFP', 1024],
    ('bace_r', 'graphconvreg'): ['GraphConv', 75],
    ('bace_r', 'dag_regression'): ['GraphConv', 75],
    ('chembl', 'tf_regression'): ['ECFP', 1024],
    ('chembl', 'rf_regression'): ['ECFP', 1024],
    ('chembl', 'xgb_regression'): ['ECFP', 1024],
    ('chembl', 'graphconvreg'): ['GraphConv', 75],
    ('clearance', 'tf_regression'): ['ECFP', 1024],
    ('clearance', 'rf_regression'): ['ECFP', 1024],
    ('clearance', 'xgb_regression'): ['ECFP', 1024],
    ('clearance', 'graphconvreg'): ['GraphConv', 75],
    ('clearance', 'dag_regression'): ['GraphConv', 75],
    ('delaney', 'tf_regression'): ['ECFP', 1024],
    ('delaney', 'rf_regression'): ['ECFP', 1024],
    ('delaney', 'xgb_regression'): ['ECFP', 1024],
    ('delaney', 'graphconvreg'): ['GraphConv', 75],
    ('delaney', 'dag_regression'): ['GraphConv', 75],
    ('hopv', 'tf_regression'): ['ECFP', 1024],
    ('hopv', 'rf_regression'): ['ECFP', 1024],
    ('hopv', 'xgb_regression'): ['ECFP', 1024],
    ('hopv', 'graphconvreg'): ['GraphConv', 75],
    ('hopv', 'dag_regression'): ['GraphConv', 75],
    ('lipo', 'tf_regression'): ['ECFP', 1024],
    ('lipo', 'rf_regression'): ['ECFP', 1024],
    ('lipo', 'xgb_regression'): ['ECFP', 1024],
    ('lipo', 'graphconvreg'): ['GraphConv', 75],
    ('lipo', 'dag_regression'): ['GraphConv', 75],
    ('nci', 'tf_regression'): ['ECFP', 1024],
    ('nci', 'rf_regression'): ['ECFP', 1024],
    ('nci', 'xgb_regression'): ['ECFP', 1024],
    ('nci', 'graphconvreg'): ['GraphConv', 75],
    ('ppb', 'tf_regression'): ['ECFP', 1024],
    ('ppb', 'rf_regression'): ['ECFP', 1024],
    ('ppb', 'xgb_regression'): ['ECFP', 1024],
    ('ppb', 'graphconvreg'): ['GraphConv', 75],
    ('ppb', 'dag_regression'): ['GraphConv', 75],
    ('sampl', 'tf_regression'): ['ECFP', 1024],
    ('sampl', 'rf_regression'): ['ECFP', 1024],
    ('sampl', 'xgb_regression'): ['ECFP', 1024],
    ('sampl', 'graphconvreg'): ['GraphConv', 75],
    ('sampl', 'dag_regression'): ['GraphConv', 75],
    ('kaggle', 'tf_regression'): [None, 14293],
    ('kaggle', 'rf_regression'): [None, 14293],
    ('pdbbind', 'tf_regression'): ['grid', 2052],
    ('pdbbind', 'rf_regression'): ['grid', 2052],
    ('qm7', 'tf_regression_ft'): [None, [23, 23]],
    ('qm7', 'dtnn'): [None, [23, 23]],
    ('qm7b', 'tf_regression_ft'): [None, [23, 23]],
    ('qm7b', 'dtnn'): [None, [23, 23]],
    ('qm8', 'tf_regression_ft'): [None, [26, 26]],
    ('qm8', 'dtnn'): [None, [26, 26]],
    ('qm9', 'tf_regression_ft'): [None, [29, 29]],
    ('qm9', 'dtnn'): [None, [29, 29]]
}

CheckSplit = {
+51 −2
Original line number Diff line number Diff line
@@ -7,15 +7,19 @@ from __future__ import unicode_literals

import os
import deepchem
import pickle
from deepchem.molnet.load_function.bace_features import bace_user_specified_features


def load_bace_regression(featurizer=None, split='random'):
def load_bace_regression(featurizer=None, split='random', reload=True):
  """Load bace datasets."""
  # Featurize bace dataset
  print("About to featurize bace dataset.")
  save = False
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
    if reload:
      save = True
  else:
    data_dir = "/tmp"

@@ -28,6 +32,21 @@ def load_bace_regression(featurizer=None, split='random'):
    )

  bace_tasks = ["pIC50"]
  if save:
    save_dir = os.path.join(data_dir, "bace_r/" + featurizer + "/" + split)
    train_dir = os.path.join(save_dir, "train_dir")
    valid_dir = os.path.join(save_dir, "valid_dir")
    test_dir = os.path.join(save_dir, "test_dir")
    if os.path.exists(train_dir) and os.path.exists(
        valid_dir) and os.path.exists(test_dir):
      train = deepchem.data.DiskDataset(train_dir)
      valid = deepchem.data.DiskDataset(valid_dir)
      test = deepchem.data.DiskDataset(test_dir)
      all_dataset = (train, valid, test)
      with open(os.path.join(save_dir, "transformers.pkl"), 'r') as f:
        transformers = pickle.load(f)
      return bace_tasks, all_dataset, transformers

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
@@ -59,15 +78,24 @@ def load_bace_regression(featurizer=None, split='random'):
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  if save:
    train.move(train_dir)
    valid.move(valid_dir)
    test.move(test_dir)
    with open(os.path.join(save_dir, "transformers.pkl"), 'w') as f:
      pickle.dump(transformers, f)
  return bace_tasks, (train, valid, test), transformers


def load_bace_classification(featurizer=None, split='random'):
def load_bace_classification(featurizer=None, split='random', reload=True):
  """Load bace datasets."""
  # Featurize bace dataset
  print("About to featurize bace dataset.")
  save = False
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
    if reload:
      save = True
  else:
    data_dir = "/tmp"

@@ -80,6 +108,21 @@ def load_bace_classification(featurizer=None, split='random'):
    )

  bace_tasks = ["Class"]
  if save:
    save_dir = os.path.join(data_dir, "bace_c/" + featurizer + "/" + split)
    train_dir = os.path.join(save_dir, "train_dir")
    valid_dir = os.path.join(save_dir, "valid_dir")
    test_dir = os.path.join(save_dir, "test_dir")
    if os.path.exists(train_dir) and os.path.exists(
        valid_dir) and os.path.exists(test_dir):
      train = deepchem.data.DiskDataset(train_dir)
      valid = deepchem.data.DiskDataset(valid_dir)
      test = deepchem.data.DiskDataset(test_dir)
      all_dataset = (train, valid, test)
      with open(os.path.join(save_dir, "transformers.pkl"), 'r') as f:
        transformers = pickle.load(f)
      return bace_tasks, all_dataset, transformers

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
@@ -110,4 +153,10 @@ def load_bace_classification(featurizer=None, split='random'):
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  if save:
    train.move(train_dir)
    valid.move(valid_dir)
    test.move(test_dir)
    with open(os.path.join(save_dir, "transformers.pkl"), 'w') as f:
      pickle.dump(transformers, f)
  return bace_tasks, (train, valid, test), transformers
+27 −1
Original line number Diff line number Diff line
@@ -7,14 +7,18 @@ from __future__ import unicode_literals

import os
import deepchem
import pickle


def load_bbbp(featurizer='ECFP', split='index'):
def load_bbbp(featurizer='ECFP', split='index', reload=True):
  """Load blood-brain barrier penetration datasets """
  # Featurize bbb dataset
  print("About to featurize bbbp dataset.")
  save = False
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
    if reload:
      save = True
  else:
    data_dir = "/tmp"

@@ -26,6 +30,22 @@ def load_bbbp(featurizer='ECFP', split='index'):
    )

  bbbp_tasks = ["p_np"]

  if save:
    save_dir = os.path.join(data_dir, "bbbp/" + featurizer + "/" + split)
    train_dir = os.path.join(save_dir, "train_dir")
    valid_dir = os.path.join(save_dir, "valid_dir")
    test_dir = os.path.join(save_dir, "test_dir")
    if os.path.exists(train_dir) and os.path.exists(
        valid_dir) and os.path.exists(test_dir):
      train = deepchem.data.DiskDataset(train_dir)
      valid = deepchem.data.DiskDataset(valid_dir)
      test = deepchem.data.DiskDataset(test_dir)
      all_dataset = (train, valid, test)
      with open(os.path.join(save_dir, "transformers.pkl"), 'r') as f:
        transformers = pickle.load(f)
      return bbbp_tasks, all_dataset, transformers

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
@@ -52,4 +72,10 @@ def load_bbbp(featurizer='ECFP', split='index'):
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  if save:
    train.move(train_dir)
    valid.move(valid_dir)
    test.move(test_dir)
    with open(os.path.join(save_dir, "transformers.pkl"), 'w') as f:
      pickle.dump(transformers, f)
  return bbbp_tasks, (train, valid, test), transformers
+27 −3
Original line number Diff line number Diff line
@@ -7,17 +7,21 @@ from __future__ import unicode_literals

import os
import deepchem

import pickle
from deepchem.molnet.load_function.chembl_tasks import chembl_tasks


def load_chembl(shard_size=2000,
                featurizer="ECFP",
                set="5thresh",
                split="random"):
                split="random",
                reload=True):

  save = False
  if "DEEPCHEM_DATA_DIR" in os.environ:
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
    if reload:
      save = True
  else:
    data_dir = "/tmp"

@@ -57,6 +61,21 @@ def load_chembl(shard_size=2000,
    )

  print("About to load ChEMBL dataset.")
  if save:
    save_dir = os.path.join(data_dir, "chembl/" + featurizer + "/" + split)
    train_dir = os.path.join(save_dir, "train_dir")
    valid_dir = os.path.join(save_dir, "valid_dir")
    test_dir = os.path.join(save_dir, "test_dir")
    if os.path.exists(train_dir) and os.path.exists(
        valid_dir) and os.path.exists(test_dir):
      train = deepchem.data.DiskDataset(train_dir)
      valid = deepchem.data.DiskDataset(valid_dir)
      test = deepchem.data.DiskDataset(test_dir)
      all_dataset = (train, valid, test)
      with open(os.path.join(save_dir, "transformers.pkl"), 'r') as f:
        transformers = pickle.load(f)
      return chembl_tasks, all_dataset, transformers

  if split == "year":
    train_files = os.path.join(
        data_dir, "./chembl_year_sets/chembl_%s_ts_train.csv.gz" % set)
@@ -115,5 +134,10 @@ def load_chembl(shard_size=2000,
    splitter = splitters[split]
    print("Performing new split.")
    train, valid, test = splitter.train_valid_test_split(dataset)

  if save:
    train.move(train_dir)
    valid.move(valid_dir)
    test.move(test_dir)
    with open(os.path.join(save_dir, "transformers.pkl"), 'w') as f:
      pickle.dump(transformers, f)
  return chembl_tasks, (train, valid, test), transformers
Loading