Commit 3f2475b7, authored by Bharath Ramsundar and committed by GitHub
Browse files

Merge pull request #578 from miaecle/qm

Molnet v2 changes
parents 1ea69f5f 3bbebc3e
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -84,7 +84,8 @@ class MultitaskGraphClassifier(Model):
               pad_batches=True,
               verbose=True):

    super().__init__(self, model_dir=logdir, verbose=verbose)
    super(MultitaskGraphClassifier, self).__init__(
        model_dir=logdir, verbose=verbose)
    self.n_tasks = n_tasks
    self.final_loss = final_loss
    self.model = model
+2 −1
Original line number Diff line number Diff line
@@ -43,7 +43,8 @@ class MultitaskGraphRegressor(Model):
               pad_batches=True,
               verbose=True):

    super().__init__(self, model_dir=logdir, verbose=verbose)
    super(MultitaskGraphRegressor, self).__init__(
        model_dir=logdir, verbose=verbose)
    self.n_tasks = n_tasks
    self.final_loss = final_loss
    self.model = model
+9 −0
Original line number Diff line number Diff line
@@ -142,12 +142,21 @@ CheckFeaturizer = {
    ('kaggle', 'rf_regression'): [None, 14293],
    ('pdbbind', 'tf_regression'): ['grid', 2052],
    ('pdbbind', 'rf_regression'): ['grid', 2052],
    ('qm7', 'tf_regression'): ['ECFP', 1024],
    ('qm7', 'rf_regression'): ['ECFP', 1024],
    ('qm7', 'graphconvreg'): ['GraphConv', 75],
    ('qm7', 'tf_regression_ft'): ['CoulombMatrix', [23, 23]],
    ('qm7', 'dtnn'): ['CoulombMatrix', [23, 23]],
    ('qm7b', 'tf_regression_ft'): ['CoulombMatrix', [23, 23]],
    ('qm7b', 'dtnn'): ['CoulombMatrix', [23, 23]],
    ('qm8', 'tf_regression'): ['ECFP', 1024],
    ('qm8', 'rf_regression'): ['ECFP', 1024],
    ('qm8', 'graphconvreg'): ['GraphConv', 75],
    ('qm8', 'tf_regression_ft'): ['CoulombMatrix', [26, 26]],
    ('qm8', 'dtnn'): ['CoulombMatrix', [26, 26]],
    ('qm9', 'tf_regression'): ['ECFP', 1024],
    ('qm9', 'rf_regression'): ['ECFP', 1024],
    ('qm9', 'graphconvreg'): ['GraphConv', 75],
    ('qm9', 'tf_regression_ft'): ['CoulombMatrix', [29, 29]],
    ('qm9', 'dtnn'): ['CoulombMatrix', [29, 29]]
}
+73 −17
Original line number Diff line number Diff line
@@ -52,9 +52,10 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):

def load_pdbbind_grid(split="random",
                      featurizer="grid",
                      subset="full",
                      subset="core",
                      reload=True):
  """Load PDBBind datasets. Does not do train/test split"""
  if featurizer == 'grid':
    dataset, tasks = featurize_pdbbind(feat=featurizer, subset=subset)

    splitters = {
@@ -71,5 +72,60 @@ def load_pdbbind_grid(split="random",
      valid = transformer.transform(valid)
    for transformer in transformers:
      test = transformer.transform(test)
  else:
    if "DEEPCHEM_DATA_DIR" in os.environ:
      data_dir = os.environ["DEEPCHEM_DATA_DIR"]
    else:
      data_dir = "/tmp"
    if reload:
      save_dir = os.path.join(
          data_dir, "pdbbind_" + subset + "/" + featurizer + "/" + split)

    dataset_file = os.path.join(data_dir, subset + "_smiles_labels.csv")

    if not os.path.exists(dataset_file):
      os.system(
          'wget -P ' + data_dir +
          ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/' +
          subset + "_smiles_labels.csv")

    tasks = ["-logKd/Ki"]
    if reload:
      loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
          save_dir)
      if loaded:
        return tasks, all_dataset, transformers

    if featurizer == 'ECFP':
      featurizer = deepchem.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
      featurizer = deepchem.feat.ConvMolFeaturizer()
    elif featurizer == 'Weave':
      featurizer = deepchem.feat.WeaveFeaturizer()
    elif featurizer == 'Raw':
      featurizer = deepchem.feat.RawFeaturizer()

    loader = deepchem.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(dataset_file, shard_size=8192)
    transformers = [
        deepchem.trans.NormalizationTransformer(
            transform_y=True, dataset=dataset)
    ]

    for transformer in transformers:
      dataset = transformer.transform(dataset)

    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
        'scaffold': deepchem.splits.ScaffoldSplitter()
    }
    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset)

    if reload:
      deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                               transformers)

  return tasks, (train, valid, test), transformers
+42 −14
Original line number Diff line number Diff line
@@ -18,7 +18,18 @@ def load_qm7_from_mat(featurizer='CoulombMatrix',
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
  else:
    data_dir = "/tmp"
  if reload:
    save_dir = os.path.join(data_dir, "qm7/" + featurizer + "/" + split)

  qm7_tasks = ["u0_atom"]

  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return qm7_tasks, all_dataset, transformers

  if featurizer == 'CoulombMatrix':
    dataset_file = os.path.join(data_dir, "qm7.mat")

    if not os.path.exists(dataset_file):
@@ -26,14 +37,29 @@ def load_qm7_from_mat(featurizer='CoulombMatrix',
          'wget -P ' + data_dir +
          ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/qm7.mat'
      )

    dataset = scipy.io.loadmat(dataset_file)

    X = dataset['X']
    y = dataset['T']
    w = np.ones_like(y)
    dataset = deepchem.data.DiskDataset.from_numpy(X, y, w, ids=None)
  print(len(dataset))
  else:
    dataset_file = os.path.join(data_dir, "qm7.csv")
    if not os.path.exists(dataset_file):
      os.system(
          'wget -P ' + data_dir +
          ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/qm7.csv '
      )
    if featurizer == 'ECFP':
      featurizer = deepchem.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
      featurizer = deepchem.feat.ConvMolFeaturizer()
    elif featurizer == 'Weave':
      featurizer = deepchem.feat.WeaveFeaturizer()
    elif featurizer == 'Raw':
      featurizer = deepchem.feat.RawFeaturizer()
    loader = deepchem.data.CSVLoader(
        tasks=qm7_tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(dataset_file)

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
@@ -54,8 +80,10 @@ def load_qm7_from_mat(featurizer='CoulombMatrix',
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)
  if reload:
    deepchem.utils.save.save_dataset_to_disk(
        save_dir, train_dataset, valid_dataset, test_dataset, transformers)

  qm7_tasks = np.arange(y.shape[0])
  return qm7_tasks, (train_dataset, valid_dataset, test_dataset), transformers


Loading