Commit ff86541a authored by unknown's avatar unknown
Browse files

Merge remote-tracking branch 'remotes/mine/qm' into DAG

parents 1ea69f5f d4a5703b
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -142,12 +142,21 @@ CheckFeaturizer = {
    ('kaggle', 'rf_regression'): [None, 14293],
    ('pdbbind', 'tf_regression'): ['grid', 2052],
    ('pdbbind', 'rf_regression'): ['grid', 2052],
    ('qm7', 'tf_regression'): ['ECFP', 1024],
    ('qm7', 'rf_regression'): ['ECFP', 1024],
    ('qm7', 'graphconvreg'): ['GraphConv', 75],
    ('qm7', 'tf_regression_ft'): ['CoulombMatrix', [23, 23]],
    ('qm7', 'dtnn'): ['CoulombMatrix', [23, 23]],
    ('qm7b', 'tf_regression_ft'): ['CoulombMatrix', [23, 23]],
    ('qm7b', 'dtnn'): ['CoulombMatrix', [23, 23]],
    ('qm8', 'tf_regression'): ['ECFP', 1024],
    ('qm8', 'rf_regression'): ['ECFP', 1024],
    ('qm8', 'graphconvreg'): ['GraphConv', 75],
    ('qm8', 'tf_regression_ft'): ['CoulombMatrix', [26, 26]],
    ('qm8', 'dtnn'): ['CoulombMatrix', [26, 26]],
    ('qm9', 'tf_regression'): ['ECFP', 1024],
    ('qm9', 'rf_regression'): ['ECFP', 1024],
    ('qm9', 'graphconvreg'): ['GraphConv', 75],
    ('qm9', 'tf_regression_ft'): ['CoulombMatrix', [29, 29]],
    ('qm9', 'dtnn'): ['CoulombMatrix', [29, 29]]
}
+72 −17
Original line number Diff line number Diff line
@@ -52,9 +52,10 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):

def load_pdbbind_grid(split="random",
                      featurizer="grid",
                      subset="full",
                      subset="core",
                      reload=True):
  """Load PDBBind datasets and return them split into train/valid/test sets."""
  if featurizer == 'grid':
    dataset, tasks = featurize_pdbbind(feat=featurizer, subset=subset)

    splitters = {
@@ -71,5 +72,59 @@ def load_pdbbind_grid(split="random",
      valid = transformer.transform(valid)
    for transformer in transformers:
      test = transformer.transform(test)
  else:
    if "DEEPCHEM_DATA_DIR" in os.environ:
      data_dir = os.environ["DEEPCHEM_DATA_DIR"]
    else:
      data_dir = "/tmp"
    if reload:
      save_dir = os.path.join(data_dir, "pdbbind_" + subset + "/" + featurizer + "/" + split)

    dataset_file = os.path.join(data_dir, subset + "_smiles_labels.csv")

    if not os.path.exists(dataset_file):
      os.system(
          'wget -P ' + data_dir +
          ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/' + subset + "_smiles_labels.csv"
      )

    tasks = ["-logKd/Ki"]
    if reload:
      loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
          save_dir)
      if loaded:
        return tasks, all_dataset, transformers

    if featurizer == 'ECFP':
      featurizer = deepchem.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
      featurizer = deepchem.feat.ConvMolFeaturizer()
    elif featurizer == 'Weave':
      featurizer = deepchem.feat.WeaveFeaturizer()
    elif featurizer == 'Raw':
      featurizer = deepchem.feat.RawFeaturizer()

    loader = deepchem.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(dataset_file, shard_size=8192)
    transformers = [
        deepchem.trans.NormalizationTransformer(
            transform_y=True, dataset=dataset)
    ]

    for transformer in transformers:
      dataset = transformer.transform(dataset)

    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
        'scaffold': deepchem.splits.ScaffoldSplitter()
    }
    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset)

    if reload:
      deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                               transformers)

  return tasks, (train, valid, test), transformers
+46 −18
Original line number Diff line number Diff line
@@ -18,7 +18,18 @@ def load_qm7_from_mat(featurizer='CoulombMatrix',
    data_dir = os.environ["DEEPCHEM_DATA_DIR"]
  else:
    data_dir = "/tmp"
  if reload:
    save_dir = os.path.join(data_dir, "qm7/" + featurizer + "/" + split)
    
  qm7_tasks = ["u0_atom"]

  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return qm7_tasks, all_dataset, transformers
  
  if featurizer == 'CoulombMatrix':
    dataset_file = os.path.join(data_dir, "qm7.mat")

    if not os.path.exists(dataset_file):
@@ -26,14 +37,29 @@ def load_qm7_from_mat(featurizer='CoulombMatrix',
          'wget -P ' + data_dir +
          ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/qm7.mat'
      )

    dataset = scipy.io.loadmat(dataset_file)

    X = dataset['X']
    y = dataset['T']
    w = np.ones_like(y)
    dataset = deepchem.data.DiskDataset.from_numpy(X, y, w, ids=None)
  print(len(dataset))
  else:
    dataset_file = os.path.join(data_dir, "qm7.csv")
    if not os.path.exists(dataset_file):
      os.system(
          'wget -P ' + data_dir +
          ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/qm7.csv '
      )
    if featurizer == 'ECFP':
      featurizer = deepchem.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
      featurizer = deepchem.feat.ConvMolFeaturizer()
    elif featurizer == 'Weave':
      featurizer = deepchem.feat.WeaveFeaturizer()
    elif featurizer == 'Raw':
      featurizer = deepchem.feat.RawFeaturizer()
    loader = deepchem.data.CSVLoader(
        tasks=qm7_tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(dataset_file)
      
  splitters = {
      'index': deepchem.splits.IndexSplitter(),
@@ -54,8 +80,10 @@ def load_qm7_from_mat(featurizer='CoulombMatrix',
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)
  if reload:
    deepchem.utils.save.save_dataset_to_disk(
        save_dir, train_dataset, valid_dataset, test_dataset, transformers)
    
  qm7_tasks = np.arange(y.shape[0])
  return qm7_tasks, (train_dataset, valid_dataset, test_dataset), transformers


+33 −13
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ def load_qm8(featurizer='CoulombMatrix', split='random', reload=True):
  if reload:
    save_dir = os.path.join(data_dir, "qm8/" + featurizer + "/" + split)

  if featurizer == 'CoulombMatrix':
    dataset_file = os.path.join(data_dir, "qm8.sdf")

    if not os.path.exists(dataset_file):
@@ -26,6 +27,13 @@ def load_qm8(featurizer='CoulombMatrix', split='random', reload=True):
      )
      os.system('tar -zxvf ' + os.path.join(data_dir, 'gdb8.tar.gz') + ' -C ' +
                data_dir)
  else:
    dataset_file = os.path.join(data_dir, "qm8.csv")
    if not os.path.exists(dataset_file):
      os.system(
          'wget -P ' + data_dir +
          ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/qm8.csv '
      )

  qm8_tasks = [
      "E1-CC2", "E2-CC2", "f1-CC2", "f2-CC2", "E1-PBE0", "E2-PBE0", "f1-PBE0",
@@ -46,6 +54,18 @@ def load_qm8(featurizer='CoulombMatrix', split='random', reload=True):
        smiles_field="smiles",
        mol_field="mol",
        featurizer=featurizer)
  else:
    if featurizer == 'ECFP':
      featurizer = deepchem.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
      featurizer = deepchem.feat.ConvMolFeaturizer()
    elif featurizer == 'Weave':
      featurizer = deepchem.feat.WeaveFeaturizer()
    elif featurizer == 'Raw':
      featurizer = deepchem.feat.RawFeaturizer()
    loader = deepchem.data.CSVLoader(
        tasks=qm8_tasks, smiles_field="smiles", featurizer=featurizer)
    
  dataset = loader.featurize(dataset_file)
  splitters = {
      'index': deepchem.splits.IndexSplitter(),
+35 −13
Original line number Diff line number Diff line
@@ -20,6 +20,8 @@ def load_qm9(featurizer='CoulombMatrix', split='random', reload=True):
  if reload:
    save_dir = os.path.join(data_dir, "qm9/" + featurizer + "/" + split)


  if featurizer == 'CoulombMatrix':
    dataset_file = os.path.join(data_dir, "gdb9.sdf")

    if not os.path.exists(dataset_file):
@@ -29,6 +31,13 @@ def load_qm9(featurizer='CoulombMatrix', split='random', reload=True):
      )
      os.system('tar -zxvf ' + os.path.join(data_dir, 'gdb9.tar.gz') + ' -C ' +
                data_dir)
  else:
    dataset_file = os.path.join(data_dir, "qm9.csv")
    if not os.path.exists(dataset_file):
      os.system(
          'wget -P ' + data_dir +
          ' http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/qm9.csv '
      )
      
  qm9_tasks = [
      "A", "B", "C", "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "cv",
@@ -48,6 +57,19 @@ def load_qm9(featurizer='CoulombMatrix', split='random', reload=True):
        smiles_field="smiles",
        mol_field="mol",
        featurizer=featurizer)
  else:
    if featurizer == 'ECFP':
      featurizer = deepchem.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
      featurizer = deepchem.feat.ConvMolFeaturizer()
    elif featurizer == 'Weave':
      featurizer = deepchem.feat.WeaveFeaturizer()
    elif featurizer == 'Raw':
      featurizer = deepchem.feat.RawFeaturizer()
    loader = deepchem.data.CSVLoader(
        tasks=qm9_tasks, smiles_field="smiles", featurizer=featurizer)


  dataset = loader.featurize(dataset_file)
  splitters = {
      'index': deepchem.splits.IndexSplitter(),
Loading