Commit ac458a40 authored by ZHENQIN WU
Browse files

update performances

parent 7268451b
Loading
Loading
Loading
Loading
+11 −7
Original line number Diff line number Diff line
@@ -347,9 +347,12 @@ Scaffold splitting
|                |Random forest       |Scaffold    |0.956         |0.201         |
|                |NN regression       |Scaffold    |0.897         |0.208         |
|                |Graphconv regression|Scaffold    |0.783         |0.068         |
|chembl          |MT-NN regression    |Index       |0.443         |0.427         |
|                |MT-NN regression    |Random      |0.464         |0.434         |
|                |MT-NN regression    |Scaffold    |0.484         |0.361         |
|chembl          |MT-NN regression    |Index       |0.828         |0.565         |
|                |Graphconv regression|Index       |0.192         |0.293         |
|                |MT-NN regression    |Random      |0.829         |0.562         |
|                |Graphconv regression|Random      |0.198         |0.271         |
|                |MT-NN regression    |Scaffold    |0.843         |0.430         |
|                |Graphconv regression|Scaffold    |0.231         |0.294         |
|clearance       |Random forest       |Index       |0.953         |0.244         |
|                |NN regression       |Index       |0.884         |0.211         |
|                |Graphconv regression|Index       |0.696         |0.230         |
@@ -388,11 +391,11 @@ Scaffold splitting
|                |NN regression       |Scaffold    |0.831         |0.302         |
|                |Graphconv regression|Scaffold    |0.882         |0.593         |
|nci             |MT-NN regression    |Index       |0.690         |0.062         |
|                |Graphconv regression|Index       |0.123         |0.048         |
|                |MT-NN regression    |Random      |0.168         |0.085         |
|                |Graphconv regression|Index       |0.123         |0.053         |
|                |MT-NN regression    |Random      |0.698         |0.117         |
|                |Graphconv regression|Random      |0.117         |0.076         |
|                |MT-NN regression    |Scaffold    |0.180         |0.052         |
|                |Graphconv regression|Scaffold    |0.131         |0.046         |
|                |MT-NN regression    |Scaffold    |0.692         |0.036         |
|                |Graphconv regression|Scaffold    |0.131         |0.036         |
|pdbbind(core)   |Random forest       |Random      |0.969         |0.445         |
|                |NN regression       |Random      |0.973         |0.494         |
|pdbbind(refined)|Random forest       |Random      |0.963         |0.511         |
@@ -528,6 +531,7 @@ Time needed for benchmark test(~20h in total)
|                |Graphconv regression|10              |110            |
|                |Random forest       |10              |50             |
|chembl          |MT-NN regression    |200             |9000           |
|                |Graphconv regression|250             |1800           |
|clearance       |NN regression       |10              |20             |
|                |Graphconv regression|10              |60             |
|                |Random forest       |10              |10             |
+9 −14
Original line number Diff line number Diff line
@@ -42,7 +42,6 @@ def get_transformers(train_dataset):

# Set shard size low to avoid memory problems.
def gen_kaggle(KAGGLE_tasks,
               raw_train_dir,
               train_dir,
               valid_dir,
               test_dir,
@@ -94,9 +93,11 @@ def gen_kaggle(KAGGLE_tasks,
  remove_missing_entries(valid_dataset)
  remove_missing_entries(test_dataset)

  print("Shuffling order of train dataset.")
  train_dataset.sparse_shuffle()

  print("Transforming datasets with transformers.")
  transformers = get_transformers(train_dataset)
  raw_train_dataset = train_dataset

  for transformer in transformers:
    print("Performing transformations with %s" % transformer.__class__.__name__)
@@ -105,11 +106,7 @@ def gen_kaggle(KAGGLE_tasks,
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)

  print("Shuffling order of train dataset.")
  train_dataset.sparse_shuffle()

  print("Moving directories")
  raw_train_dataset.move(raw_train_dir)
  train_dataset.move(train_dir)
  valid_dataset.move(valid_dir)
  test_dataset.move(test_dir)
@@ -119,7 +116,7 @@ def gen_kaggle(KAGGLE_tasks,
  print("TIMING: KAGGLE fitting took %0.3f s" % (time2 - time1))
  ############################################################## TIMING

  return (raw_train_dataset, train_dataset, valid_dataset, test_dataset)
  return train_dataset, valid_dataset, test_dataset


def load_kaggle(shard_size=2000, featurizer=None, split=None):
@@ -134,24 +131,22 @@ def load_kaggle(shard_size=2000, featurizer=None, split=None):
    data_dir = "/tmp"

  data_dir = os.path.join(data_dir, "kaggle")
  raw_train_dir = os.path.join(data_dir, "raw_train_dir")
  train_dir = os.path.join(data_dir, "train_dir")
  valid_dir = os.path.join(data_dir, "valid_dir")
  test_dir = os.path.join(data_dir, "test_dir")

  if (os.path.exists(raw_train_dir) and os.path.exists(train_dir) and
      os.path.exists(valid_dir) and os.path.exists(test_dir)):
  if (os.path.exists(train_dir) and os.path.exists(valid_dir) and
      os.path.exists(test_dir)):
    print("Reloading existing datasets")
    raw_train_dataset = deepchem.data.DiskDataset(raw_train_dir)
    train_dataset = deepchem.data.DiskDataset(train_dir)
    valid_dataset = deepchem.data.DiskDataset(valid_dir)
    test_dataset = deepchem.data.DiskDataset(test_dir)
  else:
    print("Featurizing datasets")
    (raw_train_dataset, train_dataset, valid_dataset, test_dataset) = \
      gen_kaggle(KAGGLE_tasks, raw_train_dir, train_dir, valid_dir, test_dir, data_dir,
    train_dataset, valid_dataset, test_dataset = \
      gen_kaggle(KAGGLE_tasks, train_dir, valid_dir, test_dir, data_dir,
                  shard_size=shard_size)

  transformers = get_transformers(raw_train_dataset)
  transformers = get_transformers(train_dataset)
  return KAGGLE_tasks, (train_dataset, valid_dataset,
                        test_dataset), transformers