Commit 14d4ffa6 authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #347 from miaecle/nci_ref2

nci dataset refinement
parents c0a17207 140a9fd4
Loading
Loading
Loading
Loading
+9 −5
Original line number Diff line number Diff line
@@ -251,9 +251,12 @@ Scaffold splitting
|           |graphconv regression|Random      |0.996         |0.873         |
|           |MT-NN regression    |Scaffold    |0.782         |0.426         |
|           |graphconv regression|Scaffold    |0.994         |0.606         |
-|nci        |MT-NN regression    |Index       |0.890         |0.890         |
-|           |MT-NN regression    |Random      |0.891         |0.888         |
-|           |MT-NN regression    |Scaffold    |0.912         |0.020         |
+|nci        |MT-NN regression    |Index       |0.171         |0.062         |
+|           |graphconv regression|Index       |0.123         |0.048         |
+|           |MT-NN regression    |Random      |0.168         |0.085         |
+|           |graphconv regression|Random      |0.117         |0.076         |
+|           |MT-NN regression    |Scaffold    |0.180         |0.052         |
+|           |graphconv regression|Scaffold    |0.131         |0.046         |
|kaggle     |MT-NN regression    |User-defined|0.748         |0.452         |

* General features
@@ -269,7 +272,7 @@ Number of tasks and examples in the datasets
|toxcast    |617        |8615       |
|delaney    |1          |1128       |
|kaggle     |15         |173065     |
-|nci        |60         |1057371    |
+|nci        |60         |19127      |

Time needed for benchmark test(~20h in total)

@@ -297,7 +300,8 @@ Time needed for benchmark test(~20h in total)
|           |graph convolution   |80              |900            |
|delaney    |MT-NN regression    |10              |40             |
|           |graphconv regression|10              |40             |
-|nci        |MT-NN regression    |2000            |30000          |
+|nci        |MT-NN regression    |400             |1200           |
+|           |graphconv regression|400             |2500           |
|kaggle     |MT-NN regression    |2200            |3200           |


+19127 −0

File added.

Preview size limit exceeded, changes collapsed.

+2 −2
Original line number Diff line number Diff line
@@ -137,7 +137,7 @@ def benchmark_loading_datasets(hyper_parameters,
    time_finish_fitting = time.time()
    
    
    with open(os.path.join(out_path, 'results.csv'),'ab') as f:
    with open(os.path.join(out_path, 'results.csv'),'a') as f:
      writer = csv.writer(f)
      if mode == 'classification':
        for i in train_score:
@@ -511,7 +511,7 @@ if __name__ == '__main__':
              'tf_regression', 'graphconvreg']
  if len(datasets) == 0:
    datasets = ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 
                'delaney', 'kaggle', 'nci']
                'delaney', 'nci', 'kaggle']

  #input hyperparameters
  #tf: dropouts, learning rate, layer_sizes, weight initial stddev,penalty,
+3 −7
Original line number Diff line number Diff line
@@ -19,12 +19,8 @@ def load_nci(featurizer='ECFP', shard_size=1000, split='random'):

  # Load nci dataset
  print("About to load NCI dataset.")
  dataset_file1_path = os.path.join(
      current_dir, "../../datasets/nci_1.csv.gz")
  dataset_file2_path = os.path.join(
      current_dir, "../../datasets/nci_2.csv.gz")

  dataset_paths = [dataset_file1_path, dataset_file2_path]
  dataset_path = os.path.join(
      current_dir, "../../datasets/nci_unique.csv")


  # Featurize nci dataset
@@ -50,7 +46,7 @@ def load_nci(featurizer='ECFP', shard_size=1000, split='random'):
  loader = dc.data.CSVLoader(
      tasks=all_nci_tasks, smiles_field="smiles", featurizer=featurizer)

  dataset = loader.featurize(dataset_paths, shard_size=shard_size)
  dataset = loader.featurize(dataset_path, shard_size=shard_size)

  # Initialize transformers
  print("About to transform data")
+6 −0
Original line number Diff line number Diff line
@@ -21,6 +21,8 @@
0,pcba,index,classification,train,graphconv,0.87647472,valid,graphconv,0.8523348204,time_for_running,14497.7339029
0,delaney,index,regression,train,tf_regression,0.7830983671,valid,tf_regression,0.5789729655,time_for_running,41.1367759705
0,delaney,index,regression,train,graphconvreg,0.9911206824,valid,graphconvreg,0.7892057714,time_for_running,101.8902909756
0,nci,index,regression,train,tf_regression,0.1705293978,valid,tf_regression,0.0623320607,time_for_running,1206.9139225483
0,nci,index,regression,train,graphconvreg,0.1232265163,valid,graphconvreg,0.0479731306,time_for_running,2324.3746082783
0,kaggle,None,regression,train,tf_regression,0.7480423542,valid,tf_regression,0.4516795145,time_for_running,3238.91535401
0,tox21,random,classification,train,tf,0.8565178786,valid,tf,0.7834036936,time_for_running,53.8197240829
0,tox21,random,classification,train,tf_robust,0.8549658589,valid,tf_robust,0.7735497329,time_for_running,88.9351768494
@@ -44,6 +46,8 @@
0,pcba,random,classification,train,graphconv,0.872172184,valid,graphconv,0.8435271472,time_for_running,11502.8221002
0,delaney,random,regression,train,tf_regression,0.7791066217,valid,tf_regression,0.6164873014,time_for_running,35.6433098316
0,delaney,random,regression,train,graphconvreg,0.9951851944,valid,graphconvreg,0.8397307618,time_for_running,102.9403319359
0,nci,random,regression,train,tf_regression,0.167724376,valid,tf_regression,0.0846994662,time_for_running,1255.847104311
0,nci,random,regression,train,graphconvreg,0.1173603957,valid,graphconvreg,0.0762376225,time_for_running,2608.3014204502
0,tox21,scaffold,classification,train,tf,0.8626085326,valid,tf,0.7030201614,time_for_running,63.5685660839
0,tox21,scaffold,classification,train,tf_robust,0.8608722489,valid,tf_robust,0.7100530015,time_for_running,101.614424944
0,tox21,scaffold,classification,train,logreg,0.9004137009,valid,logreg,0.650190286,time_for_running,60.018599987
@@ -66,3 +70,5 @@
0,pcba,scaffold,classification,train,graphconv,0.8743221913,valid,graphconv,0.8166550236,time_for_running,14184.1512611
0,delaney,scaffold,regression,train,tf_regression,0.7893516465,valid,tf_regression,0.4218847009,time_for_running,35.2720739841
0,delaney,scaffold,regression,train,graphconvreg,0.992822139,valid,graphconvreg,0.5578625785,time_for_running,100.2594189644
0,nci,scaffold,regression,train,tf_regression,0.1804114788,valid,tf_regression,0.0517622004,time_for_running,1294.0389139652
0,nci,scaffold,regression,train,graphconvreg,0.1309050281,valid,graphconvreg,0.0458034089,time_for_running,2546.8053176403