Commit c0a17207 authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #336 from miaecle/temp

merge in nci dataset
parents eb8124e4 16ecf372
Loading
Loading
Loading
Loading
+29 −23
Original line number Diff line number Diff line
@@ -195,26 +195,26 @@ Random splitting

|Dataset    |Model               |Train score/ROC-AUC|Valid score/ROC-AUC|
|-----------|--------------------|-------------------|-------------------|
|tox21      |logistic regression |0.903              |0.741              |
|           |Multitask network   |0.846              |0.812              |
|           |robust MT-NN        |0.844              |0.793              |
|           |graph convolution   |0.872              |0.816              |
|muv        |logistic regression |0.961              |0.696              |
|           |Multitask network   |0.895              |0.740              |
|           |robust MT-NN        |0.914              |0.667              |
|           |graph convolution   |0.846              |0.776              |
|pcba       |logistic regression |0.807              |0.772              |
|           |Multitask network   |0.811              |0.787              |
|           |robust MT-NN        |0.809              |0.778              |
|           |graph convolution   |0.875              |0.844              |
|sider      |logistic regression |0.932              |0.628              |
|           |Multitask network   |0.779              |0.665              |
|           |robust MT-NN        |0.761              |0.621              |
|           |graph convolution   |0.706              |0.638              |
|toxcast    |logistic regression |0.737              |0.543              |
|           |Multitask network   |0.831              |0.684              |
|           |robust MT-NN        |0.814              |0.692              |
|           |graph convolution   |0.820              |0.692              |
|tox21      |logistic regression |0.903              |0.735              |
|           |Multitask network   |0.856              |0.783              |
|           |robust MT-NN        |0.855              |0.773              |
|           |graph convolution   |0.865              |0.827              |
|muv        |logistic regression |0.957              |0.719              |
|           |Multitask network   |0.902              |0.734              |
|           |robust MT-NN        |0.933              |0.732              |
|           |graph convolution   |0.860              |0.730              |
|pcba       |logistic regression |0.808              |0.776              |
|           |Multitask network   |0.811              |0.778              |
|           |robust MT-NN        |0.811              |0.771              |
|           |graph convolution   |0.872              |0.844              |
|sider      |logistic regression |0.929              |0.656              |
|           |Multitask network   |0.777              |0.655              |
|           |robust MT-NN        |0.804              |0.630              |
|           |graph convolution   |0.705              |0.618              |
|toxcast    |logistic regression |0.725              |0.586              |
|           |Multitask network   |0.836              |0.684              |
|           |robust MT-NN        |0.822              |0.681              |
|           |graph convolution   |0.820              |0.717              |

Scaffold splitting

@@ -246,11 +246,14 @@ Scaffold splitting
|Dataset    |Model               |Splitting   |Train score/R2|Valid score/R2|
|-----------|--------------------|------------|--------------|--------------|
|delaney    |MT-NN regression    |Index       |0.773         |0.574         |
|           |graphconv regression|Index       |0.964         |0.829         |
|           |graphconv regression|Index       |0.991         |0.825         |
|           |MT-NN regression    |Random      |0.769         |0.591         |
|           |graphconv regression|Random      |0.959         |0.821         |
|           |graphconv regression|Random      |0.996         |0.873         |
|           |MT-NN regression    |Scaffold    |0.782         |0.426         |
|           |graphconv regression|Scaffold    |0.976         |0.581         |
|           |graphconv regression|Scaffold    |0.994         |0.606         |
|nci        |MT-NN regression    |Index       |0.890         |0.890         |
|           |MT-NN regression    |Random      |0.891         |0.888         |
|           |MT-NN regression    |Scaffold    |0.912         |0.020         |
|kaggle     |MT-NN regression    |User-defined|0.748         |0.452         |

* General features
@@ -266,6 +269,7 @@ Number of tasks and examples in the datasets
|toxcast    |617        |8615       |
|delaney    |1          |1128       |
|kaggle     |15         |173065     |
|nci        |60         |1057371    |

Time needed for benchmark test (~20h in total)

@@ -292,6 +296,8 @@ Time needed for benchmark test(~20h in total)
|           |robust MT-NN        |80              |4000           |
|           |graph convolution   |80              |900            |
|delaney    |MT-NN regression    |10              |40             |
|           |graphconv regression|10              |40             |
|nci        |MT-NN regression    |2000            |30000          |
|kaggle     |MT-NN regression    |2200            |3200           |


+1 −1
Original line number Diff line number Diff line
@@ -110,7 +110,7 @@ def atom_features(atom, bool_id_feat=False):
         'Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn', 'H',    # H?
         'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr',
         'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']) +
        one_of_k_encoding(atom.GetDegree(), [0, 1, 2, 3, 4, 5, 6]) +
        one_of_k_encoding(atom.GetDegree(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +
        one_of_k_encoding_unk(atom.GetTotalNumHs(), [0, 1, 2, 3, 4]) +
        one_of_k_encoding_unk(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5, 6]) +
        [atom.GetFormalCharge(), atom.GetNumRadicalElectrons()] +
+3 −3
Original line number Diff line number Diff line
@@ -47,7 +47,7 @@ class ConvMol(object):
  Resorts order of atoms internally to be in order of increasing degree. Note
  that only heavy atoms (hydrogens excluded) are considered here.
  """
  def __init__(self, atom_features, adj_list, max_deg=6, min_deg=0):
  def __init__(self, atom_features, adj_list, max_deg=10, min_deg=0):
    """
    Parameters
    ----------
@@ -223,7 +223,7 @@ class ConvMol(object):

  # TODO(rbharath): Can this be removed?
  @staticmethod
  def get_null_mol(n_feat, max_deg=6, min_deg=0):
  def get_null_mol(n_feat, max_deg=10, min_deg=0):
    """Constructs a null molecules

    Get one molecule with one atom of each degree, with all the atoms 
@@ -243,7 +243,7 @@ class ConvMol(object):
    return ConvMol(atom_features, canon_adj_list)

  @staticmethod
  def agglomerate_mols(mols, max_deg=6, min_deg=0):
  def agglomerate_mols(mols, max_deg=10, min_deg=0):
    """Concatenates list of ConvMol's into one mol object that can be used to feed 
    into tensorflow placeholders. The indexing of the molecules is preserved during the
    combination, but the indexing of the atoms is greatly changed.
+9 −3
Original line number Diff line number Diff line
@@ -49,7 +49,12 @@ class TestMolGraphs(unittest.TestCase):
        # 0 atoms of degree 4
        # 0 atoms of degree 5
        # 0 atoms of degree 6
        np.array([[0, 0], [0, 0], [0, 4], [0, 0], [0, 0], [0, 0], [0,0]]))
        # 0 atoms of degree 7
        # 0 atoms of degree 8
        # 0 atoms of degree 9
        # 0 atoms of degree 10
        np.array([[0, 0], [0, 0], [0, 4], [0, 0], [0, 0], [0, 0], [0, 0], 
                  [0, 0], [0, 0], [0, 0], [0, 0]]))

  def test_get_atom_features(self):
    """Test that the atom features are computed properly."""
@@ -168,10 +173,11 @@ class TestMolGraphs(unittest.TestCase):

    # Check that atoms are only connected to themselves.
    assert np.array_equal(
        deg_adj_lists[6], [[6, 6, 6, 6, 6, 6]])
        deg_adj_lists[10], [[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]])
    assert np.array_equal(
        deg_adj_lists[1], [[1]])
    # Check that there's one atom of each degree.
    assert np.array_equal(
        null_mol.get_deg_slice(),
        [[0, 1], [1, 1], [2, 1], [3, 1], [4, 1], [5, 1], [6, 1]])
        [[0, 1], [1, 1], [2, 1], [3, 1], [4, 1], [5, 1], [6, 1],
         [7, 1], [8, 1], [9, 1], [10, 1]])
+5 −5
Original line number Diff line number Diff line
@@ -486,7 +486,7 @@ class TestOverfit(test_util.TensorFlowTestCase):
      classification_metric = dc.metrics.Metric(
          dc.metrics.accuracy_score)

      n_feat = 71
      n_feat = 75
      batch_size = 10
      graph_model = dc.nn.SequentialGraph(n_feat)
      graph_model.add(dc.nn.GraphConv(64, activation='relu'))
@@ -537,7 +537,7 @@ class TestOverfit(test_util.TensorFlowTestCase):
          dc.metrics.mean_squared_error,
          task_averager=np.mean)

      n_feat = 71
      n_feat = 75
      batch_size = 10
      graph_model = dc.nn.SequentialGraph(n_feat)
      graph_model.add(dc.nn.GraphConv(64, activation='relu'))
@@ -572,7 +572,7 @@ class TestOverfit(test_util.TensorFlowTestCase):
    K.set_session(sess)
    with g.as_default():
      n_tasks = 1
      n_feat = 71
      n_feat = 75
      max_depth = 4
      n_pos = 6
      n_neg = 4
@@ -637,7 +637,7 @@ class TestOverfit(test_util.TensorFlowTestCase):
    K.set_session(sess)
    with g.as_default():
      n_tasks = 1
      n_feat = 71
      n_feat = 75
      max_depth = 4
      n_pos = 6
      n_neg = 4
@@ -704,7 +704,7 @@ class TestOverfit(test_util.TensorFlowTestCase):
    K.set_session(sess)
    with g.as_default():
      n_tasks = 1
      n_feat = 71
      n_feat = 75
      max_depth = 4
      n_pos = 6
      n_neg = 4
Loading