Commit 1d0f96e0 authored by Joseph Gomes
Browse files

Merge branch 'master' of https://github.com/deepchem/deepchem into gdb7_update

parents 1ac89d52 6df39d4e
Loading
Loading
Loading
Loading
+15 −1
Original line number Diff line number Diff line
@@ -260,8 +260,18 @@ Scaffold splitting
|pdbbind(core)   |MT-NN regression    |Random      |0.973         |0.494         |
|pdbbind(refined)|MT-NN regression    |Random      |0.987         |0.503         |
|pdbbind(full)   |MT-NN regression    |Random      |0.983         |0.528         |
|chembl          |MT-NN regression    |Index       |0.443         |0.427         |
|                |MT-NN regression    |Random      |0.464         |0.434         |
|                |MT-NN regression    |Scaffold    |0.484         |0.361         |
|gdb7            |MT-NN regression    |Index       |0.961         |0.011         |
|                |MT-NN regression    |Random      |0.742         |0.732         |
|kaggle          |MT-NN regression    |User-defined|0.748         |0.452         |

|Dataset         |Model               |Splitting   |Train score/MAE(kcal/mol)|Valid score/MAE(kcal/mol)|
|----------------|--------------------|------------|-------------------------|-------------------------|
|gdb7            |MT-NN regression    |Index       |44.5                     |185.6                    |
|                |MT-NN regression    |Random      |86.1                     |92.2                     |

* General features

Number of tasks and examples in the datasets
@@ -279,6 +289,8 @@ Number of tasks and examples in the datasets
|pdbbind(core)   |1          |195        |
|pdbbind(refined)|1          |3706       |
|pdbbind(full)   |1          |11908      |
|chembl(5thresh) |691        |23871      |
|gdb7            |1          |7165       |



@@ -313,6 +325,8 @@ Time needed for benchmark test(~20h in total)
|pdbbind(core)   |MT-NN regression    |0(featurized)   |30             |
|pdbbind(refined)|MT-NN regression    |0(featurized)   |40             |
|pdbbind(full)   |MT-NN regression    |0(featurized)   |60             |
|chembl          |MT-NN regression    |200             |9000           |
|gdb7            |MT-NN regression    |10              |110            |
|kaggle          |MT-NN regression    |2200            |3200           |


+1 −0
Original line number Diff line number Diff line
@@ -10,5 +10,6 @@ from deepchem.splits.splitters import *
from deepchem.splits.splitters import ScaffoldSplitter
from deepchem.splits.splitters import SpecifiedSplitter
from deepchem.splits.splitters import IndexSplitter
from deepchem.splits.splitters import IndiceSplitter
from deepchem.splits.task_splitter import merge_fold_datasets
from deepchem.splits.task_splitter import TaskSplitter
+39 −0
Original line number Diff line number Diff line
@@ -327,6 +327,45 @@ class IndexSplitter(Splitter):
    return (indices[:train_cutoff], indices[train_cutoff:valid_cutoff],
            indices[valid_cutoff:])

class IndiceSplitter(Splitter):
  """
  Class for splits based on user-specified validation and test indices.

  Every datapoint whose index is not listed in `valid_indices` or
  `test_indices` is assigned to the training set.
  """
  def __init__(self, verbose=False, valid_indices=None, test_indices=None):
    """
    Parameters
    -----------
    verbose: bool
        stored on the instance as `self.verbose`
    valid_indices: list of int
        indices of samples in the valid set (None means no valid samples)
    test_indices: list of int
        indices of samples in the test set (None means no test samples)
    """
    self.verbose = verbose
    self.valid_indices = valid_indices
    self.test_indices = test_indices

  def split(self, dataset, seed=None, frac_train=.8, frac_valid=.1,
            frac_test=.1, log_every_n=None):
    """
    Splits internal compounds into train/validation/test in designated order.

    Only `len(dataset)` is used; `seed`, `frac_train`, `frac_valid`,
    `frac_test` and `log_every_n` are accepted for signature compatibility
    with the other splitters and are ignored.

    Returns
    -------
    tuple
        (train_indices, valid_indices, test_indices), where train_indices is
        an ascending list of every index in range(len(dataset)) that appears
        in neither valid_indices nor test_indices.
    """
    # Normalise missing index lists so the return value is always iterable.
    if self.valid_indices is None:
      self.valid_indices = []
    if self.test_indices is None:
      self.test_indices = []

    # Build the exclusion set once: O(1) membership tests, and no mutation of
    # a list while it is being iterated (which silently skips elements).
    held_out = set(self.valid_indices)
    held_out.update(self.test_indices)

    num_datapoints = len(dataset)
    train_indices = [
        index for index in range(num_datapoints) if index not in held_out
    ]

    return (train_indices, self.valid_indices, self.test_indices)


class ScaffoldSplitter(Splitter):
  """
+1 −0
Original line number Diff line number Diff line
@@ -104,6 +104,7 @@ class NormalizationTransformer(Transformer):
      y_means, y_stds = dataset.get_statistics(X_stats=False, y_stats=True)
      self.y_means = y_means 
      # Control for pathological case with no variance.
      y_stds = np.array(y_stds)
      y_stds[y_stds == 0] = 1.
      self.y_stds = y_stds
    self.transform_gradients = transform_gradients
+2 −0
Original line number Diff line number Diff line
@@ -108,6 +108,8 @@ def load_from_disk(filename):
    except KeyError:
      # Try older joblib version for legacy files.
      return old_joblib.load(filename)
    except ValueError:
      return old_joblib.load(filename)
  elif os.path.splitext(name)[1] == ".csv":
    # First line of user-specified CSV *must* be header.
    df = pd.read_csv(filename, header=0)
Loading