Commit 4a4ad255 authored by nd-02110114's avatar nd-02110114
Browse files

📝 update splitter and transformer

parent 5dbf3b0a
Loading
Loading
Loading
Loading
+20 −20
Original line number Diff line number Diff line
@@ -398,15 +398,15 @@ class RandomGroupSplitter(Splitter):
      An array indicating the group of each item.
      The length is equal to `len(dataset.X)`

    Notes
    -----
    Note
    ----
    Examples of groups are the following.

    groups    : 3 2 2 0 1 1 2 4 3
    dataset.X : 0 1 2 3 4 5 6 7 8
    | groups    : 3 2 2 0 1 1 2 4 3
    | dataset.X : 0 1 2 3 4 5 6 7 8

    groups    : a b b e q x a a r
    dataset.X : 0 1 2 3 4 5 6 7 8
    | groups    : a b b e q x a a r
    | dataset.X : 0 1 2 3 4 5 6 7 8
    """
    self.groups = groups

@@ -488,8 +488,8 @@ class RandomStratifiedSplitter(Splitter):
  sparse multitask datasets it usually manages to produce a fairly accurate
  division of the actives for each task.

  Notes
  -----
  Note
  ----
  This splitter is primarily designed for boolean labeled data. It considers
  only whether a label is zero or non-zero. When labels can take on multiple
  non-zero values, it does not try to give each split a proportional fraction
@@ -873,8 +873,8 @@ class MolecularWeightSplitter(Splitter):
  """
  Class for doing data splits by molecular weight.

  Notes
  -----
  Note
  ----
  This class requires RDKit to be installed.
  """

@@ -946,8 +946,8 @@ class MaxMinSplitter(Splitter):
  Furthermore, the validation set is comprised of diverse compounds under
  the test set.

  Notes
  -----
  Note
  ----
  This class requires RDKit to be installed.
  """

@@ -1044,8 +1044,8 @@ class ButinaSplitter(Splitter):
  """Class for doing data splits based on the butina clustering of a bulk tanimoto
  fingerprint matrix.

  Notes
  -----
  Note
  ----
  This class requires RDKit to be installed.
  """

@@ -1166,8 +1166,8 @@ def _generate_scaffold(smiles: str, include_chirality: bool = False) -> str:
  .. [1] Bemis, Guy W., and Mark A. Murcko. "The properties of known drugs.
     1. Molecular frameworks." Journal of medicinal chemistry 39.15 (1996): 2887-2893.

  Notes
  -----
  Note
  ----
  This function requires RDKit to be installed.
  """
  try:
@@ -1184,8 +1184,8 @@ def _generate_scaffold(smiles: str, include_chirality: bool = False) -> str:
class ScaffoldSplitter(Splitter):
  """Class for doing data splits based on the scaffold of small molecules.

  Notes
  -----
  Note
  ----
  This class requires RDKit to be installed.
  """

@@ -1285,8 +1285,8 @@ class FingerprintSplitter(Splitter):
  """Class for doing data splits based on the fingerprints of small
  molecules. This is an O(N**2) algorithm.

  Notes
  -----
  Note
  ----
  This class requires RDKit to be installed.
  """

+59 −24
Original line number Diff line number Diff line
@@ -306,7 +306,8 @@ class MinMaxTransformer(Transformer):

  Raises
  ------
  `ValueError` if `transform_X` and `transform_y` are both set.
  ValueError
    if `transform_X` and `transform_y` are both set.
  """

  def __init__(self, transform_X=False, transform_y=False, dataset=None):
@@ -454,7 +455,8 @@ class NormalizationTransformer(Transformer):

  Raises
  ------
  `ValueError` if `transform_X` and `transform_y` are both set.
  ValueError
    if `transform_X` and `transform_y` are both set.
  """

  def __init__(self,
@@ -659,7 +661,8 @@ class ClippingTransformer(Transformer):

    Raises
    ------
    `ValueError` if `transform_w` is set.
    ValueError
      if `transform_w` is set.
    """
    super(ClippingTransformer, self).__init__(
        transform_X=transform_X, transform_y=transform_y, dataset=dataset)
@@ -737,8 +740,8 @@ class LogTransformer(Transformer):

  Raises
  ------
  `ValueError` if `transform_w` is set or `transform_X` and `transform_y` are
  both set.
  ValueError
    if `transform_w` is set or `transform_X` and `transform_y` are both set.
  """

  def __init__(self,
@@ -901,8 +904,8 @@ class BalancingTransformer(Transformer):

  Raises
  ------
  `ValueError` if `transform_X` or `transform_y` are set. Also raises
  `ValueError` if `y` or `w` aren't of shape `(N,)` or `(N, n_tasks)`.
  ValueError
    if `transform_X` or `transform_y` are set, or if `y` or `w` aren't of
    shape `(N,)` or `(N, n_tasks)`.
  """

  def __init__(self, dataset: Dataset):
@@ -1026,7 +1029,6 @@ class CDFTransformer(Transformer):
  >>> dataset = dc.data.NumpyDataset(X, y)
  >>> cdftrans = dc.trans.CDFTransformer(transform_y=True, dataset=dataset, bins=n_bins)
  >>> dataset = cdftrans.transform(dataset)

  """

  def __init__(self,
@@ -1178,7 +1180,6 @@ class PowerTransformer(Transformer):
  >>> dataset = dc.data.NumpyDataset(X, y)
  >>> trans = dc.trans.PowerTransformer(transform_y=True, dataset=dataset, powers=powers)
  >>> dataset = trans.transform(dataset)

  """

  def __init__(self,
@@ -1265,9 +1266,8 @@ class PowerTransformer(Transformer):
class CoulombFitTransformer(Transformer):
  """Performs randomization and binarization operations on batches of Coulomb Matrix features during fit.

  Example
  -------

  Examples
  --------
  >>> n_samples = 10
  >>> n_features = 3
  >>> n_tasks = 1
@@ -1288,8 +1288,8 @@ class CoulombFitTransformer(Transformer):

    Parameters
    ----------
    dataset: dc.data.Dataset object

    dataset: dc.data.Dataset
      Dataset object to be transformed.
    """
    X = dataset.X
    num_atoms = X.shape[1]
@@ -1342,7 +1342,6 @@ class CoulombFitTransformer(Transformer):
    -------
    X: np.ndarray
      Normalized features

    """
    return (X - self.mean) / self.std

@@ -1577,6 +1576,7 @@ class IRVTransformer(Transformer):

    Returns
    -------
    DiskDataset or NumpyDataset
      `Dataset` object that is transformed.
    """
    X_length = dataset.X.shape[0]
@@ -1674,6 +1674,7 @@ class DAGTransformer(Transformer):

    Returns
    -------
    List
      List of parent adjacency matrices
    """
    # list of calculation orders for DAGs
@@ -2068,6 +2069,11 @@ class DataTransforms(object):
      Height of the images
    w: int
      Width of the images

    Returns
    -------
    PIL.Image.Image
      The scaled image.
    """
    from PIL import Image
    return Image.fromarray(self.Image).resize((h, w))
@@ -2079,6 +2085,11 @@ class DataTransforms(object):
    ----------
    direction: str
      "lr" denotes left-right flip and "ud" denotes up-down flip.

    Returns
    -------
    np.ndarray
      The flipped image.
    """
    if direction == "lr":
      return np.fliplr(self.Image)
@@ -2099,7 +2110,8 @@ class DataTransforms(object):

    Returns
    -------
    The rotated input array
    np.ndarray
      The rotated image.
    """
    return scipy.ndimage.rotate(self.Image, angle)

@@ -2110,6 +2122,11 @@ class DataTransforms(object):
    ----------
    sigma: float
      Std dev. of the gaussian distribution

    Returns
    -------
    np.ndarray
      The gaussian-filtered (blurred) image.
    """
    return scipy.ndimage.gaussian_filter(self.Image, sigma)

@@ -2125,8 +2142,8 @@ class DataTransforms(object):

    Returns
    -------
    The center cropped input array

    np.ndarray
      The center cropped image.
    """
    y = self.Image.shape[0]
    x = self.Image.shape[1]
@@ -2150,7 +2167,8 @@ class DataTransforms(object):

    Returns
    -------
    The cropped input array
    np.ndarray
      The cropped image.
    """
    y = self.Image.shape[0]
    x = self.Image.shape[1]
@@ -2161,6 +2179,7 @@ class DataTransforms(object):

    Returns
    -------
    np.ndarray
      The grayscale image.
    """
    return np.dot(self.Image[..., :3], [0.2989, 0.5870, 0.1140])
@@ -2180,6 +2199,11 @@ class DataTransforms(object):
      ‘constant’
    order: int
      The order of the spline interpolation, default is 3. The order has to be in the range 0-5.

    Returns
    -------
    np.ndarray
      The shifted image.
    """
    if len(self.Image.shape) == 2:
      return scipy.ndimage.shift(
@@ -2197,6 +2221,11 @@ class DataTransforms(object):
      Mean of gaussian.
    std: float
      Standard deviation of gaussian.

    Returns
    -------
    np.ndarray
      The image with gaussian noise added.
    """

    x = self.Image
@@ -2214,6 +2243,11 @@ class DataTransforms(object):
      value of salt noise.
    pepper: float
      value of pepper noise.

    Returns
    -------
    np.ndarray
      The image with salt and pepper noise added.
    """

    noise = np.random.random(size=self.Image.shape)
@@ -2232,6 +2266,7 @@ class DataTransforms(object):

    Returns
    -------
    np.ndarray
      The median filtered image.
    """
    from PIL import Image, ImageFilter
+44 −17
Original line number Diff line number Diff line
@@ -15,78 +15,105 @@ learning models more rigorously than standard deep models since we're
looking for the ability to generalize to new domains. Some of the
implemented splitters here may help.

Splitter
--------
The :code:`dc.splits.Splitter` class is the abstract parent class for
all splitters. This class should never be directly instantiated.
.. contents:: Contents
    :local:

.. autoclass:: deepchem.splits.Splitter
  :members:
General Splitters
-----------------

RandomSplitter
--------------
^^^^^^^^^^^^^^

.. autoclass:: deepchem.splits.RandomSplitter
  :members:
  :inherited-members:
  :exclude-members: __init__

IndexSplitter
-------------
^^^^^^^^^^^^^

.. autoclass:: deepchem.splits.IndexSplitter
  :members:
  :inherited-members:
  :exclude-members: __init__

SpecifiedSplitter
-----------------
^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.splits.SpecifiedSplitter
  :members:
  :inherited-members:


RandomGroupSplitter
-------------------
^^^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.splits.RandomGroupSplitter
  :members:
  :inherited-members:

RandomStratifiedSplitter
------------------------
^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.splits.RandomStratifiedSplitter
  :members:
  :inherited-members:
  :exclude-members: __init__

SingletaskStratifiedSplitter
----------------------------
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.splits.SingletaskStratifiedSplitter
  :members:
  :inherited-members:

TaskSplitter
^^^^^^^^^^^^

.. autoclass:: deepchem.splits.TaskSplitter
  :members:
  :inherited-members:


Molecule Splitters
------------------

MolecularWeightSplitter
-----------------------
^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.splits.MolecularWeightSplitter
  :members:
  :inherited-members:
  :exclude-members: __init__

MaxMinSplitter
--------------
^^^^^^^^^^^^^^

.. autoclass:: deepchem.splits.MaxMinSplitter
  :members:
  :inherited-members:
  :exclude-members: __init__

ButinaSplitter
--------------
^^^^^^^^^^^^^^

.. autoclass:: deepchem.splits.ButinaSplitter
  :members:
  :inherited-members:

ScaffoldSplitter
----------------
^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.splits.ScaffoldSplitter
  :members:
  :inherited-members:
  :exclude-members: __init__

FingerprintSplitter
-------------------
^^^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.splits.FingerprintSplitter
  :members:
  :inherited-members:
  :exclude-members: __init__
+46 −37
Original line number Diff line number Diff line
@@ -8,101 +8,110 @@ distribution. Real data of course is wild and hard to control. What do
you do if you have a crazy dataset and need to bring its statistics to
heel? Fear not for you have :code:`Transformer` objects.

Transformer
-----------
The :code:`dc.trans.Transformer` class is the abstract parent class
for all transformers. This class should never be directly initialized,
but contains a number of useful method implementations.
.. contents:: Contents
    :local:

.. autoclass:: deepchem.trans.Transformer
  :members:

General Transformers
--------------------

MinMaxTransformer
-----------------
^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.trans.MinMaxTransformer
  :members:
  :inherited-members:

NormalizationTransformer
------------------------
^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.trans.NormalizationTransformer
  :members:
  :inherited-members:

ClippingTransformer
-------------------
^^^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.trans.ClippingTransformer
  :members:
  :inherited-members:

LogTransformer
--------------
^^^^^^^^^^^^^^

.. autoclass:: deepchem.trans.LogTransformer
  :members:
  :inherited-members:

BalancingTransformer
--------------------
^^^^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.trans.BalancingTransformer
  :members:
  :inherited-members:

DuplicateBalancingTransformer
-----------------------------
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.trans.DuplicateBalancingTransformer
  :members:
  :inherited-members:

CDFTransformer
--------------
^^^^^^^^^^^^^^

.. autoclass:: deepchem.trans.CDFTransformer
  :members:
  :inherited-members:

PowerTransformer
----------------
^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.trans.PowerTransformer
  :members:
  :inherited-members:

ImageTransformer
^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.trans.ImageTransformer
  :members:
  :inherited-members:

FeaturizationTransformer
^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.trans.FeaturizationTransformer
  :members:
  :inherited-members:

Specified Usecase Transformers
------------------------------

CoulombFitTransformer
---------------------
^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.trans.CoulombFitTransformer
  :members:
  :inherited-members:

IRVTransformer
--------------
^^^^^^^^^^^^^^

.. autoclass:: deepchem.trans.IRVTransformer
  :members:
  :inherited-members:

DAGTransformer
--------------
^^^^^^^^^^^^^^

.. autoclass:: deepchem.trans.DAGTransformer
  :members:

ImageTransformer
----------------

.. autoclass:: deepchem.trans.ImageTransformer
  :members:
  :inherited-members:

ANITransformer
--------------
^^^^^^^^^^^^^^

.. autoclass:: deepchem.trans.ANITransformer
  :members:

FeaturizationTransformer
------------------------

.. autoclass:: deepchem.trans.FeaturizationTransformer
  :members:

DataTransforms
--------------

.. autoclass:: deepchem.trans.DataTransforms
  :members:
  :inherited-members: