Commit 93cbcaf7 authored by nd-02110114's avatar nd-02110114
Browse files

🐛 update type annotarions and docs

parent 7ea5ea02
Loading
Loading
Loading
Loading
+191 −105
Original line number Diff line number Diff line
@@ -5,7 +5,7 @@ import os
import logging
import time
import warnings
from typing import Optional, Tuple, List, Any
from typing import Any, List, Optional, Tuple, Union

import numpy as np
import scipy
@@ -14,6 +14,7 @@ import tensorflow as tf

import deepchem as dc
from deepchem.data import Dataset, NumpyDataset, DiskDataset
from deepchem.feat import Featurizer
from deepchem.feat.mol_graphs import ConvMol

logger = logging.getLogger(__name__)
@@ -145,7 +146,7 @@ class Transformer(object):
    raise NotImplementedError(
        "Each Transformer is responsible for its own transform_array method.")

  def untransform(self, z):
  def untransform(self, transformed):
    """Reverses stored transformation on provided data.

    Depending on whether `transform_X` or `transform_y` or `transform_w` was
@@ -154,12 +155,8 @@ class Transformer(object):

    Parameters
    ----------
    z: np.ndarray
    transformed: np.ndarray
      Array which was previously transformed by this class.

    Returns
    -------
    ztrans
    """
    raise NotImplementedError(
        "Each Transformer is responsible for its own untransform method.")
@@ -168,7 +165,7 @@ class Transformer(object):
                dataset: Dataset,
                parallel: bool = False,
                out_dir: Optional[str] = None,
                **kwargs):
                **kwargs) -> Dataset:
    """Transforms all internally stored data in dataset.

    This method transforms all internal data in the provided dataset by using
@@ -189,7 +186,8 @@ class Transformer(object):

    Returns
    -------
    a newly constructed Dataset object
    Dataset
      A newly transformed Dataset object
    """
    # Add this case in to handle non-DiskDataset that should be written to disk
    if out_dir is not None:
@@ -203,7 +201,9 @@ class Transformer(object):
      raise ValueError("Cannot transform w when w_values are not present")
    return dataset.transform(self, out_dir=out_dir, parallel=parallel)

  def transform_on_array(self, X, y, w, ids):
  def transform_on_array(
      self, X: np.ndarray, y: np.ndarray, w: np.ndarray,
      ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Transforms numpy arrays X, y, and w

    DEPRECATED. Use `transform_array` instead.
@@ -231,8 +231,8 @@ class Transformer(object):
      Transformed array of ids
    """
    warnings.warn(
        "transform_on_array() is deprecated and has been renamed to transform_array(). transform_on_array() will be removed in DeepChem 3.0",
        FutureWarning)
        "transform_on_array() is deprecated and has been renamed to transform_array()."
        "transform_on_array() will be removed in DeepChem 3.0", FutureWarning)
    X, y, w, ids = self.transform_array(X, y, w, ids)
    return X, y, w, ids

@@ -310,7 +310,10 @@ class MinMaxTransformer(Transformer):
    if `transform_X` and `transform_y` are both set.
  """

  def __init__(self, transform_X=False, transform_y=False, dataset=None):
  def __init__(self,
               transform_X: bool = False,
               transform_y: bool = False,
               dataset: Optional[Dataset] = None):
    """Initialization of MinMax transformer.

    Parameters
@@ -324,11 +327,10 @@ class MinMaxTransformer(Transformer):
    """
    if transform_X and transform_y:
      raise ValueError("Can only transform only one of X and y")
    if transform_X:
    if dataset is not None and transform_X:
      self.X_min = np.min(dataset.X, axis=0)
      self.X_max = np.max(dataset.X, axis=0)

    elif transform_y:
    elif dataset is not None and transform_y:
      self.y_min = np.min(dataset.y, axis=0)
      self.y_max = np.max(dataset.y, axis=0)

@@ -338,26 +340,9 @@ class MinMaxTransformer(Transformer):
    super(MinMaxTransformer, self).__init__(
        transform_X=transform_X, transform_y=transform_y, dataset=dataset)

  def transform(self, dataset, parallel=False):
    """Transforms the dataset.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset object to be transformed.
    parallel: bool, optional (default False)
      At present this argument is ignored.
    out_dir: str, optional
      If `out_dir` is specified in `kwargs` and `dataset` is a `DiskDataset`,
      the output dataset will be written to the specified directory.

    Returns
    -------
    a newly constructed Dataset object
    """
    return super(MinMaxTransformer, self).transform(dataset, parallel=parallel)

  def transform_array(self, X, y, w, ids):
  def transform_array(
      self, X: np.ndarray, y: np.ndarray, w: np.ndarray,
      ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Transform the data in a set of (X, y, w, ids) arrays.

    Parameters
@@ -396,14 +381,19 @@ class MinMaxTransformer(Transformer):
      y = np.nan_to_num((y - self.y_min) / denominator)
    return (X, y, w, ids)

  def untransform(self, z):
  def untransform(self, z: np.ndarray) -> np.ndarray:
    """
    Undo transformation on provided data.

    Parameters
    ----------
    z: np.ndarray,
    z: np.ndarray
      Transformed X or y array

    Returns
    -------
    np.ndarray
      Array with min-max scaling undone.
    """
    if self.transform_X:
      X_max = self.X_max
@@ -460,12 +450,12 @@ class NormalizationTransformer(Transformer):
  """

  def __init__(self,
               transform_X=False,
               transform_y=False,
               transform_w=False,
               dataset=None,
               transform_gradients=False,
               move_mean=True):
               transform_X: bool = False,
               transform_y: bool = False,
               transform_w: bool = False,
               dataset: Optional[Dataset] = None,
               transform_gradients: bool = False,
               move_mean: bool = True):
    """Initialize normalization transformation.

    Parameters
@@ -483,17 +473,17 @@ class NormalizationTransformer(Transformer):
      raise ValueError("Can only transform only one of X and y")
    if transform_w:
      raise ValueError("MinMaxTransformer doesn't support w transformation.")
    if transform_X:
    if dataset is not None and transform_X:
      X_means, X_stds = dataset.get_statistics(X_stats=True, y_stats=False)
      self.X_means = X_means
      self.X_stds = X_stds
    elif transform_y:
    elif dataset is not None and transform_y:
      y_means, y_stds = dataset.get_statistics(X_stats=False, y_stats=True)
      self.y_means = y_means
      # Control for pathological case with no variance.
      y_stds = np.array(y_stds)
      y_stds[y_stds == 0] = 1.
      self.y_stds = y_stds
      y_stds_np = np.array(y_stds)
      y_stds_np[y_stds_np == 0] = 1.
      self.y_stds = y_stds_np
    self.transform_gradients = transform_gradients
    self.move_mean = move_mean
    if self.transform_gradients:
@@ -507,11 +497,9 @@ class NormalizationTransformer(Transformer):
        transform_w=transform_w,
        dataset=dataset)

  def transform(self, dataset, parallel=False):
    return super(NormalizationTransformer, self).transform(
        dataset, parallel=parallel)

  def transform_array(self, X, y, w, ids):
  def transform_array(
      self, X: np.ndarray, y: np.ndarray, w: np.ndarray,
      ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Transform the data in a set of (X, y, w) arrays.

    Parameters
@@ -548,7 +536,7 @@ class NormalizationTransformer(Transformer):
        y = np.nan_to_num(y / self.y_stds)
    return (X, y, w, ids)

  def untransform(self, z):
  def untransform(self, z: np.ndarray) -> np.ndarray:
    """
    Undo transformation on provided data.

@@ -634,11 +622,11 @@ class ClippingTransformer(Transformer):
  """

  def __init__(self,
               transform_X=False,
               transform_y=False,
               dataset=None,
               x_max=5.,
               y_max=500.):
               transform_X: bool = False,
               transform_y: bool = False,
               dataset: Optional[Dataset] = None,
               x_max: float = 5.,
               y_max: float = 500.):
    """Initialize clipping transformation.

    Parameters
@@ -670,7 +658,9 @@ class ClippingTransformer(Transformer):
    self.x_max = x_max
    self.y_max = y_max

  def transform_array(self, X, y, w, ids):
  def transform_array(
      self, X: np.ndarray, y: np.ndarray, w: np.ndarray,
      ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Transform the data in a set of (X, y, w) arrays.

    Parameters
@@ -704,6 +694,7 @@ class ClippingTransformer(Transformer):
    return (X, y, w, ids)

  def untransform(self, z):
    """Not implemented."""
    raise NotImplementedError(
        "Cannot untransform datasets with ClippingTransformer.")

@@ -745,11 +736,11 @@ class LogTransformer(Transformer):
  """

  def __init__(self,
               transform_X=False,
               transform_y=False,
               features=None,
               tasks=None,
               dataset=None):
               transform_X: bool = False,
               transform_y: bool = False,
               features: Optional[List[int]] = None,
               tasks: Optional[List[str]] = None,
               dataset: Optional[Dataset] = None):
    """Initialize log transformer.

    Parameters
@@ -758,12 +749,12 @@ class LogTransformer(Transformer):
      Whether to transform X
    transform_y: bool, optional (default False)
      Whether to transform y
    dataset: dc.data.Dataset object, optional (default None)
      Dataset to be transformed
    features: list[Int]
      List of features indices to transform
    tasks: list[str]
      List of task names to transform.
    dataset: dc.data.Dataset object, optional (default None)
      Dataset to be transformed
    """
    if transform_X and transform_y:
      raise ValueError("Can only transform only one of X and y")
@@ -772,7 +763,9 @@ class LogTransformer(Transformer):
    super(LogTransformer, self).__init__(
        transform_X=transform_X, transform_y=transform_y, dataset=dataset)

  def transform_array(self, X, y, w, ids):
  def transform_array(
      self, X: np.ndarray, y: np.ndarray, w: np.ndarray,
      ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Transform the data in a set of (X, y, w) arrays.

    Parameters
@@ -819,7 +812,7 @@ class LogTransformer(Transformer):
            y[:, j] = y[:, j]
    return (X, y, w, ids)

  def untransform(self, z):
  def untransform(self, z: np.ndarray) -> np.ndarray:
    """
    Undo transformation on provided data.

@@ -827,6 +820,11 @@ class LogTransformer(Transformer):
    ----------
    z: np.ndarray,
      Transformed X or y array

    Returns
    -------
    np.ndarray
      Array with a logarithmic transformation undone.
    """
    if self.transform_X:
      num_features = len(z[0])
@@ -949,7 +947,9 @@ class BalancingTransformer(Transformer):
      weights.append(class_weights)
    self.weights = weights

  def transform_array(self, X, y, w, ids):
  def transform_array(
      self, X: np.ndarray, y: np.ndarray, w: np.ndarray,
      ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Transform the data in a set of (X, y, w) arrays.

    Parameters
@@ -1057,7 +1057,9 @@ class CDFTransformer(Transformer):
        raise ValueError("dataset must be specified when transforming y")
      self.y = dataset.y

  def transform_array(self, X, y, w, ids):
  def transform_array(
      self, X: np.ndarray, y: np.ndarray, w: np.ndarray,
      ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Performs CDF transform on data.

    Parameters
@@ -1091,7 +1093,7 @@ class CDFTransformer(Transformer):
      y_t = get_cdf_values(y, self.bins)
    return X_t, y_t, w_t, ids

  def untransform(self, z):
  def untransform(self, z: np.ndarray) -> np.ndarray:
    """Undo transformation on provided data.

    Note that this transformation is only undone for y.
@@ -1100,6 +1102,11 @@ class CDFTransformer(Transformer):
    ----------
    z: np.ndarray,
      Transformed y array

    Returns
    -------
    np.ndarray
      Array with the transformation undone.
    """
    # Need this for transform_y
    if self.transform_y:
@@ -1183,10 +1190,10 @@ class PowerTransformer(Transformer):
  """

  def __init__(self,
               transform_X=False,
               transform_y=False,
               dataset=None,
               powers=[1]):
               transform_X: bool = False,
               transform_y: bool = False,
               dataset: Optional[Dataset] = None,
               powers: List[int] = [1]):
    """Initialize this transformer

    Parameters
@@ -1205,7 +1212,9 @@ class PowerTransformer(Transformer):
        transform_X=transform_X, transform_y=transform_y)
    self.powers = powers

  def transform_array(self, X, y, w, ids):
  def transform_array(
      self, X: np.ndarray, y: np.ndarray, w: np.ndarray,
      ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Performs power transform on data.

    Parameters
@@ -1248,13 +1257,18 @@ class PowerTransformer(Transformer):
      X_t = X
    return (X_t, y_t, w_t, ids)

  def untransform(self, z):
  def untransform(self, z: np.ndarray) -> np.ndarray:
    """Undo transformation on provided data.

    Parameters
    ----------
    z: np.ndarray,
      Transformed y array

    Returns
    -------
    np.ndarray
      Array with the power transformation undone.
    """
    n_powers = len(self.powers)
    orig_len = (z.shape[1]) // n_powers
@@ -1283,7 +1297,7 @@ class CoulombFitTransformer(Transformer):
  12
  """

  def __init__(self, dataset):
  def __init__(self, dataset: Dataset):
    """Initializes CoulombFitTransformer.

    Parameters
@@ -1306,7 +1320,7 @@ class CoulombFitTransformer(Transformer):
    self.std = (X - self.mean).std()
    super(CoulombFitTransformer, self).__init__(transform_X=True)

  def realize(self, X):
  def realize(self, X: np.ndarray) -> np.ndarray:
    """Randomize features.

    Parameters
@@ -1330,7 +1344,7 @@ class CoulombFitTransformer(Transformer):

    return np.array([_realize_(z) for z in X])

  def normalize(self, X):
  def normalize(self, X: np.ndarray) -> np.ndarray:
    """Normalize features.

    Parameters
@@ -1345,7 +1359,7 @@ class CoulombFitTransformer(Transformer):
    """
    return (X - self.mean) / self.std

  def expand(self, X):
  def expand(self, X: np.ndarray) -> np.ndarray:
    """Binarize features.

    Parameters
@@ -1360,11 +1374,11 @@ class CoulombFitTransformer(Transformer):
    """
    Xexp = []
    for i in range(X.shape[1]):
      for k in np.arange(0, self.max[i] + self.step, self.step):
      for k in np.arange(0, self.max[i] + self.step, self.step):  # type: ignore
        Xexp += [np.tanh((X[:, i] - k) / self.step)]
    return np.array(Xexp).T

  def X_transform(self, X):
  def X_transform(self, X: np.ndarray) -> np.ndarray:
    """Perform Coulomb Fit transform on features.

    Parameters
@@ -1381,11 +1395,38 @@ class CoulombFitTransformer(Transformer):
    X = self.normalize(self.expand(self.realize(X)))
    return X

  def transform_array(self, X, y, w, ids):
  def transform_array(
      self, X: np.ndarray, y: np.ndarray, w: np.ndarray,
      ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Performs randomization and binarization operations on data.

    Parameters
    ----------
    X: np.ndarray
      Array of features
    y: np.ndarray
      Array of labels
    w: np.ndarray
      Array of weights.
    ids: np.ndarray
      Array of identifiers.

    Returns
    -------
    Xtrans: np.ndarray
      Transformed array of features
    ytrans: np.ndarray
      Transformed array of labels
    wtrans: np.ndarray
      Transformed array of weights
    idstrans: np.ndarray
      Transformed array of ids
    """
    X = self.X_transform(X)
    return (X, y, w, ids)

  def untransform(self, z):
    "Not implemented."
    raise NotImplementedError(
        "Cannot untransform datasets with FitTransformer.")

@@ -1425,7 +1466,7 @@ class IRVTransformer(Transformer):
  This class requires TensorFlow to be installed.
  """

  def __init__(self, K, n_tasks, dataset):
  def __init__(self, K: int, n_tasks: int, dataset: Dataset):
    """Initializes IRVTransformer.

    Parameters
@@ -1444,7 +1485,8 @@ class IRVTransformer(Transformer):
    self.w = dataset.w
    super(IRVTransformer, self).__init__(transform_X=True)

  def realize(self, similarity, y, w):
  def realize(self, similarity: np.ndarray, y: np.ndarray,
              w: np.ndarray) -> List:
    """find samples with top ten similarity values in the reference dataset

    Parameters
@@ -1478,25 +1520,26 @@ class IRVTransformer(Transformer):
      top_label = tf.gather(y, indice)
      values.append(value)
      top_labels.append(top_label)
    values = np.concatenate(values, axis=0)
    top_labels = np.concatenate(top_labels, axis=0)
    values_np = np.concatenate(values, axis=0)
    top_labels_np = np.concatenate(top_labels, axis=0)
    # concatenate batches of data together
    for count in range(values.shape[0]):
      if values[count, 0] == 1:
    for count in range(values_np.shape[0]):
      if values_np[count, 0] == 1:
        features.append(
            np.concatenate([
                values[count, 1:(self.K + 1)], top_labels[count, 1:(self.K + 1)]
                values_np[count, 1:(self.K + 1)],
                top_labels_np[count, 1:(self.K + 1)]
            ]))
        # highest similarity is 1: target is in the reference
        # use the following K points
      else:
        features.append(
            np.concatenate(
                [values[count, 0:self.K], top_labels[count, 0:self.K]]))
                [values_np[count, 0:self.K], top_labels_np[count, 0:self.K]]))
        # highest less than 1: target not in the reference, use top K points
    return features

  def X_transform(self, X_target):
  def X_transform(self, X_target: np.ndarray) -> np.ndarray:
    """ Calculate similarity between target dataset(X_target) and
    reference dataset(X): #(1 in intersection)/#(1 in union)

@@ -1562,7 +1605,11 @@ class IRVTransformer(Transformer):
      del result
    return all_result

  def transform(self, dataset, parallel=False, out_dir=None, **kwargs):
  def transform(self,
                dataset: Dataset,
                parallel: bool = False,
                out_dir: Optional[str] = None,
                **kwargs) -> Union[DiskDataset, NumpyDataset]:
    """Transforms a given dataset

    Parameters
@@ -1592,6 +1639,7 @@ class IRVTransformer(Transformer):
        X_trans, dataset.y, dataset.w, data_dir=out_dir)

  def untransform(self, z):
    "Not implemented."
    raise NotImplementedError(
        "Cannot untransform datasets with IRVTransformer.")

@@ -1618,7 +1666,7 @@ class DAGTransformer(Transformer):
  >>> dataset = trans.transform(dataset)
  """

  def __init__(self, max_atoms=50):
  def __init__(self, max_atoms: int = 50):
    """Initializes DAGTransformer.

    Parameters
@@ -1661,6 +1709,7 @@ class DAGTransformer(Transformer):
    return (X, y, w, ids)

  def untransform(self, z):
    "Not implemented."
    raise NotImplementedError(
        "Cannot untransform datasets with DAGTransformer.")

@@ -1768,18 +1817,53 @@ class DAGTransformer(Transformer):


class ImageTransformer(Transformer):
  """Convert an image into width, height, channel

  Note
  ----
  This class require Pillow to be installed.
  """
  Convert an image into width, height, channel
  """

  def __init__(self, size):
    """Initializes transformation based on dataset statistics."""
  def __init__(self, size: Tuple[int]):
    """Initializes transformation based on dataset statistics.
    
    Parameters
    ----------
    size: int
      The image size
    """
    self.size = size
    super(ImageTransformer, self).__init__(transform_X=True)

  def transform_array(self, X, y, w):
    """Transform the data in a set of (X, y, w) arrays."""
    """Transform the data in a set of (X, y, w, ids) arrays.

    Parameters
    ----------
    X: np.ndarray
      Array of features
    y: np.ndarray
      Array of labels
    w: np.ndarray
      Array of weights.
    ids: np.ndarray
      Array of identifiers.

    Returns
    -------
    Xtrans: np.ndarray
      Transformed array of features
    ytrans: np.ndarray
      Transformed array of labels
    wtrans: np.ndarray
      Transformed array of weights
    idstrans: np.ndarray
      Transformed array of ids
    """
    try:
      from PIL import Image
    except ModuleNotFoundError:
      raise ImportError("This function requires Pillow to be installed.")
    images = [scipy.ndimage.imread(x, mode='RGB') for x in X]
    images = [Image.fromarray(x).resize(self.size) for x in images]
    return np.array(images), y, w
@@ -1999,14 +2083,16 @@ class FeaturizationTransformer(Transformer):
  >>> dataset = trans.transform(dataset)
  """

  def __init__(self, dataset=None, featurizer=None):
  def __init__(self,
               dataset: Optional[Dataset] = None,
               featurizer: Optional[Featurizer] = None):
    """Initialization of FeaturizationTransformer

    Parameters
    ----------
    dataset: dc.data.Dataset object, optional (default None)
      Dataset to be transformed
    featurizer: dc.feat.Featurizer object
    featurizer: dc.feat.Featurizer object, optional (default None)
      Featurizer applied to perform transformations.
    """
    if featurizer is None:
+10 −2
Original line number Diff line number Diff line
@@ -109,11 +109,19 @@ ButinaSplitter
  :members:
  :inherited-members:

FingeprintSplitter
^^^^^^^^^^^^^^^^^^
FingerprintSplitter
^^^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.splits.FingerprintSplitter
  :members:
  :inherited-members:
  :exclude-members: __init__

Base Splitters (for develop)
----------------------------

The :code:`dc.splits.Splitter` class is the abstract parent class for
all splitters. This class should never be directly instantiated.

.. autoclass:: deepchem.splits.Splitter
  :members:
+10 −1
Original line number Diff line number Diff line
@@ -11,7 +11,6 @@ heel? Fear not for you have :code:`Transformer` objects.
.. contents:: Contents
    :local:


General Transformers
--------------------

@@ -115,3 +114,13 @@ ANITransformer
.. autoclass:: deepchem.trans.ANITransformer
  :members:
  :inherited-members:

Base Transformer (for develop)
-------------------------------

The :code:`dc.trans.Transformer` class is the abstract parent class
for all transformers. This class should never be directly initialized,
but contains a number of useful method implementations.

.. autoclass:: deepchem.trans.Transformer
  :members: