Commit c8083f04 authored by TaranSinghania's avatar TaranSinghania
Browse files

Added Data Augmentation Support

parent e604b362
Loading
Loading
Loading
Loading
+64 −33
Original line number Diff line number Diff line
@@ -40,8 +40,8 @@ class TestTransformers(unittest.TestCase):
  def test_y_log_transformer(self):
    """Tests logarithmic data transformer."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    log_transformer = dc.trans.LogTransformer(
        transform_y=True, dataset=solubility_dataset)
    log_transformer = dc.trans.LogTransformer(transform_y=True,
                                              dataset=solubility_dataset)
    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                    solubility_dataset.w, solubility_dataset.ids)
    solubility_dataset = log_transformer.transform(solubility_dataset)
@@ -72,14 +72,14 @@ class TestTransformers(unittest.TestCase):
      dc.trans.transformers.Transformer(transform_w=True).transform(ul_dataset)

    # transforming X should be okay
    dc.trans.NormalizationTransformer(
        transform_X=True, dataset=ul_dataset).transform(ul_dataset)
    dc.trans.NormalizationTransformer(transform_X=True,
                                      dataset=ul_dataset).transform(ul_dataset)

  def test_X_log_transformer(self):
    """Tests logarithmic data transformer."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    log_transformer = dc.trans.LogTransformer(
        transform_X=True, dataset=solubility_dataset)
    log_transformer = dc.trans.LogTransformer(transform_X=True,
                                              dataset=solubility_dataset)
    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                    solubility_dataset.w, solubility_dataset.ids)
    solubility_dataset = log_transformer.transform(solubility_dataset)
@@ -112,8 +112,9 @@ class TestTransformers(unittest.TestCase):
      tiid = dfe.columns.get_loc(task) - dfe.columns.get_loc(first_task)
      tid = np.concatenate((tid, np.array([tiid])))
    tasks = tid.astype(int)
    log_transformer = dc.trans.LogTransformer(
        transform_y=True, tasks=tasks, dataset=multitask_dataset)
    log_transformer = dc.trans.LogTransformer(transform_y=True,
                                              tasks=tasks,
                                              dataset=multitask_dataset)
    X, y, w, ids = (multitask_dataset.X, multitask_dataset.y,
                    multitask_dataset.w, multitask_dataset.ids)
    multitask_dataset = log_transformer.transform(multitask_dataset)
@@ -146,8 +147,9 @@ class TestTransformers(unittest.TestCase):
      fiid = dfe.columns.get_loc(feature) - dfe.columns.get_loc(first_feature)
      fid = np.concatenate((fid, np.array([fiid])))
    features = fid.astype(int)
    log_transformer = dc.trans.LogTransformer(
        transform_X=True, features=features, dataset=multitask_dataset)
    log_transformer = dc.trans.LogTransformer(transform_X=True,
                                              features=features,
                                              dataset=multitask_dataset)
    X, y, w, ids = (multitask_dataset.X, multitask_dataset.y,
                    multitask_dataset.w, multitask_dataset.ids)
    multitask_dataset = log_transformer.transform(multitask_dataset)
@@ -170,8 +172,8 @@ class TestTransformers(unittest.TestCase):
  def test_y_minmax_transformer(self):
    """Tests MinMax transformer. """
    solubility_dataset = dc.data.tests.load_solubility_data()
    minmax_transformer = dc.trans.MinMaxTransformer(
        transform_y=True, dataset=solubility_dataset)
    minmax_transformer = dc.trans.MinMaxTransformer(transform_y=True,
                                                    dataset=solubility_dataset)
    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                    solubility_dataset.w, solubility_dataset.ids)
    solubility_dataset = minmax_transformer.transform(solubility_dataset)
@@ -203,8 +205,8 @@ class TestTransformers(unittest.TestCase):
    y = np.random.randn(n_samples, n_tasks)
    dataset = dc.data.NumpyDataset(X, y)

    minmax_transformer = dc.trans.MinMaxTransformer(
        transform_y=True, dataset=dataset)
    minmax_transformer = dc.trans.MinMaxTransformer(transform_y=True,
                                                    dataset=dataset)
    w, ids = dataset.w, dataset.ids

    dataset = minmax_transformer.transform(dataset)
@@ -230,8 +232,8 @@ class TestTransformers(unittest.TestCase):

  def test_X_minmax_transformer(self):
    solubility_dataset = dc.data.tests.load_solubility_data()
    minmax_transformer = dc.trans.MinMaxTransformer(
        transform_X=True, dataset=solubility_dataset)
    minmax_transformer = dc.trans.MinMaxTransformer(transform_X=True,
                                                    dataset=solubility_dataset)
    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                    solubility_dataset.w, solubility_dataset.ids)
    solubility_dataset = minmax_transformer.transform(solubility_dataset)
@@ -317,8 +319,9 @@ class TestTransformers(unittest.TestCase):
    target = np.transpose(np.array(np.append([target], [target], axis=0)))
    gaussian_dataset = dc.data.tests.load_gaussian_cdf_data()
    bins = 1001
    cdf_transformer = dc.trans.CDFTransformer(
        transform_X=True, dataset=gaussian_dataset, bins=bins)
    cdf_transformer = dc.trans.CDFTransformer(transform_X=True,
                                              dataset=gaussian_dataset,
                                              bins=bins)
    X, y, w, ids = (gaussian_dataset.X, gaussian_dataset.y, gaussian_dataset.w,
                    gaussian_dataset.ids)
    gaussian_dataset = cdf_transformer.transform(gaussian_dataset, bins=bins)
@@ -342,8 +345,9 @@ class TestTransformers(unittest.TestCase):
    target = np.transpose(np.array(np.append([target], [target], axis=0)))
    gaussian_dataset = dc.data.tests.load_gaussian_cdf_data()
    bins = 1001
    cdf_transformer = dc.trans.CDFTransformer(
        transform_y=True, dataset=gaussian_dataset, bins=bins)
    cdf_transformer = dc.trans.CDFTransformer(transform_y=True,
                                              dataset=gaussian_dataset,
                                              bins=bins)
    X, y, w, ids = (gaussian_dataset.X, gaussian_dataset.y, gaussian_dataset.w,
                    gaussian_dataset.ids)
    gaussian_dataset = cdf_transformer.transform(gaussian_dataset, bins=bins)
@@ -420,8 +424,8 @@ class TestTransformers(unittest.TestCase):
    """Test Power transformer on Gaussian normal dataset."""
    gaussian_dataset = dc.data.tests.load_gaussian_cdf_data()
    powers = [1, 2, 0.5]
    power_transformer = dc.trans.PowerTransformer(
        transform_X=True, powers=powers)
    power_transformer = dc.trans.PowerTransformer(transform_X=True,
                                                  powers=powers)
    X, y, w, ids = (gaussian_dataset.X, gaussian_dataset.y, gaussian_dataset.w,
                    gaussian_dataset.ids)
    gaussian_dataset2 = power_transformer.transform(gaussian_dataset)
@@ -445,8 +449,8 @@ class TestTransformers(unittest.TestCase):
    """Test Power transformer on Gaussian normal dataset."""
    gaussian_dataset = dc.data.tests.load_gaussian_cdf_data()
    powers = [1, 2, 0.5]
    power_transformer = dc.trans.PowerTransformer(
        transform_y=True, powers=powers)
    power_transformer = dc.trans.PowerTransformer(transform_y=True,
                                                  powers=powers)
    X, y, w, ids = (gaussian_dataset.X, gaussian_dataset.y, gaussian_dataset.w,
                    gaussian_dataset.ids)
    gaussian_dataset2 = power_transformer.transform(gaussian_dataset)
@@ -497,8 +501,8 @@ class TestTransformers(unittest.TestCase):
      np.testing.assert_allclose(w_task[w_orig_task == 0],
                                 np.zeros_like(w_task[w_orig_task == 0]))
      # Check that sum of 0s equals sum of 1s in transformed for each task
      assert np.isclose(
          np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
      assert np.isclose(np.sum(w_task[y_task == 0]),
                        np.sum(w_task[y_task == 1]))

  def test_multitask_balancing_transformer(self):
    """Test balancing transformer on multitask dataset."""
@@ -525,8 +529,8 @@ class TestTransformers(unittest.TestCase):
      np.testing.assert_allclose(w_task[w_orig_task == 0],
                                 np.zeros_like(w_task[w_orig_task == 0]))
      # Check that sum of 0s equals sum of 1s in transformed for each task
      assert np.isclose(
          np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
      assert np.isclose(np.sum(w_task[y_task == 0]),
                        np.sum(w_task[y_task == 1]))

  def test_coulomb_fit_transformer(self):
    """Test coulomb fit transformer on singletask dataset."""
@@ -555,8 +559,7 @@ class TestTransformers(unittest.TestCase):
    y_test = np.zeros((test_samples, n_tasks))
    w_test = np.ones((test_samples, n_tasks))
    test_dataset = dc.data.NumpyDataset(X_test, y_test, w_test, ids=None)
    sims = np.sum(
        X_test[0, :] * X, axis=1, dtype=float) / np.sum(
    sims = np.sum(X_test[0, :] * X, axis=1, dtype=float) / np.sum(
        np.sign(X_test[0, :] + X), axis=1, dtype=float)
    sims = sorted(sims, reverse=True)
    IRV_transformer = dc.trans.IRVTransformer(10, n_tasks, dataset)
@@ -587,6 +590,26 @@ class TestTransformers(unittest.TestCase):
    check_blur = scipy.ndimage.gaussian_filter(self.d, 1.5)
    assert np.allclose(check_blur, blurred)

  def check_crop(self):
    """Verify DataTransforms.crop returns the centered window of the image."""
    # NOTE(review): this name lacks the `test_` prefix, so unittest discovery
    # never runs it — presumably it should be `test_crop`; confirm and rename.
    dt = DataTransforms(self.d)
    width_crop, height_crop = 50, 50
    cropped = dt.crop(width_crop, height_crop)
    rows, cols = self.d.shape[0], self.d.shape[1]
    # The expected crop is centered: integer-divide to find the top-left corner.
    col_start = cols // 2 - width_crop // 2
    row_start = rows // 2 - height_crop // 2
    expected = self.d[row_start:row_start + height_crop,
                      col_start:col_start + width_crop]
    assert np.allclose(expected, cropped)

  def chef_convert2gray(self):
    """Verify DataTransforms.convert2gray applies the ITU-R 601 luma weights."""
    # NOTE(review): `chef_` looks like a typo for `check_`/`test_` — as written,
    # unittest discovery skips this method so the check never runs; confirm.
    dt = DataTransforms(self.d)
    gray = dt.convert2gray()
    luma_weights = [0.2989, 0.5870, 0.1140]
    expected = np.dot(self.d[..., :3], luma_weights)
    assert np.allclose(expected, gray)

  def test_rotation(self):
    # Check rotation
    dt = DataTransforms(self.d)
@@ -668,8 +691,9 @@ class TestTransformers(unittest.TestCase):
    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir,
                              "../../models/tests/example_regression.csv")
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    loader = dc.data.CSVLoader(tasks=tasks,
                               smiles_field="smiles",
                               featurizer=featurizer)
    dataset = loader.featurize(input_file)
    transformer = dc.trans.DAGTransformer(max_atoms=50)
    dataset = transformer.transform(dataset)
@@ -677,3 +701,10 @@ class TestTransformers(unittest.TestCase):
    # atoms. These are denoted the "parents"
    for idm, mol in enumerate(dataset.X):
      assert dataset.X[idm].get_num_atoms() == len(dataset.X[idm].parents)

  def test_median_filter(self):
    """Check that DataTransforms.median_filter matches scipy's median filter."""
    dt = DataTransforms(self.d)
    filtered = dt.median_filter(size=2)
    # The reference must be a *median* filter. The original compared against
    # scipy.ndimage.gaussian_filter(self.d, 2), which computes a different
    # transform entirely, so the assertion could not validate median_filter.
    check_filtered = scipy.ndimage.median_filter(self.d, size=2)
    assert np.allclose(check_filtered, filtered)
+95 −58
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@ import time
import deepchem as dc
import tensorflow as tf
from deepchem.data import NumpyDataset
from PIL import Image


def undo_transforms(y, transformers):
@@ -154,8 +155,7 @@ class MinMaxTransformer(Transformer):
      if len(dataset.y.shape) > 1:
        assert len(self.y_min) == dataset.y.shape[1]

    super(MinMaxTransformer, self).__init__(
        transform_X=transform_X,
    super(MinMaxTransformer, self).__init__(transform_X=transform_X,
                                            transform_y=transform_y,
                                            transform_w=transform_w,
                                            dataset=dataset)
@@ -232,15 +232,14 @@ class NormalizationTransformer(Transformer):
      self.grad = np.reshape(true_grad, (true_grad.shape[0], -1, 3))
      self.ydely_means = ydely_means

    super(NormalizationTransformer, self).__init__(
        transform_X=transform_X,
    super(NormalizationTransformer, self).__init__(transform_X=transform_X,
                                                   transform_y=transform_y,
                                                   transform_w=transform_w,
                                                   dataset=dataset)

  def transform(self, dataset, parallel=False):
    return super(NormalizationTransformer, self).transform(
        dataset, parallel=parallel)
    return super(NormalizationTransformer, self).transform(dataset,
                                                           parallel=parallel)

  def transform_array(self, X, y, w):
    """Transform the data in a set of (X, y, w) arrays."""
@@ -291,8 +290,8 @@ class NormalizationTransformer(Transformer):

      grad_means = self.y_means[1:]
      energy_var = self.y_stds[0]
      grad_var = 1 / energy_var * (
          self.ydely_means - self.y_means[0] * self.y_means[1:])
      grad_var = 1 / energy_var * (self.ydely_means -
                                   self.y_means[0] * self.y_means[1:])
      energy = tasks[:, 0]
      transformed_grad = []

@@ -350,8 +349,7 @@ class ClippingTransformer(Transformer):
      Maximum absolute value for y

    """
    super(ClippingTransformer, self).__init__(
        transform_X=transform_X,
    super(ClippingTransformer, self).__init__(transform_X=transform_X,
                                              transform_y=transform_y,
                                              transform_w=transform_w,
                                              dataset=dataset)
@@ -405,8 +403,9 @@ class LogTransformer(Transformer):
    self.features = features
    self.tasks = tasks
    """Initialize log  transformation."""
    super(LogTransformer, self).__init__(
        transform_X=transform_X, transform_y=transform_y, dataset=dataset)
    super(LogTransformer, self).__init__(transform_X=transform_X,
                                         transform_y=transform_y,
                                         dataset=dataset)

  def transform_array(self, X, y, w):
    """Transform the data in a set of (X, y, w) arrays."""
@@ -469,8 +468,7 @@ class BalancingTransformer(Transformer):
               transform_w=False,
               dataset=None,
               seed=None):
    super(BalancingTransformer, self).__init__(
        transform_X=transform_X,
    super(BalancingTransformer, self).__init__(transform_X=transform_X,
                                               transform_y=transform_y,
                                               transform_w=transform_w,
                                               dataset=dataset)
@@ -517,7 +515,10 @@ class CDFTransformer(Transformer):
  """Histograms the data and assigns values based on sorted list."""
  """Acts like a Cumulative Distribution Function (CDF)."""

  def __init__(self, transform_X=False, transform_y=False, dataset=None,
  def __init__(self,
               transform_X=False,
               transform_y=False,
               dataset=None,
               bins=2):
    self.transform_X = transform_X
    self.transform_y = transform_y
@@ -838,9 +839,9 @@ class IRVTransformer():
    n_features = X_target.shape[1]
    print('start similarity calculation')
    time1 = time.time()
    similarity = IRVTransformer.matrix_mul(X_target, np.transpose(
        self.X)) / (n_features - IRVTransformer.matrix_mul(
            1 - X_target, np.transpose(1 - self.X)))
    similarity = IRVTransformer.matrix_mul(X_target, np.transpose(self.X)) / (
        n_features -
        IRVTransformer.matrix_mul(1 - X_target, np.transpose(1 - self.X)))
    time2 = time.time()
    print('similarity calculation takes %i s' % (time2 - time1))
    for i in range(self.n_tasks):
@@ -867,8 +868,8 @@ class IRVTransformer():
        partial_result = np.matmul(
            X1[X1_id * shard_size:min((X1_id + 1) *
                                      shard_size, X1_shape[0]), :],
            X2[:, X2_id * shard_size:min((X2_id + 1) *
                                         shard_size, X2_shape[1])])
            X2[:,
               X2_id * shard_size:min((X2_id + 1) * shard_size, X2_shape[1])])
        # calculate matrix multiplicatin on slices
        if result.size == 1:
          result = partial_result
@@ -888,8 +889,8 @@ class IRVTransformer():
    X_trans = []
    for count in range(X_length // 5000 + 1):
      X_trans.append(
          self.X_transform(
              dataset.X[count * 5000:min((count + 1) * 5000, X_length), :]))
          self.X_transform(dataset.X[count * 5000:min((count + 1) *
                                                      5000, X_length), :]))
    X_trans = np.concatenate(X_trans, axis=0)
    return NumpyDataset(X_trans, dataset.y, dataset.w, ids=None)

@@ -1097,8 +1098,8 @@ class ANITransformer(Transformer):
      while True:
        end = min((start + 1) * batch_size, X.shape[0])
        X_batch = X[(start * batch_size):end]
        output = self.sess.run(
            [self.outputs], feed_dict={self.inputs: X_batch})[0]
        output = self.sess.run([self.outputs],
                               feed_dict={self.inputs: X_batch})[0]
        X_out.append(output)
        num_transformed = num_transformed + X_batch.shape[0]
        print('%i samples transformed' % num_transformed)
@@ -1132,8 +1133,7 @@ class ANITransformer(Transformer):
      radial_sym = self.radial_symmetry(d_radial_cutoff, d, atom_numbers)
      angular_sym = self.angular_symmetry(d_angular_cutoff, d, atom_numbers,
                                          coordinates)
      self.outputs = tf.concat(
          [
      self.outputs = tf.concat([
          tf.cast(tf.expand_dims(atom_numbers, 2), tf.float32), radial_sym,
          angular_sym
      ],
@@ -1179,8 +1179,8 @@ class ANITransformer(Transformer):
    if self.atomic_number_differentiated:
      out_tensors = []
      for atom_type in self.atom_cases:
        selected_atoms = tf.expand_dims(
            tf.expand_dims(atom_numbers_embedded[:, :, atom_type], axis=1),
        selected_atoms = tf.expand_dims(tf.expand_dims(
            atom_numbers_embedded[:, :, atom_type], axis=1),
                                        axis=3)
        out_tensors.append(tf.reduce_sum(out * selected_atoms, axis=2))
      return tf.concat(out_tensors, axis=2)
@@ -1234,8 +1234,9 @@ class ANITransformer(Transformer):
        for atom_type_k in self.atom_cases[id_j:]:
          selected_atoms = tf.stack([atom_numbers_embedded[:, :, atom_type_j]] * max_atoms, axis=2) * \
                           tf.stack([atom_numbers_embedded[:, :, atom_type_k]] * max_atoms, axis=1)
          selected_atoms = tf.expand_dims(
              tf.expand_dims(selected_atoms, axis=1), axis=4)
          selected_atoms = tf.expand_dims(tf.expand_dims(selected_atoms,
                                                         axis=1),
                                          axis=4)
          out_tensors.append(
              tf.reduce_sum(out_tensor * selected_atoms, axis=(2, 3)))
      return tf.concat(out_tensors, axis=2)
@@ -1263,8 +1264,7 @@ class FeaturizationTransformer(Transformer):
    self.featurizer = featurizer
    if not transform_X:
      raise ValueError("FeaturizingTransfomer can only be used on X")
    super(FeaturizationTransformer, self).__init__(
        transform_X=transform_X,
    super(FeaturizationTransformer, self).__init__(transform_X=transform_X,
                                                   transform_y=transform_y,
                                                   transform_w=transform_w,
                                                   dataset=dataset)
@@ -1304,12 +1304,20 @@ class DataTransforms(Transformer):
          "Invalid flip command : Enter either lr (for left to right flip) or ud (for up to down flip)"
      )

  def rotate(self, angle=0):
  def rotate(self, angle=0, reshape=True, mode='constant', order=3):
    """ Rotates the image
          Parameters:
              angle (default = 0 i.e no rotation) - Denotes angle by which the image should be rotated (in Degrees)
              reshape (default = True i.e will get reshape)
              mode - Points outside the boundaries of the input are filled according to the given mode
             (‘constant’, ‘nearest’, ‘reflect’ or ‘wrap’). Default is ‘constant’.
              order - The order of the spline interpolation, default is 3. The order has to be in the range 0-5.
    """
    return scipy.ndimage.rotate(self.Image, angle)
    return scipy.ndimage.rotate(self.Image,
                                angle,
                                reshape=reshape,
                                mode=mode,
                                order=order)

  def gaussian_blur(self, sigma=0.2):
    """ Adds gaussian noise to the image
@@ -1318,6 +1326,23 @@ class DataTransforms(Transformer):
    """
    return scipy.ndimage.gaussian_filter(self.Image, sigma)

  def crop(self, x_crop, y_crop):
    """ Crops the image from the center.
          Parameters:
            x_crop - bound for x
            y_crop - bound for y
    """
    y = self.Image.shape[0]
    x = self.Image.shape[1]
    x_start = x // 2 - (x_crop // 2)
    y_start = y // 2 - (y_crop // 2)
    return self.Image[y_start:y_start + y_crop, x_start:x_start + x_crop]

  def convert2gray(self):
    """ Converts the image to grayscale
    """
    return np.dot(self.Image[..., :3], [0.2989, 0.5870, 0.1140])

  def shift(self, width, height, mode='constant', order=3):
    """Shifts the image
        Parameters:
@@ -1328,11 +1353,13 @@ class DataTransforms(Transformer):
          order - The order of the spline interpolation, default is 3. The order has to be in the range 0-5.
          """
    if len(self.Image.shape) == 2:
      return scipy.ndimage.shift(
          self.Image, [height, width], order=order, mode=mode)
      return scipy.ndimage.shift(self.Image, [height, width],
                                 order=order,
                                 mode=mode)
    if len(self.Image.shape == 3):
      return scipy.ndimage.shift(
          self.Image, [height, width, 0], order=order, mode=mode)
      return scipy.ndimage.shift(self.Image, [height, width, 0],
                                 order=order,
                                 mode=mode)

  def gaussian_noise(self, mean=0, std=25.5):
    '''Adds gaussian noise to the image
@@ -1358,3 +1385,13 @@ class DataTransforms(Transformer):
    x[noise < (prob / 2)] = pepper
    x[noise > (1 - prob / 2)] = salt
    return x

  def median_filter(self, size, mode='reflect', cval=0.0):
    """ Calculates a multidimensional median filter
    Parameters:
      size - Shape taken from array at every element to define the input to the filter.
      mode - Points outside the boundaries of the input are filled according to the given mode
             ('constant', 'nearest', 'reflect' or 'wrap'). Default is 'reflect',
             matching this method's signature.
      cval - value to fill past edges if mode is 'constant'. Default is 0.0.
    """
    return scipy.ndimage.median_filter(self.Image, size, mode=mode, cval=cval)