Commit 9df90305 authored by Vignesh
Browse files

Changes to transformers and NaN fixes

parent 94809d39
Loading
Loading
Loading
Loading
+10 −0
Original line number Diff line number Diff line
@@ -201,6 +201,9 @@ class SmilesToImage(Featurizer):
      # Compute atom properties
      atom_props = np.array([[atom.GetAtomicNum()] for atom in cmol.GetAtoms()])

      bond_props = bond_props.astype(np.float32)
      atom_props = atom_props.astype(np.float32)

    else:
      # Setup image
      img = np.zeros((self.img_size, self.img_size, 4))
@@ -218,6 +221,13 @@ class SmilesToImage(Featurizer):
          atom.GetHybridization().real,
      ] for atom in cmol.GetAtoms()])

      bond_props = bond_props.astype(np.float32)
      atom_props = atom_props.astype(np.float32)

      partial_charges = atom_props[:, 1]
      if np.any(np.isnan(partial_charges)):
        return []

    frac = np.linspace(0, 1, int(1 / self.res * 2))
    # Reshape done for proper broadcast
    frac = frac.reshape(-1, 1, 1)
+25 −8
Original line number Diff line number Diff line
@@ -52,6 +52,7 @@ def load_chembl25(featurizer="smiles2seq",
                  save_dir=None,
                  split_seed=None,
                  reload=True,
                  transformer_type='minmax',
                  **kwargs):
  """Loads the ChEMBL25 dataset, featurizes it, and does a split.
  Parameters
@@ -68,6 +69,8 @@ def load_chembl25(featurizer="smiles2seq",
    Seed to be used for splitting the dataset
  reload: bool, default True
    Whether to reload saved dataset
  transformer_type: str, default 'minmax'
    Transformer applied to y. 'minmax' selects MinMaxTransformer; any other
    value selects NormalizationTransformer.
  """
  if data_dir is None:
    data_dir = DEFAULT_DIR
@@ -121,10 +124,17 @@ def load_chembl25(featurizer="smiles2seq",
      input_files=[dataset_file], shard_size=10000, data_dir=save_folder)

  if split is None:
    transformer = [
    if transformer_type == "minmax":
      transformers = [
          dc.trans.MinMaxTransformer(
              transform_X=False, transform_y=True, dataset=dataset)
      ]
    else:
      transformers = [
          dc.trans.NormalizationTransformer(
              transform_X=False, transform_y=True, dataset=dataset)
      ]

    logger.info("Split is None, about to transform dataset.")
    for transformer in transformers:
      dataset = transformer.transform(dataset)
@@ -140,10 +150,17 @@ def load_chembl25(featurizer="smiles2seq",
  splitter = splitters[split]

  train, valid, test = splitter.train_valid_test_split(dataset, seed=split_seed)
  if transformer_type == "minmax":
    transformers = [
        dc.trans.MinMaxTransformer(
            transform_X=False, transform_y=True, dataset=train)
    ]
  else:
    transformers = [
        dc.trans.NormalizationTransformer(
            transform_X=False, transform_y=True, dataset=train)
    ]

  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
+1 −0
Original line number Diff line number Diff line
@@ -16,3 +16,4 @@ from deepchem.trans.transformers import CoulombFitTransformer
from deepchem.trans.transformers import IRVTransformer
from deepchem.trans.transformers import DAGTransformer
from deepchem.trans.transformers import ANITransformer
from deepchem.trans.transformers import MinMaxTransformer
+47 −0
Original line number Diff line number Diff line
@@ -103,6 +103,53 @@ class Transformer(object):
    return X, y, w


class MinMaxTransformer(Transformer):
  """Rescale labels using per-column minima and maxima of a dataset.

  The minimum and maximum of `dataset.y` are recorded (along axis 0) at
  construction time, and `y` is then mapped to
  `(y - y_min) / (y_max - y_min)`. Only `transform_y` is supported;
  requesting `transform_X` raises `NotImplementedError`.
  """

  def __init__(self,
               transform_X=False,
               transform_y=False,
               transform_w=False,
               dataset=None):
    """Record the label extrema of `dataset`.

    Parameters
    ----------
    transform_X: bool, default False
      Not supported; passing True raises NotImplementedError.
    transform_y: bool, default False
      Whether to rescale y. Requires `dataset`.
    transform_w: bool, default False
      Forwarded unchanged to the base class.
    dataset: object with a `y` array, optional
      Dataset whose labels define `y_min` and `y_max`.

    Raises
    ------
    NotImplementedError
      If `transform_X` is True.
    ValueError
      If `transform_y` is True but no dataset was given.
    """
    if transform_X:
      raise NotImplementedError("MinMax transformer does not work for X yet.")
    if transform_y:
      if dataset is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            "MinMaxTransformer needs a dataset when transform_y is True.")
      self.y_min = np.min(dataset.y, axis=0)
      self.y_max = np.max(dataset.y, axis=0)

      # Sanity check: one (min, max) pair per label column.
      if len(dataset.y.shape) > 1:
        assert len(self.y_min) == dataset.y.shape[1]

    super(MinMaxTransformer, self).__init__(
        transform_X=transform_X,
        transform_y=transform_y,
        transform_w=transform_w,
        dataset=dataset)

  def _y_range(self):
    """Return `y_max - y_min` with non-positive (or NaN) widths set to 1.

    A constant label column has zero width; dividing by it would produce
    NaN for the fitting data and +/-inf for any other data.
    """
    span = self.y_max - self.y_min
    return np.where(span > 0, span, np.ones_like(span))

  def transform(self, dataset, parallel=False):
    """Apply the transformation to `dataset` via the base class."""
    return super(MinMaxTransformer, self).transform(dataset, parallel=parallel)

  def transform_array(self, X, y, w):
    """Transform the data in a set of (X, y, w) arrays.

    Only `y` is changed (when `transform_y` is set); `X` and `w` are
    returned as passed in. NaN results are replaced by 0 via
    `np.nan_to_num`.
    """
    if self.transform_X:
      raise NotImplementedError("MinMax transformer does not work for X yet")
    if self.transform_y:
      # Guarded denominator: constant columns map to (y - y_min) instead of
      # NaN/inf.
      y = np.nan_to_num((y - self.y_min) / self._y_range())
    return (X, y, w)

  def untransform(self, z):
    """
    Undo transformation on provided data.

    Returns `z` mapped back to the original label scale when `transform_y`
    is set, otherwise `z` unchanged.
    """
    if self.transform_X:
      raise NotImplementedError("MinMax does not work for X yet.")
    if self.transform_y:
      # Use the same guarded range as transform_array so the two stay
      # inverses of each other, including for constant columns.
      return z * self._y_range() + self.y_min
    # Nothing was transformed, so there is nothing to undo.
    return z


class NormalizationTransformer(Transformer):

  def __init__(self,