Commit 87f27e16 authored by VIGNESHinZONE's avatar VIGNESHinZONE
Browse files

Merge branch 'master' into gpu-install

parents 08e7467e d6368781
Loading
Loading
Loading
Loading
+8 −6
Original line number Diff line number Diff line
@@ -253,12 +253,14 @@ class CGCNNModel(TorchModel):
  Here is a simple example of code that uses the CGCNNModel with
  materials dataset.

  >> import deepchem as dc
  >> dataset_config = {"reload": False, "featurizer": dc.feat.CGCNNFeaturizer, "transformers": []}
  >> tasks, datasets, transformers = dc.molnet.load_perovskite(**dataset_config)
  >> train, valid, test = datasets
  >> model = dc.models.CGCNNModel(mode='regression', batch_size=32, learning_rate=0.001)
  >> model.fit(train, nb_epoch=50)
  Examples
  --------
  >>> import deepchem as dc
  >>> dataset_config = {"reload": False, "featurizer": dc.feat.CGCNNFeaturizer(), "transformers": []}
  >>> tasks, datasets, transformers = dc.molnet.load_perovskite(**dataset_config)
  >>> train, valid, test = datasets
  >>> model = dc.models.CGCNNModel(mode='regression', batch_size=32, learning_rate=0.001)
  >>> avg_loss = model.fit(train, nb_epoch=50)

  This model takes arbitrary crystal structures as an input, and predicts material properties
  using the element information and connection of atoms in the crystal. If you want to get
+5 −8
Original line number Diff line number Diff line
@@ -30,7 +30,7 @@ class _PerovskiteLoader(_MolnetLoader):


def load_perovskite(
    featurizer: Union[dc.feat.Featurizer, str] = dc.feat.SineCoulombMatrix(),
    featurizer: Union[dc.feat.Featurizer, str] = dc.feat.CGCNNFeaturizer(),
    splitter: Union[dc.splits.Splitter, str, None] = 'random',
    transformers: List[Union[TransformerGenerator, str]] = ['normalization'],
    reload: bool = True,
@@ -93,13 +93,10 @@ def load_perovskite(

  Examples
  --------
  >>>
  >> import deepchem as dc
  >> tasks, datasets, transformers = dc.molnet.load_perovskite()
  >> train_dataset, val_dataset, test_dataset = datasets
  >> n_tasks = len(tasks)
  >> n_features = train_dataset.get_data_shape()[0]
  >> model = dc.models.MultitaskRegressor(n_tasks, n_features)
  >>> import deepchem as dc
  >>> tasks, datasets, transformers = dc.molnet.load_perovskite()
  >>> train_dataset, val_dataset, test_dataset = datasets
  >>> model = dc.models.CGCNNModel(mode='regression', batch_size=32, learning_rate=0.001)

  """
  loader = _PerovskiteLoader(featurizer, splitter, transformers,
+1 −0
Original line number Diff line number Diff line
@@ -21,4 +21,5 @@ from deepchem.trans.transformers import ImageTransformer
from deepchem.trans.transformers import DataTransforms
from deepchem.trans.transformers import Transformer
from deepchem.trans.transformers import FlatteningTransformer
from deepchem.trans.transformers import RxnSplitTransformer
from deepchem.trans.duplicate import DuplicateBalancingTransformer
+55 −0
Original line number Diff line number Diff line
import unittest
import numpy as np

from deepchem.trans.transformers import RxnSplitTransformer

# Components of the two fixture reactions. The full reaction SMILES has the
# form reactants>reagents>product; the second reaction has no reagents.
_RXN1_REACTANTS = "CC(C)C[Mg+].CON(C)C(=O)c1ccc(O)nc1"
_RXN1_REAGENTS = "C1CCOC1.[Cl-]"
_RXN1_PRODUCT = "CC(C)CC(=O)c1ccc(O)nc1"

_RXN2_REACTANTS = "CCn1cc(C(=O)O)c(=O)c2cc(F)c(-c3ccc(N)cc3)cc21.O=CO"
_RXN2_PRODUCT = "CCn1cc(C(=O)O)c(=O)c2cc(F)c(-c3ccc(NC=O)cc3)cc21"

# Input reaction SMILES fed to the transformer.
reactions: np.ndarray = np.array(
    [
        f"{_RXN1_REACTANTS}>{_RXN1_REAGENTS}>{_RXN1_PRODUCT}",
        f"{_RXN2_REACTANTS}>>{_RXN2_PRODUCT}",
    ],
    dtype=object)

# Expected (source, target) rows when reagents stay separated from the
# reactants by '>' (compared against the sep_reagent=True output).
split: np.ndarray = np.array(
    [
        [f"{_RXN1_REACTANTS}>{_RXN1_REAGENTS}", _RXN1_PRODUCT],
        [f"{_RXN2_REACTANTS}>", _RXN2_PRODUCT],
    ],
    dtype=object)

# Expected (source, target) rows when reagents are merged into the reactants
# with '.' and the source ends in '>' (compared against the sep_reagent=False
# output).
sep: np.ndarray = np.array(
    [
        [f"{_RXN1_REACTANTS}.{_RXN1_REAGENTS}>", _RXN1_PRODUCT],
        [f"{_RXN2_REACTANTS}>", _RXN2_PRODUCT],
    ],
    dtype=object)


class TestRxnSplitTransformer(unittest.TestCase):
  """
  Checks RxnSplitTransformer output against the module-level fixtures, once
  with sep_reagent=True and once with sep_reagent=False.
  """

  def _transformed_features(self, sep_reagent):
    """Transform the fixture reactions and return only the new X array."""
    transformer = RxnSplitTransformer(sep_reagent=sep_reagent)
    outputs = transformer.transform_array(
        X=reactions, y=np.array([]), w=np.array([]), ids=np.array([]))
    # transform_array returns (X, y, w, ids); only X is under test here.
    return outputs[0]

  def test_split(self):
    """With sep_reagent=True the result must equal the `split` fixture."""
    features = self._transformed_features(sep_reagent=True)
    assert features.shape == (2, 2)
    assert (features == split).all()

  def test_mixing(self):
    """With sep_reagent=False the result must equal the `sep` fixture."""
    features = self._transformed_features(sep_reagent=False)
    assert features.shape == (2, 2)
    assert (features == sep).all()
+113 −0
Original line number Diff line number Diff line
@@ -2469,3 +2469,116 @@ class DataTransforms(object):
    image = Image.fromarray(self.Image)
    image = image.filter(ImageFilter.MedianFilter(size=size))
    return np.array(image)


class RxnSplitTransformer(Transformer):
  """Splits the reaction SMILES input into the source and target strings
  required for machine translation tasks.

  The input is expected to be in the form reactant>reagent>product. The target
  string is always the product. The source string depends on `sep_reagent`:

  - `sep_reagent=True` (default): reagents stay separated from the reactants,
    so the source is `reactants>reagents`.
  - `sep_reagent=False` (mixed mode): reagents are merged into the reactants,
    so the source is `reactants.reagents>`. If a reaction has no reagents the
    source is simply `reactants>`.

  Examples
  --------
  Reagents kept separate from the reactants (`sep_reagent=True`):

  >>> import numpy as np
  >>> from deepchem.trans.transformers import RxnSplitTransformer
  >>> reactions = np.array(
  ...     [
  ...         "CC(C)C[Mg+].CON(C)C(=O)c1ccc(O)nc1>C1CCOC1.[Cl-]>CC(C)CC(=O)c1ccc(O)nc1",
  ...         "CCn1cc(C(=O)O)c(=O)c2cc(F)c(-c3ccc(N)cc3)cc21.O=CO>>CCn1cc(C(=O)O)c(=O)c2cc(F)c(-c3ccc(NC=O)cc3)cc21"
  ...     ],
  ...     dtype=object)
  >>> trans = RxnSplitTransformer(sep_reagent=True)
  >>> split_reactions = trans.transform_array(X=reactions, y=np.array([]), w=np.array([]), ids=np.array([]))
  >>> split_reactions
  (array([['CC(C)C[Mg+].CON(C)C(=O)c1ccc(O)nc1>C1CCOC1.[Cl-]',
           'CC(C)CC(=O)c1ccc(O)nc1'],
          ['CCn1cc(C(=O)O)c(=O)c2cc(F)c(-c3ccc(N)cc3)cc21.O=CO>',
           'CCn1cc(C(=O)O)c(=O)c2cc(F)c(-c3ccc(NC=O)cc3)cc21']], dtype='<U51'), array([], dtype=float64), array([], dtype=float64), array([], dtype=float64))

  Reagents mixed into the reactants (`sep_reagent=False`):

  >>> trans_disable = RxnSplitTransformer(sep_reagent=False)
  >>> split_reactions = trans_disable.transform_array(X=reactions, y=np.array([]), w=np.array([]), ids=np.array([]))
  >>> split_reactions
  (array([['CC(C)C[Mg+].CON(C)C(=O)c1ccc(O)nc1.C1CCOC1.[Cl-]>',
           'CC(C)CC(=O)c1ccc(O)nc1'],
          ['CCn1cc(C(=O)O)c(=O)c2cc(F)c(-c3ccc(N)cc3)cc21.O=CO>',
           'CCn1cc(C(=O)O)c(=O)c2cc(F)c(-c3ccc(NC=O)cc3)cc21']], dtype='<U51'), array([], dtype=float64), array([], dtype=float64), array([], dtype=float64))

  Note
  ----
  This class only transforms the feature field of a reaction dataset like USPTO.
  """

  def __init__(self,
               sep_reagent: bool = True,
               dataset: Optional[Dataset] = None):
    """Initializes the Reaction split Transformer.

    Parameters
    ----------
    sep_reagent: bool, optional (default True)
      If True, keep the reagents separated from the reactants in the source
      string (`reactants>reagents`). If False, merge them
      (`reactants.reagents>`).
    dataset: dc.data.Dataset object, optional (default None)
      Dataset to be transformed.
    """

    self.sep_reagent = sep_reagent
    super(RxnSplitTransformer, self).__init__(transform_X=True, dataset=dataset)

  def transform_array(
      self, X: np.ndarray, y: np.ndarray, w: np.ndarray,
      ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Transform the data in a set of (X, y, w, ids) arrays.

    Only `X` is modified; `y`, `w` and `ids` are returned unchanged.

    Parameters
    ----------
    X: np.ndarray
      Array of features (reaction SMILES of the form
      reactant>reagent>product).
    y: np.ndarray
      Array of labels
    w: np.ndarray
      Array of weights.
    ids: np.ndarray
      Array of identifiers.

    Returns
    -------
    Xtrans: np.ndarray
      Two-column array; column 0 holds the source strings and column 1 the
      target (product) strings.
    ytrans: np.ndarray
      The input array of labels, unchanged
    wtrans: np.ndarray
      The input array of weights, unchanged
    idstrans: np.ndarray
      The input array of ids, unchanged

    Raises
    ------
    IndexError
      If a reaction string contains fewer than two '>' separators.
    """

    sources = []
    targets = []
    for reaction in X:
      # Split each reaction once and reuse the pieces. Any field after the
      # third is ignored.
      parts = reaction.split('>')
      reactant, reagent, product = parts[0], parts[1], parts[2]

      if self.sep_reagent or not reagent:
        # Separated mode, or nothing to mix in: for an empty reagent this
        # yields "reactant>".
        sources.append(reactant + '>' + reagent)
      else:
        # Mixed mode: fold the reagents into the reactant list.
        sources.append(reactant + '.' + reagent + '>')
      targets.append(product)

    X = np.column_stack((sources, targets))

    return (X, y, w, ids)

  def untransform(self, z):
    """Not Implemented."""
    raise NotImplementedError("Cannot untransform the source/target split.")
Loading