Commit f83a3ed4 authored by leswing
Browse files

Update docstrings to numpy format

parent 4a26c666
Loading
Loading
Loading
Loading

.keras.json

deleted 100644 → 0
+0 −6
Original line number Diff line number Diff line
{
    "image_dim_ordering": "tf",
    "epsilon": 1e-07,
    "floatx": "float32",
    "backend": "tensorflow"
}
+1 −1
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@ install:
- source activate deepchem
- pip install yapf==0.16.0
- python setup.py install
- cp .keras.json ~/
- KERAS_BACKEND=tensorflow
script:
- nosetests -v deepchem --nologcapture
- find ./deepchem | grep .py$ |xargs python -m doctest -v
+89 −2
Original line number Diff line number Diff line
@@ -15,35 +15,110 @@ class OneHotFeaturizer(Featurizer):
  """

  def __init__(self, charset, padlength=120):
    """
    Parameters
    ----------
    charset: obj:`list` of obj:`str`
      Each string is length 1
    padlength: int
      length to pad the smile strings to
    """
    self.charset = charset
    self.pad_length = padlength

  def featurize(self, mols, verbose=True, log_every_n=1000):
    """Featurize a list of RDKit molecules into one-hot encoded SMILES.

    Parameters
    ----------
    mols: obj
      List of rdkit Molecule Objects
    verbose: bool
      How much logging (not used in this method; kept for API compatibility)
    log_every_n: int
      How often to log (not used in this method; kept for API compatibility)

    Returns
    -------
    obj
      numpy array of one-hot features, one entry per molecule
    """
    smiles = [Chem.MolToSmiles(mol) for mol in mols]
    if self.charset is None:
      # Bug fix: _create_charset iterates the characters of each SMILES
      # string; the original passed `mols` (Mol objects), which are not
      # character sequences. Pass the converted SMILES strings instead.
      self.charset = self._create_charset(smiles)
    return np.array([self.one_hot_encoded(smile) for smile in smiles])

  def one_hot_array(self, i):
    """
    Create a one hot array with bit i set to 1
    Parameters
    ----------
    i: int
      bit to set to 1
    Returns
    -------
    obj:`list` of obj:`int`
      length len(self.charset)
    """
    return [int(x) for x in [ix == i for ix in range(len(self.charset))]]

  def one_hot_index(self, c):
    """
    TODO(LESWING) replace with map lookup vs linear scan
    :param charset:
    :return:
    Parameters
    ----------
    c
      character whose index we want
    Returns
    -------
    int
      index of c in self.charset
    """
    return self.charset.index(c)

  def pad_smile(self, smile):
    """
    Pad A Smile String to self.pad_length
    Parameters
    ----------
    smile: str

    Returns
    -------
    str
      smile string space padded to self.pad_length
    """

    return smile.ljust(self.pad_length)

  def one_hot_encoded(self, smile):
    """
    One Hot Encode an entire SMILE string
    Parameters
    ----------
    smile: str
      smile string to encode

    Returns
    -------
    object
      np.array of one hot encoded arrays for each character in smile
    """
    return np.array([
        self.one_hot_array(self.one_hot_index(x)) for x in self.pad_smile(smile)
    ])

  def untransform(self, z):
    """
    Convert from one hot representation back to SMILE
    Parameters
    ----------
    z: obj:`list`
      list of one hot encoded features

    Returns
    -------
    Smile Strings picking MAX for each one hot encoded array
    """
    z1 = []
    for i in range(len(z)):
      s = ""
@@ -54,6 +129,18 @@ class OneHotFeaturizer(Featurizer):
    return z1

  def _create_charset(self, smiles):
    """
    create the charset from smiles
    Parameters
    ----------
    smiles: obj:`list` of obj:`str`
      list of smile strings

    Returns
    -------
    obj:`list` of obj:`str`
      List of length one strings that are characters in smiles.  No duplicates
    """
    s = set()
    for smile in smiles:
      s.union(list(smile))
+38 −10
Original line number Diff line number Diff line
@@ -19,11 +19,19 @@ class TensorflowMoleculeEncoder(Model):
               charset_length=len(zinc_charset),
               latent_rep_size=292):
    """
    :param model_dir: Folder to store cached weights
    :weights_file: File to store cached weights in model_dir
    :param verbose: True for more logging
    :param charset_length: Length of one_hot_encoded vectors
    :param latent_rep_size: How large a 1D Vector for latent representation

    Parameters
    ----------
    model_dir: str
      Folder to store cached weights
    weights_file: str
      File to store cached weights in model_dir
    verbose: bool
      True for more logging
    charset_length: int
      Length of one_hot_encoded vectors
    latent_rep_size: int
      How large a 1D Vector for latent representation
    """
    super(TensorflowMoleculeEncoder, self).__init__(
        model_dir=model_dir, verbose=verbose)
@@ -38,6 +46,12 @@ class TensorflowMoleculeEncoder(Model):

  @staticmethod
  def zinc_encoder():
    """
    Returns
    -------
    obj
      An Encoder with weights that were trained on the zinc dataset
    """
    current_dir = os.path.dirname(os.path.realpath(__file__))
    weights_filename = "zinc_model.h5"
    weights_file = os.path.join(current_dir, weights_filename)
@@ -67,11 +81,19 @@ class TensorflowMoleculeDecoder(Model):
               charset_length=len(zinc_charset),
               latent_rep_size=292):
    """
    :param model_dir: Folder To Store Cached Weights
    :weights_file: File to store cached weights in model_dir
    :param verbose: True for more logging
    :param charset_length: Length of one_hot_encoded vectors
    :param latent_rep_size: How large a 1D Vector for latent representation

    Parameters
    ----------
    model_dir: str
      Folder to store cached weights
    weights_file: str
      File to store cached weights in model_dir
    verbose: bool
      True for more logging
    charset_length: int
      Length of one_hot_encoded vectors
    latent_rep_size: int
      How large a 1D Vector for latent representation
    """
    super(TensorflowMoleculeDecoder, self).__init__(
        model_dir=model_dir, verbose=verbose)
@@ -89,6 +111,12 @@ class TensorflowMoleculeDecoder(Model):

  @staticmethod
  def zinc_decoder():
    """
    Returns
    -------
    obj
      A Decoder with weights that were trained on the zinc dataset
    """
    current_dir = os.path.dirname(os.path.realpath(__file__))
    weights_filename = "zinc_model.h5"
    weights_file = os.path.join(current_dir, weights_filename)
+2 −4
Original line number Diff line number Diff line
@@ -12,9 +12,7 @@ from models.autoencoder_models.autoencoder import TensorflowMoleculeEncoder, Ten
class TestTensorflowEncoders(TestCase):

  def test_fit(self):
    data_dir = "/home/leswing/Documents/data_sets/keras-molecule"

    tf_enc = TensorflowMoleculeEncoder(model_dir=data_dir)
    tf_enc = TensorflowMoleculeEncoder.zinc_encoder()

    smiles = [
        "Cn1cnc2c1c(=O)n(C)c(=O)n2C", "O=C(O)[C@@H]1/C(=C/CO)O[C@@H]2CC(=O)N21",
@@ -33,7 +31,7 @@ class TestTensorflowEncoders(TestCase):

    dataset = DiskDataset.from_numpy(features, features)
    prediction = tf_enc.predict_on_batch(dataset.X)
    tf_de = TensorflowMoleculeDecoder(model_dir=data_dir)
    tf_de = TensorflowMoleculeDecoder.zinc_decoder()
    one_hot_decoded = tf_de.predict_on_batch(prediction)
    decoded_smiles = featurizer.untransform(one_hot_decoded)
    assert_equals(len(decoded_smiles), len(smiles))