Commit 31321ece authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #417 from lilleswing/MolAutoEncoder-cr

Use Cached Models from keras-molecules as Deepchem Models
parents 18e74be0 37967372
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -5,8 +5,8 @@ python:
sudo: required
dist: trusty
install:
- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then wget https://repo.continuum.io/archive/Anaconda2-4.2.0-Linux-x86_64.sh
  -O anaconda.sh; else wget https://repo.continuum.io/archive/Anaconda3-4.2.0-Linux-x86_64.sh
- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then wget https://repo.continuum.io/archive/Anaconda2-4.3.0-Linux-x86_64.sh
  -O anaconda.sh; else wget https://repo.continuum.io/archive/Anaconda3-4.3.0-Linux-x86_64.sh
  -O anaconda.sh; fi
- bash anaconda.sh -b -p $HOME/anaconda
- export PATH="$HOME/anaconda/bin:$PATH"
+1 −0
Original line number Diff line number Diff line
@@ -20,3 +20,4 @@ from deepchem.feat.coulomb_matrices import CoulombMatrixEig
from deepchem.feat.grid_featurizer import GridFeaturizer
from deepchem.feat.nnscore_utils import hydrogenate_and_compute_partial_charges
from deepchem.feat.binding_pocket_features import BindingPocketFeaturizer
from deepchem.feat.one_hot import OneHotFeaturizer
+147 −0
Original line number Diff line number Diff line
import numpy as np
from deepchem.feat import Featurizer
from rdkit import Chem

# Character vocabulary that can be handed to OneHotFeaturizer as `charset`.
# Order matters: a character's position in this list is the index of the bit
# that is set in its one-hot vector, so reordering changes the encoding.
# ' ' is included because SMILES strings are right-padded with spaces before
# encoding.
# NOTE(review): presumably the character set of the ZINC-trained
# keras-molecules model -- confirm before editing.
zinc_charset = [
    ' ', '#', ')', '(', '+', '-', '/', '1', '3', '2', '5', '4', '7', '6', '8',
    '=', '@', 'C', 'B', 'F', 'I', 'H', 'O', 'N', 'S', '[', ']', '\\', 'c', 'l',
    'o', 'n', 'p', 's', 'r'
]


class OneHotFeaturizer(Featurizer):
  """
  One hot encode the SMILES string of each molecule, character by character.

  Every molecule is converted to a SMILES string, right-padded with spaces to
  `pad_length` characters, and each character is replaced by a one hot vector
  of length len(charset).

  NOTE(LESWING) Not Thread Safe in initialization of charset
  """

  def __init__(self, charset=None, padlength=120):
    """
    Parameters
    ----------
    charset: obj:`list` of obj:`str`, optional
      Each string is length 1.  The position of a character in the list is the
      index of its bit in the one hot vector.  If None, the charset is built
      from the molecules passed to the first call of `featurize`.
    padlength: int
      length to pad the smile strings to
    """
    self.charset = charset
    self.pad_length = padlength

  def featurize(self, mols, verbose=True, log_every_n=1000):
    """
    One hot encode the SMILES strings of a list of molecules.

    Parameters
    ----------
    mols: obj
      List of rdkit Molecule Objects
    verbose: bool
      How much logging (currently unused)
    log_every_n:
      How often to log (currently unused)

    Returns
    -------
    obj
      numpy array of features.  When every SMILES string is at most
      self.pad_length characters long the shape is
      (len(mols), self.pad_length, len(self.charset)).
    """
    smiles = [Chem.MolToSmiles(mol) for mol in mols]
    if self.charset is None:
      # The charset has to be derived from the SMILES strings, not from the
      # rdkit molecule objects themselves.
      self.charset = self._create_charset(smiles)
    return np.array([self.one_hot_encoded(smile) for smile in smiles])

  def one_hot_array(self, i):
    """
    Create a one hot array with bit i set to 1

    Parameters
    ----------
    i: int
      bit to set to 1

    Returns
    -------
    obj:`list` of obj:`int`
      length len(self.charset)
    """
    return [int(ix == i) for ix in range(len(self.charset))]

  def one_hot_index(self, c):
    """
    Look up the one hot index of a character.

    TODO(LESWING) replace with map lookup vs linear scan

    Parameters
    ----------
    c
      character whose index we want

    Returns
    -------
    int
      index of c in self.charset

    Raises
    ------
    ValueError
      if c is not in self.charset
    """
    return self.charset.index(c)

  def pad_smile(self, smile):
    """
    Pad A Smile String to self.pad_length

    Parameters
    ----------
    smile: str

    Returns
    -------
    str
      smile string space padded to self.pad_length.  Strings that are already
      longer than self.pad_length are returned unchanged (not truncated).
    """
    return smile.ljust(self.pad_length)

  def one_hot_encoded(self, smile):
    """
    One Hot Encode an entire SMILE string

    Parameters
    ----------
    smile: str
      smile string to encode

    Returns
    -------
    object
      np.array of one hot encoded arrays for each character in the padded
      smile
    """
    return np.array([
        self.one_hot_array(self.one_hot_index(x)) for x in self.pad_smile(smile)
    ])

  def untransform(self, z):
    """
    Convert from one hot representation back to SMILE

    Parameters
    ----------
    z: obj:`list`
      list of one hot encoded features

    Returns
    -------
    obj:`list` of obj:`list` of obj:`str`
      One single-element list per entry of z, holding the Smile String
      obtained by picking MAX for each one hot encoded array and stripping
      the surrounding padding.
    """
    decoded = []
    for one_hot in z:
      chars = [self.charset[np.argmax(row)] for row in one_hot]
      # strip() removes the spaces that pad_smile appended.
      decoded.append(["".join(chars).strip()])
    return decoded

  def _create_charset(self, smiles):
    """
    create the charset from smiles

    Parameters
    ----------
    smiles: obj:`list` of obj:`str`
      list of smile strings

    Returns
    -------
    obj:`list` of obj:`str`
      Sorted list of length one strings that are characters in smiles, plus
      the space used for padding.  No duplicates
    """
    # Seed with the pad character: pad_smile appends spaces, so the charset
    # must be able to encode them even if no smile contains one.
    chars = set(" ")
    for smile in smiles:
      # update() mutates in place; union() would return a new set and leave
      # `chars` untouched.
      chars.update(smile)
    return sorted(chars)
+19 −0
Original line number Diff line number Diff line
from unittest import TestCase

from nose.tools import assert_equals
from rdkit import Chem

import deepchem as dc


class TestOneHotFeaturizer(TestCase):
  """Round-trip test for the one hot SMILES featurizer."""

  def test_featurize(self):
    """Featurizing then untransforming must give back the input SMILES."""
    smiles = ["Cn1c(=O)c2c(ncn2C)n(C)c1=O", "CC(=O)N1CN(C(C)=O)C(O)C1O"]
    one_hot = dc.feat.one_hot
    featurizer = one_hot.OneHotFeaturizer(one_hot.zinc_charset)

    mols = [Chem.MolFromSmiles(smile) for smile in smiles]
    encoded = featurizer.featurize(mols)
    decoded = featurizer.untransform(encoded)

    assert_equals(len(smiles), len(decoded))
    # Each decoded entry is a single-element list wrapping the SMILES string.
    for expected, row in zip(smiles, decoded):
      assert_equals(expected, row[0])
+1 −0
Original line number Diff line number Diff line
zinc_model.h5
 No newline at end of file
Loading