Merge pull request #417 from lilleswing/MolAutoEncoder-cr (31321ece) · Commits · 钟慕尧 / deepchem

.travis.yml

+2 −2

Original line number	Diff line number	Diff line
		@@ -5,8 +5,8 @@ python:
		sudo: required
		dist: trusty
		install:
		- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then wget https://repo.continuum.io/archive/Anaconda2-4.2.0-Linux-x86_64.sh
		-O anaconda.sh; else wget https://repo.continuum.io/archive/Anaconda3-4.2.0-Linux-x86_64.sh
		- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then wget https://repo.continuum.io/archive/Anaconda2-4.3.0-Linux-x86_64.sh
		-O anaconda.sh; else wget https://repo.continuum.io/archive/Anaconda3-4.3.0-Linux-x86_64.sh
		-O anaconda.sh; fi
		- bash anaconda.sh -b -p $HOME/anaconda
		- export PATH="$HOME/anaconda/bin:$PATH"

deepchem/feat/init.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -20,3 +20,4 @@ from deepchem.feat.coulomb_matrices import CoulombMatrixEig
		from deepchem.feat.grid_featurizer import GridFeaturizer
		from deepchem.feat.nnscore_utils import hydrogenate_and_compute_partial_charges
		from deepchem.feat.binding_pocket_features import BindingPocketFeaturizer
		from deepchem.feat.one_hot import OneHotFeaturizer

deepchem/feat/one_hot.py

0 → 100644

+147 −0

Original line number	Diff line number	Diff line
		import numpy as np
		from deepchem.feat import Featurizer
		from rdkit import Chem

		# Characters available to the one-hot encoding. A character's position in
		# this list is the index of its hot bit (see OneHotFeaturizer.one_hot_index),
		# so the order is significant. The leading space is the padding character
		# produced by OneHotFeaturizer.pad_smile.
		zinc_charset = list(
		    " #)(+-/13254768"
		    "=@CBFIHONS[]\\cl"
		    "onpsr"
		)


		class OneHotFeaturizer(Featurizer):
		    """One-hot encode the SMILES string of each molecule, character by character.

		    Every molecule is converted to a SMILES string, right-padded with spaces
		    to ``pad_length`` characters, and each character is replaced by a one-hot
		    vector over ``charset``.

		    NOTE(LESWING) Not Thread Safe in initialization of charset
		    """

		    def __init__(self, charset, padlength=120):
		        """
		        Parameters
		        ----------
		        charset: obj:`list` of obj:`str`
		          Each string is length 1. If None, the charset is derived from the
		          molecules passed to the first call of `featurize`.
		        padlength: int
		          length to pad the smile strings to
		        """
		        self.charset = charset
		        self.pad_length = padlength

		    def featurize(self, mols, verbose=True, log_every_n=1000):
		        """One-hot encode the SMILES representation of each molecule.

		        Parameters
		        ----------
		        mols: obj
		          List of rdkit Molecule Objects
		        verbose: bool
		          How much logging (accepted for interface compatibility; not used here)
		        log_every_n:
		          How often to log (accepted for interface compatibility; not used here)

		        Returns
		        -------
		        obj
		          numpy array of features, one entry per molecule
		        """
		        smiles = [Chem.MolToSmiles(mol) for mol in mols]
		        if self.charset is None:
		            # Derive the charset from the SMILES strings (not the Mol objects),
		            # and include the space that pad_smile pads with so that padded
		            # strings can always be encoded.
		            self.charset = self._create_charset(smiles + [" "])
		        return np.array([self.one_hot_encoded(smile) for smile in smiles])

		    def one_hot_array(self, i):
		        """
		        Create a one hot array with bit i set to 1

		        Parameters
		        ----------
		        i: int
		          bit to set to 1

		        Returns
		        -------
		        obj:`list` of obj:`int`
		          length len(self.charset)
		        """
		        return [int(ix == i) for ix in range(len(self.charset))]

		    def one_hot_index(self, c):
		        """
		        Look up the position of a character in the charset.

		        TODO(LESWING) replace with map lookup vs linear scan

		        Parameters
		        ----------
		        c
		          character whose index we want

		        Returns
		        -------
		        int
		          index of c in self.charset

		        Raises
		        ------
		        ValueError
		          if c is not in self.charset (raised by ``list.index``)
		        """
		        return self.charset.index(c)

		    def pad_smile(self, smile):
		        """
		        Pad A Smile String to self.pad_length

		        Strings already longer than self.pad_length are returned unchanged
		        (``str.ljust`` never truncates).

		        Parameters
		        ----------
		        smile: str

		        Returns
		        -------
		        str
		          smile string space padded to self.pad_length
		        """
		        return smile.ljust(self.pad_length)

		    def one_hot_encoded(self, smile):
		        """
		        One Hot Encode an entire SMILE string

		        Parameters
		        ----------
		        smile: str
		          smile string to encode

		        Returns
		        -------
		        object
		          np.array of one hot encoded arrays for each character in the
		          padded smile
		        """
		        return np.array([
		            self.one_hot_array(self.one_hot_index(x))
		            for x in self.pad_smile(smile)
		        ])

		    def untransform(self, z):
		        """
		        Convert from one hot representation back to SMILE

		        Parameters
		        ----------
		        z: obj:`list`
		          list of one hot encoded features

		        Returns
		        -------
		        obj:`list` of obj:`list` of obj:`str`
		          One single-element list per entry of z, holding the Smile String
		          obtained by picking MAX for each one hot encoded array and
		          stripping the surrounding padding.
		        """
		        decoded = []
		        for encoded_smile in z:
		            chars = [self.charset[np.argmax(one_hot)] for one_hot in encoded_smile]
		            # Each result stays wrapped in its own list: callers index [i][0].
		            decoded.append(["".join(chars).strip()])
		        return decoded

		    def _create_charset(self, smiles):
		        """
		        create the charset from smiles

		        Parameters
		        ----------
		        smiles: obj:`list` of obj:`str`
		          list of smile strings

		        Returns
		        -------
		        obj:`list` of obj:`str`
		          Sorted list of length one strings that are characters in smiles.
		          No duplicates
		        """
		        chars = set()
		        for smile in smiles:
		            # update() mutates the set in place; union() would only return a
		            # new set and leave `chars` empty.
		            chars.update(smile)
		        return sorted(chars)

deepchem/feat/test_one_hot.py

0 → 100644

+19 −0

Original line number	Diff line number	Diff line
		from unittest import TestCase

		from nose.tools import assert_equals
		from rdkit import Chem

		import deepchem as dc


		class TestOneHotFeaturizer(TestCase):
		    """Round-trip test for the one-hot SMILES featurizer."""

		    def test_featurize(self):
		        """Featurizing and then untransforming gives back the input SMILES."""
		        expected = ["Cn1c(=O)c2c(ncn2C)n(C)c1=O", "CC(=O)N1CN(C(C)=O)C(O)C1O"]
		        molecules = [Chem.MolFromSmiles(text) for text in expected]

		        one_hot = dc.feat.one_hot
		        featurizer = one_hot.OneHotFeaturizer(one_hot.zinc_charset)
		        decoded = featurizer.untransform(featurizer.featurize(molecules))

		        assert_equals(len(expected), len(decoded))
		        # untransform wraps every decoded string in a single-element list.
		        for want, got in zip(expected, decoded):
		            assert_equals(want, got[0])

deepchem/models/autoencoder_models/.gitignore

0 → 100644

+1 −0

Original line number	Diff line number	Diff line
		zinc_model.h5
		No newline at end of file

Admin message