Changes for N2 up to simple fit test (90dde2da) · Commits · 钟慕尧 / deepchem

deepchem/feat/graph_features.py

+58 −24

Original line number	Diff line number	Diff line
		@@ -455,7 +455,7 @@ def max_pair_distance_pairs(mol: RDKitMol,
		sum_adj += np.linalg.matrix_power(adj, power)
		nonzero_locs = np.where(sum_adj != 0)
		num_pairs = len(nonzero_locs[0])
		# This creates a mstrix of shape (2, num_pairs)
		# This creates a matrix of shape (2, num_pairs)
		pair_edges = np.reshape(np.array(list(zip(nonzero_locs))), (2, num_pairs))
		return pair_edges

		@@ -489,13 +489,14 @@ def pair_features(
		The number of different bond types to consider.
		graph_distance: bool, optional (default True)
		If true, use graph distance between molecules. Else use euclidean
		distance. The specified `mol` must have a conformer. Atomic positions will
		be retrieved by calling `mol.getConformer(0)`.
		distance. The specified `mol` must have a conformer. Atomic
		positions will be retrieved by calling `mol.getConformer(0)`.
		max_pair_distance: Union[int, str], (default 'infinity')
		This value can be a positive integer or the string 'infinity'. This
		parameter determines the maximum graph distance at which pair features
		are computed. For example, if `max_pair_distance==2`, then pair features
		are computed only for atoms at most graph distance 2 apart.
		This value can be a positive integer or the string 'infinity'.
		This parameter determines the maximum graph distance at which pair
		features are computed. For example, if `max_pair_distance==2`,
		then pair features are computed only for atoms at most graph
		distance 2 apart.

		Note
		----
		@@ -504,8 +505,13 @@ def pair_features(
		Returns
		-------
		features: np.ndarray
		Of shape `(N, N, bt_len + max_distance + 1)`. This is the array of pairwise
		features for all atom pairs.
		Of shape `(N_edges, bt_len + max_distance + 1)`. This is the array
		of pairwise features for all atom pairs, where N_edges is the
		number of edges within max_pair_distance of one another in this
		molecules.
		pair_edges: np.ndarray
		Of shape `(2, num_pairs)` where `num_pairs` is the total number of
		pairs within `max_pair_distance` of one another.
		"""
		if graph_distance:
		max_distance = 7
		@@ -514,27 +520,55 @@ def pair_features(
		N = mol.GetNumAtoms()
		pair_edges = max_pair_distance_pairs(mol, max_pair_distance)
		num_pairs = pair_edges.shape[1]
		# TODO(rbharath): Figure out how to rewrite this to use the pair_edges correctly
		if max_pair_distance == "infinity":
		features = np.zeros((N, N, bt_len + max_distance + 1))
		N_edges = pair_edges.shape[1]
		features = np.zeros((N_edges, bt_len + max_distance + 1))
		#if max_pair_distance == "infinity":
		# features = np.zeros((N, N, bt_len + max_distance + 1))
		#else:
		# Get mapping
		mapping = {}
		for n in range(N_edges):
		a1, a2 = pair_edges[:, n]
		mapping[(int(a1), int(a2))] = n
		num_atoms = mol.GetNumAtoms()
		rings = mol.GetRingInfo().AtomRings()
		for a1 in range(num_atoms):
		for a2 in bond_adj_list[a1]:
		# first `bt_len` features are bond features(if applicable)
		features[a1, a2, :bt_len] = np.asarray(
		if (int(a1), int(a2)) not in mapping:
		raise ValueError(
		"Malformed molecule with bonds not in specified graph distance.")
		else:
		n = mapping[(int(a1), int(a2))]
		features[n, :bt_len] = np.asarray(
		bond_features_map[tuple(sorted((a1, a2)))], dtype=float)
		for ring in rings:
		if a1 in ring:
		for a2 in ring:
		if (int(a1), int(a2)) not in mapping:
		# For ring pairs outside max pairs distance continue
		continue
		else:
		n = mapping[(int(a1), int(a2))]
		# `bt_len`-th feature is if the pair of atoms are in the same ring
		features[a1, ring, bt_len] = 1
		features[a1, a1, bt_len] = 0.
		if a2 == a1:
		features[n, bt_len] = 0
		else:
		features[n, bt_len] = 1
		#features[a1, ring, bt_len] = 1
		#features[a1, a1, bt_len] = 0.
		# graph distance between two atoms
		if graph_distance:
		# distance is a matrix of 1-hot encoded distances
		# distance is a matrix of 1-hot encoded distances for all atoms
		distance = find_distance(
		a1, num_atoms, bond_adj_list, max_distance=max_distance)
		features[a1, :, bt_len + 1:] = distance
		for a2 in range(num_atoms):
		if (int(a1), int(a2)) not in mapping:
		# For ring pairs outside max pairs distance continue
		continue
		else:
		n = mapping[(int(a1), int(a2))]
		features[n, bt_len + 1:] = distance[a2]
		# Euclidean distance between atoms
		if not graph_distance:
		coords = np.zeros((N, 3))
		@@ -545,11 +579,11 @@ def pair_features(
		np.stack([coords] * N, axis=1) - \
		np.stack([coords] * N, axis=0)), axis=2))

		if max_pair_distance == "infinity":
		features = np.reshape(features, (N * N, bt_len + max_distance + 1))
		return features
		#if max_pair_distance == "infinity":
		# features = np.reshape(features, (N * N, bt_len + max_distance + 1))
		# return features

		return features
		return features, pair_edges


		def find_distance(a1: RDKitAtom, num_atoms: int, bond_adj_list,
		@@ -824,7 +858,7 @@ class WeaveFeaturizer(MolecularFeaturizer):
		bond_adj_list[bond[1]].append(bond[0])

		# Calculate pair features
		pairs = pair_features(
		pairs, pair_edges = pair_features(
		mol,
		bond_features_map,
		bond_adj_list,
		@@ -832,7 +866,7 @@ class WeaveFeaturizer(MolecularFeaturizer):
		graph_distance=self.graph_distance,
		max_pair_distance=self.max_pair_distance)

		return WeaveMol(nodes, pairs)
		return WeaveMol(nodes, pairs, pair_edges)


		class AtomicConvFeaturizer(ComplexNeighborListFragmentAtomicCoordinates):

deepchem/feat/mol_graphs.py

+5 −1

Original line number	Diff line number	Diff line
		@@ -379,11 +379,15 @@ class WeaveMol(object):
		.. [1] Kearnes, Steven, et al. "Molecular graph convolutions: moving beyond fingerprints." Journal of computer-aided molecular design 30.8 (2016): 595-608.
		"""

		def __init__(self, nodes, pairs):
		def __init__(self, nodes, pairs, pair_edges):
		self.nodes = nodes
		self.pairs = pairs
		self.num_atoms = self.nodes.shape[0]
		self.n_features = self.nodes.shape[1]
		self.pair_edges = pair_edges

		def get_pair_edges(self):
		return self.pair_edges

		def get_pair_features(self):
		return self.pairs

deepchem/feat/tests/test_weave.py

+25 −20

Original line number	Diff line number	Diff line
		@@ -33,6 +33,24 @@ def test_max_pair_distance_pairs():
		assert pair_edges.shape == (2, 9)


		def test_max_pair_distance_infinity():
		"""Test that max pair distance pairs are computed properly with infinity distance."""
		from rdkit import Chem
		# Test alkane
		mol = Chem.MolFromSmiles('CCC')
		# Test distance infinity
		pair_edges = max_pair_distance_pairs(mol, "infinity")
		# Everything is connected at this distance
		assert pair_edges.shape == (2, 9)

		# Test pentane
		mol = Chem.MolFromSmiles('CCCCC')
		# Test distance infinity
		pair_edges = max_pair_distance_pairs(mol, "infinity")
		# Everything is connected at this distance
		assert pair_edges.shape == (2, 25)


		def test_weave_single_carbon():
		"""Test that single carbon atom is featurized properly."""
		mols = ['C']
		@@ -73,8 +91,10 @@ def test_weave_alkane_max_pairs():
		"""Test on simple alkane with max pairs distance cutoff"""
		mols = ['CCC']
		featurizer = dc.feat.WeaveFeaturizer(max_pair_distance=1)
		mol_list = featurizer.featurize(mols)
		mol = mol_list[0]
		#mol_list = featurizer.featurize(mols)
		#mol = mol_list[0]
		from rdkit import Chem
		mol = featurizer._featurize(Chem.MolFromSmiles(mols[0]))

		# 3 carbonds in alkane
		assert mol.get_num_atoms() == 3
		@@ -82,7 +102,9 @@ def test_weave_alkane_max_pairs():
		# Test feature sizes
		assert mol.get_num_features() == 75

		# Should be a 3x3 interaction grid
		# Should be a 7x14 interaction grid since there are 7 pairs within graph
		# distance 1 (3 self interactions plus 2 bonds counted twice because of
		# symmetry)
		assert mol.get_pair_features().shape == (7, 14)


		@@ -105,20 +127,3 @@ def test_carbon_nitrogen():

		# Should be a 3x3 interaction grid
		assert mol.get_pair_features().shape == (5 * 5, 14)


		#def test_alkane_max_pair_distance():
		# """Test on simple alkane with max_pair_distance < infinity"""
		# mols = ['CCC']
		# featurizer = dc.feat.WeaveFeaturizer(max_pair_distance=1)
		# mol_list = featurizer.featurize(mols)
		# mol = mol_list[0]
		#
		# # 3 carbonds in alkane
		# assert mol.get_num_atoms() == 3
		#
		# # Test feature sizes
		# assert mol.get_num_features() == 75
		#
		# # Should be a 3x3 interaction grid
		# assert mol.get_pair_features().shape == (3, 1, 14)

deepchem/models/graph_models.py

+66 −35

Original line number	Diff line number	Diff line
		@@ -197,7 +197,6 @@ class WeaveModel(KerasModel):
		self.n_classes = n_classes

		# Build the model.

		atom_features = Input(shape=(self.n_atom_feat[0],))
		pair_features = Input(shape=(self.n_pair_feat[0],))
		pair_split = Input(shape=tuple(), dtype=tf.int32)
		@@ -277,6 +276,71 @@ class WeaveModel(KerasModel):
		super(WeaveModel, self).__init__(
		model, loss, output_types=output_types, batch_size=batch_size, **kwargs)

		def compute_features_on_batch(self, X_b):
		"""Compute tensors that will be input into the model from featurized representation.

		The featurized input to `WeaveModel` is instances of `WeaveMol` created by
		`WeaveFeaturizer`. This method converts input `WeaveMol` objects into
		tensors used by the Keras implementation to compute `WeaveModel` outputs.

		Parameters
		----------
		X_b: np.ndarray
		A numpy array with dtype=object where elements are `WeaveMol` objects.

		Returns
		-------
		atom_feat: np.ndarray
		Of shape `(N_atoms, N_atom_feat)`.
		pair_feat: np.ndarray
		Of shape `(N_pairs, N_pair_feat)`. Note that `N_pairs` will depend on
		the number of pairs being considered. If `max_pair_distance` is
		"infinity", then this will be `N_atoms**2`. Else it will be the number
		of pairs within the specifed graph distance.
		pair_split: np.ndarray
		Of shape `(N_pairs,)`. The i-th entry in this array will tell you the
		originating atom for this pair (the "source"). Note that pairs are
		symmetric so for a pair `(a, b)`, both `a` and `b` will separately be
		sources at different points in this array.
		atom_split: np.ndarray
		Of shape `(N_atoms,)`. The i-th entry in this array will be the molecule
		with the i-th atom belongs to.
		atom_to_pair: np.ndarray
		Of shape `(N_pairs, 2)`. The i-th row in this array will be the array
		`[a, b]` if `(a, b)` is a pair to be considered. (Note by symmetry, this
		implies some other row will contain `[b, a]`.
		"""
		atom_feat = []
		pair_feat = []
		atom_split = []
		atom_to_pair = []
		pair_split = []
		start = 0
		for im, mol in enumerate(X_b):
		n_atoms = mol.get_num_atoms()
		# pair_edges is of shape (2, N)
		pair_edges = mol.get_pair_edges()
		N_pairs = pair_edges[1]
		# number of atoms in each molecule
		atom_split.extend([im] * n_atoms)
		# index of pair features
		C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
		atom_to_pair.append(pair_edges.T + start)
		# Get starting pair atoms
		pair_starts = pair_edges.T[:, 0]
		# number of pairs for each atom
		pair_split.extend(pair_starts + start)
		start = start + n_atoms

		# atom features
		atom_feat.append(mol.get_atom_features())
		# pair features
		pair_feat.append(mol.get_pair_features())

		return (np.concatenate(atom_feat, axis=0), np.concatenate(
		pair_feat, axis=0), np.array(pair_split), np.array(atom_split),
		np.concatenate(atom_to_pair, axis=0))

		def default_generator(
		self,
		dataset: Dataset,
		@@ -313,40 +377,7 @@ class WeaveModel(KerasModel):
		if self.mode == 'classification':
		y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
		-1, self.n_tasks, self.n_classes)
		atom_feat = []
		pair_feat = []
		atom_split = []
		atom_to_pair = []
		pair_split = []
		start = 0
		for im, mol in enumerate(X_b):
		n_atoms = mol.get_num_atoms()
		# number of atoms in each molecule
		atom_split.extend([im] * n_atoms)
		# index of pair features
		C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
		atom_to_pair.append(
		np.transpose(
		np.array([C1.flatten() + start,
		C0.flatten() + start])))
		# number of pairs for each atom
		pair_split.extend(C1.flatten() + start)
		start = start + n_atoms

		# atom features
		atom_feat.append(mol.get_atom_features())
		# pair features
		pair_feat.append(
		np.reshape(mol.get_pair_features(),
		(n_atoms * n_atoms, self.n_pair_feat[0])))

		inputs = [
		np.concatenate(atom_feat, axis=0),
		np.concatenate(pair_feat, axis=0),
		np.array(pair_split),
		np.array(atom_split),
		np.concatenate(atom_to_pair, axis=0)
		]
		inputs = self.compute_features_on_batch(X_b)
		yield (inputs, [y_b], [w_b])

deepchem/models/tests/test_weave_models.py

+116 −5

Original line number	Diff line number	Diff line
		@@ -39,6 +39,88 @@ def get_dataset(mode='classification', featurizer='GraphConv', num_tasks=2):
		return tasks, ds, transformers, metric


		def test_compute_features_on_infinity_distance():
		"""Test that WeaveModel correctly transforms WeaveMol objects into tensors with infinite max_pair_distance."""
		featurizer = dc.feat.WeaveFeaturizer(max_pair_distance="infinity")
		X = featurizer(["C", "CCC"])
		batch_size = 20
		model = WeaveModel(
		1,
		batch_size=batch_size,
		mode='classification',
		fully_connected_layer_sizes=[2000, 1000],
		batch_normalize=True,
		batch_normalize_kwargs={
		"fused": False,
		"trainable": True,
		"renorm": True
		},
		learning_rage=0.0005)
		atom_feat, pair_feat, pair_split, atom_split, atom_to_pair = model.compute_features_on_batch(
		X)

		# There are 4 atoms each of which have 75 atom features
		assert atom_feat.shape == (4, 75)
		# There are 10 pairs with infinity distance and 14 pair features
		assert pair_feat.shape == (10, 14)
		# 4 atoms in total
		assert atom_split.shape == (4,)
		assert np.all(atom_split == np.array([0, 1, 1, 1]))
		# 10 pairs in total
		assert pair_split.shape == (10,)
		assert np.all(pair_split == np.array([0, 1, 1, 1, 2, 2, 2, 3, 3, 3]))
		# 10 pairs in total each with start/finish
		assert atom_to_pair.shape == (10, 2)
		assert np.all(
		atom_to_pair == np.array([[0, 0], [1, 1], [1, 2], [1, 3], [2, 1], [2, 2],
		[2, 3], [3, 1], [3, 2], [3, 3]]))


		def test_compute_features_on_distance_1():
		"""Test that WeaveModel correctly transforms WeaveMol objects into tensors with finite max_pair_distance."""
		featurizer = dc.feat.WeaveFeaturizer(max_pair_distance=1)
		X = featurizer(["C", "CCC"])
		batch_size = 20
		model = WeaveModel(
		1,
		batch_size=batch_size,
		mode='classification',
		fully_connected_layer_sizes=[2000, 1000],
		batch_normalize=True,
		batch_normalize_kwargs={
		"fused": False,
		"trainable": True,
		"renorm": True
		},
		learning_rage=0.0005)
		atom_feat, pair_feat, pair_split, atom_split, atom_to_pair = model.compute_features_on_batch(
		X)

		# There are 4 atoms each of which have 75 atom features
		assert atom_feat.shape == (4, 75)
		# There are 8 pairs with distance 1 and 14 pair features. (To see why 8,
		# there's the self pair for "C". For "CCC" there are 7 pairs including self
		# connections and accounting for symmetry.)
		assert pair_feat.shape == (8, 14)
		# 4 atoms in total
		assert atom_split.shape == (4,)
		assert np.all(atom_split == np.array([0, 1, 1, 1]))
		# 10 pairs in total
		assert pair_split.shape == (8,)
		print("pair_split")
		print(pair_split)
		print("atom_to_pair")
		print(atom_to_pair)
		# The center atom is self connected and to both neighbors so it appears
		# thrice. The canonical ranking used in MolecularFeaturizer means this
		# central atom is ranked last in ordering.
		assert np.all(pair_split == np.array([0, 1, 1, 2, 2, 3, 3, 3]))
		# 10 pairs in total each with start/finish
		assert atom_to_pair.shape == (8, 2)
		assert np.all(atom_to_pair == np.array([[0, 0], [1, 1], [1, 3], [2, 2],
		[2, 3], [3, 1], [3, 2], [3, 3]]))


		@flaky
		@pytest.mark.slow
		def test_weave_model():
		@@ -84,16 +166,15 @@ def test_weave_regression_model():
		assert scores['mean_absolute_error'] < 0.1


		def test_weave_fit_simple():
		featurizer = dc.feat.WeaveFeaturizer()
		def test_weave_fit_simple_infinity_distance():
		featurizer = dc.feat.WeaveFeaturizer(max_pair_distance="infinity")
		X = featurizer(["C", "CCC"])
		y = np.random.randint(2, size=(2,))
		y = np.array([0, 1.])
		dataset = dc.data.NumpyDataset(X, y)
		tasks, dataset, transformers, metric = get_dataset('classification', 'Weave')

		batch_size = 20
		model = WeaveModel(
		len(tasks),
		1,
		batch_size=batch_size,
		mode='classification',
		fully_connected_layer_sizes=[2000, 1000],
		@@ -105,5 +186,35 @@ def test_weave_fit_simple():
		},
		learning_rage=0.0005)
		model.fit(dataset, nb_epoch=200)
		transformers = []
		metric = dc.metrics.Metric(
		dc.metrics.roc_auc_score, np.mean, mode="classification")
		scores = model.evaluate(dataset, [metric], transformers)
		assert scores['mean-roc_auc_score'] >= 0.9


		def test_weave_fit_simple_distance_1():
		featurizer = dc.feat.WeaveFeaturizer(max_pair_distance=1)
		X = featurizer(["C", "CCC"])
		y = np.array([0, 1.])
		dataset = dc.data.NumpyDataset(X, y)

		batch_size = 20
		model = WeaveModel(
		1,
		batch_size=batch_size,
		mode='classification',
		fully_connected_layer_sizes=[2000, 1000],
		batch_normalize=True,
		batch_normalize_kwargs={
		"fused": False,
		"trainable": True,
		"renorm": True
		},
		learning_rage=0.0005)
		model.fit(dataset, nb_epoch=200)
		transformers = []
		metric = dc.metrics.Metric(
		dc.metrics.roc_auc_score, np.mean, mode="classification")
		scores = model.evaluate(dataset, [metric], transformers)
		assert scores['mean-roc_auc_score'] >= 0.9

Admin message