docs change (c7ae000b) · Commits · 钟慕尧 / deepchem

deepchem/models/tf_new_models/DTNN_regressor.py

+2 −0

Original line number	Diff line number	Diff line
		@@ -13,6 +13,8 @@ class DTNNGraphRegressor(MultitaskGraphRegressor):

		feat = self.model.return_outputs()
		feat_size = self.feat_dim
		# dimension of `feat` becomes Unknown after tf.tensordot operation
		# need to define dimension of W and b explicitly
		outputs = []
		W_list = []
		b_list = []

deepchem/models/tf_new_models/graph_topology.py

+63 −59

Original line number	Diff line number	Diff line
		@@ -324,7 +324,6 @@ class DAGGraphTopology(GraphTopology):
		feed_dict : dict
		Can be merged with other feed_dicts for input into tensorflow
		"""
		# Merge mol conv objects

		atoms_per_mol = [mol.get_num_atoms() for mol in batch]
		n_atom_features = batch[0].get_atom_features().shape[1]
		@@ -336,41 +335,42 @@ class DAGGraphTopology(GraphTopology):
		axis=0)

		atoms_all = []
		parents_all = []
		# calculation orders for a batch of molecules
		parents_all = []
		calculation_orders = []
		for idm, mol in enumerate(batch):
		# padding atom features vector of each molecule with 0
		atom_features_padded = np.concatenate(
		[
		mol.get_atom_features(), np.zeros(
		(self.max_atoms - atoms_per_mol[idm], n_atom_features))
		],
		axis=0)
		# padding atom features vector of each molecule with 0
		atoms_all.append(atom_features_padded)

		parents = self.UG_to_DAG(mol)
		# calculation orders for DAGs
		assert len(parents) == atoms_per_mol[idm]
		parents = self.UG_to_DAG(mol)
		# number of DAGs should equal number of atoms
		assert len(parents) == atoms_per_mol[idm]
		parents_all.extend(parents[:])
		# padding with `max_atoms`
		parents_all.extend([
		self.max_atoms * np.ones((self.max_atoms, self.max_atoms), dtype=int)
		for i in range(self.max_atoms - atoms_per_mol[idm])
		])
		# padding with max_atoms
		for parent in parents:
		calculation_orders.append(self.indice_changing(parent[:, 0], idm))
		# the indice for a specific atom in the molecule's DAGs and atom_features_placeholder
		# is different, this function changes the indice from the position in current molecule(DAGs)
		# to position in batch of molecules(atom_features_placeholder)
		# and this is only going to be used in tf.gather on atom_features_placeholder
		# index for an atom in `parents_all` and `atoms_all` is different,
		# this function changes the index from the position in current molecule(DAGs, `parents_all`)
		# to position in batch of molecules(`atoms_all`)
		# only used in tf.gather on `atom_features_placeholder`
		calculation_orders.append(self.index_changing(parent[:, 0], idm))

		# padding with `batch_size*max_atoms`
		calculation_orders.extend([
		self.batch_size * self.max_atoms * np.ones(
		(self.max_atoms,), dtype=int)
		for i in range(self.max_atoms - atoms_per_mol[idm])
		])
		# padding with (batch_size*max_atoms)

		atoms_all = np.concatenate(atoms_all, axis=0)
		parents_all = np.stack(parents_all, axis=0)
		@@ -384,9 +384,9 @@ class DAGGraphTopology(GraphTopology):

		return atoms_dict

		def indice_changing(self, indice, n_mol):
		output = np.zeros_like(indice)
		for ide, element in enumerate(indice):
		def index_changing(self, index, n_mol):
		output = np.zeros_like(index)
		for ide, element in enumerate(index):
		if element < self.max_atoms:
		output[ide] = element + n_mol * self.max_atoms
		else:
		@@ -396,82 +396,86 @@ class DAGGraphTopology(GraphTopology):
		def UG_to_DAG(self, sample):
		"""This function generates the DAGs for a molecule
		"""
		# list of calculation orders for DAGs
		# stemming from one specific atom in the molecule
		parents = []
		# list of DAGs, one DAG represents the calculation orders
		# stemming from one specific atom in the molecule,
		# hence this list include k elements for a molecule with k atoms
		UG = sample.get_adjacency_list()
		# starting from the adjacency list derived by graphconv featurizer
		UG = sample.get_adjacency_list()
		# number of atoms, also number of DAGs
		n_atoms = sample.get_num_atoms()
		# number of graphs need to be generated
		# DAG on a molecule with k atoms includes k steps of calculation,
		# each step calculating graph features for one atom.
		# `max_atoms` is the maximum number of steps
		max_atoms = self.max_atoms
		# for a graph on a molecule with k atoms, there will be k steps,
		# each step calculate graph features for one atom,
		# maximum number of steps is the same as max_atoms
		for count in range(n_atoms):
		# each iteration generates one DAG
		# stemming from atom with indice `count`
		# each iteration generates the DAG starting from atom with index `count`
		DAG = []
		# list of lists, elements represent the calculation orders
		# for atoms in the current graph
		parent = [[] for i in range(n_atoms)]
		# list of lists, each element(also a list) represents the calculation order
		# for every atom in the molecule in the current graph
		# starting from the target atom with index `count`
		current_atoms = [count]
		# starting from the atom with indice `count`
		# flags of whether the atom is already included in the DAG
		atoms_indicator = np.ones((n_atoms,))
		# flags, whether the atom is already included in the DAG
		atoms_indicator[count] = 0
		# atom `count` is in the DAG
		radial = 0
		atoms_indicator[count] = 0
		# recording number of radial propagation steps
		radial = 0
		while np.sum(atoms_indicator) > 0:
		# in this while loop, atoms directly connected to `count` will be first added into
		# the DAG(radial=0), then atoms two-bond away from `count` will be added in the
		# second loop(radial=1). Atoms i-bond away will be added in loop i
		# in the first loop, atoms directly connected to `count` will be added
		# into the DAG(radial=0), then atoms two-bond away from `count`
		# will be added in the second loop(radial=1).
		# atoms i-bond away will be added in i-th loop
		if radial > n_atoms:
		# when molecules have separate parts, starting from one part,
		# it is not possible to include all atoms.
		# this break quit the loop when going into such condition
		break
		# when molecules have separate parts, starting from one part, it is not possible
		# to include all atoms.
		next_atoms = []
		# reinitialize targets for next iteration
		next_atoms = []
		for current_atom in current_atoms:
		for atom_adj in UG[current_atom]:
		# atoms connected to current_atom
		if atoms_indicator[atom_adj] > 0:
		# generate the dependency map of current DAG
		# atoms connected to `current_atoms`(and not included in the DAG)
		# are added, and will be the `current_atoms` for next iteration.
		DAG.append((current_atom, atom_adj))
		# this for loop generates the dependency map of this DAG
		# atoms that connected to current_atoms(and not included in the DAG yet)
		# are added into DAG, and will be the current_atoms for next iteration.
		atoms_indicator[atom_adj] = 0
		next_atoms.append(atom_adj)
		# including into targets for next iteration
		current_atoms = next_atoms
		# into next iteration, finding atoms connected one more bond away
		radial = radial + 1
		# DAG starts from the target atom, calculation should go in reverse
		for edge in reversed(DAG):
		# DAG starts from the target atom, hence the calculation should go in reverse
		# `edge[1]` is the parent of `edge[0]`
		parent[edge[0]].append(edge[1])
		# edge[1] is the parent of edge[0]
		# all the parents of `edge[1]` are also the parents of `edge[0]`
		parent[edge[0]].extend(parent[edge[1]])
		# all the parents of edge[1] is also the parents of edge[0]
		# after this for loop, parents[i] is the list that includes all parents of atom i
		# after this loop, `parents[i]` includes all parents of atom i

		for ids, atom in enumerate(parent):
		# manually adding the atom index into its parents list
		parent[ids].insert(0, ids)
		# manually adding the atom indice into its parents list
		# after this for loop, parents[i][0] = i, parents[i][1:] are all parents of atom i
		# after this loop, `parents[i][0]` is i, `parents[i][1:]` are all parents of atom i

		# atoms with less parents(farther from the target atom) come first.
		# graph features of atoms without parents will be first calculated,
		# then atoms with more parents can be calculated in order
		# based on previously calculated graph features.
		# target atom of this DAG will be calculated in the last step
		parent = sorted(parent, key=len)
		# key part of this function, atoms with less parents come first,
		# so when we do a for loop on the list , atoms without parents will be first calculated
		# then atoms with more parents can be calculated based on calculated graph features.
		# the starting atom of this DAG will be calculated in the last step,
		# since every other atom is its parent.

		for ids, atom in enumerate(parent):
		n_par = len(atom)
		# padding with `max_atoms`
		parent[ids].extend([max_atoms for i in range(max_atoms - n_par)])
		# padding with max_atoms

		while len(parent) < max_atoms:
		parent.insert(0, [max_atoms] * max_atoms)
		# padding
		parent.insert(0, [max_atoms] * max_atoms)
		# `parents[i]` is the calculation order for the DAG stemming from atom i,
		# which is a max_atoms * max_atoms numpy array after padding
		parents.append(np.array(parent))
		# parents[i] is the calculation order for the DAG stemming from atom i,
		# which is a max_atoms * max_atoms numpy array(after padding)

		return parents

deepchem/nn/layers.py

+43 −38

Original line number	Diff line number	Diff line
		@@ -1082,79 +1082,84 @@ class DAGLayer(Layer):
		self.build()

		# Extract atom_features
		atom_features = x[0]
		# Basic features of every atom: (batch_size*max_atoms) * n_atom_features
		atom_features = x[0]

		# calculation orders of graph: (batch_size*max_atoms) * max_atoms * max_atoms
		# each atom corresponds to a graph, which is represented by the `max_atoms*max_atoms` int32 matrix of indices
		# each graph includes `max_atoms` steps (corresponding to rows) of calculating graph features
		# step i calculates the graph features for atoms of index `parents[:,i,0]`
		parents = x[1]
		# Structure of graph: (batch_size*max_atoms) * max_atoms * max_atoms
		# each atom corresponds to a graph, which is represented by the max_atoms * max_atoms int32 matrix of indices
		# there are in total max_atoms number of steps(corresponding to rows) in calculating the graph outputs
		# in step i, we calculate the graph features of atom(i,0)
		# from inputs: atom features of atom(i,0), graph_features of this atom's parents in the graph(atom(i,1) through atom(i,max_atoms))
		# if number of parents is less than max_atoms-1, padded it with max_atoms, representing a dummy with all zeros)
		calculation_orders = x[2]
		# (batch_size*max_atoms) * max_atoms
		# indices of atom(i,0)
		# represent the same atoms of parents[:, :, 0], different in that the indices are for atom_features(0~max_atoms*batch_size)

		# target atoms for each step: (batch_size*max_atoms) * max_atoms
		# represent the same atoms of `parents[:, :, 0]`,
		# different in that these indices are positions in `atom_features`
		# padded with max_atoms*batch_size
		calculation_orders = x[2]
		# flags: (batch_size*max_atoms)
		# 0 for paddings, 1 for real atoms
		membership = x[3]
		# (batch_size*max_atoms)
		# 0 for dummy atoms, 1 for real atoms
		# number of atoms in total, should equal `batch_size*max_atoms`
		n_atoms = atom_features.get_shape()[0]
		# number of atoms in total, =batch_size

		# initialize graph features for each graph
		# another row of zeros is generated for padded dummy atoms
		graph_features = tf.Variable(
		tf.constant(0., shape=(n_atoms, self.max_atoms + 1, self.n_graph_feat)),
		trainable=False)
		# Initialize graph features for atoms in the molecule for each graph
		# for each graph, another row of zeros is generated as the dummy
		# add dummy
		atom_features = tf.concat(
		axis=0,
		values=[
		atom_features, tf.constant(0., shape=(1, self.n_atom_features))
		])
		# dummy
		for count in range(self.max_atoms):
		# count-th step
		# `count`-th step
		# extracting atom features of target atoms: (batch_size*max_atoms) * n_atom_features
		batch_atom_features = tf.gather(atom_features,
		calculation_orders[:, count])
		# extracting atom features of target atoms, shape: (batch_size*max_atoms) * n_atom_features

		indice = tf.stack(
		# generating index for graph features used in the inputs
		index = tf.stack(
		[
		tf.reshape(
		tf.stack([tf.range(n_atoms)] * (self.max_atoms - 1), axis=1),
		[-1]), tf.reshape(parents[:, count, 1:], [-1])
		],
		axis=1)
		# generating indices for graph features used in the inputs
		# extracting graph features for parents of the target atoms, then flatten
		# shape: (batch_size*max_atoms) * [(max_atoms-1)*n_graph_features]
		batch_graph_features = tf.reshape(
		tf.gather_nd(graph_features, indice),
		tf.gather_nd(graph_features, index),
		[-1, (self.max_atoms - 1) * self.n_graph_feat])
		# extracting graph features of the parents of the target atoms, then flatten
		# shape: (batch_size*max_atoms) * [(max_atoms-1)*n_graph_features]

		# concat into the input tensor: (batch_size*max_atoms) * n_inputs
		batch_inputs = tf.concat(
		axis=1, values=[batch_atom_features, batch_graph_features])
		# concat into the input tensor, shape: (batch_size*max_atoms) * n_inputs
		# DAGgraph_step maps from batch_inputs to a batch of graph_features
		# of shape: (batch_size*max_atoms) * n_graph_features
		# representing the graph features of target atoms in each graph
		batch_outputs = self.DAGgraph_step(batch_inputs, self.W_list, self.b_list)
		# DAGgraph_step mapping from batch_inputs to a batch of graph_features
		# shape: (batch_size*max_atoms) * n_graph_features
		# representing the graph features of the target atoms in each graph
		target_indices = tf.stack(

		# index for target atoms
		target_index = tf.stack(
		[tf.range(n_atoms), parents[:, count, 0]], axis=1)
		target_indices2 = tf.stack(
		# index for dummies
		target_index2 = tf.stack(
		[tf.range(n_atoms), tf.constant(self.max_atoms, shape=(n_atoms,))],
		axis=1)
		graph_features = tf.scatter_nd_update(graph_features, target_indices,
		batch_outputs)
		# update the graph features for target atoms
		graph_features = tf.scatter_nd_update(graph_features, target_indices2,
		graph_features = tf.scatter_nd_update(graph_features, target_index,
		batch_outputs)
		# recover dummies to zeros if being updated
		graph_features = tf.scatter_nd_update(graph_features, target_index2,
		tf.zeros(
		(n_atoms, self.n_graph_feat)))
		# recover dummies to zeros if being updated

		# last step generates graph features for all target atoms
		# masking the outputs
		outputs = tf.multiply(batch_outputs,
		tf.expand_dims(tf.to_float(membership), axis=1))
		# masking the outputs of the last step
		return outputs

		def DAGgraph_step(self, batch_inputs, W_list, b_list):

examples/delaney/delaney_DAG.py

+1 −1

Original line number	Diff line number	Diff line
		"""
		Script that trains graph-conv models on Tox21 dataset.
		Script that trains DAG models on delaney dataset.
		"""
		from __future__ import print_function
		from __future__ import division

examples/tox21/tox21_DAG.py

+1 −1

Original line number	Diff line number	Diff line
		"""
		Script that trains graph-conv models on Tox21 dataset.
		Script that trains DAG models on Tox21 dataset.
		"""
		from __future__ import print_function
		from __future__ import division

Admin message