Commit 939161ff authored by Atreya Majumdar

Updated

parent ed4f76b1
+2 −2
@@ -612,7 +612,7 @@ def test_scale_norm():
  """Test invoking ScaleNorm."""
  input_ar = torch.tensor([[1., 99., 10000.], [0.003, 999.37, 23.]])
  layer = torch_layers.ScaleNorm(0.35)
-  result1 = layer.forward(input_ar)
+  result1 = layer(input_ar)
  output_ar = torch.tensor([[5.9157897e-05, 5.8566318e-03, 5.9157896e-01],
                            [1.7754727e-06, 5.9145141e-01, 1.3611957e-02]])
  assert torch.allclose(result1, output_ar)
+21 −370
@@ -27,33 +27,12 @@ class ScaleNorm(nn.Module):
  --------
  >>> from deepchem.models.torch_models.layers import ScaleNorm
  >>> scale = 0.35
  >>> layer = ScaleNorm(scale)
  >>> input_tensor = torch.tensor([[1.269, 39.36], [0.00918, -9.12]])
  >>> output_tensor = layer(input_tensor)
  """

  def __init__(self, scale: float, eps: float = 1e-5):
    """Initialize a ScaleNorm layer.

    Parameters
@@ -63,68 +42,27 @@ class ScaleNorm(nn.Module):
    eps: float
      Epsilon value. Default = 1e-5.
    """

    super(ScaleNorm, self).__init__()
    self.scale = nn.Parameter(torch.tensor(math.sqrt(scale)))
    self.eps = eps

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    norm = self.scale / torch.norm(x, dim=-1, keepdim=True).clamp(min=self.eps)
    return x * norm
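
For intuition, the forward pass above rescales every row of x to the same L2 norm (the learned scale parameter), with eps guarding against division by zero. A minimal standalone sketch with arbitrary example values, using only plain PyTorch:

import torch

x = torch.tensor([[3.0, 4.0]])       # row with L2 norm 5.0
scale = torch.tensor(0.35).sqrt()    # the layer stores sqrt(scale) as its learnable parameter
eps = 1e-5
norm = scale / torch.norm(x, dim=-1, keepdim=True).clamp(min=eps)
print(x * norm)                      # tensor([[0.3550, 0.4733]]) -- the row now has norm sqrt(0.35)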


class MultiHeadedMATAttention(nn.Module):
  """First constructs an attention layer tailored to the Molecular Attention Transformer [1]_ and then converts it into Multi-Headed Attention.

  In Multi-Headed attention, the attention mechanism is applied multiple times in parallel through the multiple attention heads.
  Thus, different subsequences of a given sequence can be processed differently.
  The query, key and value parameters are split multiple ways and each split is passed separately through a different attention head.

  References
  ----------
  .. [1] Lukasz Maziarka et al. "Molecule Attention Transformer" Graph Representation Learning workshop and Machine Learning and the Physical Sciences workshop at NeurIPS 2019. 2020. https://arxiv.org/abs/2002.08264
  Examples
  --------
  >>> from deepchem.models.torch_models.layers import MultiHeadedMATAttention
  >>> from rdkit import Chem
  >>> mol = Chem.MolFromSmiles("CC")
  >>> adj_matrix = Chem.GetAdjacencyMatrix(mol)
@@ -132,22 +70,9 @@ class MultiHeadedMATAttention(nn.Module):
  >>> layer = MultiHeadedMATAttention(dist_kernel='softmax', lambda_attention=0.33, lambda_distance=0.33, h=2, hsize=2, dropout_p=0.0)
  >>> input_tensor = torch.tensor([[1., 2.], [5., 6.]])
  >>> mask = torch.tensor([[1., 1.], [1., 1.]])
  >>> result = layer(input_tensor, input_tensor, input_tensor, mask, adj_matrix, distance_matrix, 0.0)
  """

  def __init__(self,
               dist_kernel: str = 'softmax',
               lambda_attention: float = 0.33,
@@ -157,144 +82,23 @@ class MultiHeadedMATAttention(nn.Module):
               dropout_p: float = 0.0,
               output_bias: bool = True):
    """Initialize a multi-headed attention layer.

    Parameters
    ----------
    dist_kernel: str
      Kernel activation to be used. Can be either 'softmax' for softmax or 'exp' for exponential.
    lambda_attention: float
      Constant to be multiplied with the attention matrix.
    lambda_distance: float
      Constant to be multiplied with the distance matrix.
    h: int
      Number of attention heads.
    hsize: int
      Size of dense layer.
    dropout_p: float
      Dropout probability.
    output_bias: bool
      If True, dense layers will use bias vectors.
    """

    super().__init__()
    if dist_kernel == "softmax":
      self.dist_kernel = lambda x: torch.softmax(-x, dim=-1)

@@ -321,48 +125,16 @@ class MATEncoderLayer(nn.Module):
                        eps: float = 1e-6,
                        inf: float = 1e12) -> Tuple[torch.Tensor, torch.Tensor]:
    """Defining and computing output for a single MAT attention layer.

    Parameters
    ----------
    query: torch.Tensor
      Standard query parameter for attention.
    key: torch.Tensor
      Standard key parameter for attention.
    value: torch.Tensor
      Standard value parameter for attention.
    mask: torch.Tensor
      Masks out padding values so that they are not taken into account when computing the attention score.
    adj_matrix: np.ndarray
      Adjacency matrix of the input molecule, returned from dc.feat.MATFeaturizer()
    dist_matrix: np.ndarray
@@ -373,20 +145,7 @@ class SublayerConnection(nn.Module):
      Epsilon value
    inf: float
      Value of infinity to be used.
    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

@@ -421,15 +180,8 @@ class SublayerConnection(nn.Module):
              eps: float = 1e-6,
              inf: float = 1e12) -> torch.Tensor:
    """Output computation for the MultiHeadedAttention layer.

    Parameters
    ----------
    query: torch.Tensor
      Standard query parameter for attention.
    key: torch.Tensor
@@ -448,84 +200,21 @@ class SublayerConnection(nn.Module):
      Epsilon value
    inf: float
      Value of infinity to be used.
    """
    if mask is not None:
      mask = mask.unsqueeze(1)

    batch_size = query.size(0)

    query, key, value = [
        layer(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        for layer, x in zip(self.linear_layers, (query, key, value))
    ]

    x, _ = self._single_attention(query, key, value, mask, adj_matrix,
                                  distance_matrix, dropout_p, eps, inf)
    x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)

    return self.output_linear(x)

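The lambda_attention and lambda_distance constants documented above weight the molecule-specific terms that MAT adds to standard scaled dot-product attention. A rough sketch of how the three matrices are combined in the MAT paper; the adjacency weight (1 - lambda_attention - lambda_distance) and the exact broadcasting are assumptions here, not taken from this diff:

import math
import torch

def mat_attention_weights(query, key, adj_matrix, distance_matrix,
                          lambda_attention=0.33, lambda_distance=0.33):
  # Standard scaled dot-product scores, as in the snippet above.
  d_k = query.size(-1)
  scores = torch.softmax(torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k), dim=-1)
  # Distance term passed through a kernel such as softmax(-D); the adjacency and distance
  # matrices are assumed to be float tensors of compatible shape.
  dist_term = torch.softmax(-distance_matrix, dim=-1)
  lambda_adjacency = 1.0 - lambda_attention - lambda_distance
  return (lambda_attention * scores + lambda_distance * dist_term +
          lambda_adjacency * adj_matrix)
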
class MATEncoderLayer(nn.Module):
  """Encoder layer for use in the Molecular Attention Transformer [1]_.

  The MATEncoder layer is formed by adding self-attention and feed-forward to the encoder block.
  It is the basis of the MATEncoder block.

  References
  ----------
  .. [1] Lukasz Maziarka et al. "Molecule Attention Transformer" Graph Representation Learning workshop and Machine Learning and the Physical Sciences workshop at NeurIPS 2019. 2020. https://arxiv.org/abs/2002.08264

  Examples
  --------
  >>> import deepchem as dc
  >>> layer = dc.models.torch_models.layers.MATEncoderLayer(dist_kernel = 'softmax', lambda_attention = 0.33, lambda_distance = 0.33, h = 8, sa_hsize = 1024, sa_dropout_p = 0.1, d_input = 1024, activation = 'relu', n_layers = 1, ff_dropout_p = 0.1, encoder_hsize = 1024, encoder_dropout_p = 0.1)
  """

  def __init__(self, dist_kernel, lambda_attention, lambda_distance, h,
               sa_hsize, sa_dropout_p, output_bias, d_input, d_hidden, d_output,
               activation, n_layers, ff_dropout_p, encoder_hsize,
               encoder_dropout_p):
    """Initialize a MATEncoder layer.

    Parameters
    ----------
    dist_kernel: str
      Kernel activation to be used. Can be either 'softmax' for softmax or 'exp' for exponential, for the self-attention layer.
    lambda_attention: float
      Constant to be multiplied with the attention matrix in the self-attention layer.
    lambda_distance: float
      Constant to be multiplied with the distance matrix in the self-attention layer.
    h: int
      Number of attention heads for the self-attention layer.
    sa_hsize: int
      Size of dense layer in the self-attention layer.
    sa_dropout_p: float
      Dropout probability for the self-attention layer.
    output_bias: bool
      If True, dense layers will use bias vectors in the self-attention layer.
    d_input: int
      Size of input layer in the feed-forward layer.
    d_hidden: int
      Size of hidden layer in the feed-forward layer.
    d_output: int
      Size of output layer in the feed-forward layer.
    activation: str
      Activation function to be used in the feed-forward layer.
      Can choose between 'relu' for ReLU, 'leakyrelu' for LeakyReLU, 'prelu' for PReLU,
      'tanh' for TanH, 'selu' for SELU, 'elu' for ELU and 'linear' for linear activation.
    n_layers: int
      Number of layers in the feed-forward layer.
    ff_dropout_p: float
      Dropout probability in the feed-forward layer.
    encoder_hsize: int
      Size of Dense layer for the encoder itself.
    encoder_dropout_p: float
      Dropout probability for connections in the encoder layer.
    """

    super(MATEncoderLayer, self).__init__()
    self.self_attn = MultiHeadedMATAttention(dist_kernel, lambda_attention,
                                             lambda_distance, h, sa_hsize,
                                             sa_dropout_p, output_bias)
    self.feed_forward = PositionwiseFeedForward(
        d_input, d_hidden, d_output, activation, n_layers, ff_dropout_p)
    layer = SublayerConnection(size=encoder_hsize, dropout_p=encoder_dropout_p)
    self.sublayer = nn.ModuleList([layer for _ in range(2)])
    self.size = encoder_hsize

  def forward(self, x, mask, **kwargs):
    """Output computation for the MATEncoder layer.

    Parameters
    ----------
    x: torch.Tensor
      Input tensor.
    mask: torch.Tensor
      Masks out padding values so that they are not taken into account when computing the attention score.
    """
    x = self.sublayer[0](x,
                         lambda x: self.self_attn(x, x, x, mask=mask, **kwargs))
    return self.sublayer[1](x, self.feed_forward)
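
The forward pass above is the usual Transformer encoder pattern: a self-attention sublayer followed by a feed-forward sublayer, each wrapped in a residual connection. Stripped of the MAT-specific arguments, a sketch of that pattern with placeholder modules (none of these names come from this file) looks like:

import torch
import torch.nn as nn

class TinyEncoderLayer(nn.Module):
  def __init__(self, size: int):
    super().__init__()
    self.attn = nn.MultiheadAttention(size, num_heads=1, batch_first=True)
    self.ff = nn.Sequential(nn.Linear(size, size), nn.ReLU(), nn.Linear(size, size))
    self.norm1, self.norm2 = nn.LayerNorm(size), nn.LayerNorm(size)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    # Residual around attention, then residual around the feed-forward block.
    h = self.norm1(x)
    x = x + self.attn(h, h, h)[0]
    return x + self.ff(self.norm2(x))

print(TinyEncoderLayer(8)(torch.randn(2, 5, 8)).shape)  # torch.Size([2, 5, 8])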


class SublayerConnection(nn.Module):
  """SublayerConnection layer which establishes a residual connection, as used in the Molecular Attention Transformer [1]_.

  The SublayerConnection layer is a residual layer which is then passed through Layer Normalization.
  The residual connection is established by computing the dropout-adjusted layer output of a normalized input tensor and adding this to the original input tensor.

  References
  ----------
  .. [1] Lukasz Maziarka et al. "Molecule Attention Transformer" Graph Representation Learning workshop and Machine Learning and the Physical Sciences workshop at NeurIPS 2019. 2020. https://arxiv.org/abs/2002.08264

  Examples
  --------
  >>> import torch
  >>> from torch import nn
  >>> import deepchem as dc
  >>> layer = dc.models.torch_models.layers.SublayerConnection(2, 0.)
  >>> output = layer(torch.Tensor([1.,2.]), nn.Linear(2,1))
  """

  def __init__(self, size, dropout_p):
    """Initialize a SublayerConnection Layer.

    Parameters
    ----------
    size: int
      Size of layer.
    dropout_p: float
      Dropout probability.
    """

    super(SublayerConnection, self).__init__()
    self.norm = nn.LayerNorm(size)
    self.dropout_p = nn.Dropout(dropout_p)

  def forward(self, x, sublayer):
    """Output computation for the SublayerConnection layer.

    Takes an input tensor x, then adds the dropout-adjusted sublayer output for normalized x to it.

    Parameters
    ----------
    x: torch.Tensor
      Input tensor.
    sublayer: nn.Module
      Layer whose output for normalized x will be added to x.
    """
    return x + self.dropout_p(sublayer(self.norm(x)))
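
In other words, the sublayer output is computed on the layer-normalized input and then added back to the untouched input. A quick self-contained check of that residual pattern (shapes and modules are arbitrary examples, not taken from this file):

import torch
import torch.nn as nn

x = torch.randn(4, 8)
norm, drop, sublayer = nn.LayerNorm(8), nn.Dropout(0.0), nn.Linear(8, 8)
out = x + drop(sublayer(norm(x)))  # same pattern as the forward above
print(out.shape)                   # torch.Size([4, 8]); with p=0.0 this equals x + sublayer(norm(x))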


class PositionwiseFeedForward(nn.Module):
  """PositionwiseFeedForward is a layer used to define the position-wise feed-forward (FFN) algorithm for the Molecular Attention Transformer [1]_

  Each layer in the MAT encoder contains a fully connected feed-forward network which applies two linear transformations and the given activation function.
  This is done in addition to the SublayerConnection module.

  References
  ----------
  .. [1] Lukasz Maziarka et al. "Molecule Attention Transformer" Graph Representation Learning workshop and Machine Learning and the Physical Sciences workshop at NeurIPS 2019. 2020. https://arxiv.org/abs/2002.08264

  Examples
  --------
  >>> import deepchem as dc
  >>> feed_fwd_layer = dc.models.torch_models.layers.PositionwiseFeedForward(d_input = 1024, d_hidden = None, d_output = None, activation = 'relu', n_layers = 1, dropout_p = 0.1)
  """

  def __init__(self,
               *,
               d_input,
               d_hidden=None,
               d_output=None,
               activation,
               n_layers,
               dropout_p):
    """Initialize a PositionwiseFeedForward layer.

    Parameters
    ----------
    d_input: int
      Size of input layer.
    d_hidden: int
      Size of hidden layer.
    d_output: int
      Size of output layer.
    activation: str
      Activation function to be used. Can choose between 'relu' for ReLU, 'leakyrelu' for LeakyReLU, 'prelu' for PReLU,
      'tanh' for TanH, 'selu' for SELU, 'elu' for ELU and 'linear' for linear activation.
    n_layers: int
      Number of layers.
    dropout_p: float
      Dropout probability.
    """

    super(PositionwiseFeedForward, self).__init__()

    if activation == 'relu':
      self.activation = nn.ReLU()

    elif activation == 'leakyrelu':
      self.activation = nn.LeakyReLU(0.1)

    elif activation == 'prelu':
      self.activation = nn.PReLU()

    elif activation == 'tanh':
      self.activation = nn.Tanh()

    elif activation == 'selu':
      self.activation = nn.SELU()

    elif activation == 'elu':
      self.activation = nn.ELU()

    elif activation == "linear":
      self.activation = lambda x: x

    self.n_layers: int = n_layers
    d_output = d_output if d_output is not None else d_input
    d_hidden = d_hidden if d_hidden is not None else d_input

    if n_layers == 1:
      self.linears: Any = [nn.Linear(d_input, d_output)]
@@ -805,18 +484,8 @@ class PositionwiseFeedForward(nn.Module):
    self.linears = nn.ModuleList(self.linears)
    dropout_layer = nn.Dropout(dropout_p)
    self.dropout_p = nn.ModuleList([dropout_layer for _ in range(n_layers)])

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Output Computation for the PositionwiseFeedForward layer.

    Parameters
@@ -824,23 +493,6 @@ class PositionwiseFeedForward(nn.Module):
    x: torch.Tensor
      Input tensor.
    """
    if not self.n_layers:
      return x

    elif self.n_layers == 1:
      return self.dropout_p[0](self.activation(self.linears[0](x)))

    else:
      for i in range(self.n_layers - 1):
        x = self.dropout_p[i](self.activation(self.linears[i](x)))
      return self.linears[-1](x)
+0 −13
@@ -125,23 +125,10 @@ Torch Layers
.. autoclass:: deepchem.models.torch_models.layers.ScaleNorm
  :members:

.. autoclass:: deepchem.models.torch_models.layers.MATEncoderLayer
  :members:

.. autoclass:: deepchem.models.torch_models.layers.MultiHeadedMATAttention
  :members:

.. autoclass:: deepchem.models.torch_models.layers.SublayerConnection