Merge pull request #2098 from nd-02110114/metrics (9c879e45) · Commits · 钟慕尧 / deepchem

deepchem/dock/docking.py

+3 −3

Original line number	Diff line number	Diff line
		@@ -3,7 +3,7 @@ Docks Molecular Complexes
		"""
		import logging
		import tempfile
		from typing import cast, Generator, Optional, Tuple, Union
		from typing import Generator, Optional, Tuple, Union
		import numpy as np

		from deepchem.utils.typing import RDKitMol
		@@ -128,8 +128,8 @@ class Docker(object):
		# We know use_pose_generator_scores == False in this case
		if self.scoring_model is not None:
		for posed_complex in complexes:
		# NOTE: this cast is a workaround; it has no effect at runtime
		self.featurizer = cast(ComplexFeaturizer, self.featurizer)
		# check whether self.featurizer is instance of ComplexFeaturizer or not
		assert isinstance(self.featurizer, ComplexFeaturizer)
		# TODO: How to handle the failure here?
		(protein_file, ligand_file) = molecular_complex
		features, _ = self.featurizer.featurize([protein_file], [ligand_file])

deepchem/metrics/init.py

+39 −868

File changed.

Preview size limit exceeded, changes collapsed.

deepchem/metrics/genomic_metrics.py

+59 −35

Original line number	Diff line number	Diff line
		"""Evaluation Metrics for Genomics Datasets."""

		from typing import List, Optional
		import numpy as np
		from deepchem.data import NumpyDataset
		from scipy.signal import correlate2d

		from deepchem.models import Model
		from deepchem.data import NumpyDataset


		def get_motif_scores(encoded_sequences,
		motif_names,
		max_scores=None,
		return_positions=False,
		GC_fraction=0.4):
		def get_motif_scores(encoded_sequences: np.ndarray,
		motif_names: List[str],
		max_scores: Optional[int] = None,
		return_positions: bool = False,
		GC_fraction: float = 0.4) -> np.ndarray:
		"""Computes pwm log odds.

		Parameters
		----------
		encoded_sequences : 4darray
		(N_sequences, N_letters, sequence_length, 1) array
		motif_names : list of strings
		encoded_sequences: np.ndarray
		A numpy array of shape `(N_sequences, N_letters, sequence_length, 1)`.
		motif_names: List[str]
		List of motif file names.
		max_scores: int, optional
		return_positions : boolean, optional
		GC_fraction : float, optional
		Get top `max_scores` scores.
		return_positions: bool, default False
		Whether to return positions or not.
		GC_fraction: float, default 0.4
		GC fraction in background sequence.

		Returns
		-------
		(N_sequences, num_motifs, seq_length) complete score array by default.
		If max_scores, (N_sequences, num_motifs*max_scores) max score array.
		If max_scores and return_positions, (N_sequences, 2*num_motifs*max_scores)
		array with max scores and their positions.
		np.ndarray
		A numpy array of complete score. The shape is `(N_sequences, num_motifs, seq_length)` by default.
		If max_scores, the shape of score array is `(N_sequences, num_motifs*max_scores)`.
		If max_scores and return_positions, the shape of the score array with max scores and their positions
		is `(N_sequences, 2*num_motifs*max_scores)`.

		Notes
		-----
		This method requires simdna to be installed.
		"""
		try:
		import simdna
		from simdna import synthetic
		except ModuleNotFoundError:
		raise ValueError("This function requires simdna to be installed.")

		loaded_motifs = synthetic.LoadedEncodeMotifs(
		simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)
		num_samples, _, seq_length, _ = encoded_sequences.shape
		@@ -59,22 +75,23 @@ def get_motif_scores(encoded_sequences,
		return scores


		def get_pssm_scores(encoded_sequences, pssm):
		def get_pssm_scores(encoded_sequences: np.ndarray,
		pssm: np.ndarray) -> np.ndarray:
		"""
		Convolves pssm and its reverse complement with encoded sequences
		and returns the maximum score at each position of each sequence.

		Parameters
		----------
		encoded_sequences: 3darray
		(N_sequences, N_letters, sequence_length, 1) array
		pssm: 2darray
		(4, pssm_length) array
		encoded_sequences: np.ndarray
		A numpy array of shape `(N_sequences, N_letters, sequence_length, 1)`.
		pssm: np.ndarray
		A numpy array of shape `(4, pssm_length)`.

		Returns
		-------
		scores: 2darray
		(N_sequences, sequence_length)
		scores: np.ndarray
		A numpy array of shape `(N_sequences, sequence_length)`.
		"""
		encoded_sequences = encoded_sequences.squeeze(axis=3)
		# initialize fwd and reverse scores to -infinity
		@@ -97,31 +114,36 @@ def get_pssm_scores(encoded_sequences, pssm):
		return scores


		def in_silico_mutagenesis(model, X):
		def in_silico_mutagenesis(model: Model,
		encoded_sequences: np.ndarray) -> np.ndarray:
		"""Computes in-silico-mutagenesis scores

		Parameters
		----------
		model: Model
		This can be any model that accepts inputs of the required shape and produces
		an output of shape (N_sequences, N_tasks).
		X: ndarray
		Shape (N_sequences, N_letters, sequence_length, 1)
		an output of shape `(N_sequences, N_tasks)`.
		encoded_sequences: np.ndarray
		A numpy array of shape `(N_sequences, N_letters, sequence_length, 1)`

		Returns
		-------
		(num_task, N_sequences, N_letters, sequence_length, 1) ISM score array.
		np.ndarray
		A numpy array of ISM scores. The shape is `(num_task, N_sequences, N_letters, sequence_length, 1)`.
		"""
		# Shape (N_sequences, num_tasks)
		wild_type_predictions = model.predict(NumpyDataset(X))
		wild_type_predictions = model.predict(NumpyDataset(encoded_sequences))
		# check whether wild_type_predictions is np.ndarray or not
		assert isinstance(wild_type_predictions, np.ndarray)
		num_tasks = wild_type_predictions.shape[1]
		# Shape (N_sequences, N_letters, sequence_length, 1, num_tasks)
		mutagenesis_scores = np.empty(X.shape + (num_tasks,), dtype=np.float32)
		mutagenesis_scores = np.empty(
		encoded_sequences.shape + (num_tasks,), dtype=np.float32)
		# Shape (N_sequences, num_tasks, 1, 1, 1)
		wild_type_predictions = wild_type_predictions[:, np.newaxis, np.newaxis,
		np.newaxis]
		for sequence_index, (sequence, wild_type_prediction) in enumerate(
		zip(X, wild_type_predictions)):
		zip(encoded_sequences, wild_type_predictions)):

		# Mutates every position of the sequence to every letter
		# Shape (N_letters * sequence_length, N_letters, sequence_length, 1)
		@@ -142,6 +164,8 @@ def in_silico_mutagenesis(model, X):
		mutated_sequences[arange, vertical_repeat, horizontal_cycle, :] = 1
		# make mutant predictions
		mutated_predictions = model.predict(NumpyDataset(mutated_sequences))
		# check whether mutated_predictions is np.ndarray or not
		assert isinstance(mutated_predictions, np.ndarray)
		mutated_predictions = mutated_predictions.reshape(sequence.shape +
		(num_tasks,))
		mutagenesis_scores[

deepchem/metrics/metric.py

0 → 100644

+740 −0

File added.

Preview size limit exceeded, changes collapsed.

deepchem/metrics/score_function.py

0 → 100644

+164 −0

Original line number	Diff line number	Diff line
		"""Evaluation metrics."""

		import numpy as np
		from sklearn.metrics import matthews_corrcoef # noqa
		from sklearn.metrics import recall_score # noqa
		from sklearn.metrics import cohen_kappa_score
		from sklearn.metrics import r2_score # noqa
		from sklearn.metrics import mean_squared_error
		from sklearn.metrics import mean_absolute_error
		from sklearn.metrics import precision_score # noqa
		from sklearn.metrics import precision_recall_curve
		from sklearn.metrics import auc
		from sklearn.metrics import jaccard_score
		from sklearn.metrics import f1_score
		from sklearn.metrics import roc_auc_score # noqa
		from sklearn.metrics import accuracy_score # noqa
		from sklearn.metrics import balanced_accuracy_score # noqa
		from scipy.stats import pearsonr

		# kappa_score is an alias for `sklearn.metrics.cohen_kappa_score`
		kappa_score = cohen_kappa_score


		def pearson_r2_score(y: np.ndarray, y_pred: np.ndarray) -> float:
		    """Return the squared Pearson correlation of labels and predictions.

		    Parameters
		    ----------
		    y: np.ndarray
		        Ground truth array.
		    y_pred: np.ndarray
		        Predicted array.

		    Returns
		    -------
		    float
		        The square of the correlation coefficient reported by
		        `scipy.stats.pearsonr` for `y` and `y_pred`.
		    """
		    # Element 0 of the pearsonr result is the correlation coefficient;
		    # the remaining element (the p-value) is not needed here.
		    correlation = pearsonr(y, y_pred)[0]
		    return correlation**2


		def jaccard_index(y: np.ndarray, y_pred: np.ndarray) -> float:
		    """Compute the Jaccard index (Intersection over Union) of two label arrays.

		    This is a thin wrapper that forwards both arguments unchanged to
		    `jaccard_score`.

		    DEPRECATED: WILL BE REMOVED IN A FUTURE VERSION OF DEEPCHEM. USE `jaccard_score` instead.

		    Parameters
		    ----------
		    y: np.ndarray
		        Ground truth array.
		    y_pred: np.ndarray
		        Predicted array.

		    Returns
		    -------
		    score: float
		        The Jaccard index. A number between 0 and 1.
		    """
		    score = jaccard_score(y, y_pred)
		    return score


		def pixel_error(y: np.ndarray, y_pred: np.ndarray) -> float:
		    """Return an image-label error defined as one minus the F1 score.

		    Both arrays are passed unchanged to `f1_score`; the result is
		    subtracted from 1 so that a perfect prediction yields 0.

		    Parameters
		    ----------
		    y: np.ndarray
		        Ground truth array.
		    y_pred: np.ndarray
		        Predicted array.

		    Returns
		    -------
		    score: float
		        The pixel error, `1 - f1_score(y, y_pred)`. A number between 0 and 1.
		    """
		    f_measure = f1_score(y, y_pred)
		    return 1 - f_measure


		def prc_auc_score(y: np.ndarray, y_pred: np.ndarray) -> float:
		    """Compute the area under the precision-recall curve for class 1.

		    Parameters
		    ----------
		    y: np.ndarray
		        True labels. Either a one-hot array of shape `(N, n_classes)`, in
		        which case column 1 is used, or an array of shape `(N,)` holding
		        the binary labels directly.
		    y_pred: np.ndarray
		        Of shape `(N, n_classes)` with class probabilities. Column 1 is
		        used as the score for the positive class.

		    Returns
		    -------
		    float
		        The area under the precision-recall curve. A number between 0 and 1.
		    """
		    y = np.asarray(y)
		    y_pred = np.asarray(y_pred)
		    # A 1-D label array already is the positive-class indicator; indexing
		    # it with `[:, 1]` would raise IndexError even though `(N,)` input is
		    # part of the documented contract.
		    positive_labels = y if y.ndim == 1 else y[:, 1]
		    precision, recall, _ = precision_recall_curve(positive_labels,
		                                                  y_pred[:, 1])
		    return auc(recall, precision)


		def rms_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
		    """Compute the root-mean-square error.

		    Parameters
		    ----------
		    y_true: np.ndarray
		        Ground truth array.
		    y_pred: np.ndarray
		        Predicted array.

		    Returns
		    -------
		    float
		        The square root of `mean_squared_error(y_true, y_pred)`.
		    """
		    mse = mean_squared_error(y_true, y_pred)
		    return np.sqrt(mse)


		def mae_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
		    """Compute the mean absolute error.

		    Parameters
		    ----------
		    y_true: np.ndarray
		        Ground truth array.
		    y_pred: np.ndarray
		        Predicted array.

		    Returns
		    -------
		    float
		        The value of `mean_absolute_error(y_true, y_pred)`.
		    """
		    mae = mean_absolute_error(y_true, y_pred)
		    return mae


		def bedroc_score(y_true: np.ndarray, y_pred: np.ndarray, alpha: float = 20.0):
		    """Compute the BEDROC metric.

		    BEDROC, as described by Truchon and Bayley, modifies the ROC score by
		    allowing for a factor of early recognition. See [1]_ for details.

		    Parameters
		    ----------
		    y_true: np.ndarray
		        Binary class labels. 1 for positive class, 0 otherwise.
		    y_pred: np.ndarray
		        Predictions; column 1 is taken as the score of each example
		        (the predictions are expected in one-hot layout).
		    alpha: float, default 20.0
		        Early recognition parameter.

		    Returns
		    -------
		    float
		        Value in [0, 1] that indicates the degree of early recognition.

		    Raises
		    ------
		    ValueError
		        If RDKit cannot be imported.
		    AssertionError
		        If the inputs differ in length or the labels are not exactly {0, 1}.

		    Notes
		    -----
		    This function requires RDKit to be installed.

		    References
		    ----------
		    .. [1] Truchon et al. "Evaluating virtual screening methods: good and bad metrics
		       for the “early recognition” problem." Journal of chemical information and modeling
		       47.2 (2007): 488-508.
		    """
		    try:
		        from rdkit.ML.Scoring.Scoring import CalcBEDROC
		    except ModuleNotFoundError:
		        raise ValueError("This function requires RDKit to be installed.")

		    # Validate the raw inputs before any conversion.
		    n_labels, n_predictions = len(y_true), len(y_pred)
		    assert n_labels == n_predictions, 'Number of examples do not match'
		    assert np.array_equal(np.unique(y_true).astype(int), [0, 1]), (
		        'Class labels must be binary: %s' % np.unique(y_true))

		    labels = np.asarray(y_true).flatten()
		    # Index 1 because one_hot predictions
		    positive_scores = np.asarray(y_pred)[:, 1].flatten()

		    # CalcBEDROC receives (label, score) pairs ordered from the highest
		    # score to the lowest; the label sits in column 0 of each pair.
		    ranked_pairs = sorted(
		        zip(labels, positive_scores),
		        key=lambda pair: pair[1],
		        reverse=True)

		    return CalcBEDROC(ranked_pairs, 0, alpha)

Admin message