Commit 5273d873 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Passing singletask/multitask test suite.

parent 82f05b69
Loading
Loading
Loading
Loading
+12 −17
Original line number Diff line number Diff line
@@ -117,7 +117,6 @@ class Model(object):
              "model_type": self.model_type}
    save_to_disk(params, Model.get_params_filename(out_dir))

  # TODO(rbharath): This training is currently broken w.r.t minibatches! Fix.
  def fit(self, dataset):
    """
    Fits a model on data in a Dataset object.
@@ -130,7 +129,8 @@ class Model(object):
      for i, (X, y, w, _) in enumerate(dataset.itershards()):
        print("Training on batch-%s/epoch-%s" % (str(i+1), str(epoch+1)))
        nb_sample = np.shape(X)[0]
        interval_points = np.linspace(0,nb_sample, np.ceil(float(nb_sample)/batch_size)+1).astype(int)
        interval_points = np.linspace(
            0, nb_sample, np.ceil(float(nb_sample)/batch_size)+1).astype(int)
        for j in range(len(interval_points)-1):
          indices = range(interval_points[j],interval_points[j+1])
          X_batch = X[indices, :]
@@ -154,21 +154,16 @@ class Model(object):
                           + ["y_means", "y_stds"])
    pred_y_df = pd.DataFrame(columns=column_names)

    # TODO(rbharath/enf): This is only for GPU models, and is currently depends
    # on magic numbers.
    MAX_GPU_RAM = float(691007488/50)
    batch_size = self.model_params["batch_size"]
    for (X, y, w, ids) in dataset.itershards():
      if sys.getsizeof(X) > MAX_GPU_RAM:
        nb_block = float(sys.getsizeof(X))/MAX_GPU_RAM
      nb_sample = np.shape(X)[0]
        interval_points = np.linspace(0,nb_sample,nb_block+1).astype(int)
      interval_points = np.linspace(
          0, nb_sample, np.ceil(float(nb_sample)/batch_size)+1).astype(int)
      y_preds = []
      for j in range(0,len(interval_points)-1):
        indices = range(interval_points[j],interval_points[j+1])
        y_preds.append(self.predict_on_batch(X[indices,:]))
      y_pred = np.concatenate(y_preds)
      else:
        y_pred = self.predict_on_batch(X)
      y_pred = np.reshape(y_pred, np.shape(y))

      shard_df = pd.DataFrame(columns=column_names)
+8 −1
Original line number Diff line number Diff line
@@ -136,8 +136,15 @@ class MultiTaskDNN(KerasModel):
    nb_tasks = len(sorted_tasks)
    y_pred = np.zeros((nb_samples, nb_tasks))
    for ind, task in enumerate(sorted_tasks):
      task_type = self.task_types[task]
      taskname = "task%d" % ind
      y_pred[:,ind] = np.squeeze(y_pred_dict[taskname])
      if task_type == "classification":
        # Class probabilities are predicted for classification outputs. Instead,
        # output the most likely class.
        y_pred_task = np.squeeze(np.argmax(y_pred_dict[taskname], axis=1))
      else:
        y_pred_task = np.squeeze(y_pred_dict[taskname])
      y_pred[:,ind] = y_pred_task
    y_pred = np.squeeze(y_pred)
    return y_pred

+1 −1
Original line number Diff line number Diff line
@@ -59,7 +59,7 @@ class SklearnModel(Model):
      Xs.append(X)
      ys.append(y)
    X = np.concatenate(Xs)
    y = np.concatenate(ys)
    y = np.concatenate(ys).ravel()
    self.raw_model.fit(X, y)

  def predict_on_batch(self, X):
+22 −18
Original line number Diff line number Diff line
@@ -13,6 +13,10 @@ from deepchem.utils.save import save_to_disk
from deepchem.utils.save import load_from_disk
from deepchem.utils.featurize import FeaturizedSamples

# TODO(rbharath): The semantics of this class are very difficult to debug.
# Multiple transformations of the data are performed on disk, and computations
# of mean/std are spread across multiple functions for efficiency. Some
# refactoring needs to happen here.
class Dataset(object):
  """
  Wrapper class for dataset transformed into X, y, w numpy ndarrays.
@@ -108,7 +112,6 @@ class Dataset(object):
    """
    nb_shards = self.get_number_shards()
    for i, row in self.metadata_df.iterrows():
      print("Loading shard %d out of %d" % (i+1, nb_shards))
      X = load_from_disk(row['X-transformed'])
      y = load_from_disk(row['y-transformed'])
      w = load_from_disk(row['w'])
@@ -133,7 +136,6 @@ class Dataset(object):

    # Store input_transforms/output_transforms so the dataset remembers its state.

    print("Transforming data.")
    X_means, X_stds, y_means, y_stds = self._transform(normalize_X, normalize_y,
                                                       truncate_x, truncate_y,
                                                       log_X, log_y,
@@ -193,10 +195,9 @@ def _transform_row(i, df, normalize_X, normalize_y, truncate_X, truncate_y,
  X = load_from_disk(row['X'])
  if normalize_X or log_X:
    if normalize_X:
      print("Normalizing X sample %d out of %d" % (i+1,total))
      # Turns NaNs to zeros
      X = np.nan_to_num((X - X_means) / X_stds)
      if truncate_X:
         print("Truncating X sample %d out of %d" % (i+1,total))
         X[X > trunc] = trunc
         X[X < (-1.0*trunc)] = -1.0 * trunc
    if log_X:
@@ -249,7 +250,6 @@ def compute_sums_and_nb_sample(tensor, W=None):

def write_dataset_single(val, data_dir, feature_types):
  (df_file, df) = val
  print("Examining %s" % df_file)
  # TODO(rbharath): This is a hack. clean up.
  if not len(df):
    return None
@@ -301,29 +301,33 @@ def _df_to_numpy(df, feature_types):
  y[missing] = 0.
  w[missing] = 0.

  return sorted_ids, x, y, w
  return sorted_ids, x.astype(float), y.astype(float), w.astype(float)


def compute_mean_and_std(df):
  """
  Compute means/stds of X/y from sums/sum_squares of tensors.

  Parameters
  ----------
  df: pd.DataFrame
    Metadata frame with one row per shard. Columns "X_sums" and
    "X_sum_squares" hold per-shard feature sums and sums of squares,
    "X_n" holds per-shard sample counts (floats); "y_sums",
    "y_sum_squares", and "y_n" hold the analogous per-task statistics,
    where each "y_n" entry is an array of shape (n_tasks,).

  Returns
  -------
  Tuple (X_means, X_stds, y_means, y_stds) of numpy arrays.
  """
  X_sums, X_sum_squares, X_n = (list(df['X_sums']),
                                list(df['X_sum_squares']),
                                list(df['X_n']))
  # Note that X_n is a list of floats; n is the total sample count.
  n = float(np.sum(X_n))
  X_sums = np.vstack(X_sums)
  X_sum_squares = np.vstack(X_sum_squares)
  overall_X_sums = np.sum(X_sums, axis=0)
  overall_X_means = overall_X_sums / n
  overall_X_sum_squares = np.sum(X_sum_squares, axis=0)

  # Var[X] = E[X^2] - E[X]^2, computed from the aggregated sums.
  X_vars = (overall_X_sum_squares - np.square(overall_X_sums)/n)/(n)

  y_sums, y_sum_squares, y_n = (list(df['y_sums']),
                                list(df['y_sum_squares']),
                                list(df['y_n']))
  # Note y_n is a list of arrays of shape (n_tasks,); summing across
  # shards gives the per-task sample counts, so the divisions below are
  # elementwise per task (tasks may have differing numbers of labels).
  y_n = np.sum(y_n, axis=0)
  y_sums = np.vstack(y_sums)
  y_sum_squares = np.vstack(y_sum_squares)
  y_means = np.sum(y_sums, axis=0)/y_n
  y_vars = np.sum(y_sum_squares, axis=0)/y_n - np.square(y_means)
  return overall_X_means, np.sqrt(X_vars), y_means, np.sqrt(y_vars)
+12 −10
Original line number Diff line number Diff line
@@ -84,31 +84,33 @@ class Evaluator(object):
    pred_y_df.to_csv(csv_out)

    if self.task_type == "classification":
      colnames = ["task_name", "roc_auc_score", "matthews_corrcoef", "recall_score", "accuracy_score"]
      colnames = ["task_name", "roc_auc_score", "matthews_corrcoef",
                  "recall_score", "accuracy_score"]
    elif self.task_type == "regression":
      colnames = ["task_name", "r2_score", "rms_error"]
    else:
      raise ValueError("Unrecognized task type: %s" % self.task_type)

    performance_df = pd.DataFrame(columns=colnames)
    print("compute_model_performance()")
    y_means = pred_y_df.iterrows().next()[1]["y_means"]
    y_stds = pred_y_df.iterrows().next()[1]["y_stds"]

    for i, task_name in enumerate(self.task_names):
      y = pred_y_df[task_name]
      y_pred = pred_y_df["%s_pred" % task_name]
      w = pred_y_df["%s_weight" % task_name]
      
      y = pred_y_df[task_name].values
      y_pred = pred_y_df["%s_pred" % task_name].values
      w = pred_y_df["%s_weight" % task_name].values
      y = undo_transform(y, y_means, y_stds, self.output_transforms)
      y_pred = undo_transform(y_pred, y_means, y_stds, self.output_transforms)

      if self.task_type == "classification":
        y, y_pred = y[w.nonzero()], y_pred[w.nonzero()][:, 1]
        y, y_pred = y[w.nonzero()].astype(int), y_pred[w.nonzero()].astype(int)
        # Sometimes all samples have zero weight. In this case, continue.
        if not len(y):
          continue
        auc = compute_roc_auc_scores(y, y_pred, w)
        mcc = matthews_corrcoef(y, np.around(y_pred))
        recall = recall_score(y, np.around(y_pred))
        accuracy = accuracy_score(y, np.around(y_pred))
        mcc = matthews_corrcoef(y, y_pred)
        recall = recall_score(y, y_pred)
        accuracy = accuracy_score(y, y_pred)
        performance_df.loc[i] = [task_name, auc, mcc, recall, accuracy]

      elif self.task_type == "regression":
Loading