Commit c3d0520f authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Fixed bug in normalization. Still bug in train/test eval.

parent a4ab24f3
Loading
Loading
Loading
Loading
+17 −12
Original line number Diff line number Diff line
@@ -197,6 +197,9 @@ def add_model_command(subparsers):
  model_cmd.add_argument(
      "--skip-fit", action="store_true",
      help="If set, skip model fit step.")
  model_cmd.add_argument(
      "--skip-eval", action="store_true",
      help="If set, skip model eval step.")
  model_cmd.add_argument(
      "--base-dir", type=str, required=1,
      help="The base directory for the model.")
@@ -258,6 +261,7 @@ def create_model(args):
  print("+++++++++++++++++++++++++++++++++")
  print("Eval Model on Train")
  print("-------------------")
  if not args.skip_fit:
    csv_out_train = os.path.join(data_dir, "train.csv")
    stats_out_train = os.path.join(data_dir, "train-stats.txt")
    csv_out_test = os.path.join(data_dir, "test.csv")
@@ -268,6 +272,7 @@ def create_model(args):
        stats_out_train, args.output_transforms, split="train")
  print("Eval Model on Test")
  print("------------------")
  if not args.skip_fit:
    test_dir = os.path.join(data_dir, "test")
    eval_trained_model(
        model_name, model_dir, test_dir, csv_out_test,
+4 −4
Original line number Diff line number Diff line
@@ -397,9 +397,9 @@ def compute_sums_and_nb_sample(tensor, W=None):
    nb_sample = np.shape(tensor)[0]
  else:
    nb_task = np.shape(tensor)[1]
    sums = np.zeros((nb_task))
    sum_squares = np.zeros((nb_task))
    nb_sample = np.zeros((nb_task))
    sums = np.zeros(nb_task)
    sum_squares = np.zeros(nb_task)
    nb_sample = np.zeros(nb_task)
    for task in range(nb_task):
      y_task = tensor[:,task]
      W_task = W[:,task]
@@ -417,7 +417,7 @@ def compute_mean_and_std(df):
  X_sums, X_sum_squares, X_n = (df['X_sums'], 
                                df['X_sum_squares'],
                                df['X_n'])
  n = np.sum(X_n)
  n = float(np.sum(X_n))
  overall_X_sums = np.sum(X_sums, axis=0)
  overall_X_means = overall_X_sums / n
  overall_X_sum_squares = np.sum(X_sum_squares, axis=0)
+6 −1
Original line number Diff line number Diff line
@@ -56,11 +56,16 @@ def compute_model_performance(pred_y_df, task_names, task_type, stats_file, outp
  y_means = pred_y_df.iterrows().next()[1]["y_means"]
  y_stds = pred_y_df.iterrows().next()[1]["y_stds"]

  print("compute_model_performance()")
  for i, task_name in enumerate(task_names):
    y = pred_y_df[task_name]
    y_pred = pred_y_df["%s_pred" % task_name]
    w = pred_y_df["%s_weight" % task_name]
    
    print("y_means")
    print(y_means)
    print("y_stds")
    print(y_stds)
    y = undo_transform(y, y_means, y_stds, output_transforms)
    y_pred = undo_transform(y_pred, y_means, y_stds, output_transforms)

+1 −129
Original line number Diff line number Diff line
@@ -43,31 +43,9 @@ def train_test_split(paths, input_transforms, output_transforms,
  print("Transforming test data.")
  test_arrays.transform_data(input_transforms, output_transforms)

'''
  print("About to train/test split dataset")
  train_files, test_files = get_train_test_files(paths, splittype)
  train_metadata = write_dataset(train_files, data_dir, mode, feature_types)
  train_metadata["split"] = "train"
  test_metadata = write_dataset(test_files, data_dir, mode, feature_types)
  test_metadata["split"] = "test"

  metadata = pd.concat([train_metadata, test_metadata])
  metadata["input_transforms"] = ",".join(input_transforms)
  metadata["output_transforms"] = ",".join(output_transforms)

  metadata = transform_data(metadata, input_transforms, output_transforms)

  metadata_filename = get_metadata_filename(data_dir)
  print("Saving metadata file to %s" % metadata_filename)
  save_to_disk(metadata, metadata_filename)
  print("Saved metadata.")
'''


def undo_normalization(y, y_means, y_stds):
  """Undo the applied normalization transform.

  Inverts a z-score style normalization: assumes the forward transform was
  (y - y_means) / y_stds (TODO confirm against the forward transform, which
  is not visible here), so the inverse rescales by the std and re-adds the
  mean.

  Parameters
  ----------
  y: array-like
    Normalized values (scalar or np.ndarray; broadcasting applies).
  y_means: array-like
    Mean(s) used in the forward normalization.
  y_stds: array-like
    Standard deviation(s) used in the forward normalization.

  Returns
  -------
  De-normalized values, y * y_stds + y_means.
  """
  # Bug fix: the previous body multiplied by the mean and added the std
  # (and mutated y via an unreachable-after-fix intermediate assignment).
  # The correct inverse of (y - mean) / std is y * std + mean.
  return y * y_stds + y_means

def undo_transform(y, y_means, y_stds, output_transforms):
  """Undo transforms on y_pred, W_pred."""
@@ -118,109 +96,3 @@ def multitask_to_singletask(dataset):
        singletask_labels[target].append(labels[target])
  return singletask_features, singletask_labels
'''

#TODO(enf/rbharath): completely broken as well.
'''
def split_dataset(dataset, splittype, seed=none):
  """split provided data using specified method."""
  if splittype == "random":
    train, test = train_test_random_split(dataset, seed=seed)
  elif splittype == "scaffold":
    train, test = train_test_scaffold_split(dataset)
  elif splittype == "specified":
    train, test = train_test_specified_split(dataset)
  else:
    raise valueerror("improper splittype.")
  return train, test

def train_test_specified_split(dataset):
  """split provided data due to splits in origin data."""
  train, test = {}, {}
  for mol_id, datapoint in dataset.iteritems():
    if "split" not in datapoint:
      raise valueerror("missing required split information.")
    if datapoint["split"].lower() == "train":
      train[mol_id] = datapoint
    elif datapoint["split"].lower() == "test":
      test[mol_id] = datapoint
  return train, test

def train_test_random_split(dataset, frac_train=.8, seed=none):
  """splits provided data into train/test splits randomly.

  performs a random 80/20 split of the data into train/test. returns two
  dictionaries

  parameters
  ----------
  dataset: dict
    a dictionary of type produced by load_datasets.
  frac_train: float
    proportion of data in train set.
  seed: int (optional)
    seed to initialize np.random.
  """
  np.random.seed(seed)
  shuffled = np.random.permutation(dataset.keys())
  train_cutoff = np.floor(frac_train * len(shuffled))
  train_keys, test_keys = shuffled[:train_cutoff], shuffled[train_cutoff:]
  train, test = {}, {}
  for key in train_keys:
    train[key] = dataset[key]
  for key in test_keys:
    test[key] = dataset[key]
  return train, test

def train_test_scaffold_split(dataset, frac_train=.8):
  """splits provided data into train/test splits by scaffold.

  groups the largest scaffolds into the train set until the size of the
  train set equals frac_train * len(dataset). adds remaining scaffolds
  to test set. the idea is that the test set contains outlier scaffolds,
  and thus serves as a hard test of generalization capability for the
  model.

  parameters
  ----------
  dataset: dict
    a dictionary of type produced by load_datasets.
  frac_train: float
    the fraction (between 0 and 1) of the data to use for train set.
  """
  scaffolds = scaffold_separate(dataset)
  train_size = frac_train * len(dataset)
  train, test = {}, {}
  for elements in scaffolds:
    # if adding this scaffold makes the train_set too big, add to test set.
    if len(train) + len(elements) > train_size:
      for elt in elements:
        test[elt] = dataset[elt]
    else:
      for elt in elements:
        train[elt] = dataset[elt]
  return train, test

def scaffold_separate(dataset):
  """splits provided data by compound scaffolds.

  returns a list of pairs (scaffold, [identifiers]), where each pair
  contains a scaffold and a list of all identifiers for compounds that
  share that scaffold. the list will be sorted in decreasing order of
  number of compounds.

  parameters
  ----------
  dataset: dict
    a dictionary of type produced by load_datasets.
  """
  scaffolds = {}
  for mol_id in dataset:
    datapoint = dataset[mol_id]
    scaffold = datapoint["scaffold"]
    if scaffold not in scaffolds:
      scaffolds[scaffold] = [mol_id]
    else:
      scaffolds[scaffold].append(mol_id)
  # sort from largest to smallest scaffold sets
  return [elt for (scaffold, elt) in sorted(scaffolds.items(), key=lambda x: -len(x[1]))]
'''