Commit 2860de81 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Cleaned up modeler, but eval still broken.

parent f440639b
Loading
Loading
Loading
Loading
+4 −99
Original line number Diff line number Diff line
@@ -22,12 +22,6 @@ import deepchem.models.deep3d
# the --skip-foo flags, it's possible to run all functionality directly through
# create_model. Perhaps trim the fat and delete the remaining commands.

def add_featurization_command(subparsers):
  """Registers the featurize subcommand and its flag group."""
  cmd = subparsers.add_parser(
      "featurize", help="Featurize raw input data.")
  add_featurize_group(cmd)

def add_featurize_group(featurize_cmd):
  """Adds flags for featurizization."""
  featurize_group = featurize_cmd.add_argument_group("Input Specifications")
@@ -60,7 +54,6 @@ def add_featurize_group(featurize_cmd):
  featurize_group.add_argument(
      "--parallel", type=float, default=None,
      help="Use multiprocessing will be used to parallelize featurization.")
  featurize_group.set_defaults(func=featurize_inputs_wrapper)

def add_transforms_group(cmd):
  """Adds flags for data transforms."""
@@ -94,21 +87,6 @@ def add_transforms_group(cmd):
      "--weight-positives", type=bool, default=False,
      help="Weight positive examples to have same total weight as negatives.")

def add_train_test_command(subparsers):
  """Adds flags for train-test-split subcommand.

  Registers "train-test-split", which applies the standard data transforms
  to featurized data, splits it into train/test sets, and stores the
  resulting (X, y) matrices under --data-dir.
  """
  train_test_cmd = subparsers.add_parser(
      "train-test-split",
      help="Apply standard data transforms to raw features generated by featurize,\n"
           "then split data into train/test and store data as (X,y) matrices.")
  add_transforms_group(train_test_cmd)
  train_test_cmd.add_argument(
      # required expects a bool; the old required=1 only worked by truthiness.
      "--paths", nargs="+", required=True,
      help="Paths to input datasets.")
  train_test_cmd.add_argument(
      "--data-dir", type=str, required=True,
      help="Location to save train and test data.")
  train_test_cmd.set_defaults(func=train_test_split_wrapper)

def add_model_group(fit_cmd):
  """Adds flags for specifying models."""
  group = fit_cmd.add_argument_group("model")
@@ -154,49 +132,6 @@ def add_model_group(fit_cmd):
      "--nesterov", action="store_true",
      help="If set, use Nesterov acceleration.")

def add_fit_command(subparsers):
  """Adds arguments for fit subcommand.

  Registers "fit", which loads transformed data from --data-dir, fits the
  model described by the model flag group, and saves it to --model-dir.
  """
  fit_cmd = subparsers.add_parser(
      "fit", help="Fit a model to training data.")
  group = fit_cmd.add_argument_group("load-and-transform")
  group.add_argument(
      # required expects a bool; the old required=1 only worked by truthiness.
      "--data-dir", required=True,
      help="Location of saved transformed data.")
  add_model_group(fit_cmd)
  group = fit_cmd.add_argument_group("save")
  group.add_argument(
      "--model-dir", type=str, required=True,
      help="Location to save trained model.")
  fit_cmd.set_defaults(func=fit_model_wrapper)

def add_eval_command(subparsers):
  """Adds arguments for eval subcommand.

  Registers "eval", which loads a saved model and transformed test data,
  then writes predictions (--csv-out) and statistics (--stats-out).
  """
  eval_cmd = subparsers.add_parser(
      "eval",
      help="Evaluate trained model on test data processed by transform.")
  group = eval_cmd.add_argument_group("load model/data")
  group.add_argument(
      # required expects a bool; the old required=1 only worked by truthiness.
      "--saved-model", type=str, required=True,
      help="Location from which to load saved model.")
  group.add_argument(
      "--saved-data", required=True, help="Location of saved transformed data.")
  eval_cmd.add_argument(
      "--csv-out", type=str, required=True,
      help="Outputted predictions on evaluated set.")
  eval_cmd.add_argument(
      # Was required=1j -- a complex-number literal typo that only worked
      # because any nonzero complex is truthy.
      "--stats-out", type=str, required=True,
      help="Computed statistics on evaluated set.")
  eval_cmd.set_defaults(func=eval_trained_model_wrapper)

def add_predict_command(subparsers):
  """Adds arguments for predict subcommand."""
  predict_cmd = subparsers.add_parser(
      "predict",
      help="Make predictions of model on new data.")
  # TODO: argument groups for prediction inputs/outputs are not wired up yet.

# TODO(rbharath): There are a lot of duplicate commands introduced here. Is
# there a nice way to factor them?
def add_model_command(subparsers):
  """Adds flags for model subcommand."""
  model_cmd = subparsers.add_parser(
@@ -297,25 +232,9 @@ def parse_args(input_args=None):
  """Parse command-line arguments."""
  parser = argparse.ArgumentParser()
  subparsers = parser.add_subparsers(title='Modes')

  add_featurization_command(subparsers)
  add_train_test_command(subparsers)
  add_fit_command(subparsers)
  add_eval_command(subparsers)

  add_model_command(subparsers)

  return parser.parse_args(input_args)

def featurize_inputs_wrapper(args):
  """Wrapper function that calls featurize_inputs with args unwrapped.

  Creates args.feature_dir if needed, then forwards all featurization
  flags to featurize_inputs.
  """
  if not os.path.exists(args.feature_dir):
    os.makedirs(args.feature_dir)
  # featurize_inputs takes a trailing `parallel` parameter; the previous
  # version dropped args.parallel and raised TypeError on every call.
  featurize_inputs(
      args.feature_dir, args.input_files, args.user_specified_features,
      args.tasks, args.smiles_field, args.split_field, args.id_field,
      args.threshold, args.parallel)

def featurize_inputs(feature_dir, input_files,
                     user_specified_features, tasks, smiles_field,
                     split_field, id_field, threshold, parallel):
@@ -351,12 +270,6 @@ def featurize_input(input_file, feature_dir, user_specified_features, tasks,
      feature_dir, "%s.joblib" %(os.path.splitext(os.path.basename(input_file))[0]))
  featurizer.featurize(input_file, FeaturizedSamples.feature_types, out)

def train_test_split_wrapper(args):
  """Wrapper function that calls train_test_split after unwrapping args."""
  # Docstring previously referenced a nonexistent _train_test_split_wrapper.
  train_test_split(args.paths, args.input_transforms,
                   args.output_transforms, args.feature_types,
                   args.splittype, args.mode, args.data_dir)

def train_test_split(paths, input_transforms, output_transforms,
                     feature_types, splittype, mode, data_dir):
  """Saves transformed model."""
@@ -391,12 +304,6 @@ def train_test_split(paths, input_transforms, output_transforms,
  print("Transforming test data.")
  test_dataset.transform(input_transforms, output_transforms)

def fit_model_wrapper(args):
  """Unwraps parsed CLI args and delegates model fitting to fit_model."""
  params = extract_model_params(args)
  fit_model(args.model_name, params, args.model_dir, args.data_dir)

def fit_model(model_name, model_params, model_dir, data_dir):
  """Builds model from featurized data."""
  task_type = Model.get_task_type(model_name)
@@ -410,16 +317,14 @@ def fit_model(model_name, model_params, model_dir, data_dir):
  model.fit(train)
  model.save(model_dir)

def eval_trained_model_wrapper(args):
  """Wrapper function that calls _eval_trained_model with unwrapped args."""
  # NOTE(review): this is broken (see commit message "eval still broken").
  # add_eval_command declares --saved-model, --saved-data, --csv-out and
  # --stats-out, so args has saved_model/saved_data attributes -- but this
  # wrapper reads args.model, args.model_dir and args.data_dir, which are
  # never defined, raising AttributeError at runtime. Confirm the intended
  # mapping before fixing (presumably saved_model -> model_dir and
  # saved_data -> data_dir, with the model type coming from a new flag).
  eval_trained_model(
      args.model, args.model_dir, args.data_dir,
      args.csv_out, args.stats_out, split="test")

def eval_trained_model(model_type, model_dir, data_dir,
                       csv_out, stats_out, split="test"):
  """Evaluates a trained model on specified data."""
  model = Model.load(model_type, model_dir)
  print("eval_trained_model()")
  print("data_dir")
  print(data_dir)
  
  data = Dataset(data_dir)

  evaluator = Evaluator(model, data, verbose=True)
+13 −4
Original line number Diff line number Diff line
@@ -39,8 +39,11 @@ class Dataset(object):
          feature_types=feature_types)

      metadata_rows = []
      for df_file in samples.dataset_files:
        metadata_rows.append(write_dataset_single_partial(df_file))
      # TODO(rbharath): Still a bit of information leakage.
      for df_file, df in zip(samples.dataset_files, samples.itersamples()):
        retval = write_dataset_single_partial((df_file, df))
        if retval is not None:
          metadata_rows.append(retval)

      # TODO(rbharath): FeaturizedSamples should not be responsible for
      # X-transform, X_sums, etc. Move that stuff over to Dataset.
@@ -246,9 +249,12 @@ def compute_sums_and_nb_sample(tensor, W=None):
# The following are all associated with Dataset, but are separate functions to
# make it easy to use multiprocessing.

def write_dataset_single(df_file, data_dir, feature_types):
def write_dataset_single(val, data_dir, feature_types):
  (df_file, df) = val
  print("Examining %s" % df_file)
  df = load_from_disk(df_file)
  # TODO(rbharath): This is a hack. clean up.
  if not len(df):
    return None
  task_names = FeaturizedSamples.get_sorted_task_names(df)
  ids, X, y, w = df_to_numpy(df, feature_types)
  X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
@@ -286,6 +292,9 @@ def df_to_numpy(df, feature_types):
      feature_list.append(datapoint[feature_type])
    features = np.squeeze(np.concatenate(feature_list))
    tensors.append(features)
  print("df_to_numpy()")
  print("len(df)")
  print(len(df))
  x = np.stack(tensors)

  # Remove entries with missing labels
+16 −0
Original line number Diff line number Diff line
@@ -299,6 +299,22 @@ class FeaturizedSamples(object):
    save_to_disk(df, self._get_compounds_filename())
    self.compounsd_df = df

  def itersamples(self):
    """Provides an iterator over samples.

    Loads each dataset shard file from disk and yields, per shard, the
    dataframe of rows whose mol_id appears in self.compounds_df.
    """
    compound_ids = set(self.compounds_df["mol_id"])
    for df_file in self.dataset_files:
      df = load_from_disk(df_file)
      # Boolean isin mask replaces the old iterrows + df.iloc loop: iterrows
      # yields index *labels* while iloc indexes by *position*, so the old
      # code selected wrong rows whenever the shard's index was not a default
      # RangeIndex. The mask is also a single vectorized pass instead of a
      # per-row Python loop. (Assumes df is a pandas DataFrame with a
      # "mol_id" column -- consistent with the iterrows usage it replaces.)
      yield df[df["mol_id"].isin(compound_ids)]

  def train_test_split(self, splittype, train_dir, test_dir, seed=None,
                       frac_train=.8):
    """