Singletask now running, but need to handle missing values cogently. (11d9efd6) · Commits · 钟慕尧 / deepchem

deep_chem/models/deep.py

+16 −16

Original line number	Diff line number	Diff line
		@@ -18,7 +18,7 @@ def fit_multitask_mlp(train_data, task_types, **training_params):
		Parameters
		----------
		task_types: dict
		dict mapping target names to output type. Each output type must be either
		dict mapping task names to output type. Each output type must be either
		"classification" or "regression".
		training_params: dict
		Aggregates keyword parameters to pass to train_multitask_model
		@@ -37,10 +37,10 @@ def fit_singletask_mlp(train_data, task_types, **training_params):
		Perform stochastic gradient descent optimization for a keras MLP.

		task_types: dict
		dict mapping target names to output type. Each output type must be either
		dict mapping task names to output type. Each output type must be either
		"classification" or "regression".
		output_transforms: dict
		dict mapping target names to label transform. Each output type must be either
		dict mapping task names to label transform. Each output type must be either
		None or "log". Only for regression outputs.
		training_params: dict
		Aggregates keyword parameters to pass to train_multitask_model
		@@ -48,14 +48,14 @@ def fit_singletask_mlp(train_data, task_types, **training_params):
		models = {}
		train_ids = train_data["mol_ids"]
		X_train = train_data["features"]
		sorted_targets = train_data["sorted_targets"]
		for index, target in enumerate(sorted_targets):
		sorted_tasks = train_data["sorted_tasks"]
		for index, task in enumerate(sorted_tasks):
		print "Training model %d" % index
		print "Target %s" % target
		(y_train, W_train) = train_data[target]
		print "Target %s" % task
		(y_train, W_train) = train_data[task]
		print "%d compounds in Train" % len(train_ids)
		models[target] = train_multitask_model(X_train, y_train, W_train,
		{target: task_types[target]}, **training_params)
		models[task] = train_multitask_model(X_train, y_train, W_train,
		{task: task_types[task]}, **training_params)
		return models

		def train_multitask_model(X, y, W, task_types,
		@@ -75,7 +75,7 @@ def train_multitask_model(X, y, W, task_types,
		W: np.ndarray
		Weight matrix
		task_types: dict
		dict mapping target names to output type. Each output type must be either
		dict mapping task names to output type. Each output type must be either
		"classification" or "regression".
		learning_rate: float
		Learning rate used.
		@@ -90,9 +90,9 @@ def train_multitask_model(X, y, W, task_types,
		"""
		eps = .001
		num_tasks = len(task_types)
		sorted_targets = sorted(task_types.keys())
		sorted_tasks = sorted(task_types.keys())
		local_task_types = task_types.copy()
		endpoints = sorted_targets
		endpoints = sorted_tasks
		(_, n_inputs) = np.shape(X[0].flatten())
		# Add eps weight to avoid minibatches with zero weight (causes theano to crash).
		W = W + eps * np.ones(np.shape(W))
		@@ -103,8 +103,8 @@ def train_multitask_model(X, y, W, task_types,
		name="dense", input="input")
		model.add_node(Dropout(dropout), name="dropout", input="dense")
		top_layer = "dropout"
		for task, target in enumerate(endpoints):
		task_type = local_task_types[target]
		for task, task in enumerate(endpoints):
		task_type = local_task_types[task]
		if task_type == "classification":
		model.add_node(
		Dense(n_hidden, 2, init='uniform', activation="softmax"),
		@@ -116,8 +116,8 @@ def train_multitask_model(X, y, W, task_types,
		model.add_output(name="task%d" % task, input="dense_head%d" % task)
		data_dict, loss_dict, sample_weights = {}, {}, {}
		data_dict["input"] = X
		for task, target in enumerate(endpoints):
		task_type = local_task_types[target]
		for task, task in enumerate(endpoints):
		task_type = local_task_types[task]
		taskname = "task%d" % task
		sample_weights[taskname] = W[:, task]
		if task_type == "classification":

deep_chem/models/deep3d.py

+4 −4

Original line number	Diff line number	Diff line
		@@ -13,12 +13,12 @@ def fit_3D_convolution(train_data, task_types, **training_params):
		"""
		models = {}
		X_train = train_data["features"]
		if len(train_data["sorted_targets"]) > 1:
		if len(train_data["sorted_tasks"]) > 1:
		raise ValueError("3D Convolutions only supported for singletask.")
		target_name = train_data["sorted_targets"][0]
		(y_train, _) = train_data["sorted_targets"].itervalues().next()
		task_name = train_data["sorted_tasks"][0]
		(y_train, _) = train_data["sorted_tasks"].itervalues().next()
		nb_classes = 2
		models[target_name] = train_3D_convolution(X_train, y_train, **training_params)
		models[task_name] = train_3D_convolution(X_train, y_train, **training_params)
		return models

		def train_3D_convolution(X, y, batch_size=50, nb_epoch=1,learning_rate=0.01,

deep_chem/models/standard.py

+9 −6

Original line number	Diff line number	Diff line
		@@ -24,15 +24,18 @@ def fit_singletask_models(train_data, modeltype):
		seed: int (optional)
		Seed to initialize np.random.
		output_transforms: dict
		dict mapping target names to label transform. Each output type must be either
		dict mapping task names to label transform. Each output type must be either
		None or "log". Only for regression outputs.
		"""
		models = {}
		print "fit_singletask_models()"
		print "train_data.keys()"
		print train_data.keys()
		X_train = train_data["features"]
		sorted_targets = train_data["sorted_targets"]
		for target in sorted_targets:
		print "Building model for target %s" % target
		(y_train, _) = train_data[target]
		sorted_tasks = train_data["sorted_tasks"]
		for task in sorted_tasks:
		print "Building model for task %s" % task
		(y_train, _) = train_data[task]
		if modeltype == "rf_regressor":
		model = RandomForestRegressor(
		n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
		@@ -54,7 +57,7 @@ def fit_singletask_models(train_data, modeltype):
		else:
		raise ValueError("Invalid model type provided.")
		model.fit(X_train, y_train.ravel())
		models[target] = model
		models[task] = model
		return models

		## TODO(rbharath): I believe this is broken. Update it to work with the rest of

deep_chem/scripts/modeler.py

+27 −13

Original line number	Diff line number	Diff line
		@@ -165,7 +165,7 @@ def add_fit_command(subparsers):
		"fit", help="Fit a model to training data.")
		group = fit_cmd.add_argument_group("load-and-transform")
		group.add_argument(
		"--task-type", default="classification",
		"--task-type", required=1,
		choices=["classification", "regression"],
		help="Type of learning task.")
		group.add_argument(
		@@ -206,7 +206,7 @@ def add_eval_command(subparsers):
		# TODO(rbharath): This argument seems a bit extraneous. Is it really
		# necessary?
		group.add_argument(
		"--task-type", default="classification",
		"--task-type", required=1,
		choices=["classification", "regression"],
		help="Type of learning task.")
		group = eval_cmd.add_argument_group("Classification metrics")
		@@ -249,6 +249,12 @@ def add_model_command(subparsers):
		model_cmd.add_argument(
		"--skip-featurization", action="store_true",
		help="If set, skip the featurization step.")
		model_cmd.add_argument(
		"--skip-train-test-split", action="store_true",
		help="If set, skip the train-test-split step.")
		model_cmd.add_argument(
		"--skip-fit", action="store_true",
		help="If set, skip model fit step.")
		add_featurize_group(model_cmd)

		train_test_group = model_cmd.add_argument_group("train_test_group")
		@@ -300,6 +306,7 @@ def create_model(args):
		weight_positives = False # Hard coding this for now
		train_out = os.path.join(data_dir, "%s-train.joblib" % args.name)
		test_out = os.path.join(data_dir, "%s-test.joblib" % args.name)
		if not args.skip_train_test_split:
		_train_test_input(
		paths, args.output_transforms, args.input_transforms, args.feature_types,
		args.splittype, weight_positives, args.mode, train_out, test_out,
		@@ -310,6 +317,7 @@ def create_model(args):
		modeltype = get_model_type(args.model)
		extension = get_model_extension(modeltype)
		saved_out = os.path.join(data_dir, "%s.%s" % (args.model, extension))
		if not args.skip_fit:
		_fit_model(
		paths, args.model, args.task_type, args.n_hidden, args.learning_rate,
		args.dropout, args.n_epochs, args.decay, args.batch_size, args.loss_function,
		@@ -326,6 +334,9 @@ def create_model(args):
		compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef = (
		False, False, False, False)
		compute_r2s, compute_rms = False, False
		print "create_model()"
		print "args.task_type"
		print args.task_type
		if args.task_type == "classification":
		compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef = (
		True, True, True, True)
		@@ -335,6 +346,8 @@ def create_model(args):
		modeltype, saved_out, train_out, paths, args.task_type, compute_aucs,
		compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s,
		compute_rms, csv_out_train, stats_out_train, args.target_fields)
		print "(compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s, compute_rms)"
		print (compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s, compute_rms)
		print "Eval Model on Test"
		print "------------------"
		_eval_trained_model(
		@@ -517,8 +530,9 @@ def _eval_trained_model(modeltype, saved_model, saved_data, paths, task_type,
		with open(stats_out, "wb") as stats_file:
		results, _, _, _ = compute_model_performance(
		raw_test_dict, test_dict, task_types, model, modeltype,
		output_transforms, compute_aucs, compute_r2s, compute_rms, compute_recall,
		compute_accuracy, compute_matthews_corrcoef, print_file=stats_file)
		output_transforms, aucs=compute_aucs, r2s=compute_r2s, rms=compute_rms,
		recall=compute_recall, accuracy=compute_accuracy,
		mcc=compute_matthews_corrcoef, print_file=stats_file)
		with open(stats_out, "r") as stats_file:
		print stats_file.read()
		results_to_csv(results, csv_out, task_type=task_type)

deep_chem/utils/analysis.py

+36 −36

Original line number	Diff line number	Diff line
		@@ -33,42 +33,42 @@ def summarize_distribution(y):
		print "Histogram: "
		print hist

		def analyze_data(dataset, splittype="random"):
		"""Analyzes regression dataset.

		Parameters
		----------
		dataset: dict
		A dictionary of type produced by load_datasets.
		splittype: string
		Type of split for train/test. Either random or scaffold.
		"""
		singletask = multitask_to_singletask(dataset)
		for target in singletask:
		data = singletask[target]
		if len(data.keys()) == 0:
		continue
		if splittype == "random":
		train, test = train_test_random_split(data, seed=0)
		elif splittype == "scaffold":
		train, test = train_test_scaffold_split(data)
		else:
		raise ValueError("Improper splittype. Must be random/scaffold.")
		_, Xtrain, ytrain, _ = dataset_to_numpy(train)
		# TODO(rbharath): Take this out once debugging is completed
		ytrain = np.log(ytrain)
		mean = np.mean(ytrain)
		std = np.std(ytrain)
		minval = np.amin(ytrain)
		maxval = np.amax(ytrain)
		hist = np.histogram(ytrain)
		print target
		print "Mean: %f" % mean
		print "Std: %f" % std
		print "Min: %f" % minval
		print "Max: %f" % maxval
		print "Histogram: "
		print hist
		#def analyze_data(dataset, splittype="random"):
		# """Analyzes regression dataset.
		#
		# Parameters
		# ----------
		# dataset: dict
		# A dictionary of type produced by load_datasets.
		# splittype: string
		# Type of split for train/test. Either random or scaffold.
		# """
		# singletask = multitask_to_singletask(dataset)
		# for target in singletask:
		# data = singletask[target]
		# if len(data.keys()) == 0:
		# continue
		# if splittype == "random":
		# train, test = train_test_random_split(data, seed=0)
		# elif splittype == "scaffold":
		# train, test = train_test_scaffold_split(data)
		# else:
		# raise ValueError("Improper splittype. Must be random/scaffold.")
		# _, Xtrain, ytrain, _ = dataset_to_numpy(train)
		# # TODO(rbharath): Take this out once debugging is completed
		# ytrain = np.log(ytrain)
		# mean = np.mean(ytrain)
		# std = np.std(ytrain)
		# minval = np.amin(ytrain)
		# maxval = np.amax(ytrain)
		# hist = np.histogram(ytrain)
		# print target
		# print "Mean: %f" % mean
		# print "Std: %f" % std
		# print "Min: %f" % minval
		# print "Max: %f" % maxval
		# print "Histogram: "
		# print hist


		def compare_all_datasets():

Admin message