Commit f664d079 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Fixed some bugs in sklearn model usage.

parent 2860de81
Loading
Loading
Loading
Loading
+18 −13
Original line number Diff line number Diff line
"""
Code for processing datasets using scikit-learn.
"""
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
@@ -10,37 +11,41 @@ from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLarsCV
from deepchem.models import Model
from deepchem.utils.dataset import load_from_disk
from deepchem.utils.dataset import save_to_disk

class SklearnModel(Model):
  """
  Abstract base class for different ML models.
  """
  def __init__(self, model_type, task_types, model_params,
               initialize_raw_model=True):
    """Initialize a scikit-learn model wrapper.

    Parameters
    ----------
    model_type: str
      One of "rf_regressor", "rf_classifier", "logistic", "linear",
      "ridge", "lasso", "lasso_lars", "elastic_net". Selects which
      sklearn estimator backs this model.
    task_types: dict
      Maps task names to task types (stored; interpretation is handled
      by the Model base class).
    model_params: dict
      Hyperparameters for the model (stored; presumably consumed by
      subclasses/fit — TODO confirm against Model base class).
    initialize_raw_model: bool, optional
      If True, construct the underlying sklearn estimator immediately.
      If False, self.raw_model is set to None and must be populated
      later (e.g. via load()).

    Raises
    ------
    ValueError
      If initialize_raw_model is True and model_type is not recognized.
    """
    super(SklearnModel, self).__init__(
        model_type, task_types, model_params, initialize_raw_model)
    self.task_types = task_types
    self.model_params = model_params
    # Default to None so self.raw_model is always bound, even when
    # initialize_raw_model is False (previously this raised NameError).
    raw_model = None
    if initialize_raw_model:
      if self.model_type == "rf_regressor":
        raw_model = RandomForestRegressor(
            n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
      elif self.model_type == "rf_classifier":
        raw_model = RandomForestClassifier(
            n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
      elif self.model_type == "logistic":
        raw_model = LogisticRegression(class_weight="auto")
      elif self.model_type == "linear":
        raw_model = LinearRegression(normalize=True)
      elif self.model_type == "ridge":
        raw_model = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0], normalize=True)
      elif self.model_type == "lasso":
        raw_model = LassoCV(max_iter=2000, n_jobs=-1)
      elif self.model_type == "lasso_lars":
        raw_model = LassoLarsCV(max_iter=2000, n_jobs=-1)
      elif self.model_type == "elastic_net":
        raw_model = ElasticNetCV(max_iter=2000, n_jobs=-1)
      else:
        raise ValueError("Invalid model type provided.")
    self.raw_model = raw_model

  # TODO(rbharath): This does not work with very large datasets! sklearn does
  # support partial_fit, but only for some models. Might make sense to make
@@ -66,11 +71,11 @@ class SklearnModel(Model):
  def save(self, out_dir):
    """Saves sklearn model to disk.

    Delegates metadata persistence to the Model base class, then writes
    the raw sklearn estimator via the project's save_to_disk helper
    (replaces the earlier direct joblib.dump call — joblib is not
    imported in this module).

    Parameters
    ----------
    out_dir: str
      Directory to write the serialized model into.
    """
    super(SklearnModel, self).save(out_dir)
    save_to_disk(self.raw_model, self.get_model_filename(out_dir))

  def load(self, model_dir):
    """Loads sklearn model from file on disk.

    Uses the project's load_from_disk helper (replaces the earlier
    direct joblib.load call — joblib is not imported in this module)
    and stores the result on self.raw_model.

    Parameters
    ----------
    model_dir: str
      Directory containing the serialized model.
    """
    self.raw_model = load_from_disk(Model.get_model_filename(model_dir))

# Register SklearnModel as the handler for these model-type strings so the
# Model factory can dispatch to it.
# NOTE(review): only "logistic" and "rf_classifier" are registered here,
# while __init__ supports several more types ("rf_regressor", "linear",
# "ridge", ...) — confirm whether the others should be registered too.
Model.register_model_type("logistic", SklearnModel)
Model.register_model_type("rf_classifier", SklearnModel)
+1 −1
Original line number Diff line number Diff line
@@ -282,7 +282,7 @@ def train_test_split(paths, input_transforms, output_transforms,

  print("Loading featurized data.")
  samples_dir = os.path.join(data_dir, "samples")
  samples = FeaturizedSamples(samples_dir, dataset_files)
  samples = FeaturizedSamples(samples_dir, dataset_files, reload=False)
  
  print("Split data into train/test")
  train_samples_dir = os.path.join(data_dir, "train-samples")
+3 −0
Original line number Diff line number Diff line
@@ -37,6 +37,9 @@ class Dataset(object):
      write_dataset_single_partial = partial(
          write_dataset_single, data_dir=self.data_dir,
          feature_types=feature_types)
      print("Dataset()")
      print("samples.compounds_df")
      print(samples.compounds_df)

      metadata_rows = []
      # TODO(rbharath): Still a bit of information leakage.
+13 −3
Original line number Diff line number Diff line
@@ -68,7 +68,13 @@ class DataFeaturizer(object):
    df = self._standardize_df(pd.DataFrame(rows))
    for feature_type in feature_types:
      self._featurize_df(df, feature_type)
    print("featurize()")
    print("len(df)")
    print(len(df))
    print("out")
    print(out)
    save_to_disk(df, out)
    df_loaded = load_from_disk(out)

  def _get_fields(self, input_file):
    """Get the names of fields and field_types for input data."""
@@ -226,7 +232,7 @@ class FeaturizedSamples(object):
                  set(FeaturizedSamples.feature_types))
    return sorted(list(task_names))

  def __init__(self, feature_dir, dataset_files=None, overwrite=True):
  def __init__(self, feature_dir, dataset_files=None, overwrite=True, reload=False):
    """
    Initialize FeaturizedSamples

@@ -241,14 +247,18 @@ class FeaturizedSamples(object):
    if not os.path.exists(feature_dir):
      os.makedirs(feature_dir)
    self.feature_dir = feature_dir
    
    if os.path.exists(self._get_compounds_filename()):
    print("FeaturizedSamples()")
    if os.path.exists(self._get_compounds_filename()) and reload:
      print("compounds loaded from disk")
      compounds_df = load_from_disk(self._get_compounds_filename())
    else:
      print("compounds recomputed")
      compounds_df = self._get_compounds()
      # compounds_df is not altered by any method after initialization, so it's
      # safe to keep a copy in memory and on disk.
      save_to_disk(compounds_df, self._get_compounds_filename())
    print("len(compounds_df)")
    print(len(compounds_df))
    self._check_validity(compounds_df)
    self.compounds_df = compounds_df