Some debugging progress (ef843889) · Commits · 钟慕尧 / deepchem

deepchem/datasets/init.py

+12 −2

Original line number	Diff line number	Diff line
		@@ -636,12 +636,17 @@ def convert_df_to_numpy(df, feature_type, tasks, mol_id_field):
		for task in range(n_tasks):
		if y[ind, task] == "":
		missing[ind, task] = 1
		x = np.squeeze(np.array(list(df[feature_type].values)))
		x_list = list(df[feature_type].values)
		valid_inds = np.array([1 if elt.size > 0 else 0 for elt in x_list], dtype=bool)
		x_list = [elt for (is_valid, elt) in zip(valid_inds, x_list) if is_valid]
		x = np.squeeze(np.array(x_list))
		############################################################## DEBUG
		time2 = time.time()
		print("CONVERT_DF_TO_NUMPY X COMP TOOK %0.3f s" % (time2-time1))
		############################################################## DEBUG
		sorted_ids = df[mol_id_field]
		############################################################## DEBUG
		sorted_ids = df[mol_id_field].values
		############################################################## DEBUG

		# Set missing data to have weight zero
		# TODO(rbharath): There's a better way to do this with numpy indexing
		@@ -658,6 +663,11 @@ def convert_df_to_numpy(df, feature_type, tasks, mol_id_field):
		print("CONVERT_DF_TO_NUMPY MISSING COMP TOOK %0.3f s" % (time2-time1))
		############################################################## DEBUG

		############################################################## DEBUG
		sorted_ids = sorted_ids[valid_inds]
		y = y[valid_inds]
		w = w[valid_inds]
		############################################################## DEBUG
		# Adding this assertion in to avoid ill-formed outputs.
		assert len(sorted_ids) == len(x) == len(y) == len(w)
		return sorted_ids, x.astype(float), y.astype(float), w.astype(float)

deepchem/datasets/tests/init.py

+5 −5

Original line number	Diff line number	Diff line
		@@ -45,17 +45,17 @@ class TestDatasetAPI(TestAPI):
		"""Loads classification data from example.csv"""
		if os.path.exists(self.data_dir):
		shutil.rmtree(self.data_dir)
		featurizer = [CircularFingerprint(size=1024)]
		featurizer = CircularFingerprint(size=1024)
		tasks = ["outcome"]
		task_type = "classification"
		input_file = os.path.join(
		self.current_dir, "../../models/tests/example_classification.csv")
		featurizer = DataFeaturizer(
		loader = DataFeaturizer(
		tasks=tasks,
		smiles_field=self.smiles_field,
		featurizer=featurizer,
		verbosity="low")
		return featurizer.featurize(input_file, self.data_dir)
		return loader.featurize(input_file, self.data_dir)

		def load_multitask_data(self):
		"""Load example multitask data."""
		@@ -67,9 +67,9 @@ class TestDatasetAPI(TestAPI):
		"task13", "task14", "task15", "task16"]
		input_file = os.path.join(
		self.current_dir, "../../models/tests/multitask_example.csv")
		featurizer = DataFeaturizer(
		loader = DataFeaturizer(
		tasks=tasks,
		smiles_field=self.smiles_field,
		featurizer=featurizer,
		verbosity="low")
		return featurizer.featurize(input_file, self.data_dir)
		return loader.featurize(input_file, self.data_dir)

deepchem/datasets/tests/test_drop.py

+5 −5

Original line number	Diff line number	Diff line
		@@ -37,11 +37,11 @@ class TestDrop(TestAPI):
		featurizer = CircularFingerprint(size=1024)
		emols_tasks = ['activity']

		featurizer = DataFeaturizer(tasks=emols_tasks,
		loader = DataFeaturizer(tasks=emols_tasks,
		smiles_field="smiles",
		featurizer=featurizer,
		verbosity=verbosity)
		dataset = featurizer.featurize(dataset_file, data_dir)
		dataset = loader.featurize(dataset_file, data_dir, debug=True, logging=False)

		X, y, w, ids = dataset.to_numpy()
		print("ids.shape, X.shape, y.shape, w.shape")

deepchem/featurizers/featurize.py

+57 −5

Original line number	Diff line number	Diff line
		@@ -16,6 +16,8 @@ import multiprocessing as mp
		from functools import partial
		from rdkit import Chem
		import itertools as it
		import traceback
		from multiprocessing.pool import Pool
		from deepchem.utils.save import log
		from deepchem.utils.save import save_to_disk
		from deepchem.utils.save import load_pickle_from_disk
		@@ -26,11 +28,48 @@ from deepchem.utils.save import load_data
		from deepchem.utils.save import get_input_type
		############################################################## DEBUG
		import time
		import sys
		############################################################## DEBUG

		#def _process_helper(row, loader, fields, input_type):
		# return loader._process_raw_sample(input_type, row, fields)


		# Shortcut to multiprocessing's logger
		# http://stackoverflow.com/questions/6728236/exception-thrown-in-multiprocessing-pool-not-detected
		def error(msg, *args):
		    """Log ``msg`` at ERROR level on multiprocessing's logger.

		    ``msg`` and ``args`` are forwarded unchanged to the logger's ``error``
		    method, and that call's result is returned.
		    """
		    ############################################################# DEBUG
		    # Standard output is flushed before logging -- presumably so buffered
		    # debug prints show up ahead of the log record (TODO confirm intent).
		    import sys
		    sys.stdout.flush()
		    ############################################################# DEBUG
		    mp_logger = mp.get_logger()
		    return mp_logger.error(msg, *args)

		class LogExceptions(object):
		    """Callable wrapper that logs a traceback before re-raising.

		    An instance stores the callable it is given. Calling the instance
		    forwards every positional and keyword argument to that callable and
		    returns its result. If the callable raises an ``Exception``, the
		    formatted traceback is passed to ``error`` and the same exception is
		    re-raised.
		    """

		    def __init__(self, callable):
		        # Parameter name kept as-is (even though it shadows the builtin) so
		        # existing keyword callers keep working.
		        self.__callable = callable

		    def __call__(self, *args, **kwargs):
		        # Accept and forward arbitrary positional AND keyword arguments.
		        # The previous ``(self, args, *kwargs)`` signature rejected keyword
		        # arguments and zero-argument calls with a TypeError.
		        try:
		            result = self.__callable(*args, **kwargs)
		        except Exception:
		            # Here we add some debugging help. If multiprocessing's
		            # debugging is on, it will arrange to log the traceback
		            error(traceback.format_exc())
		            # Re-raise the original exception so the Pool worker can
		            # clean up
		            raise

		        # It was fine, give a normal answer
		        return result

		class LoggingPool(Pool):
		    """``Pool`` subclass whose async submissions are wrapped in LogExceptions.

		    Both overrides wrap ``func`` in ``LogExceptions`` and then delegate to
		    the parent ``Pool`` method with the remaining arguments unchanged.
		    """

		    def apply_async(self, func, args=(), kwds={}, callback=None):
		        # ``kwds`` is only forwarded to the parent method here.
		        wrapped = LogExceptions(func)
		        return super(LoggingPool, self).apply_async(
		            wrapped, args, kwds, callback)

		    def map_async(self, func, iterable, chunksize=None, callback=None):
		        wrapped = LogExceptions(func)
		        return super(LoggingPool, self).map_async(
		            wrapped, iterable, chunksize, callback)

		def featurize_map_function(args):
		#try:
		############################################################## DEBUG
		@@ -117,7 +156,8 @@ class DataFeaturizer(object):
		self.log_every_n = log_every_n

		def featurize(self, input_files, data_dir, shard_size=8192,
		num_shards_per_batch=24, worker_pool=None):
		num_shards_per_batch=24, worker_pool=None,
		logging=True, debug=False):
		"""Featurize provided files and write to specified location."""
		############################################################## DEBUG
		time1 = time.time()
		@@ -138,8 +178,15 @@ class DataFeaturizer(object):
		return None
		input_type = get_input_type(input_files[0])

		if logging:
		mp.log_to_stderr()
		if worker_pool is None:
		############################################################## DEBUG
		if logging:
		worker_pool = LoggingPool(processes=1)
		else:
		worker_pool = mp.Pool(processes=1)
		############################################################## DEBUG
		log("Spawning workers now.", self.verbosity)
		metadata_rows = []
		data_iterator = it.izip(
		@@ -159,9 +206,14 @@ class DataFeaturizer(object):
		############################################################## DEBUG
		time1 = time.time()
		############################################################## DEBUG
		iterator = itertools.islice(data_iterator, num_shards_per_batch)
		if not debug:
		batch_metadata = worker_pool.map(
		featurize_map_function,
		itertools.islice(data_iterator, num_shards_per_batch))
		featurize_map_function, iterator)
		else:
		batch_metadata = []
		for elt in iterator:
		batch_metadata.append(featurize_map_function(elt))
		############################################################## DEBUG
		time2 = time.time()
		print("MAP CALL TOOK %0.3f s" % (time2-time1))

deepchem/featurizers/tests/test_data_featurizer.py

+5 −5

Original line number	Diff line number	Diff line
		@@ -29,10 +29,10 @@ class TestDataFeaturizer(TestAPI):

		tasks = ["log-solubility"]
		smiles_field = "smiles"
		featurizer = DataFeaturizer(tasks=tasks,
		loader = DataFeaturizer(tasks=tasks,
		smiles_field=self.smiles_field,
		featurizers=[CircularFingerprint(size=1024)],
		featurizer=CircularFingerprint(size=1024),
		verbosity="low")
		dataset = featurizer.featurize(input_file, self.data_dir)
		dataset = loader.featurize(input_file, self.data_dir)

		assert len(dataset) == 10

Admin message