Unlabelled dataset support WIP (cad42325) · Commits · 钟慕尧 / deepchem

deepchem/data/data_loader.py

+17 −7

Original line number	Diff line number	Diff line
		@@ -20,7 +20,7 @@ from deepchem.utils.save import load_sdf_files
		from deepchem.feat import UserDefinedFeaturizer
		from deepchem.data import DiskDataset

		def convert_df_to_numpy(df, tasks, id_field, verbose=False):
		def convert_df_to_numpy(df, tasks, verbose=False):
		"""Transforms a dataframe containing deepchem input into numpy arrays"""
		n_samples = df.shape[0]
		n_tasks = len(tasks)
		@@ -39,7 +39,7 @@ def convert_df_to_numpy(df, tasks, id_field, verbose=False):
		if y[ind, task] == "":
		missing[ind, task] = 1

		ids = df[id_field].values
		# ids = df[id_field].values
		# Set missing data to have weight zero
		for ind in range(n_samples):
		for task in range(n_tasks):
		@@ -47,7 +47,7 @@ def convert_df_to_numpy(df, tasks, id_field, verbose=False):
		y[ind, task] = 0.
		w[ind, task] = 0.

		return ids, y.astype(float), w.astype(float)
		return y.astype(float), w.astype(float)

		def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
		"""Featurize individual compounds in dataframe.
		@@ -152,10 +152,20 @@ class DataLoader(object):
		for shard_num, shard in enumerate(self.get_shards(input_files, shard_size)):
		time1 = time.time()
		X, valid_inds = self.featurize_shard(shard)
		ids, y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
		ids = shard[self.id_field].values
		ids = ids[valid_inds]
		if len(self.tasks) > 0:
		# Featurize task results iff they exist.
		y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
		# Filter out examples where featurization failed.
		ids, y, w = (ids[valid_inds], y[valid_inds], w[valid_inds])
		y, w = (y[valid_inds], w[valid_inds])
		assert len(X) == len(ids) == len(y) == len(w)
		else:
		# For prospective data where results are unknown, it makes
		# no sense to have y values or weights.
		y, w = (None, None)
		assert len(X) == len(ids)

		time2 = time.time()
		log("TIMING: featurizing shard %d took %0.3f s" % (shard_num, time2-time1),
		self.verbose)

deepchem/data/datasets.py

+75 −27

Original line number	Diff line number	Diff line
		@@ -410,27 +410,42 @@ class DiskDataset(Dataset):
		metadata_entries should have elements returned by write_data_to_disk
		above.
		"""
		if len(metadata_entries) == 0:
		raise Exception("No metadata entries.")
		columns=('basename','task_names', 'ids', 'X', 'y', 'w')
		metadata_df = pd.DataFrame(
		metadata_entries,
		columns=('basename','task_names', 'ids', 'X', 'y', 'w'))
		columns=columns)
		return metadata_df

		@staticmethod
		def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None,
		ids=None):
		out_X = "%s-X.joblib" % basename
		out_y = "%s-y.joblib" % basename
		out_w = "%s-w.joblib" % basename
		out_ids = "%s-ids.joblib" % basename

		if X is not None:
		out_X = "%s-X.joblib" % basename
		save_to_disk(X, os.path.join(data_dir, out_X))
		else:
		out_X = None

		if y is not None:
		out_y = "%s-y.joblib" % basename
		save_to_disk(y, os.path.join(data_dir, out_y))
		else:
		out_y = None

		if w is not None:
		out_w = "%s-w.joblib" % basename
		save_to_disk(w, os.path.join(data_dir, out_w))
		else:
		out_w = None

		if ids is not None:
		out_ids = "%s-ids.joblib" % basename
		save_to_disk(ids, os.path.join(data_dir, out_ids))
		else:
		out_ids = None

		# note that this corresponds to the _construct_metadata column order
		return [basename, tasks, out_ids, out_X, out_y, out_w]

		def save_to_disk(self):
		@@ -526,15 +541,22 @@ class DiskDataset(Dataset):
		for _, row in dataset.metadata_df.iterrows():
		X = np.array(load_from_disk(
		os.path.join(dataset.data_dir, row['X'])))
		ids = np.array(load_from_disk(
		os.path.join(dataset.data_dir, row['ids'])), dtype=object)
		# These columns may be missing if the dataset is unlabelled.
		if row['y'] is not None:
		y = np.array(load_from_disk(
		os.path.join(dataset.data_dir, row['y'])))
		else:
		y = None
		if row['w'] is not None:
		w_filename = os.path.join(dataset.data_dir, row['w'])
		if os.path.exists(w_filename):
		w = np.array(load_from_disk(w_filename))
		else:
		w = np.ones(y.shape)
		ids = np.array(load_from_disk(
		os.path.join(dataset.data_dir, row['ids'])), dtype=object)
		else:
		w = None
		yield (X, y, w, ids)
		return iterate(self)

		@@ -571,8 +593,17 @@ class DiskDataset(Dataset):
		indices = range(interval_points[j], interval_points[j+1])
		perm_indices = sample_perm[indices]
		X_batch = X[perm_indices]

		if y is not None:
		y_batch = y[perm_indices]
		else:
		y_batch = None

		if w is not None:
		w_batch = w[perm_indices]
		else:
		w_batch = None

		ids_batch = ids[perm_indices]
		if pad_batches:
		(X_batch, y_batch, w_batch, ids_batch) = pad_batch(
		@@ -750,13 +781,23 @@ class DiskDataset(Dataset):
		row = self.metadata_df.iloc[i]
		X = np.array(load_from_disk(
		os.path.join(self.data_dir, row['X'])))

		if row['y'] is not None:
		y = np.array(load_from_disk(
		os.path.join(self.data_dir, row['y'])))
		else:
		y = None

		if row['w'] is not None:
		# TODO (ytz): Under what condition does this exist but the file itself doesn't?
		w_filename = os.path.join(self.data_dir, row['w'])
		if os.path.exists(w_filename):
		w = np.array(load_from_disk(w_filename))
		else:
		w = np.ones(y.shape)
		else:
		w = None

		ids = np.array(load_from_disk(
		os.path.join(self.data_dir, row['ids'])), dtype=object)
		return (X, y, w, ids)
		@@ -871,7 +912,7 @@ class DiskDataset(Dataset):
		"""
		total = 0
		for _, row in self.metadata_df.iterrows():
		y = load_from_disk(os.path.join(self.data_dir, row['y']))
		y = load_from_disk(os.path.join(self.data_dir, row['ids']))
		total += len(y)
		return total

		@@ -879,17 +920,24 @@ class DiskDataset(Dataset):
		"""Finds shape of dataset."""
		n_tasks = len(self.get_task_names())
		X_shape = np.array((0,) + (0,) * len(self.get_data_shape()))
		ids_shape = np.array((0,))
		if n_tasks > 0:
		y_shape = np.array((0,) + (0,))
		w_shape = np.array((0,) + (0,))
		ids_shape = np.array((0,))
		else:
		y_shape = tuple()
		w_shape = tuple()

		for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
		if shard_num == 0:
		X_shape += np.array(X.shape)
		if n_tasks > 0:
		y_shape += np.array(y.shape)
		w_shape += np.array(w.shape)
		ids_shape += np.array(ids.shape)
		else:
		X_shape[0] += np.array(X.shape)[0]
		if n_tasks > 0:
		y_shape[0] += np.array(y.shape)[0]
		w_shape[0] += np.array(w.shape)[0]
		ids_shape[0] += np.array(ids.shape)[0]

Admin message