Commit a3501081 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #380 from proteneer/unlabelled

Unlabelled dataset support WIP
parents f53da6a5 15b5199d
Loading
Loading
Loading
Loading
+17 −7
Original line number Diff line number Diff line
@@ -20,7 +20,7 @@ from deepchem.utils.save import load_sdf_files
from deepchem.feat import UserDefinedFeaturizer
from deepchem.data import DiskDataset

def convert_df_to_numpy(df, tasks, id_field, verbose=False):
def convert_df_to_numpy(df, tasks, verbose=False):
  """Transforms a dataframe containing deepchem input into numpy arrays"""
  n_samples = df.shape[0]
  n_tasks = len(tasks)
@@ -39,7 +39,7 @@ def convert_df_to_numpy(df, tasks, id_field, verbose=False):
      if y[ind, task] == "":
        missing[ind, task] = 1

  ids = df[id_field].values
  # ids = df[id_field].values
  # Set missing data to have weight zero
  for ind in range(n_samples):
    for task in range(n_tasks):
@@ -47,7 +47,7 @@ def convert_df_to_numpy(df, tasks, id_field, verbose=False):
        y[ind, task] = 0.
        w[ind, task] = 0.

  return ids, y.astype(float), w.astype(float)
  return y.astype(float), w.astype(float)

def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
  """Featurize individual compounds in dataframe.
@@ -152,10 +152,20 @@ class DataLoader(object):
      for shard_num, shard in enumerate(self.get_shards(input_files, shard_size)):
        time1 = time.time()
        X, valid_inds = self.featurize_shard(shard)
        ids, y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)  
        ids = shard[self.id_field].values
        ids = ids[valid_inds]
        if len(self.tasks) > 0:
          # Featurize task results iff they exist.
          y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)  
          # Filter out examples where featurization failed.
        ids, y, w = (ids[valid_inds], y[valid_inds], w[valid_inds])
          y, w = (y[valid_inds], w[valid_inds])
          assert len(X) == len(ids) == len(y) == len(w)
        else:
          # For prospective data where results are unknown, it makes
          # no sense to have y values or weights.
          y, w = (None, None)
          assert len(X) == len(ids)

        time2 = time.time()
        log("TIMING: featurizing shard %d took %0.3f s" % (shard_num, time2-time1),
            self.verbose)
+79 −28
Original line number Diff line number Diff line
@@ -410,27 +410,40 @@ class DiskDataset(Dataset):
    metadata_entries should have elements returned by write_data_to_disk
    above.
    """
    columns=('basename','task_names', 'ids', 'X', 'y', 'w')
    metadata_df = pd.DataFrame(
        metadata_entries,
        columns=('basename','task_names', 'ids', 'X', 'y', 'w'))
        columns=columns)
    return metadata_df

  @staticmethod
  def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None,
                         ids=None):
    """Persist one shard's arrays under ``data_dir`` as joblib files.

    Each of X, y, w, ids is optional: a present array is saved to
    ``<basename>-<name>.joblib`` and its filename recorded; a missing array
    is recorded as None so unlabelled datasets can omit y/w entirely.

    Returns
    -------
    list
      ``[basename, tasks, out_ids, out_X, out_y, out_w]`` — matching the
      _construct_metadata column order ('basename', 'task_names', 'ids',
      'X', 'y', 'w').
    """
    # NOTE(review): previously the out_* filenames were computed
    # unconditionally before these branches; those assignments were dead
    # code (always shadowed below) and have been removed.
    if X is not None:
      out_X = "%s-X.joblib" % basename
      save_to_disk(X, os.path.join(data_dir, out_X))
    else:
      out_X = None

    if y is not None:
      out_y = "%s-y.joblib" % basename
      save_to_disk(y, os.path.join(data_dir, out_y))
    else:
      out_y = None

    if w is not None:
      out_w = "%s-w.joblib" % basename
      save_to_disk(w, os.path.join(data_dir, out_w))
    else:
      out_w = None

    if ids is not None:
      out_ids = "%s-ids.joblib" % basename
      save_to_disk(ids, os.path.join(data_dir, out_ids))
    else:
      out_ids = None

    # note that this corresponds to the _construct_metadata column order
    return [basename, tasks, out_ids, out_X, out_y, out_w]

  def save_to_disk(self):
@@ -526,15 +539,22 @@ class DiskDataset(Dataset):
      for _, row in dataset.metadata_df.iterrows():
        X = np.array(load_from_disk(
            os.path.join(dataset.data_dir, row['X'])))
        ids = np.array(load_from_disk(
            os.path.join(dataset.data_dir, row['ids'])), dtype=object)
        # These columns may be missing if the dataset is unlabelled.
        if row['y'] is not None:
          y = np.array(load_from_disk(
            os.path.join(dataset.data_dir, row['y'])))
        else:
          y = None
        if row['w'] is not None:
          w_filename = os.path.join(dataset.data_dir, row['w'])
          if os.path.exists(w_filename):
              w = np.array(load_from_disk(w_filename))
          else:
              w = np.ones(y.shape)
        ids = np.array(load_from_disk(
            os.path.join(dataset.data_dir, row['ids'])), dtype=object)
        else:
          w = None
        yield (X, y, w, ids)
    return iterate(self)

@@ -571,8 +591,17 @@ class DiskDataset(Dataset):
          indices = range(interval_points[j], interval_points[j+1])
          perm_indices = sample_perm[indices]
          X_batch = X[perm_indices]

          if y is not None:
            y_batch = y[perm_indices]
          else:
            y_batch = None

          if w is not None:
            w_batch = w[perm_indices]
          else:
            w_batch = None

          ids_batch = ids[perm_indices]
          if pad_batches:
            (X_batch, y_batch, w_batch, ids_batch) = pad_batch(
@@ -592,7 +621,12 @@ class DiskDataset(Dataset):
        for (X_shard, y_shard, w_shard, ids_shard) in dataset.itershards():
            n_samples = X_shard.shape[0]
            for i in range(n_samples):
                yield (X_shard[i], y_shard[i], w_shard[i], ids_shard[i])
                def sanitize(elem):
                  if elem is None:
                    return None
                  else:
                    return elem[i]
                yield map(sanitize, [X_shard, y_shard, w_shard, ids_shard])
    return iterate(self)

  def transform(self, fn, **args):
@@ -750,13 +784,23 @@ class DiskDataset(Dataset):
    row = self.metadata_df.iloc[i]
    X = np.array(load_from_disk(
        os.path.join(self.data_dir, row['X'])))

    if row['y'] is not None:
      y = np.array(load_from_disk(
        os.path.join(self.data_dir, row['y'])))
    else:
      y = None

    if row['w'] is not None:
      # TODO (ytz): Under what condition does this exist but the file itself doesn't?
      w_filename = os.path.join(self.data_dir, row['w'])
      if os.path.exists(w_filename):
          w = np.array(load_from_disk(w_filename))
      else:
          w = np.ones(y.shape)
    else:
      w = None

    ids = np.array(load_from_disk(
        os.path.join(self.data_dir, row['ids'])), dtype=object)
    return (X, y, w, ids)
@@ -871,7 +915,7 @@ class DiskDataset(Dataset):
    """
    total = 0
    for _, row in self.metadata_df.iterrows():
      y = load_from_disk(os.path.join(self.data_dir, row['y']))
      y = load_from_disk(os.path.join(self.data_dir, row['ids']))
      total += len(y)
    return total

@@ -879,17 +923,24 @@ class DiskDataset(Dataset):
    """Finds shape of dataset."""
    n_tasks = len(self.get_task_names())
    X_shape = np.array((0,) + (0,) * len(self.get_data_shape())) 
    ids_shape = np.array((0,))
    if n_tasks > 0:
      y_shape = np.array((0,) + (0,))
      w_shape = np.array((0,) + (0,))
    ids_shape = np.array((0,))
    else:
      y_shape = tuple()
      w_shape = tuple()

    for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
      if shard_num == 0:
        X_shape += np.array(X.shape)
        if n_tasks > 0:
          y_shape += np.array(y.shape)
          w_shape += np.array(w.shape)
        ids_shape += np.array(ids.shape)
      else:
        X_shape[0] += np.array(X.shape)[0]
        if n_tasks > 0:
          y_shape[0] += np.array(y.shape)[0]
          w_shape[0] += np.array(w.shape)[0]
        ids_shape[0] += np.array(ids.shape)[0]
+9 −0
Original line number Diff line number Diff line
@@ -91,3 +91,12 @@ def load_gaussian_cdf_data():
  loader = dc.data.UserCSVLoader(
      tasks=tasks, featurizer=featurizer, id_field="id")
  return loader.featurize(input_file)

def load_unlabelled_data():
  """Featurize the no-labels test CSV (smiles only, no task columns).

  Returns the dataset produced by CSVLoader.featurize with circular
  fingerprints of size 1024 and an empty task list.
  """
  base_dir = os.path.dirname(os.path.abspath(__file__))
  csv_path = os.path.join(base_dir, "../../data/tests/no_labels.csv")
  circular_fp = dc.feat.CircularFingerprint(size=1024)
  loader = dc.data.CSVLoader(
      tasks=[], smiles_field="smiles", featurizer=circular_fp)
  return loader.featurize(csv_path)
 No newline at end of file
+26 −0
Original line number Diff line number Diff line
smiles,id
O=C1CCc2c(N1)[c-]c([c-][c-]2)OCCCC[N+]1([O-])CCN(CC1)c1[c-][c-][c-]c(c1Cl)Cl,48866084_50429806
O=C1CCc2c(N1)[c-]c([c-][c-]2)OCCCCN1CC[N+](CC1)([O-])c1[c-][c-][c-]c(c1Cl)Cl,48866086_50429808
CO[C@H]1O[C@H]2O[C@]3(C)CC[C@H]4[C@@]2([C@@H]([C@H]1C)CC[C@@H]4C)OO3,48866088_48866087
O=C1O[C@@H]2O[C@]3(C)CC[C@H]4[C@@]2([C@H]([C@@H]1C)CC[C@@H]4C)OO3,48866090_48866089
O=C1O[C@@H]2O[C@]3(C)CC[C@H]4[C@@]2([C@H](C1=C)CC[C@@H]4C)OO3,48866092_48866091
OCC1O[C@@H](O[C@@H]2C[C@@H](C(=O)O)[C@@H]3[C@](C2)(C)[C@@H]2CC[C@@H]4C[C@@]2(CC3)[C@@H](O)C4=C)C(C([C@@H]1OS(=O)(=O)[O-])OS(=O)(=O)[O-])OC(=O)CC(C)C.[Na+].[Na+],48866104_48866103
OC1C[C@@H](O[C@@H]1COP(=O)(O)O)n1cnc(nc1=O)N,48866106_48866105
C/C=C(/C(=O)OC1C[C@H](OC(=O)C)C2([C@@H]3[C@@]41CO[C@@]([C@H]4[C@@](C)([C@H]([C@H]3OC2)O)[C@@]12OC2(C)C2CC1O[C@@H]1C2(O)C=CO1)(O)C(=O)OC)C(=O)OC)\C,48866108_48866107
CN1CCC(=C2c3[c-][c-][c-][c-]c3CCc3c2n[c-][c-][c-]3)CC1.OC(=O)/C=C\C(=O)O,48866111_33542275
Clc1[c-][c-]c([c-][c-]1)Cc1nn(C2CCC[N+](CC2)([O-])C)c(=O)c2c1[c-][c-][c-][c-]2,48866115_48866114
CC[C@@H]1OC(=O)[C@H](C)[C@H](OC2OC(C)C(C(C2)(C)OC)O)[C@@H](C)[C@H](OC2OC(C)CC(C2O)[N+](C)(C)[O-])[C@](C[C@@H](CN([C@@H]([C@H](C1(C)O)O)C)C)C)(C)O,48866130_48866129
CO/C=C(\c1[c-][c-][c-][c-]c1Oc1n[c-]nc([c-]1)Oc1[c-][c-][c-][c-]c1C#N)/C(=O)OC,48866134_207297540
COC(=O)C1=C(C)NC(=C([C@@H]1c1cccc(c1)[N+](=O)[O-])C(=O)O[C@H]1CCN(C1)Cc1ccccc1)C.Cl,48866140_48866139
O=S1(=O)N[C@H](Cc2[c-][c-][c-][c-][c-]2)Nc2c1[c-]c(c([c-]2)C(F)(F)F)S(=O)(=O)N,48866148_48866147
O=S1(=O)N[C@@H](Cc2[c-][c-][c-][c-][c-]2)Nc2c1[c-]c(c([c-]2)C(F)(F)F)S(=O)(=O)N,48866150_48866149
[c-]1[c-][c-]c([c-][c-]1)/C=N/N=C/c1[c-][c-][c-][c-][c-]1,48866152_48866151
O=C(c1[c-][c-][c-][c-][c-]1)NOCC(=O)O,48866154_48866153
CC(CC(c1[c-][c-]c([c-][c-]1)OCCOCC[N+](Cc1[c-][c-][c-][c-][c-]1)(C)C)(C)C)(C)C.[Cl-],48866156_515814
O=C1CN(C1)C(c1[c-][c-][c-][c-][c-]1)c1[c-][c-][c-][c-][c-]1,48866158_48866157
OC(=O)c1[c-][c-]c2c([c-]1)n[c-]n2,48866160_48866159
Cc1c(OCC(F)(F)F)[c-][c-]n2c1c(Sc1nc3c(n1)[c-][c-][c-][c-]3)n1c2nc2c1[c-][c-][c-][c-]2,48866162_48866161
CCc1oc2c(c1C(=O)c1[c-]c(I)c(c([c-]1)I)O)[c-][c-][c-][c-]2,48866164_48866163
[c-]1[c-]c2[c-]c3c4[c-][c-][c-][c-]c4[c-][c-]c3c3c2c([c-]1)[C-]=[C-]3.[c-]1[c-][c-]c2c([c-]1)[c-]c1c3c2[C-]=[C-]c3[c-]c2c1[c-][c-][c-][c-]2,48866166_48866165
O=C1CC(=O)Nc2c(N1)[c-][c-][c-][c-]2,48866168_48866167
ClCC(=O)N1[C@@H](Cc2c([C@H]1c1[c-][c-]c3c([c-]1)OCO3)nc1c2[c-][c-][c-][c-]1)C(=O)OC,48866170_207350992
+9 −0
Original line number Diff line number Diff line
@@ -23,6 +23,15 @@ class TestDataLoader(unittest.TestCase):
    super(TestDataLoader, self).setUp()
    self.current_dir = os.path.dirname(os.path.abspath(__file__))

  def unlabelled_test(self):
    """Check that a CSV with no task columns can be featurized."""
    csv_path = os.path.join(
        self.current_dir, "../../data/tests/no_labels.csv")
    circular_fp = dc.feat.CircularFingerprint(size=1024)
    csv_loader = dc.data.CSVLoader(
        tasks=[], smiles_field="smiles",
        featurizer=circular_fp)
    csv_loader.featurize(csv_path)

  def scaffold_test_train_valid_test_split(self):
    """Test of singletask RF ECFP regression API."""
    splittype = "scaffold"
Loading