Merge branch 'master' into mat-feat (86b384d8) · Commits · 钟慕尧 / deepchem

deepchem/data/data_loader.py

+245 −167

Original line number	Diff line number	Diff line
		@@ -12,7 +12,7 @@ import time
		import sys
		import logging
		import warnings
		from typing import List, Optional, Dict, Tuple, Any, Sequence, Union
		from typing import List, Optional, Dict, Tuple, Any, Sequence, Union, Iterator

		from deepchem.utils.typing import OneOrMany
		from deepchem.utils.save import load_csv_files, load_json_files
		@@ -25,7 +25,8 @@ import zipfile
		logger = logging.getLogger(__name__)


		def _convert_df_to_numpy(df, tasks):
		def _convert_df_to_numpy(df: pd.DataFrame,
		tasks: List[str]) -> Tuple[np.ndarray, np.ndarray]:
		"""Transforms a dataframe containing deepchem input into numpy arrays

		This is a private helper method intended to help parse labels and
		@@ -38,7 +39,7 @@ def _convert_df_to_numpy(df, tasks):
		----------
		df: pd.DataFrame
		Pandas dataframe with columns for all tasks
		tasks: list
		tasks: List[str]
		List of tasks
		"""
		n_samples = df.shape[0]
		@@ -55,53 +56,8 @@ def _convert_df_to_numpy(df, tasks):
		return y.astype(float), w.astype(float)


		def _featurize_smiles_df(df, featurizer, field, log_every_n=1000):
		    """Featurize individual compounds in dataframe.

		    Private helper that, given a featurizer that operates on individual
		    chemical compounds or macromolecules, computes features for every
		    SMILES string held in one column of `df`.

		    Parameters
		    ----------
		    df: pd.DataFrame
		        DataFrame that holds SMILES strings
		    featurizer: Featurizer
		        A featurizer object
		    field: str
		        The name of a column in `df` that holds SMILES strings
		    log_every_n: int, optional (default 1000)
		        Emit a logging statement every `log_every_n` rows.

		    Returns
		    -------
		    features: np.ndarray
		        Stacked features of the rows whose featurization produced a
		        non-empty result, with axis 1 squeezed out. An empty array when
		        no row produced features.
		    valid_inds: np.ndarray
		        Boolean array with one entry per row of `df`; True where the
		        featurizer returned a non-empty result.

		    Note
		    ----
		    This function requires RDKit to be installed
		    """
		    from rdkit import Chem
		    from rdkit.Chem import rdmolfiles
		    from rdkit.Chem import rdmolops

		    features = []
		    for ind, elem in enumerate(df[field].tolist()):
		        mol = Chem.MolFromSmiles(elem)
		        # TODO (ytz) this is a bandage solution to reorder the atoms
		        # so that they're always in the same canonical order.
		        # Presumably this should be correctly implemented in the
		        # future for graph mols.
		        if mol:
		            new_order = rdmolfiles.CanonicalRankAtoms(mol)
		            mol = rdmolops.RenumberAtoms(mol, new_order)
		        if ind % log_every_n == 0:
		            logger.info("Featurizing sample %d", ind)
		        # A falsy `mol` is still handed to the featurizer; rows for which
		        # it returns an empty array are filtered out below.
		        features.append(featurizer.featurize([mol]))

		    valid_inds = np.array([elt.size > 0 for elt in features], dtype=bool)
		    features = [elt for elt, is_valid in zip(features, valid_inds) if is_valid]
		    if not features:
		        # np.array([]) is 1-D, so squeezing axis 1 would raise; return the
		        # empty array together with the all-False mask instead.
		        return np.array(features), valid_inds
		    return np.squeeze(np.array(features), axis=1), valid_inds


		def _get_user_specified_features(df, featurizer):
		def _get_user_specified_features(
		df: pd.DataFrame, featurizer: UserDefinedFeaturizer) -> np.ndarray:
		"""Extract and merge user specified features.

		Private helper method that merges features included in dataset
		@@ -122,6 +78,11 @@ def _get_user_specified_features(df, featurizer):
		DataFrame that holds SMILES strings
		featurizer: Featurizer
		A featurizer object

		Returns
		-------
		np.ndarray
		Array of features extracted from input dataframe.
		"""
		time1 = time.time()
		df[featurizer.feature_fields] = df[featurizer.feature_fields].apply(
		@@ -133,37 +94,6 @@ def _get_user_specified_features(df, featurizer):
		return X_shard


		def _featurize_mol_df(df, featurizer, field, log_every_n=1000):
		    """Featurize individual compounds in dataframe.

		    Used when processing .sdf files, so the 3-D structure should be
		    preserved. We use the rdkit "mol" object created from .sdf
		    instead of smiles string. Some featurizers such as
		    CoulombMatrix also require a 3-D structure. Featurizing from
		    .sdf is currently the only way to perform CM featurization.

		    Parameters
		    ----------
		    df: pd.DataFrame
		        Should be created by dc.utils.save.load_sdf_files.
		    featurizer: dc.feat.MolecularFeaturizer
		        Featurizer for molecules.
		    field: str
		        The name of the column in `df` whose entries are passed to the
		        featurizer.
		    log_every_n: int, optional (default 1000)
		        Controls how often logging statements are emitted.

		    Returns
		    -------
		    features: np.ndarray
		        Stacked features of the entries whose featurization produced a
		        non-empty result. A length-1 axis 1 is removed; the leading
		        (sample) axis is always kept.
		    valid_inds: np.ndarray
		        Boolean array with one entry per row of `df`; True where the
		        featurizer returned a non-empty result.
		    """
		    features = []
		    for ind, mol in enumerate(df[field].tolist()):
		        if ind % log_every_n == 0:
		            logger.info("Featurizing sample %d", ind)
		        features.append(featurizer.featurize([mol]))

		    valid_inds = np.array([elt.size > 0 for elt in features], dtype=bool)
		    features = [elt for elt, is_valid in zip(features, valid_inds) if is_valid]

		    stacked = np.array(features)
		    # Drop only the length-1 axis 1 (presumably the batch axis produced
		    # by featurizing one molecule at a time). A bare np.squeeze would also
		    # collapse the sample axis whenever exactly one molecule is valid.
		    if stacked.ndim > 1 and stacked.shape[1] == 1:
		        stacked = np.squeeze(stacked, axis=1)
		    return stacked, valid_inds


		class DataLoader(object):
		"""Handles loading/featurizing of data from disk.

		@@ -194,7 +124,11 @@ class DataLoader(object):
		for you by performing this work under the hood.
		"""

		def __init__(self, tasks, id_field=None, featurizer=None, log_every_n=1000):
		def __init__(self,
		tasks: List[str],
		id_field: str = None,
		featurizer: Featurizer = None,
		log_every_n: int = 1000):
		"""Construct a DataLoader object.

		This constructor is provided as a template mainly. You
		@@ -325,7 +259,7 @@ class DataLoader(object):

		return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)

		def _get_shards(self, inputs, shard_size):
		def _get_shards(self, inputs: List, shard_size: int) -> Iterator:
		"""Stub for children classes.

		Should implement a generator that walks over the source data in
		@@ -348,7 +282,7 @@ class DataLoader(object):
		"""
		raise NotImplementedError

		def _featurize_shard(self, shard):
		def _featurize_shard(self, shard: Any):
		"""Featurizes a shard of input data.

		Recall a shard is a chunk of input data that can reasonably be
		@@ -368,24 +302,58 @@ class CSVLoader(DataLoader):
		pandas, but this class may prove useful if you're processing
		large CSV files that you don't want to manipulate directly in
		memory.

		Examples
		--------
		Let's suppose we have some smiles and labels

		>>> smiles = ["C", "CCC"]
		>>> labels = [1.5, 2.3]

		Let's put these in a dataframe.

		>>> import pandas as pd
		>>> df = pd.DataFrame(list(zip(smiles, labels)), columns=["smiles", "task1"])

		Let's now write this to disk somewhere. We can now use `CSVLoader` to
		process this CSV dataset.

		>>> import tempfile
		>>> import deepchem as dc
		>>> with tempfile.NamedTemporaryFile(mode='w') as tmpfile:
		... df.to_csv(tmpfile.name)
		... loader = dc.data.CSVLoader(["task1"], feature_field="smiles",
		... featurizer=dc.feat.CircularFingerprint())
		... dataset = loader.create_dataset(tmpfile.name)
		>>> len(dataset)
		2

		Of course in practice you should already have your data in a CSV file if
		you're using `CSVLoader`. If your data is already in memory, use
		`InMemoryLoader` instead.
		"""

		def __init__(self,
		tasks,
		smiles_field=None,
		id_field=None,
		featurizer=None,
		log_every_n=1000):
		tasks: List[str],
		feature_field: Optional[str] = None,
		label_field: Optional[str] = None,
		weight_field: Optional[str] = None,
		smiles_field: Optional[str] = None,
		id_field: str = None,
		featurizer: Optional[Featurizer] = None,
		log_every_n: int = 1000):
		"""Initializes CSVLoader.

		Parameters
		----------
		tasks: list[str]
		tasks : List[str]
		List of task names
		smiles_field: str, optional
		feature_field : str, optional (default None)
		Field with data to be featurized.
		id_field: str, optional, (default None)
		CSV column that holds sample identifier
		smiles_field: str, optional (DEPRECATED)
		Name of field that holds smiles string
		id_field: str, optional
		Name of field that holds sample identifier
		featurizer: dc.feat.Featurizer, optional
		Featurizer to use to process data
		log_every_n: int, optional
		@@ -393,20 +361,32 @@ class CSVLoader(DataLoader):
		"""
		if not isinstance(tasks, list):
		raise ValueError("tasks must be a list.")
		if smiles_field is not None:
		logger.warning(
		"smiles_field is deprecated and will be removed in a future version of DeepChem. Use feature_field instead."
		)
		if feature_field is not None and smiles_field != feature_field:
		raise ValueError(
		"smiles_field and feature_field if both set must have the same value."
		)
		elif feature_field is None:
		feature_field = smiles_field

		self.tasks = tasks
		self.smiles_field = smiles_field
		self.feature_field = feature_field
		self.id_field = id_field
		if id_field is None:
		self.id_field = smiles_field
		self.id_field = feature_field # Use features as unique ids if necessary
		else:
		self.id_field = id_field
		#self.mol_field = mol_field
		self.user_specified_features = None
		if isinstance(featurizer, UserDefinedFeaturizer):
		self.user_specified_features = featurizer.feature_fields
		self.featurizer = featurizer
		self.log_every_n = log_every_n

		def _get_shards(self, input_files, shard_size):
		def _get_shards(self, input_files: List[str],
		shard_size: int) -> Iterator[pd.DataFrame]:
		"""Defines a generator which returns data for each shard

		Parameters
		@@ -415,29 +395,122 @@ class CSVLoader(DataLoader):
		List of filenames to process
		shard_size: int
		The size of a shard of data to process at a time.

		Returns
		-------
		Iterator over shards
		"""
		return load_csv_files(input_files, shard_size)

		def _featurize_shard(self, shard):
		    """Compute features for one shard of the input dataframe.

		    Delegates to `_featurize_smiles_df`, reading the column named by
		    `self.smiles_field` and forwarding the loader's featurizer and
		    logging interval. Whatever that helper returns is passed back
		    unchanged.
		    """
		    shard_result = _featurize_smiles_df(
		        shard,
		        self.featurizer,
		        field=self.smiles_field,
		        log_every_n=self.log_every_n,
		    )
		    return shard_result
		def _featurize_shard(self,
		                     shard: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
		    """Featurizes a shard of an input dataframe.

		    Parameters
		    ----------
		    shard: pd.DataFrame
		        DataFrame that holds a shard of the input CSV file

		    Returns
		    -------
		    features: np.ndarray
		        Features computed from the `self.feature_field` column. Entries
		        for which the featurizer produced an empty result are dropped.
		    valid_inds: np.ndarray
		        Boolean mask with one entry per element returned by the
		        featurizer (presumably one per row of `shard`); True where the
		        result was non-empty.

		    Raises
		    ------
		    ValueError
		        If no featurizer was passed to the constructor.
		    """
		    logger.info("About to featurize shard.")
		    if self.featurizer is None:
		        raise ValueError(
		            "featurizer must be specified in constructor to featurize data.")
		    features = list(self.featurizer(shard[self.feature_field]))
		    # An element counts as valid when it converts to a non-empty array.
		    valid_inds = np.array(
		        [np.array(elt).size > 0 for elt in features], dtype=bool)
		    features = [elt for elt, is_valid in zip(features, valid_inds) if is_valid]
		    return np.array(features), valid_inds


		class UserCSVLoader(CSVLoader):
		"""
		Handles loading of CSV files with user-defined featurizers.
		Handles loading of CSV files with user-defined features.

		This is a convenience class that allows for descriptors already present in a
		CSV file to be extracted without any featurization necessary.

		Examples
		--------
		Let's suppose we have some descriptors and labels. (Imagine that these
		descriptors have been computed by an external program.)

		>>> desc1 = [1, 43]
		>>> desc2 = [-2, -22]
		>>> labels = [1.5, 2.3]
		>>> ids = ["cp1", "cp2"]

		Let's put these in a dataframe.

		>>> import pandas as pd
		>>> df = pd.DataFrame(list(zip(ids, desc1, desc2, labels)), columns=["id", "desc1", "desc2", "task1"])

		Let's now write this to disk somewhere. We can now use `UserCSVLoader` to
		process this CSV dataset.

		>>> import tempfile
		>>> import deepchem as dc
		>>> featurizer = dc.feat.UserDefinedFeaturizer(["desc1", "desc2"])
		>>> with tempfile.NamedTemporaryFile(mode='w') as tmpfile:
		... df.to_csv(tmpfile.name)
		... loader = dc.data.UserCSVLoader(["task1"], id_field="id",
		... featurizer=featurizer)
		... dataset = loader.create_dataset(tmpfile.name)
		>>> len(dataset)
		2
		>>> dataset.X[0, 0]
		1

		The difference between `UserCSVLoader` and `CSVLoader` is that our
		descriptors (our features) have already been computed for us, but are spread
		across multiple columns of the CSV file.

		Of course in practice you should already have your data in a CSV file if
		you're using `UserCSVLoader`. If your data is already in memory, use
		`InMemoryLoader` instead.
		"""

		def _get_shards(self, input_files, shard_size):
		"""Defines a generator which returns data for each shard"""
		def _get_shards(self, input_files: List[str],
		shard_size: int) -> Iterator[pd.DataFrame]:
		"""Defines a generator which returns data for each shard

		Parameters
		----------
		input_files: list[str]
		List of filenames to process
		shard_size: int
		The size of a shard of data to process at a time.

		Returns
		-------
		Iterator over shards
		"""
		return load_csv_files(input_files, shard_size)

		def _featurize_shard(self, shard):
		"""Featurizes a shard of an input dataframe."""
		def _featurize_shard(self,
		shard: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
		"""Featurizes a shard of an input dataframe.

		Parameters
		----------
		shard: pd.DataFrame
		DataFrame that holds a shard of the input CSV file

		Returns
		-------
		features: np.ndarray
		Features extracted from CSV file.
		valid_inds: np.ndarray
		Indices of rows in source CSV with valid data.
		"""
		assert isinstance(self.featurizer, UserDefinedFeaturizer)
		X = _get_user_specified_features(shard, self.featurizer)
		return (X, np.ones(len(X), dtype=bool))
		@@ -471,11 +544,11 @@ class JsonLoader(DataLoader):
		"""

		def __init__(self,
		tasks: OneOrMany[str],
		tasks: List[str],
		feature_field: str,
		label_field: str = None,
		weight_field: str = None,
		id_field: str = None,
		label_field: Optional[str] = None,
		weight_field: Optional[str] = None,
		id_field: Optional[str] = None,
		featurizer: Optional[Featurizer] = None,
		log_every_n: int = 1000):
		"""Initializes JsonLoader.
		@@ -556,7 +629,7 @@ class JsonLoader(DataLoader):
		if self.id_field:
		ids = shard[self.id_field].values
		else:
		ids = np.ones(len(X))
		ids = np.ones(len(valid_inds))
		ids = ids[valid_inds]

		if len(self.tasks) > 0:
		@@ -584,66 +657,65 @@ class JsonLoader(DataLoader):

		return DiskDataset.create_dataset(shard_generator(), data_dir)

		def _get_shards(self, input_files, shard_size):
		def _get_shards(self, input_files: List[str],
		shard_size: int) -> Iterator[pd.DataFrame]:
		"""Defines a generator which returns data for each shard"""
		return load_json_files(input_files, shard_size)

		def _featurize_shard(self, shard):
		"""Featurizes a shard of an input dataframe."""
		return self._featurize_df(
		shard, self.featurizer, log_every_n=self.log_every_n)
		def _featurize_shard(self, shard) -> Tuple[np.ndarray, np.ndarray]:
		"""Featurizes a shard of an input dataframe.

		def _featurize_df(self,
		shard,
		featurizer: Featurizer,
		log_every_n: int = 1000) -> Tuple[np.ndarray, np.ndarray]:
		"""Featurize individual samples in dataframe.

		Helper that given a featurizer that operates on individual
		samples, computes & adds features for that sample to the
		features dataframe.
		Helper that computes features for the given shard of data.

		Parameters
		----------
		shard: pd.DataFrame
		DataFrame that holds data to be featurized.
		featurizer: Featurizer
		An instance of `dc.feat.Featurizer`.
		log_every_n: int, optional (default 1000)
		Emit a logging statement every `log_every_n` rows.

		Returns
		-------
		features : np.ndarray
		Array of feature vectors.
		Array of feature vectors. Note that samples for which featurization has
		failed will be filtered out.
		valid_inds : np.ndarray
		Boolean values indicating successfull featurization.

		Boolean values indicating successful featurization for corresponding
		sample in the source.
		"""

		features = []
		valid_inds = []
		field = self.feature_field
		data = shard[field].tolist()

		for idx, datapoint in enumerate(data):
		feat = featurizer.featurize([datapoint])
		is_valid = True if feat.size > 0 else False
		valid_inds.append(is_valid)
		if is_valid:
		features.append(feat)

		return np.squeeze(np.array(features), axis=1), valid_inds
		logger.info("About to featurize shard.")
		if self.featurizer is None:
		raise ValueError(
		"featurizer must be specified in constructor to featurizer data/")
		features = [elt for elt in self.featurizer(shard[self.feature_field])]
		valid_inds = np.array(
		[1 if np.array(elt).size > 0 else 0 for elt in features], dtype=bool)
		features = [
		elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
		]
		return np.array(features), valid_inds


		class SDFLoader(DataLoader):
		"""
		Creates `Dataset` from SDF input files.
		"""Creates a `Dataset` object from SDF input files.

		This class provides conveniences to load data from SDF files.
		This class provides conveniences to load and featurize data from SDF files.

		Examples
		--------
		>>> import deepchem as dc
		>>> import os
		>>> current_dir = os.path.dirname(os.path.realpath(__file__))
		>>> featurizer = dc.feat.CircularFingerprint(size=16)
		>>> loader = dc.data.SDFLoader(["LogP(RRCK)"], featurizer=featurizer, sanitize=True)
		>>> dataset = loader.create_dataset(os.path.join(current_dir, "tests", "membrane_permeability.sdf")) # doctest:+ELLIPSIS
		>>> len(dataset)
		2
		"""

		def __init__(self, tasks, sanitize=False, featurizer=None, log_every_n=1000):
		def __init__(self,
		tasks: List[str],
		sanitize: bool = False,
		featurizer: Featurizer = None,
		log_every_n: int = 1000):
		"""Initialize SDF Loader

		Parameters
		@@ -670,17 +742,21 @@ class SDFLoader(DataLoader):

		def _get_shards(self, input_files, shard_size):
		"""Defines a generator which returns data for each shard"""
		return load_sdf_files(input_files, self.sanitize, tasks=self.tasks)
		return load_sdf_files(
		input_files=input_files,
		clean_mols=self.sanitize,
		tasks=self.tasks,
		shard_size=shard_size)

		def _featurize_shard(self, shard):
		"""Featurizes a shard of an input dataframe."""
		logger.info("Currently featurizing feature_type: %s" %
		self.featurizer.__class__.__name__)
		return _featurize_mol_df(
		shard,
		self.featurizer,
		field=self.mol_field,
		log_every_n=self.log_every_n)
		features = [elt for elt in self.featurizer(shard[self.mol_field])]
		valid_inds = np.array(
		[1 if np.array(elt).size > 0 else 0 for elt in features], dtype=bool)
		features = [
		elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
		]
		return np.array(features), valid_inds


		class FASTALoader(DataLoader):
		@@ -742,7 +818,7 @@ class ImageLoader(DataLoader):
		traverse subdirectories which contain images.
		"""

		def __init__(self, tasks: OneOrMany[str] = None):
		def __init__(self, tasks: Optional[List[str]] = None):
		"""Initialize image loader.

		At present, custom image featurizers aren't supported by this
		@@ -863,7 +939,7 @@ class ImageLoader(DataLoader):
		return ImageDataset(image_files, y=labels, w=weights, ids=image_files)

		@staticmethod
		def load_img(image_files) -> np.ndarray:
		def load_img(image_files: List[str]) -> np.ndarray:
		"""Loads a set of images from disk.

		Parameters
		@@ -1000,7 +1076,8 @@ class InMemoryLoader(DataLoader):

		return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)

		def _get_shards(self, inputs, shard_size):
		def _get_shards(self, inputs: List,
		shard_size: int) -> Iterator[pd.DataFrame]:
		"""Break up input into shards.

		Parameters
		@@ -1016,9 +1093,10 @@ class InMemoryLoader(DataLoader):

		Returns
		-------
		Iterator[pd.DataFrame]
		Iterator which iterates over shards of data.
		"""
		current_shard = []
		current_shard: List = []
		for i, datapoint in enumerate(inputs):
		if i != 0 and i % shard_size == 0:
		shard_data = current_shard

deepchem/data/datasets.py

+93 −20

File changed.

Preview size limit exceeded, changes collapsed.

deepchem/data/tests/membrane_permeability.sdf

0 → 100644

+218 −0

File added.

Preview size limit exceeded, changes collapsed.

deepchem/data/tests/singleton.sdf

0 → 100644

+107 −0

Original line number	Diff line number	Diff line
		10_filipski_40
		RDKit 3D

		48 50 0 0 1 0 0 0 0 0999 V2000
		9.1378 -7.4697 -1.1731 C 0 0 0 0 0 0 0 0 0 0 0 0
		9.0300 -8.7563 -1.7553 C 0 0 0 0 0 0 0 0 0 0 0 0
		10.1829 -9.4791 -2.1168 C 0 0 0 0 0 0 0 0 0 0 0 0
		11.4593 -8.9144 -1.9184 C 0 0 0 0 0 0 0 0 0 0 0 0
		11.5888 -7.6306 -1.3431 C 0 0 0 0 0 0 0 0 0 0 0 0
		10.4211 -6.9229 -0.9733 C 0 0 0 0 0 0 0 0 0 0 0 0
		8.0685 -6.6893 -0.7812 O 0 0 0 0 0 0 0 0 0 0 0 0
		6.7356 -7.1730 -0.9323 C 0 0 0 0 0 0 0 0 0 0 0 0
		5.8194 -5.9457 -0.8867 C 0 0 0 0 0 0 0 0 0 0 0 0
		6.3937 -8.1606 0.1955 C 0 0 0 0 0 0 0 0 0 0 0 0
		10.0417 -10.7213 -2.6806 O 0 0 0 0 0 0 0 0 0 0 0 0
		10.6226 -11.7880 -2.0428 C 0 0 0 0 0 0 0 0 0 0 0 0
		11.4794 -12.6365 -2.7738 C 0 0 0 0 0 0 0 0 0 0 0 0
		12.0777 -13.7503 -2.1503 C 0 0 0 0 0 0 0 0 0 0 0 0
		11.8056 -14.0231 -0.7953 C 0 0 0 0 0 0 0 0 0 0 0 0
		10.9593 -13.1740 -0.0542 C 0 0 0 0 0 0 0 0 0 0 0 0
		10.3610 -12.0614 -0.6807 C 0 0 0 0 0 0 0 0 0 0 0 0
		12.5981 -15.4211 0.0061 S 0 0 0 0 0 0 0 0 0 0 0 0
		14.1883 -14.7546 0.5873 C 0 0 0 0 0 0 0 0 0 0 0 0
		11.8095 -15.8020 1.1921 O 0 0 0 0 0 0 0 0 0 0 0 0
		12.8865 -16.4503 -1.0091 O 0 0 0 0 0 0 0 0 0 0 0 0
		12.9447 -7.0276 -1.1268 C 0 0 0 0 0 0 0 0 0 0 0 0
		14.1048 -7.6753 -1.5778 N 0 0 0 0 0 0 0 0 0 0 0 0
		15.3664 -7.2188 -1.4378 C 0 0 0 0 0 0 0 0 0 0 0 0
		15.4761 -5.9335 -0.7477 C 0 0 0 0 0 0 0 0 0 0 0 0
		14.3478 -5.3279 -0.3229 C 0 0 0 0 0 0 0 0 0 0 0 0
		13.0801 -5.8841 -0.5185 N 0 0 0 0 0 0 0 0 0 0 0 0
		16.3235 -7.8662 -1.8727 O 0 0 0 0 0 0 0 0 0 0 0 0
		17.0235 -5.2108 -0.4863 Cl 0 0 0 0 0 0 0 0 0 0 0 0
		8.0727 -9.2223 -1.9323 H 0 0 0 0 0 0 0 0 0 0 0 0
		12.3294 -9.4833 -2.2114 H 0 0 0 0 0 0 0 0 0 0 0 0
		10.5000 -5.9395 -0.5309 H 0 0 0 0 0 0 0 0 0 0 0 0
		6.5963 -7.6418 -1.9072 H 0 0 0 0 0 0 0 0 0 0 0 0
		4.7728 -6.2316 -0.9963 H 0 0 0 0 0 0 0 0 0 0 0 0
		5.9216 -5.4076 0.0563 H 0 0 0 0 0 0 0 0 0 0 0 0
		6.0566 -5.2512 -1.6930 H 0 0 0 0 0 0 0 0 0 0 0 0
		7.0376 -9.0392 0.1822 H 0 0 0 0 0 0 0 0 0 0 0 0
		6.4989 -7.6921 1.1742 H 0 0 0 0 0 0 0 0 0 0 0 0
		5.3655 -8.5122 0.1058 H 0 0 0 0 0 0 0 0 0 0 0 0
		11.6797 -12.4320 -3.8159 H 0 0 0 0 0 0 0 0 0 0 0 0
		12.7400 -14.3980 -2.7059 H 0 0 0 0 0 0 0 0 0 0 0 0
		10.7684 -13.3823 0.9883 H 0 0 0 0 0 0 0 0 0 0 0 0
		9.7026 -11.4187 -0.1132 H 0 0 0 0 0 0 0 0 0 0 0 0
		14.7527 -14.3892 -0.2677 H 0 0 0 0 0 0 0 0 0 0 0 0
		13.9992 -13.9328 1.2743 H 0 0 0 0 0 0 0 0 0 0 0 0
		14.7461 -15.5395 1.0917 H 0 0 0 0 0 0 0 0 0 0 0 0
		13.9997 -8.5573 -2.0516 H 0 0 0 0 0 0 0 0 0 0 0 0
		14.3815 -4.3776 0.1907 H 0 0 0 0 0 0 0 0 0 0 0 0
		1 2 2 0
		1 6 1 0
		1 7 1 0
		2 3 1 0
		2 30 1 0
		3 4 2 0
		3 11 1 0
		4 5 1 0
		4 31 1 0
		5 6 2 0
		5 22 1 0
		6 32 1 0
		7 8 1 0
		8 9 1 0
		8 10 1 0
		8 33 1 0
		9 34 1 0
		9 35 1 0
		9 36 1 0
		10 37 1 0
		10 38 1 0
		10 39 1 0
		11 12 1 0
		12 13 2 0
		12 17 1 0
		13 14 1 0
		13 40 1 0
		14 15 2 0
		14 41 1 0
		15 16 1 0
		15 18 1 0
		16 17 2 0
		16 42 1 0
		17 43 1 0
		18 19 1 0
		18 20 2 0
		18 21 2 0
		19 44 1 0
		19 45 1 0
		19 46 1 0
		22 23 1 0
		22 27 2 0
		23 24 1 0
		23 47 1 0
		24 25 1 0
		24 28 2 0
		25 26 2 0
		25 29 1 0
		26 27 1 0
		26 48 1 0
		M END
		> <LogP(RRCK)> (1)
		-5.08

		$$$$

deepchem/data/tests/test_csv_loader.py

+11 −16

Original line number	Diff line number	Diff line
		import os
		from unittest import TestCase
		from io import StringIO
		import tempfile
		import shutil

		import deepchem as dc


		class TestCSVLoader(TestCase):

		def test_load_singleton_csv(self):
		def test_load_singleton_csv():
		fin = tempfile.NamedTemporaryFile(mode='w', delete=False)
		fin.write("smiles,endpoint\nc1ccccc1,1")
		fin.close()
		print(fin.name)
		featurizer = dc.feat.CircularFingerprint(size=1024)
		tasks = ["endpoint"]
		loader = dc.data.CSVLoader(
		tasks=tasks, smiles_field="smiles", featurizer=featurizer)
		tasks=tasks, feature_field="smiles", featurizer=featurizer)

		X = loader.create_dataset(fin.name)
		self.assertEqual(1, len(X))
		assert len(X) == 1
		os.remove(fin.name)

Admin message