Unverified Commit 01ea79da authored by Daiki Nishikawa's avatar Daiki Nishikawa Committed by GitHub

Merge pull request #2341 from nd-02110114/update-data-docs

Update data docs
parents 3e22ca1a 2577ae7c
+22 −11
Original line number Diff line number Diff line
@@ -509,15 +509,25 @@ class JsonLoader(DataLoader):

  Examples
  --------
  >> import pandas as pd
  >> df = pd.DataFrame(some_data)
  >> df.columns.tolist()
  .. ['sample_data', 'sample_name', 'weight', 'task']
  >> df.to_json('file.json', orient='records', lines=True)
  >> loader = JsonLoader(tasks=['task'], feature_field='sample_data',
      label_field='task', weight_field='weight', id_field='sample_name')
  >> dataset = loader.create_dataset('file.json')
  Let's create a sample dataframe.

  >>> composition = ["LiCoO2", "MnO2"]
  >>> labels = [1.5, 2.3]
  >>> import pandas as pd
  >>> df = pd.DataFrame(list(zip(composition, labels)), columns=["composition", "task"])

  Dump the dataframe to a JSON file in line-delimited "records" format and
  load the JSON file with JsonLoader.

  >>> import tempfile
  >>> import deepchem as dc
  >>> with dc.utils.UniversalNamedTemporaryFile(mode='w') as tmpfile:
  ...   df.to_json(tmpfile.name, orient='records', lines=True)
  ...   featurizer = dc.feat.ElementPropertyFingerprint()
  ...   loader = dc.data.JsonLoader(["task"], feature_field="composition", featurizer=featurizer)
  ...   dataset = loader.create_dataset(tmpfile.name)
  >>> len(dataset)
  2
  """

  def __init__(self,
@@ -885,9 +895,10 @@ class ImageLoader(DataLoader):

    Returns
    -------
    Dataset
      A `Dataset` object containing a featurized representation of data
      from `input_files`, `labels`, and `weights`.
    ImageDataset or NumpyDataset or DiskDataset
      - if `in_memory == False`, the return value is ImageDataset.
      - if `in_memory == True` and `data_dir is None`, the return value is NumpyDataset.
      - if `in_memory == True` and `data_dir is not None`, the return value is DiskDataset.
    """
    labels, weights = None, None
    if isinstance(inputs, tuple):
+52 −52
@@ -274,8 +274,8 @@ class Dataset(object):
    np.ndarray
      A numpy array of identifiers `X`.

    Notes
    -----
    Note
    ----
    If data is stored on disk, accessing this field may involve loading
    data from disk and could potentially be slow. Using
    `iterbatches()` or `itersamples()` may be more efficient for
@@ -292,8 +292,8 @@ class Dataset(object):
    np.ndarray
      A numpy array of identifiers `y`.

    Notes
    -----
    Note
    ----
    If data is stored on disk, accessing this field may involve loading
    data from disk and could potentially be slow. Using
    `iterbatches()` or `itersamples()` may be more efficient for
@@ -310,8 +310,8 @@ class Dataset(object):
    np.ndarray
      A numpy array of identifiers `ids`.

    Notes
    -----
    Note
    ----
    If data is stored on disk, accessing this field may involve loading
    data from disk and could potentially be slow. Using
    `iterbatches()` or `itersamples()` may be more efficient for
@@ -328,8 +328,8 @@ class Dataset(object):
    np.ndarray
      A numpy array of weights `w`.

    Notes
    -----
    Note
    ----
    If data is stored on disk, accessing this field may involve loading
    data from disk and could potentially be slow. Using
    `iterbatches()` or `itersamples()` may be more efficient for
@@ -451,9 +451,9 @@ class Dataset(object):
    Returns
    -------
    Tuple
      If `X_stats == True`, returns `(X_means, X_stds)`. If `y_stats == True`,
      returns `(y_means, y_stds)`. If both are true, returns
      `(X_means, X_stds, y_means, y_stds)`.
      - If `X_stats == True`, returns `(X_means, X_stds)`.
      - If `y_stats == True`, returns `(y_means, y_stds)`.
      - If both are true, returns `(X_means, X_stds, y_means, y_stds)`.
    """
    X_means = 0.0
    X_m2 = 0.0
@@ -513,8 +513,8 @@ class Dataset(object):
    tf.data.Dataset
      TensorFlow Dataset that iterates over the same data.

    Notes
    -----
    Note
    ----
    This class requires TensorFlow to be installed.
    """
    try:
@@ -563,8 +563,8 @@ class Dataset(object):
      `torch.utils.data.IterableDataset` that iterates over the data in
      this dataset.

    Notes
    -----
    Note
    ----
    This class requires PyTorch to be installed.
    """
    raise NotImplementedError()
@@ -936,8 +936,8 @@ class NumpyDataset(Dataset):
      `torch.utils.data.IterableDataset` that iterates over the data in
      this dataset.

    Notes
    -----
    Note
    ----
    This method requires PyTorch to be installed.
    """
    try:
@@ -1052,24 +1052,24 @@ class DiskDataset(Dataset):
  `DiskDataset` are stored in a `data_dir`. The contents of `data_dir` should
  be laid out as follows:

  data_dir/
    |
    ---> metadata.csv.gzip
    |
    ---> tasks.json
    |
    ---> shard-0-X.npy
    |
    ---> shard-0-y.npy
    |
    ---> shard-0-w.npy
    |
    ---> shard-0-ids.npy
    |
    ---> shard-1-X.npy
    .
    .
    .
  | data_dir/
  |   |
  |   ---> metadata.csv.gzip
  |   |
  |   ---> tasks.json
  |   |
  |   ---> shard-0-X.npy
  |   |
  |   ---> shard-0-y.npy
  |   |
  |   ---> shard-0-w.npy
  |   |
  |   ---> shard-0-ids.npy
  |   |
  |   ---> shard-1-X.npy
  |   .
  |   .
  |   .

  The metadata is constructed by static method
  `DiskDataset._construct_metadata` and saved to disk by
@@ -1124,11 +1124,11 @@ class DiskDataset(Dataset):
  legacy_metadata: bool
    Whether this `DiskDataset` uses legacy format.

  Notes
  -----
  Note
  ----
  `DiskDataset` originally had a simpler metadata format without shape
  information. Older `DiskDataset` objects had metadata files with columns
  `('ids', 'X', 'y', 'w') and not additional shape columns. `DiskDataset`
  `('ids', 'X', 'y', 'w')` and not additional shape columns. `DiskDataset`
  maintains backwards compatibility with this older metadata format, but we
  recommend for performance reasons not using legacy metadata for new
  projects.
@@ -1371,8 +1371,8 @@ class DiskDataset(Dataset):
      before moving. This is set to True by default to be backwards compatible
      with behavior in earlier versions of DeepChem.

    Notes
    -----
    Note
    ----
    This is a stateful operation! `self.data_dir` will be moved into
    `new_data_dir`. If `delete_if_exists` is set to `True` (by default this is
    set `True`), then `new_data_dir` is deleted if it's a pre-existing
@@ -1400,8 +1400,8 @@ class DiskDataset(Dataset):
    DiskDataset
      A copied DiskDataset object.

    Notes
    -----
    Note
    ----
    This is a stateful operation! Any data at `new_data_dir` will be deleted
    and `self.data_dir` will be deep copied into `new_data_dir`.
    """
@@ -1432,8 +1432,8 @@ class DiskDataset(Dataset):
    >>> d.get_number_shards()
    10

    Notes
    -----
    Note
    ----
    If this `DiskDataset` is in `legacy_metadata` format, reshard will
    convert this dataset to have non-legacy metadata.
    """
@@ -1822,8 +1822,8 @@ class DiskDataset(Dataset):
      `torch.utils.data.IterableDataset` that iterates over the data in
      this dataset.

    Notes
    -----
    Note
    ----
    This method requires PyTorch to be installed.
    """
    try:
@@ -1974,8 +1974,8 @@ class DiskDataset(Dataset):
    into a compressed representation, then shuffles this compressed dataset in
    memory and writes the results to disk.

    Notes
    -----
    Note
    ----
    This method only works for 1-dimensional feature vectors (does not work
    for tensorial featurizations). Note that this shuffle is performed in
    place.
@@ -2018,8 +2018,8 @@ class DiskDataset(Dataset):
  def complete_shuffle(self, data_dir: Optional[str] = None) -> Dataset:
    """Completely shuffle across all data, across all shards.

    Notes
    -----
    Note
    ----
    The algorithm used for this complete shuffle is O(N^2) where N is the
    number of shards. It simply constructs each shard of the output dataset
    one at a time. Since the complete shuffle can take a long time, it's
@@ -2864,8 +2864,8 @@ class ImageDataset(Dataset):
      `torch.utils.data.IterableDataset` that iterates over the data in
      this dataset.

    Notes
    -----
    Note
    ----
    This method requires PyTorch to be installed.
    """
    try:
+4 −4
@@ -99,8 +99,8 @@ class GraphData:
    torch_geometric.data.Data
      Graph data for PyTorch Geometric

    Notes
    -----
    Note
    ----
    This method requires PyTorch Geometric to be installed.
    """
    try:
@@ -134,8 +134,8 @@ class GraphData:
      Whether to add self loops for the nodes, i.e. edges from nodes
      to themselves. Default to False.

    Notes
    -----
    Note
    ----
    This method requires DGL to be installed.
    """
    try:
+169 −0
Data
====

DeepChem :code:`dc.data` provides APIs for handling your data.

If your data is stored in files such as CSV or SDF, you can use the **Data Loaders**.
The Data Loaders read your data, convert it to features (e.g., SMILES to ECFP), and save the features to a Dataset class.
If your data consists of Python objects such as NumPy arrays or Pandas DataFrames, you can use the **Datasets** directly.

.. contents:: Contents
    :local:


Datasets
--------

DeepChem :code:`dc.data.Dataset` objects are one of the core building blocks of DeepChem programs.
:code:`Dataset` objects hold representations of data for machine learning and are widely used throughout DeepChem.

The goal of the :code:`Dataset` class is to be maximally interoperable
with other common representations of machine learning datasets. 
For this reason we provide interconversion methods mapping from :code:`Dataset` objects
to pandas DataFrames, TensorFlow Datasets, and PyTorch datasets.

NumpyDataset
^^^^^^^^^^^^
The :code:`dc.data.NumpyDataset` class provides an in-memory implementation of the abstract :code:`Dataset`
which stores its data in :code:`numpy.ndarray` objects.

.. autoclass:: deepchem.data.NumpyDataset
  :members:
  :inherited-members:

DiskDataset
^^^^^^^^^^^
The :code:`dc.data.DiskDataset` class allows for the storage of larger
datasets on disk. Each :code:`DiskDataset` is associated with a
directory in which it writes its contents to disk. Note that a
:code:`DiskDataset` can be very large, so some of the utility methods
to access fields of a :code:`Dataset` can be prohibitively expensive.

.. autoclass:: deepchem.data.DiskDataset
  :members:
  :inherited-members:

ImageDataset
^^^^^^^^^^^^
The :code:`dc.data.ImageDataset` class is optimized to allow
for convenient processing of image-based datasets.

.. autoclass:: deepchem.data.ImageDataset
  :members:
  :inherited-members:


Data Loaders
------------

Processing large amounts of input data to construct a :code:`dc.data.Dataset` object can require some amount of hacking.
To simplify this process, you can use the :code:`dc.data.DataLoader` classes.
These classes provide utilities for loading and processing large amounts of data.

CSVLoader
^^^^^^^^^

.. autoclass:: deepchem.data.CSVLoader
  :members: __init__, create_dataset

UserCSVLoader
^^^^^^^^^^^^^

.. autoclass:: deepchem.data.UserCSVLoader
  :members: __init__, create_dataset

ImageLoader
^^^^^^^^^^^

.. autoclass:: deepchem.data.ImageLoader
  :members: __init__, create_dataset

JsonLoader
^^^^^^^^^^
JSON is a flexible file format that is human-readable, lightweight,
and more compact than other open standard formats like XML. JSON files
are similar to Python dictionaries of key-value pairs. All keys must
be strings, but values can be any of (string, number, object, array,
boolean, or null), so the format is more flexible than CSV. JSON is
used for describing structured data and for serializing objects. Pandas
dataframes can be conveniently read from and written to JSON with the
`pandas.read_json` function and the `DataFrame.to_json` method.
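
For instance, a line-delimited "records" JSON file (the format :code:`JsonLoader`
expects) can be written and read back with pandas alone; the data below is made up:

```python
import tempfile
import pandas as pd

df = pd.DataFrame({"composition": ["LiCoO2", "MnO2"], "task": [1.5, 2.3]})
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
    json_path = f.name
# One JSON object per line ("records" orientation, line-delimited)
df.to_json(json_path, orient='records', lines=True)
df2 = pd.read_json(json_path, orient='records', lines=True)
print(df2.shape)  # (2, 2)
```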

.. autoclass:: deepchem.data.JsonLoader
  :members: __init__, create_dataset

SDFLoader
^^^^^^^^^

.. autoclass:: deepchem.data.SDFLoader
  :members: __init__, create_dataset

FASTALoader
^^^^^^^^^^^

.. autoclass:: deepchem.data.FASTALoader
  :members: __init__, create_dataset

InMemoryLoader
^^^^^^^^^^^^^^
The :code:`dc.data.InMemoryLoader` class is designed to facilitate the processing of large datasets
where you already hold the raw data in memory (say, in a pandas DataFrame).

.. autoclass:: deepchem.data.InMemoryLoader
  :members: __init__, create_dataset


Data Classes
------------
DeepChem featurizers often transform input data into "data classes". These are
classes that hold all the information needed to train a model on that data
point. Models then transform these into the tensors used for training in their
:code:`default_generator` methods.

Graph Data
^^^^^^^^^^

These classes document the data classes for graph convolutions. 
We plan to simplify these classes (:code:`ConvMol`, :code:`MultiConvMol`, :code:`WeaveMol`)
into a joint data representation (:code:`GraphData`) for all graph convolutions in a future version of DeepChem,
so these APIs may not remain stable.

The graph convolution models that inherit from :code:`KerasModel` depend on :code:`ConvMol`, :code:`MultiConvMol`, or :code:`WeaveMol`.
On the other hand, the graph convolution models that inherit from :code:`TorchModel` depend on :code:`GraphData`.

.. autoclass:: deepchem.feat.mol_graphs.ConvMol
  :members:

.. autoclass:: deepchem.feat.mol_graphs.MultiConvMol
  :members:
  :undoc-members:

.. autoclass:: deepchem.feat.mol_graphs.WeaveMol
  :members:
  :undoc-members:

.. autoclass:: deepchem.feat.graph_data.GraphData
  :members:


Base Classes (for developers)
-----------------------------

Dataset
^^^^^^^
The :code:`dc.data.Dataset` class is the abstract parent class for all
datasets. This class should never be directly initialized, but
contains a number of useful method implementations.

.. autoclass:: deepchem.data.Dataset
  :members:

DataLoader
^^^^^^^^^^

The :code:`dc.data.DataLoader` class is the abstract parent class for all
dataloaders. This class should never be directly initialized, but
contains a number of useful method implementations.

.. autoclass:: deepchem.data.DataLoader
  :members:
+0 −26
Data Classes
============
DeepChem featurizers often transform members into "data classes". These are
classes that hold all the information needed to train a model on that data
point. Models then transform these into the tensors for training in their
:code:`default_generator` methods.

Graph Convolutions
------------------

These classes document the data classes for graph convolutions. We plan to simplify these classes into a joint data representation for all graph convolutions in a future version of DeepChem, so these APIs may not remain stable.

.. autoclass:: deepchem.feat.mol_graphs.ConvMol
  :members:

.. autoclass:: deepchem.feat.mol_graphs.MultiConvMol
  :members:

.. autoclass:: deepchem.feat.mol_graphs.WeaveMol
  :members:

.. autoclass:: deepchem.feat.graph_data.GraphData
  :members:

.. autoclass:: deepchem.feat.graph_data.BatchGraphData
  :members: