Commit af0e3ed7 authored by nd-02110114's avatar nd-02110114
Browse files

Merge branch 'master' into update-feat-docs

parents cac21b16 01ea79da
Loading
Loading
Loading
Loading
+22 −11
Original line number Diff line number Diff line
@@ -509,15 +509,25 @@ class JsonLoader(DataLoader):

  Examples
  --------
  >> import pandas as pd
  >> df = pd.DataFrame(some_data)
  >> df.columns.tolist()
  .. ['sample_data', 'sample_name', 'weight', 'task']
  >> df.to_json('file.json', orient='records', lines=True)
  >> loader = JsonLoader(tasks=['task'], feature_field='sample_data',
      label_field='task', weight_field='weight', id_field='sample_name')
  >> dataset = loader.create_dataset('file.json')
  Let's create the sample dataframe.

  >>> composition = ["LiCoO2", "MnO2"]
  >>> labels = [1.5, 2.3]
  >>> import pandas as pd
  >>> df = pd.DataFrame(list(zip(composition, labels)), columns=["composition", "task"])

  Dump the dataframe to the JSON file formatted as "records" in line delimited format and
  load the json file by JsonLoader.

  >>> import tempfile
  >>> import deepchem as dc
  >>> with dc.utils.UniversalNamedTemporaryFile(mode='w') as tmpfile:
  ...   df.to_json(tmpfile.name, orient='records', lines=True)
  ...   featurizer = dc.feat.ElementPropertyFingerprint()
  ...   loader = dc.data.JsonLoader(["task"], feature_field="composition", featurizer=featurizer)
  ...   dataset = loader.create_dataset(tmpfile.name)
  >>> len(dataset)
  2
  """

  def __init__(self,
@@ -885,9 +895,10 @@ class ImageLoader(DataLoader):

    Returns
    -------
    Dataset
      A `Dataset` object containing a featurized representation of data
      from `input_files`, `labels`, and `weights`.
    ImageDataset or NumpyDataset or DiskDataset
      - if `in_memory == False`, the return value is ImageDataset.
      - if `in_memory == True` and `data_dir is None`, the return value is NumpyDataset.
      - if `in_memory == True` and `data_dir is not None`, the return value is DiskDataset.
    """
    labels, weights = None, None
    if isinstance(inputs, tuple):
+52 −52
Original line number Diff line number Diff line
@@ -274,8 +274,8 @@ class Dataset(object):
    np.ndarray
      A numpy array of identifiers `X`.

    Notes
    -----
    Note
    ----
    If data is stored on disk, accessing this field may involve loading
    data from disk and could potentially be slow. Using
    `iterbatches()` or `itersamples()` may be more efficient for
@@ -292,8 +292,8 @@ class Dataset(object):
    np.ndarray
      A numpy array of identifiers `y`.

    Notes
    -----
    Note
    ----
    If data is stored on disk, accessing this field may involve loading
    data from disk and could potentially be slow. Using
    `iterbatches()` or `itersamples()` may be more efficient for
@@ -310,8 +310,8 @@ class Dataset(object):
    np.ndarray
      A numpy array of identifiers `ids`.

    Notes
    -----
    Note
    ----
    If data is stored on disk, accessing this field may involve loading
    data from disk and could potentially be slow. Using
    `iterbatches()` or `itersamples()` may be more efficient for
@@ -328,8 +328,8 @@ class Dataset(object):
    np.ndarray
      A numpy array of weights `w`.

    Notes
    -----
    Note
    ----
    If data is stored on disk, accessing this field may involve loading
    data from disk and could potentially be slow. Using
    `iterbatches()` or `itersamples()` may be more efficient for
@@ -451,9 +451,9 @@ class Dataset(object):
    Returns
    -------
    Tuple
      If `X_stats == True`, returns `(X_means, X_stds)`. If `y_stats == True`,
      returns `(y_means, y_stds)`. If both are true, returns
      `(X_means, X_stds, y_means, y_stds)`.
      - If `X_stats == True`, returns `(X_means, X_stds)`.
      - If `y_stats == True`, returns `(y_means, y_stds)`.
      - If both are true, returns `(X_means, X_stds, y_means, y_stds)`.
    """
    X_means = 0.0
    X_m2 = 0.0
@@ -513,8 +513,8 @@ class Dataset(object):
    tf.data.Dataset
      TensorFlow Dataset that iterates over the same data.

    Notes
    -----
    Note
    ----
    This class requires TensorFlow to be installed.
    """
    try:
@@ -563,8 +563,8 @@ class Dataset(object):
      `torch.utils.data.IterableDataset` that iterates over the data in
      this dataset.

    Notes
    -----
    Note
    ----
    This class requires PyTorch to be installed.
    """
    raise NotImplementedError()
@@ -936,8 +936,8 @@ class NumpyDataset(Dataset):
      `torch.utils.data.IterableDataset` that iterates over the data in
      this dataset.

    Notes
    -----
    Note
    ----
    This method requires PyTorch to be installed.
    """
    try:
@@ -1052,24 +1052,24 @@ class DiskDataset(Dataset):
  `DiskDataset` are stored in a `data_dir`. The contents of `data_dir` should
  be laid out as follows:

  data_dir/
    |
    ---> metadata.csv.gzip
    |
    ---> tasks.json
    |
    ---> shard-0-X.npy
    |
    ---> shard-0-y.npy
    |
    ---> shard-0-w.npy
    |
    ---> shard-0-ids.npy
    |
    ---> shard-1-X.npy
    .
    .
    .
  | data_dir/
  |   |
  |   ---> metadata.csv.gzip
  |   |
  |   ---> tasks.json
  |   |
  |   ---> shard-0-X.npy
  |   |
  |   ---> shard-0-y.npy
  |   |
  |   ---> shard-0-w.npy
  |   |
  |   ---> shard-0-ids.npy
  |   |
  |   ---> shard-1-X.npy
  |   .
  |   .
  |   .

  The metadata is constructed by static method
  `DiskDataset._construct_metadata` and saved to disk by
@@ -1124,11 +1124,11 @@ class DiskDataset(Dataset):
  legacy_metadata: bool
    Whether this `DiskDataset` uses legacy format.

  Notes
  -----
  Note
  ----
  `DiskDataset` originally had a simpler metadata format without shape
  information. Older `DiskDataset` objects had metadata files with columns
  `('ids', 'X', 'y', 'w') and not additional shape columns. `DiskDataset`
  `('ids', 'X', 'y', 'w')` and not additional shape columns. `DiskDataset`
  maintains backwards compatibility with this older metadata format, but we
  recommend for performance reasons not using legacy metadata for new
  projects.
@@ -1371,8 +1371,8 @@ class DiskDataset(Dataset):
      before moving. This is set to True by default to be backwards compatible
      with behavior in earlier versions of DeepChem.

    Notes
    -----
    Note
    ----
    This is a stateful operation! `self.data_dir` will be moved into
    `new_data_dir`. If `delete_if_exists` is set to `True` (by default this is
    set `True`), then `new_data_dir` is deleted if it's a pre-existing
@@ -1400,8 +1400,8 @@ class DiskDataset(Dataset):
    DiskDataset
      A copied DiskDataset object.

    Notes
    -----
    Note
    ----
    This is a stateful operation! Any data at `new_data_dir` will be deleted
    and `self.data_dir` will be deep copied into `new_data_dir`.
    """
@@ -1432,8 +1432,8 @@ class DiskDataset(Dataset):
    >>> d.get_number_shards()
    10

    Notes
    -----
    Note
    ----
    If this `DiskDataset` is in `legacy_metadata` format, reshard will
    convert this dataset to have non-legacy metadata.
    """
@@ -1822,8 +1822,8 @@ class DiskDataset(Dataset):
      `torch.utils.data.IterableDataset` that iterates over the data in
      this dataset.

    Notes
    -----
    Note
    ----
    This method requires PyTorch to be installed.
    """
    try:
@@ -1974,8 +1974,8 @@ class DiskDataset(Dataset):
    into a compressed representation, then shuffles this compressed dataset in
    memory and writes the results to disk.

    Notes
    -----
    Note
    ----
    This method only works for 1-dimensional feature vectors (does not work
    for tensorial featurizations). Note that this shuffle is performed in
    place.
@@ -2018,8 +2018,8 @@ class DiskDataset(Dataset):
  def complete_shuffle(self, data_dir: Optional[str] = None) -> Dataset:
    """Completely shuffle across all data, across all shards.

    Notes
    -----
    Note
    ----
    The algorithm used for this complete shuffle is O(N^2) where N is the
    number of shards. It simply constructs each shard of the output dataset
    one at a time. Since the complete shuffle can take a long time, it's
@@ -2864,8 +2864,8 @@ class ImageDataset(Dataset):
      `torch.utils.data.IterableDataset` that iterates over the data in
      this dataset.

    Notes
    -----
    Note
    ----
    This method requires PyTorch to be installed.
    """
    try:
+4 −4
Original line number Diff line number Diff line
@@ -99,8 +99,8 @@ class GraphData:
    torch_geometric.data.Data
      Graph data for PyTorch Geometric

    Notes
    -----
    Note
    ----
    This method requires PyTorch Geometric to be installed.
    """
    try:
@@ -134,8 +134,8 @@ class GraphData:
      Whether to add self loops for the nodes, i.e. edges from nodes
      to themselves. Default to False.

    Notes
    -----
    Note
    ----
    This method requires DGL to be installed.
    """
    try:
+2 −2
Original line number Diff line number Diff line
@@ -364,7 +364,7 @@ class TorchModel(Model):

      # Execute the loss function, accumulating the gradients.

      if len(inputs) == 1:
      if isinstance(inputs, list) and len(inputs) == 1:
        inputs = inputs[0]

      optimizer.zero_grad()
@@ -524,7 +524,7 @@ class TorchModel(Model):
      inputs, _, _ = self._prepare_batch((inputs, None, None))

      # Invoke the model.
      if len(inputs) == 1:
      if isinstance(inputs, list) and len(inputs) == 1:
        inputs = inputs[0]
      output_values = self.model(inputs)
      if isinstance(output_values, torch.Tensor):

devtools/archive/README.md

deleted100644 → 0
+0 −39
Original line number Diff line number Diff line
Developer Notes / Tools
=======================

How to do a release
-------------------

### Pre-release
- Create an issue about cutting the release.

### Release
- Tag current master with new release version
- Look at GitHub issues merged since last release
- Bump Dockerfile Version
- Update README with new version string
- Update Website install commands

### Post-release
- Update the docker images
```bash
sudo docker build -f Dockerfile .
sudo docker image list
# smoke test everything
nvidia-docker run -i -t \<IMAGE ID\>
python scripts/detect_devices.py // verify gpu is enabled
cd examples; python benchmark.py -d tox21

sudo docker tag \<IMAGE ID\> deepchemio/deepchem:latest
sudo docker push deepchemio/deepchem:latest

sudo docker tag \<IMAGE ID\> deepchemio/deepchem:<version>
sudo docker push deepchemio/deepchem:<version>
```
  
- Update conda installs
  - edit version in devtools/conda-recipes/deepchem/meta.yml
  - update requirements to be in line with scripts/install_deepchem_conda.sh
  - set deepchem anaconda org token
  - bash devtools/jenkins/conda_build.sh
- Post on Gitter
Loading