Commit d52ed986 authored by Nathan Frey's avatar Nathan Frey
Browse files

Change args

parent 9a8d2a52
Loading
Loading
Loading
Loading
+40 −21
Original line number Diff line number Diff line
@@ -3,13 +3,14 @@ ZINC15 commercially-available compounds for virtual screening.
"""
import os
import logging
import numpy as np
import deepchem
from deepchem.feat import Featurizer
from deepchem.trans import Transformer
from deepchem.splits.splitters import Splitter
from deepchem.molnet.defaults import get_defaults

from typing import List, Tuple, Dict, Optional
from typing import List, Tuple, Dict, Optional, Union

logger = logging.getLogger(__name__)

@@ -51,10 +52,9 @@ def load_zinc15(
            'transform_X': True
        }
    },
    zinc15_kwargs: Dict[str, str] = {
        'dataset_size': '250K',
        'dataset_dimension': '2D'
    },
    dataset_size: str = '250K',
    dataset_dimension: str = '2D',
    test_run: bool = False,
    **kwargs) -> Tuple[List, Optional[Tuple], List]:
  """Load zinc15.

@@ -64,11 +64,13 @@ def load_zinc15(
  in 2D (SMILES string) format.

  MolNet provides subsets of 250K, 1M, and 10M "lead-like" compounds
  from ZINC15. Compounds in ZINC15 are labeled by their molecular weight 
  from ZINC15. The full dataset of 270M "goldilocks" compounds is also
  available. Compounds in ZINC15 are labeled by their molecular weight 
  and LogP (lipophilicity) values. Each compound also has information about how
  readily available (purchasable) it is and its reactivity. Lead-like
  compounds have molecular weight between 300 and 350 Daltons and LogP
  between -1 and 3.5.
  between -1 and 3.5. Goldilocks compounds are lead-like compounds with
  LogP values further restricted to between 2 and 3.

  If `reload = True` and `data_dir` (`save_dir`) is specified, the loader
  will attempt to load the raw dataset (featurized dataset) from disk.
@@ -103,9 +105,12 @@ def load_zinc15(
  transformer_kwargs : dict
    Maps transformer names to constructor arguments, e.g.
    {"BalancingTransformer": {"transform_x":True, "transform_y":False}}
  zinc15_kwargs : dict
    Specify parameters for the ZINC15 dataset. Accepted keywords are
    'dataset_size' and 'dataset_dimension'.
  dataset_size : str (default '250K')
    Number of compounds to download; '250K', '1M', '10M', or '270M'.
  dataset_dimension : str (default '2D')
    Format of the compounds: SMILES strings ('2D') or 3D SDF files ('3D').
    Currently only '2D' is supported; any other value raises a ValueError.
  test_run : bool (default False)
    Flag to indicate tests, if True dataset is not downloaded.
  **kwargs : additional optional arguments.

  Returns
@@ -124,6 +129,7 @@ def load_zinc15(
  -----
  The total ZINC dataset with SMILES strings contains hundreds of millions
  of compounds and is over 100GB! ZINC250K is recommended for experimentation.
  The full set of 270M goldilocks compounds is 23GB.

  References
  ----------
@@ -131,12 +137,12 @@ def load_zinc15(

  Examples
  --------
  >> import deepchem as dc
  >> tasks, datasets, transformers = dc.molnet.load_zinc15(reload=False)
  >> train_dataset, val_dataset, test_dataset = datasets
  >> n_tasks = len(tasks)
  >> n_features = train_dataset.get_data_shape()[0]
  >> model = dc.models.MultitaskRegressor(n_tasks, n_features)
  >>> import deepchem as dc
  >>> tasks, datasets, transformers = dc.molnet.load_zinc15(test_run=True)
  >>> train_dataset, val_dataset, test_dataset = datasets
  >>> n_tasks = len(tasks)
  >>> n_features = train_dataset.X.shape[1]
  >>> model = dc.models.MultitaskRegressor(n_tasks, n_features)

  """

@@ -144,12 +150,12 @@ def load_zinc15(
  logger.info("About to featurize zinc15.")
  my_tasks = ['mwt', 'logp', 'reactive']  # machine learning targets

  # Get params specific to ZINC15
  dataset_size = zinc15_kwargs.get('dataset_size', '250K')
  dataset_dimension = zinc15_kwargs.get('dataset_dimension', '2D')
  if test_run:
    ds = deepchem.data.NumpyDataset(np.zeros((10, 1)))
    return my_tasks, (ds, ds, ds), []

  # Raise warnings and list other available options
  if dataset_size not in ['250K', '1M', '10M']:
  if dataset_size not in ['250K', '1M', '10M', '270M']:
    raise ValueError("""
      Only '250K', '1M', and '10M' are supported for dataset_size.
      """)
@@ -157,6 +163,14 @@ def load_zinc15(
    raise ValueError("""
      Currently, only '2D' is supported for dataset_dimension.
      """)
  if dataset_size == '270M':
    answer = ''
    while answer not in ['y', 'n']:
      answer = input("""You're about to download 270M SMILES strings.
        This dataset is 23GB. Are you sure you want to continue? (Y/N)"""
                    ).lower()
    if answer == 'n':
      raise ValueError('Choose a smaller dataset_size.')

  dataset_filename = 'zinc15_' + dataset_size + '_' + dataset_dimension + '.tar.gz'

@@ -208,7 +222,7 @@ def load_zinc15(
        featurizer=featurizer)

  # Featurize dataset
  dataset = loader.create_dataset(dataset_file)
  dataset = loader.create_dataset(os.path.join(data_dir, dataset_file))

  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, **splitter_kwargs)
@@ -231,3 +245,8 @@ def load_zinc15(
        save_folder, train_dataset, valid_dataset, test_dataset, transformers)

  return my_tasks, (train_dataset, valid_dataset, test_dataset), transformers


if __name__ == "__main__":
  # Executed as a script: run the examples embedded in this module's
  # docstrings as doctests. Nothing happens on a plain import.
  from doctest import testmod
  testmod()