Commit dec031e0 authored by Nathan Frey's avatar Nathan Frey
Browse files

Reorganized MolNet materials loaders

parent b4141822
Loading
Loading
Loading
Loading
+5 −1
Original line number Diff line number Diff line
@@ -37,6 +37,10 @@ class ElementPropertyFingerprint(MaterialCompositionFeaturizer):

  .. [4] Pymatgen: Ong, S.P. et al. Comput. Mater. Sci. 68, 314-319 (2013).

  Notes
  -----
  `NaN` feature values are automatically converted to 0 by this featurizer.  

  """

  def __init__(self, data_source='matminer'):
+2 −1
Original line number Diff line number Diff line
@@ -31,7 +31,8 @@ from deepchem.molnet.load_function.kinase_datasets import load_kinase
from deepchem.molnet.load_function.thermosol_datasets import load_thermosol
from deepchem.molnet.load_function.hppb_datasets import load_hppb
from deepchem.molnet.load_function.chembl25_datasets import load_chembl25
from deepchem.molnet.load_function.material_datasets import load_bandgap, load_perovskite
from deepchem.molnet.load_function.material_datasets.load_bandgap import load_bandgap
from deepchem.molnet.load_function.material_datasets.load_perovskite import load_perovskite

from deepchem.molnet.dnasim import simulate_motif_density_localization
from deepchem.molnet.dnasim import simulate_motif_counting
+20 −7
Original line number Diff line number Diff line
@@ -4,12 +4,12 @@ Datasets for inorganic crystal structures.
import os
import logging
import deepchem
from deepchem.feat import Featurizer
from deepchem.feat import Featurizer, MaterialStructureFeaturizer, MaterialCompositionFeaturizer
from deepchem.trans import Transformer
from deepchem.splits.splitters import Splitter
from deepchem.molnet.defaults import get_defaults

from typing import List, Tuple, Dict, Optional
from typing import List, Tuple, Dict, Optional, Union

logger = logging.getLogger(__name__)

@@ -41,7 +41,7 @@ DEFAULT_SPLITTERS = {k: DEFAULT_SPLITTERS[k] for k in splitters}


def load_bandgap(
    featurizer: Featurizer = DEFAULT_FEATURIZERS['ElementPropertyFingerprint'],
    featurizer: MaterialCompositionFeaturizer = DEFAULT_FEATURIZERS['ElementPropertyFingerprint'],
    transformers: List[Transformer] = [
        DEFAULT_TRANSFORMERS['NormalizationTransformer']
    ],
@@ -64,15 +64,21 @@ def load_bandgap(
  """Load band gap dataset.

  Contains 4604 experimentally measured band gaps for inorganic
  crystal structure compositions.
  crystal structure compositions. In benchmark studies, random forest 
  models achieved a mean average error of 0.45 eV during five-fold 
  nested cross validation on this dataset. 

  For more details on the dataset see [1]_. For more details
  on previous benchmarks for this dataset, see [2]_.
  
  Parameters
  ----------
  featurizer : ElementPropertyFingerprint
  featurizer : MaterialCompositionFeaturizer 
    (default ElementPropertyFingerprint)
    A featurizer that inherits from deepchem.feat.Featurizer.
  transformers : List{List of allowed transformers for this dataset}
  transformers : List[Transformer]
    A transformer that inherits from deepchem.trans.Transformer.
  splitter : RandomSplitter
  splitter : Splitter (default RandomSplitter)
    A splitter that inherits from deepchem.splits.splitters.Splitter.
  reload : bool (default True)
    Try to reload dataset from disk if already downloaded. Save to disk
@@ -224,6 +230,13 @@ def load_perovskite(
  """Load perovskite dataset.

  Contains 18928 perovskite structures and their formation energies.
  In benchmark studies, random forest models and crystal graph
  neural networks achieved mean average error of 0.23 and 0.05 eV/atom,
  respectively, during five-fold nested cross validation on this
  dataset. 

  For more details on the dataset see [1]_. For more details
  on previous benchmarks for this dataset, see [2]_.
  
  Parameters
  ----------
+0 −0

Empty file added.

+205 −0
Original line number Diff line number Diff line
"""
Experimental bandgaps for inorganic crystals.
"""
import os
import logging
import deepchem
from deepchem.feat import Featurizer, MaterialStructureFeaturizer, MaterialCompositionFeaturizer
from deepchem.trans import Transformer
from deepchem.splits.splitters import Splitter
from deepchem.molnet.defaults import get_defaults

from typing import List, Tuple, Dict, Optional, Union

logger = logging.getLogger(__name__)

# TODO: Change URLs
DEFAULT_DIR = deepchem.utils.get_data_dir()
BANDGAP_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/expt_gap.tar.gz'

# dict of accepted featurizers for this dataset
# modify the returned dicts for your dataset
DEFAULT_FEATURIZERS = get_defaults("feat")

# Names of supported featurizers
featurizers = [
    'ElementPropertyFingerprint',
]
DEFAULT_FEATURIZERS = {k: DEFAULT_FEATURIZERS[k] for k in featurizers}

# dict of accepted transformers
DEFAULT_TRANSFORMERS = get_defaults("trans")

# dict of accepted splitters
DEFAULT_SPLITTERS = get_defaults("splits")

# names of supported splitters
splitters = ['RandomSplitter']
DEFAULT_SPLITTERS = {k: DEFAULT_SPLITTERS[k] for k in splitters}


def load_bandgap(
    featurizer: MaterialCompositionFeaturizer = DEFAULT_FEATURIZERS[
        'ElementPropertyFingerprint'],
    transformers: List[Transformer] = [
        DEFAULT_TRANSFORMERS['NormalizationTransformer']
    ],
    splitter: Splitter = DEFAULT_SPLITTERS['RandomSplitter'],
    reload: bool = True,
    data_dir: Optional[str] = None,
    save_dir: Optional[str] = None,
    featurizer_kwargs: Dict[str, object] = {'data_source': 'matminer'},
    splitter_kwargs: Dict[str, object] = {
        'frac_train': 0.8,
        'frac_valid': 0.1,
        'frac_test': 0.1
    },
    transformer_kwargs: Dict[str, Dict[str, object]] = {
        'NormalizationTransformer': {
            'transform_X': True
        }
    },
    **kwargs) -> Tuple[List, Tuple, List]:
  """Load band gap dataset.

  Contains 4604 experimentally measured band gaps for inorganic
  crystal structure compositions. In benchmark studies, random forest 
  models achieved a mean average error of 0.45 eV during five-fold 
  nested cross validation on this dataset. 

  For more details on the dataset see [1]_. For more details
  on previous benchmarks for this dataset, see [2]_.
  
  Parameters
  ----------
  featurizer : MaterialCompositionFeaturizer 
    (default ElementPropertyFingerprint)
    A featurizer that inherits from deepchem.feat.Featurizer.
  transformers : List[Transformer]
    A transformer that inherits from deepchem.trans.Transformer.
  splitter : Splitter (default RandomSplitter)
    A splitter that inherits from deepchem.splits.splitters.Splitter.
  reload : bool (default True)
    Try to reload dataset from disk if already downloaded. Save to disk
    after featurizing.
  data_dir : str, optional
    Path to datasets.
  save_dir : str, optional
    Path to featurized datasets.
  featurizer_kwargs : dict
    Specify parameters to featurizer, e.g. {"size": 1024}
  splitter_kwargs : dict
    Specify parameters to splitter, e.g. {"seed": 42}
  transformer_kwargs : dict
    Maps transformer names to constructor arguments, e.g.
    {"BalancingTransformer": {"transform_x":True, "transform_y":False}}
  **kwargs : additional optional arguments.

  Returns
  -------
  tasks, datasets, transformers : tuple
    tasks : list
      Column names corresponding to machine learning target variables.
    datasets : tuple
      train, validation, test splits of data as
      ``deepchem.data.datasets.Dataset`` instances.
    transformers : list
      ``deepchem.trans.transformers.Transformer`` instances applied
      to dataset.

  References
  ----------
  .. [1] Zhuo, Y. et al. "Predicting the Band Gaps of Inorganic Solids by Machine Learning." J. Phys. Chem. Lett. (2018) DOI: 10.1021/acs.jpclett.8b00124.

  .. [2] Dunn, A. et al. "Benchmarking Materials Property Prediction Methods: The Matbench Test Set and Automatminer Reference Algorithm." https://arxiv.org/abs/2005.00707 (2020)

  Examples
  --------
  >> import deepchem as dc
  >> tasks, datasets, transformers = dc.molnet.load_bandgap(reload=False)
  >> train_dataset, val_dataset, test_dataset = datasets
  >> n_tasks = len(tasks)
  >> n_features = train_dataset.get_data_shape()[0]
  >> model = dc.models.MultitaskRegressor(n_tasks, n_features)

  """

  # Featurize
  logger.info("About to featurize band gap dataset.")
  my_tasks = ['experimental_bandgap']  # machine learning targets

  # Get DeepChem data directory if needed
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  if issubclass(featurizer, MaterialCompositionFeaturizer):
    featurizer = featurizer(**featurizer_kwargs)
  else:
    raise TypeError(
        "featurizer must be a subclass of MaterialCompositionFeaturizer.")

  if issubclass(splitter, Splitter):
    splitter = splitter()
  else:
    raise TypeError("splitter must be a subclass of Splitter.")

  # Reload from disk
  if reload:
    featurizer_name = str(featurizer.__class__.__name__)
    splitter_name = str(splitter.__class__.__name__)
    save_folder = os.path.join(save_dir, "bandgap-featurized", featurizer_name,
                               splitter_name)

    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_folder)
    if loaded:
      return my_tasks, all_dataset, transformers

  # First type of supported featurizers
  supported_featurizers = ['ElementPropertyFingerprint'
                          ]  # type: List[Featurizer]

  # Load .tar.gz file
  if featurizer.__class__.__name__ in supported_featurizers:
    dataset_file = os.path.join(data_dir, 'expt_gap.tar.gz')
    deepchem.utils.untargz_file(dataset_file, dest_dir=data_dir)
    dataset_file = os.path.join(data_dir, 'expt_gap.json')

    if not os.path.exists(dataset_file):
      deepchem.utils.download_url(url=BANDGAP_URL, dest_dir=data_dir)
      deepchem.utils.untargz_file(
          os.path.join(data_dir, 'expt_gap.tar.gz'), data_dir)

    # Changer loader to match featurizer and data file type
    loader = deepchem.data.JsonLoader(
        tasks=my_tasks,
        feature_field="composition",
        label_field="experimental_bandgap",
        featurizer=featurizer)

  # Featurize dataset
  dataset = loader.create_dataset(dataset_file)

  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, **splitter_kwargs)

  # Initialize transformers
  transformers = [
      DEFAULT_TRANSFORMERS[t](dataset=dataset, **transformer_kwargs[t])
      if isinstance(t, str) else t(
          dataset=dataset, **transformer_kwargs[str(t.__name__)])
      for t in transformers
  ]

  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)

  if reload:  # save to disk
    deepchem.utils.save.save_dataset_to_disk(
        save_folder, train_dataset, valid_dataset, test_dataset, transformers)

  return my_tasks, (train_dataset, valid_dataset, test_dataset), transformers
Loading