Commit 605386a6 authored by peastman
Browse files

Converted material datasets to new API

parent ec4dd49a
Loading
Loading
Loading
Loading
+8 −6
Original line number Diff line number Diff line
@@ -50,13 +50,8 @@ class ElementPropertyFingerprint(MaterialCompositionFeaturizer):
    data_source: str of "matminer", "magpie" or "deml" (default "matminer")
      Source for element property data.
    """
    try:
      from matminer.featurizers.composition import ElementProperty
    except ModuleNotFoundError:
      raise ImportError("This class requires matminer to be installed.")

    self.data_source = data_source
    self.ep_featurizer = ElementProperty.from_preset(self.data_source)
    self.ep_featurizer = None

  def _featurize(self, composition: PymatgenComposition) -> np.ndarray:
    """
@@ -73,6 +68,13 @@ class ElementPropertyFingerprint(MaterialCompositionFeaturizer):
      Vector of properties and statistics derived from chemical
      stoichiometry. Some values may be NaN.
    """
    if self.ep_featurizer is None:
      try:
        from matminer.featurizers.composition import ElementProperty
        self.ep_featurizer = ElementProperty.from_preset(self.data_source)
      except ModuleNotFoundError:
        raise ImportError("This class requires matminer to be installed.")

    try:
      feats = self.ep_featurizer.featurize(composition)
    except:
+8 −6
Original line number Diff line number Diff line
@@ -54,14 +54,9 @@ class SineCoulombMatrix(MaterialStructureFeaturizer):
    flatten: bool (default True)
      Return flattened vector of matrix eigenvalues.
    """
    try:
      from matminer.featurizers.structure import SineCoulombMatrix as SCM
    except ModuleNotFoundError:
      raise ImportError("This class requires matminer to be installed.")

    self.max_atoms = max_atoms
    self.flatten = flatten
    self.scm = SCM(flatten=False)
    self.scm = None

  def _featurize(self, struct: PymatgenStructure) -> np.ndarray:
    """
@@ -79,6 +74,13 @@ class SineCoulombMatrix(MaterialStructureFeaturizer):
      2D sine Coulomb matrix with shape (max_atoms, max_atoms),
      or 1D matrix eigenvalues with shape (max_atoms,).
    """
    if self.scm is None:
      try:
        from matminer.featurizers.structure import SineCoulombMatrix as SCM
        self.scm = SCM(flatten=False)
      except ModuleNotFoundError:
        raise ImportError("This class requires matminer to be installed.")

    # Get full N x N SCM
    sine_mat = self.scm.featurize(struct)

+49 −146
Original line number Diff line number Diff line
@@ -2,60 +2,43 @@
Experimental bandgaps for inorganic crystals.
"""
import os
import logging
import deepchem as dc
from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
from deepchem.data import Dataset
from typing import List, Optional, Tuple, Union

import deepchem
from deepchem.feat import MaterialCompositionFeaturizer
from deepchem.splits.splitters import Splitter
from deepchem.molnet.defaults import get_defaults

from typing import List, Tuple, Dict, Optional, Any

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.data_utils.get_data_dir()
BANDGAP_URL = 'https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/expt_gap.tar.gz'
BANDGAP_TASKS = ['experimental_bandgap']

# dict of accepted featurizers for this dataset
# modify the returned dicts for your dataset
DEFAULT_FEATURIZERS = get_defaults("feat")

# Names of supported featurizers
featurizers = [
    'ElementPropertyFingerprint',
]
DEFAULT_FEATURIZERS = {k: DEFAULT_FEATURIZERS[k] for k in featurizers}

# dict of accepted transformers
DEFAULT_TRANSFORMERS = get_defaults("trans")
class _BandgapLoader(_MolnetLoader):

# dict of accepted splitters
DEFAULT_SPLITTERS = get_defaults("splits")

# names of supported splitters
splitters = ['RandomSplitter']
DEFAULT_SPLITTERS = {k: DEFAULT_SPLITTERS[k] for k in splitters}
  def create_dataset(self) -> Dataset:
    dataset_file = os.path.join(self.data_dir, 'expt_gap.json')
    targz_file = os.path.join(self.data_dir, 'expt_gap.tar.gz')
    if not os.path.exists(dataset_file):
      if not os.path.exists(targz_file):
        dc.utils.data_utils.download_url(
            url=BANDGAP_URL, dest_dir=self.data_dir)
      dc.utils.data_utils.untargz_file(targz_file, self.data_dir)
    loader = dc.data.JsonLoader(
        tasks=self.tasks,
        feature_field="composition",
        label_field="experimental_bandgap",
        featurizer=self.featurizer)
    return loader.create_dataset(dataset_file)


def load_bandgap(
    featurizer=DEFAULT_FEATURIZERS['ElementPropertyFingerprint'],
    transformers: List = [DEFAULT_TRANSFORMERS['NormalizationTransformer']],
    splitter=DEFAULT_SPLITTERS['RandomSplitter'],
    featurizer: Union[dc.feat.Featurizer,
                      str] = dc.feat.ElementPropertyFingerprint(),
    splitter: Union[dc.splits.Splitter, str, None] = 'random',
    transformers: List[Union[TransformerGenerator, str]] = ['normalization'],
    reload: bool = True,
    data_dir: Optional[str] = None,
    save_dir: Optional[str] = None,
    featurizer_kwargs: Dict[str, Any] = {},
    splitter_kwargs: Dict[str, Any] = {
        'frac_train': 0.8,
        'frac_valid': 0.1,
        'frac_test': 0.1
    },
    transformer_kwargs: Dict[str, Dict[str, Any]] = {
        'NormalizationTransformer': {
            'transform_X': True
        }
    },
    **kwargs) -> Tuple[List, Optional[Tuple], List]:
    **kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
  """Load band gap dataset.

  Contains 4604 experimentally measured band gaps for inorganic
@@ -68,27 +51,25 @@ def load_bandgap(

  Parameters
  ----------
  featurizer : MaterialCompositionFeaturizer (default ElementPropertyFingerprint)
    A featurizer that inherits from deepchem.feat.Featurizer.
  transformers : List[Transformer]
    A transformer that inherits from deepchem.trans.Transformer.
  splitter : Splitter (default RandomSplitter)
    A splitter that inherits from deepchem.splits.splitters.Splitter.
  reload : bool (default True)
    Try to reload dataset from disk if already downloaded. Save to disk
    after featurizing.
  data_dir : str, optional (default None)
    Path to datasets.
  save_dir : str, optional (default None)
    Path to featurized datasets.
  featurizer_kwargs : Dict[str, Any]
    Specify parameters to featurizer, e.g. {"size": 1024}
  splitter_kwargs : Dict[str, Any]
    Specify parameters to splitter, e.g. {"seed": 42}
  transformer_kwargs : dict
    Maps transformer names to constructor arguments, e.g.
    {"BalancingTransformer": {"transform_X":True, "transform_y":False}}
  **kwargs : additional optional arguments.
  featurizer: Featurizer or str
    the featurizer to use for processing the data.  Alternatively you can pass
    one of the names from dc.molnet.featurizers as a shortcut.
  splitter: Splitter or str
    the splitter to use for splitting the data into training, validation, and
    test sets.  Alternatively you can pass one of the names from
    dc.molnet.splitters as a shortcut.  If this is None, all the data
    will be included in a single dataset.
  transformers: list of TransformerGenerators or strings
    the Transformers to apply to the data.  Each one is specified by a
    TransformerGenerator or, as a shortcut, one of the names from
    dc.molnet.transformers.
  reload: bool
    if True, the first call for a particular featurizer and splitter will cache
    the datasets to disk, and subsequent calls will reload the cached datasets.
  data_dir: str
    a directory to save the raw data in
  save_dir: str
    a directory to save the dataset in

  Returns
  -------
@@ -112,91 +93,13 @@ def load_bandgap(
  Examples
  --------
  >> import deepchem as dc
  >> tasks, datasets, transformers = dc.molnet.load_bandgap(reload=False)
  >> tasks, datasets, transformers = dc.molnet.load_bandgap()
  >> train_dataset, val_dataset, test_dataset = datasets
  >> n_tasks = len(tasks)
  >> n_features = train_dataset.get_data_shape()[0]
  >> model = dc.models.MultitaskRegressor(n_tasks, n_features)

  """

  # Featurize
  logger.info("About to featurize band gap dataset.")
  my_tasks = ['experimental_bandgap']  # machine learning targets

  # Get DeepChem data directory if needed
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  if issubclass(featurizer, MaterialCompositionFeaturizer):
    featurizer = featurizer(**featurizer_kwargs)
  else:
    raise TypeError(
        "featurizer must be a subclass of MaterialCompositionFeaturizer.")

  if issubclass(splitter, Splitter):
    splitter = splitter()
  else:
    raise TypeError("splitter must be a subclass of Splitter.")

  # Reload from disk
  if reload:
    featurizer_name = str(featurizer.__class__.__name__)
    splitter_name = str(splitter.__class__.__name__)
    save_folder = os.path.join(save_dir, "bandgap-featurized", featurizer_name,
                               splitter_name)

    loaded, all_dataset, transformers = deepchem.utils.data_utils.load_dataset_from_disk(
        save_folder)
    if loaded:
      return my_tasks, all_dataset, transformers

  # First type of supported featurizers
  supported_featurizers: List[str] = ['ElementPropertyFingerprint']

  # Load .tar.gz file
  if featurizer.__class__.__name__ in supported_featurizers:
    dataset_file = os.path.join(data_dir, 'expt_gap.json')

    if not os.path.exists(dataset_file):
      targz_file = os.path.join(data_dir, 'expt_gap.tar.gz')
      if not os.path.exists(targz_file):
        deepchem.utils.data_utils.download_url(
            url=BANDGAP_URL, dest_dir=data_dir)

      deepchem.utils.data_utils.untargz_file(
          os.path.join(data_dir, 'expt_gap.tar.gz'), data_dir)

    # Change loader to match featurizer and data file type
    loader = deepchem.data.JsonLoader(
        tasks=my_tasks,
        feature_field="composition",
        label_field="experimental_bandgap",
        featurizer=featurizer)

  # Featurize dataset
  dataset = loader.create_dataset(dataset_file)

  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, **splitter_kwargs)

  # Initialize transformers
  transformers = [
      DEFAULT_TRANSFORMERS[t](dataset=dataset, **transformer_kwargs[t])
      if isinstance(t, str) else t(
          dataset=dataset, **transformer_kwargs[str(t.__name__)])
      for t in transformers
  ]

  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)

  if reload:  # save to disk
    deepchem.utils.data_utils.save_dataset_to_disk(
        save_folder, train_dataset, valid_dataset, test_dataset, transformers)

  return my_tasks, (train_dataset, valid_dataset, test_dataset), transformers
  loader = _BandgapLoader(featurizer, splitter, transformers, BANDGAP_TASKS,
                          data_dir, save_dir, **kwargs)
  return loader.load_dataset('bandgap', reload)
+48 −148
Original line number Diff line number Diff line
@@ -2,60 +2,42 @@
Calculated formation energies for inorganic crystals from Materials Project.
"""
import os
import logging
import deepchem
from deepchem.feat import MaterialStructureFeaturizer
from deepchem.splits.splitters import Splitter
from deepchem.molnet.defaults import get_defaults
import deepchem as dc
from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
from deepchem.data import Dataset
from typing import List, Optional, Tuple, Union

from typing import List, Tuple, Dict, Optional, Any

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.data_utils.get_data_dir()
MPFORME_URL = 'https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/mp_formation_energy.tar.gz'
MPFORME_TASKS = ['formation_energy']

# dict of accepted featurizers for this dataset
# modify the returned dicts for your dataset
DEFAULT_FEATURIZERS = get_defaults("feat")

# Names of supported featurizers
featurizers = [
    'CGCNNFeaturizer',
    'SineCoulombMatrix',
]
DEFAULT_FEATURIZERS = {k: DEFAULT_FEATURIZERS[k] for k in featurizers}

# dict of accepted transformers
DEFAULT_TRANSFORMERS = get_defaults("trans")

# dict of accepted splitters
DEFAULT_SPLITTERS = get_defaults("splits")
class _MPFormationLoader(_MolnetLoader):

# names of supported splitters
splitters = ['RandomSplitter']
DEFAULT_SPLITTERS = {k: DEFAULT_SPLITTERS[k] for k in splitters}
  def create_dataset(self) -> Dataset:
    dataset_file = os.path.join(self.data_dir, 'mp_formation_energy.json')
    targz_file = os.path.join(self.data_dir, 'mp_formation_energy.tar.gz')
    if not os.path.exists(dataset_file):
      if not os.path.exists(targz_file):
        dc.utils.data_utils.download_url(
            url=MPFORME_URL, dest_dir=self.data_dir)
      dc.utils.data_utils.untargz_file(targz_file, self.data_dir)
    loader = dc.data.JsonLoader(
        tasks=self.tasks,
        feature_field="structure",
        label_field="formation_energy",
        featurizer=self.featurizer)
    return loader.create_dataset(dataset_file)


def load_mp_formation_energy(
    featurizer=DEFAULT_FEATURIZERS['SineCoulombMatrix'],
    transformers: List = [DEFAULT_TRANSFORMERS['NormalizationTransformer']],
    splitter=DEFAULT_SPLITTERS['RandomSplitter'],
    featurizer: Union[dc.feat.Featurizer, str] = dc.feat.SineCoulombMatrix(),
    splitter: Union[dc.splits.Splitter, str, None] = 'random',
    transformers: List[Union[TransformerGenerator, str]] = ['normalization'],
    reload: bool = True,
    data_dir: Optional[str] = None,
    save_dir: Optional[str] = None,
    featurizer_kwargs: Dict[str, Any] = {},
    splitter_kwargs: Dict[str, Any] = {
        'frac_train': 0.8,
        'frac_valid': 0.1,
        'frac_test': 0.1
    },
    transformer_kwargs: Dict[str, Dict[str, Any]] = {
        'NormalizationTransformer': {
            'transform_X': True
        }
    },
    **kwargs) -> Tuple[List, Optional[Tuple], List]:
    **kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
  """Load mp formation energy dataset.

  Contains 132752 calculated formation energies and inorganic
@@ -69,27 +51,25 @@ def load_mp_formation_energy(

  Parameters
  ----------
  featurizer : MaterialStructureFeaturizer (default SineCoulombMatrix)
    A featurizer that inherits from deepchem.feat.Featurizer.
  transformers : List[Transformer]
    A transformer that inherits from deepchem.trans.Transformer.
  splitter : Splitter (default RandomSplitter)
    A splitter that inherits from deepchem.splits.splitters.Splitter.
  reload : bool (default True)
    Try to reload dataset from disk if already downloaded. Save to disk
    after featurizing.
  data_dir : str, optional (default None)
    Path to datasets.
  save_dir : str, optional (default None)
    Path to featurized datasets.
  featurizer_kwargs : Dict[str, Any]
    Specify parameters to featurizer, e.g. {"size": 1024}
  splitter_kwargs : Dict[str, Any]
    Specify parameters to splitter, e.g. {"seed": 42}
  transformer_kwargs : dict
    Maps transformer names to constructor arguments, e.g.
    {"BalancingTransformer": {"transform_X":True, "transform_y":False}}
  **kwargs : additional optional arguments.
  featurizer: Featurizer or str
    the featurizer to use for processing the data.  Alternatively you can pass
    one of the names from dc.molnet.featurizers as a shortcut.
  splitter: Splitter or str
    the splitter to use for splitting the data into training, validation, and
    test sets.  Alternatively you can pass one of the names from
    dc.molnet.splitters as a shortcut.  If this is None, all the data
    will be included in a single dataset.
  transformers: list of TransformerGenerators or strings
    the Transformers to apply to the data.  Each one is specified by a
    TransformerGenerator or, as a shortcut, one of the names from
    dc.molnet.transformers.
  reload: bool
    if True, the first call for a particular featurizer and splitter will cache
    the datasets to disk, and subsequent calls will reload the cached datasets.
  data_dir: str
    a directory to save the raw data in
  save_dir: str
    a directory to save the dataset in

  Returns
  -------
@@ -114,93 +94,13 @@ def load_mp_formation_energy(
  Examples
  --------
  >> import deepchem as dc
  >> tasks, datasets, transformers = dc.molnet.load_mp_formation_energy(reload=False)
  >> tasks, datasets, transformers = dc.molnet.load_mp_formation_energy()
  >> train_dataset, val_dataset, test_dataset = datasets
  >> n_tasks = len(tasks)
  >> n_features = train_dataset.get_data_shape()[0]
  >> model = dc.models.MultitaskRegressor(n_tasks, n_features)

  """

  # Featurize
  logger.info("About to featurize mp formation energy dataset.")
  my_tasks = ['formation_energy']  # machine learning targets

  # Get DeepChem data directory if needed
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  if issubclass(featurizer, MaterialStructureFeaturizer):
    featurizer = featurizer(**featurizer_kwargs)
  else:
    raise TypeError(
        "featurizer must be a subclass of MaterialStructureFeaturizer.")

  if issubclass(splitter, Splitter):
    splitter = splitter()
  else:
    raise TypeError("splitter must be a subclass of Splitter.")

  # Reload from disk
  if reload:
    featurizer_name = str(featurizer.__class__.__name__)
    splitter_name = str(splitter.__class__.__name__)
    save_folder = os.path.join(save_dir, "mp-forme-featurized", featurizer_name,
                               splitter_name)

    loaded, all_dataset, transformers = deepchem.utils.data_utils.load_dataset_from_disk(
        save_folder)
    if loaded:
      return my_tasks, all_dataset, transformers

  # First type of supported featurizers
  supported_featurizers: List[str] = [
      'CGCNNFeaturizer',
      'SineCoulombMatrix',
  ]

  # Load .tar.gz file
  if featurizer.__class__.__name__ in supported_featurizers:
    dataset_file = os.path.join(data_dir, 'mp_formation_energy.json')

    if not os.path.exists(dataset_file):
      targz_file = os.path.join(data_dir, 'mp_formation_energy.tar.gz')
      if not os.path.exists(targz_file):
        deepchem.utils.data_utils.download_url(
            url=MPFORME_URL, dest_dir=data_dir)
      deepchem.utils.data_utils.untargz_file(
          os.path.join(data_dir, 'mp_formation_energy.tar.gz'), data_dir)

    # Change loader to match featurizer and data file type
    loader = deepchem.data.JsonLoader(
        tasks=my_tasks,
        feature_field="structure",
        label_field="formation_energy",
        featurizer=featurizer)

  # Featurize dataset
  dataset = loader.create_dataset(dataset_file)

  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, **splitter_kwargs)

  # Initialize transformers
  transformers = [
      DEFAULT_TRANSFORMERS[t](dataset=dataset, **transformer_kwargs[t])
      if isinstance(t, str) else t(
          dataset=dataset, **transformer_kwargs[str(t.__name__)])
      for t in transformers
  ]

  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)

  if reload:  # save to disk
    deepchem.utils.data_utils.save_dataset_to_disk(
        save_folder, train_dataset, valid_dataset, test_dataset, transformers)

  return my_tasks, (train_dataset, valid_dataset, test_dataset), transformers
  loader = _MPFormationLoader(featurizer, splitter, transformers, MPFORME_TASKS,
                              data_dir, save_dir, **kwargs)
  return loader.load_dataset('mp-forme', reload)
+48 −148

File changed.

Preview size limit exceeded, changes collapsed.

Loading