Unverified commit ff137c65, authored by peastman and committed by GitHub
Browse files

Merge pull request #2304 from peastman/molnet

Converted more molnet loaders to new API
parents e1e1a1f0 bc6c001f
Loading
Loading
Loading
Loading
+9 −6
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@ import numpy as np

from deepchem.utils.typing import PymatgenComposition
from deepchem.feat import MaterialCompositionFeaturizer
from typing import Any


class ElementPropertyFingerprint(MaterialCompositionFeaturizer):
@@ -50,13 +51,8 @@ class ElementPropertyFingerprint(MaterialCompositionFeaturizer):
    data_source: str of "matminer", "magpie" or "deml" (default "matminer")
      Source for element property data.
    """
    try:
      from matminer.featurizers.composition import ElementProperty
    except ModuleNotFoundError:
      raise ImportError("This class requires matminer to be installed.")

    self.data_source = data_source
    self.ep_featurizer = ElementProperty.from_preset(self.data_source)
    self.ep_featurizer: Any = None

  def _featurize(self, composition: PymatgenComposition) -> np.ndarray:
    """
@@ -73,6 +69,13 @@ class ElementPropertyFingerprint(MaterialCompositionFeaturizer):
      Vector of properties and statistics derived from chemical
      stoichiometry. Some values may be NaN.
    """
    if self.ep_featurizer is None:
      try:
        from matminer.featurizers.composition import ElementProperty
        self.ep_featurizer = ElementProperty.from_preset(self.data_source)
      except ModuleNotFoundError:
        raise ImportError("This class requires matminer to be installed.")

    try:
      feats = self.ep_featurizer.featurize(composition)
    except:
+9 −6
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@ import numpy as np
from deepchem.utils.typing import PymatgenStructure
from deepchem.feat import MaterialStructureFeaturizer
from deepchem.utils.data_utils import pad_array
from typing import Any


class SineCoulombMatrix(MaterialStructureFeaturizer):
@@ -54,14 +55,9 @@ class SineCoulombMatrix(MaterialStructureFeaturizer):
    flatten: bool (default True)
      Return flattened vector of matrix eigenvalues.
    """
    try:
      from matminer.featurizers.structure import SineCoulombMatrix as SCM
    except ModuleNotFoundError:
      raise ImportError("This class requires matminer to be installed.")

    self.max_atoms = max_atoms
    self.flatten = flatten
    self.scm = SCM(flatten=False)
    self.scm: Any = None

  def _featurize(self, struct: PymatgenStructure) -> np.ndarray:
    """
@@ -79,6 +75,13 @@ class SineCoulombMatrix(MaterialStructureFeaturizer):
      2D sine Coulomb matrix with shape (max_atoms, max_atoms),
      or 1D matrix eigenvalues with shape (max_atoms,).
    """
    if self.scm is None:
      try:
        from matminer.featurizers.structure import SineCoulombMatrix as SCM
        self.scm = SCM(flatten=False)
      except ModuleNotFoundError:
        raise ImportError("This class requires matminer to be installed.")

    # Get full N x N SCM
    sine_mat = self.scm.featurize(struct)

+3 −3
Original line number Diff line number Diff line
@@ -23,7 +23,7 @@ def test_cgcnn_regression():
  current_dir = path.dirname(path.abspath(__file__))
  config = {
      "reload": False,
      "featurizer": CGCNNFeaturizer,
      "featurizer": CGCNNFeaturizer(),
      # disable transformer
      "transformers": [],
      "data_dir": current_dir
@@ -59,7 +59,7 @@ def test_cgcnn_classification():
  current_dir = path.dirname(path.abspath(__file__))
  config = {
      "reload": False,
      "featurizer": CGCNNFeaturizer,
      "featurizer": CGCNNFeaturizer(),
      # disable transformer
      "transformers": [],
      "data_dir": current_dir
@@ -101,7 +101,7 @@ def test_cgcnn_reload():
  current_dir = path.dirname(path.abspath(__file__))
  config = {
      "reload": False,
      "featurizer": CGCNNFeaturizer,
      "featurizer": CGCNNFeaturizer(),
      # disable transformer
      "transformers": [],
      "data_dir": current_dir
+1 −1
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@ from deepchem.molnet.load_function.kaggle_datasets import load_kaggle
from deepchem.molnet.load_function.lipo_datasets import load_lipo
from deepchem.molnet.load_function.muv_datasets import load_muv
from deepchem.molnet.load_function.nci_datasets import load_nci
from deepchem.molnet.load_function.pcba_datasets import load_pcba, load_pcba_146, load_pcba_2475
from deepchem.molnet.load_function.pcba_datasets import load_pcba
from deepchem.molnet.load_function.pdbbind_datasets import load_pdbbind_grid, load_pdbbind, load_pdbbind_from_dir
from deepchem.molnet.load_function.ppb_datasets import load_ppb
from deepchem.molnet.load_function.qm7_datasets import load_qm7
+50 −146
Original line number Diff line number Diff line
@@ -2,60 +2,43 @@
Experimental bandgaps for inorganic crystals.
"""
import os
import logging
import deepchem as dc
from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
from deepchem.data import Dataset
from typing import List, Optional, Tuple, Union

import deepchem
from deepchem.feat import MaterialCompositionFeaturizer
from deepchem.splits.splitters import Splitter
from deepchem.molnet.defaults import get_defaults

from typing import List, Tuple, Dict, Optional, Any

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.data_utils.get_data_dir()
BANDGAP_URL = 'https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/expt_gap.tar.gz'
BANDGAP_TASKS = ['experimental_bandgap']

# dict of accepted featurizers for this dataset
# modify the returned dicts for your dataset
DEFAULT_FEATURIZERS = get_defaults("feat")

# Names of supported featurizers
featurizers = [
    'ElementPropertyFingerprint',
]
DEFAULT_FEATURIZERS = {k: DEFAULT_FEATURIZERS[k] for k in featurizers}

# dict of accepted transformers
DEFAULT_TRANSFORMERS = get_defaults("trans")
class _BandgapLoader(_MolnetLoader):

# dict of accepted splitters
DEFAULT_SPLITTERS = get_defaults("splits")

# names of supported splitters
splitters = ['RandomSplitter']
DEFAULT_SPLITTERS = {k: DEFAULT_SPLITTERS[k] for k in splitters}
  def create_dataset(self) -> Dataset:
    """Create the band gap dataset from the raw ``expt_gap.json`` file.

    The JSON file is looked up in ``self.data_dir``. When it is missing, the
    ``expt_gap.tar.gz`` archive in the same directory is handed to
    ``untargz_file`` with ``self.data_dir`` as the destination; if the archive
    itself is missing it is first fetched from ``BANDGAP_URL``.

    Returns
    -------
    Dataset
      Whatever ``JsonLoader.create_dataset`` produces for the JSON file, using
      ``self.featurizer`` on the "composition" field and
      "experimental_bandgap" as the label field.
    """
    data_utils = dc.utils.data_utils
    json_path = os.path.join(self.data_dir, 'expt_gap.json')

    if not os.path.exists(json_path):
      archive_path = os.path.join(self.data_dir, 'expt_gap.tar.gz')
      # Only download when the archive is not already present locally.
      if not os.path.exists(archive_path):
        data_utils.download_url(url=BANDGAP_URL, dest_dir=self.data_dir)
      data_utils.untargz_file(archive_path, self.data_dir)

    json_loader = dc.data.JsonLoader(
        tasks=self.tasks,
        feature_field="composition",
        label_field="experimental_bandgap",
        featurizer=self.featurizer)
    return json_loader.create_dataset(json_path)


def load_bandgap(
    featurizer=DEFAULT_FEATURIZERS['ElementPropertyFingerprint'],
    transformers: List = [DEFAULT_TRANSFORMERS['NormalizationTransformer']],
    splitter=DEFAULT_SPLITTERS['RandomSplitter'],
    featurizer: Union[dc.feat.Featurizer,
                      str] = dc.feat.ElementPropertyFingerprint(),
    splitter: Union[dc.splits.Splitter, str, None] = 'random',
    transformers: List[Union[TransformerGenerator, str]] = ['normalization'],
    reload: bool = True,
    data_dir: Optional[str] = None,
    save_dir: Optional[str] = None,
    featurizer_kwargs: Dict[str, Any] = {},
    splitter_kwargs: Dict[str, Any] = {
        'frac_train': 0.8,
        'frac_valid': 0.1,
        'frac_test': 0.1
    },
    transformer_kwargs: Dict[str, Dict[str, Any]] = {
        'NormalizationTransformer': {
            'transform_X': True
        }
    },
    **kwargs) -> Tuple[List, Optional[Tuple], List]:
    **kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
  """Load band gap dataset.

  Contains 4604 experimentally measured band gaps for inorganic
@@ -68,27 +51,25 @@ def load_bandgap(

  Parameters
  ----------
  featurizer : MaterialCompositionFeaturizer (default ElementPropertyFingerprint)
    A featurizer that inherits from deepchem.feat.Featurizer.
  transformers : List[Transformer]
    A transformer that inherits from deepchem.trans.Transformer.
  splitter : Splitter (default RandomSplitter)
    A splitter that inherits from deepchem.splits.splitters.Splitter.
  reload : bool (default True)
    Try to reload dataset from disk if already downloaded. Save to disk
    after featurizing.
  data_dir : str, optional (default None)
    Path to datasets.
  save_dir : str, optional (default None)
    Path to featurized datasets.
  featurizer_kwargs : Dict[str, Any]
    Specify parameters to featurizer, e.g. {"size": 1024}
  splitter_kwargs : Dict[str, Any]
    Specify parameters to splitter, e.g. {"seed": 42}
  transformer_kwargs : dict
    Maps transformer names to constructor arguments, e.g.
    {"BalancingTransformer": {"transform_x":True, "transform_y":False}}
  **kwargs : additional optional arguments.
  featurizer: Featurizer or str
    the featurizer to use for processing the data.  Alternatively you can pass
    one of the names from dc.molnet.featurizers as a shortcut.
  splitter: Splitter or str
    the splitter to use for splitting the data into training, validation, and
    test sets.  Alternatively you can pass one of the names from
    dc.molnet.splitters as a shortcut.  If this is None, all the data
    will be included in a single dataset.
  transformers: list of TransformerGenerators or strings
    the Transformers to apply to the data.  Each one is specified by a
    TransformerGenerator or, as a shortcut, one of the names from
    dc.molnet.transformers.
  reload: bool
    if True, the first call for a particular featurizer and splitter will cache
    the datasets to disk, and subsequent calls will reload the cached datasets.
  data_dir: str
    a directory to save the raw data in
  save_dir: str
    a directory to save the dataset in

  Returns
  -------
@@ -111,92 +92,15 @@ def load_bandgap(

  Examples
  --------
  >>>
  >> import deepchem as dc
  >> tasks, datasets, transformers = dc.molnet.load_bandgap(reload=False)
  >> tasks, datasets, transformers = dc.molnet.load_bandgap()
  >> train_dataset, val_dataset, test_dataset = datasets
  >> n_tasks = len(tasks)
  >> n_features = train_dataset.get_data_shape()[0]
  >> model = dc.models.MultitaskRegressor(n_tasks, n_features)

  """

  # Featurize
  logger.info("About to featurize band gap dataset.")
  my_tasks = ['experimental_bandgap']  # machine learning targets

  # Get DeepChem data directory if needed
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  if issubclass(featurizer, MaterialCompositionFeaturizer):
    featurizer = featurizer(**featurizer_kwargs)
  else:
    raise TypeError(
        "featurizer must be a subclass of MaterialCompositionFeaturizer.")

  if issubclass(splitter, Splitter):
    splitter = splitter()
  else:
    raise TypeError("splitter must be a subclass of Splitter.")

  # Reload from disk
  if reload:
    featurizer_name = str(featurizer.__class__.__name__)
    splitter_name = str(splitter.__class__.__name__)
    save_folder = os.path.join(save_dir, "bandgap-featurized", featurizer_name,
                               splitter_name)

    loaded, all_dataset, transformers = deepchem.utils.data_utils.load_dataset_from_disk(
        save_folder)
    if loaded:
      return my_tasks, all_dataset, transformers

  # First type of supported featurizers
  supported_featurizers: List[str] = ['ElementPropertyFingerprint']

  # Load .tar.gz file
  if featurizer.__class__.__name__ in supported_featurizers:
    dataset_file = os.path.join(data_dir, 'expt_gap.json')

    if not os.path.exists(dataset_file):
      targz_file = os.path.join(data_dir, 'expt_gap.tar.gz')
      if not os.path.exists(targz_file):
        deepchem.utils.data_utils.download_url(
            url=BANDGAP_URL, dest_dir=data_dir)

      deepchem.utils.data_utils.untargz_file(
          os.path.join(data_dir, 'expt_gap.tar.gz'), data_dir)

    # Changer loader to match featurizer and data file type
    loader = deepchem.data.JsonLoader(
        tasks=my_tasks,
        feature_field="composition",
        label_field="experimental_bandgap",
        featurizer=featurizer)

  # Featurize dataset
  dataset = loader.create_dataset(dataset_file)

  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, **splitter_kwargs)

  # Initialize transformers
  transformers = [
      DEFAULT_TRANSFORMERS[t](dataset=dataset, **transformer_kwargs[t])
      if isinstance(t, str) else t(
          dataset=dataset, **transformer_kwargs[str(t.__name__)])
      for t in transformers
  ]

  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)

  if reload:  # save to disk
    deepchem.utils.data_utils.save_dataset_to_disk(
        save_folder, train_dataset, valid_dataset, test_dataset, transformers)

  return my_tasks, (train_dataset, valid_dataset, test_dataset), transformers
  loader = _BandgapLoader(featurizer, splitter, transformers, BANDGAP_TASKS,
                          data_dir, save_dir, **kwargs)
  return loader.load_dataset('bandgap', reload)
Loading