Commit ec4516bd authored by Nathan Frey's avatar Nathan Frey
Browse files

Refactored json fields and added docs

parent c821268a
Loading
Loading
Loading
Loading
+98 −30
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@ import time
import sys
import logging
import warnings
from typing import List, Optional, Dict
from typing import List, Optional, Dict, Tuple

from deepchem.utils.save import load_csv_files, load_json_files
from deepchem.utils.save import load_sdf_files
@@ -458,16 +458,19 @@ class JsonLoader(DataLoader):
  >> import pandas as pd
  >> df = pd.DataFrame(some_data)
  >> df.columns.tolist()
  .. ['formula', 'structure', 'task']
  .. ['sample_data', 'sample_name', 'weight', 'task']
  >> df.to_json('file.json', orient='records', lines=True)
  >> loader = JsonLoader(['task'], {'structure': dict}, 'formula')
  >> loader = JsonLoader(tasks=['task'], feature_field='sample_data',
      label_field='task', weight_field='weight', id_field='sample_name')
  >> dataset = loader.create_dataset('file.json')
  
  """

  def __init__(self,
               tasks: List[str],
               json_fields: Dict[str, type],
               feature_field: str,
               label_field: str = None,
               weight_field: str = None,
               id_field: str = None,
               featurizer: Optional[Featurizer] = None,
               log_every_n: int = 1000):
@@ -477,11 +480,14 @@ class JsonLoader(DataLoader):
    ----------
    tasks : List[str]
      List of task names
    json_fields : Dict[str, type]
      column names and dtypes in dataframe containing data to be featurized
      e.g. {"structure": dict, "composition": str}
    feature_field : str
      JSON field with data to be featurized.
    label_field : str, default None
      Field with target variables.
    weight_field : str, default None
      Field with weights.
    id_field : str, default None
      Column for identifying samples.
      Field for identifying samples.
    featurizer : dc.feat.Featurizer, optional
      Featurizer to use to process data
    log_every_n : int, optional
@@ -492,10 +498,9 @@ class JsonLoader(DataLoader):
    if not isinstance(tasks, list):
      raise ValueError("Tasks must be a list.")
    self.tasks = tasks
    self.json_fields = json_fields
    if id_field is None:
      self.id_field = next(iter(json_fields))
    else:
    self.feature_field = feature_field
    self.label_field = label_field
    self.weight_field = weight_field
    self.id_field = id_field

    self.user_specified_features = None
@@ -504,6 +509,70 @@ class JsonLoader(DataLoader):
    self.featurizer = featurizer
    self.log_every_n = log_every_n

  def create_dataset(self,
                     input_files: List[str],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = 8192) -> DiskDataset:
    """Creates a `Dataset` from input JSON files.

    Parameters
    ----------
    input_files: List[str]
      List of JSON filenames. A single filename (str) is also accepted
      and is wrapped in a one-element list.
    data_dir: Optional[str], default None
      Name of directory where featurized data is stored.
    shard_size: Optional[int], default 8192
      Shard size when loading data.

    Returns
    -------
    dataset: dc.data.Dataset
      A `Dataset` object containing a featurized representation of data
      from `input_files`.

    """

    if not isinstance(input_files, list):
      input_files = [input_files]

    def shard_generator():
      """Yield X, y, w, and ids for shards."""
      for shard_num, shard in enumerate(
          self._get_shards(input_files, shard_size)):

        time1 = time.time()
        X, valid_inds = self._featurize_shard(shard)
        # Both branches must yield a shard-length array, because
        # `valid_inds` is a per-row mask over `shard` and is applied below.
        if self.id_field:
          ids = shard[self.id_field].values
        else:
          # FIX: was `np.ones(len(X))`. `X` may already exclude rows whose
          # featurization failed, so masking a len(X)-sized array with the
          # shard-length `valid_inds` mask could raise or misalign ids.
          ids = np.ones(len(shard))
        ids = ids[valid_inds]

        if len(self.tasks) > 0:
          # Featurize task results iff they exist.
          y, w = _convert_df_to_numpy(shard, self.tasks)

          # Explicit label/weight fields override the task-derived values.
          if self.label_field:
            y = shard[self.label_field]
          if self.weight_field:
            w = shard[self.weight_field]

          # Filter out examples where featurization failed.
          y, w = (y[valid_inds], w[valid_inds])
          assert len(X) == len(ids) == len(y) == len(w)
        else:
          # For prospective data where results are unknown, it
          # makes no sense to have y values or weights.
          y, w = (None, None)
          assert len(X) == len(ids)

        time2 = time.time()
        logger.info("TIMING: featurizing shard %d took %0.3f s" %
                    (shard_num, time2 - time1))
        yield X, y, w, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir)

  def _get_shards(self, input_files: List[str], shard_size: Optional[int]):
    """Defines a generator which returns data for each shard.

    Thin wrapper around `load_json_files`: yields one `pandas.DataFrame`
    per shard, or the whole file as a single shard when `shard_size`
    is None.
    """
    return load_json_files(input_files, shard_size)
@@ -511,39 +580,38 @@ class JsonLoader(DataLoader):
  def _featurize_shard(self, shard):
    """Featurizes a shard of an input dataframe."""
    return self._featurize_df(
        shard,
        self.featurizer,
        json_fields=self.json_fields,
        log_every_n=self.log_every_n)
        shard, self.featurizer, log_every_n=self.log_every_n)

  def _featurize_df(self,
                    shard,
                    featurizer: Featurizer,
                    json_fields: Dict[str, type],
                    log_every_n: int = 1000):
    """Featurize individual materials in dataframe.
                    log_every_n: int = 1000) -> Tuple[np.ndarray, np.ndarray]:
    """Featurize individual samples in dataframe.

    Helper that given a featurizer that operates on individual
    inorganic crystal structures, computes & adds features for
    that compound to the features dataframe.
    samples, computes & adds features for that sample to the 
    features dataframe.

    Parameters
    ----------
    shard: pd.DataFrame
      DataFrame that holds pymatgen.Structure dict or 
      pymatgen.Composition str
    featurizer: CrystalFeaturizer
      A crystal featurizer object
    json_fields : Dict[str, type]
      column names and dtypes in dataframe containing data to be featurized
      e.g. {"structure": dict, "composition": str}
      DataFrame that holds data to be featurized.
    featurizer: Featurizer
      An instance of `dc.feat.Featurizer`.
    log_every_n: int, optional (default 1000)
      Emit a logging statement every `log_every_n` rows.

    Returns
    -------
    features : np.ndarray
      Array of feature vectors.
    valid_inds : np.ndarray
      Boolean values indicating successful featurization.

    """

    features = []
    field = next(iter(json_fields))
    field = self.feature_field
    data = shard[field].tolist()
    for idx, datapoint in enumerate(data):
      features.append(featurizer.featurize([datapoint]))
+8 −1
Original line number Diff line number Diff line
@@ -26,8 +26,9 @@ class TestJsonLoader(unittest.TestCase):
    featurizer = SineCoulombMatrix(max_atoms=5)
    loader = JsonLoader(
        tasks=['e_form'],
        json_fields={"structure": dict},
        feature_field='structure',
        id_field='formula',
        label_field='e_form',
        featurizer=featurizer)
    dataset = loader.create_dataset(input_file, shard_size=1)

@@ -35,3 +36,9 @@ class TestJsonLoader(unittest.TestCase):

    assert dataset.X.shape == (5, 1, 5)
    assert np.allclose(dataset.X[0][0], a, atol=.5)

    dataset = loader.create_dataset(input_file, shard_size=None)
    assert dataset.X.shape == (5, 1, 5)

    dataset = loader.create_dataset([input_file, input_file], shard_size=5)
    assert dataset.X.shape == (10, 1, 5)
+6 −4
Original line number Diff line number Diff line
@@ -11,11 +11,12 @@ import os
import deepchem
import warnings
import logging
from typing import List, Optional
from typing import List, Optional, Iterator
from deepchem.utils.genomics import encode_bio_sequence as encode_sequence, encode_fasta_sequence as fasta_sequence, seq_one_hot_encode as seq_one_hotencode

logger = logging.getLogger(__name__)


def log(string, verbose=True):
  """Print string if verbose."""
  if verbose:
@@ -120,7 +121,7 @@ def load_csv_files(filenames, shard_size=None, verbose=True):


def load_json_files(filenames: List[str],
                    shard_size: Optional[int] = None):
                    shard_size: Optional[int] = None) -> Iterator[pd.DataFrame]:
  """Load data as pandas dataframe.

  Parameters
@@ -146,12 +147,13 @@ def load_json_files(filenames: List[str],
  shard_num = 1
  for filename in filenames:
    if shard_size is None:
      yield pd.read_json(filename)
      yield pd.read_json(filename, orient='records', lines=True)
    else:
      logger.info("About to start loading json from %s." % filename)
      for df in pd.read_json(
          filename, orient='records', chunksize=shard_size, lines=True):
        logger.info("Loading shard %d of size %s." % (shard_num, str(shard_size)))
        logger.info(
            "Loading shard %d of size %s." % (shard_num, str(shard_size)))
        df = df.replace(np.nan, str(""), regex=True)
        shard_num += 1
        yield df
+8 −0
Original line number Diff line number Diff line
@@ -24,6 +24,14 @@ UserCSVLoader

JsonLoader
^^^^^^^^^^
JSON is a flexible file format that is human-readable, lightweight, 
and more compact than other open standard formats like XML. JSON files
are similar to python dictionaries of key-value pairs. All keys must
be strings, but values can be any of (string, number, object, array,
boolean, or null), so the format is more flexible than CSV. JSON is
used for describing structured data and to serialize objects. It is
conveniently used to read/write Pandas dataframes with the
`pandas.read_json` function and `DataFrame.to_json` method.

.. autoclass:: deepchem.data.JsonLoader
  :members: