Commit 5811e3b2 authored by Bharath Ramsundar
Browse files

Addressing open review comments

parent b0c38771
Loading
Loading
Loading
Loading
+42 −18
Original line number Diff line number Diff line
@@ -25,7 +25,8 @@ import zipfile
logger = logging.getLogger(__name__)


def _convert_df_to_numpy(df, tasks):
def _convert_df_to_numpy(df: pd.DataFrame,
                         tasks: List[str]) -> Tuple[np.ndarray, np.ndarray]:
  """Transforms a dataframe containing deepchem input into numpy arrays

  This is a private helper method intended to help parse labels and
@@ -38,7 +39,7 @@ def _convert_df_to_numpy(df, tasks):
  ----------
  df: pd.DataFrame
    Pandas dataframe with columns for all tasks
  tasks: list
  tasks: List[str] 
    List of tasks
  """
  n_samples = df.shape[0]
@@ -55,7 +56,8 @@ def _convert_df_to_numpy(df, tasks):
  return y.astype(float), w.astype(float)


def _get_user_specified_features(df, featurizer):
def _get_user_specified_features(
    df: pd.DataFrame, featurizer: UserDefinedFeaturizer) -> np.ndarray:
  """Extract and merge user specified features.

  Private helper method that merges features included in dataset
@@ -76,6 +78,11 @@ def _get_user_specified_features(df, featurizer):
    DataFrame that holds SMILES strings
  featurizer: Featurizer
    A featurizer object

  Returns
  -------
  np.ndarray
    Array of features extracted from input dataframe.
  """
  time1 = time.time()
  df[featurizer.feature_fields] = df[featurizer.feature_fields].apply(
@@ -117,7 +124,11 @@ class DataLoader(object):
  for you by performing this work under the hood.
  """

  def __init__(self, tasks, id_field=None, featurizer=None, log_every_n=1000):
  def __init__(self,
               tasks: List[str],
               id_field: str = None,
               featurizer: Featurizer = None,
               log_every_n: int = 1000):
    """Construct a DataLoader object.

    This constructor is provided as a template mainly. You
@@ -248,7 +259,7 @@ class DataLoader(object):

    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)

  def _get_shards(self, inputs, shard_size):
  def _get_shards(self, inputs: List, shard_size: int) -> Iterator:
    """Stub for children classes.

    Should implement a generator that walks over the source data in
@@ -271,7 +282,7 @@ class DataLoader(object):
    """
    raise NotImplementedError

  def _featurize_shard(self, shard):
  def _featurize_shard(self, shard: Any):
    """Featurizes a shard of input data.

    Recall a shard is a chunk of input data that can reasonably be
@@ -323,14 +334,14 @@ class CSVLoader(DataLoader):
  """

  def __init__(self,
               tasks: OneOrMany[str],
               tasks: List[str],
               feature_field: Optional[str] = None,
               label_field: Optional[str] = None,
               weight_field: Optional[str] = None,
               smiles_field: Optional[str] = None,
               id_field=None,
               id_field: str = None,
               featurizer: Optional[Featurizer] = None,
               log_every_n=1000):
               log_every_n: int = 1000):
    """Initializes CSVLoader.

    Parameters
@@ -408,6 +419,9 @@ class CSVLoader(DataLoader):
      Indices of rows in source CSV with valid data.
    """
    logger.info("About to featurize shard.")
    if self.featurizer is None:
      raise ValueError(
          "featurizer must be specified in constructor to featurizer data/")
    features = [elt for elt in self.featurizer(shard[self.feature_field])]
    valid_inds = np.array(
        [1 if np.array(elt).size > 0 else 0 for elt in features], dtype=bool)
@@ -419,7 +433,7 @@ class CSVLoader(DataLoader):

class UserCSVLoader(CSVLoader):
  """
  Handles loading of CSV files with user-defined featurizers.
  Handles loading of CSV files with user-defined features.

  This is a convenience class that allows for descriptors already present in a
  CSV file to be extracted without any featurization necessary.
@@ -530,7 +544,7 @@ class JsonLoader(DataLoader):
  """

  def __init__(self,
               tasks: OneOrMany[str],
               tasks: List[str],
               feature_field: str,
               label_field: Optional[str] = None,
               weight_field: Optional[str] = None,
@@ -643,7 +657,8 @@ class JsonLoader(DataLoader):

    return DiskDataset.create_dataset(shard_generator(), data_dir)

  def _get_shards(self, input_files, shard_size):
  def _get_shards(self, input_files: List[str],
                  shard_size: int) -> Iterator[pd.DataFrame]:
    """Defines a generator which returns data for each shard"""
    return load_json_files(input_files, shard_size)

@@ -667,6 +682,9 @@ class JsonLoader(DataLoader):
      sample in the source.
    """
    logger.info("About to featurize shard.")
    if self.featurizer is None:
      raise ValueError(
          "featurizer must be specified in constructor to featurizer data/")
    features = [elt for elt in self.featurizer(shard[self.feature_field])]
    valid_inds = np.array(
        [1 if np.array(elt).size > 0 else 0 for elt in features], dtype=bool)
@@ -694,7 +712,11 @@ class SDFLoader(DataLoader):
  2
  """

  def __init__(self, tasks, sanitize=False, featurizer=None, log_every_n=1000):
  def __init__(self,
               tasks: List[str],
               sanitize: bool = False,
               featurizer: Featurizer = None,
               log_every_n: int = 1000):
    """Initialize SDF Loader

    Parameters
@@ -793,7 +815,7 @@ class ImageLoader(DataLoader):
  traverse subdirectories which contain images.
  """

  def __init__(self, tasks: OneOrMany[str] = None):
  def __init__(self, tasks: Optional[List[str]] = None):
    """Initialize image loader.

    At present, custom image featurizers aren't supported by this
@@ -914,7 +936,7 @@ class ImageLoader(DataLoader):
      return ImageDataset(image_files, y=labels, w=weights, ids=image_files)

  @staticmethod
  def load_img(image_files) -> np.ndarray:
  def load_img(image_files: List[str]) -> np.ndarray:
    """Loads a set of images from disk.

    Parameters
@@ -1051,7 +1073,8 @@ class InMemoryLoader(DataLoader):

    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)

  def _get_shards(self, inputs, shard_size):
  def _get_shards(self, inputs: List,
                  shard_size: int) -> Iterator[pd.DataFrame]:
    """Break up input into shards.

    Parameters
@@ -1067,9 +1090,10 @@ class InMemoryLoader(DataLoader):

    Returns
    -------
    Iterator[pd.DataFrame]
      Iterator which iterates over shards of data.
    """
    current_shard = []
    current_shard: List = []
    for i, datapoint in enumerate(inputs):
      if i != 0 and i % shard_size == 0:
        shard_data = current_shard
+5 −5
Original line number Diff line number Diff line
@@ -19,7 +19,7 @@ import multiprocessing
from deepchem.utils.save import save_to_disk, save_metadata
from deepchem.utils.save import load_from_disk

from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, Union
from deepchem.utils.typing import OneOrMany, Shape

Batch = Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
@@ -2034,8 +2034,8 @@ class ImageDataset(Dataset):
  """A Dataset that loads data from image files on disk."""

  def __init__(self,
               X: Sequence,
               y: Optional[Sequence],
               X: Union[np.ndarray, List[str]],
               y: Optional[Union[np.ndarray, List[str]]],
               w: Optional[Sequence] = None,
               ids: Optional[Sequence] = None) -> None:
    """Create a dataset whose X and/or y array is defined by image files on disk.
@@ -2050,10 +2050,10 @@ class ImageDataset(Dataset):
      The dataset's labels.  This may be either a single NumPy array
      directly containing the data, or a list containing the paths to
      the image files
    w: ndarray
    w: ndarray, optional (default None)
      a 1D or 2D array containing the weights for each sample or
      sample/task pair
    ids: ndarray
    ids: ndarray, optional (default None)
      the sample IDs
    """
    n_samples = len(X)
+1 −1
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@ def test_load_singleton_csv():
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["endpoint"]
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
      tasks=tasks, feature_field="smiles", featurizer=featurizer)

  X = loader.create_dataset(fin.name)
  assert len(X) == 1
+9 −9
Original line number Diff line number Diff line
@@ -14,8 +14,9 @@ def test_unlabelled():
  input_file = os.path.join(current_dir, "../../data/tests/no_labels.csv")
  featurizer = dc.feat.CircularFingerprint(size=1024)
  loader = dc.data.CSVLoader(
      tasks=[], smiles_field="smiles", featurizer=featurizer)
  loader.create_dataset(input_file)
      tasks=[], feature_field="smiles", featurizer=featurizer)
  dataset = loader.create_dataset(input_file)
  assert len(dataset.X)


def test_scaffold_test_train_valid_test_split():
@@ -33,7 +34,7 @@ def test_scaffold_test_train_valid_test_split():

  input_file = os.path.join(current_dir, input_file)
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
      tasks=tasks, feature_field="smiles", featurizer=featurizer)

  dataset = loader.create_dataset(input_file)

@@ -61,7 +62,7 @@ def test_scaffold_test_train_test_split():

  input_file = os.path.join(current_dir, input_file)
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
      tasks=tasks, feature_field="smiles", featurizer=featurizer)

  dataset = loader.create_dataset(input_file)

@@ -86,7 +87,7 @@ def test_random_test_train_valid_test_split():

  input_file = os.path.join(current_dir, input_file)
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
      tasks=tasks, feature_field="smiles", featurizer=featurizer)

  dataset = loader.create_dataset(input_file)

@@ -110,7 +111,7 @@ def test_random_test_train_test_split():
  input_file = os.path.join(current_dir, "../../models/tests/example.csv")
  featurizer = dc.feat.CircularFingerprint(size=1024)
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
      tasks=tasks, feature_field="smiles", featurizer=featurizer)

  dataset = loader.create_dataset(input_file)

@@ -128,10 +129,9 @@ def test_log_solubility_dataset():
  input_file = os.path.join(current_dir, input_file)

  tasks = ["log-solubility"]
  smiles_field = "smiles"
  loader = dc.data.CSVLoader(
      tasks=tasks,
      smiles_field="smiles",
      feature_field="smiles",
      featurizer=dc.feat.CircularFingerprint(size=1024))
  dataset = loader.create_dataset(input_file)

@@ -149,7 +149,7 @@ def test_dataset_move():
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
      tasks=tasks, feature_field="smiles", featurizer=featurizer)
  featurized_dataset = loader.create_dataset(dataset_file, data_dir)
  n_dataset = len(featurized_dataset)