Commit c1dbd121 authored by miaecle

combine master

parents a2be59ac 9e553fca

MANIFEST.in

new file mode 100644
+2 −0
prune datasets
prune examples
+1 −1
@@ -169,7 +169,7 @@ import deepchem as dc
1. Question: I'm seeing some failures in my test suite having to do with MKL
   ```Intel MKL FATAL ERROR: Cannot load libmkl_avx.so or libmkl_def.so.```

   Answer: This is a general issue with the newest version of `scikit-learn` enabling MKL by default, which doesn't play well with many Linux systems. See [BVLC/caffe#3884](https://github.com/BVLC/caffe/issues/3884) for discussion. The following seems to fix the issue:
   ```bash
   conda install nomkl numpy scipy scikit-learn numexpr
   conda remove mkl mkl-service
+1 −0
@@ -18,4 +18,5 @@ from deepchem.data.data_loader import DataLoader
from deepchem.data.data_loader import CSVLoader
from deepchem.data.data_loader import UserCSVLoader
from deepchem.data.data_loader import SDFLoader
from deepchem.data.data_loader import FASTALoader
import deepchem.data.tests
+47 −1
@@ -19,6 +19,7 @@ import sys
from deepchem.utils.save import log
from deepchem.utils.save import load_csv_files
from deepchem.utils.save import load_sdf_files
from deepchem.utils.save import encode_fasta_sequence
from deepchem.feat import UserDefinedFeaturizer
from deepchem.data import DiskDataset

@@ -186,7 +187,20 @@ class DataLoader(object):
    self.log_every_n = log_every_n

  def featurize(self, input_files, data_dir=None, shard_size=8192):
    """Featurize provided files and write to specified location."""
    """Featurize provided files and write to specified location.
    
    For large datasets, automatically shards into smaller chunks
    for convenience.

    Parameters
    ----------
    input_files: list
      List of input filenames.
    data_dir: str
      (Optional) Directory to store featurized dataset.
    shard_size: int
      (Optional) Number of examples stored in each shard.
    """
    log("Loading raw samples now.", self.verbose)
    log("shard_size: %d" % shard_size, self.verbose)

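For context, a minimal usage sketch of the expanded `featurize` signature, assuming a hypothetical `molecules.csv` with a "smiles" column and a single "label" task column:

```python
import deepchem as dc

# Hypothetical input: molecules.csv with "smiles" and "label" columns.
featurizer = dc.feat.CircularFingerprint(size=1024)
loader = dc.data.CSVLoader(
    tasks=["label"], smiles_field="smiles", featurizer=featurizer)

# Rows are read and featurized 8192 at a time; each chunk becomes
# one shard of the resulting DiskDataset.
dataset = loader.featurize(["molecules.csv"], shard_size=8192)
```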
@@ -280,3 +294,35 @@ class SDFLoader(DataLoader):
    log("Currently featurizing feature_type: %s" %
        self.featurizer.__class__.__name__, self.verbose)
    return featurize_mol_df(shard, self.featurizer, field=self.mol_field)


class FASTALoader(DataLoader):
  """
  Handles loading of FASTA files.
  """

  def __init__(self, verbose=True):
    """Initialize loader."""
    self.verbose = verbose

  def featurize(self, input_files, data_dir=None):
    """Featurizes fasta files.

    Parameters
    ----------
    input_files: list
      List of FASTA files.
    data_dir: str
      (Optional) Name of directory where featurized data is stored.
    """
    if not isinstance(input_files, list):
      input_files = [input_files]

    def shard_generator():
      for input_file in input_files:
        X = encode_fasta_sequence(input_file)
        # Labels and weights are unknown for raw sequences, so yield
        # placeholder ids with None for y and w: (X, y, w, ids).
        ids = np.ones(len(X))
        yield X, None, None, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir)
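A usage sketch for the new loader, assuming a hypothetical local file `sequences.fasta`:

```python
import deepchem as dc

loader = dc.data.FASTALoader()
# Each input file is one-hot encoded via encode_fasta_sequence and
# yielded as a single shard of the resulting DiskDataset.
dataset = loader.featurize(["sequences.fasta"])
print(dataset.X.shape)
```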
+52 −9
@@ -450,6 +450,30 @@ class NumpyDataset(Dataset):
      d = json.load(fin)
      return NumpyDataset(d['X'], d['y'], d['w'], d['ids'])

  @staticmethod
  def merge(datasets):
    """
    Parameters
    ----------
    datasets: list of deepchem.data.NumpyDataset
      list of datasets to merge

    Returns
    -------
    Single deepchem.data.NumpyDataset with data concatenated over axis 0
    """
    # Concatenate once over all datasets rather than repeatedly in a loop.
    X = np.concatenate([dataset.X for dataset in datasets], axis=0)
    y = np.concatenate([dataset.y for dataset in datasets], axis=0)
    w = np.concatenate([dataset.w for dataset in datasets], axis=0)
    ids = np.concatenate([dataset.ids for dataset in datasets], axis=0)

    # Guard against 1-D labels, where y.shape[1] would raise an IndexError.
    n_tasks = y.shape[1] if y.ndim > 1 else 1
    return NumpyDataset(X, y, w, ids, n_tasks=n_tasks)
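A quick sketch of `NumpyDataset.merge` in action (shapes illustrative):

```python
import numpy as np
import deepchem as dc

first = dc.data.NumpyDataset(X=np.random.rand(4, 8), y=np.random.rand(4, 1))
second = dc.data.NumpyDataset(X=np.random.rand(6, 8), y=np.random.rand(6, 1))

# Samples are concatenated over axis 0.
merged = dc.data.NumpyDataset.merge([first, second])
print(merged.X.shape)  # (10, 8)
```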


class DiskDataset(Dataset):
  """
@@ -624,7 +648,8 @@ class DiskDataset(Dataset):
    if not len(self.metadata_df):
      raise ValueError("No data in dataset.")
    sample_X = load_from_disk(
        os.path.join(self.data_dir,
                     next(self.metadata_df.iterrows())[1]['X']))
    return np.shape(sample_X)[1:]

  def get_shard_size(self):
@@ -632,7 +657,8 @@ class DiskDataset(Dataset):
    if not len(self.metadata_df):
      raise ValueError("No data in dataset.")
    sample_y = load_from_disk(
        os.path.join(self.data_dir,
                     next(self.metadata_df.iterrows())[1]['y']))
    return len(sample_y)

  def _get_metadata_filename(self):
@@ -916,12 +942,29 @@ class DiskDataset(Dataset):
    else:
      merge_dir = tempfile.mkdtemp()

    # Protect against generator exhaustion
    datasets = list(datasets)

    # This ensures tasks are consistent for all datasets
    tasks = []
    for dataset in datasets:
      try:
        tasks.append(dataset.tasks)
      except AttributeError:
        pass
    if tasks:
      if len(tasks) < len(datasets) or len(set(map(tuple, tasks))) > 1:
        raise ValueError(
            'Cannot merge datasets with different task specifications')
      tasks = tasks[0]

    def generator():
      for ind, dataset in enumerate(datasets):
        X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
        yield (X, y, w, ids)

    return DiskDataset.create_dataset(
        generator(), data_dir=merge_dir, tasks=tasks)

  def subset(self, shard_nums, subset_dir=None):
    """Creates a subset of the original dataset on disk."""