Commit 59b2c0b8 authored by leswing's avatar leswing
Browse files

Merge branch 'master' into guzik-auto-encode

parents 04e104b7 c1d38548
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -18,4 +18,5 @@ from deepchem.data.data_loader import DataLoader
from deepchem.data.data_loader import CSVLoader
from deepchem.data.data_loader import UserCSVLoader
from deepchem.data.data_loader import SDFLoader
from deepchem.data.data_loader import FASTALoader
import deepchem.data.tests
+47 −1
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@ import sys
from deepchem.utils.save import log
from deepchem.utils.save import load_csv_files
from deepchem.utils.save import load_sdf_files
from deepchem.utils.save import encode_fasta_sequence
from deepchem.feat import UserDefinedFeaturizer
from deepchem.data import DiskDataset

@@ -186,7 +187,20 @@ class DataLoader(object):
    self.log_every_n = log_every_n

  def featurize(self, input_files, data_dir=None, shard_size=8192):
    """Featurize provided files and write to specified location."""
    """Featurize provided files and write to specified location.
    
    For large datasets, automatically shards into smaller chunks
    for convenience.

    Parameters
    ----------
    input_files: list
      List of input filenames.
    data_dir: str
      (Optional) Directory to store featurized dataset.
    shard_size: int
      (Optional) Number of examples stored in each shard.
    """
    log("Loading raw samples now.", self.verbose)
    log("shard_size: %d" % shard_size, self.verbose)

@@ -280,3 +294,35 @@ class SDFLoader(DataLoader):
    log("Currently featurizing feature_type: %s" %
        self.featurizer.__class__.__name__, self.verbose)
    return featurize_mol_df(shard, self.featurizer, field=self.mol_field)


class FASTALoader(DataLoader):
  """
  Handles loading of FASTA files.
  """

  def __init__(self, verbose=True):
    """Initialize loader."""
    self.verbose = verbose

  def featurize(self, input_files, data_dir=None):
    """Featurizes fasta files.

    Parameters
    ----------
    input_files: list
      List of fasta files.
    data_dir: str
      (Optional) Name of directory where featurized data is stored.
    """
    if not isinstance(input_files, list):
      input_files = [input_files]

    def shard_generator():
      for input_file in input_files:
        X = encode_fasta_sequence(input_file)
        ids = np.ones(len(X))
        # (X, y, w, ids)
        yield X, None, None, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir)
+6 −0
Original line number Diff line number Diff line
>seq0
ACGTCCCACACGATGCATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGAT
>seq1
GTCGATGCATGCTAGCTAGCTAGCTAGCTACGATCGATCGATCGTACGATCGATCGAT
>seq2
ACACATCATCATTACTATATATTATATATCGATCGATCGATCGATCGTACGTAGCTAG
+33 −0
Original line number Diff line number Diff line
"""
Tests that FASTA files can be loaded.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Bharath Ramsundar"
__license__ = "MIT"

import os
import unittest
import deepchem as dc


class TestFASTALoader(unittest.TestCase):
  """
  Test FASTALoader 
  """

  def setUp(self):
    super(TestFASTALoader, self).setUp()
    self.current_dir = os.path.dirname(os.path.abspath(__file__))

  def test_fasta_load(self):
    input_file = os.path.join(self.current_dir,
                              "../../data/tests/example.fasta")
    loader = dc.data.FASTALoader()
    sequences = loader.featurize(input_file)
    # example.fasta contains 3 sequences each of length 58.
    # The one-hot encoding turns base-pairs into vectors of length 4.
    # There is one "image channel")
    assert sequences.X.shape == (3, 4, 58, 1)
+4 −2
Original line number Diff line number Diff line
@@ -187,7 +187,8 @@ class TensorGraph(Model):
        if submodel.loss is not None:
          loss = submodel.loss
      if checkpoint_interval > 0:
        saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
        saver = tf.train.Saver(
            max_to_keep=max_checkpoints_to_keep, save_relative_paths=True)
      if restore:
        self.restore()
      avg_loss, n_averaged_batches = 0.0, 0.0
@@ -788,7 +789,8 @@ class TensorGraph(Model):
      var_names = set([x for x in reader.get_variable_to_shape_map()])
      var_map = {
          x.op.name: x
          for x in tf.global_variables() if x.op.name in var_names
          for x in tf.global_variables()
          if x.op.name in var_names
      }
      saver = tf.train.Saver(var_list=var_map)
      saver.restore(self.session, checkpoint)
Loading