Merge pull request #1020 from rbharath/fasta (8f329dff) · Commits · 钟慕尧 / deepchem

deepchem/data/__init__.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -18,4 +18,5 @@ from deepchem.data.data_loader import DataLoader
		from deepchem.data.data_loader import CSVLoader
		from deepchem.data.data_loader import UserCSVLoader
		from deepchem.data.data_loader import SDFLoader
		from deepchem.data.data_loader import FASTALoader
		import deepchem.data.tests

deepchem/data/data_loader.py

+47 −1

Original line number	Diff line number	Diff line
		@@ -19,6 +19,7 @@ import sys
		from deepchem.utils.save import log
		from deepchem.utils.save import load_csv_files
		from deepchem.utils.save import load_sdf_files
		from deepchem.utils.save import encode_fasta_sequence
		from deepchem.feat import UserDefinedFeaturizer
		from deepchem.data import DiskDataset

		@@ -186,7 +187,20 @@ class DataLoader(object):
		self.log_every_n = log_every_n

		def featurize(self, input_files, data_dir=None, shard_size=8192):
		"""Featurize provided files and write to specified location."""
		"""Featurize provided files and write to specified location.

		For large datasets, automatically shards into smaller chunks
		for convenience.

		Parameters
		----------
		input_files: list
		List of input filenames.
		data_dir: str
		(Optional) Directory to store featurized dataset.
		shard_size: int
		(Optional) Number of examples stored in each shard.
		"""
		log("Loading raw samples now.", self.verbose)
		log("shard_size: %d" % shard_size, self.verbose)

		@@ -280,3 +294,35 @@ class SDFLoader(DataLoader):
		log("Currently featurizing feature_type: %s" %
		self.featurizer.__class__.__name__, self.verbose)
		return featurize_mol_df(shard, self.featurizer, field=self.mol_field)


		class FASTALoader(DataLoader):
		    """Handles loading of FASTA files.

		    Every input file is one-hot encoded with ``encode_fasta_sequence`` and
		    handed to ``DiskDataset.create_dataset`` as one shard.
		    """

		    def __init__(self, verbose=True):
		        """Initialize loader.

		        Only ``verbose`` is stored; the base-class initializer is not
		        invoked here, so no other loader attributes are set.

		        Parameters
		        ----------
		        verbose: bool
		            (Optional) Verbosity flag kept on the instance.
		        """
		        self.verbose = verbose

		    def featurize(self, input_files, data_dir=None):
		        """Featurizes fasta files.

		        Parameters
		        ----------
		        input_files: list, tuple or str
		            List (or tuple) of fasta files. A single filename is also
		            accepted and treated as a one-element list.
		        data_dir: str
		            (Optional) Name of directory where featurized data is stored.

		        Returns
		        -------
		        The value returned by ``DiskDataset.create_dataset`` for the
		        generated shards.
		        """
		        # Only wrap scalars: a tuple of filenames must be iterated, not
		        # passed whole to the encoder as if it were one path.
		        if not isinstance(input_files, (list, tuple)):
		            input_files = [input_files]

		        def shard_generator():
		            # One shard per input file.
		            for input_file in input_files:
		                X = encode_fasta_sequence(input_file)
		                # Placeholder ids (all ones); sequence names from the
		                # FASTA headers are not propagated by the encoder.
		                ids = np.ones(len(X))
		                # Shard layout is (X, y, w, ids); there are no labels
		                # or weights for raw sequences.
		                yield X, None, None, ids

		        return DiskDataset.create_dataset(shard_generator(), data_dir)

deepchem/data/tests/example.fasta

0 → 100644

+6 −0

Original line number	Diff line number	Diff line
		>seq0
		ACGTCCCACACGATGCATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGAT
		>seq1
		GTCGATGCATGCTAGCTAGCTAGCTAGCTACGATCGATCGATCGTACGATCGATCGAT
		>seq2
		ACACATCATCATTACTATATATTATATATCGATCGATCGATCGATCGTACGTAGCTAG

deepchem/data/tests/test_fasta_loader.py

0 → 100644

+33 −0

Original line number	Diff line number	Diff line
		"""
		Tests that FASTA files can be loaded.
		"""
		from __future__ import print_function
		from __future__ import division
		from __future__ import unicode_literals

		__author__ = "Bharath Ramsundar"
		__license__ = "MIT"

		import os
		import unittest
		import deepchem as dc


		class TestFASTALoader(unittest.TestCase):
		    """Test FASTALoader."""

		    def setUp(self):
		        """Remember the directory holding this test and its fixtures."""
		        super(TestFASTALoader, self).setUp()
		        self.current_dir = os.path.dirname(os.path.abspath(__file__))

		    def test_fasta_load(self):
		        """Featurizing example.fasta yields the expected one-hot shape."""
		        # The fixture lives next to this test module.
		        input_file = os.path.join(self.current_dir, "example.fasta")
		        loader = dc.data.FASTALoader()
		        sequences = loader.featurize(input_file)
		        # example.fasta contains 3 sequences each of length 58.
		        # The one-hot encoding turns base-pairs into vectors of length 4.
		        # There is one "image channel".
		        # assertEqual reports both shapes on failure and, unlike a bare
		        # assert, is not stripped when Python runs with -O.
		        self.assertEqual(sequences.X.shape, (3, 4, 58, 1))

deepchem/utils/save.py

+75 −0

Original line number	Diff line number	Diff line
		@@ -8,12 +8,14 @@ from __future__ import unicode_literals
		# TODO(rbharath): Use standard joblib once old-data has been regenerated.
		import joblib
		from sklearn.externals import joblib as old_joblib
		from sklearn.preprocessing import LabelEncoder, OneHotEncoder
		import gzip
		import json
		import pickle
		import pandas as pd
		import numpy as np
		import os
		import sys
		import deepchem
		from rdkit import Chem

		@@ -104,6 +106,79 @@ def load_csv_files(filenames, shard_size=None, verbose=True):
		yield df


		def seq_one_hot_encode(sequences):
		    """One hot encodes list of genomic sequences.

		    Sequences encoded have shape (N_sequences, 4, sequence_length, 1).
		    Here 4 is for the 4 basepairs (ACGT) present in genomic sequences;
		    the channels are ordered A, C, G, T. The placeholder base ``N`` is
		    accepted and encoded as an all-zero column. These sequences will be
		    processed as images with one color channel.

		    Parameters
		    ----------
		    sequences: np.ndarray or list
		        Upper-case genetic sequences (strings), all of the same length.

		    Raises
		    ------
		    ValueError:
		        If no sequences are given, if sequences are of different
		        lengths, or if a sequence contains a character other than
		        A, C, G, T or N.

		    Returns
		    -------
		    np.ndarray: Shape (N_sequences, 4, sequence_length, 1), dtype int32.
		    """
		    # Use len() rather than truthiness: `sequences` may be a numpy array,
		    # whose boolean value is ambiguous.
		    n_sequences = len(sequences)
		    if n_sequences == 0:
		        raise ValueError("At least one sequence is required")

		    sequence_length = len(sequences[0])
		    for sequence in sequences:
		        if len(sequence) != sequence_length:
		            raise ValueError("All sequences must be of same length")

		    # N is listed last so that the first four channels are exactly ACGT.
		    alphabet = np.array(list("ACGTN"), dtype="U1")
		    # One character per cell; the explicit dtype and reshape keep the
		    # array 2-D even when sequence_length is 0.
		    chars = np.array([list(sequence) for sequence in sequences], dtype="U1")
		    chars = chars.reshape(n_sequences, sequence_length)

		    # Broadcast to (N_sequences, 5, sequence_length): True where the
		    # character at a position equals the letter of that channel.
		    matches = chars[:, np.newaxis, :] == alphabet[np.newaxis, :, np.newaxis]
		    # Every position must match one of the five known letters.
		    if not matches.any(axis=1).all():
		        raise ValueError("Sequences may only contain the characters ACGTN")

		    # Drop the N channel and append the trailing "image channel" axis.
		    return matches[:, :4, :, np.newaxis].astype(np.int32)


		def encode_fasta_sequence(fname):
		    """Read a fasta file and one-hot encode the sequences it contains.

		    Lines starting with ``>`` begin a new record; the lines that follow
		    are concatenated and upper-cased to form that record's sequence.

		    Parameters
		    ----------
		    fname: str
		        Filename of fasta file.

		    Returns
		    -------
		    The result of ``seq_one_hot_encode`` applied to the array of
		    sequences read from the file.
		    """
		    records = []
		    header = None
		    chunks = []
		    with open(fname) as handle:
		        for raw_line in handle:
		            text = raw_line.rstrip()
		            if not text.startswith(">"):
		                # Sequence data: keep collecting pieces for this record.
		                chunks.append(text)
		                continue
		            # A new header closes the record opened by the previous one.
		            if header:
		                records.append(''.join(chunks).upper())
		            header = text
		            chunks = []
		    # The final record has no following header to close it.
		    if header is not None:
		        records.append(''.join(chunks).upper())

		    return seq_one_hot_encode(np.array(records))


		def save_metadata(tasks, metadata_df, data_dir):
		"""
		Saves the metadata for a DiskDataset

Admin message