Merge pull request #1420 from vsag96/Fix-1318 (29bcf0fb) · Commits · 钟慕尧 / deepchem

deepchem/data/data_loader.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -18,7 +18,7 @@ import sys
		from deepchem.utils.save import log
		from deepchem.utils.save import load_csv_files
		from deepchem.utils.save import load_sdf_files
		from deepchem.utils.save import encode_fasta_sequence
		from deepchem.utils.genomics import encode_fasta_sequence
		from deepchem.feat import UserDefinedFeaturizer
		from deepchem.data import DiskDataset
		from deepchem.data import NumpyDataset

deepchem/metrics/tests/test_genomics.py

+4 −4

Original line number	Diff line number	Diff line
		@@ -27,7 +27,7 @@ class TestGenomicMetrics(unittest.TestCase):
		# Encode motif
		motif_name = "TAL1_known4"
		sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
		sequences = dc.utils.save.seq_one_hot_encode(sequences, letters=LETTERS)
		sequences = dc.utils.genomics.seq_one_hot_encode(sequences, letters=LETTERS)
		# sequences now has shape (3, 4, 5, 1)
		self.assertEqual(sequences.shape, (3, 4, 5, 1))

		@@ -38,7 +38,7 @@ class TestGenomicMetrics(unittest.TestCase):
		"""Test get_pssm_scores returns correct shape."""
		motif_name = "TAL1_known4"
		sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
		sequences = dc.utils.save.seq_one_hot_encode(sequences, letters=LETTERS)
		sequences = dc.utils.genomics.seq_one_hot_encode(sequences, letters=LETTERS)
		# sequences now has shape (3, 4, 5, 1)
		self.assertEqual(sequences.shape, (3, 4, 5, 1))
		pssm = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])
		@@ -50,7 +50,7 @@ class TestGenomicMetrics(unittest.TestCase):
		"""Test in-silico mutagenesis returns correct shape."""
		# Construct and train SequenceDNN model
		sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
		sequences = dc.utils.save.seq_one_hot_encode(sequences, letters=LETTERS)
		sequences = dc.utils.genomics.seq_one_hot_encode(sequences, letters=LETTERS)
		labels = np.array([1, 0, 0])
		labels = np.reshape(labels, (3, 1))
		self.assertEqual(sequences.shape, (3, 4, 5, 1))
		@@ -71,7 +71,7 @@ class TestGenomicMetrics(unittest.TestCase):
		"""Test in-silico mutagenesis returns nonzero output."""
		# Construct and train SequenceDNN model
		sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
		sequences = dc.utils.save.seq_one_hot_encode(sequences, letters=LETTERS)
		sequences = dc.utils.genomics.seq_one_hot_encode(sequences, letters=LETTERS)
		labels = np.array([1, 0, 0])
		labels = np.reshape(labels, (3, 1))
		self.assertEqual(sequences.shape, (3, 4, 5, 1))

deepchem/utils/genomics.py

+105 −0

Original line number	Diff line number	Diff line
		@@ -3,6 +3,111 @@ Genomic data handling utilities.
		"""
		import simdna
		from simdna.synthetic import LoadedEncodeMotifs
		import numpy as np

		loaded_motifs = LoadedEncodeMotifs(
		simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)


		def seq_one_hot_encode(sequences, letters='ATCGN'):
		  """One hot encodes a collection of genomic sequences.

		  Sequences encoded have shape (N_sequences, N_letters, sequence_length, 1).
		  These sequences will be processed as images with one color channel.

		  Parameters
		  ----------
		  sequences: np.ndarray, list or iterator
		    The genetic sequences to encode. Any iterable works, including a
		    one-shot iterator; it is traversed exactly once. Every element must
		    support `len()` and iterate over single letters.
		  letters: str
		    String with the set of possible letters in the sequences. The index
		    of a letter in this string is the row it occupies in the encoding.

		  Raises
		  ------
		  ValueError:
		    If `sequences` is empty, or if sequences are of different lengths.
		  KeyError:
		    If a sequence contains a letter that is not in `letters`.

		  Returns
		  -------
		  np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1).
		  """

		  # Map every letter of the alphabet to its row index in the encoding.
		  letter_encoder = {l: i for i, l in enumerate(letters)}
		  alphabet_length = len(letter_encoder)

		  seqs = []
		  # The first sequence seen fixes the length all others must match.
		  sequence_length = None

		  # A single pass handles arrays, lists and one-shot iterators alike, so
		  # no try/except "peek" (which could mask an unrelated TypeError raised
		  # inside a generator) and no slicing copy of the input are needed.
		  for seq in sequences:
		    if sequence_length is None:
		      sequence_length = len(seq)
		    elif len(seq) != sequence_length:
		      raise ValueError(
		          "All sequences must have the same length: expected %d, got %d "
		          "for sequence number %d." % (sequence_length, len(seq),
		                                       len(seqs)))

		    seqs.append(
		        _seq_to_encoded(seq, letter_encoder, alphabet_length,
		                        sequence_length))

		  if not seqs:
		    # Previously this surfaced as an IndexError or a leaked StopIteration.
		    raise ValueError("Cannot one-hot encode an empty set of sequences.")

		  # Append the trailing "channel" axis.
		  return np.expand_dims(np.array(seqs), -1)


		def _seq_to_encoded(seq, letter_encoder, alphabet_length, sequence_length):
		  """One hot encodes a single sequence.

		  Parameters
		  ----------
		  seq: iterable of letters
		    The sequence to encode; must contain `sequence_length` letters.
		  letter_encoder: dict
		    Maps each letter to its row index.
		  alphabet_length: int
		    Number of rows of the encoding.
		  sequence_length: int
		    Number of columns of the encoding.

		  Raises
		  ------
		  KeyError:
		    If `seq` contains a letter missing from `letter_encoder`.

		  Returns
		  -------
		  np.ndarray: Shape (alphabet_length, sequence_length), with a 1 at
		    (row of letter j, column j) for every position j and 0 elsewhere.
		  """
		  b = np.zeros((alphabet_length, sequence_length))
		  seq_ints = [letter_encoder[s] for s in seq]
		  # Paired integer indexing: position j sets element (seq_ints[j], j).
		  b[seq_ints, np.arange(sequence_length)] = 1

		  return b


		def encode_fasta_sequence(fname):
		  """One hot encodes the sequences stored in a fasta file.

		  Delegates to `encode_bio_sequence`, relying on its default file type
		  ("fasta") and default alphabet ("ATCGN").

		  Parameters
		  ----------
		  fname: str
		    Filename of fasta file.

		  Returns
		  -------
		  np.ndarray: Shape (N_sequences, 5, sequence_length, 1).
		  """
		  one_hot_sequences = encode_bio_sequence(fname)
		  return one_hot_sequences


		def encode_bio_sequence(fname, file_type="fasta", letters="ATCGN"):
		  """One hot encodes the sequences stored in a sequence file.

		  Parameters
		  ----------
		  fname: str
		    Filename of the sequence file.
		  file_type: str
		    The type of file encoding to process, e.g. fasta or fastq. It is
		    passed unchanged as the format argument of `Bio.SeqIO.parse`.
		  letters: str
		    The set of letters that the sequences consist of, e.g. ATCG.

		  Returns
		  -------
		  np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1).
		  """
		  # Function-scope import: Biopython is only loaded when this is called.
		  from Bio.SeqIO import parse as parse_records

		  # The parsed records are handed straight to the encoder.
		  records = parse_records(fname, file_type)
		  encoded = seq_one_hot_encode(records, letters)
		  return encoded

deepchem/utils/save.py

+14 −44

Original line number	Diff line number	Diff line
		@@ -16,6 +16,8 @@ import numpy as np
		import os
		import deepchem
		from rdkit import Chem
		import warnings
		from deepchem.utils.genomics import encode_bio_sequence as encode_sequence, encode_fasta_sequence as fasta_sequence, seq_one_hot_encode as seq_one_hotencode


		def log(string, verbose=True):
		@@ -126,44 +128,10 @@ def seq_one_hot_encode(sequences, letters='ATCGN'):
		-------
		np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1).
		"""

		# The label encoder is given characters for ACGTN
		letter_encoder = {l: i for i, l in enumerate(letters)}
		alphabet_length = len(letter_encoder)

		# Peak at the first sequence to get the length of the sequence.
		try:
		first_seq = next(sequences)
		tail_seq = sequences
		except TypeError:
		first_seq = sequences[0]
		tail_seq = sequences[1:]

		sequence_length = len(first_seq)

		seqs = []

		seqs.append(
		_seq_to_encoded(first_seq, letter_encoder, alphabet_length,
		sequence_length))

		for other_seq in tail_seq:
		if len(other_seq) != sequence_length:
		raise ValueError

		seqs.append(
		_seq_to_encoded(other_seq, letter_encoder, alphabet_length,
		sequence_length))

		return np.expand_dims(np.array(seqs), -1)


		def _seq_to_encoded(seq, letter_encoder, alphabet_length, sequence_length):
		b = np.zeros((alphabet_length, sequence_length))
		seq_ints = [letter_encoder[s] for s in seq]
		b[seq_ints, np.arange(sequence_length)] = 1

		return b
		warnings.warn(
		"This Function has been deprecated and now resides in deepchem.utils.genomics ",
		DeprecationWarning)
		return seq_one_hotencode(sequences, letters=letters)


		def encode_fasta_sequence(fname):
		@@ -179,8 +147,11 @@ def encode_fasta_sequence(fname):
		-------
		np.ndarray: Shape (N_sequences, 5, sequence_length, 1).
		"""
		warnings.warn(
		"This Function has been deprecated and now resides in deepchem.utils.genomics",
		DeprecationWarning)

		return encode_bio_sequence(fname)
		return fasta_sequence(fname)


		def encode_bio_sequence(fname, file_type="fasta", letters="ATCGN"):
		@@ -201,11 +172,10 @@ def encode_bio_sequence(fname, file_type="fasta", letters="ATCGN"):
		-------
		np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1).
		"""

		from Bio import SeqIO

		sequences = SeqIO.parse(fname, file_type)
		return seq_one_hot_encode(sequences, letters)
		warnings.warn(
		"This Function has been deprecated and now resides in deepchem.utils.genomics ",
		DeprecationWarning)
		return encode_sequence(fname, file_type=file_type, letters=letters)


		def save_metadata(tasks, metadata_df, data_dir):

Admin message