Unverified Commit 85519020 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1498 from VIGS25/deepmhc

#991: DeepMHC for protein-peptide binding
parents c0a22937 7e80ca07
Loading
Loading
Loading
Loading
+150 −0
Original line number Diff line number Diff line
"""BD2013 dataset loader to be used with DeepMHC."""

from __future__ import division
from __future__ import print_function

__author__ = "Vignesh Ram Somnath"
__license__ = "MIT"

import numpy as np
import os
import logging

import deepchem as dc

# Full IEDB MHC class-I binding affinity dataset (2013 release) used for training.
DATASET_URL = "http://tools.iedb.org/static/main/binding_data_2013.zip"
# Name of the binding-data text file inside the downloaded archive.
FILE_NAME = "bdata.20130222.mhci.txt"

# Weekly IEDB auto-benchmark snapshot dates whose prediction files form the test set.
TEST_FILES = [
    "2016-12-09", "2016-05-03", "2016-02-19", "2015-08-07", "2015-07-31",
    "2015-07-17", "2015-06-26", "2015-06-19", "2015-05-15", "2015-02-06",
    "2015-01-16", "2014-10-31", "2014-06-20", "2014-05-23", "2014-03-28",
    "2014-03-21"
]

# One download URL per snapshot date in TEST_FILES (order is kept in sync).
TEST_URLS = [
    "http://tools.iedb.org/auto_bench/mhci/weekly/accumulated/" + str(date) +
    "/predictions" for date in TEST_FILES
]

logger = logging.getLogger(__name__)

# The 20 standard amino-acid one-letter codes; fixes the one-hot column order.
aa_charset = [
    "A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P",
    "S", "T", "W", "Y", "V"
]


def to_one_hot_array(sequence):
  """One-hot encode an amino-acid sequence.

  Each residue becomes one row of length ``len(aa_charset)`` holding a 1 in
  the column of the matching amino-acid letter (a letter absent from
  ``aa_charset`` yields an all-zero row).
  """
  rows = [[int(letter == aa) for aa in aa_charset] for letter in sequence]
  return np.asarray(rows, dtype=np.int32)


def featurize(sequences, pad_length=13):
  """One-hot encode sequences, centering each with zero-row padding.

  Every sequence is padded to ``pad_length`` rows; when the amount of
  padding is odd, the extra zero row goes in front of the sequence.
  """
  features = []
  for seq in sequences:
    encoded = to_one_hot_array(seq)
    total = pad_length - len(seq)
    # ceil(total / 2) rows before, floor(total / 2) rows after — identical
    # to splitting the even and odd cases separately.
    before = (total + 1) // 2
    after = total // 2
    encoded = np.pad(encoded, [(before, after), (0, 0)], mode='constant')
    features.append(encoded)
  return np.asarray(features)


def load_bd2013_human(mhc_allele="HLA-A*02:01",
                      seq_len=9,
                      pad_len=13,
                      test_measure_type="ic50",
                      reload=True):
  """Loads the human specific data from the bd2013 dataset.

  Downloads (if not cached) the IEDB 2013 binding archive for training and
  the weekly auto-benchmark prediction files for testing, keeps only human
  entries for ``mhc_allele`` with peptides of length ``seq_len``, one-hot
  featurizes the peptides padded to ``pad_len``, and saves the results as
  DiskDatasets under the DeepChem data directory.

  Parameters
  ----------
  mhc_allele: str
    MHC allele name matched exactly against the data files.
  seq_len: int
    Peptide length to keep.
  pad_len: int
    Length to which one-hot encodings are padded.
  test_measure_type: str
    Measurement type kept from the benchmark files (e.g. "ic50").
  reload: bool
    Currently unused; see the TODO below.

  Returns
  -------
  tuple
    ``(tasks, (train_dataset, None, test_dataset), transformers)`` — the
    validation slot is always None and ``transformers`` is always empty.
  """
  bd13_tasks = ["-log(IC50)"]

  data_dir = dc.utils.get_data_dir()
  save_dir = os.path.join(data_dir, "bd13", mhc_allele, str(seq_len))
  train_dir = os.path.join(save_dir, "train_dir")
  test_dir = os.path.join(save_dir, "test_dir")

  # TODO (VIGS25): Account for the reload option

  # Downloading train files
  train_file = os.path.join(data_dir, "binding_data_2013.zip")
  if not os.path.exists(train_file):
    logger.info("Downloading Binding data...")
    dc.utils.download_url(url=DATASET_URL, dest_dir=data_dir)
  if os.path.exists(train_dir):
    logger.info("Directory for training data already exists")
  else:
    logger.info("Unzipping full dataset...")
    dc.utils.unzip_file(file=train_file, dest_dir=data_dir)

  # Parsing training data
  # Columns: species, allele, peptide length, sequence, ..., affinity (last).
  train_labels = list()
  train_sequences = list()
  with open(os.path.join(data_dir, FILE_NAME), "r") as f:
    for line in f.readlines():
      elements = line.strip().split("\t")
      # Pick only sequences from humans, belong to specific MHC allele and having given seq_len
      if elements[0] == "human" and elements[1] == mhc_allele and int(
          elements[2]) == seq_len:
        train_sequences.append(elements[3])
        # NOTE(review): stored as the raw affinity value; DeepMHC's generator
        # applies -log10 at training time.
        train_labels.append(float(elements[-1]))

  # Test Files loading
  test_labels = list()
  test_sequences = list()
  # The presence of the first snapshot file is used as a proxy for "all
  # test files already downloaded".
  test_check_file = os.path.join(data_dir, TEST_FILES[0] + '_predictions.tsv')
  if not os.path.exists(test_check_file):
    for index, filename in enumerate(TEST_FILES):
      test_url = TEST_URLS[index]
      test_filename = filename + '_predictions.tsv'
      dc.utils.download_url(url=test_url, dest_dir=data_dir, name=test_filename)

  for filename in TEST_FILES:
    test_filename = os.path.join(data_dir, filename + '_predictions.tsv')
    with open(test_filename, 'r') as f:
      for line in f.readlines():
        elements = line.strip().split("\t")
        # Single-field lines are section headers/blank rows; skip them.
        if len(elements) == 1:
          continue
        if elements[2] == mhc_allele and int(
            elements[3]) == seq_len and elements[4] == test_measure_type:
          test_sequences.append(elements[5])
          test_labels.append(float(elements[6]))

  # One Hot Featurization
  logger.info("Featurizing training data...")
  train_features = featurize(train_sequences, pad_length=pad_len)
  train_labels = np.array(train_labels).astype(np.float32)
  # Labels gain a trailing task dimension: shape (n_samples, 1).
  train_labels = np.expand_dims(train_labels, axis=1)

  logger.info("Featurizing test data...")
  test_features = featurize(test_sequences, pad_length=pad_len)
  test_labels = np.array(test_labels).astype(np.float32)
  test_labels = np.expand_dims(test_labels, axis=1)

  train_dataset = dc.data.DiskDataset.from_numpy(train_features, train_labels)
  test_dataset = dc.data.DiskDataset.from_numpy(test_features, test_labels)

  train_dataset.move(new_data_dir=train_dir)
  test_dataset.move(new_data_dir=test_dir)

  logger.info("Featurization complete.")

  # NOTE(review): transformers is always empty here, so this loop is a
  # no-op; it appears to be kept for symmetry with other DeepChem loaders.
  transformers = []
  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    test_dataset = transformer.transform(test_dataset)

  return bd13_tasks, (train_dataset, None, test_dataset), transformers
+107 −0
Original line number Diff line number Diff line
"""DeepMHC model, found in https://www.biorxiv.org/content/early/2017/12/24/239236"""

from __future__ import division
from __future__ import unicode_literals

__author__ = "Vignesh Ram Somnath"
__license__ = "MIT"

import numpy as np
import tensorflow as tf

from deepchem.data import NumpyDataset
from deepchem.models.tensorgraph.tensor_graph import TensorGraph
from deepchem.models.tensorgraph.layers import Conv1D, MaxPool1D, Dense, Dropout
from deepchem.models.tensorgraph.layers import Flatten
from deepchem.models.tensorgraph.layers import Feature, Weights, Label
from deepchem.models.tensorgraph.layers import L2Loss, WeightedError


class DeepMHC(TensorGraph):
  """DeepMHC peptide-MHC binding affinity model.

  A 1D convolutional network over one-hot encoded peptide sequences that
  regresses -log10(IC50) binding affinities, following
  https://www.biorxiv.org/content/early/2017/12/24/239236
  """

  name = ['DeepMHC']

  def __init__(self,
               batch_size=64,
               pad_length=13,
               dropout_p=0.5,
               num_amino_acids=20,
               mode="regression",
               **kwargs):
    """Creates the model and builds its graph.

    Parameters
    ----------
    batch_size: int
      Number of sequences per batch.
    pad_length: int
      Padded peptide length (second dimension of the one-hot input).
    dropout_p: float
      Dropout probability applied after the dense layer.
    num_amino_acids: int
      Size of the one-hot amino-acid alphabet.
    mode: str
      Only "regression" is implemented; "classification" raises
      NotImplementedError during graph construction.
    """
    assert mode in ["regression", "classification"]
    self.mode = mode
    self.batch_size = batch_size
    self.dropout_p = dropout_p
    self.pad_length = pad_length
    self.num_amino_acids = num_amino_acids
    super(DeepMHC, self).__init__(**kwargs)
    self._build_graph()

  def _build_graph(self):
    """Builds conv -> maxpool -> conv -> dense(tanh) -> dropout -> dense(1)."""
    self.one_hot_seq = Feature(
        shape=(None, self.pad_length, self.num_amino_acids), dtype=tf.float32)

    conv1 = Conv1D(kernel_size=2, filters=512, in_layers=[self.one_hot_seq])
    maxpool1 = MaxPool1D(strides=2, padding="VALID", in_layers=[conv1])
    conv2 = Conv1D(kernel_size=3, filters=512, in_layers=[maxpool1])
    flattened = Flatten(in_layers=[conv2])
    dense1 = Dense(
        out_channels=400, in_layers=[flattened], activation_fn=tf.nn.tanh)
    dropout = Dropout(dropout_prob=self.dropout_p, in_layers=[dense1])
    output = Dense(out_channels=1, in_layers=[dropout], activation_fn=None)
    self.add_output(output)

    if self.mode == "regression":
      label = Label(shape=(None, 1))
      loss = L2Loss(in_layers=[label, output])
    else:
      raise NotImplementedError(
          "Classification support not added yet. Missing details in paper.")
    weights = Weights(shape=(None,))
    weighted_loss = WeightedError(in_layers=[loss, weights])
    self.set_loss(weighted_loss)

  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    """Yields feed dicts mapping input layers to batch arrays.

    Labels are fed in -log10(IC50) space; weights are omitted when
    predicting.

    Fix: ``deterministic`` and ``pad_batches`` were previously accepted but
    never forwarded to ``iterbatches``, so callers such as
    ``predict_on_batch`` (which requests ``pad_batches=False``) were
    silently ignored; they are now honored.
    """
    for epoch in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches):
        feed_dict = {self.one_hot_seq: X_b}
        if y_b is not None:
          # Train on -log10(IC50); inverted again in predict_on_batch.
          feed_dict[self.labels[0]] = -np.log10(y_b)
        if w_b is not None and not predict:
          feed_dict[self.task_weights[0]] = w_b
        yield feed_dict

  def predict_on_batch(self, X, transformers=None, outputs=None):
    """Predicts IC50 affinities for a batch of one-hot encoded sequences."""
    # Fix: avoid the mutable default argument ([]); None preserves the
    # original empty-list behavior.
    if transformers is None:
      transformers = []
    dataset = NumpyDataset(X, y=None)
    generator = self.default_generator(dataset, predict=True, pad_batches=False)
    preds = self.predict_on_generator(generator, transformers, outputs)
    # The network is trained on -log10(IC50), so invert the transform here.
    preds = 10**-preds
    return preds

  def create_estimator_inputs(self, feature_columns, weight_column, features,
                              labels, mode):
    """Maps estimator feature columns/labels onto the graph's input layers."""
    tensors = dict()
    for layer, column in zip(self.features, feature_columns):
      feature_column = tf.feature_column.input_layer(features, [column])
      if feature_column.dtype != column.dtype:
        feature_column = tf.cast(feature_column, column.dtype)
      tensors[layer] = feature_column
    if weight_column is not None:
      tensors[self.task_weights[0]] = tf.feature_column.input_layer(
          features, [weight_column])
    if labels is not None:
      # Fix: was ``self.labels[[0]]`` — indexing a list with a list raises
      # TypeError; index with a plain integer instead.
      tensors[self.labels[0]] = labels

    return tensors
+41 −0
Original line number Diff line number Diff line
"""
Script to train DeepMHC model on the BD2013 dataset.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

from bd13_datasets import aa_charset, load_bd2013_human
from deepmhc import DeepMHC

import numpy as np
import deepchem as dc
from scipy.stats import spearmanr

# Args
# Training/model hyperparameters for the BD2013 / DeepMHC experiment.
pad_len = 13  # length to which one-hot peptide encodings are padded
num_amino_acids = len(aa_charset)  # 20 standard amino acids
mhc_allele = "HLA-A*02:01"  # allele whose binding data is loaded
dropout_p = 0.5
batch_size = 64
nb_epochs = 100

# Load the human BD2013 data for the chosen allele; the loader returns no
# validation split (middle slot is None) and an empty transformers list.
tasks, datasets, transformers = load_bd2013_human(
    mhc_allele=mhc_allele, seq_len=9, pad_len=pad_len)
metric = dc.metrics.Metric(metric=dc.metrics.pearsonr, mode="regression")

train_dataset, _, test_dataset = datasets
# NOTE(review): dropout_p and batch_size defined above are not passed here;
# the model falls back to its own defaults (which happen to match).
model = DeepMHC(num_amino_acids=num_amino_acids, pad_length=pad_len)

model.fit(train_dataset, nb_epoch=nb_epochs)

print("Evaluating models...")
train_scores = model.evaluate(train_dataset, [metric], transformers)
test_scores = model.evaluate(test_dataset, [metric], transformers)

print("Train scores")
print(train_scores['pearsonr'])

print("Test scores")
print(test_scores['pearsonr'])