Unverified Commit f9510bee authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1414 from miaecle/master

Diabetic Retinopathy
parents 2a0aeacc 66d5fa3d
Loading
Loading
Loading
Loading
+107 −0
Original line number Diff line number Diff line
"""
Diabetic Retinopathy Images loader.
"""
from __future__ import division
from __future__ import unicode_literals

import os
import logging
import deepchem
import numpy as np
import pandas as pd

logger = logging.getLogger(__name__)


def load_images_DR(split='random', seed=None):
  """ Loader for DR images """
  data_dir = deepchem.utils.get_data_dir()
  images_path = os.path.join(data_dir, 'DR', 'train')
  label_path = os.path.join(data_dir, 'DR', 'trainLabels.csv')
  if not os.path.exists(images_path) or not os.path.exists(label_path):
    logger.warn("Cannot locate data, \n\
        all images(.png) should be stored in the folder: $DEEPCHEM_DATA_DIR/DR/train/,\n\
        corresponding label file should be stored as $DEEPCHEM_DATA_DIR/DR/trainLabels.csv.\n\
        Please refer to https://www.kaggle.com/c/diabetic-retinopathy-detection for data access"
               )

  image_names = os.listdir(images_path)
  raw_images = []
  for im in image_names:
    if im.endswith('.jpeg') and not im.startswith(
        'cut_') and not 'cut_' + im in image_names:
      raw_images.append(im)
  if len(raw_images) > 0:
    cut_raw_images(raw_images, images_path)

  image_names = [
      p for p in os.listdir(images_path)
      if p.startswith('cut_') and p.endswith('.png')
  ]
  all_labels = dict(zip(*np.transpose(np.array(pd.read_csv(label_path)))))

  print("Number of images: %d" % len(image_names))
  labels = np.array(
      [all_labels[os.path.splitext(n)[0][4:]] for n in image_names]).reshape(
          (-1, 1))
  image_full_paths = [os.path.join(images_path, n) for n in image_names]

  classes, cts = np.unique(all_labels.values(), return_counts=True)
  weight_ratio = dict(zip(classes, np.max(cts) / cts.astype(float)))
  weights = np.array([weight_ratio[l[0]] for l in labels]).reshape((-1, 1))

  loader = deepchem.data.ImageLoader()
  dat = loader.featurize(
      image_full_paths, labels=labels, weights=weights, read_img=False)
  if split == None:
    return dat

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter()
  }
  if not seed is None:
    np.random.seed(seed)
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dat)
  all_dataset = (train, valid, test)
  return all_dataset


def cut_raw_images(all_images, path):
  """Preprocess images:
    (1) Crop the central square including retina
    (2) Reduce resolution to 512 * 512
  """
  print("Num of images to be processed: %d" % len(all_images))
  try:
    import cv2
  except:
    logger.warn("OpenCV required for image preprocessing")
    return

  for i, img_path in enumerate(all_images):
    if i % 100 == 0:
      print("on image %d" % i)
    if os.path.join(path, 'cut_' + os.path.splitext(img_path)[0] + '.png'):
      continue
    img = cv2.imread(os.path.join(path, img_path))
    edges = cv2.Canny(img, 10, 30)
    coords = zip(*np.where(edges > 0))
    n_p = len(coords)

    coords.sort(key=lambda x: (x[0], x[1]))
    center_0 = int(
        (coords[int(0.01 * n_p)][0] + coords[int(0.99 * n_p)][0]) / 2)
    coords.sort(key=lambda x: (x[1], x[0]))
    center_1 = int(
        (coords[int(0.01 * n_p)][1] + coords[int(0.99 * n_p)][1]) / 2)

    edge_size = min(
        [center_0, img.shape[0] - center_0, center_1, img.shape[1] - center_1])
    img_cut = img[(center_0 - edge_size):(center_0 + edge_size), (
        center_1 - edge_size):(center_1 + edge_size)]
    img_cut = cv2.resize(img_cut, (512, 512))
    cv2.imwrite(
        os.path.join(path, 'cut_' + os.path.splitext(img_path)[0] + '.png'),
        img_cut)
+278 −0
Original line number Diff line number Diff line
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 10 06:12:10 2018

@author: zqwu
"""

import numpy as np
import tensorflow as tf

from deepchem.data import NumpyDataset, pad_features
from deepchem.metrics import to_one_hot
from deepchem.models.tensorgraph.layers import Layer, Dense, SoftMax, Reshape, \
    SparseSoftMaxCrossEntropy, BatchNorm, Conv2D, MaxPool2D, WeightedError, \
    Dropout, ReLU, Stack, Flatten, ReduceMax, WeightDecay
from deepchem.models.tensorgraph.layers import L2Loss, Label, Weights, Feature
from deepchem.models.tensorgraph.tensor_graph import TensorGraph
from deepchem.trans import undo_transforms
from deepchem.data.data_loader import ImageLoader
from sklearn.metrics import confusion_matrix, accuracy_score


class DRModel(TensorGraph):

  def __init__(self,
               n_tasks=1,
               image_size=512,
               n_downsample=6,
               n_init_kernel=16,
               n_fully_connected=[1024],
               n_classes=5,
               augment=False,
               **kwargs):
    """
    Parameters
    ----------
    n_tasks: int
      Number of tasks
    image_size: int
      Resolution of the input images(square)
    n_downsample: int
      Downsample ratio in power of 2
    n_init_kernel: int
      Kernel size for the first convolutional layer
    n_fully_connected: list of int
      Shape of FC layers after convolutions
    n_classes: int
      Number of classes to predict (only used in classification mode)
    augment: bool
      If to use data augmentation 
    """
    self.n_tasks = n_tasks
    self.image_size = image_size
    self.n_downsample = n_downsample
    self.n_init_kernel = n_init_kernel
    self.n_fully_connected = n_fully_connected
    self.n_classes = n_classes
    self.augment = augment
    super(DRModel, self).__init__(**kwargs)
    self.build_graph()

  def build_graph(self):
    # inputs placeholder
    self.inputs = Feature(
        shape=(None, self.image_size, self.image_size, 3), dtype=tf.uint8)
    # data preprocessing and augmentation
    in_layer = DRAugment(
        self.augment,
        self.batch_size,
        size=(self.image_size, self.image_size),
        in_layers=[self.inputs])
    # first conv layer
    in_layer = Conv2D(
        self.n_init_kernel,
        kernel_size=7,
        activation_fn=None,
        in_layers=[in_layer])
    in_layer = BatchNorm(in_layers=[in_layer])
    in_layer = ReLU(in_layers=[in_layer])

    # downsample by max pooling
    res_in = MaxPool2D(
        ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], in_layers=[in_layer])

    for ct_module in range(self.n_downsample - 1):
      # each module is a residual convolutional block
      # followed by a convolutional downsample layer
      in_layer = Conv2D(
          self.n_init_kernel * 2**(ct_module - 1),
          kernel_size=1,
          activation_fn=None,
          in_layers=[res_in])
      in_layer = BatchNorm(in_layers=[in_layer])
      in_layer = ReLU(in_layers=[in_layer])
      in_layer = Conv2D(
          self.n_init_kernel * 2**(ct_module - 1),
          kernel_size=3,
          activation_fn=None,
          in_layers=[in_layer])
      in_layer = BatchNorm(in_layers=[in_layer])
      in_layer = ReLU(in_layers=[in_layer])
      in_layer = Conv2D(
          self.n_init_kernel * 2**ct_module,
          kernel_size=1,
          activation_fn=None,
          in_layers=[in_layer])
      res_a = BatchNorm(in_layers=[in_layer])

      res_out = res_in + res_a
      res_in = Conv2D(
          self.n_init_kernel * 2**(ct_module + 1),
          kernel_size=3,
          stride=2,
          in_layers=[res_out])
      res_in = BatchNorm(in_layers=[res_in])

    # max pooling over the final outcome
    in_layer = ReduceMax(axis=(1, 2), in_layers=[res_in])

    for layer_size in self.n_fully_connected:
      # fully connected layers
      in_layer = Dense(
          layer_size, activation_fn=tf.nn.relu, in_layers=[in_layer])
      # dropout for dense layers
      #in_layer = Dropout(0.25, in_layers=[in_layer])

    logit_pred = Dense(
        self.n_tasks * self.n_classes, activation_fn=None, in_layers=[in_layer])
    logit_pred = Reshape(
        shape=(None, self.n_tasks, self.n_classes), in_layers=[logit_pred])

    weights = Weights(shape=(None, self.n_tasks))
    labels = Label(shape=(None, self.n_tasks), dtype=tf.int32)

    output = SoftMax(logit_pred)
    self.add_output(output)
    loss = SparseSoftMaxCrossEntropy(in_layers=[labels, logit_pred])
    weighted_loss = WeightedError(in_layers=[loss, weights])

    # weight decay regularizer
    # weighted_loss = WeightDecay(0.1, 'l2', in_layers=[weighted_loss])
    self.set_loss(weighted_loss)

  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    for epoch in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches):
        feed_dict = dict()

        if None in X_b:
          # load images on the fly
          feed_dict[self.features[0]] = ImageLoader.load_img(ids_b)
        else:
          feed_dict[self.features[0]] = X_b

        if y_b is not None and not predict:
          feed_dict[self.labels[0]] = y_b
        if w_b is not None and not predict:
          feed_dict[self.task_weights[0]] = w_b

        yield feed_dict


def DRAccuracy(y, y_pred):
  y_pred = np.argmax(y_pred, 1)
  return accuracy_score(y, y_pred)


def DRSpecificity(y, y_pred):
  y_pred = (np.argmax(y_pred, 1) > 0) * 1
  y = (y > 0) * 1
  TN = sum((1 - y_pred) * (1 - y))
  N = sum(1 - y)
  return float(TN) / N


def DRSensitivity(y, y_pred):
  y_pred = (np.argmax(y_pred, 1) > 0) * 1
  y = (y > 0) * 1
  TP = sum(y_pred * y)
  P = sum(y)
  return float(TP) / P


def ConfusionMatrix(y, y_pred):
  y_pred = np.argmax(y_pred, 1)
  return confusion_matrix(y, y_pred)


def QuadWeightedKappa(y, y_pred):
  y_pred = np.argmax(y_pred, 1)
  cm = confusion_matrix(y, y_pred)
  classes_y, counts_y = np.unique(y, return_counts=True)
  classes_y_pred, counts_y_pred = np.unique(y_pred, return_counts=True)
  E = np.zeros((classes_y.shape[0], classes_y.shape[0]))
  for i, c1 in enumerate(classes_y):
    for j, c2 in enumerate(classes_y_pred):
      E[c1, c2] = counts_y[i] * counts_y_pred[j]
  E = E / np.sum(E) * np.sum(cm)
  w = np.zeros((classes_y.shape[0], classes_y.shape[0]))
  for i in range(classes_y.shape[0]):
    for j in range(classes_y.shape[0]):
      w[i, j] = float((i - j)**2) / (classes_y.shape[0] - 1)**2
  re = 1 - np.sum(w * cm) / np.sum(w * E)
  return re


class DRAugment(Layer):

  def __init__(self,
               augment,
               batch_size,
               distort_color=True,
               central_crop=True,
               size=(512, 512),
               **kwargs):
    """
    Parameters
    ----------
    augment: bool
      If to use data augmentation 
    batch_size: int
      Number of images in the batch
    distort_color: bool
      If to apply random distortion on the color
    central_crop: bool
      If to randomly crop the sample around the center
    size: int
      Resolution of the input images(square)
    """
    self.augment = augment
    self.batch_size = batch_size
    self.distort_color = distort_color
    self.central_crop = central_crop
    self.size = size
    super(DRAugment, self).__init__(**kwargs)

  def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
    inputs = self._get_input_tensors(in_layers)
    parent_tensor = inputs[0]
    training = kwargs['training'] if 'training' in kwargs else 1.0

    parent_tensor = tf.image.convert_image_dtype(parent_tensor, tf.float32)
    if not self.augment:
      out_tensor = parent_tensor
    else:

      def preprocess(img):
        img = tf.image.random_flip_left_right(img)
        img = tf.image.random_flip_up_down(img)
        img = tf.image.rot90(img, k=np.random.randint(0, 4))
        if self.distort_color:
          img = tf.image.random_brightness(img, max_delta=32. / 255.)
          img = tf.image.random_saturation(img, lower=0.5, upper=1.5)
          img = tf.clip_by_value(img, 0.0, 1.0)
        if self.central_crop:
          # sample cut ratio from a clipped gaussian
          img = tf.image.central_crop(img,
                                      np.clip(
                                          np.random.normal(1., 0.06), 0.8, 1.))
          img = tf.image.resize_bilinear(
              tf.expand_dims(img, 0), tf.convert_to_tensor(self.size))[0]
        return img

      outs = tf.map_fn(preprocess, parent_tensor)
      # train/valid differences
      out_tensor = training * outs + (1 - training) * parent_tensor
    if set_tensors:
      self.out_tensor = out_tensor
    return out_tensor
+43 −0
Original line number Diff line number Diff line
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 10 06:12:11 2018

@author: zqwu
"""

import deepchem as dc
import numpy as np
import pandas as pd
import os
import logging
from model import DRModel, DRAccuracy, ConfusionMatrix, QuadWeightedKappa
from data import load_images_DR

train, valid, test = load_images_DR(split='random', seed=123)
# Define and build model
model = DRModel(
    n_init_kernel=32,
    batch_size=32,
    learning_rate=1e-5,
    augment=True,
    model_dir='./test_model')
if not os.path.exists('./test_model'):
  os.mkdir('test_model')
model.build()
#model.restore()
metrics = [
    dc.metrics.Metric(DRAccuracy, mode='classification'),
    dc.metrics.Metric(QuadWeightedKappa, mode='classification')
]
cm = [dc.metrics.Metric(ConfusionMatrix, mode='classification')]

logger = logging.getLogger('deepchem.models.tensorgraph.tensor_graph')
logger.setLevel(logging.DEBUG)
for i in range(10):
  model.fit(train, nb_epoch=10, checkpoint_interval=3512)
  model.evaluate(train, metrics)
  model.evaluate(valid, metrics)
  model.evaluate(valid, cm)
  model.evaluate(test, metrics)
  model.evaluate(test, cm)
+26 −12
Original line number Diff line number Diff line
@@ -230,8 +230,7 @@ class DataLoader(object):
          assert len(X) == len(ids)

        time2 = time.time()
        log(
            "TIMING: featurizing shard %d took %0.3f s" %
        log("TIMING: featurizing shard %d took %0.3f s" %
            (shard_num, time2 - time1), self.verbose)
        yield X, y, w, ids

@@ -295,8 +294,7 @@ class SDFLoader(DataLoader):

  def featurize_shard(self, shard):
    """Featurizes a shard of an input dataframe."""
    log(
        "Currently featurizing feature_type: %s" %
    log("Currently featurizing feature_type: %s" %
        self.featurizer.__class__.__name__, self.verbose)
    return featurize_mol_df(shard, self.featurizer, field=self.mol_field)

@@ -349,7 +347,12 @@ class ImageLoader(DataLoader):
      tasks = []
    self.tasks = tasks

  def featurize(self, input_files, in_memory=True):
  def featurize(self,
                input_files,
                labels=None,
                weights=None,
                read_img=True,
                in_memory=True):
    """Featurizes image files.

    Parameters
@@ -395,6 +398,23 @@ class ImageLoader(DataLoader):
          raise ValueError("Unsupported file format")
      input_files = remainder

    if read_img:
      X = self.load_img(image_files)
    else:
      X = [None] * len(image_files)
    if in_memory:
      return NumpyDataset(X, y=labels, w=weights, ids=image_files)

    else:
      # from_numpy currently requires labels. Make dummy labels
      if labels is None:
        labels = np.zeros((len(image_files), 1))
      if weights is None:
        weights = np.zeros((len(image_files), 1))
      return DiskDataset.from_numpy(X, labels, w=weights, ids=image_files)

  @staticmethod
  def load_img(image_files):
    images = []
    for image_file in image_files:
      _, extension = os.path.splitext(image_file)
@@ -407,10 +427,4 @@ class ImageLoader(DataLoader):
        images.append(imarray)
      else:
        raise ValueError("Unsupported image filetype for %s" % image_file)
    images = np.array(images)
    if in_memory:
      return NumpyDataset(images)
    else:
      # from_numpy currently requires labels. Make dummy labels
      labels = np.zeros((len(images), 1))
      return DiskDataset.from_numpy(images, labels)
    return np.array(images)
+1 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ from deepchem.models.tensorgraph.optimizers import Adam
from deepchem.trans import undo_transforms
from deepchem.utils.evaluate import GeneratorEvaluator

logging.basicConfig()
logger = logging.getLogger(__name__)