Commit 66d5fa3d authored by miaecle's avatar miaecle
Browse files

data loader

parent 49df6560
Loading
Loading
Loading
Loading
+107 −0
Original line number Diff line number Diff line
"""
Diabetic Retinopathy Images loader.
"""
from __future__ import division
from __future__ import unicode_literals

import os
import logging
import deepchem
import numpy as np
import pandas as pd

logger = logging.getLogger(__name__)


def load_images_DR(split='random', seed=None):
  """ Loader for DR images """
  data_dir = deepchem.utils.get_data_dir()
  images_path = os.path.join(data_dir, 'DR', 'train')
  label_path = os.path.join(data_dir, 'DR', 'trainLabels.csv')
  if not os.path.exists(images_path) or not os.path.exists(label_path):
    logger.warn("Cannot locate data, \n\
        all images(.png) should be stored in the folder: $DEEPCHEM_DATA_DIR/DR/train/,\n\
        corresponding label file should be stored as $DEEPCHEM_DATA_DIR/DR/trainLabels.csv.\n\
        Please refer to https://www.kaggle.com/c/diabetic-retinopathy-detection for data access"
               )

  image_names = os.listdir(images_path)
  raw_images = []
  for im in image_names:
    if im.endswith('.jpeg') and not im.startswith(
        'cut_') and not 'cut_' + im in image_names:
      raw_images.append(im)
  if len(raw_images) > 0:
    cut_raw_images(raw_images, images_path)

  image_names = [
      p for p in os.listdir(images_path)
      if p.startswith('cut_') and p.endswith('.png')
  ]
  all_labels = dict(zip(*np.transpose(np.array(pd.read_csv(label_path)))))

  print("Number of images: %d" % len(image_names))
  labels = np.array(
      [all_labels[os.path.splitext(n)[0][4:]] for n in image_names]).reshape(
          (-1, 1))
  image_full_paths = [os.path.join(images_path, n) for n in image_names]

  classes, cts = np.unique(all_labels.values(), return_counts=True)
  weight_ratio = dict(zip(classes, np.max(cts) / cts.astype(float)))
  weights = np.array([weight_ratio[l[0]] for l in labels]).reshape((-1, 1))

  loader = deepchem.data.ImageLoader()
  dat = loader.featurize(
      image_full_paths, labels=labels, weights=weights, read_img=False)
  if split == None:
    return dat

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter()
  }
  if not seed is None:
    np.random.seed(seed)
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dat)
  all_dataset = (train, valid, test)
  return all_dataset


def cut_raw_images(all_images, path):
  """Preprocess images:
    (1) Crop the central square including retina
    (2) Reduce resolution to 512 * 512
  """
  print("Num of images to be processed: %d" % len(all_images))
  try:
    import cv2
  except:
    logger.warn("OpenCV required for image preprocessing")
    return

  for i, img_path in enumerate(all_images):
    if i % 100 == 0:
      print("on image %d" % i)
    if os.path.join(path, 'cut_' + os.path.splitext(img_path)[0] + '.png'):
      continue
    img = cv2.imread(os.path.join(path, img_path))
    edges = cv2.Canny(img, 10, 30)
    coords = zip(*np.where(edges > 0))
    n_p = len(coords)

    coords.sort(key=lambda x: (x[0], x[1]))
    center_0 = int(
        (coords[int(0.01 * n_p)][0] + coords[int(0.99 * n_p)][0]) / 2)
    coords.sort(key=lambda x: (x[1], x[0]))
    center_1 = int(
        (coords[int(0.01 * n_p)][1] + coords[int(0.99 * n_p)][1]) / 2)

    edge_size = min(
        [center_0, img.shape[0] - center_0, center_1, img.shape[1] - center_1])
    img_cut = img[(center_0 - edge_size):(center_0 + edge_size), (
        center_1 - edge_size):(center_1 + edge_size)]
    img_cut = cv2.resize(img_cut, (512, 512))
    cv2.imwrite(
        os.path.join(path, 'cut_' + os.path.splitext(img_path)[0] + '.png'),
        img_cut)
+95 −65
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@ from deepchem.trans import undo_transforms
from deepchem.data.data_loader import ImageLoader
from sklearn.metrics import confusion_matrix, accuracy_score


class DRModel(TensorGraph):

  def __init__(self,
@@ -61,14 +62,17 @@ class DRModel(TensorGraph):

  def build_graph(self):
    # inputs placeholder
    self.inputs = Feature(shape=(None, self.image_size, self.image_size, 3), dtype=tf.uint8)
    self.inputs = Feature(
        shape=(None, self.image_size, self.image_size, 3), dtype=tf.uint8)
    # data preprocessing and augmentation
    in_layer = DRAugment(self.augment,
    in_layer = DRAugment(
        self.augment,
        self.batch_size,
        size=(self.image_size, self.image_size),
        in_layers=[self.inputs])
    # first conv layer
    in_layer = Conv2D(self.n_init_kernel,
    in_layer = Conv2D(
        self.n_init_kernel,
        kernel_size=7,
        activation_fn=None,
        in_layers=[in_layer])
@@ -76,56 +80,55 @@ class DRModel(TensorGraph):
    in_layer = ReLU(in_layers=[in_layer])

    # downsample by max pooling
    res_in = MaxPool2D(ksize=[1, 3, 3, 1],
                       strides=[1, 2, 2, 1],
                       in_layers=[in_layer])
    res_in = MaxPool2D(
        ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], in_layers=[in_layer])

    for ct_module in range(self.n_downsample - 1):
      # each module is a residual convolutional block
      # followed by a convolutional downsample layer
      in_layer = Conv2D(self.n_init_kernel * 2 ** (ct_module - 1),
      in_layer = Conv2D(
          self.n_init_kernel * 2**(ct_module - 1),
          kernel_size=1,
          activation_fn=None,
          in_layers=[res_in])
      in_layer = BatchNorm(in_layers=[in_layer])
      in_layer = ReLU(in_layers=[in_layer])
      in_layer = Conv2D(self.n_init_kernel * 2 ** (ct_module - 1),
      in_layer = Conv2D(
          self.n_init_kernel * 2**(ct_module - 1),
          kernel_size=3,
          activation_fn=None,
          in_layers=[in_layer])
      in_layer = BatchNorm(in_layers=[in_layer])
      in_layer = ReLU(in_layers=[in_layer])
      in_layer = Conv2D(self.n_init_kernel * 2 ** ct_module,
      in_layer = Conv2D(
          self.n_init_kernel * 2**ct_module,
          kernel_size=1,
          activation_fn=None,
          in_layers=[in_layer])
      res_a = BatchNorm(in_layers=[in_layer])

      
      res_out = res_in + res_a
      res_in = Conv2D(self.n_init_kernel * 2 ** (ct_module + 1),
      res_in = Conv2D(
          self.n_init_kernel * 2**(ct_module + 1),
          kernel_size=3,
          stride=2,
          in_layers=[res_out])
      res_in = BatchNorm(in_layers=[res_in])

    # max pooling over the final outcome
    in_layer = ReduceMax(axis=(1, 2),
                         in_layers=[res_in])
    in_layer = ReduceMax(axis=(1, 2), in_layers=[res_in])

    for layer_size in self.n_fully_connected:
      # fully connected layers
      in_layer = Dense(layer_size,
                       activation_fn=tf.nn.relu,
                       in_layers=[in_layer])
      in_layer = Dense(
          layer_size, activation_fn=tf.nn.relu, in_layers=[in_layer])
      # dropout for dense layers
      #in_layer = Dropout(0.25, in_layers=[in_layer])

    logit_pred = Dense(self.n_tasks * self.n_classes,
                       activation_fn=None,
                       in_layers=[in_layer])
    logit_pred = Reshape(shape=(None, self.n_tasks, self.n_classes),
                         in_layers=[logit_pred])
    logit_pred = Dense(
        self.n_tasks * self.n_classes, activation_fn=None, in_layers=[in_layer])
    logit_pred = Reshape(
        shape=(None, self.n_tasks, self.n_classes), in_layers=[logit_pred])

    weights = Weights(shape=(None, self.n_tasks))
    labels = Label(shape=(None, self.n_tasks), dtype=tf.int32)
@@ -163,13 +166,14 @@ class DRModel(TensorGraph):
        if w_b is not None and not predict:
          feed_dict[self.task_weights[0]] = w_b

          
        yield feed_dict


def DRAccuracy(y, y_pred):
  y_pred = np.argmax(y_pred, 1)
  return accuracy_score(y, y_pred)


def DRSpecificity(y, y_pred):
  y_pred = (np.argmax(y_pred, 1) > 0) * 1
  y = (y > 0) * 1
@@ -177,6 +181,7 @@ def DRSpecificity(y, y_pred):
  N = sum(1 - y)
  return float(TN) / N


def DRSensitivity(y, y_pred):
  y_pred = (np.argmax(y_pred, 1) > 0) * 1
  y = (y > 0) * 1
@@ -184,10 +189,30 @@ def DRSensitivity(y, y_pred):
  P = sum(y)
  return float(TP) / P


def ConfusionMatrix(y, y_pred):
  y_pred = np.argmax(y_pred, 1)
  return confusion_matrix(y, y_pred)


def QuadWeightedKappa(y, y_pred):
  y_pred = np.argmax(y_pred, 1)
  cm = confusion_matrix(y, y_pred)
  classes_y, counts_y = np.unique(y, return_counts=True)
  classes_y_pred, counts_y_pred = np.unique(y_pred, return_counts=True)
  E = np.zeros((classes_y.shape[0], classes_y.shape[0]))
  for i, c1 in enumerate(classes_y):
    for j, c2 in enumerate(classes_y_pred):
      E[c1, c2] = counts_y[i] * counts_y_pred[j]
  E = E / np.sum(E) * np.sum(cm)
  w = np.zeros((classes_y.shape[0], classes_y.shape[0]))
  for i in range(classes_y.shape[0]):
    for j in range(classes_y.shape[0]):
      w[i, j] = float((i - j)**2) / (classes_y.shape[0] - 1)**2
  re = 1 - np.sum(w * cm) / np.sum(w * E)
  return re


class DRAugment(Layer):

  def __init__(self,
@@ -227,6 +252,7 @@ class DRAugment(Layer):
    if not self.augment:
      out_tensor = parent_tensor
    else:

      def preprocess(img):
        img = tf.image.random_flip_left_right(img)
        img = tf.image.random_flip_up_down(img)
@@ -237,9 +263,13 @@ class DRAugment(Layer):
          img = tf.clip_by_value(img, 0.0, 1.0)
        if self.central_crop:
          # sample cut ratio from a clipped gaussian
          img = tf.image.central_crop(img, np.clip(np.random.normal(1., 0.06), 0.8, 1.))
          img = tf.image.resize_bilinear(tf.expand_dims(img, 0), tf.convert_to_tensor(self.size))[0]
          img = tf.image.central_crop(img,
                                      np.clip(
                                          np.random.normal(1., 0.06), 0.8, 1.))
          img = tf.image.resize_bilinear(
              tf.expand_dims(img, 0), tf.convert_to_tensor(self.size))[0]
        return img

      outs = tf.map_fn(preprocess, parent_tensor)
      # train/valid differences
      out_tensor = training * outs + (1 - training) * parent_tensor
+21 −50
Original line number Diff line number Diff line
@@ -11,62 +11,33 @@ import numpy as np
import pandas as pd
import os
import logging
from model import DRModel, DRAccuracy, DRSensitivity, DRSpecificity, ConfusionMatrix

PATH = './data/train_cut/'

# Input images
image_names = os.listdir(PATH)
labels = dict(zip(*np.transpose(np.array(pd.read_csv('./data/trainLabels.csv')))))

np.random.seed(123)
np.random.shuffle(image_names)
# Use 80% of the data for training, the rest for validation
train_images = image_names[:int(0.8*len(image_names))]
test_images = image_names[int(0.8*len(image_names)):]

print("Number of training images: %d" % len(train_images))

train_labels = np.array([labels[os.path.splitext(n)[0][4:]] for n in train_images]).reshape((-1, 1))
train_image_paths = [os.path.join(PATH, n) for n in train_images]

test_labels = np.array([labels[os.path.splitext(n)[0][4:]] for n in test_images]).reshape((-1, 1))
test_image_paths = [os.path.join(PATH, n) for n in test_images]

# Generate class weights according to training set labels distribution
classes, cts = np.unique(train_labels, return_counts=True)
weight_ratio = dict(zip(classes, np.max(cts) / cts.astype(float)))
train_weights = np.array([weight_ratio[l[0]] for l in train_labels]).reshape((-1, 1))
test_weights = np.array([weight_ratio[l[0]] for l in test_labels]).reshape((-1, 1))
    
loader = dc.data.ImageLoader()
train_data = loader.featurize(train_image_paths, 
                              labels=train_labels, 
                              weights=train_weights, 
                              read_img=False)
test_data = loader.featurize(test_image_paths, 
                             labels=test_labels, 
                             weights=test_weights,
                             read_img=False)
from model import DRModel, DRAccuracy, ConfusionMatrix, QuadWeightedKappa
from data import load_images_DR

train, valid, test = load_images_DR(split='random', seed=123)
# Define and build model
model = DRModel(n_init_kernel=32,
model = DRModel(
    n_init_kernel=32,
    batch_size=32,
    learning_rate=1e-5,
    augment=True,
    model_dir='./test_model')
if not os.path.exists('./test_model'):
  os.mkdir('test_model')
model.build()
#model.restore()
metrics = [dc.metrics.Metric(DRAccuracy, mode='classification'),
           dc.metrics.Metric(DRSensitivity, mode='classification'),
           dc.metrics.Metric(DRSpecificity, mode='classification')]
metrics = [
    dc.metrics.Metric(DRAccuracy, mode='classification'),
    dc.metrics.Metric(QuadWeightedKappa, mode='classification')
]
cm = [dc.metrics.Metric(ConfusionMatrix, mode='classification')]

logger = logging.getLogger('deepchem.models.tensorgraph.tensor_graph')
logger.setLevel(logging.DEBUG)
for i in range(10):
  model.fit(train_data, nb_epoch=10, checkpoint_interval=3512)
  # Evaluate every 10 epochs
  model.evaluate(train_data, metrics)
  model.evaluate(test_data, metrics)
  model.evaluate(test_data, cm)
  model.fit(train, nb_epoch=10, checkpoint_interval=3512)
  model.evaluate(train, metrics)
  model.evaluate(valid, metrics)
  model.evaluate(valid, cm)
  model.evaluate(test, metrics)
  model.evaluate(test, cm)
+7 −9
Original line number Diff line number Diff line
@@ -230,8 +230,7 @@ class DataLoader(object):
          assert len(X) == len(ids)

        time2 = time.time()
        log(
            "TIMING: featurizing shard %d took %0.3f s" %
        log("TIMING: featurizing shard %d took %0.3f s" %
            (shard_num, time2 - time1), self.verbose)
        yield X, y, w, ids

@@ -295,8 +294,7 @@ class SDFLoader(DataLoader):

  def featurize_shard(self, shard):
    """Featurizes a shard of an input dataframe."""
    log(
        "Currently featurizing feature_type: %s" %
    log("Currently featurizing feature_type: %s" %
        self.featurizer.__class__.__name__, self.verbose)
    return featurize_mol_df(shard, self.featurizer, field=self.mol_field)