Merge pull request #1414 from miaecle/master (f9510bee) · Commits · 钟慕尧 / deepchem

contrib/DiabeticRetinopathy/data.py

0 → 100644

+107 −0

Original line number	Diff line number	Diff line
		"""
		Diabetic Retinopathy Images loader.
		"""
		from __future__ import division
		from __future__ import unicode_literals

		import os
		import logging
		import deepchem
		import numpy as np
		import pandas as pd

		logger = logging.getLogger(__name__)


		def load_images_DR(split='random', seed=None):
		""" Loader for DR images """
		data_dir = deepchem.utils.get_data_dir()
		images_path = os.path.join(data_dir, 'DR', 'train')
		label_path = os.path.join(data_dir, 'DR', 'trainLabels.csv')
		if not os.path.exists(images_path) or not os.path.exists(label_path):
		logger.warn("Cannot locate data, \n\
		all images(.png) should be stored in the folder: $DEEPCHEM_DATA_DIR/DR/train/,\n\
		corresponding label file should be stored as $DEEPCHEM_DATA_DIR/DR/trainLabels.csv.\n\
		Please refer to https://www.kaggle.com/c/diabetic-retinopathy-detection for data access"
		)

		image_names = os.listdir(images_path)
		raw_images = []
		for im in image_names:
		if im.endswith('.jpeg') and not im.startswith(
		'cut_') and not 'cut_' + im in image_names:
		raw_images.append(im)
		if len(raw_images) > 0:
		cut_raw_images(raw_images, images_path)

		image_names = [
		p for p in os.listdir(images_path)
		if p.startswith('cut_') and p.endswith('.png')
		]
		all_labels = dict(zip(*np.transpose(np.array(pd.read_csv(label_path)))))

		print("Number of images: %d" % len(image_names))
		labels = np.array(
		[all_labels[os.path.splitext(n)[0][4:]] for n in image_names]).reshape(
		(-1, 1))
		image_full_paths = [os.path.join(images_path, n) for n in image_names]

		classes, cts = np.unique(all_labels.values(), return_counts=True)
		weight_ratio = dict(zip(classes, np.max(cts) / cts.astype(float)))
		weights = np.array([weight_ratio[l[0]] for l in labels]).reshape((-1, 1))

		loader = deepchem.data.ImageLoader()
		dat = loader.featurize(
		image_full_paths, labels=labels, weights=weights, read_img=False)
		if split == None:
		return dat

		splitters = {
		'index': deepchem.splits.IndexSplitter(),
		'random': deepchem.splits.RandomSplitter()
		}
		if not seed is None:
		np.random.seed(seed)
		splitter = splitters[split]
		train, valid, test = splitter.train_valid_test_split(dat)
		all_dataset = (train, valid, test)
		return all_dataset


		def cut_raw_images(all_images, path):
		"""Preprocess images:
		(1) Crop the central square including retina
		(2) Reduce resolution to 512 * 512
		"""
		print("Num of images to be processed: %d" % len(all_images))
		try:
		import cv2
		except:
		logger.warn("OpenCV required for image preprocessing")
		return

		for i, img_path in enumerate(all_images):
		if i % 100 == 0:
		print("on image %d" % i)
		if os.path.join(path, 'cut_' + os.path.splitext(img_path)[0] + '.png'):
		continue
		img = cv2.imread(os.path.join(path, img_path))
		edges = cv2.Canny(img, 10, 30)
		coords = zip(*np.where(edges > 0))
		n_p = len(coords)

		coords.sort(key=lambda x: (x[0], x[1]))
		center_0 = int(
		(coords[int(0.01 * n_p)][0] + coords[int(0.99 * n_p)][0]) / 2)
		coords.sort(key=lambda x: (x[1], x[0]))
		center_1 = int(
		(coords[int(0.01 * n_p)][1] + coords[int(0.99 * n_p)][1]) / 2)

		edge_size = min(
		[center_0, img.shape[0] - center_0, center_1, img.shape[1] - center_1])
		img_cut = img[(center_0 - edge_size):(center_0 + edge_size), (
		center_1 - edge_size):(center_1 + edge_size)]
		img_cut = cv2.resize(img_cut, (512, 512))
		cv2.imwrite(
		os.path.join(path, 'cut_' + os.path.splitext(img_path)[0] + '.png'),
		img_cut)

contrib/DiabeticRetinopathy/model.py

0 → 100755

+278 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/env python2
		# -- coding: utf-8 --
		"""
		Created on Mon Sep 10 06:12:10 2018

		@author: zqwu
		"""

		import numpy as np
		import tensorflow as tf

		from deepchem.data import NumpyDataset, pad_features
		from deepchem.metrics import to_one_hot
		from deepchem.models.tensorgraph.layers import Layer, Dense, SoftMax, Reshape, \
		SparseSoftMaxCrossEntropy, BatchNorm, Conv2D, MaxPool2D, WeightedError, \
		Dropout, ReLU, Stack, Flatten, ReduceMax, WeightDecay
		from deepchem.models.tensorgraph.layers import L2Loss, Label, Weights, Feature
		from deepchem.models.tensorgraph.tensor_graph import TensorGraph
		from deepchem.trans import undo_transforms
		from deepchem.data.data_loader import ImageLoader
		from sklearn.metrics import confusion_matrix, accuracy_score


		class DRModel(TensorGraph):

		def __init__(self,
		n_tasks=1,
		image_size=512,
		n_downsample=6,
		n_init_kernel=16,
		n_fully_connected=[1024],
		n_classes=5,
		augment=False,
		**kwargs):
		"""
		Parameters
		----------
		n_tasks: int
		Number of tasks
		image_size: int
		Resolution of the input images(square)
		n_downsample: int
		Downsample ratio in power of 2
		n_init_kernel: int
		Kernel size for the first convolutional layer
		n_fully_connected: list of int
		Shape of FC layers after convolutions
		n_classes: int
		Number of classes to predict (only used in classification mode)
		augment: bool
		If to use data augmentation
		"""
		self.n_tasks = n_tasks
		self.image_size = image_size
		self.n_downsample = n_downsample
		self.n_init_kernel = n_init_kernel
		self.n_fully_connected = n_fully_connected
		self.n_classes = n_classes
		self.augment = augment
		super(DRModel, self).__init__(**kwargs)
		self.build_graph()

		def build_graph(self):
		# inputs placeholder
		self.inputs = Feature(
		shape=(None, self.image_size, self.image_size, 3), dtype=tf.uint8)
		# data preprocessing and augmentation
		in_layer = DRAugment(
		self.augment,
		self.batch_size,
		size=(self.image_size, self.image_size),
		in_layers=[self.inputs])
		# first conv layer
		in_layer = Conv2D(
		self.n_init_kernel,
		kernel_size=7,
		activation_fn=None,
		in_layers=[in_layer])
		in_layer = BatchNorm(in_layers=[in_layer])
		in_layer = ReLU(in_layers=[in_layer])

		# downsample by max pooling
		res_in = MaxPool2D(
		ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], in_layers=[in_layer])

		for ct_module in range(self.n_downsample - 1):
		# each module is a residual convolutional block
		# followed by a convolutional downsample layer
		in_layer = Conv2D(
		self.n_init_kernel * 2**(ct_module - 1),
		kernel_size=1,
		activation_fn=None,
		in_layers=[res_in])
		in_layer = BatchNorm(in_layers=[in_layer])
		in_layer = ReLU(in_layers=[in_layer])
		in_layer = Conv2D(
		self.n_init_kernel * 2**(ct_module - 1),
		kernel_size=3,
		activation_fn=None,
		in_layers=[in_layer])
		in_layer = BatchNorm(in_layers=[in_layer])
		in_layer = ReLU(in_layers=[in_layer])
		in_layer = Conv2D(
		self.n_init_kernel * 2**ct_module,
		kernel_size=1,
		activation_fn=None,
		in_layers=[in_layer])
		res_a = BatchNorm(in_layers=[in_layer])

		res_out = res_in + res_a
		res_in = Conv2D(
		self.n_init_kernel * 2**(ct_module + 1),
		kernel_size=3,
		stride=2,
		in_layers=[res_out])
		res_in = BatchNorm(in_layers=[res_in])

		# max pooling over the final outcome
		in_layer = ReduceMax(axis=(1, 2), in_layers=[res_in])

		for layer_size in self.n_fully_connected:
		# fully connected layers
		in_layer = Dense(
		layer_size, activation_fn=tf.nn.relu, in_layers=[in_layer])
		# dropout for dense layers
		#in_layer = Dropout(0.25, in_layers=[in_layer])

		logit_pred = Dense(
		self.n_tasks * self.n_classes, activation_fn=None, in_layers=[in_layer])
		logit_pred = Reshape(
		shape=(None, self.n_tasks, self.n_classes), in_layers=[logit_pred])

		weights = Weights(shape=(None, self.n_tasks))
		labels = Label(shape=(None, self.n_tasks), dtype=tf.int32)

		output = SoftMax(logit_pred)
		self.add_output(output)
		loss = SparseSoftMaxCrossEntropy(in_layers=[labels, logit_pred])
		weighted_loss = WeightedError(in_layers=[loss, weights])

		# weight decay regularizer
		# weighted_loss = WeightDecay(0.1, 'l2', in_layers=[weighted_loss])
		self.set_loss(weighted_loss)

		def default_generator(self,
		dataset,
		epochs=1,
		predict=False,
		deterministic=True,
		pad_batches=True):
		for epoch in range(epochs):
		for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
		batch_size=self.batch_size,
		deterministic=deterministic,
		pad_batches=pad_batches):
		feed_dict = dict()

		if None in X_b:
		# load images on the fly
		feed_dict[self.features[0]] = ImageLoader.load_img(ids_b)
		else:
		feed_dict[self.features[0]] = X_b

		if y_b is not None and not predict:
		feed_dict[self.labels[0]] = y_b
		if w_b is not None and not predict:
		feed_dict[self.task_weights[0]] = w_b

		yield feed_dict


		def DRAccuracy(y, y_pred):
		y_pred = np.argmax(y_pred, 1)
		return accuracy_score(y, y_pred)


		def DRSpecificity(y, y_pred):
		y_pred = (np.argmax(y_pred, 1) > 0) * 1
		y = (y > 0) * 1
		TN = sum((1 - y_pred) * (1 - y))
		N = sum(1 - y)
		return float(TN) / N


		def DRSensitivity(y, y_pred):
		y_pred = (np.argmax(y_pred, 1) > 0) * 1
		y = (y > 0) * 1
		TP = sum(y_pred * y)
		P = sum(y)
		return float(TP) / P


		def ConfusionMatrix(y, y_pred):
		y_pred = np.argmax(y_pred, 1)
		return confusion_matrix(y, y_pred)


		def QuadWeightedKappa(y, y_pred):
		y_pred = np.argmax(y_pred, 1)
		cm = confusion_matrix(y, y_pred)
		classes_y, counts_y = np.unique(y, return_counts=True)
		classes_y_pred, counts_y_pred = np.unique(y_pred, return_counts=True)
		E = np.zeros((classes_y.shape[0], classes_y.shape[0]))
		for i, c1 in enumerate(classes_y):
		for j, c2 in enumerate(classes_y_pred):
		E[c1, c2] = counts_y[i] * counts_y_pred[j]
		E = E / np.sum(E) * np.sum(cm)
		w = np.zeros((classes_y.shape[0], classes_y.shape[0]))
		for i in range(classes_y.shape[0]):
		for j in range(classes_y.shape[0]):
		w[i, j] = float((i - j)2) / (classes_y.shape[0] - 1)2
		re = 1 - np.sum(w * cm) / np.sum(w * E)
		return re


		class DRAugment(Layer):

		def __init__(self,
		augment,
		batch_size,
		distort_color=True,
		central_crop=True,
		size=(512, 512),
		**kwargs):
		"""
		Parameters
		----------
		augment: bool
		If to use data augmentation
		batch_size: int
		Number of images in the batch
		distort_color: bool
		If to apply random distortion on the color
		central_crop: bool
		If to randomly crop the sample around the center
		size: int
		Resolution of the input images(square)
		"""
		self.augment = augment
		self.batch_size = batch_size
		self.distort_color = distort_color
		self.central_crop = central_crop
		self.size = size
		super(DRAugment, self).__init__(**kwargs)

		def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
		inputs = self._get_input_tensors(in_layers)
		parent_tensor = inputs[0]
		training = kwargs['training'] if 'training' in kwargs else 1.0

		parent_tensor = tf.image.convert_image_dtype(parent_tensor, tf.float32)
		if not self.augment:
		out_tensor = parent_tensor
		else:

		def preprocess(img):
		img = tf.image.random_flip_left_right(img)
		img = tf.image.random_flip_up_down(img)
		img = tf.image.rot90(img, k=np.random.randint(0, 4))
		if self.distort_color:
		img = tf.image.random_brightness(img, max_delta=32. / 255.)
		img = tf.image.random_saturation(img, lower=0.5, upper=1.5)
		img = tf.clip_by_value(img, 0.0, 1.0)
		if self.central_crop:
		# sample cut ratio from a clipped gaussian
		img = tf.image.central_crop(img,
		np.clip(
		np.random.normal(1., 0.06), 0.8, 1.))
		img = tf.image.resize_bilinear(
		tf.expand_dims(img, 0), tf.convert_to_tensor(self.size))[0]
		return img

		outs = tf.map_fn(preprocess, parent_tensor)
		# train/valid differences
		out_tensor = training * outs + (1 - training) * parent_tensor
		if set_tensors:
		self.out_tensor = out_tensor
		return out_tensor

contrib/DiabeticRetinopathy/run.py

0 → 100755

+43 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/env python2
		# -- coding: utf-8 --
		"""
		Created on Mon Sep 10 06:12:11 2018

		@author: zqwu
		"""

		import deepchem as dc
		import numpy as np
		import pandas as pd
		import os
		import logging
		from model import DRModel, DRAccuracy, ConfusionMatrix, QuadWeightedKappa
		from data import load_images_DR

		train, valid, test = load_images_DR(split='random', seed=123)
		# Define and build model
		model = DRModel(
		n_init_kernel=32,
		batch_size=32,
		learning_rate=1e-5,
		augment=True,
		model_dir='./test_model')
		if not os.path.exists('./test_model'):
		os.mkdir('test_model')
		model.build()
		#model.restore()
		metrics = [
		dc.metrics.Metric(DRAccuracy, mode='classification'),
		dc.metrics.Metric(QuadWeightedKappa, mode='classification')
		]
		cm = [dc.metrics.Metric(ConfusionMatrix, mode='classification')]

		logger = logging.getLogger('deepchem.models.tensorgraph.tensor_graph')
		logger.setLevel(logging.DEBUG)
		for i in range(10):
		model.fit(train, nb_epoch=10, checkpoint_interval=3512)
		model.evaluate(train, metrics)
		model.evaluate(valid, metrics)
		model.evaluate(valid, cm)
		model.evaluate(test, metrics)
		model.evaluate(test, cm)

deepchem/data/data_loader.py

+26 −12

Original line number	Diff line number	Diff line
		@@ -230,8 +230,7 @@ class DataLoader(object):
		assert len(X) == len(ids)

		time2 = time.time()
		log(
		"TIMING: featurizing shard %d took %0.3f s" %
		log("TIMING: featurizing shard %d took %0.3f s" %
		(shard_num, time2 - time1), self.verbose)
		yield X, y, w, ids

		@@ -295,8 +294,7 @@ class SDFLoader(DataLoader):

		def featurize_shard(self, shard):
		"""Featurizes a shard of an input dataframe."""
		log(
		"Currently featurizing feature_type: %s" %
		log("Currently featurizing feature_type: %s" %
		self.featurizer.__class__.__name__, self.verbose)
		return featurize_mol_df(shard, self.featurizer, field=self.mol_field)

		@@ -349,7 +347,12 @@ class ImageLoader(DataLoader):
		tasks = []
		self.tasks = tasks

		def featurize(self, input_files, in_memory=True):
		def featurize(self,
		input_files,
		labels=None,
		weights=None,
		read_img=True,
		in_memory=True):
		"""Featurizes image files.

		Parameters
		@@ -395,6 +398,23 @@ class ImageLoader(DataLoader):
		raise ValueError("Unsupported file format")
		input_files = remainder

		if read_img:
		X = self.load_img(image_files)
		else:
		X = [None] * len(image_files)
		if in_memory:
		return NumpyDataset(X, y=labels, w=weights, ids=image_files)

		else:
		# from_numpy currently requires labels. Make dummy labels
		if labels is None:
		labels = np.zeros((len(image_files), 1))
		if weights is None:
		weights = np.zeros((len(image_files), 1))
		return DiskDataset.from_numpy(X, labels, w=weights, ids=image_files)

		@staticmethod
		def load_img(image_files):
		images = []
		for image_file in image_files:
		_, extension = os.path.splitext(image_file)
		@@ -407,10 +427,4 @@ class ImageLoader(DataLoader):
		images.append(imarray)
		else:
		raise ValueError("Unsupported image filetype for %s" % image_file)
		images = np.array(images)
		if in_memory:
		return NumpyDataset(images)
		else:
		# from_numpy currently requires labels. Make dummy labels
		labels = np.zeros((len(images), 1))
		return DiskDataset.from_numpy(images, labels)
		return np.array(images)

deepchem/models/tensorgraph/tensor_graph.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -18,6 +18,7 @@ from deepchem.models.tensorgraph.optimizers import Adam
		from deepchem.trans import undo_transforms
		from deepchem.utils.evaluate import GeneratorEvaluator

		logging.basicConfig()
		logger = logging.getLogger(__name__)

Admin message