data loader (66d5fa3d) · Commits · 钟慕尧 / deepchem

contrib/DiabeticRetinopathy/data.py

0 → 100644

+107 −0

Original line number	Diff line number	Diff line
		"""
		Diabetic Retinopathy Images loader.
		"""
		from __future__ import division
		from __future__ import unicode_literals

		import os
		import logging
		import deepchem
		import numpy as np
		import pandas as pd

		# Module-level logger; used below to warn about missing data files and a
		# missing OpenCV installation.
		logger = logging.getLogger(__name__)


		def load_images_DR(split='random', seed=None):
		    """Load the Diabetic Retinopathy images as a DeepChem image dataset.

		    Images are looked up in ``<data_dir>/DR/train`` and labels in
		    ``<data_dir>/DR/trainLabels.csv``, where ``<data_dir>`` is the value of
		    ``deepchem.utils.get_data_dir()``. Raw ``.jpeg`` files that do not yet
		    have a ``cut_<name>.png`` counterpart are handed to ``cut_raw_images``
		    first; only the resulting ``cut_*.png`` files are loaded.

		    Sample weights are the inverse class frequency (relative to the most
		    frequent class) computed over every label in the CSV file.

		    Parameters
		    ----------
		    split: str or None
		        ``'random'`` or ``'index'`` to pick the splitter, or ``None`` to
		        return the unsplit dataset. Any other value raises ``KeyError``.
		    seed: int or None
		        When not ``None``, passed to ``np.random.seed`` before splitting.

		    Returns
		    -------
		    The dataset produced by ``ImageLoader.featurize`` when ``split`` is
		    ``None``; otherwise a ``(train, valid, test)`` tuple from the splitter.
		    """
		    data_dir = deepchem.utils.get_data_dir()
		    images_path = os.path.join(data_dir, 'DR', 'train')
		    label_path = os.path.join(data_dir, 'DR', 'trainLabels.csv')
		    if not os.path.exists(images_path) or not os.path.exists(label_path):
		        # Only a warning: the directory listing / CSV read below will raise
		        # if the path is really missing.
		        logger.warning(
		            "Cannot locate data, \n"
		            "all images(.png) should be stored in the folder: "
		            "$DEEPCHEM_DATA_DIR/DR/train/,\n"
		            "corresponding label file should be stored as "
		            "$DEEPCHEM_DATA_DIR/DR/trainLabels.csv.\n"
		            "Please refer to "
		            "https://www.kaggle.com/c/diabetic-retinopathy-detection "
		            "for data access")

		    image_names = os.listdir(images_path)
		    existing = set(image_names)
		    # A raw image still needs preprocessing when its cut version, which
		    # cut_raw_images writes as 'cut_<basename>.png', is not present yet.
		    raw_images = [
		        im for im in image_names
		        if im.endswith('.jpeg') and not im.startswith('cut_') and
		        'cut_' + os.path.splitext(im)[0] + '.png' not in existing
		    ]
		    if raw_images:
		        cut_raw_images(raw_images, images_path)

		    # Re-list the directory so freshly written cut images are picked up.
		    image_names = [
		        p for p in os.listdir(images_path)
		        if p.startswith('cut_') and p.endswith('.png')
		    ]
		    # Maps the first CSV column to the second one (two columns expected).
		    all_labels = dict(zip(*np.transpose(np.array(pd.read_csv(label_path)))))

		    print("Number of images: %d" % len(image_names))
		    # Strip the 4-character 'cut_' prefix and the extension to recover the
		    # key used in the label file.
		    labels = np.array(
		        [all_labels[os.path.splitext(n)[0][4:]] for n in image_names]).reshape(
		            (-1, 1))
		    image_full_paths = [os.path.join(images_path, n) for n in image_names]

		    # list(...) is required: on Python 3 dict.values() is a view that
		    # np.unique would treat as a single object.
		    classes, cts = np.unique(list(all_labels.values()), return_counts=True)
		    weight_ratio = dict(zip(classes, np.max(cts) / cts.astype(float)))
		    weights = np.array([weight_ratio[row[0]] for row in labels]).reshape(
		        (-1, 1))

		    loader = deepchem.data.ImageLoader()
		    dat = loader.featurize(
		        image_full_paths, labels=labels, weights=weights, read_img=False)
		    if split is None:
		        return dat

		    splitters = {
		        'index': deepchem.splits.IndexSplitter(),
		        'random': deepchem.splits.RandomSplitter()
		    }
		    if seed is not None:
		        np.random.seed(seed)
		    splitter = splitters[split]
		    train, valid, test = splitter.train_valid_test_split(dat)
		    return (train, valid, test)


		def cut_raw_images(all_images, path):
		    """Preprocess raw images and save them as ``cut_<name>.png``.

		    For each image:
		    (1) Crop the central square including retina
		    (2) Reduce resolution to 512 * 512

		    The crop center is estimated from Canny edge pixels: along each axis it
		    is the midpoint between the coordinates found at the 1% and 99%
		    positions of the sorted edge coordinates.

		    Parameters
		    ----------
		    all_images: list of str
		        File names (relative to ``path``) of the images to process.
		    path: str
		        Directory that holds the raw images; outputs are written there too.

		    Returns
		    -------
		    None. Returns early, after logging a warning, when OpenCV (``cv2``)
		    cannot be imported. Images whose output file already exists, that
		    cannot be read, or that yield no usable crop are skipped.
		    """
		    print("Num of images to be processed: %d" % len(all_images))
		    try:
		        import cv2
		    except ImportError:
		        logger.warning("OpenCV required for image preprocessing")
		        return

		    for i, img_path in enumerate(all_images):
		        if i % 100 == 0:
		            print("on image %d" % i)
		        out_path = os.path.join(
		            path, 'cut_' + os.path.splitext(img_path)[0] + '.png')
		        # Skip images that were already processed (the path must be tested
		        # for existence; a non-empty path string is always truthy).
		        if os.path.exists(out_path):
		            continue
		        img = cv2.imread(os.path.join(path, img_path))
		        if img is None:
		            # cv2.imread returns None when the file cannot be decoded.
		            logger.warning("Could not read image %s, skipping", img_path)
		            continue
		        edges = cv2.Canny(img, 10, 30)
		        rows, cols = np.where(edges > 0)
		        n_p = len(rows)
		        if n_p == 0:
		            logger.warning("No edges found in image %s, skipping", img_path)
		            continue

		        # Sorting each axis independently yields the same per-axis order as
		        # sorting the (row, col) pairs by that axis first.
		        rows = np.sort(rows)
		        cols = np.sort(cols)
		        lo, hi = int(0.01 * n_p), int(0.99 * n_p)
		        center_0 = int((rows[lo] + rows[hi]) / 2)
		        center_1 = int((cols[lo] + cols[hi]) / 2)

		        # Largest half-width that keeps the square inside the image.
		        edge_size = min(
		            [center_0, img.shape[0] - center_0, center_1, img.shape[1] - center_1])
		        if edge_size <= 0:
		            logger.warning("Empty crop for image %s, skipping", img_path)
		            continue
		        img_cut = img[(center_0 - edge_size):(center_0 + edge_size),
		                      (center_1 - edge_size):(center_1 + edge_size)]
		        img_cut = cv2.resize(img_cut, (512, 512))
		        cv2.imwrite(out_path, img_cut)

contrib/DiabeticRetinopathy/model.py

+95 −65

Original line number	Diff line number	Diff line
		@@ -20,6 +20,7 @@ from deepchem.trans import undo_transforms
		from deepchem.data.data_loader import ImageLoader
		from sklearn.metrics import confusion_matrix, accuracy_score


		class DRModel(TensorGraph):

		def __init__(self,
		@@ -61,14 +62,17 @@ class DRModel(TensorGraph):

		def build_graph(self):
		# inputs placeholder
		self.inputs = Feature(shape=(None, self.image_size, self.image_size, 3), dtype=tf.uint8)
		self.inputs = Feature(
		shape=(None, self.image_size, self.image_size, 3), dtype=tf.uint8)
		# data preprocessing and augmentation
		in_layer = DRAugment(self.augment,
		in_layer = DRAugment(
		self.augment,
		self.batch_size,
		size=(self.image_size, self.image_size),
		in_layers=[self.inputs])
		# first conv layer
		in_layer = Conv2D(self.n_init_kernel,
		in_layer = Conv2D(
		self.n_init_kernel,
		kernel_size=7,
		activation_fn=None,
		in_layers=[in_layer])
		@@ -76,56 +80,55 @@ class DRModel(TensorGraph):
		in_layer = ReLU(in_layers=[in_layer])

		# downsample by max pooling
		res_in = MaxPool2D(ksize=[1, 3, 3, 1],
		strides=[1, 2, 2, 1],
		in_layers=[in_layer])
		res_in = MaxPool2D(
		ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], in_layers=[in_layer])

		for ct_module in range(self.n_downsample - 1):
		# each module is a residual convolutional block
		# followed by a convolutional downsample layer
		in_layer = Conv2D(self.n_init_kernel * 2 ** (ct_module - 1),
		in_layer = Conv2D(
		self.n_init_kernel * 2**(ct_module - 1),
		kernel_size=1,
		activation_fn=None,
		in_layers=[res_in])
		in_layer = BatchNorm(in_layers=[in_layer])
		in_layer = ReLU(in_layers=[in_layer])
		in_layer = Conv2D(self.n_init_kernel * 2 ** (ct_module - 1),
		in_layer = Conv2D(
		self.n_init_kernel * 2**(ct_module - 1),
		kernel_size=3,
		activation_fn=None,
		in_layers=[in_layer])
		in_layer = BatchNorm(in_layers=[in_layer])
		in_layer = ReLU(in_layers=[in_layer])
		in_layer = Conv2D(self.n_init_kernel * 2 ** ct_module,
		in_layer = Conv2D(
		self.n_init_kernel * 2**ct_module,
		kernel_size=1,
		activation_fn=None,
		in_layers=[in_layer])
		res_a = BatchNorm(in_layers=[in_layer])


		res_out = res_in + res_a
		res_in = Conv2D(self.n_init_kernel * 2 ** (ct_module + 1),
		res_in = Conv2D(
		self.n_init_kernel * 2**(ct_module + 1),
		kernel_size=3,
		stride=2,
		in_layers=[res_out])
		res_in = BatchNorm(in_layers=[res_in])

		# max pooling over the final outcome
		in_layer = ReduceMax(axis=(1, 2),
		in_layers=[res_in])
		in_layer = ReduceMax(axis=(1, 2), in_layers=[res_in])

		for layer_size in self.n_fully_connected:
		# fully connected layers
		in_layer = Dense(layer_size,
		activation_fn=tf.nn.relu,
		in_layers=[in_layer])
		in_layer = Dense(
		layer_size, activation_fn=tf.nn.relu, in_layers=[in_layer])
		# dropout for dense layers
		#in_layer = Dropout(0.25, in_layers=[in_layer])

		logit_pred = Dense(self.n_tasks * self.n_classes,
		activation_fn=None,
		in_layers=[in_layer])
		logit_pred = Reshape(shape=(None, self.n_tasks, self.n_classes),
		in_layers=[logit_pred])
		logit_pred = Dense(
		self.n_tasks * self.n_classes, activation_fn=None, in_layers=[in_layer])
		logit_pred = Reshape(
		shape=(None, self.n_tasks, self.n_classes), in_layers=[logit_pred])

		weights = Weights(shape=(None, self.n_tasks))
		labels = Label(shape=(None, self.n_tasks), dtype=tf.int32)
		@@ -163,13 +166,14 @@ class DRModel(TensorGraph):
		if w_b is not None and not predict:
		feed_dict[self.task_weights[0]] = w_b


		yield feed_dict


		def DRAccuracy(y, y_pred):
		    """Accuracy of the arg-max class prediction.

		    ``y_pred`` is reduced with ``np.argmax`` along axis 1 and compared
		    against ``y`` using ``accuracy_score``.
		    """
		    predicted_classes = np.argmax(y_pred, 1)
		    score = accuracy_score(y, predicted_classes)
		    return score


		def DRSpecificity(y, y_pred):
		y_pred = (np.argmax(y_pred, 1) > 0) * 1
		y = (y > 0) * 1
		@@ -177,6 +181,7 @@ def DRSpecificity(y, y_pred):
		N = sum(1 - y)
		return float(TN) / N


		def DRSensitivity(y, y_pred):
		y_pred = (np.argmax(y_pred, 1) > 0) * 1
		y = (y > 0) * 1
		@@ -184,10 +189,30 @@ def DRSensitivity(y, y_pred):
		P = sum(y)
		return float(TP) / P


		def ConfusionMatrix(y, y_pred):
		    """Confusion matrix of ``y`` against the arg-max class prediction.

		    ``y_pred`` is reduced with ``np.argmax`` along axis 1 before being
		    passed, together with ``y``, to ``confusion_matrix``.
		    """
		    predicted_classes = np.argmax(y_pred, 1)
		    matrix = confusion_matrix(y, predicted_classes)
		    return matrix


		def QuadWeightedKappa(y, y_pred):
		    """Quadratic weighted kappa between ``y`` and the arg-max prediction.

		    Computed as ``1 - sum(w * O) / sum(w * E)`` where ``O`` is the
		    confusion matrix, ``E`` is the outer product of the true-label and
		    predicted-label histograms scaled to the same total as ``O``, and
		    ``w[i, j] = (i - j)**2 / (n_classes - 1)**2``.

		    Parameters
		    ----------
		    y: array-like
		        True integer class labels.
		    y_pred: array-like
		        Per-class scores; axis 1 is reduced with ``np.argmax``.
		        NOTE(review): the class count is taken from ``y_pred.shape[1]``,
		        assuming a (n_samples, n_classes) layout -- confirm against caller.

		    Returns
		    -------
		    float
		        The kappa value (``nan`` when the expected disagreement is zero).
		    """
		    n_classes = np.shape(y_pred)[1]
		    y_pred = np.argmax(y_pred, 1)
		    # Fix the label set so the matrix is always n_classes x n_classes,
		    # even when some class is absent from y or from the predictions.
		    cm = confusion_matrix(
		        y, y_pred, labels=list(range(n_classes))).astype(float)
		    # Rows of cm are true labels, columns are predictions.
		    E = np.outer(cm.sum(axis=1), cm.sum(axis=0)) / np.sum(cm)
		    idx = np.arange(n_classes)
		    # max(..., 1) avoids a division by zero for a single-class output.
		    w = (idx[:, None] - idx[None, :])**2 / float(max(n_classes - 1, 1)**2)
		    re = 1 - np.sum(w * cm) / np.sum(w * E)
		    return re


		class DRAugment(Layer):

		def __init__(self,
		@@ -227,6 +252,7 @@ class DRAugment(Layer):
		if not self.augment:
		out_tensor = parent_tensor
		else:

		def preprocess(img):
		img = tf.image.random_flip_left_right(img)
		img = tf.image.random_flip_up_down(img)
		@@ -237,9 +263,13 @@ class DRAugment(Layer):
		img = tf.clip_by_value(img, 0.0, 1.0)
		if self.central_crop:
		# sample cut ratio from a clipped gaussian
		img = tf.image.central_crop(img, np.clip(np.random.normal(1., 0.06), 0.8, 1.))
		img = tf.image.resize_bilinear(tf.expand_dims(img, 0), tf.convert_to_tensor(self.size))[0]
		img = tf.image.central_crop(img,
		np.clip(
		np.random.normal(1., 0.06), 0.8, 1.))
		img = tf.image.resize_bilinear(
		tf.expand_dims(img, 0), tf.convert_to_tensor(self.size))[0]
		return img

		outs = tf.map_fn(preprocess, parent_tensor)
		# train/valid differences
		out_tensor = training * outs + (1 - training) * parent_tensor

contrib/DiabeticRetinopathy/run.py

+21 −50

Original line number	Diff line number	Diff line
		@@ -11,62 +11,33 @@ import numpy as np
		import pandas as pd
		import os
		import logging
		from model import DRModel, DRAccuracy, DRSensitivity, DRSpecificity, ConfusionMatrix

		PATH = './data/train_cut/'

		# Input images
		image_names = os.listdir(PATH)
		labels = dict(zip(*np.transpose(np.array(pd.read_csv('./data/trainLabels.csv')))))

		np.random.seed(123)
		np.random.shuffle(image_names)
		# Use 80% of the data for training, the rest for validation
		train_images = image_names[:int(0.8*len(image_names))]
		test_images = image_names[int(0.8*len(image_names)):]

		print("Number of training images: %d" % len(train_images))

		train_labels = np.array([labels[os.path.splitext(n)[0][4:]] for n in train_images]).reshape((-1, 1))
		train_image_paths = [os.path.join(PATH, n) for n in train_images]

		test_labels = np.array([labels[os.path.splitext(n)[0][4:]] for n in test_images]).reshape((-1, 1))
		test_image_paths = [os.path.join(PATH, n) for n in test_images]

		# Generate class weights according to training set labels distribution
		classes, cts = np.unique(train_labels, return_counts=True)
		weight_ratio = dict(zip(classes, np.max(cts) / cts.astype(float)))
		train_weights = np.array([weight_ratio[l[0]] for l in train_labels]).reshape((-1, 1))
		test_weights = np.array([weight_ratio[l[0]] for l in test_labels]).reshape((-1, 1))

		loader = dc.data.ImageLoader()
		train_data = loader.featurize(train_image_paths,
		labels=train_labels,
		weights=train_weights,
		read_img=False)
		test_data = loader.featurize(test_image_paths,
		labels=test_labels,
		weights=test_weights,
		read_img=False)
		from model import DRModel, DRAccuracy, ConfusionMatrix, QuadWeightedKappa
		from data import load_images_DR

		train, valid, test = load_images_DR(split='random', seed=123)
		# Define and build model
		model = DRModel(n_init_kernel=32,
		model = DRModel(
		n_init_kernel=32,
		batch_size=32,
		learning_rate=1e-5,
		augment=True,
		model_dir='./test_model')
		if not os.path.exists('./test_model'):
		os.mkdir('test_model')
		model.build()
		#model.restore()
		metrics = [dc.metrics.Metric(DRAccuracy, mode='classification'),
		dc.metrics.Metric(DRSensitivity, mode='classification'),
		dc.metrics.Metric(DRSpecificity, mode='classification')]
		metrics = [
		dc.metrics.Metric(DRAccuracy, mode='classification'),
		dc.metrics.Metric(QuadWeightedKappa, mode='classification')
		]
		cm = [dc.metrics.Metric(ConfusionMatrix, mode='classification')]

		logger = logging.getLogger('deepchem.models.tensorgraph.tensor_graph')
		logger.setLevel(logging.DEBUG)
		for i in range(10):
		model.fit(train_data, nb_epoch=10, checkpoint_interval=3512)
		# Evaluate every 10 epochs
		model.evaluate(train_data, metrics)
		model.evaluate(test_data, metrics)
		model.evaluate(test_data, cm)
		model.fit(train, nb_epoch=10, checkpoint_interval=3512)
		model.evaluate(train, metrics)
		model.evaluate(valid, metrics)
		model.evaluate(valid, cm)
		model.evaluate(test, metrics)
		model.evaluate(test, cm)

deepchem/data/data_loader.py

+7 −9

Original line number	Diff line number	Diff line
		@@ -230,8 +230,7 @@ class DataLoader(object):
		assert len(X) == len(ids)

		time2 = time.time()
		log(
		"TIMING: featurizing shard %d took %0.3f s" %
		log("TIMING: featurizing shard %d took %0.3f s" %
		(shard_num, time2 - time1), self.verbose)
		yield X, y, w, ids

		@@ -295,8 +294,7 @@ class SDFLoader(DataLoader):

		def featurize_shard(self, shard):
		"""Featurizes a shard of an input dataframe."""
		log(
		"Currently featurizing feature_type: %s" %
		log("Currently featurizing feature_type: %s" %
		self.featurizer.__class__.__name__, self.verbose)
		return featurize_mol_df(shard, self.featurizer, field=self.mol_field)

Admin message