Merge pull request #3139 from captin411/focal-point-cropping (ee73341f) · Commits · github_fork / Stable Diffusion Webui

modules/textual_inversion/autocrop.py

0 → 100644

+341 −0

Original line number	Diff line number	Diff line
		import cv2
		import requests
		import os
		from collections import defaultdict
		from math import log, sqrt
		import numpy as np
		from PIL import Image, ImageDraw

		# Outline colors used for the debug annotations drawn by this module:
		# GREEN marks the final crop rectangle and the averaged focal point,
		# BLUE marks corner ("Edge") points and RED marks face points.
		GREEN = "#0F0"
		BLUE = "#00F"
		RED = "#F00"


		def crop_image(im, settings):
		    """Intelligently crop an image to the subject matter.

		    The image is resized so that it covers the requested crop size, a focal
		    point is computed on the resized image, and a window of
		    ``settings.crop_width`` x ``settings.crop_height`` is centered on that
		    point and then shifted back inside the frame where it would overhang.

		    Args:
		        im: PIL image to crop. It is not modified; a resized copy is used.
		        settings: object providing ``crop_width``, ``crop_height``,
		            ``annotate_image`` and ``destop_view_image`` plus the values
		            read by ``focal_point``.

		    Returns:
		        A list whose first item is the cropped image. When
		        ``settings.annotate_image`` is true the annotated debug image is
		        appended as a second item.
		    """
		    # Scale so that BOTH dimensions cover the crop window. Picking the factor
		    # from the image orientation alone leaves one side too short whenever the
		    # crop has a wider (or taller) aspect ratio than the image, which pushes
		    # the crop rectangle outside the frame.
		    scale_by = max(settings.crop_width / im.width, settings.crop_height / im.height)

		    # int() truncation of a float product can land one pixel short of the
		    # target, so never let the resized image be smaller than the crop.
		    scaled_width = max(int(im.width * scale_by), int(settings.crop_width))
		    scaled_height = max(int(im.height * scale_by), int(settings.crop_height))

		    im = im.resize((scaled_width, scaled_height))
		    im_debug = im.copy()

		    # focal_point draws its annotations on the copy, never on the crop source
		    focus = focal_point(im_debug, settings)

		    # take the focal point and turn it into crop coordinates that try to center
		    # over the focal point but then get adjusted back into the frame
		    x_half = int(settings.crop_width / 2)
		    y_half = int(settings.crop_height / 2)

		    x1 = min(max(focus.x - x_half, 0), im.width - settings.crop_width)
		    y1 = min(max(focus.y - y_half, 0), im.height - settings.crop_height)
		    x2 = x1 + settings.crop_width
		    y2 = y1 + settings.crop_height

		    results = [im.crop((x1, y1, x2, y2))]

		    if settings.annotate_image:
		        d = ImageDraw.Draw(im_debug)
		        # shrink by one pixel so the outline stays inside the cropped area
		        d.rectangle([x1, y1, x2 - 1, y2 - 1], outline=GREEN)
		        results.append(im_debug)
		        if settings.destop_view_image:
		            im_debug.show()

		    return results

		def focal_point(im, settings):
		    """Locate the point of the image the crop should be centered on.

		    Runs the corner, entropy and face detectors whose configured weight is
		    greater than zero, reduces each non-empty result to its centroid, weights
		    the centroids by their share of the active weights and averages them.

		    Args:
		        im: PIL image to analyse. When ``settings.annotate_image`` is true the
		            detected points and centroids are drawn directly onto it.
		        settings: object providing ``corner_points_weight``,
		            ``entropy_points_weight``, ``face_points_weight`` and
		            ``annotate_image`` plus the values read by the detectors.

		    Returns:
		        A ``PointOfInterest`` for the weighted average focal point. If no
		        detector produced any point, the center of the image is returned.
		    """
		    # (label, outline color, detector, configured weight) in evaluation order
		    detectors = [
		        ("Edge", BLUE, image_corner_points, settings.corner_points_weight),
		        ("Entropy", "#ff0", image_entropy_points, settings.entropy_points_weight),
		        ("Face", RED, image_face_points, settings.face_points_weight),
		    ]

		    found = []
		    for label, color, detect, weight in detectors:
		        points = detect(im, settings) if weight > 0 else []
		        if len(points) > 0:
		            found.append((label, color, points, weight))

		    # only detectors that actually produced points take part in the weighting
		    weight_pref_total = sum(weight for _, _, _, weight in found)

		    centroids = []
		    for label, color, points, weight in found:
		        center = centroid(points)
		        center.weight = weight / weight_pref_total
		        centroids.append((label, color, points, center))

		    if centroids:
		        average_point = poi_average([center for _, _, _, center in centroids], settings)
		    else:
		        # nothing detected: averaging would divide by zero, so use the middle
		        average_point = PointOfInterest(im.width // 2, im.height // 2)

		    if settings.annotate_image:
		        d = ImageDraw.Draw(im)
		        max_size = min(im.width, im.height) * 0.07
		        for label, color, points, center in centroids:
		            box = center.bounding(max_size * center.weight)
		            d.text((box[0], box[1] - 15), "%s: %.02f" % (label, center.weight), fill=color)
		            d.ellipse(box, outline=color)
		            # a single point coincides with its centroid, so skip the marker
		            if len(points) > 1:
		                for point in points:
		                    d.rectangle(point.bounding(4), outline=color)

		        d.ellipse(average_point.bounding(max_size), outline=GREEN)

		    return average_point


		def image_face_points(im, settings):
		    """Detect faces and return one ``PointOfInterest`` per detection.

		    When ``settings.dnn_model_path`` is set, ``cv2.FaceDetectorYN`` is used
		    with that model. Otherwise a list of Haar cascades is tried in order and
		    the detections of the first cascade that finds anything are returned.

		    Args:
		        im: PIL image to search.
		        settings: object providing ``dnn_model_path``.

		    Returns:
		        A list of ``PointOfInterest``; each carries a weight of
		        ``1 / number_of_detections``. Empty when nothing was detected.
		    """
		    # Normalise to 3-channel RGB first so the OpenCV conversions below work
		    # for any PIL mode. PIL stores channels as RGB while OpenCV assumes BGR,
		    # so the conversions must start from RGB explicitly.
		    rgb = np.array(im.convert("RGB"))

		    if settings.dnn_model_path is not None:
		        detector = cv2.FaceDetectorYN.create(
		            settings.dnn_model_path,
		            "",
		            (im.width, im.height),
		            0.9,  # score threshold
		            0.3,  # nms threshold
		            5000,  # keep top k before nms
		        )
		        _, faces = detector.detect(cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR))
		        if faces is None:
		            return []

		        share = 1 / len(faces)
		        results = []
		        for face in faces:
		            x, y, w, h = face[0], face[1], face[2], face[3]
		            results.append(
		                PointOfInterest(
		                    int(x + (w * 0.5)),  # face focus left/right is center
		                    int(y + (h * 0.33)),  # face focus up/down is close to the top of the head
		                    size=w,
		                    weight=share,
		                )
		            )
		        return results

		    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
		    smallest_side = min(im.width, im.height)

		    # (cascade file, minimum detection size as a fraction of the smallest side)
		    tries = [
		        [f'{cv2.data.haarcascades}haarcascade_eye.xml', 0.01],
		        [f'{cv2.data.haarcascades}haarcascade_frontalface_default.xml', 0.05],
		        [f'{cv2.data.haarcascades}haarcascade_profileface.xml', 0.05],
		        [f'{cv2.data.haarcascades}haarcascade_frontalface_alt.xml', 0.05],
		        [f'{cv2.data.haarcascades}haarcascade_frontalface_alt2.xml', 0.05],
		        [f'{cv2.data.haarcascades}haarcascade_frontalface_alt_tree.xml', 0.05],
		        [f'{cv2.data.haarcascades}haarcascade_eye_tree_eyeglasses.xml', 0.05],
		        [f'{cv2.data.haarcascades}haarcascade_upperbody.xml', 0.05],
		    ]
		    for cascade_path, min_fraction in tries:
		        classifier = cv2.CascadeClassifier(cascade_path)
		        minsize = int(smallest_side * min_fraction)  # at least N percent of the smallest side
		        try:
		            faces = classifier.detectMultiScale(
		                gray,
		                scaleFactor=1.1,
		                minNeighbors=7,
		                minSize=(minsize, minsize),
		                flags=cv2.CASCADE_SCALE_IMAGE,
		            )
		        except Exception:
		            # best effort: a cascade that fails is skipped, but unlike a bare
		            # except this still lets KeyboardInterrupt/SystemExit through
		            continue

		        if len(faces) > 0:
		            share = 1 / len(faces)
		            # each detection is (x, y, w, h); focus on the rectangle's center
		            return [
		                PointOfInterest((x + (x + w)) // 2, (y + (y + h)) // 2, size=abs(w), weight=share)
		                for x, y, w, h in ((f[0], f[1], f[2], f[3]) for f in faces)
		            ]
		    return []


		def image_corner_points(im, settings):
		    """Return corner features of the image as points of interest.

		    The image is converted to grayscale, the bottom tenth is painted over
		    with a flat gray, and ``cv2.goodFeaturesToTrack`` (Harris detector
		    disabled) is asked for up to 100 corners.

		    Args:
		        im: PIL image to analyse; it is not modified.
		        settings: unused, kept for a signature parallel to the other detectors.

		    Returns:
		        A list of ``PointOfInterest`` of size 4, each weighted
		        ``1 / number_of_corners``; empty when no corner was found.
		    """
		    gray = im.convert("L")

		    # naive attempt at preventing focal points from collecting at watermarks near the bottom
		    ImageDraw.Draw(gray).rectangle([0, im.height*.9, im.width, im.height], fill="#999")

		    corners = cv2.goodFeaturesToTrack(
		        np.array(gray),
		        maxCorners=100,
		        qualityLevel=0.04,
		        minDistance=min(gray.width, gray.height)*0.06,
		        useHarrisDetector=False,
		    )
		    if corners is None:
		        return []

		    return [
		        PointOfInterest(x, y, size=4, weight=1/len(corners))
		        for x, y in (corner.ravel() for corner in corners)
		    ]


		def image_entropy_points(im, settings):
		    """Find the crop-sized window with the highest ``image_entropy`` score.

		    A window of ``settings.crop_width`` x ``settings.crop_height`` starts at
		    the top-left corner and slides in 4 pixel steps along the longer side of
		    the image (horizontally for landscape, vertically for portrait).

		    Args:
		        im: PIL image to scan.
		        settings: object providing ``crop_width`` and ``crop_height``.

		    Returns:
		        A single-element list holding a ``PointOfInterest`` at the center of
		        the best window, or an empty list for square images.
		    """
		    if im.height < im.width:
		        # landscape: move the left/right edges of the window
		        near, far = 0, 2
		        limit = im.size[0]
		    elif im.height > im.width:
		        # portrait: move the top/bottom edges of the window
		        near, far = 1, 3
		        limit = im.size[1]
		    else:
		        return []

		    window = [0, 0, settings.crop_width, settings.crop_height]
		    # Copy, do not alias: the window list is mutated while sliding, and the
		    # best candidate must stay at the start when no window scores above zero.
		    best_window = list(window)
		    best_entropy = 0
		    while window[far] < limit:
		        entropy = image_entropy(im.crop(tuple(window)))
		        if entropy > best_entropy:
		            best_entropy = entropy
		            best_window = list(window)

		        window[near] += 4
		        window[far] += 4

		    x_mid = int(best_window[0] + settings.crop_width/2)
		    y_mid = int(best_window[1] + settings.crop_height/2)

		    return [PointOfInterest(x_mid, y_mid, size=25, weight=1.0)]


		def image_entropy(im):
		    """Return an entropy-style score for a 1-bit rendition of the image.

		    The image is converted to mode "1", its values are histogrammed, and the
		    negated sum of ``log2`` of each non-empty bin's share is returned.
		    """
		    # NOTE: an earlier variant used the grayscale ("L") band; the 1-bit
		    # conversion is what is in effect here.
		    bilevel = np.asarray(im.convert("1"), dtype=np.uint8)
		    counts, _edges = np.histogram(bilevel, bins=range(0, 256))
		    occupied = counts[counts > 0]
		    shares = occupied / occupied.sum()
		    return -np.log2(shares).sum()

		def centroid(pois):
		    """Return a ``PointOfInterest`` at the unweighted mean position of ``pois``.

		    The result carries the default weight and size of ``PointOfInterest``.
		    """
		    count = len(pois)
		    total_x = sum(poi.x for poi in pois)
		    total_y = sum(poi.y for poi in pois)
		    return PointOfInterest(total_x/count, total_y/count)


		def poi_average(pois, settings):
		    """Return the weighted average position of ``pois``.

		    Args:
		        pois: iterable of objects with ``x``, ``y`` and ``weight`` attributes.
		        settings: unused, kept so existing callers keep working.

		    Returns:
		        A ``PointOfInterest`` at the rounded weighted mean position. When the
		        weights sum to zero (for example an empty list) the origin ``(0, 0)``
		        is returned instead of raising ``ZeroDivisionError``.
		    """
		    total_weight = 0.0
		    x = 0.0
		    y = 0.0
		    for poi in pois:
		        total_weight += poi.weight
		        x += poi.x * poi.weight
		        y += poi.y * poi.weight

		    if total_weight == 0:
		        # nothing to average; avoid dividing by zero
		        return PointOfInterest(0, 0)

		    return PointOfInterest(round(x / total_weight), round(y / total_weight))


		def is_landscape(w, h):
		    """Return True when the width is strictly greater than the height."""
		    return h < w


		def is_portrait(w, h):
		    """Return True when the height is strictly greater than the width."""
		    return w < h


		def is_square(w, h):
		    """Return True when the width and the height are equal."""
		    return h == w


		def download_and_cache_models(dirname):
		    """Make sure the face detection model exists in ``dirname``.

		    The model is downloaded only when it is not already cached.

		    Args:
		        dirname: directory holding the cached model; created if missing.

		    Returns:
		        The path of the cached model file, or ``None`` if it does not exist
		        after the attempt.

		    Raises:
		        requests.RequestException: when the download fails, times out or the
		            server answers with an HTTP error status.
		        OSError: when the directory or the file cannot be written.
		    """
		    download_url = 'https://github.com/opencv/opencv_zoo/blob/91fb0290f50896f38a0ab1e558b74b16bc009428/models/face_detection_yunet/face_detection_yunet_2022mar.onnx?raw=true'
		    model_file_name = 'face_detection_yunet.onnx'

		    os.makedirs(dirname, exist_ok=True)

		    cache_file = os.path.join(dirname, model_file_name)
		    if not os.path.exists(cache_file):
		        print(f"downloading face detection model from '{download_url}' to '{cache_file}'")
		        # a timeout keeps a stalled connection from hanging the caller forever
		        response = requests.get(download_url, timeout=60)
		        # never cache an HTTP error page as if it were the model
		        response.raise_for_status()

		        # write to a side file and rename, so an interrupted write cannot
		        # leave a truncated model behind that later runs would trust
		        partial_file = cache_file + ".part"
		        with open(partial_file, "wb") as f:
		            f.write(response.content)
		        os.replace(partial_file, cache_file)

		    if os.path.exists(cache_file):
		        return cache_file
		    return None


		class PointOfInterest:
		    """A weighted position in an image, with a nominal size."""

		    def __init__(self, x, y, weight=1.0, size=10):
		        self.x, self.y = x, y
		        self.weight, self.size = weight, size

		    def bounding(self, size):
		        """Return ``[left, top, right, bottom]`` of a box centered on the point.

		        The box reaches ``size // 2`` in every direction; the ``size`` argument
		        is used, not the instance's own ``size`` attribute.
		        """
		        half = size//2
		        left, top = self.x - half, self.y - half
		        right, bottom = self.x + half, self.y + half
		        return [left, top, right, bottom]


		class Settings:
		    """Options controlling focal point detection and cropping."""

		    def __init__(
		        self,
		        crop_width=512,
		        crop_height=512,
		        corner_points_weight=0.5,
		        entropy_points_weight=0.5,
		        face_points_weight=0.5,
		        annotate_image=False,
		        dnn_model_path=None,
		    ):
		        # target crop size
		        self.crop_width, self.crop_height = crop_width, crop_height

		        # relative influence of each detector
		        self.corner_points_weight = corner_points_weight
		        self.entropy_points_weight = entropy_points_weight
		        self.face_points_weight = face_points_weight

		        # face detection model file; None means no model is configured
		        self.dnn_model_path = dnn_model_path

		        # debug output; the attribute name's spelling is kept because
		        # it is read under this name elsewhere
		        self.annotate_image = annotate_image
		        self.destop_view_image = False
		No newline at end of file

modules/textual_inversion/preprocess.py

+33 −5

Original line number	Diff line number	Diff line
		@@ -7,12 +7,14 @@ import tqdm
		import time

		from modules import shared, images
		from modules.paths import models_path
		from modules.shared import opts, cmd_opts
		from modules.textual_inversion import autocrop
		if cmd_opts.deepdanbooru:
		import modules.deepbooru as deepbooru


		def preprocess(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru=False, split_threshold=0.5, overlap_ratio=0.2):
		def preprocess(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru=False, split_threshold=0.5, overlap_ratio=0.2, process_focal_crop=False, process_focal_crop_face_weight=0.9, process_focal_crop_entropy_weight=0.3, process_focal_crop_edges_weight=0.5, process_focal_crop_debug=False):
		try:
		if process_caption:
		shared.interrogator.load()
		@@ -22,7 +24,7 @@ def preprocess(process_src, process_dst, process_width, process_height, preproce
		db_opts[deepbooru.OPT_INCLUDE_RANKS] = False
		deepbooru.create_deepbooru_process(opts.interrogate_deepbooru_score_threshold, db_opts)

		preprocess_work(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru, split_threshold, overlap_ratio)
		preprocess_work(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru, split_threshold, overlap_ratio, process_focal_crop, process_focal_crop_face_weight, process_focal_crop_entropy_weight, process_focal_crop_edges_weight, process_focal_crop_debug)

		finally:

		@@ -34,7 +36,7 @@ def preprocess(process_src, process_dst, process_width, process_height, preproce



		def preprocess_work(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru=False, split_threshold=0.5, overlap_ratio=0.2):
		def preprocess_work(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru=False, split_threshold=0.5, overlap_ratio=0.2, process_focal_crop=False, process_focal_crop_face_weight=0.9, process_focal_crop_entropy_weight=0.3, process_focal_crop_edges_weight=0.5, process_focal_crop_debug=False):
		width = process_width
		height = process_height
		src = os.path.abspath(process_src)
		@@ -113,6 +115,7 @@ def preprocess_work(process_src, process_dst, process_width, process_height, pre
		splitted = image.crop((0, y, to_w, y + to_h))
		yield splitted


		for index, imagefile in enumerate(tqdm.tqdm(files)):
		subindex = [0]
		filename = os.path.join(src, imagefile)
		@@ -137,10 +140,35 @@ def preprocess_work(process_src, process_dst, process_width, process_height, pre
		ratio = (img.height * width) / (img.width * height)
		inverse_xy = True

		process_default_resize = True

		if process_split and ratio < 1.0 and ratio <= split_threshold:
		for splitted in split_pic(img, inverse_xy):
		save_pic(splitted, index, existing_caption=existing_caption)
		else:
		process_default_resize = False

		if process_focal_crop and img.height != img.width:

		dnn_model_path = None
		try:
		dnn_model_path = autocrop.download_and_cache_models(os.path.join(models_path, "opencv"))
		except Exception as e:
		print("Unable to load face detection model for auto crop selection. Falling back to lower quality haar method.", e)

		autocrop_settings = autocrop.Settings(
		crop_width = width,
		crop_height = height,
		face_points_weight = process_focal_crop_face_weight,
		entropy_points_weight = process_focal_crop_entropy_weight,
		corner_points_weight = process_focal_crop_edges_weight,
		annotate_image = process_focal_crop_debug,
		dnn_model_path = dnn_model_path,
		)
		for focal in autocrop.crop_image(img, autocrop_settings):
		save_pic(focal, index, existing_caption=existing_caption)
		process_default_resize = False

		if process_default_resize:
		img = images.resize_image(1, img, width, height)
		save_pic(img, index, existing_caption=existing_caption)

modules/ui.py

+18 −0

Original line number	Diff line number	Diff line
		@@ -1261,6 +1261,7 @@ def create_ui(wrap_gradio_gpu_call):
		with gr.Row():
		process_flip = gr.Checkbox(label='Create flipped copies')
		process_split = gr.Checkbox(label='Split oversized images')
		process_focal_crop = gr.Checkbox(label='Auto focal point crop')
		process_caption = gr.Checkbox(label='Use BLIP for caption')
		process_caption_deepbooru = gr.Checkbox(label='Use deepbooru for caption', visible=True if cmd_opts.deepdanbooru else False)

		@@ -1268,6 +1269,12 @@ def create_ui(wrap_gradio_gpu_call):
		process_split_threshold = gr.Slider(label='Split image threshold', value=0.5, minimum=0.0, maximum=1.0, step=0.05)
		process_overlap_ratio = gr.Slider(label='Split image overlap ratio', value=0.2, minimum=0.0, maximum=0.9, step=0.05)

		with gr.Row(visible=False) as process_focal_crop_row:
		process_focal_crop_face_weight = gr.Slider(label='Focal point face weight', value=0.9, minimum=0.0, maximum=1.0, step=0.05)
		process_focal_crop_entropy_weight = gr.Slider(label='Focal point entropy weight', value=0.15, minimum=0.0, maximum=1.0, step=0.05)
		process_focal_crop_edges_weight = gr.Slider(label='Focal point edges weight', value=0.5, minimum=0.0, maximum=1.0, step=0.05)
		process_focal_crop_debug = gr.Checkbox(label='Create debug image')

		with gr.Row():
		with gr.Column(scale=3):
		gr.HTML(value="")
		@@ -1281,6 +1288,12 @@ def create_ui(wrap_gradio_gpu_call):
		outputs=[process_split_extra_row],
		)

		process_focal_crop.change(
		fn=lambda show: gr_show(show),
		inputs=[process_focal_crop],
		outputs=[process_focal_crop_row],
		)

		with gr.Tab(label="Train"):
		gr.HTML(value="<p style='margin-bottom: 0.7em'>Train an embedding or Hypernetwork; you must specify a directory with a set of 1:1 ratio images <a href=\"https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Textual-Inversion\" style=\"font-weight:bold;\">[wiki]</a></p>")
		with gr.Row():
		@@ -1369,6 +1382,11 @@ def create_ui(wrap_gradio_gpu_call):
		process_caption_deepbooru,
		process_split_threshold,
		process_overlap_ratio,
		process_focal_crop,
		process_focal_crop_face_weight,
		process_focal_crop_entropy_weight,
		process_focal_crop_edges_weight,
		process_focal_crop_debug,
		],
		outputs=[
		ti_output,

requirements.txt

+2 −0

Original line number	Diff line number	Diff line
		@@ -8,6 +8,8 @@ gradio==3.5
		invisible-watermark
		numpy
		omegaconf
		opencv-python
		requests
		piexif
		Pillow
		pytorch_lightning

Admin message