Unverified Commit ee73341f authored by AUTOMATIC1111's avatar AUTOMATIC1111 Committed by GitHub
Browse files

Merge pull request #3139 from captin411/focal-point-cropping

[Preprocess image] New option to auto crop based on complexity, edges, faces
parents 7207e3bf df0c5ea2
Loading
Loading
Loading
Loading
+341 −0
Original line number Diff line number Diff line
import cv2
import requests
import os
from collections import defaultdict
from math import log, sqrt
import numpy as np
from PIL import Image, ImageDraw

# Outline colors (short hex RGB strings) used when drawing debug annotations.
GREEN = "#0F0"
BLUE = "#00F"
RED = "#F00"


def crop_image(im, settings):
    """Intelligently crop an image to the subject matter.

    The image is resized so that it fully covers the requested crop box, a
    focal point is detected on the resized image, and a window of
    ``settings.crop_width`` x ``settings.crop_height`` is centred on that
    point and then shifted back inside the frame where necessary.

    Args:
        im: image to crop; must provide ``width``, ``height``, ``resize``,
            ``copy`` and ``crop`` (a PIL image).
        settings: object providing ``crop_width``, ``crop_height``,
            ``annotate_image`` and ``destop_view_image`` plus whatever
            ``focal_point`` reads from it.

    Returns:
        A list whose first element is the cropped image. When
        ``settings.annotate_image`` is set, a second element holds the resized
        image with the debug annotations and the crop outline drawn on it.
    """
    crop_width = settings.crop_width
    crop_height = settings.crop_height

    # Scale by the larger ratio so BOTH sides reach the crop size. Choosing the
    # ratio from the image orientation alone under-scales whenever the crop box
    # is "more portrait" or "more landscape" than the image, which produced
    # negative crop offsets and padded output.
    scale_by = max(crop_width / im.width, crop_height / im.height)

    # int() truncation can leave the tight axis one pixel short of the crop
    # size (floating point round-off), so never go below the crop box.
    scaled_size = (
        max(int(crop_width), int(im.width * scale_by)),
        max(int(crop_height), int(im.height * scale_by)),
    )

    im = im.resize(scaled_size)
    im_debug = im.copy()

    focus = focal_point(im_debug, settings)

    # take the focal point and turn it into crop coordinates that try to center over the focal
    # point but then get adjusted back into the frame
    x1 = max(0, min(focus.x - int(crop_width / 2), im.width - crop_width))
    y1 = max(0, min(focus.y - int(crop_height / 2), im.height - crop_height))

    crop = [x1, y1, x1 + crop_width, y1 + crop_height]

    results = [im.crop(tuple(crop))]

    if settings.annotate_image:
        d = ImageDraw.Draw(im_debug)
        # the far corner is pulled in by one pixel before drawing the outline
        rect = [crop[0], crop[1], crop[2] - 1, crop[3] - 1]
        d.rectangle(rect, outline=GREEN)
        results.append(im_debug)
        if settings.destop_view_image:
            im_debug.show()

    return results

def focal_point(im, settings):
    """Locate the point of the image the crop should be centred on.

    Each detector whose weight in ``settings`` is greater than zero is run.
    Every non-empty result is reduced to its centroid, the centroid is given
    the detector's share of the weights of all detectors that found
    something, and the weighted average of those centroids is returned.

    When ``settings.annotate_image`` is set, the centroids, the individual
    points and the final focal point are drawn onto ``im`` in place.
    """
    # (label format, outline color, configured weight, detector), in run/draw order
    sources = (
        ("Edge: %.02f", BLUE, settings.corner_points_weight, image_corner_points),
        ("Entropy: %.02f", "#ff0", settings.entropy_points_weight, image_entropy_points),
        ("Face: %.02f", RED, settings.face_points_weight, image_face_points),
    )

    # keep only the detectors that are enabled and actually found something
    detected = []
    for label, color, preference, detect in sources:
        points = detect(im, settings) if preference > 0 else []
        if len(points) > 0:
            detected.append((label, color, preference, points))

    weight_pref_total = 0
    for _, _, preference, _ in detected:
        weight_pref_total += preference

    pois = []
    markers = []
    for label, color, preference, points in detected:
        center = centroid(points)
        center.weight = preference / weight_pref_total
        pois.append(center)
        markers.append((label, color, center, points))

    average_point = poi_average(pois, settings)

    if settings.annotate_image:
        d = ImageDraw.Draw(im)
        max_size = min(im.width, im.height) * 0.07
        for label, color, center, points in markers:
            # circle size reflects the relative weight of this detector
            box = center.bounding(max_size * center.weight)
            d.text((box[0], box[1]-15), label % center.weight, fill=color)
            d.ellipse(box, outline=color)
            if len(points) > 1:
                for point in points:
                    d.rectangle(point.bounding(4), outline=color)

        d.ellipse(average_point.bounding(max_size), outline=GREEN)

    return average_point


def image_face_points(im, settings):
    """Detect faces in ``im`` and return them as points of interest.

    When ``settings.dnn_model_path`` is set, the ``cv2.FaceDetectorYN`` model
    at that path is used; each detection yields a point at half the face
    width and a third of the face height. Otherwise a series of Haar cascades
    is tried in order and the detections of the first cascade that finds
    anything are returned, each as the centre of its rectangle.

    Every returned point carries the weight ``1 / number_of_detections``.

    Returns:
        A list of ``PointOfInterest``; empty when nothing was detected.
    """
    if settings.dnn_model_path is not None:
        detector = cv2.FaceDetectorYN.create(
            settings.dnn_model_path,
            "",
            (im.width, im.height),
            0.9,  # score threshold
            0.3,  # nms threshold
            5000  # keep top k before nms
        )
        faces = detector.detect(np.array(im))
        results = []
        if faces[1] is not None:
            for face in faces[1]:
                x = face[0]
                y = face[1]
                w = face[2]
                h = face[3]
                results.append(
                    PointOfInterest(
                        int(x + (w * 0.5)),  # face focus left/right is center
                        int(y + (h * 0.33)),  # face focus up/down is close to the top of the head
                        size=w,
                        weight=1/len(faces[1])
                    )
                )
        return results

    np_im = np.array(im)
    # assumes im is an RGB-mode PIL image, whose array is in R, G, B channel
    # order; BGR2GRAY would swap the red and blue luminance weights
    gray = cv2.cvtColor(np_im, cv2.COLOR_RGB2GRAY)

    # (cascade file, minimum detection size as a fraction of the smallest side)
    tries = [
        [f'{cv2.data.haarcascades}haarcascade_eye.xml', 0.01],
        [f'{cv2.data.haarcascades}haarcascade_frontalface_default.xml', 0.05],
        [f'{cv2.data.haarcascades}haarcascade_profileface.xml', 0.05],
        [f'{cv2.data.haarcascades}haarcascade_frontalface_alt.xml', 0.05],
        [f'{cv2.data.haarcascades}haarcascade_frontalface_alt2.xml', 0.05],
        [f'{cv2.data.haarcascades}haarcascade_frontalface_alt_tree.xml', 0.05],
        [f'{cv2.data.haarcascades}haarcascade_eye_tree_eyeglasses.xml', 0.05],
        [f'{cv2.data.haarcascades}haarcascade_upperbody.xml', 0.05]
    ]
    for cascade_file, min_fraction in tries:
        classifier = cv2.CascadeClassifier(cascade_file)
        minsize = int(min(im.width, im.height) * min_fraction)  # at least N percent of the smallest side
        try:
            faces = classifier.detectMultiScale(gray, scaleFactor=1.1,
                minNeighbors=7, minSize=(minsize, minsize), flags=cv2.CASCADE_SCALE_IMAGE)
        except Exception:
            # best effort: a cascade that cannot run is skipped, but
            # KeyboardInterrupt / SystemExit are no longer swallowed
            continue

        if len(faces) > 0:
            rects = [[f[0], f[1], f[0] + f[2], f[1] + f[3]] for f in faces]
            return [PointOfInterest((r[0] + r[2]) // 2, (r[1] + r[3]) // 2, size=abs(r[0]-r[2]), weight=1/len(rects)) for r in rects]
    return []


def image_corner_points(im, settings):
    """Return corner features of ``im`` as equally weighted points of interest.

    Corners are found with ``cv2.goodFeaturesToTrack`` on a greyscale copy of
    the image. ``settings`` is accepted for signature parity with the other
    detectors and is not read.

    Returns:
        A list of ``PointOfInterest`` (size 4, weight ``1 / count``), or an
        empty list when no corners were found.
    """
    gray = im.convert("L")

    # naive attempt at preventing focal points from collecting at watermarks near the bottom
    ImageDraw.Draw(gray).rectangle([0, im.height*.9, im.width, im.height], fill="#999")

    corners = cv2.goodFeaturesToTrack(
        np.array(gray),
        maxCorners=100,
        qualityLevel=0.04,
        minDistance=min(gray.width, gray.height)*0.06,
        useHarrisDetector=False,
    )

    if corners is None:
        return []

    return [
        PointOfInterest(x, y, size=4, weight=1/len(corners))
        for x, y in (corner.ravel() for corner in corners)
    ]


def image_entropy_points(im, settings):
    """Find the crop-sized window with the highest ``image_entropy`` score.

    A window of ``settings.crop_width`` x ``settings.crop_height`` starts at
    the top-left corner and slides in 4-pixel steps along the longer axis of
    the image (horizontally for landscape, vertically for portrait).

    Returns:
        A single-element list holding a ``PointOfInterest`` at the centre of
        the best window, or an empty list for square images.
    """
    landscape = im.height < im.width
    portrait = im.height > im.width
    if landscape:
        move_idx = [0, 2]  # left / right edges of the crop box
        move_max = im.size[0]
    elif portrait:
        move_idx = [1, 3]  # top / bottom edges of the crop box
        move_max = im.size[1]
    else:
        return []

    e_max = 0
    crop_current = [0, 0, settings.crop_width, settings.crop_height]
    # Copy: crop_current is mutated in place below. Sharing the list made the
    # "best" window silently follow the sliding window to its final position
    # whenever no window scored above zero.
    crop_best = list(crop_current)
    while crop_current[move_idx[1]] < move_max:
        crop = im.crop(tuple(crop_current))
        e = image_entropy(crop)

        if e > e_max:
            e_max = e
            crop_best = list(crop_current)

        crop_current[move_idx[0]] += 4
        crop_current[move_idx[1]] += 4

    x_mid = int(crop_best[0] + settings.crop_width/2)
    y_mid = int(crop_best[1] + settings.crop_height/2)

    return [PointOfInterest(x_mid, y_mid, size=25, weight=1.0)]


def image_entropy(im):
    """Return the Shannon entropy, in bits, of the 1-bit rendition of ``im``.

    Args:
        im: object whose ``convert("1")`` result can be turned into a numpy
            array (a PIL image).

    Returns:
        ``-sum(p * log2(p))`` over the non-empty histogram bins; 0 when all
        pixels share one value.
    """
    band = np.asarray(im.convert("1"), dtype=np.uint8)
    # 257 edges -> one bin per possible uint8 value (256 edges would merge
    # the values 254 and 255 into a single bin)
    hist, _ = np.histogram(band, bins=np.arange(257))
    hist = hist[hist > 0]
    probabilities = hist / hist.sum()
    # each term must be weighted by its probability; summing bare log2(p)
    # rewards skewed histograms instead of balanced ones
    return -(probabilities * np.log2(probabilities)).sum()

def centroid(pois):
    """Return a ``PointOfInterest`` at the unweighted mean position of ``pois``.

    The result uses the default weight and size. ``pois`` must not be empty.
    """
    count = len(pois)
    x_total = sum(poi.x for poi in pois)
    y_total = sum(poi.y for poi in pois)
    return PointOfInterest(x_total/count, y_total/count)


def poi_average(pois, settings):
    """Return the weighted average position of ``pois``.

    Args:
        pois: iterable of objects with ``x``, ``y`` and ``weight``.
        settings: unused; kept so existing callers keep working.

    Returns:
        A ``PointOfInterest`` at the rounded weighted mean. When there are no
        points, or their weights sum to zero, the point (0, 0) is returned
        instead of raising ``ZeroDivisionError``.
    """
    total_weight = 0.0
    x = 0.0
    y = 0.0
    for poi in pois:
        total_weight += poi.weight
        x += poi.x * poi.weight
        y += poi.y * poi.weight

    # nothing detected (e.g. a blank image): there is no meaningful average
    if total_weight == 0:
        return PointOfInterest(0, 0)

    return PointOfInterest(round(x / total_weight), round(y / total_weight))


def is_landscape(w, h):
    """Return True when the width ``w`` is strictly greater than the height ``h``."""
    return h < w


def is_portrait(w, h):
    """Return True when the height ``h`` is strictly greater than the width ``w``."""
    return w < h


def is_square(w, h):
    """Return True when the width ``w`` and the height ``h`` are equal."""
    return h == w


def download_and_cache_models(dirname):
    """Ensure the face detection model is present in ``dirname``.

    The model is downloaded on first use and reused on later calls.

    Args:
        dirname: directory holding the cached model; created when missing.

    Returns:
        Path of the cached model file, or ``None`` if it does not exist after
        the download attempt.

    Raises:
        requests.RequestException: when the download fails or the server
            answers with an HTTP error status.
        OSError: when the directory or file cannot be written.
    """
    download_url = 'https://github.com/opencv/opencv_zoo/blob/91fb0290f50896f38a0ab1e558b74b16bc009428/models/face_detection_yunet/face_detection_yunet_2022mar.onnx?raw=true'
    model_file_name = 'face_detection_yunet.onnx'

    # exist_ok avoids failing when the directory appears between check and create
    os.makedirs(dirname, exist_ok=True)

    cache_file = os.path.join(dirname, model_file_name)
    if not os.path.exists(cache_file):
        print(f"downloading face detection model from '{download_url}' to '{cache_file}'")
        response = requests.get(download_url, timeout=60)
        # fail loudly on HTTP errors rather than caching an error page as the model
        response.raise_for_status()

        # write next to the target and rename, so an interrupted write never
        # leaves a truncated file that later runs would mistake for the model
        partial_file = cache_file + ".part"
        with open(partial_file, "wb") as f:
            f.write(response.content)
        os.replace(partial_file, cache_file)

    if os.path.exists(cache_file):
        return cache_file
    return None


class PointOfInterest:
    """A weighted position in an image, with a nominal size."""

    def __init__(self, x, y, weight=1.0, size=10):
        self.x, self.y = x, y
        self.weight, self.size = weight, size

    def bounding(self, size):
        """Return ``[left, top, right, bottom]`` of a box centred on the point.

        The box extends ``size // 2`` from the point in every direction; the
        ``size`` argument is used, not ``self.size``.
        """
        half = size//2
        left, top = self.x - half, self.y - half
        right, bottom = self.x + half, self.y + half
        return [left, top, right, bottom]


class Settings:
    """Options controlling the focal point detection and the resulting crop."""

    def __init__(self, crop_width=512, crop_height=512, corner_points_weight=0.5, entropy_points_weight=0.5, face_points_weight=0.5, annotate_image=False, dnn_model_path=None):
        # size of the crop box
        self.crop_width, self.crop_height = crop_width, crop_height

        # relative preference of each detector
        self.corner_points_weight = corner_points_weight
        self.entropy_points_weight = entropy_points_weight
        self.face_points_weight = face_points_weight

        # debug output; the viewer flag is always off at construction time
        self.annotate_image = annotate_image
        self.destop_view_image = False

        # face detection model file; None when no model is configured
        self.dnn_model_path = dnn_model_path
 No newline at end of file
+33 −5
Original line number Diff line number Diff line
@@ -7,12 +7,14 @@ import tqdm
import time

from modules import shared, images
from modules.paths import models_path
from modules.shared import opts, cmd_opts
from modules.textual_inversion import autocrop
if cmd_opts.deepdanbooru:
    import modules.deepbooru as deepbooru


def preprocess(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru=False, split_threshold=0.5, overlap_ratio=0.2):
def preprocess(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru=False, split_threshold=0.5, overlap_ratio=0.2, process_focal_crop=False, process_focal_crop_face_weight=0.9, process_focal_crop_entropy_weight=0.3, process_focal_crop_edges_weight=0.5, process_focal_crop_debug=False):
    try:
        if process_caption:
            shared.interrogator.load()
@@ -22,7 +24,7 @@ def preprocess(process_src, process_dst, process_width, process_height, preproce
            db_opts[deepbooru.OPT_INCLUDE_RANKS] = False
            deepbooru.create_deepbooru_process(opts.interrogate_deepbooru_score_threshold, db_opts)

        preprocess_work(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru, split_threshold, overlap_ratio)
        preprocess_work(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru, split_threshold, overlap_ratio, process_focal_crop, process_focal_crop_face_weight, process_focal_crop_entropy_weight, process_focal_crop_edges_weight, process_focal_crop_debug)

    finally:

@@ -34,7 +36,7 @@ def preprocess(process_src, process_dst, process_width, process_height, preproce



def preprocess_work(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru=False, split_threshold=0.5, overlap_ratio=0.2):
def preprocess_work(process_src, process_dst, process_width, process_height, preprocess_txt_action, process_flip, process_split, process_caption, process_caption_deepbooru=False, split_threshold=0.5, overlap_ratio=0.2, process_focal_crop=False, process_focal_crop_face_weight=0.9, process_focal_crop_entropy_weight=0.3, process_focal_crop_edges_weight=0.5, process_focal_crop_debug=False):
    width = process_width
    height = process_height
    src = os.path.abspath(process_src)
@@ -113,6 +115,7 @@ def preprocess_work(process_src, process_dst, process_width, process_height, pre
                splitted = image.crop((0, y, to_w, y + to_h))
            yield splitted


    for index, imagefile in enumerate(tqdm.tqdm(files)):
        subindex = [0]
        filename = os.path.join(src, imagefile)
@@ -137,10 +140,35 @@ def preprocess_work(process_src, process_dst, process_width, process_height, pre
            ratio = (img.height * width) / (img.width * height)
            inverse_xy = True

        process_default_resize = True

        if process_split and ratio < 1.0 and ratio <= split_threshold:
            for splitted in split_pic(img, inverse_xy):
                save_pic(splitted, index, existing_caption=existing_caption)
        else:
            process_default_resize = False

        if process_focal_crop and img.height != img.width:

            dnn_model_path = None
            try:
                dnn_model_path = autocrop.download_and_cache_models(os.path.join(models_path, "opencv"))
            except Exception as e:
                print("Unable to load face detection model for auto crop selection. Falling back to lower quality haar method.", e)

            autocrop_settings = autocrop.Settings(
                crop_width = width,
                crop_height = height,
                face_points_weight = process_focal_crop_face_weight,
                entropy_points_weight = process_focal_crop_entropy_weight,
                corner_points_weight = process_focal_crop_edges_weight,
                annotate_image = process_focal_crop_debug,
                dnn_model_path = dnn_model_path,
            )
            for focal in autocrop.crop_image(img, autocrop_settings):
                save_pic(focal, index, existing_caption=existing_caption)
            process_default_resize = False

        if process_default_resize:
            img = images.resize_image(1, img, width, height)
            save_pic(img, index, existing_caption=existing_caption)

+18 −0
Original line number Diff line number Diff line
@@ -1261,6 +1261,7 @@ def create_ui(wrap_gradio_gpu_call):
                    with gr.Row():
                        process_flip = gr.Checkbox(label='Create flipped copies')
                        process_split = gr.Checkbox(label='Split oversized images')
                        process_focal_crop = gr.Checkbox(label='Auto focal point crop')
                        process_caption = gr.Checkbox(label='Use BLIP for caption')
                        process_caption_deepbooru = gr.Checkbox(label='Use deepbooru for caption', visible=True if cmd_opts.deepdanbooru else False)

@@ -1268,6 +1269,12 @@ def create_ui(wrap_gradio_gpu_call):
                        process_split_threshold = gr.Slider(label='Split image threshold', value=0.5, minimum=0.0, maximum=1.0, step=0.05)
                        process_overlap_ratio = gr.Slider(label='Split image overlap ratio', value=0.2, minimum=0.0, maximum=0.9, step=0.05)

                    with gr.Row(visible=False) as process_focal_crop_row:
                        process_focal_crop_face_weight = gr.Slider(label='Focal point face weight', value=0.9, minimum=0.0, maximum=1.0, step=0.05)
                        process_focal_crop_entropy_weight = gr.Slider(label='Focal point entropy weight', value=0.15, minimum=0.0, maximum=1.0, step=0.05)
                        process_focal_crop_edges_weight = gr.Slider(label='Focal point edges weight', value=0.5, minimum=0.0, maximum=1.0, step=0.05)
                        process_focal_crop_debug = gr.Checkbox(label='Create debug image')

                    with gr.Row():
                        with gr.Column(scale=3):
                            gr.HTML(value="")
@@ -1281,6 +1288,12 @@ def create_ui(wrap_gradio_gpu_call):
                        outputs=[process_split_extra_row],
                    )

                    process_focal_crop.change(
                        fn=lambda show: gr_show(show),
                        inputs=[process_focal_crop],
                        outputs=[process_focal_crop_row],
                    )

                with gr.Tab(label="Train"):
                    gr.HTML(value="<p style='margin-bottom: 0.7em'>Train an embedding or Hypernetwork; you must specify a directory with a set of 1:1 ratio images <a href=\"https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Textual-Inversion\" style=\"font-weight:bold;\">[wiki]</a></p>")
                    with gr.Row():
@@ -1369,6 +1382,11 @@ def create_ui(wrap_gradio_gpu_call):
                process_caption_deepbooru,
                process_split_threshold,
                process_overlap_ratio,
                process_focal_crop,
                process_focal_crop_face_weight,
                process_focal_crop_entropy_weight,
                process_focal_crop_edges_weight,
                process_focal_crop_debug,
            ],
            outputs=[
                ti_output,
+2 −0
Original line number Diff line number Diff line
@@ -8,6 +8,8 @@ gradio==3.5
invisible-watermark
numpy
omegaconf
opencv-python
requests
piexif
Pillow
pytorch_lightning