
Commit f295dda

Fix video manager and rectangle artifacts
1 parent 5f6e324 commit f295dda

9 files changed: +51 -13 lines changed

README.md

Lines changed: 2 additions & 0 deletions
@@ -103,6 +103,7 @@ Config files contain two main parts:
 - *att_image* - target image; attributes of the person in this image will be mixed with the person's identity from the source image. Here you can also specify a folder with multiple images - identity translation will be applied to all images in the folder.
 - *specific_id_image* - a specific person in the *att_image* you would like to replace, leaving others untouched (if there are any other people).
 - *att_video* - the same as *att_image*
+- *clean_work_dir* - whether or not to remove the temp folder with images (for video configs only).

 - **pipeline**
@@ -117,6 +118,7 @@ Config files contain two main parts:
 - *face_alignment_type* - affects reference face key point coordinates. **Possible values are "ffhq" and "none". Try both of them to see which one works better for your data.**
 - *erode_mask_value* - a non-zero value. It's used for the post-processing mask size attenuation. You might want to play with this parameter.
 - *smooth_mask_value* - an odd non-zero value. It's used for smoothing the edges of the post-processing mask. Usually set to *erode_mask_value* + 1.
+- *sigma_scale_value* - controls the amount of blur added to the post-processing mask. Valid values are in the range [0.01...1.0]. Tune it if you see artifacts (rectangles) around swapped faces.
 - *face_detector_threshold* - values in range [0.0...1.0]. A higher value reduces the probability of false-positive detections but increases the probability of false negatives.
 - *specific_latent_match_threshold* - values in range [0.0...inf]. Usually takes small values around 0.05.
 - *enhance_output* - whether or not to apply the GFPGAN model as a post-processing step.
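
For a feel of the numbers: the sigma that *sigma_scale_value* rescales is derived from *smooth_mask_value* via OpenCV's kernel-size rule of thumb (see the src/simswap.py diff below). A minimal sketch, assuming the default smooth_mask_value of 41:

def mask_blur_sigma(smooth_mask_value: int, sigma_scale_value: float) -> float:
    # OpenCV's kernel-size -> sigma rule, then rescaled by sigma_scale_value
    sigma = 0.3 * ((smooth_mask_value - 1) * 0.5 - 1) + 0.8
    return sigma * sigma_scale_value

print(mask_blur_sigma(41, 1.0))   # 6.5   - default, strongest blur
print(mask_blur_sigma(41, 0.25))  # 1.625 - much tighter blur if rectangles appear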

app.py

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ def __init__(self, config: DictConfig):
         self.att_video: Optional[VideoDataManager] = None
         if att_video_path and att_video_path.is_file():
             self.att_video: Optional[VideoDataManager] = VideoDataManager(
-                src_data=att_video_path, output_dir=output_dir
+                src_data=att_video_path, output_dir=output_dir, clean_work_dir=config.data.clean_work_dir
             )

         assert not (self.att_video and self.att_image), "Only one attribute source can be used!"

app_web.py

Lines changed: 5 additions & 0 deletions
@@ -41,6 +41,8 @@ def get_np_image(file):
     label="smooth_mask_value", min_value=1, max_value=61, step=2, value=41
 )

+sigma_scale_value = st.slider(label="sigma_scale_value", min_value=0.01, max_value=1.0, step=0.01, value=1.0)
+
 specific_latent_match_threshold = st.slider(
     label="specific_latent_match_threshold",
     min_value=0.0,
@@ -75,6 +77,7 @@ def get_np_image(file):
 model.set_face_alignment_type(face_alignment_type)
 model.set_erode_mask_value(erode_mask_value)
 model.set_smooth_mask_value(smooth_mask_value)
+model.set_sigma_scale_value(sigma_scale_value)
 model.set_specific_latent_match_threshold(specific_latent_match_threshold)
 model.enhance_output = True if enhance_output == "yes" else False

@@ -126,6 +129,7 @@ def load_model(config):
         + " face_alignment_type"
         + " erode_mask_value"
         + " smooth_mask_value"
+        + " sigma_scale_value"
         + " face_detector_threshold"
         + " specific_latent_match_threshold"
         + " enhance_output",
@@ -144,6 +148,7 @@ def load_model(config):
         face_alignment_type="none",
         erode_mask_value=40,
         smooth_mask_value=41,
+        sigma_scale_value=1.0,
         face_detector_threshold=0.6,
         specific_latent_match_threshold=0.05,
         enhance_output=True
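
The slider range (0.01-1.0, step 0.01) matches the validation in SimSwap.set_sigma_scale_value. A self-contained sketch of the slider-to-setter flow, using a dummy model object rather than the real SimSwap instance:

import streamlit as st

class DummyModel:
    # Stand-in for the SimSwap model wired up in app_web.py.
    def set_sigma_scale_value(self, value: float) -> None:
        self.sigma_scale_value = value

model = DummyModel()
sigma_scale_value = st.slider(
    label="sigma_scale_value", min_value=0.01, max_value=1.0, step=0.01, value=1.0
)
model.set_sigma_scale_value(sigma_scale_value)
st.write(f"Post-processing mask blur will be scaled by {model.sigma_scale_value:.2f}")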

configs/run_image.yaml

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ pipeline:
   face_alignment_type: "none" #"ffhq"
   erode_mask_value: 40
   smooth_mask_value: 41
+  sigma_scale_value: 1.0
   face_detector_threshold: 0.6
   specific_latent_match_threshold: 0.05
   enhance_output: True

configs/run_image_specific.yaml

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ pipeline:
   face_alignment_type: "none" #"ffhq"
   erode_mask_value: 40
   smooth_mask_value: 41
+  sigma_scale_value: 1.0
   face_detector_threshold: 0.6
   specific_latent_match_threshold: 0.05
   enhance_output: True

configs/run_video.yaml

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,7 @@ data:
   specific_id_image: "none"
   att_video: "${hydra:runtime.cwd}/demo_file/multi_people_1080p.mp4"
   output_dir: ${hydra:runtime.cwd}/output
+  clean_work_dir: True

 pipeline:
   face_detector_weights: "${hydra:runtime.cwd}/weights/face_detector_scrfd_10g_bnkps.onnx"
@@ -18,6 +19,7 @@ pipeline:
   face_alignment_type: "none" #"ffhq"
   erode_mask_value: 40
   smooth_mask_value: 41
+  sigma_scale_value: 1.0
   face_detector_threshold: 0.6
   specific_latent_match_threshold: 0.05
   enhance_output: True
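
A quick sketch of inspecting the two new keys with OmegaConf outside of Hydra; the config path is assumed to be resolved from the repo root, and the ${hydra:...} interpolations stay untouched because only plain keys are read:

from omegaconf import OmegaConf

cfg = OmegaConf.load("configs/run_video.yaml")  # path assumed relative to the repo root
print(cfg.data.clean_work_dir)         # True -> temp frame folder is removed after the video is written
print(cfg.pipeline.sigma_scale_value)  # 1.0  -> full-strength mask blur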

configs/run_video_specific.yaml

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,7 @@ data:
   specific_id_image: "${hydra:runtime.cwd}/demo_file/specific1.png"
   att_video: "${hydra:runtime.cwd}/demo_file/multi_people_1080p.mp4"
   output_dir: ${hydra:runtime.cwd}/output
+  clean_work_dir: True

 pipeline:
   face_detector_weights: "${hydra:runtime.cwd}/weights/face_detector_scrfd_10g_bnkps.onnx"
@@ -18,6 +19,7 @@ pipeline:
   face_alignment_type: "none" #"ffhq"
   erode_mask_value: 40
   smooth_mask_value: 41
+  sigma_scale_value: 1.0
   face_detector_threshold: 0.6
   specific_latent_match_threshold: 0.05
   enhance_output: True
Lines changed: 27 additions & 11 deletions
@@ -1,33 +1,38 @@
 from src.DataManager.base import BaseDataManager
 from src.DataManager.utils import imwrite_rgb

+import cv2
 import numpy as np
 from pathlib import Path
-from typing import Optional
+import shutil
+from typing import Optional, Union

 from moviepy.editor import AudioFileClip, VideoFileClip
 from moviepy.video.io.ImageSequenceClip import ImageSequenceClip


 class VideoDataManager(BaseDataManager):
-    def __init__(self, src_data: Path, output_dir: Path):
-        self.video_handle: Optional[VideoFileClip] = None
+    def __init__(self, src_data: Path, output_dir: Path, clean_work_dir: bool = False):
+        self.video_handle: Optional[cv2.VideoCapture] = None
         self.audio_handle: Optional[AudioFileClip] = None

         self.output_dir = output_dir
         self.output_img_dir = output_dir / "img"
         self.output_dir.mkdir(exist_ok=True)
         self.output_img_dir.mkdir(exist_ok=True)
         self.video_name = None
+        self.clean_work_dir = clean_work_dir

         if src_data.is_file():
             self.video_name = "swap_" + src_data.name

-            self.audio_handle = AudioFileClip(str(src_data))
-            self.video_handle = VideoFileClip(str(src_data))
-            self.fps = self.video_handle.reader.fps
-            self.frame_count = self.video_handle.reader.nframes
-            self.data_iterator = zip(range(self.frame_count), self.video_handle.iter_frames())
+            if VideoFileClip(str(src_data)).audio is not None:
+                self.audio_handle = AudioFileClip(str(src_data))
+
+            self.video_handle = cv2.VideoCapture(str(src_data))
+
+            self.frame_count = int(self.video_handle.get(cv2.CAP_PROP_FRAME_COUNT))
+            self.fps = self.video_handle.get(cv2.CAP_PROP_FPS)

         self.last_idx = -1

@@ -37,7 +42,14 @@ def __len__(self):
         return self.frame_count

     def get(self) -> np.ndarray:
-        self.last_idx, img = next(self.data_iterator)
+        img: Union[None, np.ndarray] = None
+
+        while img is None and self.last_idx < self.frame_count:
+            status, img = self.video_handle.read()
+            self.last_idx += 1
+
+        if img is not None:
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
         return img

     def save(self, img: np.ndarray):
@@ -51,6 +63,10 @@ def _close(self):
         image_filenames = [str(x) for x in sorted(self.output_img_dir.glob("*.jpg"))]
         clip = ImageSequenceClip(image_filenames, fps=self.fps)

-        clip = clip.set_audio(self.audio_handle)
+        if self.audio_handle is not None:
+            clip = clip.set_audio(self.audio_handle)
+
+        clip.write_videofile(str(self.output_dir / self.video_name))

-        clip.write_videofile(str(self.output_dir / self.video_name), audio_codec="aac")
+        if self.clean_work_dir:
+            shutil.rmtree(self.output_img_dir, ignore_errors=True)
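
Frame decoding now goes through cv2.VideoCapture instead of moviepy's frame iterator. Below is a stripped-down sketch of the same read loop as a standalone generator; the function name is hypothetical, while the property reads and the BGR-to-RGB conversion mirror the new __init__() and get():

import cv2
import numpy as np
from typing import Iterator

def iter_rgb_frames(video_path: str) -> Iterator[np.ndarray]:
    # Open the video and report the same metadata VideoDataManager now stores.
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    print(f"{frame_count} frames @ {fps:.2f} fps")
    try:
        while True:
            status, img = cap.read()
            if not status or img is None:
                break
            # OpenCV returns BGR; the pipeline works in RGB.
            yield cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    finally:
        cap.release()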

src/simswap.py

Lines changed: 10 additions & 1 deletion
@@ -35,6 +35,7 @@ def __init__(
         self.face_alignment_type: Union[FaceAlignmentType, None] = None
         self.erode_mask_value: Union[int, None] = None
         self.smooth_mask_value: Union[int, None] = None
+        self.sigma_scale_value: Union[float, None] = None
         self.face_detector_threshold: Union[float, None] = None
         self.specific_latent_match_threshold: Union[float, None] = None
         self.device = torch.device(config.device)
@@ -122,6 +123,7 @@ def set_parameters(self, config) -> None:
         self.set_specific_latent_match_threshold(config.specific_latent_match_threshold)
         self.set_erode_mask_value(config.erode_mask_value)
         self.set_smooth_mask_value(config.smooth_mask_value)
+        self.set_sigma_scale_value(config.sigma_scale_value)

     def set_crop_size(self, crop_size: int) -> None:
         if crop_size < 0:
@@ -174,6 +176,12 @@ def set_smooth_mask_value(self, smooth_mask_value: int) -> None:

         self.smooth_mask_value = smooth_mask_value

+    def set_sigma_scale_value(self, sigma_scale_value: float) -> None:
+        if sigma_scale_value < 0 or sigma_scale_value > 1.0:
+            raise ValueError("Invalid sigma_scale_value! Must be within the 0...1 range.")
+
+        self.sigma_scale_value = sigma_scale_value
+
     def run_detect_align(self, image: np.ndarray, for_id: bool = False) -> Tuple[Union[Iterable[np.ndarray], None],
                                                                                  Union[Iterable[np.ndarray], None],
                                                                                  np.ndarray]:
@@ -336,7 +344,8 @@ def __call__(self, att_image: np.ndarray) -> np.ndarray:
         kernel_size = (self.smooth_mask_value, self.smooth_mask_value)
         # https://docs.opencv.org/4.x/d4/d86/group__imgproc__filter.html#gaabe8c836e97159a9193fb0b11ac52cf1
         # https://docs.opencv.org/4.x/d4/d86/group__imgproc__filter.html#gac05a120c1ae92a6060dd0db190a61afa
-        sigma = 2 * 0.3 * ((kernel_size[0] - 1) * 0.5 - 1) + 0.8
+        sigma = 0.3 * ((kernel_size[0] - 1) * 0.5 - 1) + 0.8
+        sigma *= self.sigma_scale_value
         img_mask = kornia.filters.gaussian_blur2d(img_mask, kernel_size, (sigma, sigma), border_type='constant',
                                                   separable=True)
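
A minimal, self-contained sketch of the rescaled blur on a dummy mask tensor, assuming the same kernel size and constant border as the call above (the mask shape and values are illustrative only):

import torch
import kornia

smooth_mask_value = 41   # odd kernel size, as in the configs
sigma_scale_value = 0.5  # lower -> less blur, tighter mask edges
sigma = 0.3 * ((smooth_mask_value - 1) * 0.5 - 1) + 0.8
sigma *= sigma_scale_value

img_mask = torch.zeros(1, 1, 256, 256)
img_mask[:, :, 64:192, 64:192] = 1.0  # dummy square face mask
blurred = kornia.filters.gaussian_blur2d(
    img_mask, (smooth_mask_value, smooth_mask_value), (sigma, sigma),
    border_type="constant", separable=True,
)
print(blurred.shape)  # torch.Size([1, 1, 256, 256])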
