Merge pull request #10 from UTAustin-SwarmLab/pix-to-pix-integration
Pix to pix integration
Showing 154 changed files with 19,982 additions and 56 deletions.
@@ -0,0 +1,108 @@
# File modified by authors of InstructPix2Pix from original (https://github.com/CompVis/stable-diffusion).
# See more details in LICENSE.
general:
  resolution: 512
  steps: 100
  checkpoint: /opt/Neuro-Symbolic-Video-Frame-Search/artifacts/weights/instruct-pix2pix-00-22000.ckpt
  vae_ckpt:
  edit: "turn human face into a joker's face"
  cfg_text: 6.5
  cfg_image: 1.5
  seed:

model:
  base_learning_rate: 1.0e-04
  target: ns_vfs.model.diffusion.stable_diffusion.ldm.models.diffusion.ddpm_edit.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: edited
    cond_stage_key: edit
    # image_size: 64
    # image_size: 32
    image_size: 16
    channels: 4
    cond_stage_trainable: false # Note: different from the one we trained before
    conditioning_key: hybrid
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: true
    load_ema: true

    scheduler_config: # warm-up is effectively disabled here (warm_up_steps: [ 0 ]); the original SD config used 10000 warmup steps
      target: ns_vfs.model.diffusion.stable_diffusion.ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 0 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

    unet_config:
      target: ns_vfs.model.diffusion.stable_diffusion.ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ns_vfs.model.diffusion.stable_diffusion.ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ns_vfs.model.diffusion.stable_diffusion.ldm.modules.encoders.modules.FrozenCLIPEmbedder

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 128
    num_workers: 1
    wrap: false
    validation:
      target: edit_dataset.EditDataset
      params:
        path: data/clip-filtered-dataset
        cache_dir: data/
        cache_name: data_10k
        split: val
        min_text_sim: 0.2
        min_image_sim: 0.75
        min_direction_sim: 0.2
        max_samples_per_prompt: 1
        min_resize_res: 512
        max_resize_res: 512
        crop_res: 512
        output_as_edit: False
        real_input: True
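This config is consumed by the PixToPix wrapper added later in this diff: config.general drives runtime options (checkpoint, edit prompt, guidance scales) and config.model is instantiated into the latent diffusion model. A minimal usage sketch, assuming the YAML is saved as pix_to_pix.yaml and the import path below (both hypothetical, since the diff does not show file names):

from omegaconf import OmegaConf

from ns_vfs.model.diffusion.pix_to_pix import PixToPix  # import path assumed

config = OmegaConf.load("pix_to_pix.yaml")  # hypothetical file name
pix2pix = PixToPix(config)  # loads the checkpoint and moves the model to CUDA
edited = pix2pix.diffuse("frame_0001.png")  # hypothetical input frame; returns a PIL image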
Empty file.
@@ -0,0 +1,11 @@
from __future__ import annotations

import abc
from typing import Any


class Diffusion(abc.ABC):
    """Abstract base class for diffusion models."""

    @abc.abstractmethod
    def diffuse(self, input: Any):
        """Diffuse the input."""
@@ -0,0 +1,131 @@
import math
import random
from typing import Any

import einops
import k_diffusion as K
import numpy as np
import torch
from einops import rearrange
from omegaconf import OmegaConf
from PIL import Image, ImageOps
from torch import autocast, nn

from ns_vfs.model.diffusion.stable_diffusion.ldm.util import instantiate_from_config

from ._base import Diffusion


class PixToPix(Diffusion):
    def __init__(self, config: OmegaConf):
        self._config = config.general
        self._model_config = config.model
        self._model = self.load_model_from_config(
            self._config, self._config.checkpoint, self._config.vae_ckpt
        )
        self._model.eval().cuda()
        self._model_wrap = K.external.CompVisDenoiser(self._model)
        self._model_wrap_cfg = CFGDenoiser(self._model_wrap)
        self._null_token = self._model.get_learned_conditioning([""])
        self._seed = random.randint(0, 100000) if self._config.seed is None else self._config.seed

    def image_process(self, image):
        if isinstance(image, np.ndarray):
            input_image = Image.fromarray(image)
        else:
            input_image = Image.open(image).convert("RGB")
        width, height = input_image.size
        # Scale the long side toward the configured resolution, then snap both
        # sides to multiples of 64 (e.g. a 1280x720 frame at resolution 512
        # becomes 512x320).
        factor = self._config.resolution / max(width, height)
        factor = math.ceil(min(width, height) * factor / 64) * 64 / min(width, height)
        width = int((width * factor) // 64) * 64
        height = int((height * factor) // 64) * 64
        return ImageOps.fit(input_image, (width, height), method=Image.Resampling.LANCZOS)

    def diffuse(self, input: Any):
        input_image = self.image_process(input)
        with torch.no_grad(), autocast("cuda"), self._model.ema_scope():
            cond = {}
            cond["c_crossattn"] = [self._model.get_learned_conditioning([self._config.edit])]
            # Normalize pixels to [-1, 1] and encode the frame into latent space.
            input_image = 2 * torch.tensor(np.array(input_image)).float() / 255 - 1
            input_image = rearrange(input_image, "h w c -> 1 c h w").to(self._model.device)
            cond["c_concat"] = [self._model.encode_first_stage(input_image).mode()]

            uncond = {}
            uncond["c_crossattn"] = [self._null_token]
            uncond["c_concat"] = [torch.zeros_like(cond["c_concat"][0])]

            sigmas = self._model_wrap.get_sigmas(self._config.steps)

            extra_args = {
                "cond": cond,
                "uncond": uncond,
                "text_cfg_scale": self._config.cfg_text,
                "image_cfg_scale": self._config.cfg_image,
            }
            torch.manual_seed(self._seed)
            z = torch.randn_like(cond["c_concat"][0]) * sigmas[0]
            z = K.sampling.sample_euler_ancestral(
                self._model_wrap_cfg, z, sigmas, extra_args=extra_args
            )
            x = self._model.decode_first_stage(z)
            x = torch.clamp((x + 1.0) / 2.0, min=0.0, max=1.0)
            x = 255.0 * rearrange(x, "1 c h w -> h w c")
            edited_img = Image.fromarray(x.type(torch.uint8).cpu().numpy())
            edited_img.save("output.jpg")  # side effect: also writes the result to disk
            return edited_img

    def load_model_from_config(self, config, ckpt, vae_ckpt=None, verbose=False):
        # `config` is unused here; the model is built from self._model_config.
        print(f"Loading model from {ckpt}")
        pl_sd = torch.load(ckpt, map_location="cpu")
        if "global_step" in pl_sd:
            print(f"Global Step: {pl_sd['global_step']}")
        sd = pl_sd["state_dict"]
        if vae_ckpt is not None:
            # Overwrite the autoencoder weights with those from a separate VAE checkpoint.
            print(f"Loading VAE from {vae_ckpt}")
            vae_sd = torch.load(vae_ckpt, map_location="cpu")["state_dict"]
            sd = {
                k: vae_sd[k[len("first_stage_model.") :]]
                if k.startswith("first_stage_model.")
                else v
                for k, v in sd.items()
            }
        model = instantiate_from_config(self._model_config)
        m, u = model.load_state_dict(sd, strict=False)
        if len(m) > 0 and verbose:
            print("missing keys:")
            print(m)
        if len(u) > 0 and verbose:
            print("unexpected keys:")
            print(u)
        return model


class CFGDenoiser(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.inner_model = model

    def forward(self, z, sigma, cond, uncond, text_cfg_scale, image_cfg_scale):
        # One batched denoiser call covering (text+image cond, image-only cond,
        # fully unconditional).
        cfg_z = einops.repeat(z, "1 ... -> n ...", n=3)
        cfg_sigma = einops.repeat(sigma, "1 ... -> n ...", n=3)
        cfg_cond = {
            "c_crossattn": [
                torch.cat(
                    [
                        cond["c_crossattn"][0],
                        uncond["c_crossattn"][0],
                        uncond["c_crossattn"][0],
                    ]
                )
            ],
            "c_concat": [
                torch.cat([cond["c_concat"][0], cond["c_concat"][0], uncond["c_concat"][0]])
            ],
        }
        out_cond, out_img_cond, out_uncond = self.inner_model(
            cfg_z, cfg_sigma, cond=cfg_cond
        ).chunk(3)
        return (
            out_uncond
            + text_cfg_scale * (out_cond - out_img_cond)
            + image_cfg_scale * (out_img_cond - out_uncond)
        )
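For reference, the return value above implements the two-scale classifier-free guidance from the InstructPix2Pix paper: with text scale s_T (cfg_text, 6.5 in the config above) and image scale s_I (cfg_image, 1.5),

out = out_uncond + s_T * (out_cond - out_img_cond) + s_I * (out_img_cond - out_uncond)

so s_T pushes the prediction toward the edit instruction while s_I anchors it to the input image; setting s_T = s_I = 1 recovers the fully conditioned prediction out_cond.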