
Merge pull request #10 from UTAustin-SwarmLab/pix-to-pix-integration
Pix to pix integration
minkyu-choi07 authored Sep 16, 2023
2 parents 0996f52 + 80135d9 commit 1709a4e
Showing 154 changed files with 19,982 additions and 56 deletions.
11 changes: 6 additions & 5 deletions Makefile
@@ -8,14 +8,15 @@ BASE_IMG=nvidia/cuda:11.8.0-devel-ubuntu20.04
CODE_PATH := /home/mc76728/repos/Video-to-Automaton/

# Custom Image
MY_DOCKER_IMG := ${user}ns_vfs
DOCKER_IMG := ns_vfs
MY_DOCKER_IMG := ${USER}_ns_vfs
TAG := latest

pull_docker_image:
	docker pull ${BASE_IMG}

build_docker_image:
	docker build --build-arg BASE_IMG=${BASE_IMG} . -f docker/Dockerfile --network=host --tag ${MY_DOCKER_IMG}:${TAG}
	docker build --build-arg BASE_IMG=${BASE_IMG} . -f docker/Dockerfile --network=host --tag ${DOCKER_IMG}:${TAG}

run_docker_container:
	docker run --interactive \
@@ -25,7 +26,7 @@ run_docker_container:
	--cap-add=SYS_PTRACE \
	--ulimit core=0:0 \
	--volume ${CODE_PATH}:/opt/Neuro-Symbolic-Video-Frame-Search \
	${MY_DOCKER_IMG}:${TAG} \
	${DOCKER_IMG}:${TAG} \
	/bin/bash

run_docker_container_gpu:
@@ -38,8 +39,8 @@ run_docker_container_gpu:
	--cap-add=SYS_PTRACE \
	--ulimit core=0:0 \
	--volume ${CODE_PATH}:/opt/Neuro-Symbolic-Video-Frame-Search \
	${MY_DOCKER_IMG}:${TAG} \
	/bin/bashd
	${DOCKER_IMG}:${TAG} \
	/bin/bash

exec_docker_container:
	docker exec -it ${MY_DOCKER_IMG} /bin/bash
7 changes: 7 additions & 0 deletions README.md
@@ -30,3 +30,10 @@ Please avoid stopping and removing the container, as you will need to reinstall
[The VIRAT Video Dataset](https://viratdata.org/#getting-data): The VIRAT Video Dataset is designed to be more realistic, natural, and challenging for video-surveillance domains than existing action recognition datasets, in terms of resolution, background clutter, scene diversity, and human activity/event categories. It has become a benchmark dataset for the computer vision community.

[NuScenes](https://www.nuscenes.org/nuimages#download): A public large-scale dataset for autonomous driving

## Linear Temporal Logic
Confirmed examples:
- 'P{op}probability [F "proposition"]'
- 'P{op}probability [G "proposition"]'
- 'P{op}probability [F "proposition1" | "proposition2"]'
- 'P{op}probability [F "proposition1" & "proposition2"]'
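Here `{op}` is a comparison operator such as `>=`, `probability` is a threshold in [0, 1], and the quoted names are atomic propositions. As a rough sketch (not from this repo) of how such a property string could be checked with stormpy, assuming a PRISM model file `model.prism` whose states carry a matching `"person"` label:

```python
# Hedged sketch: checking one confirmed LTL pattern with stormpy.
# "model.prism", the >= 0.8 threshold, and the "person" label are
# illustrative assumptions, not artifacts of this repository.
import stormpy

program = stormpy.parse_prism_program("model.prism")
properties = stormpy.parse_properties_for_prism_program('P>=0.8 [F "person"]', program)
model = stormpy.build_model(program, properties)
result = stormpy.model_checking(model, properties[0])
print(result.at(model.initial_states[0]))  # satisfaction at the initial state
```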
11 changes: 7 additions & 4 deletions docker/Dockerfile
@@ -44,12 +44,15 @@ RUN git clone -b stable https://github.com/moves-rwth/storm.git
RUN git clone -b stable https://github.com/moves-rwth/stormpy.git
# INSTALL CARL STORM
WORKDIR /opt/carl-storm
RUN mkdir build && cd build && cmake ../ && make lib_carl -j 2
RUN mkdir build && cd build && cmake ../ -DCMAKE_BUILD_TYPE=Release -DUSE_CLN_NUMBERS=ON -DUSE_GINAC=ON -DTHREAD_SAFE=ON
WORKDIR /opt/carl-storm/build
RUN make lib_carl -j 1
# INSTALL STORM
WORKDIR /opt/storm
ENV STORM_HOME=/opt/storm
RUN mkdir build && cd build && cmake ../ && make
RUN export PATH=$PATH:$STORM_DIR/build/bin
RUN mkdir build && cd build && cmake ../ -DCMAKE_BUILD_TYPE=Release -DSTORM_DEVELOPER=OFF -DSTORM_LOG_DISABLE_DEBUG=ON -DSTORM_PORTABLE=ON -DSTORM_USE_SPOT_SHIPPED=ON
WORKDIR /opt/storm/build
RUN make resources -j 1 && make storm -j 1 && make binaries -j 1
RUN export PATH=$PATH:/opt/storm/build/bin
# INSTALL STORMPY
WORKDIR /opt/stormpy
RUN python3 -m pip install -ve .
7 changes: 7 additions & 0 deletions install.sh
@@ -15,6 +15,13 @@ elif [[ ! -d $WEIGHT_DIR ]]; then
    echo "$WEIGHT_DIR already exists but is not a directory" 1>&2
fi

# -- Installing Dependencies which can't be installed via toml -- #
python3 -m pip install -e "git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers"

# -- Stable Diffusion -- #
cd /opt/Neuro-Symbolic-Video-Frame-Search/ns_vfs/model/diffusion/stable_diffusion
python3 -m pip install -e .

# -- Installing video_to_automaton -- #
echo "Installing video_to_automaton"
cd $ROOT_DIR
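Once install.sh completes, a quick smoke test (hypothetical, not part of the repo) is to confirm the newly installed diffusion dependencies import from the container's Python:

```python
# Hypothetical post-install smoke test; taming-transformers installs the
# `taming` package, and ldm.util ships with the vendored stable_diffusion.
import taming  # noqa: F401
from ns_vfs.model.diffusion.stable_diffusion.ldm.util import instantiate_from_config  # noqa: F401

print("diffusion dependencies import OK")
```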
108 changes: 108 additions & 0 deletions ns_vfs/config/InstructPix2Pix.yaml
@@ -0,0 +1,108 @@
# File modified by authors of InstructPix2Pix from original (https://github.com/CompVis/stable-diffusion).
# See more details in LICENSE.
general:
  resolution: 512
  steps: 100
  checkpoint: /opt/Neuro-Symbolic-Video-Frame-Search/artifacts/weights/instruct-pix2pix-00-22000.ckpt
  vae_ckpt:
  edit: "turn human face into a joker's face"
  cfg_text: 6.5
  cfg_image: 1.5
  seed:

model:
  base_learning_rate: 1.0e-04
  target: ns_vfs.model.diffusion.stable_diffusion.ldm.models.diffusion.ddpm_edit.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: edited
    cond_stage_key: edit
    # image_size: 64
    # image_size: 32
    image_size: 16
    channels: 4
    cond_stage_trainable: false # Note: different from the one we trained before
    conditioning_key: hybrid
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: true
    load_ema: true

    scheduler_config: # 10000 warmup steps
      target: ns_vfs.model.diffusion.stable_diffusion.ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 0 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

    unet_config:
      target: ns_vfs.model.diffusion.stable_diffusion.ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ns_vfs.model.diffusion.stable_diffusion.ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ns_vfs.model.diffusion.stable_diffusion.ldm.modules.encoders.modules.FrozenCLIPEmbedder

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 128
    num_workers: 1
    wrap: false
    validation:
      target: edit_dataset.EditDataset
      params:
        path: data/clip-filtered-dataset
        cache_dir: data/
        cache_name: data_10k
        split: val
        min_text_sim: 0.2
        min_image_sim: 0.75
        min_direction_sim: 0.2
        max_samples_per_prompt: 1
        min_resize_res: 512
        max_resize_res: 512
        crop_res: 512
        output_as_edit: False
        real_input: True
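Downstream, this file is loaded with OmegaConf and handed to the PixToPix class introduced in ns_vfs/model/diffusion/pix2pix.py below. A minimal usage sketch; the input path is illustrative, and the checkpoint under general.checkpoint plus a CUDA device are assumed:

```python
# Minimal usage sketch; assumes the checkpoint listed under general.checkpoint
# exists and a CUDA device is available. "frame.jpg" is an illustrative input.
from omegaconf import OmegaConf

from ns_vfs.model.diffusion.pix2pix import PixToPix

config = OmegaConf.load("ns_vfs/config/InstructPix2Pix.yaml")
pix2pix = PixToPix(config)             # builds the model via load_model_from_config
edited = pix2pix.diffuse("frame.jpg")  # returns a PIL image and saves output.jpg
```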
4 changes: 1 addition & 3 deletions ns_vfs/config/__init__.py
@@ -4,9 +4,7 @@

ROOT_DIR = os.path.dirname(os.path.abspath(__file__)).split("ns_vfs")[0]

config_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "config.yaml"
)
config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.yaml")

config = omegaconf.load_config_from_yaml(config_path)

1 change: 1 addition & 0 deletions ns_vfs/data/frame.py
@@ -9,6 +9,7 @@ class Frame:

    frame_index: int
    frame_image: np.ndarray
    object_detection: dict = dataclasses.field(default_factory=dict)
    propositional_probability: dict = dataclasses.field(default_factory=dict)
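For illustration, a hypothetical frame record built with the new field (the detection label and score are made up):

```python
# Hypothetical example of the extended dataclass; values are illustrative.
import numpy as np

from ns_vfs.data.frame import Frame

frame = Frame(frame_index=0, frame_image=np.zeros((480, 640, 3), dtype=np.uint8))
frame.object_detection["person"] = 0.92  # e.g. a detector's confidence score
```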


Empty file.
11 changes: 11 additions & 0 deletions ns_vfs/model/diffusion/_base.py
@@ -0,0 +1,11 @@
from __future__ import annotations

import abc


class Diffusion(abc.ABC):
"""Abstract base class for diffusion models."""

@abc.abstractmethod
def diffuse(self, input: any):
"""Diffuse the input."""
131 changes: 131 additions & 0 deletions ns_vfs/model/diffusion/pix2pix.py
@@ -0,0 +1,131 @@
import math
import random

import einops
import k_diffusion as K
import numpy as np
import torch
from einops import rearrange
from omegaconf import OmegaConf
from PIL import Image, ImageOps
from torch import autocast, nn

from ns_vfs.model.diffusion.stable_diffusion.ldm.util import instantiate_from_config

from ._base import Diffusion


class PixToPix(Diffusion):
    def __init__(self, config: OmegaConf):
        self._config = config.general
        self._model_config = config.model
        self._model = self.load_model_from_config(
            self._config, self._config.checkpoint, self._config.vae_ckpt
        )
        self._model.eval().cuda()
        self._model_wrap = K.external.CompVisDenoiser(self._model)
        self._model_wrap_cfg = CFGDenoiser(self._model_wrap)
        self._null_token = self._model.get_learned_conditioning([""])
        self._seed = random.randint(0, 100000) if self._config.seed is None else self._config.seed

    def image_process(self, image):
        if isinstance(image, np.ndarray):
            input_image = Image.fromarray(image)
        else:
            input_image = Image.open(image).convert("RGB")
        width, height = input_image.size
        factor = self._config.resolution / max(width, height)
        factor = math.ceil(min(width, height) * factor / 64) * 64 / min(width, height)
        width = int((width * factor) // 64) * 64
        height = int((height * factor) // 64) * 64
        return ImageOps.fit(input_image, (width, height), method=Image.Resampling.LANCZOS)

    def diffuse(self, input: any):
        input_image = self.image_process(input)
        with torch.no_grad(), autocast("cuda"), self._model.ema_scope():
            cond = {}
            cond["c_crossattn"] = [self._model.get_learned_conditioning([self._config.edit])]
            input_image = 2 * torch.tensor(np.array(input_image)).float() / 255 - 1
            input_image = rearrange(input_image, "h w c -> 1 c h w").to(self._model.device)
            cond["c_concat"] = [self._model.encode_first_stage(input_image).mode()]

            uncond = {}
            uncond["c_crossattn"] = [self._null_token]
            uncond["c_concat"] = [torch.zeros_like(cond["c_concat"][0])]

            sigmas = self._model_wrap.get_sigmas(self._config.steps)

            extra_args = {
                "cond": cond,
                "uncond": uncond,
                "text_cfg_scale": self._config.cfg_text,
                "image_cfg_scale": self._config.cfg_image,
            }
            torch.manual_seed(self._seed)
            z = torch.randn_like(cond["c_concat"][0]) * sigmas[0]
            z = K.sampling.sample_euler_ancestral(
                self._model_wrap_cfg, z, sigmas, extra_args=extra_args
            )
            x = self._model.decode_first_stage(z)
            x = torch.clamp((x + 1.0) / 2.0, min=0.0, max=1.0)
            x = 255.0 * rearrange(x, "1 c h w -> h w c")
            edited_img = Image.fromarray(x.type(torch.uint8).cpu().numpy())
            edited_img.save("output.jpg")
            return edited_img

    def load_model_from_config(self, config, ckpt, vae_ckpt=None, verbose=False):
        print(f"Loading model from {ckpt}")
        pl_sd = torch.load(ckpt, map_location="cpu")
        if "global_step" in pl_sd:
            print(f"Global Step: {pl_sd['global_step']}")
        sd = pl_sd["state_dict"]
        if vae_ckpt is not None:
            print(f"Loading VAE from {vae_ckpt}")
            vae_sd = torch.load(vae_ckpt, map_location="cpu")["state_dict"]
            sd = {
                k: vae_sd[k[len("first_stage_model.") :]]
                if k.startswith("first_stage_model.")
                else v
                for k, v in sd.items()
            }
        model = instantiate_from_config(self._model_config)
        m, u = model.load_state_dict(sd, strict=False)
        if len(m) > 0 and verbose:
            print("missing keys:")
            print(m)
        if len(u) > 0 and verbose:
            print("unexpected keys:")
            print(u)
        return model


class CFGDenoiser(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.inner_model = model

    def forward(self, z, sigma, cond, uncond, text_cfg_scale, image_cfg_scale):
        # One batched U-Net pass over three variants of the latent:
        # (text+image conditioned, image-only conditioned, unconditioned).
        cfg_z = einops.repeat(z, "1 ... -> n ...", n=3)
        cfg_sigma = einops.repeat(sigma, "1 ... -> n ...", n=3)
        cfg_cond = {
            "c_crossattn": [
                torch.cat(
                    [
                        cond["c_crossattn"][0],
                        uncond["c_crossattn"][0],
                        uncond["c_crossattn"][0],
                    ]
                )
            ],
            "c_concat": [
                torch.cat([cond["c_concat"][0], cond["c_concat"][0], uncond["c_concat"][0]])
            ],
        }
        out_cond, out_img_cond, out_uncond = self.inner_model(
            cfg_z, cfg_sigma, cond=cfg_cond
        ).chunk(3)
        # Dual classifier-free guidance: move from the unconditional prediction
        # toward the image-conditioned and then text-conditioned ones.
        return (
            out_uncond
            + text_cfg_scale * (out_cond - out_img_cond)
            + image_cfg_scale * (out_img_cond - out_uncond)
        )
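For reference, the return value matches the dual classifier-free guidance rule from the InstructPix2Pix paper, with text scale $s_T$ (`text_cfg_scale`) and image scale $s_I$ (`image_cfg_scale`):

$$
\tilde{\epsilon} = \epsilon(\varnothing, \varnothing) + s_T\big(\epsilon(c_I, c_T) - \epsilon(c_I, \varnothing)\big) + s_I\big(\epsilon(c_I, \varnothing) - \epsilon(\varnothing, \varnothing)\big)
$$

where $\epsilon(c_I, c_T)$, $\epsilon(c_I, \varnothing)$, and $\epsilon(\varnothing, \varnothing)$ correspond to `out_cond`, `out_img_cond`, and `out_uncond` above.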
